In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
url = "https://raw.githubusercontent.com/akilan0303/Spam-Email-Prediction/refs/heads/main/mail_data.csv"
mail_data = pd.read_csv(url)

# Preprocessing
# Replace missing values with empty strings (assigning form instead of inplace=True).
mail_data = mail_data.fillna('')

# Encode the labels: spam -> 0, ham -> 1.
label_codes = mail_data['Category'].map({'spam': 0, 'ham': 1})

# Series.map yields NaN for every value that is not a key of the mapping
# (including blanks produced by the fillna above). Fail here with a clear
# message instead of letting NaN labels reach the split and the classifier.
unmapped = label_codes.isna()
if unmapped.any():
    unexpected_labels = mail_data.loc[unmapped, 'Category'].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'spam' or 'ham'): {unexpected_labels}"
    )
mail_data['Category'] = label_codes

# Split data: 80% train / 20% test, with a fixed seed so the split is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    mail_data['Message'],
    mail_data['Category'],
    test_size=0.2,
    random_state=3,
)

# Convert text data to numerical vectors
# The vocabulary is fitted on the training messages only; the test messages
# are transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)


In [3]:
# Fit a logistic-regression classifier on the training features
model = LogisticRegression()
model.fit(X_train_features, Y_train)

# Predict on both splits, then report the accuracy of each
train_predictions = model.predict(X_train_features)
test_predictions = model.predict(X_test_features)
print("Training Accuracy:", accuracy_score(Y_train, train_predictions))
print("Test Accuracy:", accuracy_score(Y_test, test_predictions))


Training Accuracy: 0.9676912721561588
Test Accuracy: 0.9668161434977578


In [5]:
# Classify one hand-written example message with the fitted vectorizer and model
sample_mail = ["You won a free iPhone! Click here to claim now."]
sample_features = vectorizer.transform(sample_mail)
prediction = model.predict(sample_features)

# Report the result: label 1 is printed as ham, any other label as spam
is_ham = prediction[0] == 1
print("Prediction: Ham Mail" if is_ham else "Prediction: Spam Mail")



Prediction: Spam Mail
