In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [42]:
# Read the raw e-mail corpus from disk into a DataFrame.
# The file is decoded as latin-1.
data = pd.read_csv(
    r'H:\Project\Spam Email Detection\emails.csv',
    encoding='latin-1',
)

In [43]:
# Keep only the message body and its label, and give both columns
# descriptive names in a single chained expression.
data = data[['text', 'spam']].rename(
    columns={'text': 'EmailText', 'spam': 'Label'}
)

In [44]:
import joblib

# Preprocess data
# Turn the e-mail bodies into a sparse TF-IDF matrix, ignoring English stop words.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split done further down, so the learned vocabulary and IDF weights
# also reflect rows that end up in the test set. Consider fitting on the
# training split only.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['EmailText'])

# Use the labels exactly as stored in the 'Label' column. The classification
# reports in this notebook show the classes 0 and 1, so the column is not coded
# as the strings 'ham'/'spam'; mapping those strings would match nothing and
# turn every label into NaN.
y = data['Label']

# Save the fitted vectorizer as a pickle file.
# Kept as the last expression so the cell displays the written file name.
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [45]:
# Handle missing values in the target variable
# Compute the "label is present" mask once, as a plain NumPy boolean array, and
# use that same mask for both the feature matrix and the target so the two
# cannot drift apart.
has_label = data['Label'].notna().to_numpy()

# X must still contain one row per row of `data`. If this cell is re-run after
# X was already filtered, stop with an explicit message instead of failing
# inside the sparse-matrix indexing below.
if X.shape[0] != has_label.shape[0]:
    raise ValueError(
        f"X has {X.shape[0]} rows but data has {has_label.shape[0]}; "
        "re-run the vectorizer cell before filtering again."
    )

# Rows with a label (same rows that data['Label'].dropna() would keep)
y = data.loc[has_label, 'Label']

# Ensure X and y have the same length after dropping NaNs
X = X[has_label]

# Split data: 80% train / 20% test, fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
import joblib

# Fit a linear-kernel support vector classifier on the training split
svm_linear = SVC(kernel='linear').fit(X_train, y_train)

# Predictions on the test split, consumed later by the evaluation cell
y_pred_svm_linear = svm_linear.predict(X_test)

# Persist the fitted linear-kernel model to disk.
# Left as the final expression so the cell shows the written file name.
joblib.dump(svm_linear, 'model.pkl')


['model.pkl']

In [47]:
# Fit a support vector classifier with an RBF kernel on the training split
svm_rbf = SVC(kernel='rbf').fit(X_train, y_train)

# Predictions on the test split, consumed later by the evaluation cell
y_pred_svm_rbf = svm_rbf.predict(X_test)

In [48]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training split
nb = MultinomialNB().fit(X_train, y_train)

# Predictions on the test split, consumed later by the evaluation cell
y_pred_nb = nb.predict(X_test)

In [49]:
# Fit a logistic regression classifier (default settings) on the training split
lr = LogisticRegression().fit(X_train, y_train)

# Predictions on the test split, consumed later by the evaluation cell
y_pred_lr = lr.predict(X_test)

In [50]:
# Evaluate models
# Print a per-class precision / recall / F1 report for each SVM variant,
# comparing its test-split predictions against the true test labels.
for report_title, predictions in (
    ("SVM (Linear Kernel) Classification Report:", y_pred_svm_linear),
    ("SVM (RBF Kernel) Classification Report:", y_pred_svm_rbf),
):
    print(report_title)
    print(classification_report(y_test, predictions))



SVM (Linear Kernel) Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       856
           1       0.99      0.97      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146

SVM (RBF Kernel) Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       0.99      0.95      0.97       290

    accuracy                           0.99      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.99      0.99      0.99      1146



In [51]:
# Print a per-class precision / recall / F1 report for the Naive Bayes and
# logistic regression models, using their test-split predictions.
for report_title, predictions in (
    ("Naive Bayes Classification Report:", y_pred_nb),
    ("Logistic Regression Classification Report:", y_pred_lr),
):
    print(report_title)
    print(classification_report(y_test, predictions))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       856
           1       1.00      0.60      0.75       290

    accuracy                           0.90      1146
   macro avg       0.94      0.80      0.84      1146
weighted avg       0.91      0.90      0.89      1146

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       856
           1       1.00      0.89      0.94       290

    accuracy                           0.97      1146
   macro avg       0.98      0.94      0.96      1146
weighted avg       0.97      0.97      0.97      1146

