In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score


In [2]:
# Step 1: Load the dataset
data = pd.read_csv("email.csv")  # Replace with the path to your dataset
print(data.head())  # Check the first few rows of the dataset


   IsSpam                                               text
0       0  key issues going forwarda year end reviews rep...
1       0  congrats contratulations the execution the cen...
2       0   key issues going forwardall under control set...
3       0  epmi files protest entergy transcoattached our...
4       0  california power please contact kristin walsh ...


In [3]:
# Step 2: Data Preprocessing
# Clean the text data
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)           # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)          # Replace multiple spaces with a single space
    text = text.lower()                       # Convert to lowercase
    return text

In [4]:
data['text'] = data['text'].apply(preprocess_text)

In [6]:
# Step 3: Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)  # Adjust max_features as needed
X = vectorizer.fit_transform(data['text']).toarray()
y = data['IsSpam']  # The target column, where 1 indicates spam and 0 indicates not spam


In [7]:
# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Step 5: K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can tune 'n_neighbors' for better performance
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)


In [9]:
# Performance Evaluation for KNN
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)

In [10]:
print("K-Nearest Neighbors Performance:")
print(f"Accuracy: {knn_accuracy:.2f}")
print(f"Confusion Matrix:\n{knn_confusion}")
print(f"Precision: {knn_precision:.2f}")
print(f"Recall: {knn_recall:.2f}\n")

K-Nearest Neighbors Performance:
Accuracy: 0.93
Confusion Matrix:
[[88  8]
 [ 6 98]]
Precision: 0.92
Recall: 0.94



In [11]:
# Step 6: Support Vector Machine Classifier
svm_model = SVC(kernel='linear')  # Linear kernel is suitable for text classification
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [12]:
# Performance Evaluation for SVM
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)

In [13]:
print("Support Vector Machine Performance:")
print(f"Accuracy: {svm_accuracy:.2f}")
print(f"Confusion Matrix:\n{svm_confusion}")
print(f"Precision: {svm_precision:.2f}")
print(f"Recall: {svm_recall:.2f}\n")

Support Vector Machine Performance:
Accuracy: 0.97
Confusion Matrix:
[[ 91   5]
 [  1 103]]
Precision: 0.95
Recall: 0.99



In [14]:
# Step 7: Compare the Performance
print("Performance Comparison:")
print("Model\t\tAccuracy\tPrecision\tRecall")
print(f"KNN\t\t{knn_accuracy:.2f}\t\t{knn_precision:.2f}\t\t{knn_recall:.2f}")
print(f"SVM\t\t{svm_accuracy:.2f}\t\t{svm_precision:.2f}\t\t{svm_recall:.2f}")

Performance Comparison:
Model		Accuracy	Precision	Recall
KNN		0.93		0.92		0.94
SVM		0.97		0.95		0.99
