In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score,
)


In [2]:
# Read the e-mail table from the CSV next to the notebook and report its
# (rows, columns) size as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="emails.csv")
print("Data shape:", df.shape)

Data shape: (5172, 3002)


In [3]:
# Summarise the frame (index range, column count, dtypes, memory usage)
# without printing any rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [8]:
# Peek at the first five and last three column labels rather than listing
# every column of the frame.
print("Columns:", df.columns[:5], "...", df.columns[-3:])

Columns: Index(['Email No.', 'the', 'to', 'ect', 'and'], dtype='object') ... Index(['ff', 'dry', 'Prediction'], dtype='object')


In [10]:
# Features: every column except the first and the last.
# Labels: the last column only.
X = df.iloc[:, 1:-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
# Sanity-check the arrays handed to the models.
print("Feature matrix X shape:", X.shape)
print("Label vector y shape:", y.shape)
# Class balance: np.bincount returns one count per integer label value,
# so index 0 is the count of label 0 and index 1 the count of label 1.
# Assumes y holds non-negative integer labels (np.bincount rejects anything else).
print("Spam counts:", np.bincount(y))


Feature matrix X shape: (5172, 3000)
Label vector y shape: (5172,)
Spam counts: [3672 1500]


In [12]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

In [13]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so nothing from the test split is used
# when fitting the scaler.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# k-nearest-neighbours classifier (k = 5), fitted on the scaled training split
# and used to label the scaled test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

In [15]:
# RBF-kernel support vector classifier, fitted on the scaled training split and
# used to label the scaled test split.
# NOTE(review): probability=True is requested here, but the cells visible in
# this notebook only call predict() - confirm whether probabilities are needed.
svm = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

In [16]:
def evaluate_model(name, y_true, y_pred, y_proba=None):
    """Print a standard set of classification metrics for one model.

    Prints, in order: a header containing ``name``, the accuracy rounded to
    four decimals, the classification report, the confusion matrix and, when
    ``y_proba`` is supplied, the ROC AUC rounded to four decimals.

    Parameters
    ----------
    name
        Label shown in the header line.
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, compared against ``y_true``.
    y_proba
        Optional scores forwarded to ``roc_auc_score`` together with
        ``y_true``. When ``None`` (the default) the ROC AUC line is skipped.

    Returns
    -------
    None
        The function only prints.
    """
    print("=== Evaluation:", name, "===")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
    if y_proba is not None:
        try:
            auc = roc_auc_score(y_true, y_proba)
        except ValueError as e:
            # Only the metric's own input-validation errors are reported and
            # skipped. A blanket `except Exception` previously hid the fact
            # that roc_auc_score had never been imported.
            print("ROC AUC could not be computed:", e)
        else:
            print("ROC AUC:", round(auc, 4))
    print("\n")

In [18]:
# Report test-split metrics for both fitted classifiers with the shared helper.
# No scores are passed, so the helper's optional ROC AUC step is skipped.
evaluate_model(
    name="K-Nearest Neighbors (k=5)",
    y_true=y_test,
    y_pred=y_pred_knn,
)
evaluate_model(
    name="Support Vector Machine (RBF)",
    y_true=y_test,
    y_pred=y_pred_svm,
)

=== Evaluation: K-Nearest Neighbors (k=5) ===
Accuracy: 0.8338

Classification report:
              precision    recall  f1-score   support

           0     0.9779    0.7837    0.8701       735
           1     0.6435    0.9567    0.7694       300

    accuracy                         0.8338      1035
   macro avg     0.8107    0.8702    0.8198      1035
weighted avg     0.8810    0.8338    0.8409      1035

Confusion matrix:
 [[576 159]
 [ 13 287]]


=== Evaluation: Support Vector Machine (RBF) ===
Accuracy: 0.9304

Classification report:
              precision    recall  f1-score   support

           0     0.9149    0.9946    0.9531       735
           1     0.9831    0.7733    0.8657       300

    accuracy                         0.9304      1035
   macro avg     0.9490    0.8839    0.9094      1035
weighted avg     0.9346    0.9304    0.9277      1035

Confusion matrix:
 [[731   4]
 [ 68 232]]


