In [20]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)
from scipy.sparse import hstack



In [23]:
df = pd.read_csv("spamv2.csv", encoding='latin-1')
df=df[["v1", "v2"]]
df.columns = ["label", "text"]
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok larÂ… then Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

df["text"] = df["text"].apply(clean_text)

# encode labels
df["label"] = df["label"].map({"ham":0, "spam":1})


In [25]:
df["length"] = df["text"].apply(len)
df["num_links"] = df["text"].str.count("http")
df["num_digits"] = df["text"].str.count(r"\d")


In [26]:
X_text = df["text"]
y = df["label"]

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [27]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=4000,
    ngram_range=(1,2)
)

X_train_vec = vectorizer.fit_transform(X_train_text)
X_test_vec = vectorizer.transform(X_test_text)


In [28]:
extra_train = df.loc[X_train_text.index, ["length","num_links","num_digits"]].values
extra_test  = df.loc[X_test_text.index,  ["length","num_links","num_digits"]].values

X_train_final = hstack([X_train_vec, extra_train])
X_test_final  = hstack([X_test_vec, extra_test])


In [31]:
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric="cosine"
)

knn.fit(X_train_final, y_train)
knn_pred = knn.predict(X_test_final)
print("KNN Accuracy:", accuracy_score(y_test, knn_pred))
print("KNN Classification Report:\n", classification_report(y_test, knn_pred))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, knn_pred))


KNN Accuracy: 0.9587443946188341
KNN Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       966
           1       0.83      0.87      0.85       149

    accuracy                           0.96      1115
   macro avg       0.91      0.92      0.91      1115
weighted avg       0.96      0.96      0.96      1115

KNN Confusion Matrix:
 [[940  26]
 [ 20 129]]


## Decision Tree classifier model

In [30]:
dt = DecisionTreeClassifier(
    max_depth=25,
    criterion="entropy",
    random_state=42
)

dt.fit(X_train_final, y_train)
dt_pred = dt.predict(X_test_final)

print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))


Decision Tree Accuracy: 0.9730941704035875
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.91      0.89      0.90       149

    accuracy                           0.97      1115
   macro avg       0.95      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115

