In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


## Load Dataset

In [2]:
# Read the tab-separated file; column names are supplied here via `names`,
# giving a "label" column and a "text" column.
df = pd.read_csv(
    "spam.csv",
    sep="\t",
    names=["label", "text"],
)
# Preview the first rows to confirm the two columns parsed as expected.
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Convert Labels to Numbers

In [3]:
# Encode the string labels as integers for scikit-learn: ham -> 0, spam -> 1.
df["label_num"] = df["label"].map({"ham": 0, "spam": 1})

# Series.map leaves NaN for any value that is not a key of the dict, so fail
# loudly here instead of handing NaN targets to the classifiers later on.
unmapped_labels = df.loc[df["label_num"].isna(), "label"].unique()
assert len(unmapped_labels) == 0, f"Unexpected label values: {unmapped_labels!r}"

df.head()


Unnamed: 0,label,text,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Convert Text to Numerical Features

In [4]:
# Target vector: the numeric label column.
y = df["label_num"]

# Feature matrix: token counts per message, with English stop words removed.
# NOTE(review): the vocabulary is learned from every row of df["text"],
# including rows that the later train/test split assigns to the test set.
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["text"])


## Train–Test Split

In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


## Train KNN Model

In [6]:
# k-nearest-neighbours classifier voting over the 5 closest training messages.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)


## Evaluate KNN

In [9]:
# Predict on the held-out messages; the accuracy is the cell's last
# expression, so the notebook displays it as the cell output.
knn_pred = knn.predict(X_test)
accuracy_score(y_true=y_test, y_pred=knn_pred)


0.9167264895908112

In [10]:
# Per-class precision / recall / F1 for the KNN predictions.
print(classification_report(y_true=y_test, y_pred=knn_pred))


              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1207
           1       1.00      0.38      0.55       186

    accuracy                           0.92      1393
   macro avg       0.96      0.69      0.75      1393
weighted avg       0.92      0.92      0.90      1393



# DECISION TREE CLASSIFICATION

## Train Decision Tree

In [11]:
# Decision tree with a fixed seed so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=X_train, y=y_train)


## Evaluate Decision Tree

In [13]:
# Predict on the held-out messages with the fitted decision tree.
dt_pred = dt.predict(X_test)

# A notebook only auto-displays the *last* expression of a cell, so a bare
# accuracy_score(...) call in the middle of the cell is silently discarded.
# Print it explicitly so the accuracy shows up next to the report.
print("Accuracy:", accuracy_score(y_test, dt_pred))

# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_test, dt_pred))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1207
           1       0.95      0.85      0.90       186

    accuracy                           0.97      1393
   macro avg       0.96      0.92      0.94      1393
weighted avg       0.97      0.97      0.97      1393

