In [None]:
import pandas as pd

# Read the raw CSV; the encoding is passed explicitly as ISO-8859-1 (Latin-1).
data = pd.read_csv(
    "./spam.csv",
    encoding="ISO-8859-1",
)
# Summarise the column names, dtypes and non-null counts.
data.info()

In [None]:
# Preview the first five rows to see how the CSV was parsed.
data.head()

In [None]:
# The "Unnamed" columns look suspicious, so list the distinct values found in
# each of them to see what they actually contain.

cols = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]

# Walk the three columns as (name, Series) pairs; the heading, the values and
# a trailing blank gap are emitted by a single print call per column.
for col, series in data[cols].items():
    uniq_vals = series.unique()
    print(f"{len(uniq_vals)} unique value(s) for {col}:", uniq_vals, "\n", sep="\n")

In [None]:
# Seems like the values in these unnamed cols are a result of
# badly structured CSV. Let's merge them so we end up with 2 columns:
# v1: spam/ham
# v2: email content

# first, clean NaN values and merge all 3 unnamed cols into one.
# The NaNs must be replaced with "" in each column *before* concatenating:
# NaN + str is NaN in pandas, so concatenating first and filling afterwards
# would erase the text of every row where at least one unnamed column is
# empty. fillna() also returns a new object rather than modifying `data`,
# so its result has to be used directly.
resulting_col = data[cols[0]].fillna("")
for extra_col in cols[1:]:
    resulting_col = resulting_col + data[extra_col].fillna("")

# then, update v2 col by appending the recovered text
data["v2"] += resulting_col

# finally, drop the unnecessary cols:
# NOTE: this cell is not re-runnable on its own -- once the columns are
# dropped, re-run the notebook from the CSV load onwards.
data = data.drop(columns=cols)

In [None]:
# Re-inspect the first rows now that the unnamed columns have been merged and dropped.
data.head()

In [None]:
# let's add a numerical label
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the distinct class names and keep a name -> code lookup
# so the integer labels can be traced back to the original class names.
labels = data["v1"].unique()
enc = LabelEncoder().fit(labels)
label_map = {
    name: code
    for name, code in zip(enc.classes_, enc.transform(enc.classes_))
}
print(label_map)

# Store the encoded class of every row next to its text label.
data["labels"] = enc.transform(data["v1"])
print(data.head())

In [None]:
# we'll split the dataset into a train (80%) and test (20%)

from sklearn.model_selection import train_test_split
# Features are the message texts, targets are the encoded labels; the fixed
# random_state makes the split reproducible across runs.
texts, targets = data["v2"], data["labels"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=69,
)

In [None]:
# convert the email texts into TF-IDF feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts so both share one feature space.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# train the model
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the vectorized training texts.
classifier = MultinomialNB()
classifier.fit(X=X_train_vectorized, y=y_train)

In [None]:
# test the trained model over the test data and evaluate

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the encoded label of every held-out message.
y_pred = classifier.predict(X_test_vectorized)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)

# Each heading is printed on its own line, followed by its value.
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", classification_rep, sep="\n")

In [None]:
# Let's try the same, this time using a different model:
# k-Means clustering into 2 clusters
# We'll use a different approach to evaluate the model

# Use a dedicated vectorizer for the clustering experiment. Re-fitting the
# shared `vectorizer` on the full corpus would replace the vocabulary and IDF
# weights the Naive Bayes classifier was trained against, so any later
# `vectorizer.transform(...)` would no longer match that classifier.
kmeans_vectorizer = TfidfVectorizer(stop_words='english')
X = kmeans_vectorizer.fit_transform(data["v2"])

In [None]:
# train the model
from sklearn.cluster import KMeans

# Cluster the vectorized texts into two groups (fixed seed for
# reproducibility) and record each row's cluster index in a "result" column.
kmeans = KMeans(n_clusters=2, random_state=69)
cluster_ids = kmeans.fit_predict(X)
data["result"] = cluster_ids

In [None]:
# Preview the frame, which now carries each row's cluster id in the "result" column.
data.head()

In [None]:
def counter(expected_label, actual_label, predicted=None):
    """Count rows whose true class is `expected_label` and whose prediction is `actual_label`.

    Args:
        expected_label: Class name to match against the "v1" column.
        actual_label: Predicted class name; must be a key of `label_map`.
        predicted: Optional Series of encoded predictions, index-aligned with
            `data`. Defaults to the raw cluster ids in data["result"].

    Returns:
        The number of matching rows, as an int.
    """
    if predicted is None:
        predicted = data["result"]
    matches = (data["v1"] == expected_label) & (predicted == label_map[actual_label])
    return int(matches.sum())


# k-means numbers its clusters arbitrarily, so cluster 1 is not guaranteed to
# be the "spam" cluster. With two clusters there are only two possible
# cluster -> class assignments: keep the one that agrees with more of the
# known labels. (Assumes exactly two classes, encoded as 0 and 1.)
predicted_labels = data["result"]
if (predicted_labels == data["labels"]).mean() < 0.5:
    predicted_labels = 1 - predicted_labels

true_positives = counter("spam", "spam", predicted_labels)
true_negatives = counter("ham", "ham", predicted_labels)
false_positives = counter("ham", "spam", predicted_labels)
false_negatives = counter("spam", "ham", predicted_labels)

# Guard every ratio: if nothing was predicted as spam (or no spam exists),
# report 0.0 instead of raising ZeroDivisionError.
predicted_spam = true_positives + false_positives
actual_spam = true_positives + false_negatives

precision = true_positives / predicted_spam if predicted_spam else 0.0
recall = true_positives / actual_spam if actual_spam else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"f1: {f1}")