In [None]:
from typing import cast
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


In [None]:
# Retrieve input datasets.
# keep_default_na=False: by default read_csv converts entries such as "null",
# "nan" or "NA" to float NaN, which would silently remove those words from
# the vocabulary filter used during tokenization.
wordlist = set(
    pd.read_csv(
        "input_data/words.csv", encoding="UTF-8", keep_default_na=False
    )["words"]
)

# Build the mail table in a single chain instead of mutating it in place.
mails = (
    pd.read_csv("input_data/enron_spam_data.csv")
    .drop(columns=["Message ID", "Date", "Subject"])
    .dropna(subset=["Message"])  # a missing body leaves nothing to tokenize
    .rename(columns={"Message": "message", "Spam/Ham": "spam"})
)
# Boolean label: True where the original label is the string "spam".
mails["spam"] = mails["spam"].eq("spam")

# Shuffled 80/20 train/test split with a fixed seed for reproducibility.
mails_split = train_test_split(mails, test_size=0.2, random_state=123, shuffle=True)
train_data, test_data = cast(tuple[pd.DataFrame, pd.DataFrame], mails_split)
# Replace the shuffled original row labels with a fresh 0..n-1 index.
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Download the NLTK datasets: the stop-word corpus and the "punkt_tab"
# tokenizer data (presumably what word_tokenize relies on — confirm against
# the installed NLTK version).
nltk.download("stopwords")
nltk.download("punkt_tab")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [25]:
# Implement Naive Bayes using CountVectorizer + scikit-learn
# One shared stemmer: building a PorterStemmer per message is wasted work.
_STEMMER = PorterStemmer()


def clean_message(text: str) -> list[str]:
    """Lower-case, tokenize, filter and stem a single message.

    A token is kept only if it is purely alphabetic, is not in the
    module-level ``stop_words`` and is contained in the module-level
    ``wordlist``. The filters run on the *unstemmed* token; stemming is
    applied afterwards, so a returned stem is not itself guaranteed to be
    in ``wordlist``.

    Args:
        text: Raw message text.

    Returns:
        The Porter stems of the surviving tokens, in their original order.
    """
    # stop_words is a list; copy it into a set once per message so each
    # token is checked in O(1) instead of scanning the whole list.
    blocked = set(stop_words)
    return [
        _STEMMER.stem(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in blocked and token in wordlist
    ]

# Vectorize using our tokenizer so the same cleaning/filtering is applied
# when fitting on the training mails and when transforming the test mails.
# token_pattern=None: the default pattern is ignored once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a UserWarning.
# stop_words is applied by CountVectorizer to the tokenizer's output, i.e.
# to the stems returned by clean_message.
vectorizer = CountVectorizer(
    tokenizer=clean_message,
    token_pattern=None,
    lowercase=True,
    stop_words=list(stop_words),
)
# Learn the vocabulary on the training split only, then reuse it for the
# test split so both matrices share the same columns.
X_train = vectorizer.fit_transform(train_data["message"])
X_test = vectorizer.transform(test_data["message"])
y_train = train_data["spam"]
y_test = test_data["spam"]

In [26]:
# Fit a multinomial Naive Bayes classifier on the token-count matrix.
# alpha=1.0 is the additive smoothing parameter; fit() returns the estimator.
clf = MultinomialNB(alpha=1.0).fit(X_train, y_train)

# Score the held-out mails and report the fraction classified correctly.
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Naive Bayes (CountVectorizer) accuracy: {accuracy:.4f}")

Naive Bayes (CountVectorizer) accuracy: 0.9517
