In [9]:
@Au

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import os


# Preprocessing

In [10]:

def populateDF(dir_label):
    """Load every readable file under the given directories into one DataFrame.

    Parameters
    ----------
    dir_label : dict
        Maps a directory path to the category label ('ham' or 'spam')
        assigned to every file found directly inside that directory.

    Returns
    -------
    pandas.DataFrame
        One row per file that could be read, with the columns ``text``
        (file contents), ``category`` (the label from ``dir_label``) and
        ``category_num`` (0 for 'ham', 1 for 'spam', NaN for any other label).

    Notes
    -----
    Files are opened with the platform default text encoding. Entries that
    cannot be opened or decoded are reported on stdout and skipped, so the
    returned frame may hold fewer rows than there are directory entries.
    """
    label_to_num = {'ham': 0, 'spam': 1}
    records = []
    for dir_path, category in dir_label.items():
        # os.listdir returns names in arbitrary order; sorting keeps the row
        # order (and therefore any seeded train/test split) reproducible.
        for name in sorted(os.listdir(dir_path)):
            entry = dir_path+"/"+name
            try:
                with open(entry) as file:
                    text = file.read()
            except (OSError, UnicodeDecodeError):
                # Best effort: report the entry and carry on with the rest.
                print("file "+ entry + " was not readable")
                continue
            records.append({'text': text, 'category': category})

    # Build the frame once instead of appending row by row: DataFrame.append
    # no longer exists in pandas 2.x and repeated appends are quadratic.
    emails = pd.DataFrame(records, columns=["text", "category"])
    emails["category_num"] = emails["category"].map(label_to_num)
    return emails
# Labelled corpora used by the classifiers: each ham flavour paired with spam.
emails_easy_ham = populateDF({
    "easy_ham_2002": "ham",
    "spam_2002": "spam",
})
emails_hard_ham = populateDF({
    "hard_ham_2002": "ham",
    "spam_2002": "spam",
})

# Single-category corpora, used later to collect per-category word counts.
emails_spam = populateDF({"spam_2002": "spam"})
emails_easy_ham_only = populateDF({"easy_ham_2002": "ham"})
emails_hard_ham_only = populateDF({"hard_ham_2002": "ham"})


file spam_2002/0123.68e87f8b736959b1ab5c4b5f2ce7484a was not readable
file spam_2002/0273.51c482172b47ce926021aa7cc2552549 was not readable
file spam_2002/0330.a4df526233e524104c3b3554dd8ab5a8 was not readable
file spam_2002/0334.3e4946e69031f3860ac6de3d3f27aadd was not readable
file spam_2002/0335.9822e1787fca0741a8501bdef7e8bc79 was not readable
file spam_2002/0123.68e87f8b736959b1ab5c4b5f2ce7484a was not readable
file spam_2002/0273.51c482172b47ce926021aa7cc2552549 was not readable
file spam_2002/0330.a4df526233e524104c3b3554dd8ab5a8 was not readable
file spam_2002/0334.3e4946e69031f3860ac6de3d3f27aadd was not readable
file spam_2002/0335.9822e1787fca0741a8501bdef7e8bc79 was not readable
file spam_2002/0123.68e87f8b736959b1ab5c4b5f2ce7484a was not readable
file spam_2002/0273.51c482172b47ce926021aa7cc2552549 was not readable
file spam_2002/0330.a4df526233e524104c3b3554dd8ab5a8 was not readable
file spam_2002/0334.3e4946e69031f3860ac6de3d3f27aadd was not readable
file spam_2002/0335.

In [12]:
def preprocess(x, y, vectorizer, test_size=0.25, random_state=1):
    """Split the data into train/test parts and vectorize the text.

    Parameters
    ----------
    x : pandas.Series
        Raw email texts.
    y : pandas.Series
        Numeric class labels aligned with ``x``.
    vectorizer
        Object exposing ``fit_transform`` and ``transform`` (for example a
        ``CountVectorizer``). It is fitted in place on the training texts.
    test_size : float, optional
        Forwarded to ``train_test_split``; defaults to 0.25.
    random_state : int, optional
        Seed forwarded to ``train_test_split``; defaults to 1.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` where the ``x`` parts are the
        vectorized texts and both ``y`` parts are cast to an integer dtype.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    # The vocabulary is learned from the training texts only; the test texts
    # are merely projected onto it.
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)
    # Cast both label splits so they share one dtype (previously only the
    # training labels were converted).
    y_train = y_train.astype("int")
    y_test = y_test.astype("int")
    return x_train, x_test, y_train, y_test

# Unfiltered bag-of-words features, one independent vectorizer per dataset.
(x_train_easy, x_test_easy,
 y_train_easy, y_test_easy) = preprocess(
    emails_easy_ham["text"],
    emails_easy_ham["category_num"],
    CountVectorizer(),
)
(x_train_hard, x_test_hard,
 y_train_hard, y_test_hard) = preprocess(
    emails_hard_ham["text"],
    emails_hard_ham["category_num"],
    CountVectorizer(),
)



# Fitting and predictions

In [13]:
def train_multiNB(x_train, y_train):
    """Fit a default ``MultinomialNB`` on the training data and return it."""
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned straight from the call chain.
    return MultinomialNB().fit(x_train, y_train)


# One multinomial model per dataset (easy ham vs spam, hard ham vs spam).
multiNB_easy, multiNB_hard = (
    train_multiNB(x_train_easy, y_train_easy),
    train_multiNB(x_train_hard, y_train_hard),
)


In [36]:
def train_berNB(x_train, y_train, binarize=1.0):
    """Fit a ``BernoulliNB`` on the training data and return it.

    Parameters
    ----------
    x_train
        Training feature matrix (word counts in this notebook).
    y_train
        Training labels.
    binarize : float or None, optional
        Threshold handed to ``BernoulliNB``: feature values greater than the
        threshold become 1, all others 0. The default of 1.0 keeps this
        notebook's original setting, under which a word that occurs exactly
        once in an email counts as absent. Pass 0.0 for plain
        presence/absence features.

    Returns
    -------
    BernoulliNB
        The fitted classifier.
    """
    bernNB = BernoulliNB(binarize=binarize)
    bernNB.fit(x_train, y_train)
    return bernNB

# One Bernoulli model per dataset (easy ham vs spam, hard ham vs spam).
berNB_easy, berNB_hard = (
    train_berNB(x_train_easy, y_train_easy),
    train_berNB(x_train_hard, y_train_hard),
)

In [37]:
def predict(model, x_test):
    """Return whatever ``model.predict`` yields for ``x_test``."""
    return model.predict(x_test)

# Predictions of every unfiltered model on its own held-out split.
y_pred_multiNB_easy = predict(multiNB_easy, x_test_easy)
y_pred_multiNB_hard = predict(multiNB_hard, x_test_hard)

y_pred_berNB_easy = predict(berNB_easy, x_test_easy)
y_pred_berNB_hard = predict(berNB_hard, x_test_hard)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric so the value is unchanged, but the same pattern with
# an asymmetric metric (precision, recall, ...) would silently be wrong.
print("Accuracy of multiNB on easy ham: ", accuracy_score(y_test_easy.to_list(), y_pred_multiNB_easy))
print("Accuracy of multiNB on hard ham: ", accuracy_score(y_test_hard.to_list(), y_pred_multiNB_hard))
print("Accuracy of berNB on easy ham: ", accuracy_score(y_test_easy.to_list(), y_pred_berNB_easy))
print("Accuracy of berNB on hard ham: ", accuracy_score(y_test_hard.to_list(), y_pred_berNB_hard))

Accuracy of multiNB on easy ham:  0.9776902887139107
Accuracy of multiNB on hard ham:  0.9411764705882353
Accuracy of berNB on easy ham:  0.889763779527559
Accuracy of berNB on hard ham:  0.8449197860962567


# With filtered data

In [38]:
def find_words(common, words, vectorizer, num):
    """Return the ``num`` most or least frequent vocabulary terms.

    Parameters
    ----------
    common : bool
        True selects the terms with the highest total count, False the ones
        with the lowest.
    words : iterable of str
        Documents handed to ``vectorizer.fit_transform``.
    vectorizer
        Object exposing ``fit_transform`` and, once fitted, ``vocabulary_``
        (a mapping from term to column index).
    num : int
        Number of terms to keep.

    Returns
    -------
    pandas.Series
        The selected terms, ordered by total count.
    """
    doc_term = vectorizer.fit_transform(words)
    # Column-wise totals: one count per vocabulary entry, in a single row.
    totals = doc_term.sum(axis=0)

    rows = []
    for term, column in vectorizer.vocabulary_.items():
        rows.append((term, totals[0, column]))

    # Column 0 holds the term and column 1 its total count.
    table = pd.DataFrame(rows)
    ranked = table.sort_values(by=[1], ascending=not common)
    return ranked[0].iloc[:num]
# The 30 most and 30 least frequent terms of each single-category corpus.
common_spam = find_words(
    True, emails_spam["text"], CountVectorizer(), 30
).to_list()
uncommon_spam = find_words(
    False, emails_spam["text"], CountVectorizer(), 30
).to_list()

common_easy_ham = find_words(
    True, emails_easy_ham_only["text"], CountVectorizer(), 30
).to_list()
uncommon_easy_ham = find_words(
    False, emails_easy_ham_only["text"], CountVectorizer(), 30
).to_list()

common_hard_ham = find_words(
    True, emails_hard_ham_only["text"], CountVectorizer(), 30
).to_list()
uncommon_hard_ham = find_words(
    False, emails_hard_ham_only["text"], CountVectorizer(), 30
).to_list()

# Everything above concatenated into one flat list (duplicates are kept).
all_common_and_uncommon = [
    *common_spam,
    *uncommon_spam,
    *common_easy_ham,
    *uncommon_easy_ham,
    *common_hard_ham,
    *uncommon_hard_ham,
]

In [39]:
# Same split as before, but the collected terms are dropped as stop words.
(x_train_easy_filtered, x_test_easy_filtered,
 y_train_easy_filtered, y_test_easy_filtered) = preprocess(
    emails_easy_ham["text"],
    emails_easy_ham["category_num"],
    CountVectorizer(stop_words=all_common_and_uncommon),
)

(x_train_hard_filtered, x_test_hard_filtered,
 y_train_hard_filtered, y_test_hard_filtered) = preprocess(
    emails_hard_ham["text"],
    emails_hard_ham["category_num"],
    CountVectorizer(stop_words=all_common_and_uncommon),
)




In [40]:
# Retrain both model families on the stop-word-filtered features.
multiNB_easy_filtered, multiNB_hard_filtered = (
    train_multiNB(x_train_easy_filtered, y_train_easy_filtered),
    train_multiNB(x_train_hard_filtered, y_train_hard_filtered),
)

berNB_easy_filtered, berNB_hard_filtered = (
    train_berNB(x_train_easy_filtered, y_train_easy_filtered),
    train_berNB(x_train_hard_filtered, y_train_hard_filtered),
)


In [41]:
# Predictions of every filtered model on its own held-out split.
y_pred_multiNB_easy_filtered = predict(multiNB_easy_filtered, x_test_easy_filtered)
y_pred_multiNB_hard_filtered = predict(multiNB_hard_filtered, x_test_hard_filtered)

y_pred_berNB_easy_filtered = predict(berNB_easy_filtered, x_test_easy_filtered)
y_pred_berNB_hard_filtered = predict(berNB_hard_filtered, x_test_hard_filtered)

# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric so the value is unchanged, but the same pattern with
# an asymmetric metric (precision, recall, ...) would silently be wrong.
print("Accuracy of multiNB on filtered easy ham: ", accuracy_score(y_test_easy_filtered.to_list(), y_pred_multiNB_easy_filtered))
print("Accuracy of multiNB on filtered hard ham: ", accuracy_score(y_test_hard_filtered.to_list(), y_pred_multiNB_hard_filtered))
print("Accuracy of berNB on filtered easy ham: ", accuracy_score(y_test_easy_filtered.to_list(), y_pred_berNB_easy_filtered))
print("Accuracy of berNB on filtered hard ham: ", accuracy_score(y_test_hard_filtered.to_list(), y_pred_berNB_hard_filtered))

Accuracy of multiNB on filtered easy ham:  0.994750656167979
Accuracy of multiNB on filtered hard ham:  0.9946524064171123
Accuracy of berNB on filtered easy ham:  0.8753280839895013
Accuracy of berNB on filtered hard ham:  0.8342245989304813
