In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
import os


# Preprocessing

In [35]:

def populateDF(dir_label):
    emails = pd.DataFrame(columns={"text","category","category_num"})
    for dir_path in dir_label.keys():
        entries = os.listdir(dir_path)
        for entry in entries:
            entry = dir_path+"/"+entry
            with open(entry) as file:
                try:
                    text = file.read()
                except:
                    print("file "+ entry + " was not readable")
                    continue
                new_row = pd.DataFrame(data={'text': [text], 'category': [dir_label[dir_path]]})
                new_row['category_num'] = new_row.category.map({'ham':0, 'spam':1})
                emails = emails.append(new_row, ignore_index=True)
                  
    return emails
emails_easy_ham = populateDF({"easy_ham_2002":"ham","spam_2002":"spam"})
emails_hard_ham = populateDF({"hard_ham_2002":"ham","spam_2002":"spam"})


 am not 
sure how to verify if the DVD is CSS-encrypted. If someone has a suggestion, 
please let me know. However, if it is true, then it's very interesting that 
Disney has released such a major movie without encryption.




From rssfeeds@jmason.org  Tue Oct  8 10:55:34 2002
Return-Path: <rssfeeds@example.com>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id A0CCC16F17
	for <jm@localhost>; Tue,  8 Oct 2002 10:55:29 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Tue, 08 Oct 2002 10:55:29 +0100 (IST)
Received: from dogma.slashnull.org (localhost [127.0.0.1]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g9880QK06043 for
    <jm@jmason.org>; Tue, 8 Oct 2002 09:00:27 +0100
Message-Id: <200210080800.g9880QK06043@dogma.slashnull.org>
To: yyyy@example.com
From: boingboing <rssfeeds@example.com>
Subject: Reclaiming privacy with 

KeyboardInterrupt: 

In [45]:
x_train_easy, x_test_easy, y_train_easy, y_test_easy = train_test_split(emails_easy_ham["text"], emails_easy_ham["category_num"], test_size=0.25)
x_train_hard, x_test_hard, y_train_hard, y_test_hard = train_test_split(emails_hard_ham["text"], emails_hard_ham["category_num"], test_size=0.25)

vectorizer_easy = CountVectorizer()
vectorizer_hard = CountVectorizer()

x_train_easy = vectorizer_easy.fit_transform(x_train_easy)
fsdf = vectorizer_easy.get_feature_names()

x_test_easy = vectorizer_easy.transform(x_test_easy)
x_train_hard = vectorizer_hard.fit_transform(x_train_hard)
x_test_hard = vectorizer_hard.transform(x_test_hard)

y_train_easy = y_train_easy.astype("int")
y_train_hard = y_train_hard.astype("int")


<class 'scipy.sparse.csr.csr_matrix'>


# Fitting and predictions

In [24]:
multiNB_easy = MultinomialNB()
multiNB_hard = MultinomialNB()


multiNB_easy.fit(x_train_easy,y_train_easy)
multiNB_hard.fit(x_train_hard,y_train_hard)


y_pred_multiNB_easy = multiNB_easy.predict(x_test_easy)
y_pred_multiNB_hard = multiNB_hard.predict(x_test_hard)


print("Accuracy of multiNB on easy ham: ", accuracy_score(y_pred_multiNB_easy, y_test_easy.to_list()))
print("Accuracy of multiNB on hard ham: ", accuracy_score(y_pred_multiNB_hard, y_test_hard.to_list()))

Accuracy of multiNB on easy ham:  0.9711286089238845
Accuracy of multiNB on hard ham:  0.9358288770053476


In [32]:
bernNB_easy = BernoulliNB(binarize=1.0)
bernNB_hard = BernoulliNB(binarize=1.0)

bernNB_easy.fit(x_train_easy,y_train_easy)
bernNB_hard.fit(x_train_hard,y_train_hard)

y_pred_berNB_easy = bernNB_easy.predict(x_test_easy)
y_pred_berNB_hard = bernNB_hard.predict(x_test_hard)


print("Accuracy of berNB on easy ham: ", accuracy_score(y_pred_berNB_easy, y_test_easy.to_list()))
print("Accuracy of berNB on hard ham: ", accuracy_score(y_pred_berNB_hard, y_test_hard.to_list()))

Accuracy of berNB on easy ham:  0.8976377952755905
Accuracy of berNB on hard ham:  0.8716577540106952


# With filtered data

In [67]:
x_train_easy, x_test_easy, y_train_easy, y_test_easy = train_test_split(emails_easy_ham["text"], emails_easy_ham["category_num"], test_size=0.25)
x_train_hard, x_test_hard, y_train_hard, y_test_hard = train_test_split(emails_hard_ham["text"], emails_hard_ham["category_num"], test_size=0.25)

custom_vectorizer_easy = CountVectorizer(stop_words="english",max_df=500, min_df=1)
custom_vectorizer_hard = CountVectorizer(stop_words="english", max_df=1000, min_df=100)

x_train_easy = custom_vectorizer_easy.fit_transform(x_train_easy)
x_test_easy = custom_vectorizer_easy.transform(x_test_easy)

y_train_easy = y_train_easy.astype("int")

In [70]:
multiNB_easy_filtered = MultinomialNB()

multiNB_easy_filtered.fit(x_train_easy,y_train_easy)

pred = multiNB_easy_filtered.predict(x_test_easy)

print("Accuracy of multiNB on filtered easy ham:", accuracy_score(pred,y_test_easy.to_list()))



Accuracy of multiNB on filtered easy ham: 0.9881889763779528
