# Multinomial Naive Bayes Classification
This work is inspired from zacstewart.com and uses some methods and explanation from there.

In [56]:
import os
import numpy as np

from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

## Load email texts and label them

In [12]:

NEWLINE = '\n'
SKIP_FILES = {'cmds'}

#Read all files
def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        #method walk() generates the file names in a directory tree
        
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content


In [13]:
#Build data frame from raw data
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame


This function will build us a DataFrame from all the files in path. It will include the body text in one column and the class in another. Each row will be indexed by the corresponding email’s filename. 

In [14]:
data_folders_path = r"C:\Users\Being_Aerys\PycharmProjects\Machine_Learning_Algorithms_Collection\Supervised_Methods\Naive_Bayes_Classifier\Data"
#\U in "C:\Users... starts an eight-character Unicode escape, such as \U00014321. 
#The escape is followed by the character 's', which is invalid.
#You either need to duplicate all backslashes.
#r"C:\Users\Eric\Desktop\beeline.txt"

In [40]:
HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spam',        SPAM),
    ('data/spam_2',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/easy_ham_2',    HAM),
    ('data/hard_ham',    HAM),
#     ('data/beck-s',      HAM),
#     ('data/farmer-d',    HAM),
#     ('data/kaminski-v',  HAM),
#     ('data/kitchen-l',   HAM),
#     ('data/lokay-m',     HAM),
#     ('data/williams-w3', HAM),
#     ('data/BG',          SPAM),
#     ('data/GP',          SPAM),
#     ('data/SH',          SPAM)
]

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

#To shuffle the ham and spam indices
data = data.reindex(np.random.permutation(data.index),)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Increasing the size of the training set is just a matter of dumping a collection of emails into a directory and then adding it to SOURCES with an applicable class. The last thing we do is use DataFrame’s reindex to shuffle the whole dataset.

In [41]:
data.head()

Unnamed: 0,class,text
data/easy_ham\00949.5a860f580179b99a227c4064ac28724c,ham,--==_Exmh_1581673767P\n\nContent-Type: text/pl...
data/easy_ham\2505.550c8b2240659bb8bf54b17edea4a96b,ham,URL: http://boingboing.net/#85538591\n\nDate: ...
data/easy_ham_2\01312.715fe5f6f31d47697d5a8f625fd2e49f,ham,From: Matt Kettler <mkettler@evi-inc.com>\n\n\...
data/spam_2\01209.01df2f8f68a70062085ef787973f9ba0,spam,This is a multipart message in MIME format.\n\...
data/spam\0128.4da9b2cfacbe9bfd128aacbb526d68d4,spam,"<html>\n\n\n\n<head>\n\n<meta http-equiv=3D""Co..."


In [42]:
#Convert labels from text values to 0 and 1 values
data['class'] = data['class'].map({'ham': 0, 'spam': 1})

In [51]:
count_vectorizer = CountVectorizer(stop_words = 'english')
all_features = count_vectorizer.fit_transform(data['text'])

The vectorizer has learned the vocabulary of all the words in the email texts and also the count of each word in total in all the emails.

In [52]:
#count_vectorizer.vocabulary_

In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(all_features, data['class'], test_size = 0.3, random_state = 42)

In [66]:
classifier = MultinomialNB() #understand why using a multinomial Naive Bayes
classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)

num_acccurate = (Y_test == predictions).sum()
num_inaccurate = len(Y_test) - num_acccurate

print(f" Num of accurate predictions: {num_acccurate} and Num of inaccurate predictions {num_inaccurate}")

 Num of accurate predictions: 2751 and Num of inaccurate predictions 54


In [68]:
accuracy_percent = classifier.score(X_test, Y_test)
print(f"Accuracy Percentage: {accuracy_percent}")

Accuracy Percentage: 0.9807486631016042


Work on precision recall f1 score, other forms of vectorizer, prior that couldve been used, etc now.