# Multinomial Naive Bayes Classification
This work is inspired by zacstewart.com and uses some code and explanations from there.

## Import libraries

In [1]:
import copy
import os

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## Helper functions

In [2]:
# Custom train/test split indices generator

def train_test_split_indices(len_data, test_split_size = 0.3):
    """Split the positions 0..len_data-1 into shuffled training and testing index arrays.

    Parameters
    ----------
    len_data : int
        Number of samples to split.
    test_split_size : float, default 0.3
        Fraction of the samples (between 0 and 1) assigned to the test set.
        The test set holds int(test_split_size * len_data) samples.

    Returns
    -------
    (training_indices, testing_indices) : tuple of numpy.ndarray
        Disjoint integer position arrays that together cover every position once.

    Raises
    ------
    ValueError
        If test_split_size is outside [0, 1].
    """
    if not 0 <= test_split_size <= 1:
        raise ValueError(
            f"test_split_size must be between 0 and 1, got {test_split_size}"
        )

    # A private, fixed-seed generator yields the same permutation as
    # np.random.seed(42) + np.random.permutation, without resetting the global
    # NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(42)
    shuffled_data_indices = rng.permutation(len_data)

    total_test_data = int(test_split_size * len_data)
    testing_indices = shuffled_data_indices[:total_test_data]
    training_indices = shuffled_data_indices[total_test_data:]

    return training_indices, testing_indices
    

In [3]:

NEWLINE = '\n'
SKIP_FILES = {'cmds'}

#Read all files
def read_files(path):
    """Yield (file_path, body_text) for every file found under `path`.

    The directory tree is traversed with os.walk, which already descends into
    every sub-directory, so no manual recursion is needed. Files whose name is
    in SKIP_FILES are ignored.

    For each file, everything up to and including the first empty line is
    treated as the header and dropped; the remaining lines form the body.

    Parameters
    ----------
    path : str
        Root directory to scan.

    Yields
    ------
    (str, str)
        The path of the file and its body text.
    """
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue

            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue

            past_header, lines = False, []
            # `with` guarantees the handle is closed even if reading fails part-way
            with open(file_path, encoding="latin-1") as f:
                for line in f:
                    if past_header:
                        lines.append(line)
                    elif line == NEWLINE:
                        # first blank line marks the end of the header
                        past_header = True

            # NOTE: each collected line still ends with '\n', so joining with NEWLINE
            # puts an extra blank line between lines. Kept as-is so the extracted
            # text stays unchanged.
            content = NEWLINE.join(lines)
            yield file_path, content


In [4]:
#Build data frame from raw data
def build_data_frame(path, classification):
    """Collect every email under `path` into a DataFrame labelled `classification`.

    The frame has a 'text' column (the email body) and a 'class' column
    (the given label), and is indexed by the path of each source file.
    """
    emails = list(read_files(path))

    file_paths = [file_path for file_path, _ in emails]
    records = [{'text': body, 'class': classification} for _, body in emails]

    return DataFrame(records, index=file_paths)


This function will build us a DataFrame from all the files in path. It will include the body text in one column and the class in another. Each row will be indexed by the corresponding email’s filename. 

## Execution begins.

In [5]:
# NOTE(review): machine-specific absolute path; the visible cells below load from the
# relative 'data/...' folders in SOURCES and never read this variable.
data_folders_path = r"C:\Users\Being_Aerys\PycharmProjects\Machine_Learning_Algorithms_Collection\Supervised_Methods\Naive_Bayes_Classifier\Data"
# Why the r"" (raw string) prefix: in a normal string literal, the \U in "C:\Users..." starts
# an eight-character Unicode escape, such as \U00014321.
# Here the escape is followed by the character 's', which is invalid, so Python rejects the literal.
# You either need to double every backslash ("C:\\Users\\...") or use a raw string, e.g.
# r"C:\Users\Eric\Desktop\beeline.txt"

In [6]:
HAM = 'ham'
SPAM = 'spam'

# (directory, label) pairs: every file found under a directory receives that label
SOURCES = [
    ('data/spam',        SPAM),
    ('data/spam_2',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/easy_ham_2',    HAM),
    ('data/hard_ham',    HAM),
]

# Fixed seed so that "Restart & Run All" reproduces the same row order
SHUFFLE_SEED = 42

# Build one frame per source and concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
source_frames = [
    build_data_frame(source_path, source_label)
    for source_path, source_label in SOURCES
]
data = pd.concat(source_frames, sort=False)

#To shuffle the ham and spam rows (by position, so it does not depend on the index labels)
shuffled_positions = np.random.RandomState(SHUFFLE_SEED).permutation(len(data))
data = data.iloc[shuffled_positions]

#Replace the file-path index with a 0..n-1 integer index named "index"
data = data.reset_index(drop=True).rename_axis('index')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Increasing the size of the training set is just a matter of dumping a collection of emails into a directory and then adding it to SOURCES with an applicable class. The last thing we do is use DataFrame’s reindex to shuffle the whole dataset.

In [7]:
# Sanity check: first rows of the shuffled frame ('class' still holds 'ham'/'spam' here)
data.head()

Unnamed: 0_level_0,class,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
1,ham,http://news.bbc.co.uk/1/hi/world/asia-pacific/...
2,ham,What I understood was that the activists on th...
3,ham,URL: http://scriptingnews.userland.com/backiss...
4,spam,*****************BANNEDCD:::::::::::::::::::::...


In [8]:
#Convert labels from text values to 0 and 1 values
# Labels that are already 0/1 pass through unchanged, so re-running this cell is harmless
# (a plain .map(dict) would turn the already-converted column into NaN on a second run).
label_to_int = {HAM: 0, SPAM: 1}
data['class'] = data['class'].map(lambda label: label_to_int.get(label, label))
assert data['class'].isin([0, 1]).all(), "unexpected value in the 'class' column"

In [9]:
# Confirm that the 'class' column now holds 0 (ham) / 1 (spam) instead of text labels
data.head()

Unnamed: 0_level_0,class,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...
1,0,http://news.bbc.co.uk/1/hi/world/asia-pacific/...
2,0,What I understood was that the activists on th...
3,0,URL: http://scriptingnews.userland.com/backiss...
4,1,*****************BANNEDCD:::::::::::::::::::::...


In [10]:
# Independent snapshot of the frame as it stands after label encoding
pure_data = data.copy(deep=True)

## Count Vectorizer with stop words filter

In [11]:
# Learn the vocabulary (English stop words removed) and build the email-by-word count matrix.
# NOTE(review): the vocabulary is fitted on every email, including the rows that end up in
# the test split further down; the Pipeline section fits the vectorizer on training rows only.
count_vectorizer = CountVectorizer(stop_words = 'english')
all_features = count_vectorizer.fit_transform(data['text'])

The vectorizer has learned the vocabulary of all the words in the email texts, and `all_features` is a sparse matrix that holds, for every email, the number of times each vocabulary word occurs in it.

In [12]:
#count_vectorizer.vocabulary_

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    all_features,
    data['class'],
    test_size=0.3,
    random_state=42,
)

In [14]:
# fit() returns the estimator itself, so construction and training can be chained
classifier = MultinomialNB().fit(X_train, Y_train)

# Predicted 0/1 labels for the held-out rows
predictions = classifier.predict(X_test)

## Calculate performance metrics

In [103]:
# Element-wise comparison of true and predicted labels; each True counts as 1 when summed
is_correct = (Y_test == predictions)
num_acccurate = is_correct.sum()
num_inaccurate = (~is_correct).sum()

print(f" Num of accurate predictions: {num_acccurate} and Num of inaccurate predictions {num_inaccurate}")

 Num of accurate predictions: 2725 and Num of inaccurate predictions 80


In [104]:
# Accuracy of the classifier on the held-out rows.
# NOTE: despite the variable name and the printed label, the value is a fraction
# (0.97... in the recorded output), not a percentage.
accuracy_percent = classifier.score(X_test, Y_test)
print(f"Accuracy Percentage: {accuracy_percent}")

Accuracy Percentage: 0.9714795008912656


In [105]:
# Precision, recall and F1 on the held-out rows. With scikit-learn's default pos_label=1
# the positive class is the label 1, which is 'spam' after the encoding above.
precision = precision_score(Y_test, predictions)
recall = recall_score(Y_test, predictions)
# Spelled f1_scoree (extra 'e') so the imported f1_score function is not overwritten;
# that function is called again in the cross-validation cells.
f1_scoree = f1_score(Y_test, predictions)
print(f"Precision: {precision}, Recall: {recall} f1_score: {f1_scoree}")

Precision: 0.9730941704035875, Recall: 0.9130434782608695 f1_score: 0.9421128798842258


## Let's do some ham/spam classification on the fly.

In [106]:
#Following are some real ham/ spam samples from my own OSU email inbox.
samples = [
    # --- first three: taken from the spam folder (expected prediction 1) ---
    "Win $500 in 2 minutes View this email in the browser. We want to hear from you. Take our 1-2 minute OSU student survey. ",
    "CI Partnerships*Corrections* Dear Professor Adhikari, With reference to my previous email, we have discovered an error that may have",
    "Sierra Live Construction Tour SIERRA SPIRIT TOUR for a $10 starbucks gift card.",
    # --- last three: taken from the regular inbox (expected prediction 0) ---
    "Dear All I am trying to get a RAID controller for star. Currently it is using CentOS software RAID and that is unreliable. Please give me some time to obtain the controller.",
    "Hi Aashish, The Sprintax team wishes you a happy Global Pride Day! We hope you enjoy a weekend filled with fun and activities. PS: don’t forget to prepare your taxes! The US tax deadline is just a couple of weeks away.",
    "MyOregonState is a new digital experience just for you, one that's modern, friendly and personalized. It will be officially replacing MyOSU this summer — and this is just the beginning."
]

In [107]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the samples are
# encoded with the same vocabulary the classifier was trained on.
vectorized_samples = count_vectorizer.transform(samples)
# Last expression of the cell: the predicted 0/1 label of each sample is displayed
classifier.predict(vectorized_samples)

array([0, 0, 1, 0, 0, 0], dtype=int64)

The first three samples were from my spam folder, while the remaining three were from my regular inbox.
The model correctly classified all three ham emails, but it flagged only one of the three spam emails (the third) and missed the other two.

# Pipelining

In [15]:
# Pipeline is imported together with the other libraries in the import cell at the top.
# Chaining the vectorizer and the classifier means fit() learns the vocabulary from the
# training rows only, and predict() encodes new text with that same vocabulary.
pipeline = Pipeline([
    ('count_vectorizer',  CountVectorizer(stop_words = "english")),
    ('multinomial_NB_classifier',  MultinomialNB()) ])

training_indices, testing_indices = train_test_split_indices(len(data), 0.3)

# Select each subset once instead of repeating the .iloc lookup per column
train_rows = data.iloc[training_indices]
test_rows = data.iloc[testing_indices]

pipeline.fit(train_rows['text'], train_rows['class'])
pipeline_predictions = pipeline.predict(test_rows['text'])

#Now calculate the performance metrics as above if you like

# Cross-Validation

In [133]:
# shuffle=False keeps the 8 folds as contiguous slices of the frame (its rows were already
# shuffled when the frame was built). random_state is deliberately not passed: it only has
# an effect when shuffle=True, and current scikit-learn raises ValueError for
# shuffle=False combined with a random_state.
k_fold = KFold(n_splits = 8, shuffle = False)
#returns training and testing indices iterator for k folds

f1_scores = []
# Running 2x2 confusion matrix (rows: true 0/1, columns: predicted 0/1)
confusion = np.zeros((2, 2), dtype=int)

for train_indices, test_indices in k_fold.split(data):
    train_rows = data.iloc[train_indices]
    test_rows = data.iloc[test_indices]

    train_text, train_y = train_rows['text'], train_rows['class']
    test_text, test_y = test_rows['text'], test_rows['class']

    # Refit from scratch on this fold's training rows
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    #We will consider each iteration of k-fold as a different set and thus add onto the previous confusion matrix
    # labels=[0, 1] forces a 2x2 result even if a fold happens to contain a single class
    confusion += confusion_matrix(test_y, predictions, labels=[0, 1])
    f1score = f1_score(test_y, predictions)
    f1_scores.append(f1score)

# Report the mean F1 over the folds and the summed confusion matrix
print(f"f1 Score: {sum(f1_scores)/len(f1_scores)}\n Confusion Matrix: {confusion}")

f1 Score:, 0.9571283432540243
 Confusion Matrix: [[6925   26]
 [ 173 2225]]


# Improving Accuracy
Above, we used count vectorization: the words from all the emails/documents together formed a vocabulary. For each document (training sample), every vocabulary word acted as a feature whose value is the number of times that word occurs in the document, and 0 for the words that do not occur in it. This was an example of count vectorization with unigrams, i.e., we used a single word as one feature/token. However, this can be changed so that two consecutive words also form a token (and thus the order of the words is taken into account as well, unlike in the unigram model above). This new method is called count vectorization with bigrams. In practice, using bigrams along with unigrams often helps to improve accuracy in text classification.

In [18]:
print("*********************************Using both unigrams and bigrams*********************************")

# Pipeline is imported together with the other libraries in the import cell at the top.
# ngram_range=(1, 2) makes the vectorizer emit single words and pairs of consecutive words.
pipeline = Pipeline([
    ('count_vectorizer',  CountVectorizer(stop_words = "english", ngram_range=(1, 2))),
    ('multinomial_NB_classifier',  MultinomialNB()) ])

training_indices, testing_indices = train_test_split_indices(len(data), 0.3)

# Single hold-out fit; its predictions are stored but not scored in this cell
train_rows = data.iloc[training_indices]
test_rows = data.iloc[testing_indices]
pipeline.fit(train_rows['text'], train_rows['class'])
pipeline_predictions = pipeline.predict(test_rows['text'])


# shuffle=False keeps the 8 folds as contiguous slices of the frame. random_state is
# deliberately not passed: it only has an effect when shuffle=True, and current
# scikit-learn raises ValueError for shuffle=False combined with a random_state.
k_fold = KFold(n_splits = 8, shuffle = False)
#returns training and testing indices iterator for k folds

f1_scores = []
# Running 2x2 confusion matrix (rows: true 0/1, columns: predicted 0/1)
confusion = np.zeros((2, 2), dtype=int)

for train_indices, test_indices in k_fold.split(data):
    fold_train = data.iloc[train_indices]
    fold_test = data.iloc[test_indices]

    train_text, train_y = fold_train['text'], fold_train['class']
    test_text, test_y = fold_test['text'], fold_test['class']

    # Refit from scratch on this fold's training rows
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    #We will consider each iteration of k-fold as a different set and thus add onto the previous confusion matrix
    # labels=[0, 1] forces a 2x2 result even if a fold happens to contain a single class
    confusion += confusion_matrix(test_y, predictions, labels=[0, 1])
    f1score = f1_score(test_y, predictions)
    f1_scores.append(f1score)

# Report the mean F1 over the folds and the summed confusion matrix
print(f"With both unigrams and bigrams: \nf1 Score: {sum(f1_scores)/len(f1_scores)}\n Confusion Matrix: {confusion}")

*********************************Using both unigrams and bigrams*********************************
With both unigrams and bigrams: 
f1 Score: 0.980260936850388
 Confusion Matrix: [[6931   20]
 [  74 2324]]


Further Exploration Topics:

1. Bernoulli Naive Bayes with a binarization threshold
2. TF-IDF rather than a Count Vectorizer