In [0]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [0]:
# This function builds the vocabulary: the 3000 most common words across the email files in a directory. It counts every whitespace-separated token, discards tokens that are not purely alphabetic (numbers, punctuation, mixed tokens) as well as single-character tokens, and returns the most frequent remaining words together with their counts.

def make_Dictionary(root_dir, vocab_size=3000):
  """Build a vocabulary of the most frequent words in the files of a directory.

  Every regular file directly inside ``root_dir`` is read line by line and
  split on whitespace. A token is counted only if it is purely alphabetic
  (``str.isalpha``) and longer than one character.

  Args:
    root_dir: Directory containing the email text files.
    vocab_size: Maximum number of words to keep (default 3000). ``None``
      keeps every counted word.

  Returns:
    A list of ``(word, count)`` tuples ordered from most to least frequent,
    as produced by ``Counter.most_common``.
  """
  word_counts = Counter()
  # os.listdir returns entries in arbitrary order; sorting makes tie-breaking
  # between equally frequent words reproducible from run to run.
  for name in sorted(os.listdir(root_dir)):
    path = os.path.join(root_dir, name)
    # Skip subdirectories (e.g. the .ipynb_checkpoints folder Jupyter creates),
    # which open() cannot read.
    if not os.path.isfile(path):
      continue
    with open(path) as mail:
      for line in mail:
        # Filter while counting instead of collecting every token of the
        # corpus in one list and deleting the unwanted keys afterwards.
        word_counts.update(
            word for word in line.split()
            if word.isalpha() and len(word) > 1
        )
  return word_counts.most_common(vocab_size)



In [0]:
# This function builds the feature matrix and the label vector for a directory of emails. Each email becomes one row whose columns hold how often each dictionary word occurs in the third line of the file. The label is derived from the file name: files whose name starts with "spmsg" are labelled spam (1), all others non-spam (0). It is called once for the training directory and once for the test directory.

def extract_features(mail_dir, vocabulary=None, num_features=3000):
  """Turn a directory of email files into a feature matrix and a label vector.

  For every regular file in ``mail_dir`` only the third line (index 2) is
  read (presumably the message body in this dataset's layout; confirm).
  Each vocabulary word found on that line gets its occurrence count written
  into the column matching the word's position in the vocabulary.

  Args:
    mail_dir: Directory containing the email text files.
    vocabulary: Sequence of ``(word, count)`` entries such as the list
      returned by ``make_Dictionary``; only the word and its position are
      used. Words are expected to be unique. When omitted, the module-level
      name ``dictionary`` is used, which is what this function always read
      before the parameter existed.
    num_features: Number of columns in the feature matrix (default 3000).

  Returns:
    ``(features_matrix, train_labels)``: a ``(n_files, num_features)`` float
    array of word counts and a ``(n_files,)`` float array that is 1 for
    files whose name starts with ``"spmsg"`` and 0 otherwise. Rows follow
    the sorted file names.

  Raises:
    NameError: if no vocabulary is passed and no global ``dictionary`` exists.
  """
  if vocabulary is None:
    try:
      vocabulary = dictionary
    except NameError:
      raise NameError(
          "extract_features needs a vocabulary: pass one explicitly or bind "
          "the result of make_Dictionary() to the global name 'dictionary'"
      ) from None

  # One dict lookup per word replaces a scan of the whole vocabulary.
  word_to_column = {entry[0]: column for column, entry in enumerate(vocabulary)}

  # Sorted for a reproducible row order; non-files (e.g. .ipynb_checkpoints)
  # are skipped because open() cannot read them.
  mail_paths = [
      path
      for path in (os.path.join(mail_dir, name)
                   for name in sorted(os.listdir(mail_dir)))
      if os.path.isfile(path)
  ]

  features_matrix = np.zeros((len(mail_paths), num_features))
  train_labels = np.zeros(len(mail_paths))

  for docID, path in enumerate(mail_paths):
    with open(path) as mail:
      for line_number, line in enumerate(mail):
        if line_number == 2:
          for word, occurrences in Counter(line.split()).items():
            column = word_to_column.get(word)
            if column is not None:
              features_matrix[docID, column] = occurrences
          break  # nothing after the third line is used
    # basename instead of split('/') so the check also works where the path
    # separator is not '/'.
    if os.path.basename(path).startswith("spmsg"):
      train_labels[docID] = 1
  return features_matrix, train_labels


In [20]:
# This cell runs the pipeline: it builds the dictionary from the training emails, extracts the feature matrices and labels for the training and test sets, fits a Gaussian Naive Bayes model with model.fit, predicts labels for the test set, and prints the accuracy score obtained by comparing the predicted labels with the true test labels.
TRAIN_DIR = '/content/drive/My Drive/MSBA_Colab_2020/ML_Algorithms/CA_02/Data/train-mails'
TEST_DIR = '/content/drive/My Drive/MSBA_Colab_2020/ML_Algorithms/CA_02/Data/test-mails'

# extract_features() looks the vocabulary up under the module-level name
# `dictionary`, so the result has to be bound to exactly that name. Binding it
# only to `new_dictionary` makes a fresh "Restart & Run All" fail with a
# NameError (it only worked when `dictionary` was left over in the kernel).
dictionary = make_Dictionary(TRAIN_DIR)
new_dictionary = dictionary  # old name kept for any cell that still refers to it

print("reading and processing emails from TRAIN and TEST folders")
# The vocabulary comes from the training set only and is reused for the test
# set, so both matrices share the same column layout.
features_matrix, labels = extract_features(TRAIN_DIR)
test_features_matrix, test_labels = extract_features(TEST_DIR)

model = GaussianNB()

print("Training Model using Gaussian Naive Bayes algorithm...")
model.fit(features_matrix, labels)
print("Completed Training")
print("Testing trained model to predict Test Data labels")
predicted_labels = model.predict(test_features_matrix)
print("Completed classification of the Test Data, Accuracy Score by comparing the Predicted Labels with the Test Labels:")
print(accuracy_score(test_labels, predicted_labels))

reading and processing emails from TRAIN and TEST folders
Training Model using Gaussian Naive Bayes algorithm...
Completed Training
Testing trained model to predict Test Data labels
Completed classification of the Test Data, Accuracy Score by comparing the Predicted Labels with the Test Labels:
0.9615384615384616
