In [1]:
import numpy as np
import os
from collections import Counter
from sklearn.metrics import accuracy_score

In [2]:
#preprocessing and cleaning the data
def make_dict(root_dir, vocab_size=3000):
    """Build the vocabulary of the most frequent words found in a mail directory.

    Every regular file directly inside ``root_dir`` is read line by line and
    split on whitespace. A token is counted only if it is purely alphabetic
    (``str.isalpha``) and longer than one character. Tokens are counted
    case-sensitively, exactly as they appear in the files.

    Parameters
    ----------
    root_dir : str
        Directory containing one mail per file.
    vocab_size : int, optional
        Maximum number of words to keep (default 3000).

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs, most frequent first, at most ``vocab_size``
        entries long.
    """
    word_counts = Counter()
    # os.listdir returns names in arbitrary order and most_common() breaks
    # ties by first-encounter order, so iterate in sorted order to get the
    # same vocabulary (and the same column order) on every machine.
    for mail_name in sorted(os.listdir(root_dir)):
        mail = os.path.join(root_dir, mail_name)
        if not os.path.isfile(mail):
            # Skip sub-directories and other non-file entries instead of
            # crashing in open().
            continue
        with open(mail) as m:
            for line in m:
                # Filter while counting so no corpus-sized word list is built.
                word_counts.update(
                    word for word in line.split()
                    if word.isalpha() and len(word) > 1
                )
    return word_counts.most_common(vocab_size)

In [6]:
#for training, the 3000 most frequent words are used as the features
#for each of these features we store how often the word occurs in the mail
def extract_features(mail_dir, vocabulary=None, num_features=3000):
    """Build a word-count feature matrix and spam labels for a mail directory.

    Only the line at index 2 (the third line) of each file is used; its
    whitespace-separated tokens are counted against the vocabulary.
    NOTE(review): this presumably matches a corpus layout where the third
    line holds the message body -- confirm against the data set.

    A mail is labelled spam (1) when its file name starts with ``"spmsg"``,
    otherwise ham (0).

    Parameters
    ----------
    mail_dir : str
        Directory containing one mail per file.
    vocabulary : sequence of (str, int), optional
        ``(word, count)`` pairs as returned by ``make_dict``; the position of
        a word in this sequence is its column in the matrix. When omitted,
        the notebook-level ``dictionary`` variable is used.
    num_features : int, optional
        Number of columns in the feature matrix (default 3000). Vocabulary
        entries beyond this position are ignored.

    Returns
    -------
    features_matrix : numpy.ndarray, shape (n_files, num_features)
        ``features_matrix[r, c]`` is how often vocabulary word ``c`` occurs
        in the third line of file ``r``.
    train_labels : numpy.ndarray, shape (n_files,)
        1.0 for spam, 0.0 otherwise, aligned with the matrix rows.
    """
    if vocabulary is None:
        # Backward-compatible default: the global built by make_dict.
        vocabulary = dictionary

    # Map each word to its column once, instead of scanning the whole
    # vocabulary for every word of every mail.
    word_to_column = {}
    for column, entry in enumerate(list(vocabulary)[:num_features]):
        word_to_column.setdefault(entry[0], column)

    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    files = sorted(
        path
        for path in (os.path.join(mail_dir, name) for name in os.listdir(mail_dir))
        if os.path.isfile(path)
    )
    features_matrix = np.zeros((len(files), num_features))
    train_labels = np.zeros(len(files))

    for doc_id, path in enumerate(files):
        with open(path) as mail:
            for line_no, line in enumerate(mail):
                if line_no == 2:
                    for word, occurrences in Counter(line.split()).items():
                        column = word_to_column.get(word)
                        if column is not None:
                            features_matrix[doc_id, column] = occurrences
                    break  # nothing after the third line is used
        # os.path.basename works with both '/' and '\\' separators; splitting
        # on '\\' alone never isolates the file name on POSIX, which left
        # every mail labelled as ham.
        if os.path.basename(path).startswith("spmsg"):
            train_labels[doc_id] = 1
    return features_matrix, train_labels

In [4]:
# Corpus locations, relative to the notebook's working directory.
TRAIN_DIR = "./train-mails"
TEST_DIR = "./test-mails"
dictionary = make_dict(TRAIN_DIR) # list of (word, count) pairs for the 3000 most frequent training words
# extract_features reads the global `dictionary` defined on the line above,
# so it must be assigned before either call below.
print("reading and processing emails from file.")
features_matrix, labels = extract_features(TRAIN_DIR)
# Test features use the same training vocabulary, so columns line up.
test_feature_matrix, test_labels = extract_features(TEST_DIR)
# NOTE(review): consider moving this import to the imports cell at the top.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
# Train the model on the training word counts and labels.
print("Training model.")

model.fit(features_matrix, labels)
# Predict labels for the held-out test mails; scored in the next cell.
predicted_labels = model.predict(test_feature_matrix)

reading and processing emails from file.
['./train-mails', '3-1msg1.txt'] ..
['./train-mails', '3-1msg2.txt'] ..
['./train-mails', '3-1msg3.txt'] ..
['./train-mails', '3-375msg1.txt'] ..
['./train-mails', '3-378msg1.txt'] ..
['./train-mails', '3-378msg2.txt'] ..
['./train-mails', '3-378msg3.txt'] ..
['./train-mails', '3-378msg4.txt'] ..
['./train-mails', '3-378msg5.txt'] ..
['./train-mails', '3-379msg1.txt'] ..
['./train-mails', '3-379msg2.txt'] ..
['./train-mails', '3-379msg3.txt'] ..
['./train-mails', '3-380msg1.txt'] ..
['./train-mails', '3-380msg2.txt'] ..
['./train-mails', '3-380msg3.txt'] ..
['./train-mails', '3-380msg4.txt'] ..
['./train-mails', '3-380msg5.txt'] ..
['./train-mails', '3-380msg6.txt'] ..
['./train-mails', '3-380msg7.txt'] ..
['./train-mails', '3-383msg0.txt'] ..
['./train-mails', '3-383msg1.txt'] ..
['./train-mails', '3-384msg0.txt'] ..
['./train-mails', '3-384msg1.txt'] ..
['./train-mails', '3-384msg2.txt'] ..
['./train-mails', '3-384msg3.txt'] ..
['./train-mails

['./train-mails', '6-4msg2.txt'] ..
['./train-mails', '6-4msg3.txt'] ..
['./train-mails', '6-50msg0.txt'] ..
['./train-mails', '6-50msg1.txt'] ..
['./train-mails', '6-50msg2.txt'] ..
['./train-mails', '6-50msg3.txt'] ..
['./train-mails', '6-51msg1.txt'] ..
['./train-mails', '6-52msg1.txt'] ..
['./train-mails', '6-53msg1.txt'] ..
['./train-mails', '6-54msg1.txt'] ..
['./train-mails', '6-55msg1.txt'] ..
['./train-mails', '6-57msg1.txt'] ..
['./train-mails', '6-60msg1.txt'] ..
['./train-mails', '6-61msg1.txt'] ..
['./train-mails', '6-64msg1.txt'] ..
['./train-mails', '6-65msg1.txt'] ..
['./train-mails', '6-68msg1.txt'] ..
['./train-mails', '6-70msg1.txt'] ..
['./train-mails', '6-72msg1.txt'] ..
['./train-mails', '6-73msg1.txt'] ..
['./train-mails', '6-74msg1.txt'] ..
['./train-mails', '6-75msg1.txt'] ..
['./train-mails', '6-76msg1.txt'] ..
['./train-mails', '6-77msg1.txt'] ..
['./train-mails', '6-798msg3.txt'] ..
['./train-mails', '6-799msg1.txt'] ..
['./train-mails', '6-7msg1.txt'] ..
['

['./train-mails', 'spmsga23.txt'] ..
['./train-mails', 'spmsga24.txt'] ..
['./train-mails', 'spmsga25.txt'] ..
['./train-mails', 'spmsga26.txt'] ..
['./train-mails', 'spmsga27.txt'] ..
['./train-mails', 'spmsga28.txt'] ..
['./train-mails', 'spmsga29.txt'] ..
['./train-mails', 'spmsga3.txt'] ..
['./train-mails', 'spmsga30.txt'] ..
['./train-mails', 'spmsga31.txt'] ..
['./train-mails', 'spmsga32.txt'] ..
['./train-mails', 'spmsga33.txt'] ..
['./train-mails', 'spmsga34.txt'] ..
['./train-mails', 'spmsga35.txt'] ..
['./train-mails', 'spmsga36.txt'] ..
['./train-mails', 'spmsga37.txt'] ..
['./train-mails', 'spmsga38.txt'] ..
['./train-mails', 'spmsga39.txt'] ..
['./train-mails', 'spmsga4.txt'] ..
['./train-mails', 'spmsga40.txt'] ..
['./train-mails', 'spmsga41.txt'] ..
['./train-mails', 'spmsga42.txt'] ..
['./train-mails', 'spmsga43.txt'] ..
['./train-mails', 'spmsga44.txt'] ..
['./train-mails', 'spmsga45.txt'] ..
['./train-mails', 'spmsga46.txt'] ..
['./train-mails', 'spmsga47.txt'] ..
['.

['./train-mails', 'spmsgb70.txt'] ..
['./train-mails', 'spmsgb71.txt'] ..
['./train-mails', 'spmsgb72.txt'] ..
['./train-mails', 'spmsgb73.txt'] ..
['./train-mails', 'spmsgb74.txt'] ..
['./train-mails', 'spmsgb75.txt'] ..
['./train-mails', 'spmsgb76.txt'] ..
['./train-mails', 'spmsgb77.txt'] ..
['./train-mails', 'spmsgb78.txt'] ..
['./train-mails', 'spmsgb79.txt'] ..
['./train-mails', 'spmsgb8.txt'] ..
['./train-mails', 'spmsgb80.txt'] ..
['./train-mails', 'spmsgb81.txt'] ..
['./train-mails', 'spmsgb82.txt'] ..
['./train-mails', 'spmsgb83.txt'] ..
['./train-mails', 'spmsgb84.txt'] ..
['./train-mails', 'spmsgb85.txt'] ..
['./train-mails', 'spmsgb86.txt'] ..
['./train-mails', 'spmsgb87.txt'] ..
['./train-mails', 'spmsgb88.txt'] ..
['./train-mails', 'spmsgb89.txt'] ..
['./train-mails', 'spmsgb9.txt'] ..
['./train-mails', 'spmsgb90.txt'] ..
['./train-mails', 'spmsgb91.txt'] ..
['./train-mails', 'spmsgb92.txt'] ..
['./train-mails', 'spmsgb93.txt'] ..
['./train-mails', 'spmsgb94.txt'] ..
['.

['./test-mails', 'spmsgc144.txt'] ..
['./test-mails', 'spmsgc145.txt'] ..
['./test-mails', 'spmsgc146.txt'] ..
['./test-mails', 'spmsgc147.txt'] ..
['./test-mails', 'spmsgc18.txt'] ..
['./test-mails', 'spmsgc19.txt'] ..
['./test-mails', 'spmsgc20.txt'] ..
['./test-mails', 'spmsgc21.txt'] ..
['./test-mails', 'spmsgc22.txt'] ..
['./test-mails', 'spmsgc23.txt'] ..
['./test-mails', 'spmsgc24.txt'] ..
['./test-mails', 'spmsgc25.txt'] ..
['./test-mails', 'spmsgc26.txt'] ..
['./test-mails', 'spmsgc27.txt'] ..
['./test-mails', 'spmsgc28.txt'] ..
['./test-mails', 'spmsgc29.txt'] ..
['./test-mails', 'spmsgc30.txt'] ..
['./test-mails', 'spmsgc31.txt'] ..
['./test-mails', 'spmsgc32.txt'] ..
['./test-mails', 'spmsgc33.txt'] ..
['./test-mails', 'spmsgc34.txt'] ..
['./test-mails', 'spmsgc35.txt'] ..
['./test-mails', 'spmsgc36.txt'] ..
['./test-mails', 'spmsgc37.txt'] ..
['./test-mails', 'spmsgc38.txt'] ..
['./test-mails', 'spmsgc39.txt'] ..
['./test-mails', 'spmsgc40.txt'] ..
['./test-mails', 'spmsgc

In [5]:

# Report the fraction of test mails whose predicted label matches the true
# label; the heading and the score are printed on separate lines.
print(
    "Finished classifying. accuracy score : ",
    accuracy_score(test_labels, predicted_labels),
    sep="\n",
)

Finished classifying. accuracy score : 
0.9615384615384616
