### Import Required Packages

In [110]:
import sys
import os
from collections import Counter
import numpy as np
from sklearn.naive_bayes import GaussianNB

### Read In Email Messages

In [120]:
def make_dict(filepath, vocab_size=10):
    """Build the vocabulary of the most frequent words in a folder of mails.

    Only the third line of each file (index 2) is tokenised; presumably that
    is where this corpus stores the message body -- TODO confirm against the
    data. A token is kept when it is longer than one character and purely
    alphabetic.

    Parameters
    ----------
    filepath : str
        Directory containing one mail per file.
    vocab_size : int, optional
        Number of most common words to keep (default 10).

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs, most frequent first.
    """
    word_totals = Counter()

    # os.listdir() returns names in arbitrary order; sorting makes the
    # tie-breaking inside most_common() identical on every run.
    for filename in sorted(os.listdir(filepath)):
        # Read from the directory that was passed in, not a hard-coded folder.
        with open(os.path.join(filepath, filename), 'r') as file:
            lines = file.read().splitlines()

        if len(lines) < 3:
            # No third line, so there is nothing to count for this file.
            continue

        word_totals.update(
            word for word in lines[2].split()
            if len(word) > 1 and word.isalpha()
        )

    return word_totals.most_common(vocab_size)

### Create Matrix of Word Counts

In [107]:
def extract_features(filepath, word_counts):
    """Encode every mail in a folder as a vector of vocabulary word counts.

    Only the third line of each file (index 2) is tokenised; presumably that
    is where this corpus stores the message body -- TODO confirm against the
    data. A token is counted when it is longer than one character and purely
    alphabetic. Files without a third line produce an all-zero row.

    Parameters
    ----------
    filepath : str
        Directory containing one mail per file.
    word_counts : sequence of (str, int)
        Vocabulary as returned by ``make_dict``; only the word (first item of
        each entry) is used, and its position defines the column index.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``word_matrix`` of shape ``(n_files, len(word_counts))`` holding the
        per-file counts, and ``labels`` of shape ``(n_files,)`` set to 1 for
        files whose name contains ``'spmsg'`` and 0 otherwise. Rows follow the
        sorted file names.
    """
    # Sorted so the row order is reproducible; os.listdir() order is arbitrary.
    filenames = sorted(os.listdir(filepath))
    vocabulary = [entry[0] for entry in word_counts]

    # Size the columns from the argument rather than from a notebook global.
    word_matrix = np.zeros((len(filenames), len(vocabulary)))
    labels = np.zeros(len(filenames))

    for file_id, filename in enumerate(filenames):
        with open(os.path.join(filepath, filename), 'r') as file:
            lines = file.read().splitlines()

        body = lines[2] if len(lines) > 2 else ''
        # Count each token once per file instead of rescanning the token
        # list for every occurrence.
        counts_in_file = Counter(
            word for word in body.split()
            if len(word) > 1 and word.isalpha()
        )

        for word_id, word in enumerate(vocabulary):
            word_matrix[file_id, word_id] = counts_in_file[word]

        if 'spmsg' in filename:
            labels[file_id] = 1

    return (word_matrix, labels)

### Create Features and Train the Model

In [121]:
# Mail folders, given relative to the notebook's working directory.
train_path = './train-mails'
test_path = './test-mails'

# Build the vocabulary from the training mails only, then encode both splits
# against that same vocabulary so their feature columns line up.
word_counts = make_dict(train_path)
train_matrix, train_labels = extract_features(train_path, word_counts)
test_matrix, test_labels = extract_features(test_path, word_counts)


In [122]:
# Fit a Gaussian Naive Bayes classifier on the training word-count features.
# NOTE(review): the features are raw word counts; MultinomialNB is the variant
# scikit-learn documents for count data -- consider comparing the two.
model = GaussianNB()
model.fit(train_matrix, train_labels)

GaussianNB(priors=None, var_smoothing=1e-09)

### Predict Test Scores

In [123]:
# Hard class predictions and per-class probabilities for the test mails.
predicted_labels = model.predict(test_matrix)
predicted_probs = model.predict_proba(test_matrix)

# Display the probabilities. Columns follow model.classes_, i.e. label 0 then
# label 1 (label 1 is assigned to files whose name contains 'spmsg').
predicted_probs

array([[1.00000000e+000, 1.26508759e-104],
       [9.99998235e-001, 1.76487080e-006],
       [1.00000000e+000, 6.24012154e-050],
       [9.99998725e-001, 1.27525300e-006],
       [9.99999616e-001, 3.84283736e-007],
       [1.00000000e+000, 0.00000000e+000],
       [3.77723017e-001, 6.22276983e-001],
       [1.00000000e+000, 0.00000000e+000],
       [9.99999668e-001, 3.31915300e-007],
       [1.00000000e+000, 5.35810468e-016],
       [9.99999600e-001, 4.00119623e-007],
       [1.00000000e+000, 1.94179318e-017],
       [9.99999696e-001, 3.03585674e-007],
       [9.99933001e-001, 6.69991454e-005],
       [1.00000000e+000, 1.03972655e-280],
       [9.99999597e-001, 4.03320652e-007],
       [9.99999696e-001, 3.03585674e-007],
       [1.00000000e+000, 1.72468419e-014],
       [1.00000000e+000, 5.90820843e-018],
       [9.99999696e-001, 3.03585674e-007],
       [9.99998563e-001, 1.43738252e-006],
       [9.99998162e-001, 1.83759897e-006],
       [1.00000000e+000, 0.00000000e+000],
       [1.0

In [124]:
# Show the (word, count) vocabulary that make_dict built from the training mails.
word_counts

[('order', 1414),
 ('address', 1293),
 ('report', 1216),
 ('mail', 1127),
 ('send', 1079),
 ('language', 1072),
 ('email', 1051),
 ('program', 1001),
 ('our', 987),
 ('list', 935)]