###  Created by Luis A. Sanchez-Perez (alejand@umich.edu).
<p><span style="color:green"><b>Copyright &#169;</b> Do not distribute or use without authorization from author.</span></p>

In [1]:
import re
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Loads dataset
dataset = sio.loadmat('../../datasets/classification/emails.mat')

In [3]:
vocab = [element[0] for element in dataset['vocab'][0]]
vocab[:5]

['the', 'and', 'you', 'for', 'that']

In [4]:
X = dataset['X']
X

<3000x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 338915 stored elements in Compressed Sparse Column format>

In [5]:
y = dataset['Y'].ravel()
y[:5]

array([0, 0, 0, 0, 0], dtype=uint8)

In [6]:
# First email content (only showing first 10 words)
email = ([vocab[i] for i,value in enumerate(X[:,0]) if value])
email[:10]

['the', 'and', 'you', 'for', 'that', 'this', 'your', 'with', 'are', 'from']

In [7]:
# Preprocessing
X[X > 0] = 1
# Splitting the dataset into the training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [8]:
mdl = BernoulliNB()
mdl.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [9]:
# Predicting the training set results
y_pred = mdl.predict(X_train)
# Making the Confusion Matrix
cm = confusion_matrix(y_train, y_pred)
print(cm)
print(accuracy_score(y_train,y_pred))

[[1487    9]
 [  78  676]]
0.9613333333333334


In [10]:
# Predicting the test set results
y_pred = mdl.predict(X_test)
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))

[[497   7]
 [ 38 208]]
0.94


In [11]:
def load_email(url, vocab):
    corpus = dict(zip(vocab, np.zeros(len(vocab))))
    with open(url,'r') as fid:
        text = fid.read()
#         print('Email: ', text)
        text = text.lower()
        text = re.sub(r'[^a-z\'\s]',' ',text)
#         print('Preprocessed email: ', text)
        for word in text.split():
            if word in corpus:
                corpus[word] = 1
            else:
                print('Word', '"{}"'.format(word), 'not in dictionary')
        arr = np.array([corpus[element] for element in vocab])
        print('List of words in arr:', [vocab[i] for i, value in enumerate(arr) if value])
    return arr.reshape(1,len(vocab))

In [15]:
arr = load_email('../../datasets/classification/email1.txt', vocab)

Word "i" not in dictionary
Word "if" not in dictionary
Word "be" not in dictionary
Word "aviable" not in dictionary
Word "to" not in dictionary
Word "wednesday" not in dictionary
Word "i" not in dictionary
Word "we" not in dictionary
Word "i" not in dictionary
Word "to" not in dictionary
Word "i" not in dictionary
List of words in arr: ['the', 'you', 'that', 'your', 'are', 'have', 'but', 'was', 'time', 'would', 'some', 'over', 'going', 'come', 'hours', 'current', 'questions', 'office', 'during', 'kind', 'normal', 'regards', 'meet', 'hello', 'class', 'material', 'regarding', 'wondering', 'professor']


In [16]:
mdl.predict(arr.reshape(1,len(vocab)))

array([0], dtype=uint8)

In [17]:
mdl.predict_proba(arr.reshape(1,len(vocab)))

array([[9.99998365e-01, 1.63517072e-06]])