###  Created by Luis A. Sanchez-Perez (alejand@umich.edu)

In [1]:
import re
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from scipy.special import logsumexp

In [2]:
# Loads dataset (a .mat file read with scipy; uses its 'vocab', 'X' and 'Y' variables)
dataset = sio.loadmat('../../datasets/classification/emails.mat')
vocab = [element[0] for element in dataset['vocab'][0]]
X = dataset['X']
y = dataset['Y'].ravel()
# Preprocessing: the Bernoulli model only cares about presence/absence, so every
# positive entry is collapsed to 1 (note: this modifies X in place)
X[X > 0] = 1
# Splitting the dataset into the training set and test set.
# A fixed random_state makes the split, and therefore every number computed
# below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

### Using sklearn implementation

In [3]:
# Fit scikit-learn's Bernoulli naive Bayes with its default settings on the
# training split; its predictions are the reference for the custom implementation
mdl = BernoulliNB()
mdl.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [4]:
# Evaluate the fitted model on the samples it was trained on
y_pred = mdl.predict(X_train)
# Confusion matrix first, overall accuracy second
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
print(accuracy_score(y_true=y_train, y_pred=y_pred))

[[1489   10]
 [ 101  650]]
0.9506666666666667


In [5]:
# Evaluate the fitted model on the held-out test samples
y_pred = mdl.predict(X_test)
# Confusion matrix first, overall accuracy second
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

[[492   9]
 [ 58 191]]
0.9106666666666666


### Creating new input

In [6]:
def load_email(url, vocab, verbose=True):
    """Encode a plain-text email as a binary bag-of-words row vector.

    The text is lower-cased, every character other than a-z, apostrophe or
    whitespace is replaced by a space, and the result is split on whitespace.

    Parameters
    ----------
    url : str
        Path of the text file to read (passed straight to ``open``).
    vocab : sequence of str
        Vocabulary words; position ``i`` of the output corresponds to ``vocab[i]``.
    verbose : bool, default True
        When True, print every token that is not in ``vocab`` and, at the end,
        the vocabulary words found in the email. Pass False to silence output.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, len(vocab))`` holding 1.0 where the word
        occurs in the email and 0.0 elsewhere.
    """
    # Decode explicitly so the result does not depend on the platform locale.
    # Undecodable bytes become U+FFFD, which the regex below turns into a space,
    # so a stray non-UTF-8 byte can no longer abort the load.
    with open(url, 'r', encoding='utf-8', errors='replace') as fid:
        text = fid.read()
    text = re.sub(r'[^a-z\'\s]', ' ', text.lower())

    known = set(vocab)
    present = set()
    for word in text.split():
        if word in known:
            present.add(word)
        elif verbose:
            print('Word', '"{}"'.format(word), 'not in dictionary')

    # Always build floats so the dtype does not depend on which words were found
    arr = np.array([1.0 if word in present else 0.0 for word in vocab], dtype=float)
    if verbose:
        print('List of words in arr:', [word for word in vocab if word in present])
    return arr.reshape(1, len(vocab))

In [7]:
# Encode the sample email as a 1 x len(vocab) presence vector over the training vocabulary
arr = load_email('../../datasets/classification/email.txt', vocab)

Word "of" not in dictionary
Word "neurology" not in dictionary
Word "neurological" not in dictionary
Word "issn" not in dictionary
Word "luis" not in dictionary
Word "alejandro" not in dictionary
Word "sanchezperez" not in dictionary
Word "it" not in dictionary
Word "is" not in dictionary
Word "a" not in dictionary
Word "to" not in dictionary
Word "my" not in dictionary
Word "to" not in dictionary
Word "a" not in dictionary
Word "scholar" not in dictionary
Word "we" not in dictionary
Word "on" not in dictionary
Word "neuroscience" not in dictionary
Word "neurological" not in dictionary
Word "disorders" not in dictionary
Word "is" not in dictionary
Word "to" not in dictionary
Word "of" not in dictionary
Word "we" not in dictionary
Word "to" not in dictionary
Word "we" not in dictionary
Word "be" not in dictionary
Word "if" not in dictionary
Word "us" not in dictionary
Word "of" not in dictionary
Word "clinical" not in dictionary
Word "we" not in dictionary
Word "to" not in dictionary
Wo

### Custom implementation

In [8]:
# Reference output from the sklearn model for the sample email
# (arr is already a single-row matrix of width len(vocab), so it is passed as is)
mdl.predict_proba(arr)

array([[3.54308546e-05, 9.99964569e-01]])

In [9]:
# Count, for every feature, how many training samples have it set to one:
# separately for ham (label 0), for the remaining samples (spam), and overall
is_ham = y_train == 0
is_spam = ~is_ham
# Column sums are converted to flat 1-D ndarrays
occurrences_ham = np.asarray(X_train[is_ham, :].sum(axis=0)).ravel()
occurrences_spam = np.asarray(X_train[is_spam, :].sum(axis=0)).ravel()
occurrences_total = np.asarray(X_train.sum(axis=0)).ravel()

In [10]:
# Sanity check: the per-class counts must add up to the overall counts
assert np.array_equal(occurrences_ham + occurrences_spam, occurrences_total)

In [11]:
# Unsmoothed estimate of P(feature = 1 | class): the share of each class's
# training samples in which the feature is one
n_ham = is_ham.sum()
n_spam = (~is_ham).sum()
prob_ham = occurrences_ham / n_ham
prob_spam = occurrences_spam / n_spam

In [12]:
### How many features (words) are never observed in ham emails? (zero-freq problem)
# A probability of exactly zero would zero out the whole ham likelihood product
# for any email that contains such a word
(prob_ham == 0).sum()

866

In [13]:
### Implementing Laplace (additive) smoothing
G = len(np.unique(y_train))  # number of classes; kept for reference, not used by the smoothing
alpha = 1
# Every feature is a Bernoulli variable, so the denominator must grow by alpha
# times the number of values a FEATURE can take (2: absent / present), not by
# the number of classes. The two only coincide in a two-class problem.
n_feature_values = 2
prob_ham = (occurrences_ham + alpha) / (is_ham.sum() + alpha * n_feature_values)
prob_spam = (occurrences_spam + alpha) / ((~is_ham).sum() + alpha * n_feature_values)

In [14]:
# After smoothing every numerator is at least alpha, so no ham probability
# should be exactly zero any more
(prob_ham == 0).sum()

0

In [15]:
# Class priors [P(ham), P(spam)]: each class's share of the training samples
class_counts = np.array([is_ham.sum(), (~is_ham).sum()])
prior = class_counts / len(y_train)
prior

array([0.66622222, 0.33377778])

In [16]:
# Per-feature ham likelihood of the sample email: prob_ham where the feature is
# one, 1 - prob_ham where it is not
word_present = arr[0] == 1
likelihood_ham = np.where(word_present, prob_ham, 1 - prob_ham)
likelihood_ham.prod()

3.692602677855177e-160

In [17]:
# Per-feature spam likelihood of the sample email: prob_spam where the feature
# is one, 1 - prob_spam where it is not
word_present = arr[0] == 1
likelihood_spam = np.where(word_present, prob_spam, 1 - prob_spam)
likelihood_spam.prod()

2.0801624918174632e-155

In [18]:
# One row per class (ham first, spam second), one column per feature
likelihood = np.vstack([likelihood_ham, likelihood_spam])
# Row-wise product = joint likelihood of the email under each class
likelihood.prod(axis=1)

array([3.69260268e-160, 2.08016249e-155])

In [19]:
# Work in log space: the product of many small factors becomes a sum of logs
loglikelihood = np.log(likelihood)
# Per-class log of the joint likelihood
loglikelihood.sum(axis=1)

array([-367.10728334, -356.1682434 ])

In [20]:
# Numerator of Bayes' rule per class: joint likelihood times class prior
numerator = likelihood.prod(axis=1) * prior # unsafe: a product of many tiny factors can underflow to 0.0
numerator

array([2.46009396e-160, 6.94312014e-156])

In [21]:
# Same numerator computed in log space: summed log-likelihoods plus log prior
lognumerator = loglikelihood.sum(axis=1) + np.log(prior)
lognumerator

array([-367.51341533, -357.26552325])

In [22]:
# Posterior per class: each numerator divided by their sum (the evidence)
posterior = numerator / numerator.sum()
posterior

array([3.54308546e-05, 9.99964569e-01])

In [23]:
# The class posteriors should add up to one
posterior.sum()

1.0

In [24]:
# Normalising constant (evidence) computed outside log space
numerator.sum()

6.94336614875136e-156

In [25]:
# Log posterior with the normaliser taken from the plain-space numerator
logposterior = lognumerator - np.log(numerator.sum()) # unsafe: numerator.sum() can underflow to 0.0, making the log -inf
logposterior

array([-1.02479275e+01, -3.54314822e-05])

In [26]:
# Back from log space to probabilities (here from the log posterior normalised
# with the plain-space sum)
posterior = np.exp(logposterior)
posterior

array([3.54308546e-05, 9.99964569e-01])

In [27]:
# Should be one up to floating-point rounding
posterior.sum()

1.0000000000000313

In [28]:
# Log posterior with the normaliser computed entirely in log space
logposterior = lognumerator - logsumexp(lognumerator) # more stable: never forms the tiny plain-space numerators
logposterior

array([-1.02479275e+01, -3.54314823e-05])

In [29]:
# Back from log space to probabilities (here from the logsumexp-normalised
# log posterior)
posterior = np.exp(logposterior)
posterior

array([3.54308546e-05, 9.99964569e-01])

In [30]:
# Should be one up to floating-point rounding
posterior.sum()

0.9999999999999746