In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Build a boolean feature matrix from the raw SMS corpus and fit a
# Bernoulli Naive Bayes classifier on it.
from sklearn.naive_bayes import BernoulliNB

# Headerless, tab-separated file; the two columns are named below.
data_path = 'smsspamcollection'
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words whose presence is treated as a hint that a message is spam.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword, matched case-insensitively.
# NOTE(review): the pattern requires a literal space on BOTH sides of the
# keyword, so a keyword at the very start or end of a message, or directly
# next to punctuation, is not flagged. Changing this alters every result
# downstream, so it is left as-is here.
messages = sms_raw['message']
for word in keywords:
    sms_raw[word] = messages.str.contains(' {} '.format(word), case=False)

# True when every cased character in the message is uppercase.
sms_raw['allcaps'] = messages.str.isupper()

# Replace the text label with a boolean: True where the label is 'spam'.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

feature_columns = keywords + ['allcaps']
data = sms_raw[feature_columns]
target = sms_raw['spam']

# Fit and predict on the same rows (no hold-out split is made here).
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)


In [3]:
# Accuracy: share of ALL messages (spam and non-spam alike) whose predicted
# label equals the true label.
n = len(target)

# Sum the boolean comparison directly. The previous form counted non-null
# `message` values among the matching rows, which would silently drop any
# correctly classified row whose message text is missing.
correct = (target == y_pred).sum()
percent_correct = correct / n * 100

# `correct` includes non-spam rows predicted as non-spam, so the message
# must not claim they were "classified as spam".
print('{} out of {} messages were correctly classified.'.format(correct, n))
print('{:.2f}% were correctly classified'.format(percent_correct))

4968 out of 5572 were correctly classified as spam.
89.16% were correctly classified


In [4]:
# How Thinkful calculated accuracy: report the number of rows whose
# predicted label differs from the true label.
total_points = data.shape[0]
mislabeled_points = (target != y_pred).sum()

# Display our results.
summary = "Number of mislabeled points out of a total {} points : {}"
print(summary.format(total_points, mislabeled_points))

Number of mislabeled points out of a total 5572 points : 604


In [5]:
from sklearn.metrics import confusion_matrix

# Rows are the actual class, columns the predicted class
# (index 0 = not spam, index 1 = spam).
confusion_matrix(y_true=target, y_pred=y_pred)

array([[4770,   55],
       [ 549,  198]])

In [16]:
# Confusion matrix the manual way.
# Layout: rows are the actual class, columns the predicted class
# (index 0 = negative / not spam, index 1 = positive / spam).

agrees = (target == y_pred)      # prediction matches the true label
actual_neg = (target == 0)
actual_pos = (target == 1)

true_neg = (agrees & actual_neg).sum()     # negatives correctly identified
false_pos = (~agrees & actual_neg).sum()   # negatives incorrectly identified
true_pos = (agrees & actual_pos).sum()     # positives correctly identified
false_neg = (~agrees & actual_pos).sum()   # positives incorrectly identified

confusion = np.array(
    [[true_neg, false_pos],
     [false_neg, true_pos]],
    dtype=float,
)

# Row sums give the number of actual negatives and actual positives.
neg_tot, pos_tot = confusion.sum(axis=1)

# Sensitivity: fraction of actual positives that were predicted positive.
sensitivity = confusion[1, 1] / pos_tot
print('{:.0f} out of {:.0f} positives were correctly identified for a sensitivity of {:.2f}'
    .format(confusion[1, 1], pos_tot, sensitivity))

# Specificity: fraction of actual negatives that were predicted negative.
specificity = confusion[0, 0] / neg_tot
print('{:.0f} out of {:.0f} negatives were correctly identified for a specificity of {:.2f}'
    .format(confusion[0, 0], neg_tot, specificity))


198 out of 747 positives were correctly identified for a sensitivity of 0.27
4770 out of 4825 negatives were correctly identified for a specificity of 0.99
