In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Fetch the SMS Spam Collection. The file is tab-separated and is read
# without a header row, so the two column names are supplied at read time.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['spam', 'message'],
)

# Words treated as spam signals; each one becomes a boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# NOTE: the pattern pads the keyword with one space on each side, so a
# keyword that starts or ends a message, or that touches punctuation,
# is not matched. Matching ignores letter case.
for word in keywords:
    padded = ' ' + word + ' '
    sms_raw[word] = sms_raw['message'].str.contains(padded, case=False)

# Flag messages whose cased characters are all uppercase.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Recode the label column from text to boolean: True means spam.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix (keyword flags plus the all-caps flag) and target vector.
feature_columns = keywords + ['allcaps']
data = sms_raw[feature_columns]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes classifier on the boolean feature columns.
# NOTE: predictions are made on the same rows used for fitting, so every
# metric computed from y_pred is an in-sample figure.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [6]:
# Accuracy: the fraction of messages whose predicted label equals the
# true label.
n_correct = (sms_raw["spam"] == y_pred).sum()
n_correct / len(y_pred)


0.89160086145010764

In [7]:
from sklearn.metrics import confusion_matrix

# Rows are the actual class and columns the predicted class, each ordered
# ham (False) then spam (True). Treating spam as the positive class:
#   [0, 0] ham predicted ham   -> true negative  (got it right: was ham)
#   [0, 1] ham predicted spam  -> false positive (Type I error: a false alarm)
#   [1, 0] spam predicted ham  -> false negative (Type II error: a miss)
#   [1, 1] spam predicted spam -> true positive  (got it right: was spam)
confusion_matrix(y_true=target, y_pred=y_pred)

array([[4770,   55],
       [ 549,  198]])

In [10]:
# Calculate sensitivity and specificity from the confusion matrix.
#
# Sensitivity = share of actual positives (spam) correctly identified
#             = TP / (TP + FN)
# Specificity = share of actual negatives (ham) correctly identified
#             = TN / (TN + FP)

# Derive the four counts from the predictions instead of copying them by
# hand from an earlier cell's output, so they stay correct if the features
# or the model change. For a binary problem ravel() yields TN, FP, FN, TP.
tn, fp, fn, tp = (
    int(count) for count in confusion_matrix(target, y_pred).ravel()
)

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

# Only a cell's final expression is displayed automatically, so print
# sensitivity explicitly; specificity remains the cell's output value.
print("sensitivity:", sensitivity)
specificity


0.9886010362694301

In [25]:
# Question: How could we have made this more succinct?
# Answer: count each outcome with vectorized boolean masks instead of a
# Python-level loop that tests every message four times.

actual = np.asarray(sms_raw["spam"], dtype=bool)   # True where the message is spam
predicted = np.asarray(y_pred, dtype=bool)         # True where the model said spam

true_true = int((actual & predicted).sum())        # spam, predicted spam
false_false = int((~actual & ~predicted).sum())    # ham, predicted ham
false_negative = int((actual & ~predicted).sum())  # spam, predicted ham
false_positive = int((~actual & predicted).sum())  # ham, predicted spam

# Printed in the order TN, FP, FN, TP.
print(false_false, false_positive, false_negative, true_true)
        

4770 55 549 198


In [26]:
# Class balance: how many messages are ham (False) versus spam (True).
# A model that always predicts the majority class scores that class's share
# of the rows, which is the baseline to compare the accuracy against.
sms_raw.spam.value_counts()

False    4825
True      747
Name: spam, dtype: int64