In [91]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [92]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

round((sms_raw['spam'] == y_pred).sum() / sms_raw.shape[0], 3)

0.89200000000000002

In [104]:
# Test your model with different holdout groups.

# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
# 5572 records. Let's holdout 10%
holdout = .1
sms_raw = sms_raw.head(round(sms_raw.shape[0] - (sms_raw.shape[0] * holdout)))
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

round((sms_raw['spam'] == y_pred).sum()/sms_raw.shape[0], 3)

0.89100000000000001

In [94]:
# Question: What is this doing?
# http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-with-stratification-based-on-class-labels
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)

X = np.ones(len(sms_raw['spam']))
y = sms_raw['spam']

for train, test in skf.split(X, y):
    print("%s %s" % (train, test))

[1597 1598 1613 ..., 5012 5013 5014] [   0    1    2 ..., 1683 1684 1685]
[   0    1    2 ..., 5012 5013 5014] [1597 1598 1613 ..., 3382 3385 3391]
[   0    1    2 ..., 3382 3385 3391] [3340 3341 3342 ..., 5012 5013 5014]


In [95]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(sms_raw, test_size = 0.2)
print(len(train))
print(len(test))
print(sms_raw.shape)

4012
1003
(5015, 10)


In [96]:
# Implement your own cross validation with your spam model.
# Question: How would we do folds? And why is that better than randomly selecting variables?
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

hold_out_percent = 0.2
sample_size = round(len(sms_raw) * hold_out_percent)
holdout = sms_raw.sample(sample_size)[["spam", "message"]]
training_data = sms_raw[~sms_raw.isin(holdout)].dropna()[["spam", "message"]]


# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    training_data[str(key)] = training_data.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

training_data['allcaps'] = training_data.message.str.isupper()
training_data['spam'] = (sms_raw['spam'] == 'spam')
data = training_data[keywords + ['allcaps']]
target = training_data['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

round((training_data['spam'] == y_pred).sum() / training_data.shape[0], 3)


0.89100000000000001

In [105]:
# We will write our own k-fold functions next time
