In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated SMS corpus. The file is read without a header row,
# so the two columns are named by hand afterwards.
data_path = 'SMSspamcollection'

sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words whose presence in a message is used as a spam indicator.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword, matched case-insensitively. The pattern is
# padded with a space on each side, so a keyword only counts when it sits
# between two spaces; a hit at the very start or end of a message, or next to
# punctuation, is not flagged.
for word in keywords:
    column = str(word)
    padded = ' ' + column + ' '
    sms_raw[column] = sms_raw.message.str.contains(padded, case=False)

# Flag messages for which str.isupper() is true.
sms_raw['allcaps'] = sms_raw.message.str.isupper()

# Recode the label column as a boolean: True where the label equals 'spam'.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix (keyword flags + allcaps) and target vector.
feature_cols = keywords + ['allcaps']
data = sms_raw[feature_cols]
target = sms_raw['spam']

# Fit on the full data set and predict on that same data; y_pred is reused by
# the confusion-matrix cell further down.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)


In [3]:
# Test your model with different holdout groups

from sklearn.model_selection import train_test_split

# Use train_test_split to create the necessary training and test groups.
# random_state pins the shuffle so the 20% hold-out split -- and every score
# or confusion matrix later derived from X_train/X_test/y_train/y_test -- is
# identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

# Fit on the training split and score on the held-out split.
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
# Refit on the full data set and score on that same data for comparison.
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))
      

With 20% Holdout: 0.885201793721973
Testing on Sample: 0.8916008614501076


In [4]:
from sklearn.model_selection import cross_val_score

# Score the classifier with scikit-learn's built-in cross validation using
# 10 folds; the array of per-fold scores is the cell's displayed value.
cross_val_score(bnb, X=data, y=target, cv=10)

array([0.89784946, 0.89426523, 0.89426523, 0.890681  , 0.89605735,
       0.89048474, 0.88150808, 0.89028777, 0.88489209, 0.89568345])

In [5]:
# Implement your own cross validation
# Start by inspecting the frame's columns, dtypes and non-null counts; the
# fold-assignment column is added to this same frame in the next cell.
sms_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 10 columns):
spam       5572 non-null bool
message    5572 non-null object
click      5572 non-null bool
offer      5572 non-null bool
winner     5572 non-null bool
buy        5572 non-null bool
free       5572 non-null bool
cash       5572 non-null bool
urgent     5572 non-null bool
allcaps    5572 non-null bool
dtypes: bool(9), object(1)
memory usage: 92.6+ KB


In [6]:
# Assign every row to one of 10 folds (ids 0-9) uniformly at random.
# A dedicated, seeded RandomState makes the assignment reproducible across
# kernel restarts without touching NumPy's global random state, and a single
# vectorized draw replaces the per-row Python loop.
# Each row is drawn independently, so fold sizes are only approximately equal.
fold_rng = np.random.RandomState(42)
sms_raw['kfolds'] = fold_rng.randint(10, size=len(sms_raw))

In [25]:
# Hand-rolled 10-fold cross validation: each fold id in turn is held out for
# scoring while the model is refit on all remaining rows.
feature_cols = keywords + ['allcaps']

for fold in range(10):
    # Boolean mask selecting the rows that belong to the held-out fold.
    in_fold = sms_raw['kfolds'] == fold
    held_out = sms_raw[in_fold]
    remainder = sms_raw[~in_fold]

    train_data = remainder[feature_cols]
    train_target = remainder['spam']
    test_data = held_out[feature_cols]
    test_target = held_out['spam']

    bnb.fit(train_data, train_target)
    fold_score = bnb.score(test_data, test_target)
    print('{} Fold Score: '.format(fold) + str(fold_score))


0 Fold Score: 0.8982142857142857
1 Fold Score: 0.8856152512998267
2 Fold Score: 0.8957169459962756
3 Fold Score: 0.8723404255319149
4 Fold Score: 0.8894736842105263
5 Fold Score: 0.8971119133574007
6 Fold Score: 0.8912280701754386
7 Fold Score: 0.895017793594306
8 Fold Score: 0.8994307400379506
9 Fold Score: 0.8892921960072595


In [27]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the original model on all data: y_pred holds the
# predictions produced when the model was fitted on, and asked to predict,
# the full data set. The true labels are passed first, the predictions second.
print('Confusion matrix all data')
full_data_cm = confusion_matrix(sms_raw['spam'], y_pred)
print(full_data_cm)


[[4770   55]
 [ 549  198]]


In [38]:
# Model accuracy and confusion matrix for the 20% holdout.
# Refit on the training split first, because bnb has been refit on other
# data by the cells in between.
model = bnb.fit(X_train, y_train)

# Accuracy on the held-out rows, printed to two decimals.
holdout_accuracy = model.score(X_test, y_test)
print('{:.2f}'.format(holdout_accuracy))

# Predictions on the held-out rows, compared against their true labels.
model_pred1 = model.predict(X_test)
print(confusion_matrix(y_test, model_pred1))


0.89
[[945  12]
 [116  42]]


These models were barely more accurate than always predicting the dominant class.  
To improve the models:
1) Cost function for errors
2) Deliberately oversample the spam class
3) Use a different method like SVM and introduce a cutoff or specific rule