In [188]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.naive_bayes import BernoulliNB

In [119]:
# Load the raw data
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw["spam"] = sms_raw["spam"] == "spam"

# Grab the samples
sample_positive = sms_raw[sms_raw["spam"] == True].sample(100)
sample_negative = sms_raw[sms_raw["spam"] == False].sample(100)
combined_sample = sample_positive.append(sample_negative)

# Turn each word into a feature
split_combined_sample = combined_sample["message"].str.lower().str.replace("\W+", " ").str.split()
data = split_combined_sample.str.join("|").str.get_dummies(sep="|")
target = combined_sample["spam"]

# Reduce the number of features
from sklearn.feature_selection import SelectKBest, mutual_info_classif
kbest = SelectKBest(mutual_info_classif, k=50)
best_features = kbest.fit_transform(data, target)

# Build the model
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(best_features, target)

# Predict
prediction = bnb.predict(best_features)
data_size = best_features.shape[0]
wrong_identification = (prediction != target).sum()
print("{}% accurate ({} wrong out of {})".format((1 - (wrong_identification / data_size))*100, wrong_identification, data_size))

87.5% accurate (25 wrong out of 200)


In [201]:
def run_model(sample):
    # Turn each word into a feature
    split_sample = sample["message"].str.lower().str.replace("\W+", " ").str.split()
    data = split_sample.str.join("|").str.get_dummies(sep="|")
    target = sample["spam"]

    # Reduce the number of features
    kbest = SelectKBest(mutual_info_classif, k=50).fit(data, target)
    best_features = kbest.transform(data)

    # Build the model
    bnb = BernoulliNB()
    bnb.fit(best_features, target)

    # Predict
    prediction = bnb.predict(best_features)
    data_size = best_features.shape[0]
    wrong_identification = (prediction != target).sum()
    accuracy = (1 - (wrong_identification / data_size))*100
    print("{}% accurate ({} wrong out of {})".format((1 - (wrong_identification / data_size))*100, wrong_identification, data_size))
    
    # Return the values we want
    return kbest, bnb, accuracy, data

In [211]:
def run_model_with_params(sample, fit_features, fit_bernouilli, fit_columns):
    # Turn each word into a feature
    split_sample = sample["message"].str.lower().str.replace("\W+", " ").str.split()
    data = split_sample.str.join("|").str.get_dummies(sep="|")
    
    # Make sure we use the same columns as our fit data
    # Save the same columns
    data = data[[col for col in data.columns if col in fit_columns.columns]]

    # Add potentially missing columsn
    for col in fit_columns.columns:
        if col not in data.columns:
            data[col] = [0] * data.shape[0]
    target = sample["spam"]
    


    # Reduce the number of features
    best_features = fit_features.transform(data)

    # Predict
    prediction = fit_bernouilli.predict(best_features)
    data_size = best_features.shape[0]
    wrong_identification = (prediction != target).sum()
    run_model.accuracy = (1 - (wrong_identification / data_size))*100
    print("{}% accurate ({} wrong out of {})".format((1 - (wrong_identification / data_size))*100, wrong_identification, data_size))

In [203]:
# Load the data
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw["spam"] = sms_raw["spam"] == "spam"

# Define our holdout group size
n_classes = 2
n_holdout_groups = 10
sample_size = 500
sample_size_per_class = int(sample_size / n_classes) # 250
holdout = int(sample_size / n_holdout_groups) # 50
n_holdouts_per_class = int(holdout / n_classes) # 25

# To stratify, split training data into one data frame per class.
positive = sms_raw[sms_raw["spam"] == True].sample(n=sample_size_per_class)
negative = sms_raw[sms_raw["spam"] == False].sample(n=sample_size_per_class)

In [214]:
# Take 25 from each and use those as our training dataset (keep track of the indexes we used to train the data)
xs = range(0, sample_size_per_class, holdout)
best_model = None
best_feature_selection = None
best_accuracy = 0
best_columns = None

for x in xs:
    # the holdout group of 25. will be 'stratified', e.g. will have even numbers of positive, negative class.
    combined_holdout = positive.iloc[x:x+sample_size_per_class].append(negative.iloc[x:x+sample_size_per_class])
    
    # everything around the holdout group (everything else)
    combined_training = positive.iloc[0:x].append(positive.iloc[x+holdout:]).append(negative.iloc[0:x]).append(negative.iloc[x+holdout:])
    
    # Train your model on combined_training dataset
    kbest, bernoulli, accuracy, data = run_model(combined_training)
    
    # Run it against the holdout
    run_model_with_params(combined_holdout, kbest, bernoulli, data)
    
    # Test the data on the combined_holdout dataset
    if accuracy > best_accuracy:
        best_model = bernoulli
        best_feature_selection = kbest
        best_accuracy = accuracy
        best_columns = data

# now we have our best model, use it to classify the whole dataset as we did before we knew about cross validation.

89.25% accurate (43 wrong out of 400)
87.6% accurate (62 wrong out of 500)
88.5% accurate (46 wrong out of 400)
50.24999999999999% accurate (199 wrong out of 400)
89.25% accurate (43 wrong out of 400)
51.0% accurate (147 wrong out of 300)
85.0% accurate (60 wrong out of 400)
52.5% accurate (95 wrong out of 200)
90.0% accurate (40 wrong out of 400)
56.00000000000001% accurate (44 wrong out of 100)


In [215]:
# Run the whole data through run_model_with_params

data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw["spam"] = sms_raw["spam"] == "spam"

run_model_with_params(sms_raw, best_feature_selection, best_model, best_columns)

91.87006460875807% accurate (453 wrong out of 5572)


In [None]:
# Test your model with different holdout groups.

# Grab and process the raw data.
# data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
#              "master/sms_spam_collection/SMSSpamCollection"
#             )
# sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
# # 5572 records. Let's holdout 10%
# holdout = .1
# sms_raw = sms_raw.head(round(sms_raw.shape[0] - (sms_raw.shape[0] * holdout)))
# sms_raw.columns = ['spam', 'message']

# # Enumerate our spammy keywords.
# keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# for key in keywords:
#     sms_raw[str(key)] = sms_raw.message.str.contains(
#         ' ' + str(key) + ' ',
#         case=False
# )

# sms_raw['allcaps'] = sms_raw.message.str.isupper()
# sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# data = sms_raw[keywords + ['allcaps']]
# target = sms_raw['spam']

# from sklearn.naive_bayes import BernoulliNB
# bnb = BernoulliNB()
# y_pred = bnb.fit(data, target).predict(data)

# round((sms_raw['spam'] == y_pred).sum()/sms_raw.shape[0], 3)

In [None]:
# Question: What is this doing?
# http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-with-stratification-based-on-class-labels
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=3)

# X = np.ones(len(sms_raw['spam']))
# y = sms_raw['spam']

# for train, test in skf.split(X, y):
#     print("%s %s" % (train, test))

In [2]:
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(sms_raw, test_size = 0.2)
# print(len(train))
# print(len(test))
# print(sms_raw.shape)

In [3]:
# Implement your own cross validation with your spam model.
# Question: How would we do folds? And why is that better than randomly selecting variables?
# data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
#              "master/sms_spam_collection/SMSSpamCollection"
#             )
# sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
# sms_raw.columns = ['spam', 'message']

# hold_out_percent = 0.2
# sample_size = round(len(sms_raw) * hold_out_percent)
# holdout = sms_raw.sample(sample_size)[["spam", "message"]]
# training_data = sms_raw[~sms_raw.isin(holdout)].dropna()[["spam", "message"]]


# # Enumerate our spammy keywords.
# keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# for key in keywords:
#     training_data[str(key)] = training_data.message.str.contains(
#         ' ' + str(key) + ' ',
#         case=False
# )

# training_data['allcaps'] = training_data.message.str.isupper()
# training_data['spam'] = (sms_raw['spam'] == 'spam')
# data = training_data[keywords + ['allcaps']]
# target = training_data['spam']

# from sklearn.naive_bayes import BernoulliNB
# bnb = BernoulliNB()
# y_pred = bnb.fit(data, target).predict(data)

# round((training_data['spam'] == y_pred).sum() / training_data.shape[0], 3)
