In [91]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [94]:
# Load the raw data
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw["spam"] = sms_raw["spam"] == "spam"

# Grab the samples
sample_positive = sms_raw[sms_raw["spam"] == True].sample(100)
sample_negative = sms_raw[sms_raw["spam"] == False].sample(100)
combined_sample = sample_positive.append(sample_negative)

# Turn each word into a feature
split_combined_sample = combined_sample["message"].str.lower().str.replace("\W+", " ").str.split()
data = split_combined_sample.str.join("|").str.get_dummies(sep="|")
target = combined_sample["spam"]

# Reduce the number of features
from sklearn.feature_selection import SelectKBest, mutual_info_classif
kbest = SelectKBest(mutual_info_classif, k=50)
best_features = kbest.fit_transform(data, target)

# Build the model
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(best_features, target)

# Predict
prediction = bnb.predict(best_features)
data_size = best_features.shape[0]
wrong_identification = (prediction != target).sum()
print("{}% accurate ({} wrong out of {})".format((1 - (wrong_identification / data_size))*100, wrong_identification, data_size))

89.0% accurate (22 wrong out of 200)


In [108]:
# Load the data
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']
sms_raw["spam"] = sms_raw["spam"] == "spam"

# We will write our own k-fold functions next time
positive = sms_raw[sms_raw["spam"] == True].sample(n=250)
negative = sms_raw[sms_raw["spam"] == False].sample(n=250)
holdout = 50


# Take 100 from each and use those as our training dataset (keep track of the indexes we used to train the data)
# Train your model on test dataset
# Test the data on the remaining dataset


In [None]:
# Test your model with different holdout groups.

# Grab and process the raw data.
# data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
#              "master/sms_spam_collection/SMSSpamCollection"
#             )
# sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
# # 5572 records. Let's holdout 10%
# holdout = .1
# sms_raw = sms_raw.head(round(sms_raw.shape[0] - (sms_raw.shape[0] * holdout)))
# sms_raw.columns = ['spam', 'message']

# # Enumerate our spammy keywords.
# keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# for key in keywords:
#     sms_raw[str(key)] = sms_raw.message.str.contains(
#         ' ' + str(key) + ' ',
#         case=False
# )

# sms_raw['allcaps'] = sms_raw.message.str.isupper()
# sms_raw['spam'] = (sms_raw['spam'] == 'spam')
# data = sms_raw[keywords + ['allcaps']]
# target = sms_raw['spam']

# from sklearn.naive_bayes import BernoulliNB
# bnb = BernoulliNB()
# y_pred = bnb.fit(data, target).predict(data)

# round((sms_raw['spam'] == y_pred).sum()/sms_raw.shape[0], 3)

In [None]:
# Question: What is this doing?
# http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-with-stratification-based-on-class-labels
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=3)

# X = np.ones(len(sms_raw['spam']))
# y = sms_raw['spam']

# for train, test in skf.split(X, y):
#     print("%s %s" % (train, test))

In [2]:
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(sms_raw, test_size = 0.2)
# print(len(train))
# print(len(test))
# print(sms_raw.shape)

In [3]:
# Implement your own cross validation with your spam model.
# Question: How would we do folds? And why is that better than randomly selecting variables?
# data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
#              "master/sms_spam_collection/SMSSpamCollection"
#             )
# sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
# sms_raw.columns = ['spam', 'message']

# hold_out_percent = 0.2
# sample_size = round(len(sms_raw) * hold_out_percent)
# holdout = sms_raw.sample(sample_size)[["spam", "message"]]
# training_data = sms_raw[~sms_raw.isin(holdout)].dropna()[["spam", "message"]]


# # Enumerate our spammy keywords.
# keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# for key in keywords:
#     training_data[str(key)] = training_data.message.str.contains(
#         ' ' + str(key) + ' ',
#         case=False
# )

# training_data['allcaps'] = training_data.message.str.isupper()
# training_data['spam'] = (sms_raw['spam'] == 'spam')
# data = training_data[keywords + ['allcaps']]
# target = training_data['spam']

# from sklearn.naive_bayes import BernoulliNB
# bnb = BernoulliNB()
# y_pred = bnb.fit(data, target).predict(data)

# round((training_data['spam'] == y_pred).sum() / training_data.shape[0], 3)
