In [74]:
%matplotlib notebook
import pprint
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

mpl_style had been deprecated and will be removed in a future version.
Use `matplotlib.pyplot.style.use` instead.

  exec(code_obj, self.user_global_ns, self.user_ns)


In [94]:
sms_df = pd.read_csv('resources/SMSSpamCollection', sep='\t', header=None, names = ['status', 'msg'])
# adding the presence of words as attributes 
# -> cheap, free, offer
sms_df['spam_status'] = sms_df['status'].map(lambda x: x == 'spam')
attribute_key_list = ['cheap', 'free', 'offer', 'win']
for key in attribute_key_list:
    sms_df['attr_' + key] = sms_df['msg'].map(lambda msg: key in msg.lower())
sms_df[:3]

Unnamed: 0,status,msg,spam_status,attr_cheap,attr_free,attr_offer,attr_win
0,ham,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False
1,ham,Ok lar... Joking wif u oni...,False,False,False,False,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True,False,True,False,True


In [95]:
# takes 2 columns and returns stats of how they match
# first column is the expected column
def get_stats(dframe, columns):
    sensitive = dframe[(dframe[columns[0]] == True) & (dframe[columns[1]] == True)]
    specific = dframe[(dframe[columns[0]] == False) & (dframe[columns[1]] == False)]
    stats = {
        "sensitivity" : len(sensitive)/len(dframe[dframe[columns[0]] == True]),
        "specificity" : len(specific)/len(dframe[dframe[columns[0]] == False])
    }
    return stats

for key in attribute_key_list:
    print(key)
    pprint.pprint(get_stats(sms_df,("spam_status", "attr_"+key)))


cheap
{'sensitivity': 0.00535475234270415, 'specificity': 0.9981347150259068}
free
{'sensitivity': 0.26639892904953144, 'specificity': 0.9863212435233161}
offer
{'sensitivity': 0.050870147255689425, 'specificity': 0.9983419689119171}
win
{'sensitivity': 0.13386880856760375, 'specificity': 0.9861139896373057}


### Inspired by the udacity solution

#### Bag of words approach

In [73]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#document -> word frequency matrix
def get_freq_matrix(documents, vectorizer):
    vectorized_msgs = vectorizer.fit_transform(documents)
    frequency_matrix = pd.DataFrame(vectorized_msgs.toarray(),
                                    columns = vectorizer.get_feature_names())
    return frequency_matrix

vectorizer = CountVectorizer(stop_words='english')
sample_docs = sms_df["msg"][:3]
get_freq_matrix(sample_docs, vectorizer)

Unnamed: 0,08452810075over18,2005,21st,87121,amore,apply,available,buffet,bugis,cine,...,receive,std,text,tkts,txt,wat,wif,win,wkly,world
0,0,0,0,0,1,0,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,1,1,1,0,1,0,0,0,0,...,1,1,1,1,1,0,0,1,1,0


#### TF-IDF

In [76]:
ti_vectorizer = TfidfVectorizer(stop_words='english')
get_freq_matrix(sample_docs, ti_vectorizer)

Unnamed: 0,08452810075over18,2005,21st,87121,amore,apply,available,buffet,bugis,cine,...,receive,std,text,tkts,txt,wat,wif,win,wkly,world
0,0.0,0.0,0.0,0.0,0.27735,0.0,0.27735,0.27735,0.27735,0.27735,...,0.0,0.0,0.0,0.0,0.0,0.27735,0.0,0.0,0.0,0.27735
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0
2,0.196116,0.196116,0.196116,0.196116,0.0,0.196116,0.0,0.0,0.0,0.0,...,0.196116,0.196116,0.196116,0.196116,0.196116,0.0,0.0,0.196116,0.196116,0.0


#### Udacity solution

##### Split into training and testing sets

In [82]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sms_df['msg'],
                                                    sms_df['spam_status'],
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(sms_df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


##### Apply BoF