In [139]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [140]:
df = pd.DataFrame.from_csv("../data/SMSSpamCollection.tsv", sep = '\t', header = 0, index_col = None) 

In [141]:
#This looks at the top 10 entries of the 'label' feature that is equivalent
#to 'spam'

df[df.label=='spam'].head(10)


Unnamed: 0,label,msg
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
12,spam,URGENT! You have won a 1 week FREE membership ...
15,spam,"XXXMobileMovieClub: To use your credit, click ..."
19,spam,England v Macedonia - dont miss the goals/team...
34,spam,Thanks for your subscription to Ringtone UK yo...
42,spam,07732584351 - Rodger Burns - MSG = We tried to...


In [142]:
#df.label.value_counts()
df.msg.value_counts()

#in the panda 'dataframe', for the 'msg' feature, these are the number
#of different values there 

Sorry, I'll call later                                                                                                                                                                                                                                                                                                                  30
I cant pick the phone right now. Pls send a message                                                                                                                                                                                                                                                                                     12
Ok...                                                                                                                                                                                                                                                                                                                                   10
7 wonde

In [143]:
df.label.describe()

#the .describe() gives us a very brief description/blurb about information
#behind the feature that we are looking at (label)

count     5572
unique       2
top        ham
freq      4825
Name: label, dtype: object

In [144]:
df['label'] = df.label.map({'ham': 0, 'spam':1})

#we are looking at 'label' feature, and then if there is a 'ham'/'spam'
#value in either of them, we then give it a binary value of 0 or 1

In [145]:
df.head()

Unnamed: 0,label,msg
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [146]:
from sklearn.cross_validation import train_test_split

#understand that train_test_split outputs 4 values
X_train, X_test, y_train, y_test = train_test_split(df.msg, df.label, random_state = 1)

#the first value is the training data's features
#the second value is the training data's class

print X_train.describe()
print X_test.describe()
print y_train.describe()
print y_test.describe()

count                       4179
unique                      3937
top       Sorry, I'll call later
freq                          20
Name: msg, dtype: object
count                       1393
unique                      1356
top       Sorry, I'll call later
freq                          10
Name: msg, dtype: object
count    4179.000000
mean        0.134482
std         0.341210
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64
count    1393.000000
mean        0.132807
std         0.339488
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64


In [176]:
print X_test.shape

(1393,)


In [148]:
#Now we need to convert the text feature into feature vectors which can
#be used for machine learning purposes

from sklearn.feature_extraction.text import CountVectorizer

In [149]:
train_simple = ['call you tonight',
                'Call me a cab',
               'please call me... PLEASE!']

#user-generated test data

In [150]:
vect = CountVectorizer(decode_error = 'ignore')
vect.fit(train_simple) #fits the data into the CountVectorizer
vect.get_feature_names()

[u'cab', u'call', u'me', u'please', u'tonight', u'you']

In [151]:
print train_simple
train_simple_dtm = vect.transform(train_simple)
#The transform applies the data into the features
train_simple_dtm.toarray()

pd.DataFrame(train_simple_dtm.toarray(), columns = vect.get_feature_names())

#print train_simple_dtm

['call you tonight', 'Call me a cab', 'please call me... PLEASE!']


Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [152]:
test_simple = ["please don't call me"]
test_simple_dtm = vect.transform(test_simple)

test_simple_dtm.toarray()
pd.DataFrame(test_simple_dtm.toarray(), columns = vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [153]:
train_exp = ['where is my taco?',
                'did I eat the taco',
                'I can easily eat my way through that whole box of tacos!',
                'I think way too much about tacos, huh',
                'taco, taco, taco!!!'                
               ]
test_exp = [
    'where did he go?', 'how long did the whole thing last', 'lets go eat one taco or multiple tacos'
]

In [154]:
#Vectorize the Text

vect = CountVectorizer(decode_error = 'ignore') #defined vect

In [155]:
vect.fit(train_exp) #you will need to first fit the data before transforming
train_exp_dtm = vect.transform(train_exp)
train_exp_pd = pd.DataFrame(train_exp_dtm.toarray(), columns = vect.get_feature_names())
train_exp_dtm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1],
       [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [156]:
#not sure how to 'show word count'

In [157]:
vect.fit(test_exp)
test_exp_dtm = vect.transform(test_exp)

In [158]:
#Returning back to Vectorizing our SMS Dataset
X_train

vect = CountVectorizer(decode_error = 'ignore')
vect.fit(X_train)
vect.get_feature_names()

X_train_dtm = vect.transform(X_train)
#X_test_dtm = vect.transform(X_test) 

#question: why are we able to do this without having to fit the data?
#print X_train.describe()
print y_train.describe()

count    4179.000000
mean        0.134482
std         0.341210
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64


In [159]:
train_features = vect.get_feature_names()
len(train_features)

7456

In [160]:
X_train_arr = X_train_dtm.toarray()



In [178]:
#Exercise: Calculate the number of tokens in the 0th message in train_arr

#print len(X_train_arr)
print X_train_dtm.shape
    

(4179, 7456)


NAIVE BAYES MODEL PORTION

In [162]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train) #note how it is still in its transformed state and not
#in its toarray state

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [179]:
#predict only takes in array
vect.fit(X_train)
test_exp = vect.transform(X_test)

preds = nb.predict(test_exp)
preds

print len(preds)
print len(y_train)
print test_exp.shape

1393
4179
(1393, 7456)


In [181]:
from sklearn import metrics

print metrics.accuracy_score(y_test, preds)
print metrics.confusion_matrix(y_test, preds)

0.988513998564
[[1203    5]
 [  11  174]]


In [187]:
from sklearn.cross_validation import cross_val_score
nb = MultinomialNB()

vect = CountVectorizer(decode_error = 'ignore')
vect.fit(df.msg)

X_dtm = vect.transform(df.msg)
y = df.label

cross_val_score(nb, X_dtm, y, cv = 5) #cross validation to score our model

array([ 0.98026906,  0.98026906,  0.97845601,  0.98114901,  0.97935368])

In [193]:
#Exercise: Calculating the spamminess of each token:

df_ham = df[df.label==0] #all items that have a label of 0
df_spam = df[df.label==1] #all items that have a label of 1

vect.fit(df.msg)
all_features = vect.get_feature_names()

ham_dtm = vect.transform(df_ham.msg) #created a document-term matrix of spam
ham_arr = ham_dtm.toarray() #array form of the DTM

spam_dtm = vect.transform(df_spam.msg)
spam_arr = spam_dtm.toarray()

In [194]:
ham_counts = np.sum(ham_arr, axis = 0)
ham_counts

array([0, 0, 1, ..., 1, 0, 1])

In [195]:
spam_counts = np.sum(spam_arr, axis = 0)
spam_counts

array([10, 29,  0, ...,  0,  1,  0])

In [197]:
all_token_counts = pd.DataFrame({'token': all_features, 'ham': ham_counts, 'spam': spam_counts})
all_token_counts.head()

Unnamed: 0,ham,spam,token
0,0,10,00
1,0,29,000
2,1,0,000pes
3,0,2,008704050406
4,0,1,0089


In [199]:
all_token_counts['ham'] = all_token_counts.ham + 1
all_token_counts['spam'] = all_token_counts.spam + 1

In [202]:
all_token_counts['spam_ratio'] = all_token_counts.spam / all_token_counts.ham
all_token_counts.sort_index(by = 'spam_ratio', ascending = False)

type(all_token_counts)

  from ipykernel import kernelapp as app


pandas.core.frame.DataFrame