In [10]:
import pandas as pd

# import the class
from sklearn.neighbors import KNeighborsClassifier
# instantiate the model (with the default parameters)
knn = KNeighborsClassifier()

In [11]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(simple_train)
vect.get_feature_names()


['cab', 'call', 'me', 'please', 'tonight', 'you']

In [13]:
# transform TRAINING data into a 'document-term matrix'
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm


<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [14]:
# convert sparse matrix to a dense matrix
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [15]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [16]:
# check the type of the document-term matrix
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [8]:
# examine the sparse matrix contents
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [17]:
# example text for model testing
simple_test = ["please don't call me"]

# transform TESTING data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
print(vect.get_feature_names())
simple_test_dtm.toarray()

['cab', 'call', 'me', 'please', 'tonight', 'you']


array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [18]:
simple_test_dtm

<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [19]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [20]:
print(simple_test_dtm)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1


In [None]:
###
###
###
#### WORKING WITH THE URL AND CSV FILES PANDAS.
###
###
###

In [23]:
# alternative: read file into pandas from a URL
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(url, header=None, names=['labelID', 'messageText'])
sms.shape

(5572, 2)

In [24]:
sms.head(10)


Unnamed: 0,labelID,messageText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [25]:
# examine the class distribution
sms.labelID.value_counts()

ham     4825
spam     747
Name: labelID, dtype: int64

In [26]:
#convert label to a numerical variable
sms['labelNum'] = sms.labelID.map({'ham':0,'spam':1})

In [103]:
sms.head(10)

Unnamed: 0,labelID,messageText,labelNum
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [28]:
X = sms.messageText
y = sms.labelID
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [40]:
from sklearn.cross_validation import train_test_split
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y, random_state=1)
print(Xtrain.shape)
print(Xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [None]:
###
###
### VECTORIZING our DATASET
###
###

In [41]:
# instantiate the vectorizer
vect = CountVectorizer()

In [42]:
# learn the training data vocabuilary, then use it to create a document - term matrix
vect.fit(Xtrain)
Xtraindtm=vect.transform(Xtrain)

In [44]:
#equivalent: combine fit and transorfm into a single step
Xtraindtm = vect.fit_transform(Xtrain)

In [45]:
Xtraindtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [46]:
Xtestdtm = vect.transform(Xtest)

In [47]:
Xtestdtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [None]:
###
### Building and evaluating model
###

In [49]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()


In [50]:
# train the model
%time nb.fit(Xtraindtm, ytrain)

Wall time: 43 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
# class predictions for xtraindtm
ypredClass = nb.predict(Xtestdtm)


['ham' 'ham' 'ham' ..., 'ham' 'spam' 'ham']


In [60]:
# calculate accuracy
from sklearn import metrics
metrics.accuracy_score(ytest, ypredClass)

0.98851399856424982

In [71]:
#confusion matrix
metrics.confusion_matrix(ytest, ypredClass)


array([[1203,    5],
       [  11,  174]])

In [76]:
Xtest[3132]

"LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323."

In [65]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(Xtestdtm)[:, 1]
y_pred_prob

array([  2.87744864e-03,   1.83488846e-05,   2.07301295e-03, ...,
         1.09026171e-06,   1.00000000e+00,   3.98279868e-09])

In [82]:
# calculate AUC
#metrics.roc_auc_score(ytest, y_pred_prob)

In [None]:
#####################Compareing MODELS ##############################

In [81]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

# train the model using X_train_dtm
%time logreg.fit(Xtraindtm, ytrain)

# make class predictions for X_test_dtm
ypredClass = logreg.predict(Xtestdtm)

# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(Xtestdtm)[:, 1]
y_pred_prob

# calculate accuracy
metrics.accuracy_score(ytest, ypredClass)

# calculate AUC
#metrics.roc_auc_score(ytest, y_pred_prob)

Wall time: 42 ms


0.9877961234745154

In [99]:
### Part 7: Examining a model for further insight
###
###

# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
print("length of the features ", len(X_train_tokens))

# examine the first 50 tokens
print(X_train_tokens[0:10])

# examine the last 50 tokens
print(X_train_tokens[-50:])

# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

length of the features  7456
['00', '000', '008704050406', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400']
['yer', 'yes', 'yest', 'yesterday', 'yet', 'yetunde', 'yijue', 'ym', 'ymca', 'yo', 'yoga', 'yogasana', 'yor', 'yorge', 'you', 'youdoing', 'youi', 'youphone', 'your', 'youre', 'yourjob', 'yours', 'yourself', 'youwanna', 'yowifes', 'yoyyooo', 'yr', 'yrs', 'ything', 'yummmm', 'yummy', 'yun', 'yunny', 'yuo', 'yuou', 'yup', 'zac', 'zaher', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zindgi', 'zoe', 'zoom', 'zouk', 'zyada', 'èn', '〨ud']


array([[  0.,   0.,   0., ...,   1.,   1.,   1.],
       [  5.,  23.,   2., ...,   0.,   0.,   0.]])

In [100]:
# rows represent classes, columns represent tokens
print(nb.feature_count_.shape)

# number of times each token appears across all HAM messages
ham_token_count = nb.feature_count_[0,:]
ham_token_count

(2, 7456)


array([ 0.,  0.,  0., ...,  1.,  1.,  1.])

In [102]:
# number of times each token appears across all SPAM messages
spam_token_count = nb.feature_count_[1, :]
print(spam_token_count)

# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'ham':ham_token_count, 'spam':spam_token_count}).set_index('token')
print(tokens.head())

# examine 5 random DataFrame rows
tokens.sample(5, random_state=6)


# Naive Bayes counts the number of observations in each class
nb.class_count_

[  5.  23.   2. ...,   0.   0.   0.]
              ham  spam
token                  
00            0.0   5.0
000           0.0  23.0
008704050406  0.0   2.0
0121          0.0   1.0
01223585236   0.0   1.0


array([ 3617.,   562.])

In [105]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['ham'] = tokens.ham + 1
tokens['spam'] = tokens.spam + 1
print(tokens.sample(5, random_state=6))

# convert the ham and spam counts into frequencies
tokens['ham'] = tokens.ham / nb.class_count_[0]
tokens['spam'] = tokens.spam / nb.class_count_[1]
print(tokens.sample(5, random_state=6))

# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens.spam / tokens.ham
print(tokens.sample(5, random_state=6))

# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
print(tokens.sort_values('spam_ratio', ascending=False))

# look up the spam_ratio for a given token
print(tokens.loc['dating', 'spam_ratio'])

                   ham      spam  spam_ratio
token                                       
very          1.017971  1.005338    0.297044
nasty         1.000553  1.003559    6.435943
villa         1.000276  1.003559   12.871886
beloved       1.000553  1.001779    3.217972
textoperator  1.000276  1.005338   19.307829
                   ham      spam  spam_ratio
token                                       
very          0.000281  0.001789    0.297044
nasty         0.000277  0.001786    6.435943
villa         0.000277  0.001786   12.871886
beloved       0.000277  0.001783    3.217972
textoperator  0.000277  0.001789   19.307829
                   ham      spam  spam_ratio
token                                       
very          0.000281  0.001789    6.356076
nasty         0.000277  0.001786    6.455277
villa         0.000277  0.001786    6.457062
beloved       0.000277  0.001783    6.443832
textoperator  0.000277  0.001789    6.468510
             ham      spam  spam_ratio
token           