In [3]:
# Toy corpus: three short messages used to show how text becomes numeric features.
sample_train = [
    'call me tonight',
    'Call me cab',
    'please call me....PLEASE!',
]

In [4]:
# Vectorise the text: CountVectorizer splits each document into word tokens
# and learns a vocabulary from them.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

# fit() learns the vocabulary from the training documents; the fitted
# vectorizer is the cell's last expression, so its repr is displayed below.
vect.fit(sample_train)

# Note the default parameters shown in the repr, e.g. lowercase=True and a
# token_pattern that only keeps tokens of two or more word characters.

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Vocabulary learned by fit(), in the column order of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps the displayed value a plain list of Python strings.
(vect.get_feature_names_out().tolist()
 if hasattr(vect, 'get_feature_names_out')
 else vect.get_feature_names())

['cab', 'call', 'me', 'please', 'tonight']

In [6]:
# Encode the training documents as a document-term matrix (dtm): one row per
# document, one column per vocabulary token, each cell a token count.
simple_train_dtm = vect.transform(sample_train)
simple_train_dtm

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [11]:
# Convert the sparse matrix to a dense array so the counts can be read directly.
simple_train_dtm.toarray()

array([[0, 1, 1, 0, 1],
       [1, 1, 1, 0, 0],
       [0, 1, 1, 2, 0]])

In [9]:
import warnings
warnings.filterwarnings("always")

import pandas as pd

# Label the dense counts with the vocabulary so each column is readable.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on older releases.
simple_train_columns = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
pd.DataFrame(simple_train_dtm.toarray(), columns=simple_train_columns)

Unnamed: 0,cab,call,me,please,tonight
0,0,1,1,0,1
1,1,1,1,0,0
2,0,1,1,2,0


In [12]:
# Check the container type returned by transform().
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [13]:
print(simple_train_dtm)
# Instead of printing every cell, a sparse matrix prints only
# (row, column) -> value pairs for the non-zero entries.

  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [15]:
# A new, unseen document: transform() (without re-fitting) encodes it using
# the vocabulary that was learned from the training documents.
simple_test = ["please don't call me"]
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm

<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [16]:
# Dense view of the single test row.
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0]])

In [20]:
# Note: "don't" contributes nothing to the test row. transform() only counts
# tokens that are in the vocabulary learned from the training data, so any
# token it has never seen is silently dropped; a model trained on that
# vocabulary would have no learned relationship between such a token and the label.
#
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name on older releases.
simple_test_columns = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
pd.DataFrame(simple_test_dtm.toarray(), columns=simple_test_columns)

Unnamed: 0,cab,call,me,please,tonight
0,0,1,1,1,0


In [22]:
# Working with a real data set.
# sms.tsv is a tab-separated file with no header row: each line holds a
# label (ham / spam) followed by the text of one SMS message.
sms = pd.read_table(
    'data/sms.tsv',
    header=None,
    names=['label', 'message'],
)

In [23]:
# (rows, columns) of the loaded frame.
sms.shape

(5572, 2)

In [25]:
# Peek at the first ten rows to check the label and message columns.
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [26]:
# Encode the text label as a number for the classifier: ham -> 0, spam -> 1.
sms['label_num'] = sms['label'].map({'ham': 0, 'spam': 1})

In [27]:
# Confirm the new label_num column lines up with the text labels.
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [30]:
# X holds the raw message text and y the numeric label.
# Note: X is deliberately one-dimensional. CountVectorizer takes a 1-D
# collection of documents and expands it into the feature columns itself,
# so always pass it a 1-D object rather than a 2-D frame.
X = sms['message']
y = sms['label_num']
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [32]:
# Split BEFORE vectorising. The vocabulary must be learned from the training
# messages only, so that the test set behaves like real unseen data and may
# contain words the vectorizer has never encountered. Vectorising first would
# leak the test-set words into the vocabulary.

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection since 0.18. Fall back only for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state fixes the shuffle for reproducibility; stratify=y keeps the
# ham/spam proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

print(X_train.shape)
print(y_train.shape)

(4179,)
(4179,)


In [33]:
# Vectorise the training messages: learn the vocabulary from X_train, then
# encode X_train with it. fit() returns the vectorizer itself, so the two
# steps can be chained.
vect = CountVectorizer()
X_train_dtm = vect.fit(X_train).transform(X_train)

# In the displayed shape, 4179 is the number of samples and 7456 is the
# number of words (tokens) in the vocabulary.
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [37]:
# Encode the test messages with the vocabulary fitted on the training data
# (transform only, no re-fit) so both matrices share the same columns.
X_test_dtm = vect.transform(X_test)
X_test_dtm.shape

(1393, 7456)

In [36]:
# Alternatively, learn the vocabulary and encode the training data in one call.
X_train_dtm = vect.fit_transform(X_train)
# This is typically faster than calling fit() and transform() separately.

In [54]:
# Use a Multinomial Naive Bayes classifier on the token counts.

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()



In [55]:
# Train on the training document-term matrix; %time reports how long the fit takes.
%time nb.fit(X_train_dtm, y_train)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.21 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
# Predict a class label (0 = ham, 1 = spam) for every test message.
y_pred = nb.predict(X_test_dtm)

In [57]:
from sklearn import metrics

# Fraction of test messages whose predicted label matches the true label.
metrics.accuracy_score(y_test,y_pred)

0.9885139985642498

In [58]:
metrics.confusion_matrix(y_test,y_pred)

# By definition, a confusion matrix C is such that C[i, j] is the number of
# observations known to be in group i but predicted to be in group j.
# So in binary classification:
#   C[0, 0] = true negatives     C[0, 1] = false positives
#   C[1, 0] = false negatives    C[1, 1] = true positives

array([[1203,    5],
       [  11,  174]])

In [59]:
# Show all false positives: ham messages (y_test == 0) predicted as spam (y_pred == 1).
# y_pred and y_test have one entry per TEST row, so the mask must index X_test;
# indexing X_train with it (as the earlier commented-out attempt did) is wrong
# because the lengths and row labels do not match.
# Equivalent shorthand for 0/1 labels: X_test[y_pred > y_test]
X_test[(y_pred == 1) & (y_test == 0)]

In [60]:
# Show all false negatives: spam messages (y_test == 1) predicted as ham (y_pred == 0).
# With 0/1 labels this is exactly the rows where the prediction is lower than the truth.
# The mask has one entry per TEST row, so it must index X_test, not X_train.
X_test[y_pred < y_test]