In [14]:
import pandas as pd

In [15]:
msg = pd.read_csv('naivetext.csv', names = ['message', 'label'])
print('Total instances in the dataset:', msg.shape[0])

msg['labelnum'] = msg.label.map({'pos':1, 'neg':0})

X = msg.message
Y = msg.labelnum

print('\nThe message and its label of first 5 instances are listed below: \n')

X5, Y5 = X[0:5], msg.label[0:5]
for x, y in zip(X5, Y5):
    print(x, ',' ,y)

Total instances in the dataset: 18

The message and its label of first 5 instances are listed below: 

I love this sandwich , pos
This is an amazing place , pos
I feel very good about these beers , pos
This is my best work , pos
What an awesome view , pos


In [16]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

print('Dataset is split into Training and Testing samples')
print('Total training instances:', xtrain.shape[0])
print('Total testing instances:', xtest.shape[0])

Dataset is split into Training and Testing samples
Total training instances: 13
Total testing instances: 5


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)

print('Total features extracted using CountVectorizer:', xtrain_dtm.shape[1])
print('\nFeatures for first 5 training instances are listed below:\n')

df = pd.DataFrame(xtrain_dtm.toarray(), columns = count_vect.get_feature_names())
print(df[0:5])

Total features extracted using CountVectorizer: 44

Features for first 5 training instances are listed below:

   about  am  an  and  awesome  bad  beers  best  boss  can  ...  the  these  \
0      0   0   0    0        0    0      0     0     0    0  ...    0      0   
1      0   0   0    0        0    0      0     0     0    0  ...    0      0   
2      0   0   0    0        0    1      0     0     0    0  ...    0      0   
3      0   0   1    0        1    0      0     0     0    0  ...    0      0   
4      0   0   0    0        0    0      0     0     0    0  ...    0      0   

   this  tired  to  very  view  what  with  work  
0     1      0   0     0     0     0     0     0  
1     0      0   0     0     0     0     0     0  
2     0      0   1     0     0     0     0     0  
3     0      0   0     0     1     1     0     0  
4     0      0   1     0     0     0     0     0  

[5 rows x 44 columns]


In [18]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

print('Classstification results of testing samples are given below:\n')
for doc, p in zip(xtest, predicted):
    pred = 'pos' if p == 1 else 'neg'
    print('%s -> %s ' % (doc, pred))

Classstification results of testing samples are given below:

I am tired of this stuff -> neg 
We will have good fun tomorrow -> pos 
I went to my enemy's house today -> neg 
This is an amazing place -> pos 
What a great holiday -> pos 


In [19]:
from sklearn import metrics

print('Accuracy metrics\n')
print('Accuracy of the classifer:', metrics.accuracy_score(ytest, predicted))

print('Recall:', metrics.recall_score(ytest,predicted))
print('Precison:', metrics.precision_score(ytest, predicted))

print('\nConfusion matrix')
print(metrics.confusion_matrix(ytest, predicted))

Accuracy metrics

Accuracy of the classifer: 1.0
Recall: 1.0
Precison: 1.0

Confusion matrix
[[2 0]
 [0 3]]
