In [39]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [10]:
# Location of the e-mail text JSON file read by the next cell.
# The default is the original author's absolute local path; set the
# SPAM_DATA_JSON environment variable to point at your own copy instead of
# editing this cell.
# NOTE(review): "JOSON" is a typo for "JSON"; the name is kept because later
# cells reference it.
DATA_JOSON_FILE = os.environ.get(
    'SPAM_DATA_JSON',
    'C:/Users/amar/ML Projects/SpamData/01_Processing/email-text-data.json',
)

In [11]:
# Load the e-mail dataset into a DataFrame; the recorded preview shows the
# columns CATEGORY, MESSAGE and FILE_NAME.
data = pd.read_json(DATA_JOSON_FILE)

In [12]:
# Peek at the first rows to sanity-check the load.
data.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",00001.7848dde101aa985090474a91ec93fcf0
1,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,1,##############################################...,00004.eac8de8d759b7e74154f142194282724
4,1,I thought you might like these:\n\n1) Slim Dow...,00005.57696a39d7d84318ce497886896bf90d


In [13]:
# Inspect the last rows as well.
# NOTE(review): in the recorded output the final row (FILE_NAME 'cmds') has an
# empty MESSAGE; it looks like a non-e-mail artifact. Confirm, and consider
# dropping it before vectorizing.
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
6895,0,"Hi Gianni,\n\n\n\nA very good resource for thi...",02497.60497db0a06c2132ec2374b2898084d3
6896,0,Gianni Ponzi wrote:\n\n> I have a prob when tr...,02498.09835f512f156da210efb99fcc523e21
6897,0,Neale Pickett <neale@woozle.org> writes:\n\n\n...,02499.b4af165650f138b10f9941f6cc5bce3c
6898,0,"\n\nHi,\n\n\n\nI think you need to give us a l...",02500.05b3496ce7bca306bed0805425ec8621
6899,0,,cmds


In [14]:
# (rows, columns) of the loaded frame.
data.shape

(6900, 3)

In [16]:
# Bag-of-words token counter. stop_words='english' makes scikit-learn drop its
# built-in English stop-word list, so no separate stop-word removal step is
# needed.
vectorizer = CountVectorizer(stop_words='english')


In [17]:
# Learn the vocabulary from every message and build the sparse document-term
# count matrix in a single step.
# NOTE(review): the vocabulary is fitted on all rows, before the train/test
# split further down, so held-out messages influence the feature space.
all_features = vectorizer.fit_transform(data['MESSAGE'])

In [18]:
# (documents, vocabulary size) of the document-term matrix.
all_features.shape

(6900, 92347)

In [19]:
# Mapping of each token to its column index in the document-term matrix.
# NOTE(review): this displays the whole vocabulary (one entry per feature
# column); consider showing only a small sample to keep the output readable.
vectorizer.vocabulary_

{'doctype': 31626,
 'html': 43690,
 'public': 66564,
 'w3c': 84033,
 'dtd': 33009,
 'transitional': 79323,
 'en': 34875,
 'head': 42398,
 'meta': 55616,
 'content': 27527,
 '3d': 5705,
 'text': 77892,
 'charset': 25335,
 '3dwindows': 6600,
 '1252': 1755,
 'http': 43711,
 'equiv': 35375,
 '3dcontent': 6219,
 'ype': 88818,
 'mshtml': 57046,
 '00': 0,
 '2314': 3713,
 '1000': 1292,
 '3dgenerator': 6295,
 'body': 22309,
 'inserted': 47030,
 'calypso': 24236,
 'table': 77136,
 'border': 22468,
 '3d0': 5706,
 'cellpadding': 24955,
 'cellspacing': 24963,
 '3d2': 5845,
 'id': 44970,
 '3d_calyprintheader_': 6077,
 'ules': 80851,
 '3dnone': 6436,
 'style': 75896,
 'color': 26745,
 'black': 21859,
 'display': 31213,
 'width': 85580,
 '100': 1291,
 'tbody': 77432,
 'tr': 79199,
 'td': 77506,
 'colspan': 26763,
 '3d3': 5890,
 'hr': 43603,
 '3dblack': 6182,
 'noshade': 59209,
 'size': 73894,
 '3d1': 5737,
 'end': 34936,
 'font': 38284,
 '000000': 4,
 'face': 36728,
 '3dverdana': 6586,
 'arial': 18449

In [22]:
# Hold out 30% of the documents for testing; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['CATEGORY'],
    test_size=0.3,
    random_state=88,
)

In [23]:
# (training documents, features) after the split.
X_train.shape

(4830, 92347)

In [24]:
# (test documents, features) after the split.
X_test.shape

(2070, 92347)

In [26]:
# Multinomial Naive Bayes with scikit-learn's default hyper-parameters.
classifier = MultinomialNB()

In [27]:
# Fit the model on the training split only.
classifier.fit(X_train, y_train)

MultinomialNB()

CHALLENGE: Calculate the following for the test dataset:
* number of documents classified correctly
* number of documents classified incorrectly
* accuracy of the model

In [28]:
# Count the test documents whose predicted label equals the true label
# (element-wise comparison, then the sum of the True values).
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [29]:
# Report the number of correct test predictions.
# NOTE(review): "coreectly" in the message is a typo for "correctly".
print(f'{nr_correct} documents classified coreectly')

1963 documents classified coreectly


In [30]:
# Every test document that was not classified correctly.
nr_incorrect = y_test.size - nr_correct

In [31]:
# Report the number of wrong test predictions.
# NOTE(review): "incoreectly" in the message is a typo for "incorrectly".
print(f'{nr_incorrect} documents classified incoreectly')

107 documents classified incoreectly


In [32]:
# Share of misclassified test documents: errors divided by all test predictions.
fraction_wrong = nr_incorrect / (nr_incorrect + nr_correct)

In [37]:
# Accuracy is the complement of the error rate; `.2%` formats it as a
# percentage with two decimals.
print(f'the testing accuracy of the model is :{1-fraction_wrong:.2%}')

the testing accuracy of the model is :94.83%


In [38]:
# Cross-check the manual calculation: for classifiers, score() returns the mean
# accuracy on the given data and labels.
classifier.score(X_test, y_test)

0.9483091787439614

CHALLENGE: For the test dataset, calculate the following (search the scikit-learn documentation):
* recall
* precision
* f1 score


In [40]:
# Recall of the positive class on the test split. scikit-learn's default
# pos_label is 1 (presumably "spam" here; confirm against the CATEGORY
# encoding).
recall_score(y_test, classifier.predict(X_test))

0.8181818181818182

In [41]:
# Precision of the positive class on the test split. scikit-learn's default
# pos_label is 1 (presumably "spam" here; confirm against the CATEGORY
# encoding).
precision_score(y_test, classifier.predict(X_test))

0.9892241379310345

In [42]:
# F1 score (harmonic mean of precision and recall) of the positive class on the
# test split.
f1_score(y_test, classifier.predict(X_test))

0.8956097560975611

In [46]:
# Hand-written messages used to try the trained classifier on unseen text.
# The strings (including their misspellings) are kept exactly as written,
# because they are the model input.
examlple = [
    'get viagra for free now!',
    'need a mortage?, Repply to arrange call with a specialist and get a quate',
    'could you please help me with project for tommarrow',
    'hi jonathan how about game of golf tommarow?',
]

In [47]:
# Vectorize the new messages with the already-fitted vocabulary (transform, not
# fit_transform), so the columns match the features the classifier was trained
# on.
doc_term_matrix = vectorizer.transform(examlple)

In [48]:
# Predict a CATEGORY label for each example message.
classifier.predict(doc_term_matrix)

array([1, 0, 0, 0], dtype=int64)