In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
# Relative path of the JSON file holding the email text data loaded below.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
# Read the JSON file into a DataFrame.
data = pd.read_json(DATA_JSON_FILE)

In [4]:
# Preview the last rows of the freshly loaded frame.
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
4395,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
4396,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
4397,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
4398,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
4399,"Hi there,\n\n\n\nNow this is probably of no us...",0,01400.f897f0931e461e7b2e964d28e927c35e


In [5]:
# (number of rows, number of columns)
data.shape

(4400, 3)

In [29]:
# Order the rows by index label. The sorted frame is rebound to `data` instead of
# using inplace=True, which avoids in-place mutation of the frame.
data = data.sort_index()

In [7]:
# Check the last rows again now that the index has been sorted.
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
4395,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
4396,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
4397,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
4398,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
4399,"Hi there,\n\n\n\nNow this is probably of no us...",0,01400.f897f0931e461e7b2e964d28e927c35e


In [31]:
# Bag-of-words vectorizer; stop_words='english' drops English stop words.
vectorizer = CountVectorizer(stop_words='english')

In [37]:
# Learn the vocabulary from every message and build the sparse document-term matrix.
# NOTE(review): this fit runs before the train/test split, so words that occur only in
# test emails still enter the vocabulary — consider fitting on the training messages only.
all_features = vectorizer.fit_transform(data.MESSAGE)

In [39]:
# (number of emails, number of vocabulary terms)
all_features.shape

(4400, 73683)

In [33]:
# Mapping from each token to an integer index (see the dict displayed below).
vectorizer.vocabulary_

{'doctype': 23803,
 'html': 33826,
 'public': 52287,
 'w3c': 67619,
 'dtd': 24618,
 'transitional': 63740,
 'en': 26040,
 'head': 32672,
 'meta': 43295,
 'content': 20308,
 '3d': 4383,
 'text': 62584,
 'charset': 18468,
 '3dwindows': 4602,
 '1252': 1477,
 'http': 33841,
 'equiv': 26497,
 '3dcontent': 4505,
 'ype': 71436,
 'mshtml': 44643,
 '00': 0,
 '2314': 3055,
 '1000': 1093,
 '3dgenerator': 4524,
 'body': 16012,
 'inserted': 36108,
 'calypso': 17564,
 'table': 61903,
 'border': 16174,
 '3d0': 4384,
 'cellpadding': 18184,
 'cellspacing': 18189,
 '3d2': 4403,
 'id': 34610,
 '3d_calyprintheader_': 4470,
 'ules': 64965,
 '3dnone': 4556,
 'style': 60774,
 'color': 19585,
 'black': 15649,
 'display': 23432,
 'width': 68857,
 '100': 1092,
 'tbody': 62174,
 'tr': 63628,
 'td': 62235,
 'colspan': 19604,
 '3d3': 4417,
 'hr': 33742,
 '3dblack': 4487,
 'noshade': 46453,
 'size': 58834,
 '3d1': 4386,
 'end': 26098,
 'font': 28921,
 '000000': 3,
 'face': 27604,
 '3dverdana': 4598,
 'arial': 13073

In [12]:
# Hold out 30% of the emails for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data.CATEGORY,
    test_size=0.3,
    random_state=88,
)

In [13]:
# Size of the training split: (emails, vocabulary terms)
X_train.shape

(3080, 73683)

In [14]:
# Size of the test split: (emails, vocabulary terms)
X_test.shape

(1320, 73683)

In [41]:
# Multinomial naive Bayes model with default settings (not trained yet).
classifier = MultinomialNB()

In [43]:
# Train the model on the training split.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Challenge:** Calculate the following for the test dataset: <br>
The number of documents classified correctly. <br>
The number of documents classified incorrectly. <br>
The accuracy of the model. <br>

In [48]:
# Predict once and reuse the result; the original called predict() twice on X_test.
y_pred = classifier.predict(X_test)

# Number of test emails whose predicted label equals the true label.
nr_correct = (y_test == y_pred).sum()

# Display the predicted labels as the cell output.
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [45]:
# Report how many test emails were classified correctly.
# NOTE(review): "classfied" in the message is a typo for "classified"; left unchanged
# here so the recorded output below still matches what the cell prints.
print(f'{nr_correct} documents classfied correctly')

1290 documents classfied correctly


In [47]:
# Misclassified test emails = total test emails minus the correct predictions.
nr_incorrect = y_test.size - nr_correct

In [20]:
# Report how many test emails were misclassified.
print(f'Number of documents incorrectly classified is {nr_incorrect}')

Number of documents incorrectly classified is 30


In [49]:
# fraction_wrong is the error rate on the test set; the accuracy printed is 1 minus it.
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 97.73%


In [50]:
# Built-in score on the test split; it matches the accuracy computed by hand above.
classifier.score(X_test, y_test)

0.9772727272727273

**Challenge:** For the testing dataset, calculate the recall, precision and F1 score. Search the scikit-learn documentation on this topic to work it out.

In [51]:
# Recall on the test split: share of emails truly labelled 1 that were predicted as 1.
recall_score(y_test, classifier.predict(X_test))

0.8275862068965517

In [52]:
# Precision on the test split: share of emails predicted as 1 that are truly labelled 1.
precision_score(y_test, classifier.predict(X_test))

0.96

In [53]:
# F1 score on the test split: harmonic mean of precision and recall.
f1_score(y_test, classifier.predict(X_test))

0.888888888888889

### Another example

In [26]:
# Hand-written messages used to try the trained classifier on unseen text.
example = [
    'get viagra for free now!',
    'need a mortgage? Reply to arrange a call with a specialist and get a quote',
    'Could you please help me with the project for tomorrow?',
    'Hello Jonathan, how about a game of golf tomorrow?',
    'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.',
]

In [55]:
# Encode the examples with the already-fitted vectorizer (transform only, no refit).
doc_term_matrix = vectorizer.transform(example)

In [56]:
# Predict a label per example; in the output below only the 2nd email is flagged as spam (1).
classifier.predict(doc_term_matrix)

array([0, 1, 0, 0, 0], dtype=int64)