In [46]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
# Relative path of the JSON file that pd.read_json loads into `data` below.
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
# Read the e-mail corpus from the JSON file into a DataFrame.
data = pd.read_json(path_or_buf=DATA_JSON_FILE)

In [4]:
# Display the loaded frame to eyeball its columns and row count.
data

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,00249.5f45607c1bffe89f60ba1ec9f878039a
1,ATTENTION: This is a MUST for ALL Computer Use...,1,00373.ebe8670ac56b04125c25100a36ab0510
2,This is a multi-part message in MIME format.\n...,1,00214.1367039e50dc6b7adb0f2aa8aba83216
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,00210.050ffd105bd4e006771ee63cabc59978
4,This is the bottom line. If you can GIVE AWAY...,1,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,"I'm one of the 30,000 but it's not working ver...",0,00609.dd49926ce94a1ea328cce9b62825bc97
5792,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,0,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",0,01127.841233b48eceb74a825417d8d918abf8
5794,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",0,01178.5c977dff972cd6eef64d4173b90307f0


In [6]:
# Order the rows by index label. Rebinding `data` (instead of inplace=True)
# keeps the cell idempotent on re-run. The in-place call returned None, so the
# cell rendered nothing; ending with data.head() shows a five-row preview.
data = data.sort_index()
data.head()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,00249.5f45607c1bffe89f60ba1ec9f878039a
1,ATTENTION: This is a MUST for ALL Computer Use...,1,00373.ebe8670ac56b04125c25100a36ab0510
2,This is a multi-part message in MIME format.\n...,1,00214.1367039e50dc6b7adb0f2aa8aba83216
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,00210.050ffd105bd4e006771ee63cabc59978
4,This is the bottom line. If you can GIVE AWAY...,1,00033.9babb58d9298daa2963d4f514193d7d6


In [9]:
# Bag-of-words vectorizer; stop_words='english' selects scikit-learn's built-in
# English stop-word list so those words are left out of the vocabulary.
vectorizer = CountVectorizer(stop_words='english')

In [10]:
# Learn the vocabulary from every message and encode each message as a row of
# token counts.
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also contains words that occur only in test messages — confirm this is intended.
all_features = vectorizer.fit_transform(data['MESSAGE'])

In [14]:
# Hold out 30% of the documents for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['CATEGORY'],
    test_size=0.3,
    random_state=88,
)

In [16]:
# Multinomial Naive Bayes classifier created with scikit-learn's default settings.
classifier = MultinomialNB()

In [17]:
# Train the Naive Bayes model on the training portion of the split.
classifier.fit(X=X_train, y=y_train)

MultinomialNB()

In [36]:
# Tally the test documents whose predicted label equals the true label.
nr_correct = np.sum(classifier.predict(X_test) == y_test)
print(f'{nr_correct} documents classfied correctly')

1660 documents classfied correctly


In [37]:
# Every test document that was not classified correctly counts as an error.
# y_test holds one label per test row, so its length is the test-set size.
nr_incorrect = len(y_test) - nr_correct
print(f'{nr_incorrect} documents classfied incorrectly')

79 documents classfied incorrectly


In [39]:
# Error rate on the test set; its complement is the accuracy printed below.
fraction_wrong = nr_incorrect / (nr_incorrect + nr_correct)
print(f'Testing Accurary of the model : {1 - fraction_wrong:.2%}')

Testing Accurary of the model : 95.46%


In [40]:
# Cross-check the hand-computed accuracy with the estimator's built-in scorer.
classifier.score(X=X_test, y=y_test)

0.9545715928694652

In [47]:
# Predict the test set once and reuse the labels for every metric, rather than
# re-running classifier.predict(X_test) separately for each score.
test_predictions = classifier.predict(X_test)

print(f'recall score: {recall_score(y_test, test_predictions)}')
print(f'precision : {precision_score(y_test, test_predictions)}')
print(f'f1 score: {f1_score(y_test, test_predictions)}')

recall score: 0.8646209386281588
precision : 0.9917184265010351
f1 score: 0.9238187078109932


In [48]:
# Hand-written messages used to spot-check the trained classifier.
example = [
    'get viagra for free now!',
    'need a mortgage? Reply to arrange a call with a specialist and get a quote',
    'Could you please help me with the project for tomorrow?',
    'Hello Jonathan, how about a game of golf tomorrow?',
]

In [50]:
# Encode the sample messages with the already-fitted vectorizer: transform()
# (not fit_transform()) reuses the vocabulary learned from data.MESSAGE, so the
# columns line up with the features the classifier was trained on.
doc_term_matrix = vectorizer.transform(example)


In [51]:
# Predict a CATEGORY label for each sample message.
# NOTE(review): presumably 1 = spam and 0 = legitimate, judging by the sample
# rows shown for `data` — confirm against the data preparation step.
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0])