In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [20]:
# Relative path of the JSON file holding the e-mail data that this notebook loads.
DATA_JSON_FILE = "SpamData/01_Processing/email-text-data.json"

In [21]:
# Load the e-mail data from the JSON file named by DATA_JSON_FILE into a DataFrame.
data = pd.read_json(DATA_JSON_FILE)
# Preview the last five rows as this cell's output.
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
995,1,This is a multi-part message in MIME format.\n...,00497.353a61b265f11dd0bae116c0149abbe1
996,1,PROMOTE YOUR PRODUCT OR SERVICE TO MILLIONS TO...,00498.7f293b818e2e46d3a8bad44eda672947
997,1,<html>\n\n<head>\n\n</head>\n\n<body>\n\n\n\n<...,00499.257302b8f6056eb85e0daa37bfcd2c68
998,1,As to\n\n\n\n\n\n\n\nWant to refinance?\n\n\n\...,00500.87320162ab5b79f67978406cf909c3d1
999,1,"Dear Sirs,\n\nWe know your esteemed company in...",00501.32679091b0520132ad888ef3b134ce48


In [22]:
# (rows, columns) of the loaded frame.
data.shape

(5796, 3)

In [23]:
# The tail() preview of the freshly loaded frame ends at label 999 even though
# the frame has 5796 rows, i.e. the row labels are not in ascending order.
# Sort them, rebinding the name instead of mutating in place so the cell reads
# as a plain "input -> output" step.
data = data.sort_index()
# Preview the first five rows of the sorted frame.
data.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",00001.7848dde101aa985090474a91ec93fcf0
1,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,1,##############################################...,00004.eac8de8d759b7e74154f142194282724
4,1,I thought you might like these:\n\n1) Slim Dow...,00005.57696a39d7d84318ce497886896bf90d


In [24]:
# Bag-of-words vectorizer; stop_words='english' selects scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [25]:
# Learn the vocabulary from every message and build the document-term count matrix in one step.
# NOTE(review): this fit runs before train_test_split, so the vocabulary also
# reflects messages that later land in the test split — confirm that is acceptable.
all_features = vectorizer.fit_transform(data.MESSAGE)
# (documents, distinct tokens in the vocabulary)
all_features.shape

(5796, 102694)

In [26]:
# vectorizer.vocabulary_  # uncomment to inspect the learned token -> column-index mapping

In [27]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data['CATEGORY'],
    test_size=0.3,
    random_state=33,
)

In [28]:
# (rows, columns) of the training split.
X_train.shape

(4057, 102694)

In [29]:
# Multinomial Naive Bayes classifier created with its default hyper-parameters.
spam_classifier = MultinomialNB()

In [30]:
# Train on the training split; fit() returns the estimator, which the cell then displays.
spam_classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# Count the test rows whose predicted label equals the true label.
nr_correct = y_test.eq(spam_classifier.predict(X_test)).sum()

In [32]:
# Report how many test rows were predicted correctly.
print(f'{nr_correct} documents classified correctly')

1648 documents classified correctly


In [33]:
# Misclassified rows = every test row that was not counted as correct.
nr_incorrectly = y_test.size - nr_correct

# Message spelling fixed ("documments" -> "documents").
print(f'{nr_incorrectly} documents classified incorrectly')

91 documments classified incorrectly


In [34]:
# Accuracy is the share of test rows that were predicted correctly.
accuracy = nr_correct / len(y_test)

print(f'Accuracy of the classifier is: {accuracy:.2%}')

Accuracy of the classifier is: 94.77%


In [35]:
# The same accuracy can also be obtained from the built-in `score` method of the MultinomialNB classifier.

print(f'{spam_classifier.score(X_test, y_test):.2%}')

94.77%


In [36]:
# Run inference on the test split once and reuse the result for every metric,
# instead of calling predict() separately for each score.
test_predictions = spam_classifier.predict(X_test)

print(f'The recall score is {recall_score(y_test, test_predictions)}')

print(f'The precision score is {precision_score(y_test, test_predictions)}')

print(f'The f_1 score is {f1_score(y_test, test_predictions)}')

The recall score is 0.8444846292947559
The precision score is 0.989406779661017
The f_1 score is 0.911219512195122
