In [228]:
import os
import re

import numpy as np

from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from collections import Counter

import random

## Loading Data

The first step is to load our sample data for both spam and ham. For this, we created a utility called `loader` which, given an input directory path, walks the directory tree, reads the content of every file it finds, and appends each file's content to a Python list.

In [229]:
def loader(file_input):
    """Read every file under a directory tree and return the contents as a list.

    Parameters
    ----------
    file_input : str
        Path of the root directory to walk; files in subdirectories are
        included.

    Returns
    -------
    list of str
        One entry per file, decoded as latin-1. Directories and files are
        visited in sorted order so repeated runs produce the same list
        regardless of the order the filesystem reports entries in. The list
        is empty when the walk yields no files.
    """
    data = []
    for dirpath, dirnames, filenames in os.walk(file_input):
        # os.walk reports entries in arbitrary order; sorting dirnames in
        # place also fixes the order in which subdirectories are descended.
        dirnames.sort()
        for file in sorted(filenames):
            path = os.path.join(dirpath, file)
            # The with-block closes the file, so no explicit close() is needed.
            with open(path, encoding='latin-1') as f:
                data.append(f.read())
    return data

In [230]:
# Directory holding the ham messages; loader returns one string per file found.
file_input = './data/enron1/ham'
ham = loader(file_input)

In [231]:
# Directory holding the spam messages; note that `file_input` is rebound here.
file_input = './data/enron1/spam'
spam = loader(file_input)

In [232]:
# Tokenize
patt = re.compile(r'\W')
stops = set(stopwords.words('english'))

def process_words(data, *, lowercase=False, remove_stopwords=False,
                  remove_special=False, remove_digits=False):
    """Turn raw text into a bag-of-words feature dict.

    Parameters
    ----------
    data : str
        Text to tokenize with ``word_tokenize``.
    lowercase : bool, optional
        Lowercase every token before any filtering.
    remove_stopwords : bool, optional
        Drop tokens that appear in ``stops``.
    remove_special : bool, optional
        Drop tokens containing any character matched by ``patt`` (``\\W``).
    remove_digits : bool, optional
        Drop tokens for which ``str.isdigit()`` is true.

    Returns
    -------
    dict
        Maps each distinct remaining token to ``True``.

    All optional steps default to off, in which case the result contains
    exactly the tokens produced by ``word_tokenize``.
    """
    words = word_tokenize(data)

    # Optional clean-up steps, applied in a fixed order when enabled.
    if lowercase:
        words = [word.lower() for word in words]
    if remove_stopwords:
        words = [word for word in words if word not in stops]
    if remove_special:
        words = [word for word in words if not patt.search(word)]
    if remove_digits:
        words = [word for word in words if not word.isdigit()]

    # Presence-only features: repeated tokens collapse into a single key.
    return dict.fromkeys(words, True)

In [233]:
# Pair every message's feature dict with its class label.
ham_data = [(features, 'ham') for features in map(process_words, ham)]
spam_data = [(features, 'spam') for features in map(process_words, spam)]

# The combined list holds the spam examples first, then the ham examples.
all_data = [*spam_data, *ham_data]
print('done')

done


In [234]:
# Convert the (features, label) pairs to a 2-D array so the columns can be sliced apart.
all_data = np.array(all_data)
X, y = all_data[:, 0], all_data[:, 1]

# Hold out 33% of the examples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [235]:
# Train NLTK's Naive Bayes classifier on the (feature dict, label) training pairs.
clf = NaiveBayesClassifier.train(zip(X_train, y_train))
print('done')

done


In [236]:
# Print the features the trained classifier reports as most informative.
clf.show_most_informative_features()

Most Informative Features
               forwarded = True              ham : spam   =    164.7 : 1.0
                     hou = True              ham : spam   =    157.0 : 1.0
                  farmer = True              ham : spam   =    112.3 : 1.0
            prescription = True             spam : ham    =    111.7 : 1.0
                     nom = True              ham : spam   =    100.1 : 1.0
                    pain = True             spam : ham    =     88.2 : 1.0
                    2001 = True              ham : spam   =     86.1 : 1.0
                creative = True             spam : ham    =     71.4 : 1.0
                     ect = True              ham : spam   =     69.2 : 1.0
                  health = True             spam : ham    =     64.7 : 1.0


In [237]:
# Spot-check: run a short hand-written message through the same feature
# extraction used for training, then classify it.
text = 'fake babe is amazing'
text_data = process_words(text)
clf.classify(text_data)

'spam'

In [238]:
# Second spot-check with a different first word; `text` and `text_data`
# are rebound here.
text = 'december is amazing'
text_data = process_words(text)
clf.classify(text_data)

'ham'

In [239]:
# Predict a label for every held-out feature dict, in the same order as X_test.
y_pred = list(map(clf.classify, X_test))

In [240]:
# Tally how often each label occurs in the ground truth and in the predictions.
actual_counts, predicted_counts = Counter(y_test), Counter(y_pred)

print(f'Actual count: {actual_counts.most_common()}')
print(f'Predicted count: {predicted_counts.most_common()}')

Actual count: [('ham', 1191), ('spam', 516)]
Predicted count: [('ham', 1119), ('spam', 588)]


## Confusion Matrix

The confusion matrix is a tabular structure that helps visualize the performance of classifiers. Each column in the matrix represents classified instances based on predictions, and each row of the matrix represents classified instances based on the actual class labels.

|            | p' (Predicted) | n' (Predicted) |
|------------|----------------|----------------|
| p (Actual) | True Positive  | False Negative |
| n (Actual) | False Positive | True Negative  |

- __True Positive (TP)__ indicates the number of correct hits or predictions for our positive class.
- __False Negative (FN)__ indicates the number of instances we missed for that class by predicting it falsely as the negative class.
- __False Positive (FP)__ is the number of instances we predicted wrongly as the positive class when it was actually not.
- __True Negative (TN)__ is the number of instances we correctly predicted as the negative class.

In [241]:
# Listing 'spam' first in `labels` puts it in the first row and first column.
cm = confusion_matrix(y_test, y_pred, labels=['spam', 'ham'])
cm

array([[ 512,    4],
       [  76, 1115]])

In [242]:
# Unpack the 2x2 matrix row by row: first row is [TP, FN], second row is [FP, TN].
(true_positive, false_negative), (false_positive, true_negative) = cm

print(f'true_positive = {true_positive}')
print(f'true_negative = {true_negative}')
print(f'false_positive = {false_positive}')
print(f'false_negative = {false_negative}')

true_positive = 512
true_negative = 1115
false_positive = 76
false_negative = 4


## Accuracy

Accuracy is the proportion of all predictions that the model got right, which can be depicted by the formula:

$Accuracy = \cfrac{TP + TN }{TP + FP + FN + TN}$

In [243]:
# Library value first, then the same ratio rebuilt from the confusion-matrix counts.
accuracy = accuracy_score(y_test, y_pred)
accuracy_manual = (true_positive + true_negative) / sum(
    [true_positive, true_negative, false_positive, false_negative]
)

print(f'Accuracy = {accuracy * 100:.2f}%')
print(f'Manually computed accuracy = {accuracy_manual * 100:.2f}%')

Accuracy = 95.31%
Manually computed accuracy = 95.31%


## Precision

Precision is the proportion of instances predicted as the positive class that actually belong to it. This is also known as the _positive predictive value_ and can be depicted by the formula:

$Precision = \cfrac{TP}{TP + FP}$

In [244]:
# 'spam' is treated as the positive class.
precision = precision_score(y_test, y_pred, pos_label='spam')

# Same ratio rebuilt from the confusion-matrix counts: TP over everything predicted positive.
precision_manual = true_positive / (false_positive + true_positive)

print(f'Precision = {precision * 100:.2f}%')
print(f'Manually computed precision = {precision_manual * 100:.2f}%')

Precision = 87.07%
Manually computed precision = 87.07%


## Recall

Recall is the proportion of actual positive-class instances that were correctly predicted as positive. This is also known as the _hit rate_, _coverage_, or _sensitivity_ and can be depicted by the formula:

$Recall = \cfrac{TP}{TP + FN}$

In [245]:
# 'spam' is treated as the positive class.
recall = recall_score(y_test, y_pred, pos_label='spam')

# Same ratio rebuilt from the confusion-matrix counts: TP over everything actually positive.
recall_manual = true_positive / (false_negative + true_positive)

print(f'Recall = {recall * 100:.2f}%')
print(f'Manually computed recall = {recall_manual * 100:.2f}%')

Recall = 99.22%
Manually computed recall = 99.22%


## F1 Score

F1 Score is another accuracy measure that is computed by taking the harmonic mean of the precision and recall and can be represented as follows:

$F1\ Score = \cfrac{2 \times Precision \times Recall}{Precision + Recall}$

In [246]:
# Library value, with 'spam' as the positive class.
f1 = f1_score(y_true = y_test,
              y_pred = y_pred,
              pos_label = 'spam')

# Build the manual value from the hand-computed precision and recall rather
# than the sklearn ones, so it is an independent check on f1_score.
f1_manual = (2 * precision_manual * recall_manual) / (precision_manual + recall_manual)

print('F1-Score = {:.2f}%'.format(f1 * 100))
print('Manually computed F1-Score = {:.2f}%'.format(f1_manual * 100))

F1-Score = 92.75%
Manually computed F1-Score = 92.75%
