# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Constants

In [2]:
# Vocabulary size.
# NOTE(review): presumably the number of token columns in the feature
# matrix and the length of each probability vector — confirm.
VOCAB_SIZE = 2500

# Per-token probability vectors, stored as plain-text files.
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

# Test split: feature matrix and the matching target labels.
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'
# Backward-compatible alias: the original (misspelled) name is still the one
# referenced by the data-loading cell.
TEST_TRAGET_FILE = TEST_TARGET_FILE

# Load the data

In [3]:
# Test split: feature matrix and its labels, read from space-delimited text.
X_test, y_test = (
    np.loadtxt(path, delimiter=' ')
    for path in (TEST_FEATURE_MATRIX, TEST_TRAGET_FILE)
)

# Per-token probability vectors, in the order: spam, ham, all tokens.
prob_token_spam, prob_token_ham, prob_all_tokkens = (
    np.loadtxt(path, delimiter=' ')
    for path in (TOKEN_SPAM_PROB_FILE, TOKEN_HAM_PROB_FILE, TOKEN_ALL_PROB_FILE)
)

# Calculating Joint Probability

In [4]:
# Sanity check: the matrix-vector product should be 1-D, one value per test row.
print(
    'The dimensions of the dot product between X_test and prob_token_spam are '
    f'{np.dot(X_test, prob_token_spam).shape}'
)

The dimensions of the dot product between X_test and prob_token_spam are (1724,)


## Set the prior

$P(Spam|X) = \frac{P(X|Spam)P(Spam)}{P(X)}$

In [5]:
# Prior probability that an email is spam, P(Spam); the ham prior is taken
# as 1 - PROB_SPAM when the ham score is computed.
# NOTE(review): presumably the share of spam in the training data — confirm
# where 0.3109 comes from.
PROB_SPAM = 0.3109

```
We take logs because a logarithm turns multiplication and division into addition and subtraction.
The probabilities are also very close together, so taking the log spreads them out and makes the plots much neater.

```

In [6]:
# Element-wise natural log of the per-token spam probabilities
# (displayed only; nothing is assigned).
np.log(prob_token_spam)

array([ -4.40757517,  -5.25364998,  -4.99005241, ..., -10.30243704,
       -10.01475496, -12.0941965 ])

# Joint probability in log format

In [7]:
# Spam log-score per test row:
#   X_test . (log prob_token_spam - log prob_all_tokkens) + log PROB_SPAM
joint_log_spam = np.dot(
    X_test,
    np.log(prob_token_spam) - np.log(prob_all_tokkens),
) + np.log(PROB_SPAM)

In [8]:
# Peek at the spam log-scores of the first five test rows.
joint_log_spam[:5]

array([24.28521714,  2.16166327, 20.5925235 , 17.76088562, 20.51597073])

In [9]:
# Ham log-score per test row:
#   X_test . (log prob_token_ham - log prob_all_tokkens) + log(1 - PROB_SPAM)
joint_log_ham = np.dot(
    X_test,
    np.log(prob_token_ham) - np.log(prob_all_tokkens),
) + np.log(1 - PROB_SPAM)

In [10]:
# Peek at the ham log-scores of the first five test rows.
joint_log_ham[:5]

array([-60.97491033, -11.01100812, -37.96946378, -59.14004125,
       -53.79514888])

In [11]:
# Number of ham log-scores computed.
joint_log_ham.size

1724

# Making Predictions 

## Checking for higher probabilities

Classify the email as spam when $P(Spam|X)>P(Ham|X)$

Classify the email as ham when $P(Spam|X)<P(Ham|X)$

In [12]:
# 1 where the spam score is strictly greater than the ham score, else 0
# (so an exact tie is labelled 0).
predictions = (joint_log_spam > joint_log_ham).astype(int)
predictions[-5:]

array([1, 0, 0, 0, 0])

In [13]:
# True labels of the last five test rows, for comparison with the predictions.
y_test[-5:]

array([0., 0., 0., 0., 0.])

# Metrics and Evaluation

## Accuracy

In [14]:
# Count the rows where the predicted label equals the true label.
correct_docs = np.sum(np.equal(y_test, predictions))

In [15]:
# Report the number of correct classifications.
print('Docs classified correctly {}'.format(correct_docs))

Docs classified correctly 1685


In [16]:
# Everything that was not classified correctly is a misclassification.
num_docs_wrong = len(X_test) - correct_docs
print('Number of documents classified incorrectly {}'.format(num_docs_wrong))

Number of documents classified incorrectly 39


In [32]:
# Accuracy as a percentage of all test rows.
accuracy = correct_docs / len(X_test) * 100
print('Accuracy is {:.3f}%'.format(accuracy))

Accuracy is 97.738%


In [31]:
# Error rate is the complement of the accuracy percentage.
print('Fraction of emails misclassified {:.3f}%'.format(100 - accuracy))

Fraction of emails misclassified 2.262%


Demerits of accuracy <br>
1. If most of the outcomes are positive and we build a model that classifies every outcome as positive, we have a bad model that can lead to serious problems, yet its accuracy will still be high, which is misleading.

In [19]:
# Distinct predicted labels and how many test rows received each one.
np.unique(predictions,return_counts=True)

(array([0, 1]), array([1136,  588], dtype=int64))

In [20]:
# True positives: the email is spam and it is classified as spam.
true_pos = np.logical_and(y_test == 1, predictions == 1)
np.sum(true_pos)

569

In [21]:
# False positives: the email is not spam but it is classified as spam.
false_pos = np.logical_and(y_test == 0, predictions == 1)
np.sum(false_pos)

19

In [22]:
# False negatives: the email is spam but it is classified as not spam.
false_neg = np.logical_and(y_test == 1, predictions == 0)
np.sum(false_neg)

20

In [23]:
# True negatives: the email is not spam and it is classified as not spam.
true_neg = np.logical_and(y_test == 0, predictions == 0)
np.sum(true_neg)

1116

In [24]:
# Layout: rows are the predicted class (spam, then ham) and columns are the
# actual class (spam, then ham), i.e. [[TP, FP], [FN, TN]].
confusion_matrix = np.array([
    [mask.sum() for mask in row]
    for row in ((true_pos, false_pos), (false_neg, true_neg))
])
confusion_matrix

array([[ 569,   19],
       [  20, 1116]])

$Recall = \frac{TP}{TP+FN}$ 

We can think of this as: out of all the emails that really are spam, how many did we correctly classify as spam?

A weakness of the recall score is that it can easily be maximized (manipulated) by labelling every email as spam.

In [25]:
# Recall = TP / (TP + FN).
recall = np.sum(true_pos) / (np.sum(true_pos) + np.sum(false_neg))

In [30]:
# Report recall to three decimal places.
print('The recall score is {:.3f}'.format(recall))

The recall score is 0.966


$Precision = \frac{TP}{TP+FP}$

Ratio of correctly classified spam messages to the total number of times we predicted spam

In [29]:
# Precision = TP / (TP + FP).
precision = np.sum(true_pos) / (np.sum(true_pos) + np.sum(false_pos))
print('The precision is {:.3f}'.format(precision))

The precision is 0.968


F Score = $2 \times \frac{precision \times recall}{precision + recall}$

The F score is the harmonic mean of precision and recall, so it takes both the false positives and the false negatives into account. It also always has a value between 0 and 1, so it provides a universal scale for comparison.

In [33]:
# F1 = 2 * (precision * recall) / (precision + recall), the harmonic mean.
f1_score = 2 * (precision * recall / (precision + recall))
print('The F1 score is {:.3f}'.format(f1_score))

The F1 score is 0.967
