## Add extension to measure execution time

In [1]:
%load_ext autotime

## Imports

In [87]:
import numpy as np
from math import log
import re
from pathlib import Path
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

time: 4.41 s


## Dataset

In [3]:
# Toy corpus: six short, already-tokenized posts.
documents = [
    ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
]
# One class label per post, in the same order: 1 = abusive, 0 = not abusive.
labels = [0, 1, 0, 1, 0, 1]

time: 1.92 ms


## Creating Vocabulary

In [4]:
def create_vocab(documents):
    """
    Build the vocabulary of a corpus.

    Returns the set of distinct tokens that occur in at least one of the
    given documents (the union of the documents' tokens). An empty corpus
    yields an empty set.
    """
    return set().union(*map(set, documents))

time: 134 ms


In [5]:
# Build the vocabulary once and print that same object, instead of
# recomputing it a second time just for display.
vocab = create_vocab(documents)
print(f"Vocabulary in the documents : {vocab}")

Vocabulary in the documents : {'so', 'dog', 'garbage', 'stop', 'mr', 'dalmation', 'not', 'posting', 'cute', 'steak', 'licks', 'how', 'park', 'stupid', 'has', 'him', 'worthless', 'quit', 'I', 'to', 'buying', 'my', 'maybe', 'ate', 'help', 'take', 'love', 'problems', 'please', 'food', 'flea', 'is'}
time: 102 ms


## Creating vector from a document

In [6]:
def word2vec(document, vocab):
    """
    Encode a document as a set-of-words (presence/absence) vector.

    Parameters
    ----------
    document : iterable of tokens to encode.
    vocab : collection of known words; positions in the result follow the
        iteration order of ``list(vocab)``.

    Returns
    -------
    list of int with one entry per vocabulary word: 1 if the word occurs in
    ``document`` at least once, 0 otherwise. Tokens that are not in the
    vocabulary are ignored.
    """
    vocab_list = list(vocab)
    # Build the word -> position map once, rather than scanning the list with
    # .index() for every token. setdefault keeps the first position of a
    # repeated word, which is what list.index() would have returned.
    position = {}
    for idx, word in enumerate(vocab_list):
        position.setdefault(word, idx)

    word_vector = [0] * len(vocab_list)
    for token in document:
        idx = position.get(token)
        if idx is not None:
            word_vector[idx] = 1
    return word_vector

time: 84.7 ms


In [7]:
# Sanity check: encode the first post and show it next to the vocabulary and
# the raw tokens so the 1-entries can be verified by eye.
first_document = documents[0]
first_vector = word2vec(first_document, vocab)
print(f"Word vector for first document is : \n{first_vector}")
print(f"Vocabulary being : \n{vocab}")
print(f"Document being : \n{first_document}")
print("Looks good!")

Word vector for first document is : 
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0]
Vocabulary being : 
{'so', 'dog', 'garbage', 'stop', 'mr', 'dalmation', 'not', 'posting', 'cute', 'steak', 'licks', 'how', 'park', 'stupid', 'has', 'him', 'worthless', 'quit', 'I', 'to', 'buying', 'my', 'maybe', 'ate', 'help', 'take', 'love', 'problems', 'please', 'food', 'flea', 'is'}
Document being : 
['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']
Looks good!
time: 248 ms


## Conditional probabilities

In [8]:
# Encode every post in the toy corpus as a presence/absence vector over `vocab`.
document_vectors = [
    word2vec(tokens, vocab)
    for tokens in documents
]

time: 84.4 ms


In [9]:
# Corpus dimensions: number of training documents and number of features
# (one feature per vocabulary word).
len_train_docs, len_words_vocab = len(document_vectors), len(vocab)
print(f"There are {len_train_docs} documents")
print(f"There are {len_words_vocab} features")

There are 6 documents
There are 32 features
time: 52.8 ms


In [10]:
# Class prior P(abusive): labels are 0/1, so their sum is the number of
# abusive documents; dividing by the document count gives the fraction.
p_abusive = sum(labels) / len_train_docs
print(f"Probability of a document being abusive is {p_abusive}")

Probability of a document being abusive is 0.5
time: 1.81 ms


In [11]:
# Per-class accumulators: a word-count vector (numerator) and a running
# total of words seen (denominator) for class 0 and class 1.
p0_num = np.zeros(len_words_vocab)
p1_num = np.zeros(len_words_vocab)
p0_denom = p1_denom = 0

time: 703 µs


In [12]:
# Accumulate word occurrences per class.
# NOTE: this cell adds onto the accumulators created in the previous cell, so
# re-running it without re-running the initialisation double-counts.
for index in range(len_train_docs):
    doc_vector = document_vectors[index]
    word_total = sum(doc_vector)  # number of vocabulary words present in this document
    if labels[index] != 1:
        # Document belongs to class 0
        p0_num += doc_vector
        p0_denom += word_total
    else:
        # Document belongs to class 1
        p1_num += doc_vector
        p1_denom += word_total

time: 1.48 ms


In [13]:
# Normalise the per-class word counts by the per-class word totals to get
# the conditional probability of each word given the class.
p1_vect, p0_vect = p1_num / p1_denom, p0_num / p0_denom
print(f"Condititonal probability vector for class 1 : \n {p1_vect}")
print(f"Condititonal probability vector for class 0 : \n {p0_vect}")
print("Conditional probability here is : Probability of occurance of the word, if it belongs to the given class")

Condititonal probability vector for class 1 : 
 [0.         0.10526316 0.05263158 0.05263158 0.         0.
 0.05263158 0.05263158 0.         0.         0.         0.
 0.05263158 0.15789474 0.         0.05263158 0.10526316 0.05263158
 0.         0.05263158 0.05263158 0.         0.05263158 0.
 0.         0.05263158 0.         0.         0.         0.05263158
 0.         0.        ]
Condititonal probability vector for class 0 : 
 [0.04166667 0.04166667 0.         0.04166667 0.04166667 0.04166667
 0.         0.         0.04166667 0.04166667 0.04166667 0.04166667
 0.         0.         0.04166667 0.08333333 0.         0.
 0.04166667 0.04166667 0.         0.125      0.         0.04166667
 0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
 0.04166667 0.04166667]
Conditional probability here is : Probability of occurance of the word, if it belongs to the given class
time: 3.7 ms


When we attempt to classify a document, we multiply many probabilities together to
get the probability that the document belongs to a given class. This looks something
like $p(w_0 \mid 1)\,p(w_1 \mid 1)\,p(w_2 \mid 1) \cdots$. If any one of these factors is 0, the
whole product is 0. To lessen the impact of this, we initialize all of our occurrence
counts to 1 and the denominators to 2.

In [33]:
# Smoothed accumulators: every word starts with a count of 1 and every class
# total starts at 2, so no conditional probability can end up exactly 0.
p0_num = np.ones(len_words_vocab)
p1_num = np.ones(len_words_vocab)
p0_denom = p1_denom = 2.0

for index in range(len_train_docs):
    doc_vector = document_vectors[index]
    word_total = sum(doc_vector)  # number of vocabulary words present in this document
    if labels[index] != 1:
        # Document belongs to class 0
        p0_num += doc_vector
        p0_denom += word_total
    else:
        # Document belongs to class 1
        p1_num += doc_vector
        p1_denom += word_total

time: 1.51 ms


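Another problem is underflow: multiplying many small numbers together.
When we compute the product $p(w_0 \mid c_i)\,p(w_1 \mid c_i)\,p(w_2 \mid c_i) \cdots p(w_N \mid c_i)$ and many
of the factors are very small, the result underflows to zero and we get an incorrect answer. To avoid this we take the natural logarithm: since $\ln(a \cdot b) = \ln(a) + \ln(b)$, the product becomes a sum of log-probabilities, and comparing the sums gives the same ranking as comparing the products.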

In [34]:
# Move to log space: per-word log conditional probabilities for each class.
p1_vect, p0_vect = np.log(p1_num / p1_denom), np.log(p0_num / p0_denom)

time: 986 µs


In [36]:
# Display both log-probability vectors. Both bare expressions are rendered
# because ast_node_interactivity was set to "all" in the imports cell.
p1_vect
p0_vect

array([-3.04452244, -1.94591015, -2.35137526, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -1.65822808, -3.04452244,
       -2.35137526, -1.94591015, -2.35137526, -3.04452244, -2.35137526,
       -2.35137526, -3.04452244, -2.35137526, -3.04452244, -3.04452244,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -2.35137526,
       -3.04452244, -3.04452244])

array([-2.56494936, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936, -3.25809654, -3.25809654, -2.56494936,
       -2.15948425, -3.25809654, -3.25809654, -2.56494936, -2.56494936,
       -3.25809654, -1.87180218, -3.25809654, -2.56494936, -2.56494936,
       -3.25809654, -2.56494936, -2.56494936, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936])

time: 9.02 ms


## Classify the new vector based on the conditional probability and probability of it belonging to one class

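$$p(c_i \mid w) = \frac{p(w \mid c_i)\, p(c_i)}{p(w)}$$

The evidence $p(w)$ is the same for every class, so to pick the most likely class it is enough to compare the numerators. In log space that means comparing $\ln p(w \mid c_i) + \ln p(c_i)$ across classes.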

In [58]:
def classifier(vector, p0_vect, p1_vect, p_class1):
    """
    Classify a document vector with naive Bayes in log space.

    For each class the score is the log-likelihood of the document plus the
    log-prior of the class:
        score_c = sum(vector * log p(w | c)) + log p(c)
    The class with the larger score wins.

    Parameters
    ----------
    vector : word vector of the document (presence flags or counts), list or array.
    p0_vect : per-word log conditional probabilities for class 0.
    p1_vect : per-word log conditional probabilities for class 1.
    p_class1 : prior probability of class 1; class 0 gets ``1 - p_class1``.

    Returns
    -------
    int : 1 if the class-1 score is strictly greater than the class-0 score,
    otherwise 0 (ties go to class 0).
    """
    vector = np.asarray(vector)
    # A prior of exactly 0 gives log(0) = -inf, which is a valid score meaning
    # "this class can never win"; silence the divide warning for that case.
    with np.errstate(divide="ignore"):
        log_prior1 = np.log(p_class1)
        log_prior0 = np.log(1.0 - p_class1)
    # The log of a product is a SUM of logs, so the log-prior is added to the
    # log-likelihood. Dividing by it only gives the right answer by accident
    # when both priors are equal.
    p1 = np.sum(vector * p1_vect) + log_prior1
    p0 = np.sum(vector * p0_vect) + log_prior0
    return 1 if p1 > p0 else 0

time: 1.91 ms


In [60]:
# Example expected to be classified as not abusive (0).
document = ['love', 'my', 'dalmation']
test_vector = np.array(word2vec(document, vocab))
predicted = classifier(test_vector, p0_vect, p1_vect, p_abusive)
print(f"{document} is classified as {predicted}")

['love', 'my', 'dalmation'] is classified as 0
time: 1.48 ms


In [61]:
# Example expected to be classified as abusive (1).
document = ['stupid', 'garbage']
test_vector = np.array(word2vec(document, vocab))
predicted = classifier(test_vector, p0_vect, p1_vect, p_abusive)
print(f"{document} is classified as {predicted}")

['stupid', 'garbage'] is classified as 1
time: 1.29 ms


## Bag of words

Up until this point we’ve treated the presence or absence of a word as a feature. This
could be described as a set-of-words model. If a word appears more than once in a
document, that might convey some sort of information about the document over just
the word occurring in the document or not. This approach is known as a bag-of-words
model. A bag of words can have multiple occurrences of each word, whereas a set of
words can have only one occurrence of each word.

In [62]:
def bag_of_words_vec(document, vocab):
    """
    Encode a document as a bag-of-words (occurrence count) vector.

    Parameters
    ----------
    document : iterable of tokens to encode.
    vocab : collection of known words; positions in the result follow the
        iteration order of ``list(vocab)``.

    Returns
    -------
    list of int with one entry per vocabulary word holding the number of
    times that word occurs in ``document``. Tokens that are not in the
    vocabulary are ignored.
    """
    vocab_list = list(vocab)
    # Build the word -> position map once, rather than scanning the list with
    # .index() for every token. setdefault keeps the first position of a
    # repeated word, which is what list.index() would have returned.
    position = {}
    for idx, word in enumerate(vocab_list):
        position.setdefault(word, idx)

    word_vector = [0] * len(vocab_list)
    for token in document:
        idx = position.get(token)
        if idx is not None:
            word_vector[idx] += 1
    return word_vector

time: 987 µs


# Email Classification

## Remove punctuations and split sentences to tokens

In [64]:
def clean_string(text_str):
    """
    Tokenize raw text into lowercase word tokens.

    The text is split on runs of non-word characters (punctuation and
    whitespace); tokens of one or two characters are dropped.

    Parameters
    ----------
    text_str : str, the raw text.

    Returns
    -------
    list of str : lowercase tokens longer than two characters, in order of
    appearance.
    """
    # Use \W+ (one or more), not \W*: since Python 3.7 re.split also splits on
    # empty matches, so \W* would split between every single character and the
    # length filter below would then discard every token.
    list_tokens = re.split(r'\W+', text_str)
    return [tok.lower() for tok in list_tokens if len(tok) > 2]

time: 1.39 ms


## Read text files

In [81]:
# Each directory holds the emails of one class.
docs = [Path("../data/email/spam"), Path("../data/email/ham")]
labels = list()
doc_list, full_text = list(), list()
for path_docs in docs:
    # Derive the label from the directory's own name. Searching the whole path
    # string for "spam" would mislabel a ham file whose name (or any parent
    # directory) happens to contain that substring.
    label = 1 if path_docs.name == "spam" else 0
    # glob() yields files in filesystem-dependent order; sort so the corpus
    # order is the same on every machine and every run.
    for text_file in sorted(path_docs.glob("*.txt")):
        # errors='ignore' drops bytes that are not valid UTF-8 instead of failing
        with open(text_file, "r", encoding="utf8", errors='ignore') as f:
            text = f.read()
        clean_text = clean_string(text)
        doc_list.append(clean_text)    # one token list per email
        full_text.extend(clean_text)   # all tokens of all emails, flattened
        labels.append(label)           # 1 = spam, 0 = ham

time: 9.9 ms


## Create Vocabulary

In [86]:
# Build the vocabulary from the tokenized emails. This rebinds the `vocab`
# name that the toy example earlier in the notebook also used.
vocab = create_vocab(doc_list)

time: 998 µs


## Split data into train and test

In [89]:
# Hold out 10% of the emails for testing. The seed is fixed so that the split,
# and therefore the error rate reported at the end, is reproducible across runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    doc_list, labels, test_size=0.1, random_state=42
)

time: 1.02 ms


In [97]:
# Encode both splits as bag-of-words count vectors over the email vocabulary.
train_document_vectors, test_document_vectors = (
    [bag_of_words_vec(document, vocab) for document in split]
    for split in (features_train, features_test)
)

time: 16.5 ms


In [98]:
def train_nb(document_vectors, labels):
    """
    Train a two-class naive Bayes model from word vectors.

    Word counts start at 1 and class totals at 2 so that no conditional
    probability is exactly 0, and the probabilities are returned as natural
    logarithms.

    Parameters
    ----------
    document_vectors : sequence of equal-length word vectors (presence flags
        or counts), one per training document.
    labels : sequence of class labels aligned with ``document_vectors``;
        a label equal to 1 means class 1, anything else is class 0.

    Returns
    -------
    (p1_vect, p0_vect, p_abusive) :
        p1_vect : array of per-word log conditional probabilities for class 1.
        p0_vect : array of per-word log conditional probabilities for class 0.
        p_abusive : ``sum(labels) / number of documents`` (the class-1 prior
            for 0/1 labels).

    Raises
    ------
    ValueError : if there are no training documents, or if the number of
        labels does not match the number of documents.
    """
    len_train_docs = len(document_vectors)
    if len_train_docs == 0:
        raise ValueError("train_nb needs at least one training document")
    if len(labels) != len_train_docs:
        raise ValueError("document_vectors and labels must have the same length")

    # The feature count comes from the data itself, not from a global
    # vocabulary, so the function works on any consistently sized input.
    counts = np.asarray(document_vectors, dtype=float)  # rows: documents, columns: words
    is_class1 = np.asarray(labels) == 1
    p_abusive = sum(labels) / len_train_docs

    class1_counts = counts[is_class1]
    class0_counts = counts[~is_class1]

    # Smoothed numerators (per-word counts) and denominators (total words per class)
    p1_num = 1.0 + class1_counts.sum(axis=0)
    p0_num = 1.0 + class0_counts.sum(axis=0)
    p1_denom = 2.0 + class1_counts.sum()
    p0_denom = 2.0 + class0_counts.sum()

    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p1_vect, p0_vect, p_abusive

time: 3.56 ms


In [99]:
# Fit on the training split. train_nb returns, in this order: the class-1
# log-probability vector, the class-0 log-probability vector, and the class-1 prior.
p1_vect, p0_vect, p_spam = train_nb(train_document_vectors, labels_train)

time: 7.73 ms


## Test accuracy!

In [101]:
# Count the held-out emails whose predicted label differs from the true label.
n_test = len(test_document_vectors)
error_count = sum(
    classifier(doc_vector, p0_vect, p1_vect, p_spam) != true_label
    for doc_vector, true_label in zip(test_document_vectors, labels_test)
)
# The reported value is a fraction of the test set (a rate, not a count).
# Guard against an empty test split rather than dividing by zero.
error_rate = error_count / n_test if n_test else float("nan")
print(f"Error rate is {error_rate}")

Error count is 0.0
time: 4.06 ms


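On this run the classifier makes no errors on the held-out emails. Because the dataset is small, the test set contains only a few emails, so this 0% error rate is a noisy estimate and may differ between random splits.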