In [1]:
# Clear the interactive namespace so the notebook starts from a clean state.
# NOTE(review): per the IPython docs, -f skips the confirmation prompt and -s is a "soft" reset — confirm.
reset -fs

<center><h2>Implement Naive Bayes From Scratch</h2></center>

<center><h2>Bayes' Theorem</h2></center>


<br>
<center><img src="images/bayes_rule.png" width="75%"/></center>

<center><h2>Steps for Training Naive Bayes</h2></center>

1. Acquire labeled data
1. Preprocess data
1. Calculate document class priors
1. Calculate word by class conditional probabilities
1. Calculate the proportional probabilities for each class of new document
1. Pick the winning class

Acquire data & preprocess
-----

In [1]:
# One class label per document, positionally aligned with `corpus` below.
labels = ["cat", "dog", "cat", "cat", "dog"]

# Toy corpus: each document is a whitespace-separated string of emoji tokens.
corpus = [
    "🐈 🐯 🐱 🐩 🐱",
    "🐶 🐶 🐈 🐶 🐩 🐈 🐶 🐶",
    "🐈 🐈 🐯 🐶 🐈",
    "🐈 🐈 🐈",
    "🐶 🐶 🐯 🐈 🐩 🐱 🐩 🐶 🐩 🐶 ",
]

In [5]:
from collections import namedtuple

# Pair each label with its document. The pairs are materialized in a list because a
# bare zip() is a one-shot iterator: it would be empty after the first loop over it.
data = list(zip(labels, corpus))

# Preprocess: whitespace-tokenize every document into a labeled record.
# Later cells iterate over `train` (reading `.label` and `.tokens`) and construct
# unlabeled `LabeledTextData(id_num=..., label=None, tokens=...)` test documents.
LabeledTextData = namedtuple("LabeledTextData", ["id_num", "label", "tokens"])
train = [
    LabeledTextData(id_num=id_num, label=doc_label, tokens=text.split())
    for id_num, (doc_label, text) in enumerate(data)
]

In [6]:
# Show every document next to its label to sanity-check the pairing.
for pair in data:
    label, item = pair
    print(f"{label}: {item}")

cat: 🐈 🐯 🐱 🐩 🐱
dog: 🐶 🐶 🐈 🐶 🐩 🐈 🐶 🐶
cat: 🐈 🐈 🐯 🐶 🐈
cat: 🐈 🐈 🐈
dog: 🐶 🐶 🐯 🐈 🐩 🐱 🐩 🐶 🐩 🐶 


<center><h2>Calculate document class priors</h2></center>

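$$P(c) = \frac{N_c}{N}$$

where $N_c$ is the number of training documents labeled $c$ and $N$ is the total number of training documents.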

In [3]:
# What labels are we dealing with?
# `labels` holds one entry per document, so collapse it to the distinct classes.
set(labels)

{'cat', 'dog'}

In [4]:
# How many documents are we dealing with?
# This is N, the denominator of the class prior P(c) = N_c / N.
n_docs = len(corpus)
n_docs

5

In [9]:
from collections import Counter, defaultdict

In [None]:
# TODO: revise here — add something like this (additive smoothing of the
# word-given-class probabilities).
#
# The sketch below comes from a spam/ham example and is kept only as a reference.
# It stays commented out because `vocabulary`, `spam_messages`, `ham_messages`,
# `alpha`, `n_spam`, `n_ham` and `n_vocabulary` are not defined in this notebook.
#
# # Initiate parameters
# parameters_spam = {unique_word:0 for unique_word in vocabulary}
# parameters_ham = {unique_word:0 for unique_word in vocabulary}
#
# # Calculate parameters
# for word in vocabulary:
#     n_word_given_spam = spam_messages[word].sum()   # spam_messages already defined in a cell above
#     p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
#     parameters_spam[word] = p_word_given_spam
#
#     n_word_given_ham = ham_messages[word].sum()   # ham_messages already defined in a cell above
#     p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
#     parameters_ham[word] = p_word_given_ham

In [10]:
# For each label, find the probability of baseline occurrence: P(c) = N_c / N
# Tally the training documents per label in a single pass over `train`.
docs_per_label = Counter(doc.label for doc in train)

doc_priors = defaultdict(float)

# `labels` holds one entry per document, so de-duplicate it (dict.fromkeys keeps
# first-seen order) instead of recomputing the same prior once per document.
for label in dict.fromkeys(labels):
    # A Counter returns 0 for a label that never occurs in `train`.
    doc_priors[label] = docs_per_label[label] / n_docs

print(*doc_priors.items(), sep='\n')

('cat', 0.6)
('dog', 0.4)


Calculate conditional probabilities of each word for each class
-----

In [11]:
# Get all tokens, aka the vocabulary
# (flattened across every training document, in order, with repeats kept)
vocab = [token for doc in train for token in doc.tokens]

print("Vocab:", vocab)

Vocab: ['🐈', '🐯', '🐱', '🐩', '🐱', '🐶', '🐶', '🐈', '🐶', '🐩', '🐈', '🐶', '🐶', '🐈', '🐈', '🐯', '🐶', '🐈', '🐈', '🐈', '🐈', '🐶', '🐶', '🐯', '🐈', '🐩', '🐱', '🐩', '🐶', '🐩', '🐶']


In [12]:
# Unique tokens
# `vocab` still contains repeats, so collapse it to the distinct token types.
set(vocab)

{'🐈', '🐩', '🐯', '🐱', '🐶'}

In [13]:
# Number of unique tokens, aka cardinality
# `v` is the size of the de-duplicated vocabulary.
v = len(set(vocab))
print("Cardinality of vocab:", v)

Cardinality of vocab: 5


In [14]:
# A default dict of default dicts; inner default dict is probability
# Layout: cond_prob[label][token] = P(token | label)
cond_prob = defaultdict(lambda: defaultdict(float))

# `labels` has one entry per document and `vocab` one entry per token occurrence;
# de-duplicate both (dict.fromkeys keeps first-seen order) so every
# (label, token) pair is computed exactly once.
for label in dict.fromkeys(labels):

    # For a given label, count all the tokens of all the docs in one pass
    token_counts = Counter(
        token
        for doc in train
        if doc.label == label
        for token in doc.tokens
    )
    n_label_tokens = sum(token_counts.values())

    for token in dict.fromkeys(vocab):
        # Find conditional probability: token count / total count
        # NOTE: no smoothing is applied, so a token that never occurs with this
        # label gets probability 0.
        cond_prob[label][token] = token_counts[token] / n_label_tokens

cond_prob

defaultdict(<function __main__.<lambda>()>,
            {'cat': defaultdict(float,
                         {'🐈': 0.5384615384615384,
                          '🐯': 0.15384615384615385,
                          '🐱': 0.15384615384615385,
                          '🐩': 0.07692307692307693,
                          '🐶': 0.07692307692307693}),
             'dog': defaultdict(float,
                         {'🐈': 0.16666666666666666,
                          '🐯': 0.05555555555555555,
                          '🐱': 0.05555555555555555,
                          '🐩': 0.2222222222222222,
                          '🐶': 0.5})})

In [24]:
# Test that each label's conditional distribution is a probability mass function (pmf).
# A pmf sums to 1
from math import isclose

# `labels` has one entry per document; check each distinct class only once.
for label in dict.fromkeys(labels):
    total = sum(cond_prob[label].values())
    # isclose() tolerates floating-point rounding in the summed probabilities.
    assert isclose(total, 1), f"P(token | {label!r}) sums to {total}, expected 1"

<center><h2>Steps for Predicting with Naive Bayes</h2></center>

1. Acquire and process data
1. Calculate the proportional probabilities for each class of new document
1. Pick the winning class

Given a new document without a label, calculate the proportional probabilities for each class
-------

$$ P(c \mid X) \propto P(c) \cdot \prod_{i=1}^n P(x_i \mid c)$$

In [16]:
import operator
from functools import reduce

def product(iterable):
    """Return the product of all items in `iterable`; an empty iterable gives 1."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result

In [17]:
# Unlabeled document to classify; swap in one of the commented alternatives to try other inputs.
test = LabeledTextData(id_num=90, label=None, tokens="🐱".split())
# test = LabeledTextData(id_num=91, label=None, tokens="🐶 🐶".split())
# test = LabeledTextData(id_num=92, label=None, tokens="🐶 🐱".split())
# test = LabeledTextData(id_num=93, label=None, tokens="🐈 🐈 🐶 🐶 🐩 🐱 🐱".split())
# test = LabeledTextData(id_num=94, label=None, tokens="🐬 ".split()) # Out of sample prediction

prob_predicted = defaultdict(float)
# `labels` has one entry per document; score each distinct class once
# (dict.fromkeys de-duplicates while keeping first-seen order).
for label in dict.fromkeys(labels):
    token_probs = cond_prob[label]
    # For each label, calculate the proportional probability based on the prior and the tokens that appear.
    # .get() is used instead of indexing so that an out-of-vocabulary token does not
    # insert a new 0.0 entry into the trained `cond_prob` table; such a token still
    # contributes 0.0 and therefore zeroes the whole product.
    prob_predicted[label] = doc_priors[label] * product(token_probs.get(t, 0.0) for t in test.tokens)

print(*dict(prob_predicted).items(), sep='\n')

('cat', 0.09230769230769231)
('dog', 0.022222222222222223)


# Pick the winning class

In [18]:
from operator import itemgetter

In [20]:
# Take the (label, probability) pair with the highest proportional probability.
# max() returns the first maximal item, so an exact tie resolves to the earliest label.
label, prob = max(prob_predicted.items(), key=itemgetter(1))
# Report the winner.
print("The predicted class is:", label)

The predicted class is: cat


In [None]:
# Handle ties and fall back to document priors if winning probability is zero

label, prob = max(prob_predicted.items(), key=itemgetter(1))
if prob > 0:
    # Every class whose score equals the best score is reported, separated by spaces.
    tied_labels = [name for name, score in prob_predicted.items() if score == prob]
    print("The predicted class is: ", end="")
    print(*tied_labels)
else:
    # No class scored above zero: predict the class with the largest prior instead.
    label, prob = max(doc_priors.items(), key=itemgetter(1))
    print("The predicted class is:", label)

<br>
<br> 
<br>

----

<center><h2>Bonus Material</h2></center>

- Other implementations by Brian Spiering
    - [Using NormalDist from statistics module](https://github.com/brianspiering/naive_bayes_classifer_in_python_3_8)
    - [Naive Bayes for Text Classification](https://github.com/brianspiering/bayesian-text)
    
- [Implementation by mircealex](https://github.com/mircealex)