In [2]:
%%javascript
// Fetch and run a remote script at cell execution time. Judging by its file
// name it presumably builds the table of contents for this notebook -- TODO confirm.
// NOTE(review): this executes third-party code from the network on every run.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

# Naive Bayes Exercises

<h2 id="tocheading">Table of Contents</h2>
<div id="toc"></div>

## Function definitions

We'll define some useful routines for generating nonsense sentences of words. These routines will be used to generate a corpus of documents to use with Naive Bayes classifiers. 

* `distribution`: takes a list of words and weights and generates a distribution based on the weights (this just scales the total weight to 1). If no weights are provided, a random distribution is created. 
* `sentence`: takes the word vocabulary, a distribution, and a sentence size, and returns a sentence -- a string of tokens where each token is selected based on the provided distribution.  
* `sentences`: generates a collection of random sentences based on the provided words, distribution, and sentence size. 
* `random_choice`: takes a list of words and a distribution and returns a single word based on the distribution.   

In [3]:
import numpy as np
import pandas as pd

def distribution(words, probs=None):
    """Return a probability distribution over ``words``.

    This is used to generate sentences for different classes.

    Parameters
    ----------
    words : sequence
        The vocabulary. Only its length is used here.
    probs : sequence of numbers, optional
        Non-negative relative weights, one per word. When omitted, random
        weights are drawn with ``np.random.random``.

    Returns
    -------
    numpy.ndarray
        The weights rescaled so that they sum to 1.

    Raises
    ------
    ValueError
        If ``probs`` does not hold exactly one weight per word, contains a
        negative weight, or does not have a positive, finite sum.
    """
    if probs is None:
        probs = np.random.random(len(words))
    else:
        probs = np.asarray(probs, dtype=float)
        # Fail here with a clear message rather than later inside
        # np.random.choice, which rejects mismatched or negative `p`.
        if probs.shape != (len(words),):
            raise ValueError(
                f'expected {len(words)} weights (one per word), got shape {probs.shape}'
            )
        if (probs < 0).any():
            raise ValueError('weights must be non-negative')

    total = probs.sum()
    # Dividing by a zero (or non-finite) total would silently produce NaNs.
    # An empty vocabulary is passed through unchanged, as before.
    if probs.size and not (np.isfinite(total) and total > 0):
        raise ValueError('weights must have a positive, finite sum')
    return probs / total
    
def sentence(words, dist, size):
    """Build a random sentence made of ``size`` space-separated tokens.

    Each token is picked from ``words`` by ``random_choice`` using the
    probabilities in ``dist``.
    """
    tokens = []
    for _ in range(size):
        tokens.append(random_choice(words, dist))
    return ' '.join(tokens)

def sentences(words, dist, size, howmany, class_label):
    """Generate ``howmany`` random sentences tagged with ``class_label``.

    Returns a DataFrame whose ``X`` column holds the sentence strings (each
    built by ``sentence`` with ``size`` tokens) and whose ``target`` column
    repeats ``class_label`` on every row.
    """
    rows = []
    for _ in range(howmany):
        rows.append(sentence(words, dist, size))
    frame = pd.DataFrame(rows, columns=['X'])
    frame['target'] = class_label
    return frame

def random_choice(words, dist):
    """Draw a single word from ``words`` at random.

    ``dist`` gives the selection probability of each word and is handed to
    ``np.random.choice`` as its ``p`` argument.
    """
    picked = np.random.choice(words, p=dist)
    return picked

### Generating conditional probabilities for classes

Now that we have the definitions, we'll generate probabilities for a small vocabulary and two classes. 

In [4]:
# Vocabulary shared by both classes. Change this to suit your needs.
vocabulary = ['one', 'two', 'red', 'blue', 'fish', 'bird']

# Relative word weights for each class; each list needs exactly one entry
# per vocabulary word.
weights1 = [5, 5, 1, 1, 1, 1]
weights2 = [1, 1, 1, 1, 5, 5]

# Rescale the weights into per-class word probabilities.
d1 = distribution(vocabulary, probs=weights1)
d2 = distribution(vocabulary, probs=weights2)

# Print one tab-indented "word<TAB>probability" line per word, class by class.
print("Word Probability, by Class")
for heading, class_dist in (("Class 1", d1), ("Class 2", d2)):
    print(heading)
    for word, prob in zip(vocabulary, class_dist):
        print(f'\t{word}\t{prob:.4f}')

Word Probability, by Class
Class 1
	one	0.3571
	two	0.3571
	red	0.0714
	blue	0.0714
	fish	0.0714
	bird	0.0714
Class 2
	one	0.0714
	two	0.0714
	red	0.0714
	blue	0.0714
	fish	0.3571
	bird	0.3571


### Generating a corpus

We'll take the probabilities and generate a set of sentences (a corpus), printing the corpus out afterwards.

In [6]:
# number of sentences of each class to generate
class_1_size = 5
class_2_size = 5

# number of tokens in each sentence
sentence_size = 5

# create dataframes of sentences for each class (labelled 1 and 2)
df1 = sentences(vocabulary, d1, sentence_size, class_1_size, 1)
df2 = sentences(vocabulary, d2, sentence_size, class_2_size, 2)

# Join them into a single dataframe (the corpus). DataFrame.append was
# removed in pandas 2.0, so use pd.concat; ignore_index=True renumbers the
# rows 0..n-1 exactly as the old append call did.
corpus = pd.concat([df1, df2], ignore_index=True)

# assign the targets column to a variable.
targets = corpus.target

print(corpus)

                          X  target
0      two two two blue red       1
1      blue one two one two       1
2     two bird two blue two       1
3     one one blue red fish       1
4     red blue fish two two       1
5    red two bird bird bird       2
6    red fish fish blue red       2
7  bird fish fish bird bird       2
8   red bird fish fish bird       2
9    fish fish bird two two       2


### Vectorizing the Corpus (counting words)

To run a Naive Bayes classifier, we need to convert each string in the corpus into a vector of word counts. We use `CountVectorizer` to do this.  

Afterwards, we print out the new form of the corpus.

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and count each word per sentence.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus.X)

# Column labels for the count matrix. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() replaces it (available since 1.0).
# Fall back to the old name only on versions that predate the new one.
try:
    names = count_vect.get_feature_names_out()
except AttributeError:
    names = count_vect.get_feature_names()

# fit_transform returns a sparse matrix; densify it so it can be shown as a table.
arr = X_train_counts.toarray()
counts_df = pd.DataFrame(arr, columns=names)

print('WORD COUNTS')
print(f'{"-"*20}')
print(counts_df)

WORD COUNTS
--------------------
   bird  blue  fish  one  red  two
0     0     1     0    0    1    3
1     0     1     0    2    0    2
2     1     1     0    0    0    3
3     0     1     1    2    1    0
4     0     1     1    0    1    2
5     3     0     0    0    1    1
6     0     1     2    0    2    0
7     3     0     2    0    0    0
8     2     0     2    0    1    0
9     1     0     2    0    0    2


### Splitting the corpus by class

We do this so that we can easily compute counts and other statistics for each class.

In [26]:
# Boolean masks on the targets select the count rows belonging to each class.
class_1_counts = counts_df.loc[targets == 1]
class_2_counts = counts_df.loc[targets == 2]

print('WORD COUNTS')

print(f'{"-"*15}Class 1{"-"*15}')
print(class_1_counts)

print(f'{"-"*15}Class 2{"-"*15}')
print(class_2_counts)

WORD COUNTS
---------------Class 1---------------
   bird  blue  fish  one  red  two
0     0     0     0    1    0    4
1     1     1     0    1    0    2
2     0     1     0    2    0    2
3     0     2     1    2    0    0
4     0     2     0    0    0    3
---------------Class 2---------------
   bird  blue  fish  one  red  two
5     2     0     1    0    1    1
6     3     1     1    0    0    0
7     4     0     0    0    1    0
8     3     1     1    0    0    0
9     2     1     0    2    0    0


### Computing conditional probabilities

Though we already used probabilities that we defined ourselves to generate the corpus, we now use the actual word counts for the corpus to compute conditional probabilities empirically. This is because in the real world, the corpus would not be artificially generated. 

First we count tokens of each word for each class, taking **add-1** smoothing into consideration.  

### Counting the tokens for each class

In [27]:
# Total number of tokens across all class 1 sentences (before smoothing).
total_tokens_class_1 = class_1_counts.to_numpy().sum()

# Per-word totals: summing down the rows leaves one count per vocabulary word.
class_1_word_counts = pd.DataFrame(class_1_counts.sum())

print(f'Class 1 word counts, no +1 smoothing. Total count is {total_tokens_class_1}')
print(class_1_word_counts.T)

# Add-1 smoothing: bump every word total by one, then recount the tokens so
# that the total (used later as the denominator) includes the added counts.
class_1_word_counts_smoothed = class_1_word_counts.add(1)
total_tokens_class_1 = class_1_word_counts_smoothed.to_numpy().sum()

print(f'\nClass 1 word counts, with +1 smoothing. Total count is {total_tokens_class_1}')
print(class_1_word_counts_smoothed.T)

Class 1 word counts, no +1 smoothing. Total count is 25
   bird  blue  fish  one  red  two
0     1     6     1    6    0   11

Class 1 word counts, with +1 smoothing. Total count is 31
   bird  blue  fish  one  red  two
0     2     7     2    7    1   12


We do the same thing for class 2. 

In [28]:
# Total number of tokens across all class 2 sentences (before smoothing).
total_tokens_class_2 = class_2_counts.to_numpy().sum()

# Per-word totals: summing down the rows leaves one count per vocabulary word.
class_2_word_counts = pd.DataFrame(class_2_counts.sum())

print(f'Class 2 word counts, no +1 smoothing. Total count is {total_tokens_class_2}')
print(class_2_word_counts.T)

# Add-1 smoothing: bump every word total by one, then recount the tokens so
# that the total (used later as the denominator) includes the added counts.
class_2_word_counts_smoothed = class_2_word_counts.add(1)
total_tokens_class_2 = class_2_word_counts_smoothed.to_numpy().sum()

print(f'\nClass 2 word counts, with +1 smoothing. Total count is {total_tokens_class_2}')
print(class_2_word_counts_smoothed.T)

Class 2 word counts, no +1 smoothing. Total count is 25
   bird  blue  fish  one  red  two
0    14     3     3    2    2    1

Class 2 word counts, with +1 smoothing. Total count is 31
   bird  blue  fish  one  red  two
0    15     4     4    3    3    2


### Computing the probabilities

We then use the counts to compute conditional probabilities. 

In [29]:
# p(word | class 1): each smoothed word count divided by the smoothed token total.
cond_prob_class_1 = class_1_word_counts_smoothed.div(total_tokens_class_1)

print('Conditional Probabilities, Class 1')
print(cond_prob_class_1.transpose())

Conditional Probabilities, Class 1
       bird      blue      fish       one       red       two
0  0.064516  0.225806  0.064516  0.225806  0.032258  0.387097


In [30]:
# p(word | class 2): each smoothed word count divided by the smoothed token total.
cond_prob_class_2 = class_2_word_counts_smoothed.div(total_tokens_class_2)

print('Conditional Probabilities, Class 2')
print(cond_prob_class_2.transpose())

Conditional Probabilities, Class 2
       bird      blue      fish       one       red       two
0  0.483871  0.129032  0.129032  0.096774  0.096774  0.064516


We also compute the class probabilities.

In [31]:
# Number of documents in each class = number of rows in its count table.
class_1_size = class_1_counts.shape[0]
class_2_size = class_2_counts.shape[0]

# Class priors: each class's share of all documents in the corpus.
class_1_prob = class_1_size / (class_1_size + class_2_size)
class_2_prob = class_2_size / (class_1_size + class_2_size)

print(f'Probability, Class 1:\t{class_1_prob}')
print(f'Probability, Class 2:\t{class_2_prob}')

Probability, Class 1:	0.5
Probability, Class 2:	0.5


### Classifying a new example

At this point, we have everything we need to classify a new instance. Suppose the instance is the one below.  As with the original data, we must vectorize it. 

In [32]:
# The new document(s) to classify; each string in the list is one document.
new_data = ['bird']

# Transform with the vectorizer that was fitted on the corpus, then lay the
# counts out under the same word columns as the training table.
X_new_counts = count_vect.transform(new_data)
arr = X_new_counts.toarray()
new_counts_df = pd.DataFrame(data=arr, columns=names)

Afterwards, we can compute the probabilities needed for classification. 

The below code will compute the following products for Class 1 and Class 2. $word_i$ refers to a particular word, and $count_i$ indicates the number of times it occurs in the sentences we are attempting to classify. $class_j$ indicates the class. 

$p(word_1|class_j)^{count_1}\times \ldots \times p(word_k|class_j)^{count_k}\times p(class_j)$


In [33]:
# Raise each word's conditional probability to that word's count in the new
# document (words with count 0 contribute a factor of 1), multiply the
# factors together, and scale by the class prior.
# np.product was removed in NumPy 2.0; np.prod is the supported name.
# float() keeps the scores as plain Python floats, so lists of them print as
# bare numbers on every NumPy version.
class_1_probs = np.power(cond_prob_class_1.T, new_counts_df)
result_1 = float(np.prod(class_1_probs.values) * class_1_prob)
class_2_probs = np.power(cond_prob_class_2.T, new_counts_df)
result_2 = float(np.prod(class_2_probs.values) * class_2_prob)

print(f'Class 1 Result:\t{result_1}')
print(f'Class 2 Result:\t{result_2}')

Class 1 Result:	0.03225806451612903
Class 2 Result:	0.24193548387096775


Depending on which value is larger, we classify the new instance as belonging to the corresponding class. We can also combine the two results and compute the probabilities. 

In [34]:
# Normalize the two unnormalized class scores so that they sum to 1.
print('Probabilities')
print([score/(result_1+result_2) for score in (result_1, result_2)])

Probabilities
[0.11764705882352942, 0.8823529411764707]


## Comparing to scikit-learn

Once we have the vectorized input (using the count of each word, each document can be viewed as a vector), we can create a `scikit-learn` classifier for it. We'll use `MultinomialNB`, one of scikit-learn's defined Naive Bayes classifiers. It's appropriate for working with text. 

After we create the classifier, we first fit it using the corpus, and then invoke `predict` to classify the new instance. 

At the end, we also print out the probabilities, which should match our hand-rolled calculations. 

In [35]:
from sklearn.naive_bayes import MultinomialNB

# Create the classifier and train it on the vectorized corpus in one step
# (fit returns the estimator itself).
clf = MultinomialNB().fit(X_train_counts, targets)

# Vectorize the new instance and classify it.
X_new_counts = count_vect.transform(new_data)
predicted = clf.predict(X_new_counts)

# Pair each input document with its predicted class label.
print('Predictions')
for doc, category in zip(new_data, predicted):
    print(f'{doc} => {category}')

# One row per document, one column per class.
print('\nProbabilities')
print(clf.predict_proba(X_new_counts))


Predictions
bird => 2

Probabilities
[[0.11764706 0.88235294]]


## Using logs

Recall that computing the product of many values close to 0 can result in floating-point underflow. To get around this, it is common practice to instead compute the sum of the logarithms of the probabilities. This is shown below. At the end, we again compute the probabilities for each class. 

In [36]:
# Work in log space (base 2): the product of probabilities becomes a sum of
# log probabilities, each weighted by the word's count in the new document,
# plus the log of the class prior.
#
# The sum is taken over the underlying array so it yields a scalar directly.
# np.sum on a DataFrame historically reduced per column and needed a [0]
# lookup afterwards; newer pandas deprecates that behaviour in favour of
# returning a scalar, which would break the lookup.
log_cond_prob_class_1 = np.log2(cond_prob_class_1)
weighted_logs_1 = (log_cond_prob_class_1*new_counts_df.values.T).values
result_1l = float(weighted_logs_1.sum() + np.log2(class_1_prob))

log_cond_prob_class_2 = np.log2(cond_prob_class_2)
weighted_logs_2 = (log_cond_prob_class_2*new_counts_df.values.T).values
result_2l = float(weighted_logs_2.sum() + np.log2(class_2_prob))

print(f'Log probabilities Class 1\n{log_cond_prob_class_1.T}')
print(f'Log probabilities Class 2\n{log_cond_prob_class_2.T}\n')
print(f'Result, Class 1: {result_1l}')
print(f'Result, Class 2: {result_2l}')

# Undo the logarithm and normalize so the two class probabilities sum to 1.
# float() keeps the printed list as plain numbers on every NumPy version.
r1 = float(np.power(2, result_1l))
r2 = float(np.power(2, result_2l))
total = r1+r2
print([r1/total, r2/total])

Log probabilities Class 1
       bird      blue      fish       one       red       two
0 -3.954196 -2.146841 -3.954196 -2.146841 -4.954196 -1.369234
Log probabilities Class 2
       bird      blue      fish       one       red       two
0 -1.047306 -2.954196 -2.954196 -3.369234 -3.369234 -3.954196

Result, Class 1: -4.954196310386875
Result, Class 2: -2.0473057147783567
[0.11764705882352942, 0.8823529411764705]
