# Naive Bayes Worked Example
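From Jurafsky & Martin, *Speech and Language Processing*, 3rd edition (draft) -- the worked example from the chapter on Naive Bayes and sentiment classification.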

In [8]:
from collections import Counter
import itertools

Sentiment classification -- positive or negative.

 We've got a set of annotated training data.

In [3]:
# Toy sentiment corpus: a list of (label, document) pairs, grouped here by
# label. Order is negatives first, then positives.
training_data = [
    *(("negative", text) for text in (
        "just plain boring",
        "entirely predictable and lacks energy",
        "no surprises and very few laughs",
    )),
    *(("positive", text) for text in (
        "very powerful",
        "the most fun film of the summer",
    )),
]

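Log prior probability of each class is $\log\hat{P}(c)=\log\frac{N_c}{N_{doc}}$, where $N_c$ is the number of training documents labelled $c$ and $N_{doc}$ is the total number of training documents. The cell below computes $\hat{P}(c)$ itself (without taking the log).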

In [4]:
# Distinct class labels. Kept as a set because later cells iterate over it.
classes = set(c for (c, d) in training_data)

# Count documents per class in a single pass over the corpus. Counter keeps
# labels in first-seen order, so the displayed dict looks the same on every
# run (iterating the `classes` set instead would depend on string-hash
# randomisation and could flip the order between kernels).
n_doc = len(training_data)
docs_per_class = Counter(c for (c, d) in training_data)

# Maximum-likelihood prior P(c) = N_c / N_doc (plain probability, not its log).
p_prior = {current_class: n_c / n_doc for (current_class, n_c) in docs_per_class.items()}
p_prior

{'negative': 0.6, 'positive': 0.4}

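Likelihood of each word given the class, with add-one (Laplace) smoothing over the vocabulary $V$ of all training documents:

$\log\hat{P}(w_i|c) = \log\frac{ count(w_i, c) + 1 }{ \sum_{w \in V}\left(count(w, c) + 1\right) } = \log\frac{ count(w_i, c) + 1 }{ \left(\sum_{w \in V} count(w, c)\right) + |V| }$

The cell below stores the smoothed probability $\hat{P}(w_i|c)$ itself (without taking the log).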

In [5]:
# Concatenate all training documents into one "big document", plus one such
# concatenation per class.
big_doc = " ".join(d for (c, d) in training_data)

big_doc_by_class = dict()
for current_class in classes:
    big_doc_by_class[current_class] = " ".join(d for (c, d) in training_data if c == current_class)

# str.split() with no argument splits on any run of whitespace and never
# yields empty tokens, so repeated/leading/trailing spaces (or a class with
# no text) cannot add a bogus "" word to the vocabulary.
bag_of_words = Counter(big_doc.split())
bag_of_words_by_class = {c: Counter(d.split()) for (c, d) in big_doc_by_class.items()}

# likelihood[c][w] = (count(w, c) + 1) / sum over the vocabulary of (count(w', c) + 1)
likelihood = dict()
for current_class in classes:
    class_counts = bag_of_words_by_class[current_class]

    # Add-one smoothing over the whole vocabulary. A Counter returns 0 for a
    # word it has never seen, so unseen words get a smoothed count of 1.
    laplace_word_counts = {word: class_counts[word] + 1 for word in bag_of_words}

    laplace_word_sum = sum(laplace_word_counts.values())
    likelihood[current_class] = {word: count / laplace_word_sum for (word, count) in laplace_word_counts.items()}

# Binary Naive Bayes

*Within* each document, remove all duplicate words before concatenating into big_doc

In [12]:
# Binary naive Bayes: within each document every word counts at most once.
# dict.fromkeys() removes duplicates while keeping first-occurrence order, so
# the token lists are identical on every run (list(set(...)) would order them
# by string hash, which changes between kernels).
training_data_binary = [(c, list(dict.fromkeys(d.split()))) for (c, d) in training_data]

# Per class, count in how many of that class's documents each word appears.
binary_bag_of_words_by_class = dict()
for current_class in classes:
    binary_bag_of_words_by_class[current_class] = Counter(
        itertools.chain.from_iterable(d for (c, d) in training_data_binary if c == current_class)
    )
