### Bayes Theorem
## P(A|B) = P(B|A) X P(A) / P(B)

# If we know the prior distribution and the likelihood (as observed from the data), we can normalize the product of these two to yield a posterior estimate of A

### However, this only works if we assume that A_i is iid

In [11]:
from nltk.corpus import movie_reviews

In [150]:
from collections import defaultdict
class NB:
    def __init__(self, classes:[str]):
        # A d[word_k] = (neg_count, pos_count)
        # This dictionary will store and update the priors and likelihoods
        
        # Notice we initialize with a small number greater than 0. We will do this for all
        # counts, to avoid unencountered classes and words
        self.classes = classes
        self.prior_counts = dict(zip(classes, [0.1]*len(classes)))
        self.ll_counts = {}
        
    def get_prior(self, label) -> float:
        '''
        get_prior will produce the current prior probability of a given label
        '''
        return self.prior_counts[label] / sum(self.prior_counts.values())
        
    def get_individual_likelihood(self, feature, label) -> float:
        # if the given feature has not yet been seen, update the dictionary to include it
        if feature not in self.ll_counts.keys():
            self.ll_counts[feature] = dict(zip(self.classes, [0.1]*len(self.classes)))
            self.ll_counts[feature][label] +=1
        return self.ll_counts[feature][label]/sum(self.ll_counts[feature].values())
        
    def get_likelihood(self, label, features):
        '''
        the total likelihood is the product of all features given a label
        '''
        ll = 1
        for f in features:
            ll *=self.get_individual_likelihood(f.lower(), label)
        return ll
        
    def get_posterior(self, label, features):
        posterior = self.get_prior(label) * self.get_likelihood(label, features)
        total = sum([self.get_prior(l) * self.get_likelihood(l, features) for l in self.classes])
        return posterior / total
        
    def fit(self, features, label):
        ''' 
        fit(features, label) will update the model using a streaming line of features (i.e. words) and label
        '''
        # Update the prior counts
        self.prior_counts[label] += 1
        for f in features:
            # Add some feature processing here
            f = f.lower()
            if f not in self.ll_counts.keys():
                self.ll_counts[f] = dict(zip(self.classes, [0.1]*len(self.classes)))
            self.ll_counts[f][label] +=1
        
        

In [151]:
model = NB(['pos', 'neg'])

In [152]:
model.fit(['yum', 'delicious'], 'pos')

In [153]:
model.fit(['is', 'not', 'great'], 'neg')

In [154]:
model.ll_counts

{'yum': {'pos': 1.1, 'neg': 0.1},
 'delicious': {'pos': 1.1, 'neg': 0.1},
 'is': {'pos': 0.1, 'neg': 1.1},
 'not': {'pos': 0.1, 'neg': 1.1},
 'great': {'pos': 0.1, 'neg': 1.1}}

In [155]:
model.get_posterior('pos', ['food', 'is', 'delicious'])

0.9166666666666666

In [156]:
movie_reviews.fileids()[0].replace('.txt', '')

'neg/cv000_29416'

In [157]:
model = NB(['pos', 'neg'])

In [180]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [181]:
nb = Pipeline([
    ('EncodeText', CountVectorizer()),
    ('NaiveBayes', MultinomialNB())
])

In [182]:
CountVectorizer().fit_transform('this is me'.split(' '))

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [186]:
import numpy as np

In [191]:
CountVectorizer().fit_transform(['this', 'is', 'me','boo', 'you'])

<5x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [188]:
nb.fit(np.array([['this', 'is', 'me'], ['boo', 'you']]), ['pos', 'neg'])

  nb.fit(np.array([['this', 'is', 'me'], ['boo', 'you']]), ['pos', 'neg'])


AttributeError: 'list' object has no attribute 'lower'

In [171]:
nb = BernoulliNB(alpha = 0.1, )

In [158]:
i = 0
for f in movie_reviews.fileids():
    if i%10 == 0:
        print(model.prior_counts)

    label, fileid = f.replace('.txt', '').split('/')
    model.fit(movie_reviews.words(fileids = f), label)
    nb.fit()
    i+=1
    

{'pos': 0.1, 'neg': 0.1}
{'pos': 0.1, 'neg': 10.1}
{'pos': 0.1, 'neg': 20.1}
{'pos': 0.1, 'neg': 30.1}
{'pos': 0.1, 'neg': 40.1}
{'pos': 0.1, 'neg': 50.1}
{'pos': 0.1, 'neg': 60.1}
{'pos': 0.1, 'neg': 70.1}
{'pos': 0.1, 'neg': 80.1}
{'pos': 0.1, 'neg': 90.1}
{'pos': 0.1, 'neg': 100.1}
{'pos': 0.1, 'neg': 110.1}
{'pos': 0.1, 'neg': 120.1}
{'pos': 0.1, 'neg': 130.1}
{'pos': 0.1, 'neg': 140.1}
{'pos': 0.1, 'neg': 150.1}
{'pos': 0.1, 'neg': 160.1}
{'pos': 0.1, 'neg': 170.1}
{'pos': 0.1, 'neg': 180.1}
{'pos': 0.1, 'neg': 190.1}
{'pos': 0.1, 'neg': 200.1}
{'pos': 0.1, 'neg': 210.1}
{'pos': 0.1, 'neg': 220.1}
{'pos': 0.1, 'neg': 230.1}
{'pos': 0.1, 'neg': 240.1}
{'pos': 0.1, 'neg': 250.1}
{'pos': 0.1, 'neg': 260.1}
{'pos': 0.1, 'neg': 270.1}
{'pos': 0.1, 'neg': 280.1}
{'pos': 0.1, 'neg': 290.1}
{'pos': 0.1, 'neg': 300.1}
{'pos': 0.1, 'neg': 310.1}
{'pos': 0.1, 'neg': 320.1}
{'pos': 0.1, 'neg': 330.1}
{'pos': 0.1, 'neg': 340.1}
{'pos': 0.1, 'neg': 350.1}
{'pos': 0.1, 'neg': 360.1}
{'pos': 0.1,

In [169]:
model.get_posterior('pos', 'wild mundane'.split(' '))

0.6417810975303053

In [160]:
model.get_prior('pos')

0.5

In [161]:
model.ll_counts['terrible']

{'pos': 28.1, 'neg': 115.1}

In [25]:
import nltk

In [28]:
reader = movie_reviews.words(a)

In [33]:
w = reader

In [37]:
w.iterate_from()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [32]:
len(reader)

1583820

In [170]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
def naive_bayes(self):
    self.dictionary = {}
    