In [206]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB

In [221]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'talk.religion.misc'])
vectorizer = CountVectorizer(stop_words="english", min_df=5)
vectors = np.asarray(vectorizer.fit_transform(train.data).todense())

In [238]:
class NaiveBayesClassification(object):
    def fit(self, X, y):
        num_class = np.unique(y).shape[0]

        self.prior = np.zeros((num_class))
        for i in range(num_class):
            self.prior[i] = (y == 1).sum() / y.shape[0]

        # Assumming the features are in the representation of bag-of-words
        x_by_c = np.array([X[y == c] for c in range(num_class)]) + 1.0
        sum_words = np.array([arr.sum(0) for arr in x_by_c])
        total_words = sum_words.sum()
        self.likelihood = sum_words / total_words

    def predict(self, X):
        posterior = np.zeros((X.shape[0], self.prior.shape[0]))
        for i, x in enumerate(X):
            pos = x.astype(bool)
            likelihood = self.likelihood[:, pos]
            likelihood = likelihood.prod(1)
            posterior[i] = self.prior * likelihood
        proba = posterior / posterior.sum(1).reshape(-1, 1)
        return proba

In [239]:
c = NaiveBayesClassification()
c.fit(vectors, train.target)

In [240]:
vectors[3].astype(bool).sum(), np.exp(np.log(c.likelihood[:, vectors[3].astype(bool)]).sum())

(111, 0.0)

In [241]:
c.prior, c.likelihood

(array([0.43990665, 0.43990665]),
 array([[0.00013282, 0.00014134, 0.00013529, ..., 0.00013199, 0.00013474,
         0.00013282],
        [0.00010477, 0.00010834, 0.00010532, ..., 0.00010587, 0.00010394,
         0.00010669]]))

In [242]:
c.predict(vectors)

array([[9.99996866e-01, 3.13446690e-06],
       [9.99999961e-01, 3.85015766e-08],
       [           nan,            nan],
       ...,
       [           nan,            nan],
       [9.99998718e-01, 1.28170010e-06],
       [           nan,            nan]])

In [219]:
bench = MultinomialNB().fit(X, y)

In [220]:
bench.predict_proba(X)

array([[0.05382675, 0.94617325],
       [0.10215483, 0.89784517],
       [0.14578588, 0.85421412],
       [0.96919027, 0.03080973],
       [0.62098241, 0.37901759],
       [0.80376766, 0.19623234],
       [0.92474413, 0.07525587]])