In [75]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from scipy.special import logsumexp
from sklearn.preprocessing import OneHotEncoder

In [76]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 'talk.religion.misc'])
vectorizer = CountVectorizer(stop_words="english", min_df=5)
vectors = vectorizer.fit_transform(train.data)
# vectors = np.asarray(vectorizer.fit_transform(train.data).todense())

In [77]:
class NaiveBayesClassification(object):
    def __init__(self):
        self.label_bin = OneHotEncoder()

    def fit(self, X, y):
        _y = np.asarray(self.label_bin.fit_transform(y.reshape(y.shape[0], 1)).todense())
        num_class = np.unique(y).shape[0]

        self.prior = np.zeros((num_class))
        for i in range(num_class):
            self.prior[i] = (y == 1).sum() / y.shape[0]
        self.log_prior = np.log(self.prior)

        sum_words = (_y.T @ X) + 1
        total_words = sum_words.sum()
        self.log_likelihood = np.log(sum_words) - np.log(total_words)

    def predict_proba(self, X):
        llh = X @ self.log_likelihood.T
        posterior = llh + self.prior
        # Normalized by Z but in logarithmic form
        posterior = posterior - logsumexp(posterior, axis=1).reshape(-1, 1)
        return np.exp(posterior)

    def predict(self, X):
        preds = self.predict_proba(X)
        return preds.argmax(axis=1)

In [78]:
c = NaiveBayesClassification()
c.fit(vectors, train.target)

In [79]:
c.prior, c.log_likelihood.shape

(array([0.43990665, 0.43990665]), (2, 4098))

In [80]:
c.predict(vectors)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [81]:
bench = MultinomialNB().fit(vectors, train.target)

In [82]:
bench.predict_proba(vectors)

array([[1.00000000e+00, 6.72482499e-30],
       [1.00000000e+00, 3.12984756e-10],
       [1.00000000e+00, 1.10013597e-42],
       ...,
       [1.00000000e+00, 2.06943454e-68],
       [1.00000000e+00, 2.07984869e-21],
       [1.00000000e+00, 3.12470907e-66]])