<a href="https://colab.research.google.com/github/bmreiniger/datascience.stackexchange/blob/master/SO66103912_NB_priors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import brier_score_loss
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Fetch the 20-newsgroups documents together with their integer newsgroup ids.
raw_docs, newsgroup_ids = fetch_20newsgroups(return_X_y=True)

# Binary bag-of-words features (binary=True), the input BernoulliNB is fitted on below.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(raw_docs)

# Binary target: True for newsgroup ids 0-4, False for the rest.
y = newsgroup_ids < 5

In [None]:
# Mean of the boolean target, i.e. the fraction of samples with y == True.
# Reused in later cells as the empirical prior of the positive class.
y_mean = y.mean()
y_mean

0.24951387661304578

In [15]:
# Sweep the second entry of `class_prior` while the first stays at 1 - y_mean.
# The value 2 is deliberately not a probability: the printed class_log_prior_
# is simply the log of whatever was passed in, with no renormalisation.
preds = {}
for other_prob in [0.1, y_mean, 0.5, 1-y_mean, 2]:
    model = BernoulliNB(class_prior=[1-y_mean, other_prob])
    model.fit(X, y)
    print(model.class_log_prior_)
    # Predict once per model and reuse the result for both the Brier score
    # and the stored predictions (the original predicted twice per iteration).
    proba = model.predict_proba(X)
    print(brier_score_loss(y, proba[:, 1]))
    preds[other_prob] = proba


[-0.28703412 -2.30258509]
0.055614231239509426
[-0.28703412 -1.38824075]
0.05285587841729709
[-0.28703412 -0.69314718]
0.05094316624448597
[-0.28703412 -0.28703412]
0.049901117563745274
[-0.28703412  0.69314718]
0.04769378858170929


In [None]:
# Show the stored predict_proba arrays for the first two prior settings
# (second prior entry 0.1 vs. y_mean) side by side.
# NOTE(review): columns presumably follow model.classes_, i.e. [False, True] — confirm.
preds[0.1], preds[y_mean]

(array([[1.00000000e+00, 8.88797640e-17],
        [3.84402315e-02, 9.61559769e-01],
        [9.99930218e-01, 6.97816055e-05],
        ...,
        [4.43707462e-01, 5.56292538e-01],
        [9.54444306e-05, 9.99904556e-01],
        [1.00000000e+00, 1.33579256e-13]]),
 array([[1.00000000e+00, 2.21767345e-16],
        [1.57692821e-02, 9.84230718e-01],
        [9.99825903e-01, 1.74096625e-04],
        ...,
        [2.42233496e-01, 7.57766504e-01],
        [3.82543412e-05, 9.99961746e-01],
        [1.00000000e+00, 3.33298780e-13]]))

In [None]:
# Empirical class proportions divided by 10: the ratio between the two priors
# is unchanged, but they no longer sum to 1.
scaled_prior = [(1-y_mean)/10, y_mean/10]
model = BernoulliNB(class_prior=scaled_prior).fit(X, y)
print(model.class_log_prior_)

# Brier score of the positive-class probability, for comparison with the
# y_mean entry of the sweep above.
positive_proba = model.predict_proba(X)[:, 1]
print(brier_score_loss(y, positive_proba))

[-2.58961921 -3.69082584]
0.05285587841729718


In [None]:
# Per-sample, per-class scores of the scaled-prior model fitted in the previous cell.
# NOTE(review): _joint_log_likelihood is an underscore-prefixed (private) sklearn
# method — presumably log prior + log likelihood per class; it may change between releases.
model._joint_log_likelihood(X)

array([[-399.88626177, -435.93116461],
       [-476.000472  , -471.86667554],
       [-764.94857873, -773.60430472],
       ...,
       [-411.61884458, -410.47837141],
       [-558.23542283, -548.06420758],
       [-410.91301252, -439.64274959]])

In [None]:
from scipy.special import logsumexp
# Per-class scores from the current `model`, one column per class.
jll = model._joint_log_likelihood(X)
# Log-sum-exp over the class axis (axis=1) gives one value per sample —
# presumably the log normaliser that predict_proba subtracts; TODO confirm.
log_prob_x = logsumexp(jll, axis=1)
print(log_prob_x)

[-399.88626177 -471.8507806  -764.94840462 ... -410.20099143 -548.06416932
 -410.91301252]


In [None]:
# Refit with the unscaled empirical class proportions as the prior.
empirical_prior = [1-y_mean, y_mean]
model = BernoulliNB(class_prior=empirical_prior).fit(X, y)
print(model.class_log_prior_)

# Displayed for comparison with the scaled-prior scores from the earlier cell.
model._joint_log_likelihood(X)

[-0.28703412 -1.38824075]


array([[-397.58367668, -433.62857952],
       [-473.69788691, -469.56409045],
       [-762.64599364, -771.30171963],
       ...,
       [-409.31625949, -408.17578632],
       [-555.93283774, -545.76162248],
       [-408.61042742, -437.34016449]])

In [None]:
import numpy as np
# Three prior settings to compare: empirical proportions divided by 10,
# the empirical proportions themselves, and BernoulliNB's default (class_prior=None).
candidate_priors = [
    [(1-y_mean)/10, y_mean/10],
    [1-y_mean, y_mean],
    None,
]
sc_models = [BernoulliNB(class_prior=prior) for prior in candidate_priors]

# Fit each model on the full data and keep only the second predict_proba column.
preds = []
for model in sc_models:
    positive_proba = model.fit(X, y).predict_proba(X)[:, 1]
    preds.append(positive_proba)


In [None]:
# Check whether the second and third models' probabilities match the first
# model's to within np.allclose's default tolerances.
reference_proba = preds[0]
np.allclose(reference_proba, preds[1]), np.allclose(reference_proba, preds[2])

(True, True)