###  Created by Luis A. Sanchez-Perez (alejand@umich.edu)

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from scipy.stats import norm 
from scipy.special import logsumexp

In [2]:
# Load the iris dataset and show the names of its features and classes
dataset = datasets.load_iris()
for names in (dataset.feature_names, dataset.target_names):
    print(names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [3]:
# Split into train/test subsets (train_test_split defaults to a 75% / 25% split).
# A fixed seed makes the split -- and every number derived from it below --
# reproducible under Restart & Run All. The outputs saved in this notebook
# came from an unseeded run, so exact values will differ after re-execution.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=RANDOM_STATE
)

### Using sklearn implementation

In [4]:
# Fit a Gaussian Naive Bayes model on the training split
classifier = GaussianNB()
classifier.fit(X_train, y_train)
print(classifier.theta_)  # per-class feature means
# Per-class feature variances. `sigma_` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `var_`; fall back to the old attribute so the
# cell still runs on older releases.
print(classifier.var_ if hasattr(classifier, "var_") else classifier.sigma_)

[[4.95526316 3.38157895 1.45789474 0.24736842]
 [5.875      2.72222222 4.2        1.31666667]
 [6.61315789 2.96842105 5.55       1.98421053]]
[[0.10510388 0.14150277 0.02822715 0.01144045]
 [0.256875   0.09950618 0.21944445 0.03861111]
 [0.38430056 0.11268698 0.30460527 0.06554017]]


In [5]:
# Evaluate the fitted classifier on the data it was trained on
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
train_accuracy = accuracy_score(y_train, y_pred)
print(cm, train_accuracy, sep="\n")

[[38  0  0]
 [ 0 33  3]
 [ 0  2 36]]
0.9553571428571429


In [6]:
# Evaluate the fitted classifier on the held-out test split.
# (confusion_matrix is already imported in the first cell.)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

[[12  0  0]
 [ 0 12  2]
 [ 0  1 11]]
0.9210526315789473


### Creating new input

In [7]:
# A single new sample, shaped (1, n_features) so it works as a one-row batch
point = np.array([[2, 1, 3, 1]])

### Custom implementation

In [8]:
# Reference result: class probabilities from the sklearn model for the new
# sample, to compare against the manual computation below
sklearn_posterior = classifier.predict_proba(point)
sklearn_posterior

array([[1.62801341e-34, 9.99999652e-01, 3.47954930e-07]])

#### Building mean and std matrices
Notice that we use list comprehensions here and that we work on the training set only. We build a mean matrix and a std matrix in which each row holds the values that define the likelihood pdf for one class.

In [9]:
# One row of per-feature means for each class label found in the training targets
class_labels = np.unique(y_train)
mean = np.array([np.mean(X_train[y_train == label], axis=0) for label in class_labels])
mean

array([[4.95526316, 3.38157895, 1.45789474, 0.24736842],
       [5.875     , 2.72222222, 4.2       , 1.31666667],
       [6.61315789, 2.96842105, 5.55      , 1.98421053]])

In [10]:
# One row of per-feature standard deviations (NumPy default, ddof=0) for each
# class label found in the training targets
class_labels = np.unique(y_train)
std = np.array([np.std(X_train[y_train == label], axis=0) for label in class_labels])
std

array([[0.32419728, 0.37616854, 0.16800937, 0.10696001],
       [0.50682837, 0.31544599, 0.46844898, 0.1964971 ],
       [0.6199198 , 0.33568882, 0.55191056, 0.25600814]])

#### Building priors vector
We use frequency analysis to determine this. Notice we work on the training set only. We get one value per class.

In [11]:
# Relative frequency of each class label in the training targets
_, class_counts = np.unique(y_train, return_counts=True)
n_train = len(y_train)
prior = [count / n_train for count in class_counts]
prior

[0.3392857142857143, 0.32142857142857145, 0.3392857142857143]

#### Computing the likelihood
We compute the likelihood of observing each feature value in the given input (this uses the proper pdf, that is the proper mean and std from the matrix). The result is a matrix the same size as the mean and std matrices.

In [12]:
# Gaussian density of each feature value of `point` under each class's
# (mean, std) parameters; `point` broadcasts against the parameter matrices
likelihood = norm.pdf(point, loc=mean, scale=std)
likelihood

array([[1.11251351e-18, 2.09661381e-09, 1.20585322e-18, 6.60706261e-11],
       [1.59481319e-13, 4.25918898e-07, 3.20126833e-02, 5.54109982e-01],
       [6.07738720e-13, 4.05958540e-08, 1.67314670e-05, 9.62197289e-04]])

#### Computing the log-likelihood instead
We compute the log-likelihood. As you can see, the values have moderate magnitudes instead of being vanishingly close to zero like the raw likelihoods, so there is much less risk of underflow.

In [13]:
# Evaluate the log-density directly rather than taking np.log of the density:
# if norm.pdf underflows to 0 for a far-away point, np.log would return -inf
# (with a runtime warning), whereas norm.logpdf stays finite.
loglikelihood = norm.logpdf(point, loc=mean, scale=std)
loglikelihood

array([[-41.3399098 , -19.98294227, -41.25934429, -23.44029685],
       [-29.4668496 , -14.66901689,  -3.4416231 ,  -0.59039209],
       [-28.12903134, -17.01959989, -10.99821936,  -6.94629105]])

#### Computing the posterior
Here we first use the straightforward method, and along the way we can observe how we get really small values and, of course, a bigger chance of underflow.

In [14]:
# Unnormalised posterior per class: product of the per-feature likelihoods
# (the naive independence assumption) multiplied by the class prior
joint_likelihood = np.prod(likelihood, axis=1)
numerator = joint_likelihood * prior
numerator

array([6.30510288e-56, 3.87292653e-22, 1.34760406e-28])

In [15]:
# Same quantity in log space: the product becomes a sum of log-likelihoods
# and the prior enters as an additive log term
log_prior = np.log(prior)
lognumerator = np.sum(loglikelihood, axis=1) + log_prior
lognumerator

array([-127.10340592,  -49.30286161,  -64.17405436])

In [16]:
# Normalise by the evidence P(x), i.e. the sum of the per-class numerators
evidence = numerator.sum()
posterior = numerator / evidence
posterior

array([1.62799388e-34, 9.99999652e-01, 3.47954856e-07])

In [17]:
# Sanity check: the class posteriors should add up to 1
np.sum(posterior)

1.0

#### Computing the log-posterior
We compute the log posterior using two different ways to determine P(x). First we compute this in an 'unsafe' (less stable) way and then we use the logsumexp trick.

In [18]:
# Evidence P(x) computed in linear space -- note how tiny it is
np.sum(numerator)

3.872927878814349e-22

In [19]:
# 'Unsafe' variant: the evidence is summed in linear space first and only
# then moved to log space
log_evidence = np.log(np.sum(numerator))  # unsafe
logposterior = lognumerator - log_evidence
logposterior

array([-7.78005447e+01, -3.47954924e-07, -1.48711931e+01])

In [20]:
# Exponentiate to map the log-posterior back to probabilities
posterior = np.exp(logposterior)
posterior

array([1.62799388e-34, 9.99999652e-01, 3.47954856e-07])

In [21]:
# Sanity check: how close to 1 do the recovered posteriors sum?
np.sum(posterior)

0.9999999999999933

In [22]:
# More stable variant: obtain log P(x) directly from the log-numerators
# via logsumexp, never leaving log space
log_evidence = logsumexp(lognumerator)  # more stable
logposterior = lognumerator - log_evidence
logposterior

array([-7.78005447e+01, -3.47954916e-07, -1.48711931e+01])

In [23]:
# Exponentiate to map the log-posterior back to probabilities
posterior = np.exp(logposterior)
posterior

array([1.62799388e-34, 9.99999652e-01, 3.47954856e-07])

In [24]:
# Sanity check: how close to 1 do the recovered posteriors sum?
np.sum(posterior)

1.0000000000000004