# Naive Bayes 

#### TL;DR
Develop an understanding of the Naive Bayes algorithm by coding it from scratch.

### Reference


### Take-aways

- Bayes Theorem:

$$P(A|B)=\frac{P(B|A)P(A)}{P(B)}$$

- "Naive" means the features are assumed to be conditionally independent given the class
- Multivariate Gaussian:

$$p(x)=\frac{1}{\sqrt{(2\pi)^D|\Sigma|}}\exp\Big(-\frac{1}{2}(x-\mu)^T\Sigma^{-1}(x-\mu)\Big)$$

where $x=\text{vector input}$, $\mu=\text{vector mean}$, $\Sigma=\text{covariance matrix}$ and $D=\text{dimensionality of } x$

In [None]:
import numpy as np
from datetime import datetime
from scipy.stats import norm # single dimension gaussian
from scipy.stats import multivariate_normal as mvn # multivariable dimension gaussian

In [None]:
class Bayes(object):
    """Gaussian Bayes classifier with one full-covariance Gaussian per class.

    ``fit`` estimates a mean vector, a covariance matrix and a prior for
    every distinct label; ``predict`` picks the label with the highest
    log-posterior ``log p(x | c) + log p(c)``.

    Note: each class uses the full covariance matrix from ``np.cov``, so the
    model does not impose the "naive" feature-independence assumption.

    Attributes set by ``fit``:
        gaussians: dict mapping label -> {'mean': (D,) array, 'cov': (D, D) array}
        priors: dict mapping label -> fraction of training samples with that label
    """

    def fit(self, X, Y, smoothing=10e-2):
        """Estimate per-class Gaussian parameters and class priors.

        Args:
            X: 2-D array-like of shape (N, D) holding the training samples.
            Y: 1-D array-like of N labels (any sortable, hashable values).
            smoothing: value added to the diagonal of every covariance
                matrix so that it stays invertible.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        _, D = X.shape
        self.gaussians = dict()
        self.priors = dict()
        for c in np.unique(Y):
            current_x = X[Y == c]
            if len(current_x) > 1:
                cov = np.cov(current_x.T)
            else:
                # np.cov yields NaN for a single sample; fall back to a zero
                # covariance so only the smoothing term remains.
                cov = np.zeros((D, D))
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'cov': cov + np.eye(D) * smoothing,
            }
            self.priors[c] = len(current_x) / float(len(Y))

    def score(self, X, Y):
        """Return the fraction of samples in X whose prediction equals Y."""
        return np.mean(self.predict(X) == np.asarray(Y))

    def predict(self, X):
        """Return the most probable label for each row of X.

        Args:
            X: 2-D array-like of shape (N, D).

        Returns:
            1-D array of N labels drawn from the labels seen in ``fit``.
        """
        X = np.asarray(X)
        N, _ = X.shape
        # Sort the labels so each one owns a fixed column of the score matrix;
        # indexing columns by the label itself only works for labels 0..K-1.
        classes = sorted(self.gaussians)
        P = np.zeros((N, len(classes)))
        for k, c in enumerate(classes):
            g = self.gaussians[c]
            P[:, k] = (mvn.logpdf(X, mean=g['mean'], cov=g['cov'])
                       + np.log(self.priors[c]))
        # Map the winning column index back to its label.
        return np.asarray(classes)[np.argmax(P, axis=1)]

In [None]:
if __name__ == '__main__':
    # Train on the first half of the data, evaluate on both halves, then plot
    # the learned mean of each class.
    #
    # NOTE(review): X and Y are not defined anywhere in the visible file; the
    # loader call below is commented out and get_data is not defined here.
    # They must be provided (e.g. by an earlier notebook cell) before running.
    # X, Y = get_data(10000)
    Ntrain = len(Y) // 2
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    model = Bayes()
    t0 = datetime.now()
    model.fit(Xtrain, Ytrain)
    print("Training time:", (datetime.now() - t0))

    t0 = datetime.now()
    print("Train accuracy:", model.score(Xtrain, Ytrain))
    print("Time to compute train accuracy:", (datetime.now() - t0),
          "Train size:", len(Ytrain))

    t0 = datetime.now()
    print("Test accuracy:", model.score(Xtest, Ytest))
    print("Time to compute test accuracy:", (datetime.now() - t0),
          "Test size:", len(Ytest))

    # Plot the mean of each class.
    # NOTE(review): plt is not imported in the visible file (presumably
    # matplotlib.pyplot -- confirm). The reshape requires 784 features per
    # sample, i.e. 28x28 images.
    # dict.items() replaces iteritems, which was never imported.
    for c, g in model.gaussians.items():
        plt.imshow(g['mean'].reshape(28, 28))
        plt.title(c)
        plt.show()