# Naive Bayes Implementation

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

In [2]:
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [3]:
df = pd.DataFrame(np.c_[iris['data'], iris['target']], 
                 columns=iris.feature_names + ['species'])

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


## Naive Bayes

In [5]:
def estimate_priors(y):
    return np.bincount(y)/len(y)

def fit_gaussian(X, indep=False):
    mu = np.mean(X, 0)
    if indep:
        sigma = np.diag(np.var(X, 0))
    else:
        sigma = np.cov(X.T)
    
    return (mu, sigma)

# TODO: Vectorized over a matrix (X)
def dmvn(x, mu, sigma, log=False):
    n = mu.shape[0]
    pi = np.pi
    sigma_inv = np.linalg.inv(sigma)
    x_mu = x-mu
    denom = ((2*pi)**(n/2)) * np.linalg.det(sigma)**(1/2)    
    dens = np.exp(-((x_mu).T @ sigma_inv @ (x_mu))/2)/denom
    if log:
        dens = np.log(dens)
        
    return dens

# TODO: Vectorized splitting
def split(X, y):
    grps = np.unique(y)
    n = len(grps)
    arr = [0]*n
    for i, grp in enumerate(grps):
        arr[i] = X[y==grp, :]
        
    return arr

In [6]:
class GaussianNaiveBayes(BaseEstimator, ClassifierMixin):
    '''Gaussian Naive Bayes classifier
    
    Parameters
    ----------
    
    priors : array-like, shape (n_classes,)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data.
    
    indep: bool, optional (default=False)
        Whether the features are assumed to be independent (default) or not.
        
    '''
    def __init__(self, priors=None, indep=False):
        self.indep = indep
        self.priors = priors # CHECK: Will naming differently cause a problem?
    
    def fit(self, X, y):
        spltd = split(X, y)
        self.priors = estimate_priors(y)
        self.ngrps_ = len(self.priors)
        self.dens_ = [None]*self.ngrps_
        self.params_ = [fit_gaussian(x, self.indep) for x in spltd]
        
        # TODO: Vectorize
        for i, x in enumerate(spltd):
            self.dens_ = lambda xi, i: dmvn(xi, *self.params_[i])
            
        return self
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), 1) # MAP Estimate
    
    def predict_proba(self, X):
        posterior = np.array([[self.dens_(x, i)*self.priors[i] 
                               for i in range(self.ngrps_)]
                              for x in X])
        
        return posterior/posterior.sum(1).reshape(-1, 1)
        

In [7]:
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)

GaussianNaiveBayes(indep=False,
                   priors=array([0.34285714, 0.31428571, 0.34285714]))

In [8]:
preds = gnb.predict(X_test)
preds

array([1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 1, 1, 1, 2, 1, 1, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0,
       1], dtype=int64)

In [9]:
accuracy_score(y_test, preds)

1.0

In [10]:
gnb_sk = GaussianNB()
gnb_sk.fit(X_train, y_train)
preds_sk = gnb_sk.predict(X_test)

In [11]:
accuracy_score(y_test, preds_sk)

1.0