# Naive Bayes
In machine learning, naïve Bayes classifiers are a family of simple "probabilistic classifiers" based on applying Bayes' theorem with strong (naïve) independence assumptions between the features. They are among the simplest Bayesian network models. However, they can be coupled with kernel density estimation to achieve higher accuracy levels.

# Imports

In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sortedcontainers import SortedList
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

# get_data(limit=None):
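This function reads the data from the CSV file `train.csv`, shuffles the rows, and returns the normalised features (scaled by 1/255) together with their labels. If `limit` is given, only the first `limit` shuffled samples are returned (the reduced form).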

In [5]:
def get_data(limit=None):
    """Load ``train.csv`` and return shuffled, scaled features with labels.

    The first CSV column is used as the label and every remaining column
    as a feature; features are divided by 255.0.

    Parameters
    ----------
    limit : int or None, optional
        When not None, only the first ``limit`` rows (after shuffling)
        are returned.

    Returns
    -------
    X : numpy.ndarray
        Feature columns divided by 255.0.
    Y : numpy.ndarray
        Label column.
    """
    print("Reading in and transforming data...")
    df = pd.read_csv('train.csv')
    # np.random.shuffle works in place, but ``df.values`` may be a read-only
    # view of the frame's storage (pandas Copy-on-Write), so shuffle a
    # private copy instead of the frame's own buffer.
    data = df.values.copy()
    np.random.shuffle(data)
    X = data[:, 1:] / 255.0  # original note: data is from 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

# Naive Bayes Model

In [6]:
class NaiveBayes(object):
    """Gaussian naive Bayes classifier.

    Each class is modelled by a per-feature mean and variance (i.e. a
    Gaussian with diagonal covariance) plus a prior equal to the class
    frequency in the training labels.
    """

    def fit(self, X, Y, smoothing=10e-3):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like, shape (N, D)
            Training samples, one per row.
        Y : array-like, shape (N,)
            Class label of each row of ``X``.
        smoothing : float, optional
            Constant added to every variance so that features that are
            constant within a class do not yield a zero variance.

        Populates ``self.gaussians`` (label -> {'mean', 'var'}) and
        ``self.priors`` (label -> fraction of training rows).
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        self.gaussians = dict()
        self.priors = dict()
        for c in np.unique(Y):
            members = (Y == c)
            current_x = X[members]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(members.sum()) / len(Y)

    def score(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals ``Y``."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Parameters
        ----------
        X : array-like, shape (N, D)

        Returns
        -------
        numpy.ndarray, shape (N,)
            Predicted labels, taken from the labels seen in ``fit``.
        """
        X = np.asarray(X)
        N = X.shape[0]
        # Columns are indexed by position in the sorted label list, not by
        # the label value itself, so labels need not be 0..K-1.
        classes = sorted(self.gaussians)
        P = np.zeros((N, len(classes)))
        for k, c in enumerate(classes):
            g = self.gaussians[c]
            # The variance vector is passed as ``cov``; scipy treats a 1-D
            # ``cov`` as the diagonal of the covariance matrix.
            P[:, k] = (mvn.logpdf(X, mean=g['mean'], cov=g['var'])
                       + np.log(self.priors[c]))
        # Translate winning column positions back into the original labels.
        return np.asarray(classes)[np.argmax(P, axis=1)]

We use only 10,000 data points and split them into two equal sets: training data and test data.

In [12]:
# Load a 10000-sample subset, then cut it in half: the first half is used
# for training and the second half for testing.
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]


Reading in and transforming data...


Time the training step and the scoring of the train and test sets, and print the resulting accuracies.

In [14]:
# Fit the classifier and report how long fitting took.
model = NaiveBayes()
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
fit_duration = datetime.now() - t0
print("Training time:", fit_duration)

# Accuracy on the training half, with the time spent scoring it.
t0 = datetime.now()
train_accuracy = model.score(Xtrain, Ytrain)
print("Train Accuracy : ", train_accuracy)
train_score_duration = datetime.now() - t0
print("time to train accuracy", train_score_duration, "\t Train size:", len(Ytrain))

# Accuracy on the held-out half, with the time spent scoring it.
t0 = datetime.now()
test_accuracy = model.score(Xtest, Ytest)
print("Test Accuracy : ", test_accuracy)
test_score_duration = datetime.now() - t0
print("time to test accuracy", test_score_duration, "\t Test size:", len(Ytest))

Training time: 0:00:00.189250
Train Accuracy :  0.8098
time to train accuracy 0:00:07.153797 	 Train size: 5000
Test Accuracy :  0.7848
time to test accuracy 0:00:07.105230 	 Test size: 5000
