# Naive Bayes
This notebook shows how to build a Naive Bayes classifier from scratch. We will first look at features that contain only binary values, and later we will also implement Gaussian Naive Bayes for continuous features.

In [94]:
import scipy
import scipy.io as sio
import numpy as np
from collections import Counter

## Naive Bayes with binary inputs

In [98]:
class NaiveBayes(object):
    '''Naive Bayes classifier for binary (0/1) features and any number of classes.

    State filled in by ``train``:
        counts      -- Counter mapping each class label to its number of samples
        label_names -- list of class labels, in order of first appearance
        priors      -- list of P(class), aligned with ``label_names``
        condi       -- list of 1-D arrays P(feature == 1 | class), aligned with
                       ``label_names``
    '''
    def __init__(self):
        '''Create an untrained classifier.'''
        self.priors = []
        self.condi = []
        self.counts = None
        self.label_names = None

    def get_classes(self, labels):
        '''Count each class, and store the name of each class.

        Args:
            labels: 1-D iterable of hashable class labels.
        '''
        self.counts = Counter(labels)
        self.label_names = list(self.counts.keys())

    def get_prior(self, labels):
        '''Calculate the prior probability of each class.

        Requires ``get_classes`` to have been called on the same labels.
        The list is rebuilt on every call so that retraining does not keep
        stale entries from a previous fit.
        '''
        total = len(labels)
        self.priors = [self.counts[name] / total for name in self.label_names]

    def get_conditional(self, training, labels):
        '''Estimate P(feature == 1 | class) for every class.

        Args:
            training: 2-D array-like or scipy sparse matrix of 0/1 values,
                one row per sample.
            labels: array-like of class labels, one per row of ``training``.
        '''
        labels = np.asarray(labels)
        if scipy.sparse.issparse(training):
            # CSR supports the row selection used below for every sparse format.
            training = training.tocsr()
        else:
            training = np.asarray(training)
        # Rebuild from scratch so that retraining does not accumulate entries.
        self.condi = []
        for label_name in self.label_names:
            pos = np.where(labels == label_name)[0]
            ones = np.asarray(training[pos, :].sum(axis=0), dtype=float).ravel()
            # Add-one (Laplace) smoothing. A binary feature has two possible
            # values, so the denominator grows by 2; this keeps every estimate
            # strictly inside (0, 1) and therefore keeps the logs finite.
            self.condi.append((ones + 1) / (len(pos) + 2))

    def train(self, training, labels):
        '''Fit the classifier, replacing any previously learned parameters.

        Args:
            training: 2-D array-like or scipy sparse matrix of 0/1 values.
            labels: array-like of class labels; it is flattened, so both
                column and row vectors are accepted.
        '''
        labels = np.asarray(labels).ravel()
        self.get_classes(labels)
        self.get_prior(labels)
        self.get_conditional(training, labels)

    def predict(self, testset):
        '''Predict given input dataset.

        Args:
            testset: 2-D array-like or scipy sparse matrix of 0/1 values, one
                row per sample. A 1-D input is treated as a single sample.
        Return:
            list of classes, one per row of ``testset``
        Raises:
            ValueError: if the classifier has not been trained.
        '''
        if scipy.sparse.issparse(testset):
            testset = testset.toarray()
        samples = np.asarray(testset, dtype=float)
        if samples.shape[0] == 0:
            return []
        if not self.label_names:
            raise ValueError('NaiveBayes must be trained before calling predict')
        samples = np.atleast_2d(samples)

        # Loop-invariant pieces: one row of feature probabilities per class and
        # the log prior of each class.
        cond = np.vstack([np.asarray(c, dtype=float).ravel() for c in self.condi])
        log_priors = np.log(np.asarray(self.priors, dtype=float))

        results = []
        for row in samples:
            # Per-feature likelihood for every class at once:
            # p where the feature is 1, (1 - p) where it is 0.
            likelihood = row * cond + (1 - row) * (1 - cond)
            scores = np.log(likelihood).sum(axis=1) + log_priors
            # argmax returns the first maximum, so ties go to the earliest class.
            results.append(self.label_names[int(np.argmax(scores))])
        return results

    @staticmethod
    def accuracy(predicts, labels):
        '''Fraction of predictions that match the true labels.

        Args:
            predicts: sequence of predicted labels.
            labels: array-like of true labels; it is flattened, so both
                column and row vectors are accepted.
        Return:
            float, accuracy of prediction
        Raises:
            ValueError: if the number of predictions and labels differ.
        '''
        predicts = np.asarray(predicts).ravel()
        labels = np.asarray(labels).ravel()
        if predicts.shape != labels.shape:
            raise ValueError('predicts and labels must have the same number of elements')
        # Averaging over the flattened labels keeps the result in [0, 1]
        # whether the labels arrive as (N,), (N, 1) or (1, N).
        return float(np.mean(predicts == labels))

### Test how this one works
The data is from [CMU's ML course 10-601](http://www.cs.cmu.edu/~ninamf/courses/601sp15/hw/hw3_code.tar).

In [99]:
# Read the MATLAB file into a dict keyed by variable name.
var_dict = sio.loadmat(file_name='./hw3/HW3Data.mat')
# Display the variable names stored in the file.
var_dict.keys()

dict_keys(['__header__', '__version__', '__globals__', 'XTrain', 'XTest', 'XTrainSmall', 'yTrain', 'yTest', 'yTrainSmall', 'Vocabulary'])

In [100]:
# Fit on the XTrain/yTrain split, then score the model on that same split.
NB = NaiveBayes()
NB.train(training=var_dict['XTrain'], labels=var_dict['yTrain'])

pred_train = NB.predict(testset=var_dict['XTrain'])
NB.accuracy(predicts=pred_train, labels=var_dict['yTrain'])

1.0

In [102]:
# Score the already-trained model on the XTest/yTest split.
results = NB.predict(testset=var_dict['XTest'])
NB.accuracy(predicts=results, labels=var_dict['yTest'])

0.9793103448275862

## Gaussian Naive Bayes