In [88]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as d_sets

%matplotlib inline

### Naive Bayes ###
---
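* **Discrete** feature vectors in the classic formulation (the code below instead models each continuous feature with a per-class Gaussian)
* The model makes the strong assumption that the features $x_{i}$ are *conditionally independent* given $y$ 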

> ###### Naive Bayes assumption
> For instance, if y = 1 means spam email; “buy” is word 2087 and “price” is word 39831; 
> then we are assuming that if I tell you y = 1 (that a particular piece of email is spam), then knowledge
> of $x_{2087}$ (knowledge of whether “buy” appears in the message) will have no effect on your beliefs about 
> the value of $x_{39831}$ (whether “price” appears).


> **More formally, this can be written:**  
> p($x_{2087}$ | y) = p($x_{2087}$|y, $x_{39831}$).  
> (Note that this is not the same as saying that $x_{2087}$ and $x_{39831}$ are independent,  
> which would have been written "p($x_{2087}$) = p($x_{2087}$|$x_{39831}$)";  
> rather, we are only assuming that $x_{2087}$ and $x_{39831}$ are conditionally independent given y.)

Good example - spam classifier where feature vector = binary dictionary vector  
taken from [Andrew Ng's CS229 notes](http://cs229.stanford.edu/notes/cs229-notes2.pdf)  
also from [machinelearningmastery](http://machinelearningmastery.com/naive-bayes-classifier-scratch-python/)

In [113]:
def loadData(filename, splitRatio, seed=None):
    """Load a comma-separated numeric file and split its rows into two parts.

    The rows are shuffled in place before the split, so the two parts are a
    random partition of the file.

    Parameters
    ----------
    filename : str
        Path of a comma-delimited file that ``np.loadtxt`` can parse.
    splitRatio : float
        Fraction of the rows that goes into the first (training) part. The
        resulting row count is truncated with ``int``.
    seed : int, optional
        When given, the shuffle is driven by a private
        ``np.random.RandomState(seed)`` so the split is identical on every
        call. When ``None`` (the default) the global NumPy RNG is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train, test)`` where ``train`` holds the first
        ``int(n_rows * splitRatio)`` shuffled rows and ``test`` the rest.
    """
    # ndmin=2 keeps a one-row or one-column file two-dimensional, so
    # shape[0] is always the row count and the 2-D slicing below is valid.
    dataset = np.loadtxt(filename, delimiter=',', ndmin=2)

    train_size = int(dataset.shape[0] * splitRatio)

    if seed is None:
        np.random.shuffle(dataset)
    else:
        np.random.RandomState(seed).shuffle(dataset)

    return (dataset[:train_size, :], dataset[train_size:, :])

class NaiveBayes(object):
    """Gaussian Naive Bayes classifier for numeric rows.

    Every row is expected to carry its class label in the last position and
    feature values in all preceding positions. Fitting stores, per class, the
    mean and standard deviation of each feature; prediction picks the class
    whose per-feature Gaussian densities have the largest product.
    """

    def __init__(self):
        # Maps class label -> list of (mean, stdev) pairs, one per feature.
        self.summaries = {}

    def separateByClass(self, dataset):
        """Group the rows of ``dataset`` by their last element.

        Returns a dict mapping each label to the list of rows carrying it.
        """
        separated = {}
        for vector in dataset:
            separated.setdefault(vector[-1], []).append(vector)
        return separated

    def stdev(self, numbers):
        """Return the population standard deviation (divisor N) of ``numbers``."""
        return np.std(numbers, ddof=0)

    def summarize(self, dataset):
        """Return ``[(mean, stdev), ...]`` for every column except the last.

        The last column is the class label, so it is excluded.
        """
        columns = list(zip(*dataset))
        return [(np.mean(column), self.stdev(column)) for column in columns[:-1]]

    def fit(self, dataset):
        """Compute per-class feature summaries from ``dataset``.

        Any summaries from a previous call are discarded first so that a
        refit never keeps classes that are absent from the new data.

        Returns the ``summaries`` dict (also stored on the instance).
        """
        self.summaries = {}
        separated = self.separateByClass(dataset)
        for classValue, instances in separated.items():
            self.summaries[classValue] = self.summarize(instances)
        return self.summaries

    def getProb(self, x, mean, stdev):
        """Evaluate the Gaussian density with ``mean`` and ``stdev`` at ``x``.

        NOTE: ``stdev`` is used as a divisor, so a value of 0 (a feature
        that is constant within a class) is not handled here.
        """
        exponent = np.exp(-(np.power(x - mean, 2) / (2 * np.power(stdev, 2))))
        return (1 / (np.sqrt(2 * np.pi) * stdev)) * exponent

    def calculateClassProbabilities(self, inputVector):
        """Return a dict mapping each fitted class to its likelihood score.

        The score is the product of the per-feature Gaussian densities of
        ``inputVector``; no class prior is multiplied in. Only the first
        ``len(classSummaries)`` entries of ``inputVector`` are read, so a
        trailing label column is ignored.
        """
        probabilities = {}
        for classValue, classSummaries in self.summaries.items():
            likelihood = 1
            for i, (mean, stdev) in enumerate(classSummaries):
                likelihood *= self.getProb(inputVector[i], mean, stdev)
            probabilities[classValue] = likelihood
        return probabilities

    def _predict(self, inputVector):
        """Return the class with the highest score for one row.

        Returns ``None`` when no class has been fitted.
        """
        probabilities = self.calculateClassProbabilities(inputVector)
        bestLabel, bestProb = None, -1
        for classValue, probability in probabilities.items():
            if bestLabel is None or probability > bestProb:
                bestProb = probability
                bestLabel = classValue
        return bestLabel

    def predict(self, testSet):
        """Return a list with the predicted class for every row of ``testSet``."""
        return [self._predict(row) for row in testSet]

def accuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    ``testSet[i][-1]`` is compared with ``predictions[i]`` for every index of
    ``testSet``; the share of matches is scaled to the 0-100 range.
    """
    total = len(testSet)
    hits = sum(1 for idx in range(total) if testSet[idx][-1] == predictions[idx])
    return (hits / float(total)) * 100.0


In [114]:
filename = '../datasets/pima-indians-diabetes.csv'
splitRatio = 0.6

# Seed NumPy's global RNG so the shuffle performed while loading the data
# (and therefore the split and the reported accuracy) is the same on every run.
np.random.seed(42)

trainingSet, testSet = loadData(filename, splitRatio)
# Format the string *inside* the print call: chaining .format() onto print(...)
# formats print's return value, which is None under Python 3.
print('Split {0} rows into train={1} and test={2} rows'.format(
    len(trainingSet) + len(testSet), len(trainingSet), len(testSet)))

clf = NaiveBayes()
clf.fit(trainingSet)
# Read the fitted statistics from the instance rather than from fit()'s
# return value.
summaries = clf.summaries

predictions = clf.predict(testSet)

acc = accuracy(testSet, predictions)
print('Accuracy: {0}%'.format(acc))

Split 768 rows into train=460 and test=308 rows
Accuracy: 73.3766233766%
