In [57]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [58]:
# Load the bundled Iris dataset and pull out the feature matrix and the
# class-label vector in one step.
iris = load_iris()
data, labels = iris["data"], iris["target"]

In [59]:
# Binarise every attribute against its own column mean: 1 when the value is
# strictly above the mean, 0 otherwise.
# A single comparison builds a fresh integer array, so the array held by
# `iris` is not overwritten, and the result does not depend on the sign of
# the means (two sequential masked assignments would re-test the zeros
# written by the first one).
attr_means = np.mean(data, axis=0)
data = (data > attr_means).astype(int)

In [60]:
# Keep 75% of the samples for training and hold out 25% for evaluation; the
# fixed random_state makes the split repeatable.
# NOTE(review): train_test_split is imported at the top of this notebook from
# sklearn.cross_validation; newer scikit-learn releases expose it from
# sklearn.model_selection instead - confirm the target version.
trainX, testX, trainY, testY = train_test_split(
    data,
    labels,
    train_size=0.75,
    test_size=0.25,
    random_state=3,
)

In [61]:
class NBClassifier(object):
    """Naive Bayes classifier for discrete (categorical) attributes.

    Probabilities are plain relative frequencies taken from the training set.
    No smoothing is applied, so an attribute value that never occurs together
    with a class gives that class a likelihood of zero.
    """

    def fit(self, data, labels):
        """Store the training set and derive its basic statistics.

        Parameters
        ----------
        data : array-like, shape (N, M)
            One row per sample, one column per discrete attribute.
        labels : array-like, shape (N,)
            Class label of each row in ``data``.
        """
        self.X = np.asarray(data)
        self.Y = np.asarray(labels)
        self.N = self.X.shape[0]  # number of training samples
        self.M = self.X.shape[1]  # number of attributes
        self.class_values = np.unique(self.Y)

    def get_class_probablity(self, class_value):
        """Return the prior P(C): the fraction of training rows labelled ``class_value``."""
        return np.count_nonzero(self.Y == class_value) / float(self.N)

    def get_all_class_probabilities(self):
        """Return a list with the prior of every class, ordered like ``self.class_values``."""
        return [self.get_class_probablity(c) for c in self.class_values]

    def get_prob_of_value_in_attr(self, data, attr_idx, value):
        """Return the relative frequency of ``value`` in column ``attr_idx`` of ``data``.

        Parameters
        ----------
        data : array-like, shape (n_rows, M)
            The rows to count over (the whole train set or a per-class subset).
        attr_idx : int
            Index of the attribute (column) to inspect.
        value :
            The attribute value whose frequency is wanted.

        Returns
        -------
        float
            Matching rows divided by the number of rows in ``data``; 0.0 when
            ``data`` has no rows.
        """
        rows = np.asarray(data)
        n_rows = rows.shape[0]
        if n_rows == 0:
            return 0.0
        # Normalise by the size of the set that was actually passed in, so a
        # per-class subset yields the conditional P(x_i | C) rather than a
        # joint frequency over the whole training set.
        return np.count_nonzero(rows[:, attr_idx] == value) / float(n_rows)

    def get_prob_of_tuple(self, t):
        """Return P(t) over the entire train set, assuming independent attributes.

        The result is the product of the per-attribute frequencies of ``t[i]``.
        """
        return np.prod([self.get_prob_of_value_in_attr(self.X, i, t[i]) for i in range(self.M)])

    def get_prob_of_tuple_in_class(self, t, class_value):
        """Return the likelihood P(t | C) for the class ``class_value``.

        Only the training rows labelled ``class_value`` are considered, and the
        attributes are treated as independent given the class.
        """
        subset = self.X[self.Y == class_value]  # rows belonging to the class
        return np.prod([self.get_prob_of_value_in_attr(subset, i, t[i]) for i in range(self.M)])

    def predict(self, t):
        """Return the most likely class label for the tuple ``t``.

        Classes are scored with P(t | C) * P(C). Bayes' rule would further
        divide by the evidence P(t), but that term is the same for every class
        and therefore cannot change the winner; leaving it out also avoids a
        0/0 when ``t`` contains a value that never occurs in the train set.

        Returns
        -------
        The element of ``self.class_values`` with the highest score. On ties
        the class that comes last in ``self.class_values`` wins.
        """
        likelihoods = [self.get_prob_of_tuple_in_class(t, c) for c in self.class_values]
        scores = np.multiply(likelihoods, self.get_all_class_probabilities())
        # A stable sort keeps tied classes in index order, so taking the last
        # position is deterministic.
        best = np.argsort(scores, kind="mergesort")[-1]
        return self.class_values[best]


In [62]:
# Build the classifier and hand it the training split.
clf = NBClassifier()
clf.fit(data=trainX, labels=trainY)

In [63]:
%%timeit
predictions = []
for t in testX:
    predictions.append(clf.predict(t))
predictions = np.array(predictions).astype("int")
print(testY, "\n", predictions)
score = accuracy_score(testY, predictions)
print("Score = ", score)

[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
 [0 0 0 0 0 2 1 0 2 2 1 0 1 1 2 0 2 2 1 0 2 2 2 2 0 2 2 1 0 2 0 0 2 2 0 0 2
 0]
Score =  0.789473684211
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
 [0 0 0 0 0 2 1 0 2 2 1 0 1 1 2 0 2 2 1 0 2 2 2 2 0 2 2 1 0 2 0 0 2 2 0 0 2
 0]
Score =  0.789473684211
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
 [0 0 0 0 0 2 1 0 2 2 1 0 1 1 2 0 2 2 1 0 2 2 2 2 0 2 2 1 0 2 0 0 2 2 0 0 2
 0]
Score =  0.789473684211
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
 [0 0 0 0 0 2 1 0 2 2 1 0 1 1 2 0 2 2 1 0 2 2 2 2 0 2 2 1 0 2 0 0 2 2 0 0 2
 0]
Score =  0.789473684211
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
 [0 0 0 0 0 2 1 0 2 2 1 0 1 1 2 0 2 2 1 0 2 2 2 2 0 2 2 1 0 2 0 0 2 2 0 0 2
 0]
Score =  0.789473684211
[0 0 0 0 0 2 1 0 2 1 1 0 1 1 2 0 1 2 2 0 2 2 2 1 0 2 2 1 1 1 0 0 2 1 0 0 1
 0] 
