In [1]:
import numpy as np


In [2]:
def fit(X_train,Y_train):
    """Count feature-value occurrences per class for a categorical naive Bayes model.

    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
        Discrete (already binned) feature values. Converted with
        ``np.asarray``, so nested lists are accepted as well as arrays.
    Y_train : array-like, shape (n_samples,)
        Class label of each row of ``X_train``.

    Returns
    -------
    dict
        ``result['total_count']`` is the number of training rows. For every
        class label ``c``, ``result[c]['total_count']`` is the number of rows
        of that class and ``result[c][j][v]`` is how many of those rows have
        value ``v`` in feature ``j`` (features are numbered from 1). Every
        value seen anywhere in column ``j`` gets an entry, with count 0 when
        the class never takes it.

    Notes
    -----
    The key ``'total_count'`` is reserved at the top level, so a class label
    equal to that string would collide with it.
    """
    X_train=np.asarray(X_train)
    Y_train=np.asarray(Y_train)
    num_features=X_train.shape[1]

    # The distinct values of a column do not depend on the class, so collect
    # them once instead of once per class.
    values_per_feature=[set(X_train[:,col]) for col in range(num_features)]

    # Set before the loop so the key exists even when there are no classes.
    result={'total_count':len(Y_train)}
    for current_class in set(Y_train):
        current_class_rows=(Y_train==current_class)
        X_train_current=X_train[current_class_rows]

        class_counts={'total_count':len(X_train_current)}
        for j in range(1,num_features+1):
            column_current=X_train_current[:,j-1]
            # Count of each possible value of feature j within this class.
            class_counts[j]={}
            for current_value in values_per_feature[j-1]:
                class_counts[j][current_value]=(column_current==current_value).sum()
        result[current_class]=class_counts

    return result

In [3]:
def probability(dictionary,x,current_class):
    """Return the Laplace-smoothed log score of ``x`` under ``current_class``.

    Despite the name, the result is a log value: the log prior of the class
    plus, for each feature, the log of the smoothed conditional frequency of
    the observed value. It is meant for comparison between classes.

    Parameters
    ----------
    dictionary : dict
        Count tables in the layout produced by ``fit``: a top-level
        ``'total_count'`` and, per class, a ``'total_count'`` plus one
        value-count dict per 1-based feature index.
    x : sequence
        Feature values of one sample; ``x[j-1]`` is the value of feature ``j``.
    current_class : hashable
        Class label to score; must be a key of ``dictionary``.

    Returns
    -------
    float
        ``log P(class) + sum_j log((count(x_j, class) + 1) / (n_class + V_j))``
        where ``V_j`` is the number of distinct values recorded for feature j.
    """
    class_counts=dictionary[current_class]
    class_total=class_counts['total_count']

    # Log prior: log(rows of this class) - log(all rows).
    output=np.log(class_total) - np.log(dictionary['total_count'])

    # Every key of the class table except 'total_count' is a feature index.
    num_features=len(class_counts)-1
    for j in range(1,num_features+1):
        value_counts=class_counts[j]
        # A value never seen in training has count 0; the +1 (Laplace
        # correction) keeps its probability non-zero instead of raising
        # KeyError.
        count_current_class_with_value_xj=value_counts.get(x[j-1],0)+1
        count_current_class_in_dictionary=class_total + len(value_counts)
        output+=np.log(count_current_class_with_value_xj) - np.log(count_current_class_in_dictionary)
    return output


In [4]:
def predictSinglePoint(dictionary,x):
    """Pick the class whose ``probability`` score for sample ``x`` is largest.

    Iterates over the keys of ``dictionary``, skipping the bookkeeping key
    ``'total_count'``. The first class seen is always taken as the initial
    best; a later class replaces it only with a strictly greater score.
    Returns ``-1`` when ``dictionary`` contains no class at all.
    """
    best_class=-1
    best_score=None
    for label in dictionary:
        # 'total_count' holds the overall row count, not a class table.
        if label=='total_count':
            continue
        score=probability(dictionary,x,label)
        if best_score is None or score>best_score:
            best_score=score
            best_class=label
    return best_class
    

In [5]:
def predict(dictionary,X_test):
    """Return a list with the predicted class of every row of ``X_test``.

    Each row is classified independently with ``predictSinglePoint`` using
    the count tables in ``dictionary``; the output order follows the row
    order of ``X_test``.
    """
    return [predictSinglePoint(dictionary,row) for row in X_test]

In [6]:
def convert_column(column):
    """Bin a numeric column in place into levels 0-3 relative to its mean.

    The cut points are 0.5*mean, mean and 1.5*mean, all computed once before
    any element is overwritten:

        value <= 0.5*mean             -> 0
        0.5*mean < value <= mean      -> 1
        mean < value <= 1.5*mean      -> 2
        otherwise                     -> 3

    ``column`` is modified element by element and nothing is returned, so
    passing a view such as ``X[:, i]`` updates the parent array.

    NOTE(review): with a negative mean the three cut points are no longer in
    ascending order, so the levels stop being ordered bins.
    """
    mean=column.mean()
    low_cut=0.5*mean
    high_cut=1.5*mean
    for i in range(len(column)):
        value=column[i]
        # The cut points are tested in order, so each branch only needs its
        # upper bound.
        if value<=low_cut:
            level=0
        elif value<=mean:
            level=1
        elif value<=high_cut:
            level=2
        else:
            level=3
        column[i]=level


In [7]:
from sklearn import datasets as ds
# Load the iris dataset bundled with scikit-learn and split it into the
# feature matrix (X) and the class labels (Y).
data = ds.load_iris()
X, Y = data.data, data.target


In [8]:
# Discretize every feature column so the count-based classifier can use it.
# Start from a fresh copy of the loaded measurements: convert_column works in
# place, so binning `data.data` directly would overwrite the loaded dataset
# and re-running this cell would bin values that were already binned.
X=data.data.copy()
N=X.shape[1]
for i in range(N):
    convert_column(X[:,i])

    

In [9]:
from sklearn.model_selection import train_test_split as tts

# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle so
# the same split is produced on every run.
X_train,X_test,Y_train,Y_test=tts(X,Y,test_size=0.25,random_state=0)

In [10]:
# Build the per-class value-count tables from the training split.
dictionary=fit(X_train,Y_train)

In [11]:
# Predict a class label for every held-out row using the fitted count tables.
ypred=predict(dictionary,X_test)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report

# Parenthesized single-argument print is valid both as the Python 2 print
# statement and as the Python 3 print function, so the cell runs on either.
print(confusion_matrix(Y_test,ypred))
print(classification_report(Y_test,ypred))

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

