In [1]:
import numpy as np
import pydotplus
import pandas as pd
import sklearn.datasets as Datasets
from sklearn import tree
from sklearn import model_selection as cv
from sklearn import naive_bayes
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def calculatePriorProbabilities(Y):
    classes = set(Y)
    result = {}
    for i in classes:
        result[i] = (len(Y[Y==i])/len(Y))
    return result

In [3]:
def naiveProbability(priorProbability,current_class,X_train,Y_train,X_test_sample):
    result = priorProbability
    #Modifying X_train for current class only
    class_samples = (Y_train == current_class)
    Y_train = Y_train[class_samples]
    X_train = X_train[class_samples]
    dr = len(Y_train)
    for i in range(0,X_train.shape[-1]):
        nr = len(X_train[X_train[:,i]==X_test_sample[i]])
        result = result * (nr/dr)
    return result

In [4]:
#For Labelled(non-continous) Features only
def naiveBayesPredict(X_train,Y_train,X_test,priorProbabilities={}):
    classes = set(Y_train)
    test_samples = X_test.shape[0]
    y_pred = np.zeros(test_samples)
    if (len(priorProbabilities) == 0) :
        priorProbabilities = calculatePriorProbabilities(Y_train)
    #print(priorProbabilities)
    for i in range(0,test_samples):
        probabilities = {}
        for current_class in classes:
            probabilities[current_class] = naiveProbability(priorProbabilities[current_class],current_class,X_train,Y_train,X_test[i,:])
        #print("For sample",X_test[i,:]," probabilities are = ",probabilities)
        y_pred[i] = max(probabilities,key=probabilities.get)
    return y_pred

# Changing Iris Dataset to labelled dataset

In [5]:
def makeLabelled(column):
    mean = column.mean()
    for i in range (0,len(column)):
        column[i] = int(column[i]>=mean) 
    return column

In [6]:
iris = Datasets.load_iris()
df = pd.DataFrame(iris.data)

In [7]:
X = df.values
Y = iris.target

In [8]:
for i in range(0,X.shape[-1]):
    X[:,i] = makeLabelled(X[:,i])

In [9]:
X_train,X_test,Y_train,Y_test = cv.train_test_split(X,Y,test_size=0.2,random_state=0)

# Checking Results From My Implementation

In [10]:
Y_pred = naiveBayesPredict(X,Y,X_test)

In [11]:
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       0.60      0.23      0.33        13
          2       0.31      0.67      0.42         6

avg / total       0.66      0.60      0.58        30

[[11  0  0]
 [ 1  3  9]
 [ 0  2  4]]


# Checking Results Using sklearn Gaussian Model

In [12]:
from sklearn import naive_bayes
gnb = naive_bayes.GaussianNB()
gnb.fit(X_train,Y_train)
Y_pred = gnb.predict(X_test)

In [13]:
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       0.92      1.00      0.96        11
          1       1.00      0.08      0.14        13
          2       0.35      1.00      0.52         6

avg / total       0.84      0.60      0.52        30

[[11  0  0]
 [ 1  1 11]
 [ 0  0  6]]
