In [3]:
import numpy as np
import pydotplus
import pandas as pd
import sklearn.datasets as Datasets
from sklearn import tree
from sklearn import model_selection as cv
from sklearn import naive_bayes
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
def calculatePriorProbabilities(Y):
    """Estimate the prior probability of every class label found in ``Y``.

    Parameters
    ----------
    Y : array-like
        One class label per training sample (NumPy array, list, tuple, ...).

    Returns
    -------
    dict
        Maps each distinct label to its relative frequency in ``Y`` (a
        Python float). An empty ``Y`` yields an empty dict.
    """
    # np.asarray lets plain sequences work too; boolean-mask indexing such as
    # Y[Y == i] only behaves as intended when Y is already a NumPy array.
    labels = np.asarray(Y)
    total = labels.size
    # A single pass yields every distinct label together with its count.
    classes, counts = np.unique(labels, return_counts=True)
    return {label: int(count) / total for label, count in zip(classes, counts)}

In [5]:
def naiveProbability(priorProbability,current_class,X_train,Y_train,X_test_sample):
    """Unnormalised naive Bayes score of one test sample for one class.

    The score is ``priorProbability`` multiplied, feature by feature, with
    the fraction of ``current_class`` training rows whose feature value
    equals the corresponding value in ``X_test_sample``. No smoothing is
    applied, so a feature value never seen for the class drives the score
    to zero.

    Parameters
    ----------
    priorProbability : float
        Prior probability of ``current_class``.
    current_class : label
        Class the score is computed for.
    X_train : 2-D numpy array
        Training features, one row per sample.
    Y_train : 1-D numpy array
        Training labels aligned with the rows of ``X_train``.
    X_test_sample : 1-D sequence
        Feature values of the sample being scored.

    Returns
    -------
    float
        Prior times the product of the per-feature relative frequencies.
    """
    # Keep only the training rows that belong to the requested class.
    in_class = (Y_train == current_class)
    class_rows = X_train[in_class]
    class_size = len(Y_train[in_class])

    likelihood = priorProbability
    for col in range(class_rows.shape[-1]):
        matches = class_rows[:, col] == X_test_sample[col]
        # int(...) keeps the count a plain Python int, as len() would.
        likelihood = likelihood * (int(matches.sum()) / class_size)
    return likelihood

In [70]:
# Only suitable for labelled (non-continuous) features
def naiveBayesPredict(X_train,Y_train,X_test,priorProbabilities={}):
    """Predict a class for every row of ``X_test`` with naive Bayes.

    Parameters
    ----------
    X_train : 2-D numpy array
        Training features, one row per sample.
    Y_train : 1-D numpy array
        Training labels aligned with ``X_train``.
    X_test : 2-D numpy array
        Samples to classify, one row per sample.
    priorProbabilities : dict, optional
        Maps each class label to its prior. When empty, the priors are
        estimated from ``Y_train`` via ``calculatePriorProbabilities``.

    Returns
    -------
    numpy array of float
        For each test row, the label with the highest score returned by
        ``naiveProbability``.
    """
    labels = set(Y_train)
    n_samples = X_test.shape[0]
    y_pred = np.zeros(n_samples)

    # An empty mapping is taken to mean "no priors were supplied".
    if not len(priorProbabilities):
        priorProbabilities = calculatePriorProbabilities(Y_train)

    for row in range(n_samples):
        sample = X_test[row, :]
        # Score the sample against every class, then keep the best one.
        scores = {
            label: naiveProbability(priorProbabilities[label], label, X_train, Y_train, sample)
            for label in labels
        }
        y_pred[row] = max(scores, key=scores.get)
    return y_pred

# Using fit To Create Dictionary For Faster Computations while testing

In [64]:
#Returns a dictionary
def fit(X_train,Y_train):
    """Count, per class and per feature, how often each feature value occurs.

    Parameters
    ----------
    X_train : array-like, 2-D
        Training features, one row per sample (labelled/discrete values).
    Y_train : array-like, 1-D
        Training labels aligned with the rows of ``X_train``.

    Returns
    -------
    dict
        ``result[class][feature_index][feature_value] -> count`` where the
        count is the number of training rows of that class whose feature
        equals the value. Every value seen anywhere in a feature column
        gets an entry for every class, so unseen combinations appear as 0.
    """
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    n_features = X_train.shape[-1]

    # The set of values a feature can take does not depend on the class,
    # so compute it once per feature instead of once per (class, feature).
    unique_feature_values = [np.unique(X_train[:, feature]) for feature in range(n_features)]

    result = {}
    for current_class in np.unique(Y_train):
        X_train_current = X_train[Y_train == current_class]
        value = {}
        for feature in range(n_features):
            column = X_train_current[:, feature]
            value[feature] = {
                unique_value: int((column == unique_value).sum())
                for unique_value in unique_feature_values[feature]
            }
        result[current_class] = value
    return result

In [None]:
def calculatePriorProbabilitiesUsingDictionary(dictionary):
    """Derive class priors from a nested count dictionary.

    Parameters
    ----------
    dictionary : dict
        Nested counts of the form
        ``dictionary[class][feature_index][feature_value] -> count``.

    Returns
    -------
    dict
        Maps each class to ``class_size / total_size`` as a Python float.
        An empty ``dictionary`` yields an empty dict.

    Raises
    ------
    ValueError
        If the dictionary has classes but no sample counts to derive the
        class sizes from.
    """
    class_counts = {}
    for current_class, feature_counts in dictionary.items():
        if feature_counts:
            # Every training row contributes exactly one value to each
            # feature, so the counts of any single feature add up to the
            # number of rows in the class.
            any_feature = next(iter(feature_counts.values()))
            class_counts[current_class] = sum(any_feature.values())
        else:
            class_counts[current_class] = 0

    if not class_counts:
        return {}

    total = sum(class_counts.values())
    if total == 0:
        raise ValueError("dictionary holds no sample counts; cannot derive prior probabilities")
    return {current_class: count / total for current_class, count in class_counts.items()}

In [None]:
def naivePredictUsingDictionary(dictionary,X_test,priorProbabilities={}):
    """Predict a class for every row of ``X_test`` from precomputed counts.

    The per-feature likelihoods are read from ``dictionary`` instead of
    being recounted from the training data for every test sample.

    Parameters
    ----------
    dictionary : dict
        Nested counts of the form
        ``dictionary[class][feature_index][feature_value] -> count``.
    X_test : array-like, 2-D
        Samples to classify, one row per sample.
    priorProbabilities : dict, optional
        Maps each class to its prior. When empty, the priors are derived
        with ``calculatePriorProbabilitiesUsingDictionary(dictionary)``.

    Returns
    -------
    numpy array of float
        For each test row, the class with the highest unsmoothed score
        (prior times the product of per-feature relative frequencies).
    """
    X_test = np.asarray(X_test)
    classes = list(dictionary.keys())
    test_samples = X_test.shape[0]
    y_pred = np.zeros(test_samples)

    #Assuming this condition is sufficient
    if (len(priorProbabilities) == 0) :
        priorProbabilities = calculatePriorProbabilitiesUsingDictionary(dictionary)

    # Denominator of every likelihood: rows of the class, recovered by summing
    # the value counts of a feature. Computed once, not once per test sample.
    class_totals = {
        current_class: {feature: sum(counts.values()) for feature, counts in features.items()}
        for current_class, features in dictionary.items()
    }

    for i in range(0,test_samples):
        probabilities = {}
        for current_class in classes:
            result = priorProbabilities[current_class]
            for feature, value_counts in dictionary[current_class].items():
                dr = class_totals[current_class][feature]
                if dr == 0:
                    # A class without samples cannot explain any observation.
                    result = 0.0
                    break
                # Values never seen during fit count as zero occurrences.
                nr = value_counts.get(X_test[i, feature], 0)
                result = result * (nr / dr)
            probabilities[current_class] = result
        y_pred[i] = max(probabilities,key=probabilities.get)
    return y_pred

# Changing Iris Dataset to labelled dataset

### Laplace Correction for probability of one feature
#### In the numerator, add 1; in the denominator, add the number of distinct values that the feature can take (only for labelled datasets)

In [11]:
def makeLabelled(column):
    """Replace a numeric column, in place, by four ordinal labels.

    The thresholds are derived from the column mean ``m``, computed before
    any value is overwritten:

    * ``value < 0.5 * m``  -> 0
    * ``value < m``        -> 1
    * ``value < 1.5 * m``  -> 2
    * otherwise            -> 3

    Parameters
    ----------
    column : 1-D numpy array
        Values to discretise; its contents are overwritten.

    Returns
    -------
    The same ``column`` object, now holding the labels.
    """
    mean_value = column.mean()
    lower_cut = 0.5 * mean_value
    upper_cut = 1.5*mean_value
    # np.select picks the first condition that holds for each element,
    # which mirrors the top-to-bottom evaluation of an if/elif chain.
    labels = np.select(
        [column < lower_cut, column < mean_value, column < upper_cut],
        [0, 1, 2],
        default=3,
    )
    # Write back through the caller's buffer instead of rebinding the name.
    column[:] = labels
    return column

In [71]:
# Load scikit-learn's bundled Iris dataset and wrap its feature matrix in a DataFrame.
iris = Datasets.load_iris()
df = pd.DataFrame(iris.data)

In [72]:
# Feature matrix and class labels as NumPy arrays.
# NOTE(review): df.values may return a view of the DataFrame's buffer rather than
# a copy, so in-place edits to X may write through to df / iris.data - confirm.
X = df.values
Y = iris.target

In [73]:
# Discretise every feature column into the labels 0-3.
# makeLabelled overwrites its argument, so X is rebuilt from a fresh copy of the
# DataFrame values first: df stays untouched and re-running this cell always
# produces the same X instead of re-labelling already-labelled data.
X = df.values.copy()
for i in range(0,X.shape[-1]):
    X[:,i] = makeLabelled(X[:,i])

In [75]:
# Hold out 25% of the samples for testing; random_state=0 fixes the shuffle so the split is repeatable.
X_train,X_test,Y_train,Y_test = cv.train_test_split(X,Y,test_size=0.25,random_state=0)

# Checking Results From My Implementation

In [37]:
# Build the per-class, per-feature value-count dictionary from the training split.
dictionary = fit(X_train,Y_train)

current class =  0
current class =  1
current class =  2


In [91]:
# Inspect the counts stored for class 0: {feature index: {feature value: count}}.
dictionary[0]

{0: {1.0: 39, 2.0: 0},
 1: {1.0: 6, 2.0: 33},
 2: {0.0: 39, 1.0: 0, 2.0: 0, 3.0: 0},
 3: {0.0: 38, 1.0: 1, 2.0: 0, 3.0: 0}}

In [76]:
# Classify the test split with the hand-written naive Bayes implementation.
Y_pred = naiveBayesPredict(X_train,Y_train,X_test)

In [77]:
# Per-class precision/recall/F1, then the confusion matrix, for the current Y_pred.
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        13
          1       0.94      1.00      0.97        16
          2       1.00      0.89      0.94         9

avg / total       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


# Checking Results Using sklearn Multinomial Naive Bayes Model

In [99]:
# Baseline: scikit-learn's multinomial naive Bayes with Laplace smoothing (alpha=1)
# on the same split. naive_bayes is already imported in the notebook's import cell.
# NOTE(review): MultinomialNB models feature values as counts, whereas the features
# here are ordinal category labels (0-3); this likely explains the weak scores below.
mnb = naive_bayes.MultinomialNB(alpha=1)
mnb.fit(X_train,Y_train)
Y_pred = mnb.predict(X_test)
# Class-probability estimates for the test samples the model misclassified.
mnb.predict_proba(X_test[Y_pred != Y_test])

array([[  2.44154394e-04,   4.95365454e-01,   5.04390392e-01],
       [  5.13622003e-05,   4.97827770e-01,   5.02120867e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  5.13622003e-05,   4.97827770e-01,   5.02120867e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  9.30887042e-06,   4.44972777e-01,   5.55017914e-01],
       [  9.30887042e-06,   4.44972777e-01,   5.55017914e-01],
       [  2.35739663e-01,   4.26444066e-01,   3.37816271e-01],
       [  2.35739663e-01,   4.26444066e-01,   3.37816271e-01],
       [  1.42184546e-05,   4.55422199e-01,   5.44563583e-01],
       [  5.01362881e-03,   4.73160545e-01,   5.21825826e-01],
       [  9.30887042e-06,   4.44972777e-01,   5.5501791

In [100]:
# Same report as for the hand-written classifier; Y_pred now holds the scikit-learn model's predictions.
print(classification_report(Y_test,Y_pred))
print(confusion_matrix(Y_test,Y_pred))

             precision    recall  f1-score   support

          0       1.00      0.85      0.92        13
          1       0.00      0.00      0.00        16
          2       0.36      1.00      0.53         9

avg / total       0.43      0.53      0.44        38

[[11  2  0]
 [ 0  0 16]
 [ 0  0  9]]


In [101]:
# Mean accuracy of the fitted scikit-learn model on the test split
# (`gnb` is never defined in this notebook; the fitted model is `mnb`).
mnb.score(X_test,Y_test)

0.52631578947368418