# Importing Necessary Libraries

In [1]:
# Dataframes
import numpy as np
import pandas as pd
from copy import deepcopy

# Plots
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8,8)
plt.style.use('ggplot')

# Scikit Learn
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation  import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics


import math



# Initializing Parameters

In [2]:
# Path of the folder
path = 'C:/Users/prash/Downloads/ML ALGORITHMS/'

#labels for discretization
labels = ['low','medium','high']

# Importing and Cleaning Dataset

In [3]:
# Import Diabetes Dataset
diabetes_dataset = pd.read_csv(path + 'DATASETS/' + 'diabetes.csv')

# Input Dataframe
X = diabetes_dataset.iloc[:,:-1]


#Preprocessing
for j in X.columns:
    mean = X[j].mean()
    X[j] = X[j].replace(0,mean)
    X[j] = pd.cut(X[j],bins=len(labels),labels=labels)

#creating labelEncoder
le = LabelEncoder()
for i in range(X.shape[1]):
    # Converting string labels into numbers.
    X.iloc[:,i] = le.fit_transform(X.iloc[:,i])
    
# Encode the Output labels
Y = diabetes_dataset.iloc[:,-1]

# Divide into train and test datasets
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)


# Implementation From Scratch

In [4]:
# Training

categ_count = {}
categ_index = {}
subdataset = {}
for category in range(np.unique(Y_train).size):
    categ_count[category] = len(Y_train[Y_train == category])/X_train.shape[0]
    categ_index[category] = np.argwhere(Y_train == category)
    subdataset[category] = X_train.iloc[categ_index[category][:,0],:]
    
    
means = {}
std = {}
for category in range(np.unique(Y_train).size):
    # Here we calculate the mean and the standard deviation from datasets
    means[category] = np.mean(subdataset[category], axis=0)
    std[category] = np.std(subdataset[category], axis=0)
    

# Testing

# This function calculates the class probability using gaussian distribution
def calculate_probability(X, mean, stdev):
    exponent = math.exp(-((X - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

# This function predicts the probability for every class 
def predict_probability(X,Y,categ_count,means,std):
    class_prob = {}
    for category in range(np.unique(Y).size):
        class_prob[category] = math.log(categ_count[category], math.e)
    
    for category in range(np.unique(Y).size):
        for i in range(len(means)):
            class_prob[category]+=math.log(calculate_probability(X[i], means[category][i], std[category][i]), math.e)
    
    return class_prob


Y_pred = []
for i in range(X_test.shape[0]):
    dummy = predict_probability(X_test.iloc[i,:],Y_test,categ_count,means,std).values()
    Y_pred.append(np.argmax(list(dummy)))
    
    
    
# Metrics
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

  return getattr(obj, method)(*args, **kwds)


Accuracy: 0.7532467532467533


# Scikit Learn Implementation

In [5]:
# Training
clf = GaussianNB()
clf.fit(X_train, Y_train)

# Testing
Y_pred = clf.predict(X_test)

# Metrics
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

Accuracy: 0.7662337662337663


### Reference:
https://towardsdatascience.com/how-to-impliment-a-gaussian-naive-bayes-classifier-in-python-from-scratch-11e0b80faf5a <br>
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html