# Importing Necessary Libraries

In [1]:
# Dataframes
import numpy as np
import pandas as pd
from copy import deepcopy

# Plots
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8,8)
plt.style.use('ggplot')

# Scikit Learn
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics



# Initializing Parameters

In [2]:
# Path of the folder
path = 'C:/Users/prash/Downloads/ML ALGORITHMS/'

#labels for discretization
labels = ['low','medium','high']

# Importing and Cleaning Dataset

In [3]:
# Import Diabetes Dataset
diabetes_dataset = pd.read_csv(path + 'DATASETS/' + 'diabetes.csv')

# Input Dataframe
X = diabetes_dataset.iloc[:,:-1]


#Preprocessing
for j in X.columns:
    mean = X[j].mean()
    X[j] = X[j].replace(0,mean)
    X[j] = pd.cut(X[j],bins=len(labels),labels=labels)

    
# Encode the Output labels
Y = diabetes_dataset.iloc[:,-1]

# Divide into train and test datasets
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)


# Implementation From Scratch

In [4]:
# Training

def count(X,Y,target,col_index = None,label = None):
    if(label != None):
        data = [X,Y]
        return len(data[0][(data[0].iloc[:,col_index] == label) & (data[1] == target)])
    else:
        return len(Y[Y == target])

# Initializing Necessary Dictionaries
probabilities = {}
categ_count = {}
categ_label_count = {}
prob = {}

for category in range(np.unique(Y_train).size):
    probabilities[category] = {}
    categ_count[category] = count(X_train,Y_train,0,category)
    prob[category] = categ_count[category]/X_train.shape[0]

# Calculating Probabilities for each Category for each Feature and Label
for category in range(np.unique(Y_train).size):
    for col_index in range(X_train.shape[1]):
        probabilities[category][col_index] = {}
        for label in labels:
            categ_label_count[category] = count(X_train,Y_train,category,col_index,label)
            probabilities[category][col_index][label] = categ_label_count[category] / categ_count[category]
            
# Testing

# Calculating Probabilty for each Category and Select the Category which has Max Probabilty
predicted = []
for row_index in range(X_test.shape[0]):
    prod = deepcopy(prob)
    for category in range(np.unique(Y_test).size):
        for col_index in range(X_test.shape[1]):
            label = X_test.iloc[row_index,col_index]
            prod[category] *= probabilities[category][col_index][label]
    dummy = list(prod.values())
    predicted.append(dummy.index(max(dummy)))
    

# Metrics

tp,tn,fp,fn = 0,0,0,0
for j in range(0,len(predicted)):
    if predicted[j] == 0:
        if Y_test.iloc[j] == 0:
            tp += 1
        else:
            fp += 1
    else:
        if Y_test.iloc[j] == 1:
            tn += 1
        else:
            fn += 1
print('Accuracy for training : ',((tp+tn)/len(Y_test))*100)

Accuracy for training :  70.12987012987013


# Scikit Learn Implementation

In [5]:
#creating labelEncoder
le = LabelEncoder()
for i in range(X.shape[1]):
    # Converting string labels into numbers.
    X.iloc[:,i] = le.fit_transform(X.iloc[:,i])
        
# Divide into train and test datasets
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

# Training
clf = MultinomialNB()
clf.fit(X_train, Y_train)

# Testing
Y_pred = clf.predict(X_test)

# Metrics
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

Accuracy: 0.7077922077922078


### Reference:
https://blog.goodaudience.com/building-the-na%C3%AFve-bayes-classifier-from-scratch-in-python-b0717fa022d8 <br>
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html