In [5]:
#import necessary packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [6]:
class Dataset:
    def __init__(self, df, col_types, target_id = 0):
        self.target_id = target_id
        self.df = df
        self.col_types = col_types
        self.dictionaries = {}
        self.reverses = {}
        self.cols = list(df.columns)
        for i in range(len(self.df.columns)):
            if self.col_types[i] == "nc":
                self.normalize(i)
                
    def normalize(self, col_idx):
        col = self.df.values[:, col_idx]
        name = self.df.columns[col_idx]
        uniques = list(set(col))
        dictionary = {unique : i for (i, unique) in enumerate(uniques)}
        for i in range(len(col)):
            self.df.at[i, name] = dictionary[col[i]]
        self.dictionaries[col_idx] = dictionary
        self.reverses[col_idx] = {value: key for (key, value) in dictionary.items()}
        
    def split(self, val):
        xs = [i for i in range(len(self.cols)) if i != self.target_id]
        self.X = self.df.iloc[:, xs].values
        self.y = self.df.iloc[:, self.target_id].values
        self.y = self.y.astype('int')
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = val, random_state=1)

In [7]:
class NaiveBayesClassifier:
    def __init__(self, dataset):
        self.dataset = dataset # Instance of our Dataset class
        self.target = dataset.cols[dataset.target_id] # name of the target column
        self.hierarchical_likelihoods = {}
        self.priors = {}
        self.total = dataset.df.shape[0] * 1.0 # total no. of rows in our dataset

    def calculate_likelihoods(self):
        # calculation of mean and variance for continuous variables
        self.means = dataset.df.groupby(self.target).mean()
        self.variances = dataset.df.groupby(self.target).var()
        
        # calculation of likelihoods for discrete variables
        self.hierarchical_likelihoods = {} # target level hierarchy
        for target, idx in dataset.dictionaries[dataset.target_id].items():
            # loop through all possible target classes
            sample = dataset.df[dataset.df[dataset.cols[dataset.target_id]] == idx]
            # All rows in the sample contains only one target class
            self.priors[idx] = sample.shape[0]*1.0
            
            class_wise_probs = {} #column level hierarchy
            for i, col in enumerate(dataset.cols):
                # loop through all available columns and calculate likelihoods
                if i == dataset.target_id or dataset.col_types[i] == 'c':
                    # Skip this column if it is a continuous variable or a target variable
                    continue
                    
                uniques = {} #class level hierarchy
                for unique, u_idx in dataset.dictionaries[i].items():
                    # for every unique value in the column calculate likelihoods
                    count = sample[sample[col] == u_idx].shape[0]
                    uniques[u_idx] = count/self.priors[idx]
                
                #save class level hierarchies in column level hierarchies
                class_wise_probs[i] = uniques
                
            #save column level hierarchies in target level hierarchies
            self.hierarchical_likelihoods[idx] = class_wise_probs
            
            self.priors[idx] /= self.total
    def likelihood(self, target_idx, col_idx, val):
        if dataset.col_types[col_idx] == 'nc':
            # when the column is a discrete variable, return the
            # likelihood from hierarchical_likelihoods
            return self.hierarchical_likelihoods[target_idx][col_idx][val]
        else:
            # If continuous, calculate the likelihood using gaussian formula
            m = self.means[dataset.cols[col_idx]][target_idx]
            v = self.variances[dataset.cols[col_idx]][target_idx]
            likelihood = 1/(np.sqrt(2*np.pi*v)) * np.exp((-(val-m)**2)/(2*v))
            return likelihood
    def predict(self, x):
        arr = [] #empty array to store outputs
        num_cols = x.shape[1] #total no. of features in input data
        for i in range(x.shape[0]): #iterating through all inputs(rows)
            posteriors = {} # dictionary of all available posteriors
            for idx, target in dataset.reverses[dataset.target_id].items(): 
                #iterating through every target class
                prob = self.priors[idx] #initializing running product of likelihoods

                for j in range(num_cols):
                    # iterate through every feature in our input
                    prob *= self.likelihood(idx, j, x[i][j]) #accumulate the product of likelihoods
                posteriors[idx] = prob # store the probability as posterior of this target class
            #sort the posteriors and select the one with highest probability
            prediction = sorted(posteriors.items(), key = lambda x: x[1])[-1][0]
            arr.append(prediction) #save this prediction for this row
        return np.asarray(arr) #return the array as a numpy array



In [8]:
#download the dataset from here
#https://www.kaggle.com/abcsds/pokemon
# and place it in the same folder
df = pd.read_csv('Pokemon.csv')
df = df.drop(['#', 'Name'], axis = 1)
df = df.fillna(0)
col_types = {i : 'c' for i in range(len(df.columns))}
col_types[0] = "nc"
col_types[1] = "nc"
col_types[9] = "nc"
col_types[10] = "nc"
dataset = Dataset(df, col_types, target_id = 10)
dataset.split(0.3)

In [9]:
nbc = NaiveBayesClassifier(dataset)
nbc.calculate_likelihoods()
y_pred = nbc.predict(dataset.X_test)
accuracy = accuracy_score(dataset.y_test, y_pred)*100
print(f'Accuracy of Gaussian Naive Bayes Algorithm written from scratch : {accuracy:.3f}')

Accuracy of Gaussian Naive Bayes Algorithm written from scratch : 93.333


In [10]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(dataset.X_train, dataset.y_train)
y_pred = classifier.predict(dataset.X_test)
accuracy = accuracy_score(dataset.y_test, y_pred)*100
print(f'Accuracy of Gaussian Naive Bayes Algorithm from SKLearn(using python module) : {accuracy:.3f}')

Accuracy of Gaussian Naive Bayes Algorithm from SKLearn(using python module) : 92.917
