<a href="https://colab.research.google.com/github/aditi-saxena-1206/CS344/blob/main/gaussian_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
from sklearn import datasets
import numpy as np
import pandas as pd
import math
from sklearn.metrics import accuracy_score

In [108]:
def load_data():
  """Load the Iris dataset through scikit-learn's `datasets` module and return it unchanged."""
  return datasets.load_iris()

In [109]:
def describe_data(X,Y,features,classes):
  """Print a short textual summary of the dataset.

  Args:
    X: 2-D array of samples; X.shape[0] is reported as the observation count
       and X.shape[1] as the feature count.
    Y: target vector. Not used here; kept so existing callers keep working.
    features: feature names, printed as-is.
    classes: class (label) names, printed as-is.

  Returns:
    None. Output goes to stdout only.
  """
  print("Total number of observation: ", X.shape[0])
  # X.shape[1] is the number of columns of X, i.e. the number of features.
  # It was previously reported as the number of labels, which is a different
  # quantity (the labels are listed on the last line).
  print("Total number of features: ", X.shape[1])
  print("Features: ", features)
  print("Labels: ",classes)


In [110]:
def split_data(X,Y,n):
  """Split class-sorted data into train and test sets (70% / 30% per class).

  The data is expected to be sorted class-wise with n equally sized classes,
  so class j occupies rows [j*class_len, (j+1)*class_len). The first 70% of
  each class goes to the training set and the remainder to the test set.
  If the row count is not a multiple of n, the trailing rows are left out.

  Args:
    X: 2-D array of samples, one row per observation.
    Y: 1-D array of targets aligned with the rows of X.
    n: number of classes.

  Returns:
    [X_train, X_test, Y_train, Y_test]. Rows in each set are interleaved by
    class: position 0 of class 0, 1, ..., n-1, then position 1 of each class,
    and so on.
  """
  class_len = int(X.shape[0]/n)
  train_len = int(class_len * 70/100)
  train_row_list = []
  test_row_list = []

  for i in range(class_len):
    # Use the computed 70% cut-off rather than a hard-coded 35, which was
    # only correct when every class had exactly 50 rows.
    target_rows = train_row_list if i < train_len else test_row_list
    # Same emission order as a nested loop over j: one row per class.
    target_rows.extend(j*class_len + i for j in range(n))

  X_train = X[train_row_list,:]
  X_test = X[test_row_list,:]
  Y_train = Y[train_row_list]
  Y_test = Y[test_row_list]

  return [X_train,X_test,Y_train,Y_test]


In [111]:
def prior_prob(Y_train,n):
  """Estimate class priors as relative frequencies of the labels 0..n-1.

  Args:
    Y_train: 1-D array of integer class labels.
    n: number of classes; label values are compared against 0, 1, ..., n-1.

  Returns:
    Float array of length n where entry i is count(Y_train == i) / len(Y_train).
  """
  total = Y_train.shape[0]
  counts = np.array([np.sum(Y_train == label) for label in range(n)], dtype=float)
  return counts / total

In [112]:
def stat_summary(X_train,Y_train,classes,features):
  """Compute the per-class mean and standard deviation of every feature.

  Args:
    X_train: 2-D array of training samples, one row per observation.
    Y_train: 1-D array of integer labels aligned with X_train; class i of
      `classes` is identified by the label value i.
    classes: class names, used as the row index of the result.
    features: feature names (strings), used to build the column names.

  Returns:
    pandas DataFrame with one row per class and two columns per feature:
    column 2*j holds the mean and column 2*j+1 the (population) standard
    deviation of feature j. Each statistic is also printed as it is computed.
  """
  labels = np.asarray(Y_train)
  summary = np.zeros((len(classes),2*len(features)))
  for i in range(len(classes)):
    # Select this class's rows by label. The previous stride-3 slicing was
    # only correct for exactly three classes interleaved in a fixed order.
    class_rows = X_train[labels == i]
    for j in range(len(features)):
      temp_array = class_rows[:,j]
      temp_mean = temp_array.mean()
      temp_std = temp_array.std()
      summary[i][j*2] = temp_mean
      summary[i][j*2+1] = temp_std
      print(classes[i]," : ",features[j]," : ",temp_mean," : ",temp_std)

  # Column labels are kept exactly as before (including the existing spelling)
  # so printed output and any label-based lookups stay unchanged.
  feature_name_list = []
  for name in features:
    feature_name_list.extend([name+"(mean)",name+"(std. devation)"])

  df = pd.DataFrame(summary,index = classes, columns = feature_name_list)
  return df

In [113]:
def normal_pdf(data_point,mean,std):
  """Evaluate the normal (Gaussian) probability density at a single point.

  Args:
    data_point: value at which the density is evaluated.
    mean: mean of the distribution.
    std: standard deviation of the distribution; a value of zero makes the
      division below raise ZeroDivisionError.

  Returns:
    The density exp(-(x - mean)^2 / (2*std^2)) / sqrt(2*pi*std^2) as a float.
  """
  variance = float(std)**2
  offset = float(data_point) - float(mean)
  exponent = -offset**2 / (2*variance)
  normaliser = (2*math.pi*variance) ** 0.5
  return math.exp(exponent) / normaliser

In [134]:
def joint_probabilities(feature_list, prior, stat_df):
  """Compute the unnormalised class scores for one sample.

  For each class the score is the class prior multiplied by the product of
  normal_pdf values of every feature.

  Args:
    feature_list: feature values of a single sample.
    prior: per-class prior probabilities; its length sets the number of classes.
    stat_df: table read positionally, where row i holds for feature j the mean
      at column 2*j and the standard deviation at column 2*j+1.

  Returns:
    Float array of length len(prior) with one score per class.
  """
  n_classes = len(prior)
  joint_prob = np.zeros(n_classes)
  for cls in range(n_classes):
    likelihood = 1
    for k, value in enumerate(feature_list):
      mean_k = stat_df.iloc[cls, 2*k]
      std_k = stat_df.iloc[cls, 2*k + 1]
      likelihood = likelihood * normal_pdf(value, mean_k, std_k)
    joint_prob[cls] = likelihood * prior[cls]
  return joint_prob

In [127]:
def predict(posterior):
  """Return the index of the largest entry of `posterior` (np.argmax picks the first on ties)."""
  best_index = np.argmax(posterior)
  return best_index


In [138]:
def predict_test(X_test, prior, stat_df):
  """Predict a class index for every row of X_test.

  Args:
    X_test: 2-D array of samples, one row per observation.
    prior: per-class prior probabilities, forwarded to joint_probabilities.
    stat_df: per-class feature statistics, forwarded to joint_probabilities.

  Returns:
    Integer array with one predicted class index per row of X_test.
  """
  n_rows = X_test.shape[0]
  predictions = [
    predict(joint_probabilities(X_test[row, :], prior, stat_df))
    for row in range(n_rows)
  ]
  return np.array(predictions, dtype=int)

In [136]:
def main():
  """Run the full pipeline: load Iris, split, fit the statistics, predict, and report accuracy.

  Progress messages, the statistics table, and the final accuracy score are
  printed to stdout; nothing is returned.
  """
  print("Loading IRIS dataset from scikit-learn library...")
  iris = load_data()
  print("Dataset Loaded.")

  print("Description of data")
  X, Y = iris.data, iris.target
  features, classes = iris.feature_names, iris.target_names
  describe_data(X, Y, features, classes)

  n_classes = len(classes)

  print("Splitting the data in the ratio train:test::70:30")
  # split_data returns [X_train, X_test, Y_train, Y_test] in that order.
  X_train, X_test, Y_train, Y_test = split_data(X, Y, n_classes)
  print("Data splitted.")

  print("Calculating Prior Probabilities for each class")
  priors = prior_prob(Y_train, n_classes)

  print("Calculating Summary Statistics")
  stat_df = stat_summary(X_train, Y_train, classes, features)
  print(stat_df)

  print("Predicting for test data...")
  Y_pred = predict_test(X_test, priors, stat_df)
  print("Prediction done")

  print("Checking Accuracy")
  print("Accuracy Score: ", accuracy_score(Y_test, Y_pred))



In [137]:
# Entry point: run the whole pipeline only when this code is executed directly,
# not when it is imported from another module.
if __name__=='__main__':
  main()

Loading IRIS dataset from scikit-learn library...
Dataset Loaded.
Description of data
Total number of observation:  150
Total number of labels:  4
Features:  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Labels:  ['setosa' 'versicolor' 'virginica']
Splitting the data in the ratio train:test::70:30
Data splitted.
Calculating Prior Probabilities for each class
Calculating Summary Statistics
setosa  :  sepal length (cm)  :  5.045714285714285  :  0.35724569947854995
setosa  :  sepal width (cm)  :  3.4685714285714284  :  0.3693789603192507
setosa  :  petal length (cm)  :  1.477142857142857  :  0.17085618728829774
setosa  :  petal width (cm)  :  0.2428571428571428  :  0.09938586931957763
versicolor  :  sepal length (cm)  :  6.00857142857143  :  0.5261023412506237
versicolor  :  sepal width (cm)  :  2.7685714285714287  :  0.31511578694770825
versicolor  :  petal length (cm)  :  4.314285714285714  :  0.466073481112065
versicolor  :  petal width (cm)  :  1.3