In [1]:
import numpy as np


class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. ``predict`` returns, for each sample, the class label with the
    largest (unnormalised) log-posterior.

    Attributes:
        X, y: the data most recently passed to ``__init__`` or ``fit``.
        n_observations, n_features: dimensions of ``X``.
        labels: sorted unique class labels (``np.unique(y)``).
        n_labels: number of distinct class labels.
        eps: constant added to every variance so that zero-variance
            features do not cause a division by zero or ``log(0)``.
        mean, variance, prior: dicts keyed by class label, filled by ``fit``.
    """

    def __init__(self, X, y):
        """Record the data's dimensions and class labels.

        No parameters are estimated here; call ``fit`` before ``predict``.

        Args:
            X: 2-D array-like, one row per observation, one column per feature.
            y: 1-D array-like of class labels, one per row of ``X``.
        """
        self.eps = 1e-6
        self.mean = {}
        self.variance = {}
        self.prior = {}
        self._register_data(X, y)

    def _register_data(self, X, y):
        """Store the data set and derive its dimensions and label set."""
        shape = np.shape(X)
        self.X = X
        self.y = y
        self.n_observations = shape[0]
        self.n_features = shape[1]
        self.labels = np.unique(y)
        self.n_labels = len(self.labels)

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        The label set is taken from the ``y`` passed here (not from the data
        given to ``__init__``), so the model stays consistent when it is
        fitted on a different data set. Parameters from a previous fit are
        discarded.

        Args:
            X: 2-D array-like of training features.
            y: 1-D array-like of training labels, one per row of ``X``.
        """
        self._register_data(X, y)
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from empty dicts so labels from an earlier fit cannot linger.
        self.mean = {}
        self.variance = {}
        self.prior = {}

        for c in self.labels:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.variance[c] = np.var(X_c, axis=0)
            # Prior = fraction of training samples belonging to class c.
            self.prior[c] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: 2-D array-like with the same number of features used in ``fit``.

        Returns:
            1-D numpy array of labels drawn from ``self.labels``.

        Raises:
            KeyError: if ``fit`` has not been called yet.
        """
        X = np.asarray(X)
        log_prob = np.empty((X.shape[0], self.n_labels))

        for i, c in enumerate(self.labels):
            # Parameters are keyed by the label value c, not by its position i.
            log_prior = np.log(self.prior[c])
            # eps is added to the variance for numeric stability.
            smoothed_var = self.variance[c] + self.eps
            # Gaussian log-likelihood summed over features. The term
            # -0.5 * n_features * log(2*pi) is identical for every class and
            # is therefore dropped; it does not affect the argmax.
            log_cp = (
                -0.5 * np.sum(np.power(X - self.mean[c], 2) / smoothed_var, axis=1)
                - 0.5 * np.sum(np.log(smoothed_var))
            )
            log_prob[:, i] = log_prior + log_cp

        return self.labels[np.argmax(log_prob, axis=1)]

## Test on Iris Data

In [2]:
# Load the Iris dataset as pandas objects (as_frame=True).
from sklearn import datasets

data = datasets.load_iris(as_frame=True)

# Use every feature column as X and the class labels as y.
X, y = data["data"], data["target"]

In [3]:
# Count missing values in each feature column.
X.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; stratify on y so every class keeps
# the same proportion in both splits, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Show the class balance of the full data set and of each split.
print('Labels counts in y:', y.value_counts())
print('Labels counts in y_test:', y_test.value_counts())
print('Labels counts in y_train:', y_train.value_counts())

Labels counts in y: 0    50
1    50
2    50
Name: target, dtype: int64
Labels counts in y_test: 2    15
0    15
1    15
Name: target, dtype: int64
Labels counts in y_train: 0    35
2    35
1    35
Name: target, dtype: int64


In [5]:
# Instantiate the from-scratch classifier with the training data as numpy arrays.
cf = GaussianNaiveBayes(X_train.to_numpy(), y_train.to_numpy())

In [6]:
# Estimate the per-class means, variances and priors from the training split.
cf.fit(X_train.to_numpy(), y_train.to_numpy())

In [7]:
# Predict class labels for the held-out test split.
y_pred = cf.predict(X_test.to_numpy())

In [8]:
# Accuracy = number of test labels equal to the prediction / number of test samples.
print(f"Accuracy: {np.sum(y_test.to_numpy()==y_pred)/X_test.shape[0]}")

Accuracy: 0.9777777777777777


In [9]:
# Reference model: fit scikit-learn's GaussianNB on the same training split
# and predict the same test split.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_sklearn = gnb.predict(X_test)

In [10]:
# Same accuracy computation, this time for the scikit-learn predictions.
print(f"Accuracy: {np.sum(y_test.to_numpy()==y_pred_sklearn)/X_test.shape[0]}")

Accuracy: 0.9777777777777777


## The test accuracy matches sklearn's `GaussianNB`!