In [4]:
import numpy as np
import pandas as pd

In [20]:
class GaussianNB:
  """Gaussian Naive Bayes classifier with Laplace-smoothed class priors.

  Every feature is modelled, per class, as an independent normal
  distribution whose mean and variance are estimated from the training
  rows belonging to that class.

  Parameters
  ----------
  alpha : float, default=1.0
    Additive (Laplace) smoothing applied to the class counts when the
    priors are estimated. ``alpha=0`` gives plain relative frequencies.
  epsilon : float, default=1e-9
    Absolute constant added to every per-class variance before it is used
    in the density. It keeps the density finite when a feature is constant
    within a class (variance of exactly zero).

  Attributes
  ----------
  classes_ : ndarray of shape (n_classes,)
    Sorted unique labels seen by ``fit``.
  means_ : ndarray of shape (n_classes, n_features)
    Per-class feature means.
  variances_ : ndarray of shape (n_classes, n_features)
    Per-class feature variances (population variance, unsmoothed).
  priors_ : ndarray of shape (n_classes,)
    Smoothed class prior probabilities.
  """

  def __init__(self, alpha=1.0, epsilon=1e-9):
    self.alpha = alpha
    self.epsilon = epsilon

  def fit(self, X, y):
    """Estimate per-class means, variances and priors.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like with n_samples labels

    Returns
    -------
    self

    Raises
    ------
    ValueError
      If ``X`` is not 2-D, is empty, or its row count differs from the
      number of labels.
    """
    X = np.asarray(X, dtype=float)
    # Flatten so a column vector of labels yields a 1-D boolean row mask.
    y = np.asarray(y).ravel()

    if X.ndim != 2:
      raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
    if X.shape[0] == 0:
      raise ValueError("cannot fit on an empty data set")
    if X.shape[0] != y.shape[0]:
      raise ValueError(
        f"X has {X.shape[0]} rows but y has {y.shape[0]} labels")

    self.classes_ = np.unique(y)
    n_samples, n_features = X.shape
    n_classes = len(self.classes_)

    self.means_ = np.zeros((n_classes, n_features))
    self.variances_ = np.zeros((n_classes, n_features))
    self.priors_ = np.zeros(n_classes)

    for idx, k in enumerate(self.classes_):
      Xk = X[y == k]
      self.means_[idx] = Xk.mean(axis=0)
      self.variances_[idx] = Xk.var(axis=0)

      # Laplace smoothing of the class frequencies; with alpha=0 this
      # reduces to the plain ratio Xk.shape[0] / n_samples.
      self.priors_[idx] = (
        (Xk.shape[0] + self.alpha) / (n_samples + self.alpha * n_classes))

    return self

  def _log_gaussian(self, X):
    """Return the per-class log-likelihood, shape (n_samples, n_classes).

    The same smoothed variance is used in the quadratic term and in the
    log-normaliser. Smoothing only the denominator would leave ``log(0)``
    in the normaliser for a zero-variance feature, giving that class an
    infinite likelihood.
    """
    var = self.variances_ + self.epsilon
    # (n_samples, 1, n_features) - (n_classes, n_features)
    #   -> (n_samples, n_classes, n_features)
    sq_diff = (X[:, None, :] - self.means_) ** 2
    log_prob = -0.5 * (sq_diff / var + np.log(2 * np.pi * var))

    return log_prob.sum(axis=2)  # sum the independent feature terms

  def predict(self, X):
    """Return the most probable class label for every row of ``X``.

    Raises
    ------
    ValueError
      If ``X`` is not 2-D or its column count differs from the number of
      features seen in ``fit``. Without this check a single-column ``X``
      would broadcast silently against every fitted feature.
    """
    X = np.asarray(X, dtype=float)
    n_features = self.means_.shape[1]
    if X.ndim != 2 or X.shape[1] != n_features:
      raise ValueError(
        f"X must have shape (n_samples, {n_features}), got {X.shape}")

    log_joint = self._log_gaussian(X) + np.log(self.priors_)

    return self.classes_[np.argmax(log_joint, axis=1)]  # best class per row


In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# return_X_y=True yields the feature matrix and the target vector directly.
X, y = load_breast_cancer(return_X_y=True)

In [22]:
# Stratified hold-out split with a fixed seed so the split is reproducible;
# the test fraction is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  stratify=y,
  random_state=42,
)

In [23]:
# Uses the GaussianNB class defined in this notebook, with the default
# prior smoothing spelled out explicitly.
model = GaussianNB(alpha=1.0)

In [24]:
# Estimate per-class means, variances and priors from the training split.
model.fit(X=X_train, y=y_train)

<__main__.GaussianNB at 0x7b263d34cb00>

In [25]:
# Predicted label for every row of the held-out split.
y_pred = model.predict(X=X_test)

In [27]:
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but the documented order keeps the call correct if it is later
# swapped for an asymmetric metric such as precision or recall.
print(accuracy_score(y_test, y_pred))

0.9370629370629371
