In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score

In [174]:
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class that maximises

        log p(class) + sum_j log N(x_j | mean[class, j], var[class, j])

    The sum is evaluated directly in log space, so samples that lie far from
    every class mean do not underflow to ``log(0) = -inf``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every per-class variance. This keeps the density
        finite when a feature is constant within a class.

    Attributes (set by ``fit``)
    ---------------------------
    classes : ndarray of the sorted unique labels seen in ``y``.
    means   : list with one per-feature mean vector per class.
    var     : list with one per-feature (smoothed) variance vector per class.
    priorP  : list with one prior probability per class.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` does not hold
            exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError("X must be a non-empty 2-D array (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")

        # Variance floor proportional to the data scale; without it a feature
        # that is constant within a class produces a division by zero.
        epsilon = self.var_smoothing * np.var(X, axis=0).max()

        self.classes = np.unique(y)
        self.means, self.var, self.priorP = [], [], []
        for label in self.classes:
            classData = X[y == label]
            # For each class, the mean/variance of every feature describes
            # that feature's fitted normal distribution.
            self.means.append(np.mean(classData, axis=0))
            self.var.append(np.var(classData, axis=0) + epsilon)
            # Prior p(Yi): share of the training samples belonging to the class.
            self.priorP.append(len(classData) / len(y))

    def predict(self, X):
        """Return the most likely class label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list
            One entry per row of ``X``, each taken from ``self.classes``.
            Ties go to the class that sorts first.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        ValueError
            If ``X`` is not 2-D or its feature count differs from the
            training data.
        """
        if not hasattr(self, "classes"):
            raise AttributeError("GaussianNaiveBayes is not fitted; call fit() first")

        X = np.asarray(X, dtype=float)
        if X.ndim >= 1 and X.shape[0] == 0:
            return []
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array (n_samples, n_features)")
        if X.shape[1] != len(self.means[0]):
            raise ValueError(
                "X has %d features, but the model was fitted with %d"
                % (X.shape[1], len(self.means[0]))
            )

        scores = self._joint_log_likelihood(X)
        # Map the winning column back to the actual label; the column index
        # only equals the label when the labels happen to be 0..k-1.
        return list(self.classes[np.argmax(scores, axis=1)])

    def _joint_log_likelihood(self, X):
        """Unnormalised log posterior of each class for each row of ``X``.

        Returns an array of shape (n_samples, n_classes). Only this
        instance's fitted parameters are used.
        """
        scores = np.empty((X.shape[0], len(self.classes)))
        for k in range(len(self.classes)):
            mean_k = self.means[k]
            var_k = self.var[k]
            # log N(x | mean, var) = -0.5*log(2*pi*var) - (x - mean)^2 / (2*var),
            # summed over the (assumed independent) features.
            log_norm = -0.5 * np.sum(np.log(2.0 * np.pi * var_k))
            log_expo = -0.5 * np.sum((X - mean_k) ** 2 / var_k, axis=1)
            scores[:, k] = np.log(self.priorP[k]) + log_norm + log_expo
        return scores


In [175]:
# Synthetic two-class problem: 1000 samples with 10 features each.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=123,
)
# Hold out 20% of the samples for evaluation; the split is seeded so it is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [176]:
# Fit the classifier on the training split, then label the held-out rows.
model = GaussianNaiveBayes()
model.fit(X=X_train, y=Y_train)
predictions = model.predict(X=X_test)

In [177]:
# Share of held-out samples whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=predictions)

0.965