# Gaussian Naive Bayes Classifier

Naive bayes classifier uses baye's theorem to classify samples with strong (naïve) independence assumptions between the features. Gaussian naive bayes is an extension of naive bayes classifier, which deals with points modelled with guassian distribution.

The points of the test set are classified using the naive Bayes classifier, where for a given x, p(x|ωi) is estimated as

>$
p(x \mid \omega_i)=\Pi_{j=1}^f \frac{1}{\sqrt{2\pi\sigma^2_{ij}}}\,e^{ -\frac{(x(j)-m_{ij})^2}{2\sigma^2_{ij}} }, i=1,2,...c
$

where x( j) is the jth component of x, f is the no. of features, and c is the no. of classes.


In [1]:
"""
Gaussian naive bayes classifier

@Author: Ajay Biswas
National Institute of Technology, Rourkela
"""

import numpy as np
from sklearn.naive_bayes import GaussianNB
from collections import Counter  
import math

def misclassifications(X,Y):
# counts no. of wrongly predicted labels

  correct_count = 0
  for i in range(len(X)):
    if(X[i] == Y[i]):
      correct_count = correct_count + 1
  return len(X) - correct_count

def MER_Error(X,Y):
# returns Misclassification error rate
# X is true class labels, Y is predicted class labels

  correct_count = 0
  for i in range(len(X)):
    if(X[i] == Y[i]):
      correct_count = correct_count + 1
  MER_val = 1 - (correct_count/len(X))
  return MER_val

def estimator(X,V,Y): 
# X and V are means and variances of classes
# Y is sample point
# Naive Bayes Classification
# returns posterior probability 

   means = X
   variances = V
   no_features = len(means)
   p = 1
   for i in range(no_features):
       exponent =  math.exp(-((Y[i] - means[i]) ** 2 / (2 * variances[i])))
       fraction = (1 / (math.sqrt(2 * math.pi * variances[i])))
       p = p * exponent*fraction     
   return p
       

def naive_bayes_train(X,y):
    # X contains n dimensional features
    # y contains true label
    
    no_of_samples,no_of_features = X.shape
        
    unique_classes = set(y)
    sc = dict(Counter(y))
    frequency_per_class = [sc[i] for i in unique_classes]
    
    # group samples classwise and find their centroid
    start = 0
    means = []
    variances = []
    for value in frequency_per_class:
        tempList = X[start:start+value,:]
        each_mean = tempList.mean(0)
        each_variance = tempList.var(0)
        means.append(each_mean)
        variances.append(each_variance)
        start = value
        
    return np.array(means),np.array(variances)

def naive_bayes_test(X,means,variances):
    # X contains n dimensional features
    # means and variances of each class taken from the training phase
    
    num_rows, num_cols = means.shape
    X_rows, X_cols = X.shape
    y_len = X_rows
    
    # no. of features must match with the no. of dimensions of the model
    if(X_cols != num_cols):
        return None
    
    # predicted labels
    predicted = [0]*y_len
   
    # test each point against each class and assign label based on max probability
    prob = 0
    max_prob = 0
    i = 0
    j = 0
    for test_point in X:
        j = 0
        for each_mean,each_variance in means,variances:
            prob = estimator(each_mean,each_variance,test_point)
            
            if(max_prob < prob):
                max_prob = prob
                label = j
            j+=1
          
        predicted[i] = label    
        max_prob = 0
        i+=1
        
    return np.array(predicted) 


def main():
    
    # Covariance Matrix of two classes
    S1 =np.array([[0.8, 0.2, 0.1, 0.05, 0.01],
        [0.2, 0.7, 0.1, 0.03, 0.02],
        [0.1, 0.1, 0.8, 0.02, 0.01],
        [0.05, 0.03, 0.02, 0.9, 0.01],
        [0.01, 0.02, 0.01, 0.01, 0.8]])
    
    S2 =np.array([[0.9, 0.1, 0.05, 0.02, 0.01],
        [0.1, 0.8, 0.1, 0.02, 0.02],
        [0.05, 0.1, 0.7, 0.02, 0.01],
        [0.02, 0.02, 0.02, 0.6, 0.02],
        [0.01, 0.02, 0.01, 0.02, 0.7]])
    
    # Mean of two classes
    mean = [0,0,0,0,0]
    mean2 = [1,1,1,1,1]
    
    # training
    
    # generating 25 points for each class for training 
    X_h1 = np.random.multivariate_normal(mean, S1, 25)
    X_h2 = np.random.multivariate_normal(mean2, S2, 25)
    X = np.concatenate((X_h1,X_h2))
    y = np.concatenate(([0]*25,[1]*25))    
    means,variances = naive_bayes_train(X,y)

    print('\nEstimated means of the two classes:\n',means)
    print('\nEstimated variances of the two classes:\n',variances)
    
    # testing
    
    # generating 5000 points for each class for training 
    X2_h1 = np.random.multivariate_normal(mean, S1, 5000)
    X2_h2 = np.random.multivariate_normal(mean2, S2, 5000)
    X2 = np.concatenate((X2_h1,X2_h2)) 
    
    # true class labels
    y2 = np.concatenate(([0]*5000,[1]*5000))    
    
    # predicted labels
    L = naive_bayes_test(X2,means,variances)  
    
    error_rate = MER_Error(y2, L)
    misclassification =  misclassifications(y2, L)

    print('\nNo. of Misclassifications: ',misclassification)
    print('\nError Probability: ',error_rate)

# Main
if __name__=="__main__": 
    main() 
    



Estimated means of the two classes:
 [[-0.09418616 -0.00301008  0.00923237 -0.03349889 -0.05621451]
 [ 0.92600075  0.77551169  0.70840117  1.20085878  0.93884501]]

Estimated variances of the two classes:
 [[0.76493889 0.86141591 0.62984343 1.05181749 0.64435765]
 [0.40955317 0.56027805 0.46434561 0.60808977 0.81745083]]

No. of Misclassifications:  1510

Error Probability:  0.15100000000000002
