In [2]:
import numpy as np 
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the iris dataset: X holds the feature matrix, y the class labels.
iris = load_iris()
X, y = iris.data, iris.target
# Hold out 20% of the rows as a test set; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=1810)
# Display the shapes of the four resulting arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

In [68]:
# Reference implementation: fit scikit-learn's GaussianNB on the training split
# and predict the held-out rows, to compare against the from-scratch version.
nb = GaussianNB()
nb.fit(X_train, y_train)
sklearn_preds = nb.predict(X_test)

# Report accuracy on the held-out labels and the raw predicted labels.
print(f"sklearn accuracy:{accuracy_score(y_test, sklearn_preds)}")
print(f"predictions: {sklearn_preds}")

# Last expression of the cell: displays the training matrix shape.
X_train.shape

sklearn accuracy:1.0
predictions: [0 0 2 2 0 1 0 0 1 1 2 1 2 0 1 2 0 0 0 2 1 2 0 0 0 0 1 1 0 2]


(120, 4)

In [4]:
def get_params(X_train, y_train):
    """
    Return the basic dimensions of a labelled dataset.

    Parameters
    ----------
    X_train : array with a two-element ``.shape`` (rows, columns)
    y_train : array-like of class labels

    Returns
    -------
    tuple
        ``(num_examples, num_features, num_classes)`` where the first two come
        from ``X_train.shape`` and the last is the number of distinct values
        in ``y_train``.
    """
    row_count, column_count = X_train.shape
    distinct_labels = np.unique(y_train)
    return row_count, column_count, len(distinct_labels)

In [5]:
# Smoke-test get_params on the training split and print the three counts
# (examples, features, classes).
num_examples, num_features, num_classes = get_params(X_train, y_train)
print(num_examples, num_features, num_classes)

120 4 3


In [6]:
def get_stats_by_class(X_train, y_train, num_examples=None, num_classes=None):
    """
    Compute per-class feature means, feature variances and class priors.

    Parameters
    ----------
    X_train : np.ndarray of shape (n_samples, n_features)
        Training feature matrix.
    y_train : np.ndarray of shape (n_samples,)
        Class labels. Classes are looked up as the integers
        ``0 .. num_classes - 1``.
    num_examples : int, optional
        Denominator used for the class priors. Defaults to
        ``X_train.shape[0]``.
    num_classes : int, optional
        Number of classes to compute statistics for. Defaults to the number
        of distinct values in ``y_train``.

    Returns
    -------
    class_mean, class_var, class_prior : dict
        Three dictionaries keyed by ``str(cls)``. The first two hold one
        array of length n_features per class (population variance, i.e.
        ``np.var`` with its default ``ddof=0``); the last holds the fraction
        of examples belonging to each class.
    """
    # Derive the sizes from the data actually passed in. Default values are
    # evaluated once at definition time, so taking them from notebook globals
    # would silently reuse stale counts for any other dataset.
    if num_examples is None:
        num_examples = X_train.shape[0]
    if num_classes is None:
        num_classes = len(np.unique(y_train))

    # dictionaries to store stats
    class_mean = {}
    class_var = {}
    class_prior = {}

    # loop through each class and get mean, variance and prior by class
    for cls in range(num_classes):
        key = str(cls)
        X_cls = X_train[y_train == cls]
        class_mean[key] = np.mean(X_cls, axis=0)
        class_var[key] = np.var(X_cls, axis=0)
        class_prior[key] = X_cls.shape[0] / num_examples
    return class_mean, class_var, class_prior

In [7]:
# Compute the per-class means, variances and priors on the training split,
# then print all three dictionaries.
cm, var, cp = get_stats_by_class(X_train, y_train)
print(f"mean: {cm}\n\nvariance: {var}\n\npriors: {cp}")

mean: {'0': array([5.06111111, 3.48611111, 1.44722222, 0.25833333]), '1': array([5.90952381, 2.80714286, 4.25238095, 1.33809524]), '2': array([6.61904762, 2.97857143, 5.58571429, 2.02142857])}

variance: {'0': array([0.12570988, 0.15564043, 0.0286034 , 0.01243056]), '1': array([0.26324263, 0.08542517, 0.24582766, 0.04045351]), '2': array([0.43678005, 0.10930272, 0.31884354, 0.0802551 ])}

priors: {'0': 0.3, '1': 0.35, '2': 0.35}


In [42]:
def gaussian_density_function(X, mean, std, num_examples=None, num_features=None, eps=1e-6):
    """
    Log-density of every row of ``X`` under a diagonal-covariance Gaussian.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Rows to score.
    mean : np.ndarray of shape (n_features,)
        Per-feature mean.
    std : np.ndarray of shape (n_features,)
        Per-feature *variance*. The name is kept for backward compatibility,
        but the formula below divides by this value and takes its log
        directly, i.e. it is used as sigma squared.
    num_examples, num_features : optional
        Accepted for backward compatibility and ignored; the feature count is
        always taken from ``X`` itself.
    eps : float
        Added to every variance before dividing / taking the log, to avoid
        division by zero for constant features.

    Returns
    -------
    np.ndarray of shape (n_samples,)
        ``-d/2 * log(2*pi) - 1/2 * sum(log(var)) - 1/2 * sum((x - mean)^2 / var)``
        for each row, with ``d = X.shape[1]`` and ``var = std + eps``.
    """
    # The dimensionality belongs to the data being scored; reading it from a
    # global training matrix gives a wrong constant for any other input.
    feature_count = X.shape[1]
    smoothed_var = std + eps

    log_normaliser = -feature_count / 2 * np.log(2 * np.pi) - 0.5 * np.sum(np.log(smoothed_var))
    half_sq_distance = 0.5 * np.sum(np.power(X - mean, 2) / smoothed_var, axis=1)
    return log_normaliser - half_sq_distance

In [50]:
# Inspect the first ten training rows and the class-0 mean / variance vectors.
print(X_train[:10])
print(0,"->",cm[str(0)])
print(0,"->",var[str(0)])

# Score every training row against the class-0 Gaussian and print how many
# values came back.
a = gaussian_density_function(X_train, cm[str(0)], var[str(0)])
print(len(a))

[[6.8 3.  5.5 2.1]
 [5.5 2.5 4.  1.3]
 [6.1 2.8 4.  1.3]
 [5.4 3.4 1.5 0.4]
 [6.1 2.9 4.7 1.4]
 [6.7 3.1 5.6 2.4]
 [5.  2.3 3.3 1. ]
 [5.6 2.9 3.6 1.3]
 [5.  3.5 1.6 0.6]
 [7.7 3.  6.1 2.3]]
0 -> [5.06111111 3.48611111 1.44722222 0.25833333]
0 -> [0.12570988 0.15564043 0.0286034  0.01243056]
120
120


In [21]:
def class_probabilities(X, class_mean, class_var, class_prior, num_classes=None):
    """
    Compute the unnormalised log-posterior score of every class for every row.

    Despite the name, the returned values are not probabilities: each entry is
    the Gaussian log-density of the row under the class plus the log of the
    class prior. They are only meaningful relative to one another (e.g. via
    ``argmax`` along axis 1).

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Rows to score.
    class_mean, class_var, class_prior : dict
        Per-class statistics keyed by ``str(cls)`` for
        ``cls`` in ``0 .. num_classes - 1``.
    num_classes : int, optional
        Number of classes to score. Defaults to ``len(class_prior)``.

    Returns
    -------
    np.ndarray of shape (n_samples, num_classes)
        Column ``cls`` holds the score of class ``cls``.
    """
    # Take the class count from the statistics actually supplied rather than
    # from a notebook global frozen at definition time.
    if num_classes is None:
        num_classes = len(class_prior)

    num_examples = X.shape[0]
    probs = np.zeros((num_examples, num_classes))

    for cls in range(num_classes):
        key = str(cls)
        log_likelihood = gaussian_density_function(X, class_mean[key], class_var[key])
        probs[:, cls] = log_likelihood + np.log(class_prior[key])
    return probs

In [61]:
# Tiny hand-built score matrix to check how np.argmax behaves along axis 1:
# column 0 stays at zero, column 1 is filled explicitly.
probs = np.zeros(shape=(3, 2))
probs[:, 1] = [-1, -1, 1]
print(probs)
# Index of the larger entry in each row.
np.argmax(probs, axis=1)

[[ 0. -1.]
 [ 0. -1.]
 [ 0.  1.]]


array([0, 0, 1])

In [23]:
# Per-class scores for every training row (one column per class), displayed
# as the cell output.
probs = class_probabilities(X_train, cm, var, cp)
probs

array([[-4.35250322e+02, -1.25872865e+01, -1.46137594e+00],
       [-1.60384027e+02, -1.54102752e+00, -1.10386267e+01],
       [-1.62299027e+02, -7.39539583e-01, -9.01170028e+00],
       [-2.78379079e-01, -2.93581455e+01, -4.64418418e+01],
       [-2.41707833e+02, -1.09710712e+00, -5.34454423e+00],
       [-4.96032994e+02, -1.98425985e+01, -2.33994991e+00],
       [-8.56047225e+01, -6.85707012e+00, -2.11716263e+01],
       [-1.25851686e+02, -1.63885808e+00, -1.20146628e+01],
       [-4.06050945e+00, -2.59461101e+01, -4.31155288e+01],
       [-5.73461897e+02, -2.52086388e+01, -3.60971734e+00],
       [-3.89249991e+02, -7.28466926e+00, -1.81117068e+00],
       [-2.25767481e+02, -1.37119894e+00, -6.10506065e+00],
       [ 9.42674063e-01, -3.44447450e+01, -5.11931770e+01],
       [-6.60886691e+02, -3.37482321e+01, -7.94169253e+00],
       [-1.43924072e+02, -1.10107170e+00, -1.11545686e+01],
       [-4.39526394e+02, -9.77034435e+00, -2.80446271e+00],
       [-2.28463007e+02, -1.03754819e+00

In [24]:
def predict(X_test, X_train, y_train):
    """
    Fit a Gaussian naive Bayes model on the training data and label ``X_test``.

    Parameters
    ----------
    X_test : np.ndarray of shape (n_test, n_features)
        Rows to classify.
    X_train : np.ndarray of shape (n_train, n_features)
        Training feature matrix.
    y_train : np.ndarray of shape (n_train,)
        Training labels, looked up as the integers ``0 .. num_classes - 1``.

    Returns
    -------
    np.ndarray of shape (n_test,)
        For each test row, the index of the class with the highest score.
    """
    # Sizes must describe the training set: the priors are class counts
    # divided by the number of training rows.
    num_examples, _, num_classes = get_params(X_train, y_train)

    # Pass the sizes explicitly so the result depends only on this call's
    # arguments and not on defaults captured from notebook globals.
    class_mean, class_var, class_prior = get_stats_by_class(
        X_train, y_train, num_examples=num_examples, num_classes=num_classes
    )
    scores = class_probabilities(
        X_test, class_mean, class_var, class_prior, num_classes=num_classes
    )
    return np.argmax(scores, axis=1)

In [25]:
# Label the held-out rows with the from-scratch classifier fitted on the training split.
my_preds = predict(X_test, X_train, y_train)

In [29]:
# Accuracy of the from-scratch predictions against the held-out labels,
# followed by the raw predicted labels.
print(f"my predictions accuracy:{accuracy_score(y_test, my_preds)}")
print(f"predictions: {my_preds}")

my predictions accuracy:1.0
predictions: [0 0 2 2 0 1 0 0 1 1 2 1 2 0 1 2 0 0 0 2 1 2 0 0 0 0 1 1 0 2]


In [28]:
# Element-wise comparison of the scikit-learn predictions with the from-scratch ones.
sklearn_preds == my_preds

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])