## Gaussian Naive Bayes Classification

In [22]:
import pandas as pd

In [23]:
import numpy as np
from sklearn import metrics
from scipy.stats import norm

In [24]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
iris = pd.read_csv(url, names=names)
iris

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


### Splitting test/ training set 

In [25]:
test_df = pd.concat([iris.loc[0:10], iris.loc[50:60], iris.loc[100:110]], axis=0)
test_df

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [26]:
iris.drop(iris.index[list(range(0,10))], inplace=True) # as they shift by 10 every time. Hacky but works
iris.drop(iris.index[list(range(40,50))], inplace=True)
iris.drop(iris.index[list(range(80,90))], inplace=True)
iris.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
10,5.4,3.7,1.5,0.2,Iris-setosa
11,4.8,3.4,1.6,0.2,Iris-setosa
12,4.8,3.0,1.4,0.1,Iris-setosa
13,4.3,3.0,1.1,0.1,Iris-setosa
14,5.8,4.0,1.2,0.2,Iris-setosa


In [27]:
flower_mean_df = iris.groupby('class').mean()
flower_std_df = iris.groupby('class').std()
flower_std_df

Unnamed: 0_level_0,sepal-length,sepal-width,petal-length,petal-width
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,0.360119,0.396103,0.187271,0.113228
Iris-versicolor,0.451749,0.306301,0.467611,0.204046
Iris-virginica,0.598883,0.32256,0.533247,0.274084


In [28]:
class_counts = iris['class'].value_counts()
class_counts

Iris-setosa        40
Iris-versicolor    40
Iris-virginica     40
Name: class, dtype: int64

In [29]:
label_prob = {label: float(class_counts[label])/iris.shape[0] for label in iris['class'].unique()}  # Prior likelihood of the label
label_prob   # P(yi)

{'Iris-setosa': 0.3333333333333333,
 'Iris-versicolor': 0.3333333333333333,
 'Iris-virginica': 0.3333333333333333}

In [30]:
def get_prob_for_class(value, label, feature):  # P(xi/yi) if xi is the feature and yi is the label
    return norm.pdf(value, loc = flower_mean_df.loc[label, feature], scale = flower_std_df.loc[label, feature]) 

def get_prob(test_row, label):  # P(X, yi) * P(yi) if yi is the label. This is considering that 
    prob = 1 * label_prob[label]
    for key, value in test_row.items():
        prob*=get_prob_for_class(value, label, key)
    return prob

In [31]:
get_prob_for_class(4.5, "Iris-setosa", "sepal-length" ) * label_prob['Iris-setosa']

0.11872806377027556

In [32]:
features = ['sepal-length', 'sepal-width','petal-length', 'petal-width']
labels = iris['class'].unique()
test_data = test_df[features]
test_df['predicted'] = np.nan

for i in range(len(labels)):
    test_df[labels[i]] = np.nan

for index, row in test_data.iterrows():     # Can be vectorized but too lazy
    probs_for_labels = [get_prob(row, label) for label in labels]
    test_df.loc[index, 'predicted'] = labels[probs_for_labels.index(max(probs_for_labels))]
    for i in range(len(labels)):
        test_df.loc[index, labels[i]] = probs_for_labels[i]/sum(probs_for_labels)
test_df.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class,predicted,Iris-setosa,Iris-versicolor,Iris-virginica
0,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa,1.0,1.0614e-17,6.026582e-26
1,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa,1.0,1.294547e-16,1.782539e-25
2,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa,1.0,7.274894e-18,1.593403e-26
3,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa,1.0,9.024735000000001e-17,2.160627e-25
4,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa,1.0,3.162089e-18,2.415265e-26


In [33]:
metrics.accuracy_score(test_df['class'], test_df['predicted'])  # accuracy wow

0.9393939393939394

In [34]:
norm.pdf(100, 100, 0.1)

3.989422804014327

In [35]:
norm.pdf(101, 100, 0.1)

7.694598626706419e-22

In [36]:
test_df.mean(axis = 0)

sepal-length       5.824242
sepal-width        3.033333
petal-length       3.818182
petal-width        1.200000
Iris-setosa        0.333333
Iris-versicolor    0.307435
Iris-virginica     0.359232
dtype: float64