##### More here: https://en.wikipedia.org/wiki/Naive_Bayes_classifier

Naive Bayes classifiers are probabilistic classifiers, so they can give the probability of a sample belonging to each class (this reminds me of soft voting). They assume strong independence between the features. Because they are trained on labeled datasets, Naive Bayes classifiers are usually classified under supervised learning.

Bayes' theorem: P(A|B) = P(A) * P(B|A) / P(B), where A is the class and B is the observed feature values.

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as graph
import seaborn as sns
from sklearn import datasets

In [2]:
# Load the iris measurements (features) and their class labels (targets).
iris = datasets.load_iris()
x, y = iris.data, iris.target

# Tabular view of the same data: named feature columns plus the label column.
iris_df = pd.DataFrame(
    x,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
iris_df['target'] = y

# Append the labels as a trailing column so that every row reads
# "feature values..., class value".
x_y = np.hstack((x, y[:, np.newaxis]))

# Quick look at the combined array, the shapes and the DataFrame.
display(x_y[:5])
print(x.shape, y.shape)
display(iris_df.head())

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ]])

(150, 4) (150,)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## The steps are:
1. Separate by class
2. Summarize the dataset
3. Summarize the data by class
4. Implement Gaussian Probability Density Function
5. Calculate Class Probabilities

In [3]:
# Split the dataset by class. The rows are stored in a dictionary keyed by class value, since dictionary lookups take constant time on average.

def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class value.

    The class value of a row is its last element (``row[-1]``).

    Args:
        dataset: Iterable of rows. Any iterable works (list, 2-D array,
            generator, ...); it is traversed exactly once.

    Returns:
        dict mapping each class value to the list of complete rows (class
        column included) that carry it, in their original order.
    """
    separated = {}
    for sample in dataset:
        # setdefault creates the bucket the first time a class value is seen.
        separated.setdefault(sample[-1], []).append(sample)
    return separated

To calculate the distribution of the features, we need to know the mean and standard deviation of each feature. Let's write a simple function that returns these (together with the number of samples) for every feature in the dataset.

In [4]:
def summarize_dataset(dataset):
    """Return ``(mean, standard deviation, count)`` for every feature column.

    The last column of each row is treated as the class value and is not
    summarized.

    Args:
        dataset: Iterable of equally long rows whose last element is the
            class value.

    Returns:
        list with one ``(mean, std, count)`` tuple per feature column, in
        column order. ``std`` is ``np.std`` with its default settings. An
        empty dataset yields an empty list.
    """
    columns = list(zip(*dataset))
    # Drop the class column *before* computing statistics: it is never part
    # of the result, and class values are not necessarily numeric.
    return [
        (np.mean(column), np.std(column), len(column))
        for column in columns[:-1]
    ]

In [5]:
# Per-feature (mean, standard deviation, count) over the whole dataset, all classes pooled.
summarize_dataset(x_y)

[(5.843333333333334, 0.8253012917851409, 150),
 (3.0573333333333337, 0.4344109677354946, 150),
 (3.7580000000000005, 1.759404065775303, 150),
 (1.1993333333333336, 0.7596926279021594, 150)]

##### Time to summarize the data by class now.
We can use the 2 functions we have above to summarize the dataset by class

In [6]:
def summarize_by_class(dataset):
    """Summarize every feature column separately for each class.

    The rows are first grouped with ``separate_by_class``; each group is then
    passed to ``summarize_dataset``.

    Returns:
        dict mapping each class value to the list of summaries that
        ``summarize_dataset`` produced for the rows of that class.
    """
    return {
        label: summarize_dataset(rows)
        for label, rows in separate_by_class(dataset).items()
    }

In [7]:
# Same per-feature summaries as above, but computed separately for each class value.
summarize_by_class(x_y)

{0.0: [(5.006, 0.3489469873777391, 50),
  (3.428, 0.37525458025186054, 50),
  (1.4620000000000002, 0.17191858538273283, 50),
  (0.24599999999999997, 0.1043264108459598, 50)],
 1.0: [(5.936, 0.5109833656783751, 50),
  (2.7700000000000005, 0.31064449134018135, 50),
  (4.26, 0.4651881339845203, 50),
  (1.3259999999999998, 0.19576516544063705, 50)],
 2.0: [(6.587999999999998, 0.6294886813914926, 50),
  (2.974, 0.3192553836664309, 50),
  (5.5520000000000005, 0.546347874526844, 50),
  (2.0260000000000002, 0.2718896835115301, 50)]}

#### Calculating the gaussian probability density function

We assume the values of each feature are drawn from a Gaussian (normal) distribution. A Gaussian distribution is fully described by its mean and standard deviation, so those two numbers are all we need to evaluate its density.

In [8]:
def calculate_probability(x, mean, sd):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        sd: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)**2 / (2 * sd**2)) / (sqrt(2 * pi) * sd)``.
    """
    squared_deviation = (x - mean) ** 2
    twice_variance = 2 * sd ** 2
    normaliser = np.sqrt(2 * np.pi) * sd
    return (1 / normaliser) * np.exp(-(squared_deviation / twice_variance))

# Alternatively, scipy.stats provides the same density: scipy.stats.norm(mean, sd).pdf(x)

In [9]:
# Sanity check: the hand-written density and scipy's should print the same value,
# first at the mean (x = 1) and then half a standard deviation below it (x = 0.5).
print(calculate_probability(1.0, 1.0, 1.0), scipy.stats.norm(1, 1).pdf(1))
print(calculate_probability(0.5, 1.0, 1.0), scipy.stats.norm(1, 1).pdf(0.5))

0.3989422804014327 0.3989422804014327
0.3520653267642995 0.3520653267642995


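#### We now have to calculate the probabilities for our new data

Note: the value computed for each class is the class prior multiplied by the Gaussian density of every feature. These are unnormalized scores rather than true probabilities, so they can be larger than 1; dividing each score by the sum of all scores gives the posterior probability of each class.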

In [10]:
def calculate_class_probabilities(summaries, sample):
    """Score ``sample`` against every class in ``summaries``.

    The score of a class is its prior (class count divided by the total
    count over all classes) multiplied by the Gaussian density of each
    feature value under that class's ``(mean, sd)``. The scores are not
    normalized, so they need not sum to 1.

    Args:
        summaries: dict mapping class value to a list of
            ``(mean, sd, count)`` tuples, one per feature.
        sample: Indexable of feature values. Only the first
            ``len(summaries[class])`` entries are read, so any trailing
            entries are ignored.

    Returns:
        dict mapping each class value to its score.
    """
    # The class size is read from the count stored with the first feature.
    class_sizes = {label: features[0][2] for label, features in summaries.items()}
    total = float(sum(class_sizes.values()))

    scores = {}
    for label, features in summaries.items():
        score = class_sizes[label] / total
        for position, (mean, sd, _count) in enumerate(features):
            score *= scipy.stats.norm.pdf(sample[position], mean, sd)
        scores[label] = score
    return scores

In [11]:
# Build the per-class summaries from the full dataset, then score its first row.
# The row still ends with its class value; only the feature entries are read.
summaries = summarize_by_class(x_y)
probabilities = calculate_class_probabilities(summaries, x_y[0])
print(probabilities)

{0.0: 2.8940535282803888, 1.0: 3.929662157976781e-18, 2.0: 2.0584895837141216e-25}


In [12]:
# Score every row of the dataset against the per-class summaries and print
# the resulting {class value: score} dictionary for each row.
summaries = summarize_by_class(x_y)

for sample in x_y:
    probabilities = calculate_class_probabilities(summaries, sample)
    print(probabilities)

{0.0: 2.8940535282803888, 1.0: 3.929662157976781e-18, 2.0: 2.0584895837141216e-25}
{0.0: 1.5231377916485482, 1.0: 2.3072568328078667e-17, 2.0: 3.576627955875813e-25}
{0.0: 1.184508092852029, 1.0: 1.2710244573642226e-18, 2.0: 2.7720622403656293e-26}
{0.0: 1.1041552333230067, 1.0: 1.6189045358882106e-17, 2.0: 3.262693827382769e-25}
{0.0: 2.7515261392066517, 1.0: 1.2472422469623888e-18, 2.0: 7.93511394413045e-26}
{0.0: 0.11117539898941703, 1.0: 1.6575594843972181e-15, 2.0: 1.9539289086816624e-22}
{0.0: 1.4932377229246712, 1.0: 1.646481510118458e-17, 2.0: 4.048829427906544e-25}
{0.0: 3.1740796533694655, 1.0: 2.074716345651472e-17, 2.0: 8.802862797021564e-25}
{0.0: 0.251447328175349, 1.0: 2.369200679957199e-18, 2.0: 3.0285069480825554e-26}
{0.0: 0.8587994752623885, 1.0: 2.9400805892792776e-18, 2.0: 1.0370046240369086e-25}
{0.0: 1.2940873410115055, 1.0: 5.669263067033024e-18, 2.0: 6.558818929262825e-25}
{0.0: 1.9801459304682145, 1.0: 3.28242163365341e-17, 2.0: 1.435106261061941e-24}
{0.0: 0.

{0.0: 1.4354888367042866e-177, 1.0: 0.0003494865887312729, 2.0: 0.17486485261407883}
{0.0: 7.24632944337333e-219, 1.0: 4.730674969801673e-08, 2.0: 0.20519684428234825}
{0.0: 5.5229544197776785e-275, 1.0: 2.856062733023935e-12, 2.0: 0.011852023898674614}
{0.0: 1.6608577577619939e-111, 1.0: 0.007235803888178921, 2.0: 0.00019685895610150862}
{0.0: 3.9430643670228924e-230, 1.0: 5.397202721731364e-08, 2.0: 0.040272139104189086}
{0.0: 4.708179546801068e-193, 1.0: 2.9119773798451113e-05, 2.0: 0.059049143822320745}
{0.0: 3.3338767108557266e-268, 1.0: 4.795672115200506e-15, 2.0: 0.003411634125175453}
{0.0: 7.547032831940105e-163, 1.0: 6.366005398707727e-05, 2.0: 0.1541202469271469}
{0.0: 2.9499773199567354e-167, 1.0: 0.0004209513012681796, 2.0: 0.15114099382116597}
{0.0: 1.4450056196149432e-194, 1.0: 1.2282749643609652e-06, 2.0: 0.25551062788013984}
{0.0: 1.056078258365643e-155, 1.0: 0.0002708092275110585, 2.0: 0.020753606946307856}
{0.0: 1.0435086644584316e-191, 1.0: 3.2014121884122866e-08, 2.