## Continuous Naive Bayes Classifier

Run the following cell to make sure you can import all of the libraries. Then move onto Part 1. 

In [1]:
import pandas as pd
from collections import *
import numpy as np
import sklearn
from sklearn import datasets
from collections import Counter as ctr
from sklearn.model_selection import train_test_split
from math import *
%matplotlib inline

#### First, load the iris data

In [2]:
iris = sklearn.datasets.load_iris()
X = iris.data
y = iris.target

X[:5], y[:5]

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]), array([0, 0, 0, 0, 0]))

In [3]:
list(iris.target_names) # classes 0, 1, 2 

['setosa', 'versicolor', 'virginica']

### Split the data

In [4]:
# put the iris data into a dataframe
cols = ['f1', 'f2', 'f3', 'f4'] # we have four features to deal with
data = pd.DataFrame(X,columns=cols)
data['y'] = y # add the prediction label as a column
# split into train/test data
train, test = train_test_split(data, test_size=0.5)

### Training

In [5]:
#estimate gaussian parameters for P(F|C)
params_fc = {}
for y_val in set(train.y):
    sub_frame = train[train.y == y_val]
    for f in cols:
        params_fc['{}-{}'.format(y_val, f)] = (sub_frame[f].mean(), sub_frame[f].std())

# estimate gaussian parameters for P(F)
params_f = {}
for f in cols:
    params_f[f] = (train[f].mean(), train[f].std())

In [6]:
# show the mean/std values for all the combinations
params_fc, params_f

({'0-f1': (4.988461538461538, 0.35813620099277654),
  '0-f2': (3.4384615384615387, 0.4079969833975964),
  '0-f3': (1.473076923076923, 0.16384795954223488),
  '0-f4': (0.27307692307692316, 0.11851647078003062),
  '1-f1': (5.826923076923078, 0.4574343164282211),
  '1-f2': (2.726923076923077, 0.316932412110459),
  '1-f3': (4.2615384615384615, 0.4499572629278679),
  '1-f4': (1.3230769230769233, 0.22504700363735983),
  '2-f1': (6.591304347826085, 0.7134506694281741),
  '2-f2': (2.986956521739131, 0.35330176108724926),
  '2-f3': (5.534782608695653, 0.5629538134229405),
  '2-f4': (2.0652173913043477, 0.24235341521342882)},
 {'f1': (5.770666666666668, 0.8311134396395896),
  'f2': (3.053333333333334, 0.46711689992784267),
  'f3': (3.685333333333333, 1.7524263745442201),
  'f4': (1.1866666666666665, 0.7609264149765134)})

In [7]:


def gaussian(x, mu, sig):
    return 1./(sqrt(2.*pi)*sig)*np.exp(-np.power((x - mu)/sig, 2.)/2)

#P(F|C)
def Pfc(feat='f1', F='',C=''):
    mu,sigma = params_fc['{}-{}'.format(C,feat)]
    return gaussian(F,mu,sigma)

#P(F)
def Pf(feat='f1', F=''):
    mu,sigma=params_f[feat]
    return gaussian(F,mu,sigma)

#P(C) -- there are 50 of each type in the data, so each type is 50/150 -> 1/3
def Pc(C=''):
    return 1.0 / 3.0

#P(C|F) = P(F|C) * P(C) / P(F)
def Pcf(feat='f1', C='', F=''):
    return Pfc(feat,F,C) * Pc(C) / Pf(feat,F)

### Testing

In [8]:
# call P(C|F) on all four individual features, multiply result together

# note:  a \ (i.e., a slash) at the end of a line in python means to continue the current line of code
for y_val in set(test.y):
    test[str(y_val)] = test.f1.map(lambda x: Pcf(feat='f1', C=y_val, F=x)) *\
    test.f2.map(lambda x: Pcf(feat='f2', C=y_val, F=x)) *\
    test.f3.map(lambda x: Pcf(feat='f3', C=y_val, F=x)) *\
    test.f4.map(lambda x: Pcf(feat='f4', C=y_val, F=x))
    
test['guess'] = test[['0','1','2']].idxmax(axis=1) # take the argmax class label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Testing accuracy

In [9]:
len(test[test.y.map(str) == test.guess]) / len(test) # see if the guess matches the actual class label

0.96

### Check against the scikit classifier

In [13]:
Xtrain = pd.DataFrame.as_matrix(train[cols])
ytrain = train.y
Xtest  = pd.DataFrame.as_matrix(test[cols])
ytest  = test.y

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
#import and instantiate classifier here
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

In [15]:
classifier.fit(Xtrain, ytrain)
preds = classifier.predict(Xtest)
sklearn.metrics.accuracy_score(ytest, preds)

0.96