# Week 2: Day 3 PM // Naive Bayes

Naive Bayes employ bayesian logic and theorem to infer probability given hypotheses and their evidences.

In [None]:
# Import Libraries

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc,roc_auc_score
import time
import pandas as pd

%matplotlib inline

To solidify our understanding of Bayes Theorem and Naive Bayes, we will manually implement Bayes Theorem by `Pandas` and compare it with `Scikit-Learn` utility.

We will intialize a toy dataset containing **whether a family buy a car given their family structure, age group, and income**.

In [None]:
# Assigning features and label variables

family_struct = ['Nuclear','Extended','Childless','Childless','Single Parent','Childless','Nuclear','Nuclear','Extended','Single Parent']
age_group = ['Young','Old','Middle-aged','Young','Middle-aged','Young','Old','Middle-aged','Middle-aged','Old']
income = ['Low','Low','Low','Medium','Medium','Low','High','Medium','High','Low']
buy_car = ['Yes','No','No','Yes','Yes','No','Yes','Yes','Yes','No']

In [None]:
# Convert to Pandas Dataframe

dict = {'family_struct': family_struct,'age_group':age_group,'income':income,'buy_car':buy_car} 
    
df = pd.DataFrame(dict)
df.columns=list(df.columns[:-1])+['label'] 

In [None]:
# Display Pandas Dataframe

df

Unnamed: 0,family_struct,age_group,income,label
0,Nuclear,Young,Low,Yes
1,Extended,Old,Low,No
2,Childless,Middle-aged,Low,No
3,Childless,Young,Medium,Yes
4,Single Parent,Middle-aged,Medium,Yes
5,Childless,Young,Low,No
6,Nuclear,Old,High,Yes
7,Nuclear,Middle-aged,Medium,Yes
8,Extended,Middle-aged,High,Yes
9,Single Parent,Old,Low,No


---
First, we **count the probability of our class/label**.

In [None]:
# Create Frequency and Probability Each Label

df_grp_lbl=df.groupby('label').count().reset_index()[['label','income']]
df_grp_lbl.columns=list(df_grp_lbl.columns[:-1])+['f_h']
df_grp_lbl['p_h']=df_grp_lbl['f_h']/df_grp_lbl['f_h'].sum()

In [None]:
# Display Frequency and Probability for Each Label

df_grp_lbl

Unnamed: 0,label,f_h,p_h
0,No,4,0.4
1,Yes,6,0.6


Notes : 
* `f_h` : Frequency of a label.
* `p_h` : Probability of a label.

---
Next, **we count the frequency of each events**. We will need this to count the conditional probability.

In [None]:
# Create a dataframe that contains chopped of a features and its label

df_melt = df.melt(id_vars=['label'],var_name='features',value_name='value')
df_melt.columns=['label']+list(df_melt.columns[1:])

In [None]:
# Addition - Display `df_melt`

print('Original Dataframe')
print(df)
print('')
print('Chopped Datafram')
print(df_melt)

Original Dataframe
   family_struct    age_group  income label
0        Nuclear        Young     Low   Yes
1       Extended          Old     Low    No
2      Childless  Middle-aged     Low    No
3      Childless        Young  Medium   Yes
4  Single Parent  Middle-aged  Medium   Yes
5      Childless        Young     Low    No
6        Nuclear          Old    High   Yes
7        Nuclear  Middle-aged  Medium   Yes
8       Extended  Middle-aged    High   Yes
9  Single Parent          Old     Low    No

Chopped Datafram
   label       features          value
0    Yes  family_struct        Nuclear
1     No  family_struct       Extended
2     No  family_struct      Childless
3    Yes  family_struct      Childless
4    Yes  family_struct  Single Parent
5     No  family_struct      Childless
6    Yes  family_struct        Nuclear
7    Yes  family_struct        Nuclear
8    Yes  family_struct       Extended
9     No  family_struct  Single Parent
10   Yes      age_group          Young
11    No   

In [None]:
# Let's group it and get its frequencies

df_grp=df_melt.groupby(['label','features','value']).size().reset_index(name='count')
df_grp.columns=list(df_grp.columns[:-1])+['f_e_given_h']

print(df_grp)

   label       features          value  f_e_given_h
0     No      age_group    Middle-aged            1
1     No      age_group            Old            2
2     No      age_group          Young            1
3     No  family_struct      Childless            2
4     No  family_struct       Extended            1
5     No  family_struct  Single Parent            1
6     No         income            Low            4
7    Yes      age_group    Middle-aged            3
8    Yes      age_group            Old            1
9    Yes      age_group          Young            2
10   Yes  family_struct      Childless            1
11   Yes  family_struct       Extended            1
12   Yes  family_struct        Nuclear            3
13   Yes  family_struct  Single Parent            1
14   Yes         income           High            2
15   Yes         income            Low            1
16   Yes         income         Medium            3


In [None]:
# Now, we can filter the data based on whatever condition we like. Let's display rows that contains value == `Low`
# f_h = frequency of Yes/No (regardless of attributes)
# p_h = probabilty of Yes/No (regardless of attributes) (P(Yes) or P(No))

# f_e_given_h = frequency of Yes/No given a specific attribute
# p_e_given_h = probability of Yes/No given a specific attribute (P(attribute | Yes/No))

df_grp[df_grp['value']=='Low'].head(5)

Unnamed: 0,label,features,value,f_e_given_h
6,No,income,Low,4
15,Yes,income,Low,1


In [None]:
# Merging chopped dataframe with frequency and probability of label

df_a=df_grp.groupby((['features','value'])).count().reset_index()[['features','value']]
df_a['key']=1
df_b = df_grp_lbl
df_b['key']=1
df_feat=pd.merge(df_a, df_b, on ='key').drop("key", 1)

print(df_feat)

         features          value label  f_h  p_h
0       age_group    Middle-aged    No    4  0.4
1       age_group    Middle-aged   Yes    6  0.6
2       age_group            Old    No    4  0.4
3       age_group            Old   Yes    6  0.6
4       age_group          Young    No    4  0.4
5       age_group          Young   Yes    6  0.6
6   family_struct      Childless    No    4  0.4
7   family_struct      Childless   Yes    6  0.6
8   family_struct       Extended    No    4  0.4
9   family_struct       Extended   Yes    6  0.6
10  family_struct        Nuclear    No    4  0.4
11  family_struct        Nuclear   Yes    6  0.6
12  family_struct  Single Parent    No    4  0.4
13  family_struct  Single Parent   Yes    6  0.6
14         income           High    No    4  0.4
15         income           High   Yes    6  0.6
16         income            Low    No    4  0.4
17         income            Low   Yes    6  0.6
18         income         Medium    No    4  0.4
19         income   

---
Then, we built the conditional probability table. In case there is a combination of event and hypotheses that never happened, i.e. Nuclei family size not buying car, we will fill the conditional probability with the prior probability of given class.

In [None]:
# Combine all dataframe into one big dataframe with its frequencies and probabilities

df_prob=df_feat.merge(df_grp,on=['label','features','value'], how='outer')
df_prob['p_e_given_h']=df_prob['f_e_given_h']/df_prob['f_h']
df_prob['p_e_given_h']=df_prob['p_e_given_h'].fillna(0)

print(df_prob)

         features          value label  f_h  p_h  f_e_given_h  p_e_given_h
0       age_group    Middle-aged    No    4  0.4          1.0     0.250000
1       age_group    Middle-aged   Yes    6  0.6          3.0     0.500000
2       age_group            Old    No    4  0.4          2.0     0.500000
3       age_group            Old   Yes    6  0.6          1.0     0.166667
4       age_group          Young    No    4  0.4          1.0     0.250000
5       age_group          Young   Yes    6  0.6          2.0     0.333333
6   family_struct      Childless    No    4  0.4          2.0     0.500000
7   family_struct      Childless   Yes    6  0.6          1.0     0.166667
8   family_struct       Extended    No    4  0.4          1.0     0.250000
9   family_struct       Extended   Yes    6  0.6          1.0     0.166667
10  family_struct        Nuclear    No    4  0.4          NaN     0.000000
11  family_struct        Nuclear   Yes    6  0.6          3.0     0.500000
12  family_struct  Single

In [None]:
## Adition - Legends
'''
f_h = frequency of Yes/No (regardless of attributes)
p_h = probabilty of Yes/No (regardless of attributes) (P(Yes) or P(No))

f_e_given_h = frequency of Yes/No given a specific attribute
p_e_given_h = probability of Yes/No given a specific attribute (P(attribute | Yes/No))

'''

'\nf_h = frequency of Yes/No (regardless of attributes)\np_h = probabilty of Yes/No (regardless of attributes) (P(Yes) or P(No))\n\nf_e_given_h = frequency of Yes/No given a specific attribute\np_e_given_h = probability of Yes/No given a specific attribute (P(attribute | Yes/No))\n\n'

---
We can then use this table as a lookup to infer our data. As example, lets try to infer a family which is **Single Parent, Young, with Low income**.

In [None]:
# Step 1 : Get all data with attributes where `family_struct=Single Parent`, or `age_group=Young`, or `income=Low` regardless its label/target.
df_ext=df_prob[(df_prob['value'].isin(['Single Parent','Young','Low']))]
print('Step 1')
print(df_ext)
print('')

# Step 2 : Get probability based on filtered data from Step 1 for each label (`Yes` and `No`).
# How ? Multiply all probability values (`p_e_given_h`) based on the label/target ('Yes` and `No`)
df_ext=df_ext.groupby('label').agg({'p_e_given_h':np.prod}).reset_index()
print('Step 2')
print(df_ext)
print('')

# Step 3 : Get sum/total of this two probabilities values.
total_prob = df_ext['p_e_given_h'].sum()
print('Step 3')
print('Total Probability : ', total_prob)
print('')

# Step 4 : Divide `p_e_given_h` with `total_prob`
print('Step 4')
df_ext['norm_prob']=df_ext['p_e_given_h']/total_prob
print(df_ext)

Step 1
         features          value label  f_h  p_h  f_e_given_h  p_e_given_h
4       age_group          Young    No    4  0.4          1.0     0.250000
5       age_group          Young   Yes    6  0.6          2.0     0.333333
12  family_struct  Single Parent    No    4  0.4          1.0     0.250000
13  family_struct  Single Parent   Yes    6  0.6          1.0     0.166667
16         income            Low    No    4  0.4          4.0     1.000000
17         income            Low   Yes    6  0.6          1.0     0.166667

Step 2
  label  p_e_given_h
0    No     0.062500
1   Yes     0.009259

Step 3
Total Probability :  0.07175925925925926

Step 4
  label  p_e_given_h  norm_prob
0    No     0.062500   0.870968
1   Yes     0.009259   0.129032


---
Let's inference all our training data

In [None]:
# Inferencing all possibilities of features based on given dataset

small_dfs = []
X=df[df.columns[:-1]]
for idx in range(len(X)):
    df_select = X.iloc[[idx]]
    df_ext=df_prob[df_prob['value'].isin(df_select.values[0])]
    df_ext=df_ext.groupby('label').agg({'p_e_given_h':np.prod}).reset_index()
    df_ext.columns=['label','p_h']
    df_ext['total_prob']=df_ext['p_h'].sum()
    df_ext['norm_prob']=df_ext['p_h']/df_ext['total_prob']
    df_select['prob_no']=df_ext[df_ext['label']=='No']['norm_prob'].values[0]
    df_select['prob_yes']=df_ext[df_ext['label']=='Yes']['norm_prob'].values[0]
    small_dfs.append(df_select)
    
df_infer = pd.concat(small_dfs, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [None]:
# Display probabilities for given dataset

df_infer

Unnamed: 0,family_struct,age_group,income,prob_no,prob_yes
0,Nuclear,Young,Low,0.0,1.0
1,Extended,Old,Low,0.964286,0.035714
2,Childless,Middle-aged,Low,0.9,0.1
3,Childless,Young,Medium,0.0,1.0
4,Single Parent,Middle-aged,Medium,0.0,1.0
5,Childless,Young,Low,0.931034,0.068966
6,Nuclear,Old,High,0.0,1.0
7,Nuclear,Middle-aged,Medium,0.0,1.0
8,Extended,Middle-aged,High,0.0,1.0
9,Single Parent,Old,Low,0.964286,0.035714


---
Let's compare it with Scikit-Learn implementation

In [None]:
# First, we must encode the dataset into numeric. Let's use One Hot Encoding for Features and Label Encoder for Target.

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
enc = OneHotEncoder()

X = df[df.columns[:-1]]
enc.fit(X)
X_enc=enc.transform(X).toarray()

In [None]:
# Display `X`

print('X : \n', X, '\n')
print('One Hot Encoding : \n', X_enc)

X : 
    family_struct    age_group  income
0        Nuclear        Young     Low
1       Extended          Old     Low
2      Childless  Middle-aged     Low
3      Childless        Young  Medium
4  Single Parent  Middle-aged  Medium
5      Childless        Young     Low
6        Nuclear          Old    High
7        Nuclear  Middle-aged  Medium
8       Extended  Middle-aged    High
9  Single Parent          Old     Low 

One Hot Encoding : 
 [[0. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
 [0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 1.]
 [0. 0. 0. 1. 1. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 1. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 1. 0.]]


In [None]:
# Encode Target into numeric

le = LabelEncoder()
label_encoded=le.fit_transform(df['label'])
print("Label:",label_encoded)

Label: [1 0 0 1 1 0 1 1 1 0]


In [None]:
# Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets
model.fit(X_enc,label_encoded)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Merging result from scratch with result from Scikit-Learn

df_result_sklearn = df_infer.copy()
df_result_sklearn['prob_no_sklearn'] =  model.predict_proba(X_enc)[:,0]
df_result_sklearn['prob_yes_sklearn'] =  model.predict_proba(X_enc)[:,1]
df_result_sklearn 

Unnamed: 0,family_struct,age_group,income,prob_no,prob_yes,prob_no_sklearn,prob_yes_sklearn
0,Nuclear,Young,Low,0.0,1.0,0.0,1.0
1,Extended,Old,Low,0.964286,0.035714,1.0,5.078957e-21
2,Childless,Middle-aged,Low,0.9,0.1,1.0,5.598623999999999e-20
3,Childless,Young,Medium,0.0,1.0,0.0,1.0
4,Single Parent,Middle-aged,Medium,0.0,1.0,0.0,1.0
5,Childless,Young,Low,0.931034,0.068966,1.0,2.6446029999999998e-20
6,Nuclear,Old,High,0.0,1.0,0.0,1.0
7,Nuclear,Middle-aged,Medium,0.0,1.0,0.0,1.0
8,Extended,Middle-aged,High,0.0,1.0,0.0,1.0
9,Single Parent,Old,Low,0.964286,0.035714,1.0,5.078957e-21


In [None]:
# Test A New Data

new_data = {'family_struct': ['Childless'],'age_group': ['Young'],'income': ['High']} 
    
df_new_data = pd.DataFrame(new_data)
X_new_data = enc.transform(df_new_data).toarray()
result_class = model.predict(X_new_data)
result_proba = model.predict_proba(X_new_data)

print('New Data - Real      : \n', df_new_data, '\n')
print('New Data - Encode    : ', X_new_data, '\n')
print('Result - Class       : ', result_class[0])
print('Result - Probability : ', result_proba)

New Data - Real      : 
   family_struct age_group income
0     Childless     Young   High 

New Data - Encode    :  [[1. 0. 0. 0. 0. 0. 1. 1. 0. 0.]] 

Result - Class       :  1
Result - Probability :  [[0. 1.]]


In [None]:
# Add New Naive Bayes Classifiers

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB

# Create Several Naive Bayes Classifiers
model_bernoullinb = BernoulliNB()
model_multinomialnb = MultinomialNB()
model_categoricalnb = CategoricalNB()

# Train the model using the training sets
model_bernoullinb.fit(X_enc,label_encoded)
model_multinomialnb.fit(X_enc,label_encoded)
model_categoricalnb.fit(X_enc,label_encoded)

CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Test A New Data

# new_data = {'family_struct': ['Childless'],'age_group': ['Young'],'income': ['High']} 
new_data = {'family_struct': ['Single Parent'],'age_group': ['Young'],'income': ['Low']} 
    
df_new_data = pd.DataFrame(new_data)
X_new_data = enc.transform(df_new_data).toarray()

print('New Data - Real      : \n', df_new_data, '\n')
print('New Data - Encode    : ', X_new_data, '\n')

result_class = model.predict(X_new_data)
result_proba = model.predict_proba(X_new_data)
print('Result - Gaussian NB')
print('Result - Class       : ', result_class[0])
print('Result - Probability : ', result_proba, '\n')

result_class = model_bernoullinb.predict(X_new_data)
result_proba = model_bernoullinb.predict_proba(X_new_data)
print('Result - Bernoulli NB')
print('Result - Class       : ', result_class[0])
print('Result - Probability : ', result_proba, '\n')

result_class = model_multinomialnb.predict(X_new_data)
result_proba = model_multinomialnb.predict_proba(X_new_data)
print('Result - Multinomial NB')
print('Result - Class       : ', result_class[0])
print('Result - Probability : ', result_proba, '\n')

result_class = model_categoricalnb.predict(X_new_data)
result_proba = model_categoricalnb.predict_proba(X_new_data)
print('Result - Categorical NB')
print('Result - Class       : ', result_class[0])
print('Result - Probability : ', result_proba, '\n')


New Data - Real      : 
    family_struct age_group income
0  Single Parent     Young    Low 

New Data - Encode    :  [[0. 0. 0. 1. 0. 0. 1. 0. 1. 0.]] 

Result - Gaussian NB
Result - Class       :  0
Result - Probability :  [[1.00000000e+00 1.00327444e-19]] 

Result - Bernoulli NB
Result - Class       :  0
Result - Probability :  [[0.83708662 0.16291338]] 

Result - Multinomial NB
Result - Class       :  0
Result - Probability :  [[0.69611101 0.30388899]] 

Result - Categorical NB
Result - Class       :  0
Result - Probability :  [[0.83708662 0.16291338]] 



In [None]:
# Let's compare it with our previous category {'family_struct': ['Single Parent'],'age_group': ['Young'],'income': ['Low']} 

print(df_ext)

  label      p_h  total_prob  norm_prob
0    No  0.12500     0.12963   0.964286
1   Yes  0.00463     0.12963   0.035714
