In [6]:
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
def text_formater(inp_txt):
    if inp_txt:
        return inp_txt.strip().strip('\'')
    return inp_txt

In [9]:
df = pd.read_csv('../data/golf.csv',converters={
    'outlook':text_formater,
    'temp':text_formater,
    'humidity':text_formater,
    'wind':text_formater,
    'label':text_formater
})

In [10]:
df

Unnamed: 0,outlook,temp,humidity,wind,label
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [11]:
gr = df.groupby('label')

In [12]:
gr.apply(lambda x: print(x,'\n'))

   outlook  temp humidity    wind label
0    Sunny   Hot     High    Weak    No
1    Sunny   Hot     High  Strong    No
5     Rain  Cool   Normal  Strong    No
7    Sunny  Mild     High    Weak    No
13    Rain  Mild     High  Strong    No 

     outlook  temp humidity    wind label
2   Overcast   Hot     High    Weak   Yes
3       Rain  Mild     High    Weak   Yes
4       Rain  Cool   Normal    Weak   Yes
6   Overcast  Cool   Normal  Strong   Yes
8      Sunny  Cool   Normal    Weak   Yes
9       Rain  Mild   Normal    Weak   Yes
10     Sunny  Mild   Normal  Strong   Yes
11  Overcast  Mild     High  Strong   Yes
12  Overcast   Hot   Normal    Weak   Yes 



In [13]:
outlook_p = gr.apply(lambda x: x.groupby('outlook')['outlook'].count()/x.shape[0]).unstack(fill_value=0).T
outlook_p

label,No,Yes
outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0.0,0.444444
Rain,0.4,0.333333
Sunny,0.6,0.222222


In [14]:
temp_p = gr.apply(lambda x: x.groupby('temp')['temp'].count()/x.shape[0]).T
temp_p

label,No,Yes
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
Cool,0.2,0.333333
Hot,0.4,0.222222
Mild,0.4,0.444444


In [15]:
humidity_p = gr.apply(lambda x: x.groupby('humidity')['humidity'].count()/x.shape[0]).T
humidity_p

label,No,Yes
humidity,Unnamed: 1_level_1,Unnamed: 2_level_1
High,0.8,0.333333
Normal,0.2,0.666667


In [16]:
wind_p = gr.apply(lambda x: x.groupby('wind')['wind'].count()/x.shape[0]).T
wind_p

label,No,Yes
wind,Unnamed: 1_level_1,Unnamed: 2_level_1
Strong,0.6,0.333333
Weak,0.4,0.666667


In [17]:
label_p = gr.apply(lambda x: x.groupby('label')['label'].count()/x.shape[0]).unstack(fill_value=0)

In [18]:
prob_mat = pd.concat([outlook_p,temp_p,humidity_p, wind_p,label_p])
prob_mat

label,No,Yes
Overcast,0.0,0.444444
Rain,0.4,0.333333
Sunny,0.6,0.222222
Cool,0.2,0.333333
Hot,0.4,0.222222
Mild,0.4,0.444444
High,0.8,0.333333
Normal,0.2,0.666667
Strong,0.6,0.333333
Weak,0.4,0.666667


In [19]:
prob_mat.index.name='features'

In [20]:
l = [df[col].value_counts() for col in df.columns]

In [21]:
s = pd.Series().append(l)

In [22]:
s.name='unconditioned_probs'

In [23]:
s = s.transform(lambda x: x/df.shape[0])
s

Rain        0.357143
Sunny       0.357143
Overcast    0.285714
Mild        0.428571
Cool        0.285714
Hot         0.285714
Normal      0.500000
High        0.500000
Weak        0.571429
Strong      0.428571
Yes         0.642857
No          0.357143
Name: unconditioned_probs, dtype: float64

In [24]:
prob_mat = prob_mat.merge(s, left_index=True, right_index=True, how='inner')

In [25]:
prob_mat

Unnamed: 0,No,Yes,unconditioned_probs
Overcast,0.0,0.444444,0.285714
Rain,0.4,0.333333,0.357143
Sunny,0.6,0.222222,0.357143
Cool,0.2,0.333333,0.285714
Hot,0.4,0.222222,0.285714
Mild,0.4,0.444444,0.428571
High,0.8,0.333333,0.5
Normal,0.2,0.666667,0.5
Strong,0.6,0.333333,0.428571
Weak,0.4,0.666667,0.571429


In [27]:
unconditioned = prob_mat.at['Sunny','unconditioned_probs']*prob_mat.at['Cool','unconditioned_probs']*prob_mat.at['High','unconditioned_probs']*prob_mat.at['Strong','unconditioned_probs']

In [28]:
prob_mat.at['Sunny','No']*prob_mat.at['Cool','No']*prob_mat.at['High','No']*prob_mat.at['Strong','No']*prob_mat.at['No','unconditioned_probs']/unconditioned

0.9408

In [29]:
prob_mat.at['Sunny','Yes']*prob_mat.at['Cool','Yes']*prob_mat.at['High','Yes']*prob_mat.at['Strong','Yes']*prob_mat.at['Yes','unconditioned_probs']/unconditioned

0.2419753086419753

In [234]:
def predict(prob_matrix,feature_vec):
    unconditioned_prob = prob_mat.at[feature_vec[0],'unconditioned_probs']*prob_mat.at[feature_vec[1],'unconditioned_probs']*prob_mat.at[feature_vec[2],'unconditioned_probs']*prob_mat.at[feature_vec[3],'unconditioned_probs']

    yes = prob_mat.at[feature_vec[0],'Yes']*prob_mat.at[feature_vec[1],'Yes']*prob_mat.at[feature_vec[2],'Yes']*prob_mat.at[feature_vec[3],'Yes']*prob_mat.at['Yes','unconditioned_probs']/unconditioned
    
    no =  prob_mat.at[feature_vec[0],'No']*prob_mat.at[feature_vec[1],'No']*prob_mat.at[feature_vec[2],'No']*prob_mat.at[feature_vec[3],'No']*prob_mat.at['No','unconditioned_probs']/unconditioned
    
    return {'No':no, 'Yes':yes}, {'no_prob':no/(yes+no), 'yes_prob':yes/(yes+no)}

In [41]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

In [152]:
ord_encoder_y= OrdinalEncoder(categories=[['No','Yes']])

In [153]:
ord_encoder_x= OrdinalEncoder(categories=[['Rain','Overcast','Sunny'],['Cool','Mild','Hot'],['Normal','High'],['Weak','Strong']])

In [156]:
y = ord_encoder_y.fit_transform(df['label'].values.reshape(-1,1))
y

array([[0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.]])

In [157]:
X = ord_encoder_x.fit_transform(df.iloc[:,:-1].values)
X

In [227]:
model = CategoricalNB(alpha=0.1)

In [228]:
model.fit(X,y.ravel())

CategoricalNB(alpha=0.1, class_prior=None, fit_prior=True)

In [229]:
x_vec = [['Sunny', 'Hot', 'High', 'Weak']]

In [230]:
model.predict(ord_encoder_x.transform(x_vec))

array([0.])

In [231]:
model.predict_proba(ord_encoder_x.transform(x_vec))

array([[0.78255469, 0.21744531]])

In [232]:
predict(predict,x_vec[0])

({'No': 1.2544000000000002, 'Yes': 0.322633744855967},
 {'no_prob': 0.795417348608838, 'yes_prob': 0.20458265139116197})

In [233]:
df

Unnamed: 0,outlook,temp,humidity,wind,label
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes
