In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the mushroom dataset (path is relative to the working directory) and
# print its first rows as a quick sanity check.
data = pd.read_csv(r'Machine Learning/Naive_Bayes/mushrooms.csv')
print(data.head())

  type cap_shape cap_surface cap_color bruises odor gill_attachment  \
0    p         x           s         n       t    p               f   
1    e         x           s         y       t    a               f   
2    e         b           s         w       t    l               f   
3    p         x           y         w       t    p               f   
4    e         x           s         g       f    n               f   

  gill_spacing gill_size gill_color   ...   stalk_surface_below_ring  \
0            c         n          k   ...                          s   
1            c         b          k   ...                          s   
2            c         b          n   ...                          s   
3            c         n          n   ...                          s   
4            w         b          k   ...                          s   

  stalk_color_above_ring stalk_color_below_ring veil_type veil_color  \
0                      w                      w         p          w

In [14]:
# Integer-encode every column of the frame. DataFrame.apply hands the columns
# to le.fit_transform one at a time, so this single encoder is refit for each
# column and afterwards only remembers the mapping of the last one it saw.
le = LabelEncoder()
ds = data.apply(le.fit_transform)

In [15]:
# Column 0 of the encoded frame (`type` in the preview) is the target;
# every remaining column is used as a feature.
Xdata = ds.values[:,1:]
Ydata = ds.values[:,0]
print(Xdata.shape,Ydata.shape)

(8124, 22) (8124,)


In [16]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and the results computed from it, are reproducible across kernel restarts.
xtrain,xtest,ytrain,ytest = train_test_split(Xdata,Ydata,test_size=0.2,random_state=42)
print(xtrain.shape,ytrain.shape)
print(xtest.shape,ytest.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [17]:
# Distinct encoded class labels present in the training targets.
np.unique(ytrain)

array([0, 1])

In [23]:
# Boolean mask: True where the training label equals 1.
ytrain==1

array([ True,  True,  True, ...,  True,  True,  True])

In [24]:
# Boolean mask: True where the training label equals 0.
ytrain==0

array([False, False, False, ..., False, False, False])

In [25]:
# Training rows (all feature columns) whose label is 1; the same boolean
# filtering is what conditional_prob relies on.
xtrain[ytrain==1]

array([[2, 2, 4, ..., 7, 4, 0],
       [2, 2, 2, ..., 7, 4, 0],
       [5, 3, 8, ..., 2, 3, 1],
       ...,
       [2, 3, 2, ..., 7, 4, 2],
       [5, 3, 9, ..., 1, 5, 0],
       [2, 3, 2, ..., 7, 4, 2]])

In [27]:
def prior_prob(ytrain,label):
    """Return the empirical prior P(y == label).

    ytrain is a 1-D numpy array of training labels; the result is the share
    of its entries that equal ``label``.
    """
    n_total = ytrain.shape[0]
    n_matching = (ytrain == label).sum()
    return n_matching / float(n_total)

def conditional_prob(xtrain,ytrain,feature_col,feature_value,label):
    """Return the empirical P(x[feature_col] == feature_value | y == label).

    Only the training rows whose label equals ``label`` are considered; the
    result is the share of those rows carrying ``feature_value`` in column
    ``feature_col``. No smoothing is applied, so a value never seen together
    with the class yields 0.
    """
    in_class = ytrain == label
    class_column = xtrain[in_class][:, feature_col]
    matches = np.sum(class_column == feature_value)
    class_size = np.sum(in_class)
    return matches / float(class_size)

In [28]:
def predict(xtrain,ytrain,xtest):
    """Classify a single sample with categorical naive Bayes.

    For every class present in ``ytrain`` the unnormalised posterior
    ``prior * product over features of P(x_f | class)`` is computed with
    ``prior_prob`` and ``conditional_prob``; the class with the largest
    score is returned.

    Parameters
    ----------
    xtrain : 2-D array, one row per training sample and one column per feature.
    ytrain : 1-D array of training labels aligned with the rows of ``xtrain``.
    xtest : 1-D sequence with the feature values of the one sample to classify.

    Returns
    -------
    The winning class label, i.e. an element of ``np.unique(ytrain)``.
    When several classes tie (for instance when every score is 0), the
    first of them in sorted label order is returned.
    """
    classes = np.unique(ytrain)
    n_features = xtrain.shape[1]
    post_prob = []
    for label in classes:
        likelihood = 1.0
        # posterior_prob = likelihood * prior
        for f in range(n_features):
            likelihood *= conditional_prob(xtrain,ytrain,f,xtest[f],label)
        post_prob.append(likelihood * prior_prob(ytrain,label))

    # np.argmax gives a position within `classes`, not a label; map it back so
    # the result is also right when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]

            

In [31]:
# Spot check: does the prediction for test sample 2 match its true label?
predict(xtrain,ytrain,xtest[2])==ytest[2]

True

In [34]:
def accuracy(xtrain,ytrain,xtest,ytest):
    """Return the fraction of rows in ``xtest`` that ``predict`` labels correctly.

    Each row of ``xtest`` is classified on its own against the training data
    and the predictions are compared element-wise with ``ytest``.
    """
    n_samples = xtest.shape[0]
    pred = np.array([predict(xtrain,ytrain,xtest[i]) for i in range(n_samples)])
    n_correct = np.sum(pred == ytest)
    return n_correct / ytest.shape[0]

In [35]:
# Fraction of the held-out test samples that the classifier labels correctly.
accuracy(xtrain,ytrain,xtest,ytest)

0.9981538461538462