# Naive Bayes 
Implementing a Naive Bayes classifier from scratch on the mushroom dataset from Kaggle.

In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Location of the mushroom CSV. This is an absolute path on the author's
# machine; change this single constant to run the notebook elsewhere.
DATA_PATH = '/Users/nikhilgrewal/Desktop/practice/mushrooms.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to check that the file loaded as expected.
data.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## Now we have to convert this categorical data into numerical data

In [4]:
# Encode every column's category letters as integer codes with sklearn's
# LabelEncoder. The one encoder object is refit on each column in turn, so
# after this cell `le` only remembers the categories of the last column fitted.
le = LabelEncoder()
data_n = data.apply(lambda column: le.fit_transform(column))

In [5]:
# Same preview after encoding: each category letter is now an integer code.
data_n.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
# Take the encoded frame's underlying array; the classifier below indexes it
# positionally by row and column.
ds = data_n.values
# (number of rows, number of columns) of the encoded data.
print(ds.shape)

(8124, 23)


In [7]:
# Show the encoded array.
print(ds)

[[1 5 2 ... 2 3 5]
 [0 5 2 ... 3 2 1]
 [0 0 2 ... 3 2 3]
 ...
 [0 2 2 ... 0 1 2]
 [1 3 3 ... 7 4 2]
 [0 5 2 ... 4 1 2]]


In [14]:
#splitting the data into train and test
X = ds[:,1:];
Y = ds[:,0];

In [15]:
# Sanity check: X is the feature matrix, Y the label vector.
print(X)
print(Y)

[[5 2 4 ... 2 3 5]
 [5 2 9 ... 3 2 1]
 [0 2 8 ... 3 2 3]
 ...
 [2 2 4 ... 0 1 2]
 [3 3 4 ... 7 4 2]
 [5 2 4 ... 4 1 2]]
[1 0 0 ... 0 1 0]


In [16]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore every number reported below, reproducible on
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
# Shapes of the training split (features, then labels).
print(x_train.shape)
print(y_train.shape)

(6499, 22)
(6499,)


In [18]:
# Shapes of the held-out test split (features, then labels).
print(x_test.shape)
print(y_test.shape)

(1625, 22)
(1625,)


## Building the classifier 

In [19]:
def prior_prob(y_train, label):
    """Return the prior P(Y = label): the fraction of training labels equal to `label`.

    Parameters
    ----------
    y_train : 1-D array of training labels.
    label   : the class value to count.

    Returns
    -------
    float in [0, 1].
    """
    is_label = (y_train == label)
    return float(is_label.sum()) / float(y_train.shape[0])

In [24]:
# Likelihood term of Naive Bayes: P(feature == value | class).
def cond_prob(x_train, y_train, feature_col, feature_val, label, alpha=0.0):
    """Estimate P(x[feature_col] == feature_val | Y == label) from the training data.

    Parameters
    ----------
    x_train     : 2-D array of training features (rows are samples).
    y_train     : 1-D array of training labels, aligned with the rows of x_train.
    feature_col : column index of the feature.
    feature_val : value of that feature to score.
    label       : class to condition on.
    alpha       : additive (Laplace) smoothing strength. The default 0.0 gives
                  the plain relative frequency; alpha > 0 keeps a value that was
                  never seen with this class from getting probability exactly 0.

    Returns
    -------
    float in [0, 1]. Returns 0.0 when the class has no training rows and no
    smoothing is applied, instead of dividing by zero.
    """
    class_rows = x_train[y_train == label]
    match_count = np.sum(class_rows[:, feature_col] == feature_val)
    # Smoothing adds one pseudo-count per distinct value the feature takes
    # anywhere in the training set.
    n_values = np.unique(x_train[:, feature_col]).size if alpha else 0
    denominator = class_rows.shape[0] + alpha * n_values
    if denominator == 0:
        return 0.0
    return float(match_count + alpha) / float(denominator)

In [25]:
def predict(x_train, y_train, xtest):
    """Classify one sample with Naive Bayes and return its predicted class label.

    For every class present in `y_train` the score is
    log P(class) + sum over features of log P(feature value | class),
    using `prior_prob` and `cond_prob`. The class with the highest score wins;
    ties go to the smallest class value.

    Parameters
    ----------
    x_train : 2-D array of training features.
    y_train : 1-D array of training labels.
    xtest   : 1-D array holding the feature values of the sample to classify.

    Returns
    -------
    The predicted label, taken from the values in `y_train` (not the position
    of the best score), so labels need not be 0..k-1.
    """
    classes = np.unique(y_train)
    no_features = x_train.shape[1]
    log_post = []
    for label in classes:
        probs = [cond_prob(x_train, y_train, f, xtest[f], label)
                 for f in range(no_features)]
        probs.append(prior_prob(y_train, label))
        # Summing logs instead of multiplying probabilities avoids underflow
        # with many features. A probability of 0 becomes -inf, which rules the
        # class out exactly as a zero product would.
        with np.errstate(divide="ignore"):
            log_post.append(np.sum(np.log(probs)))
    return classes[np.argmax(log_post)]

In [28]:
# Confirm how many test rows are about to be classified.
print(x_test.shape) 

(1625, 22)


In [29]:
# Classify every held-out sample, one row of x_test at a time.
n = x_test.shape[0]
pred = [predict(x_train, y_train, sample) for sample in x_test]
print(pred)

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 

## Accuracy of the model 

In [30]:
# Accuracy: the fraction of test samples whose predicted label matches the true one.
acc = np.mean(y_test == np.asarray(pred))
print(acc)

0.9956923076923077
