## Mushroom classifier using Naive Bayes

In [72]:
# importing libraries

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [73]:
# reading mushroom dataset

df=pd.read_csv('DATASET/mushrooms.csv')
# report (rows, columns) of the loaded frame; a bare expression such as
# df.head() is only rendered when it is the LAST statement of a cell, so it is
# not placed in the middle of this one
print(df.shape)

(8124, 23)


In [74]:
# converting categorical data to numeric data

# NOTE: df.apply hands each column to le.fit_transform in turn, so every column
# gets its own integer coding and the single `le` instance is refit each time.
# NOTE(review): after this cell `le` presumably only holds the classes of the
# last column processed -- verify before reusing it for inverse_transform.
le=LabelEncoder()
ds=df.apply(le.fit_transform)
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [75]:
# splitting data into train and test data

x=ds.values
Y=x[:,0]    # column 0 is the target (`type` in the encoded frame preview)
X=x[:,1:]   # remaining columns are the features
# fixed random_state so the 80/20 split -- and therefore the accuracy computed
# from it -- is the same on every Restart & Run All
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [95]:
# building classifier


def prior(y,label):
    """Return the fraction of entries of `y` that are equal to `label`.

    y     : numpy array of class labels (its first dimension is the sample count)
    label : the class value whose relative frequency is wanted
    """
    return np.sum(y==label)/float(y.shape[0])

def conditional_probability(x,y,feature_column,feature_val,label):
    """Estimate P(feature == feature_val | class == label) by counting.

    x              : feature matrix indexed as [sample, feature]
    y              : labels, one per row of x
    feature_column : index of the feature column to inspect
    feature_val    : feature value whose frequency is wanted
    label          : class to condition on

    Returns the share of rows of class `label` whose `feature_column` entry
    equals `feature_val` (a plain frequency; no smoothing is applied).
    """
    in_class=(y==label)
    # feature values of the rows belonging to the requested class
    class_column=x[in_class,feature_column]
    matches=np.sum(class_column==feature_val)
    return matches/float(np.sum(in_class))

def predict(x_train,y_train,xtest):
    """Predict the class of one sample with a counting-based Naive Bayes rule.

    For every class found in `y_train` the unnormalised posterior

        prior(label) * prod_f P(feature_f == xtest[f] | label)

    is computed and the class with the largest value is returned.

    x_train : training feature matrix indexed as [sample, feature]
    y_train : training labels, one per row of x_train
    xtest   : a single sample, indexed by feature

    Returns the winning entry of np.unique(y_train), i.e. the class label
    itself rather than its position. On ties the first class in sorted
    order wins (np.argmax behaviour).
    """
    classes=np.unique(y_train)
    n_features=x_train.shape[1]
    posteriors=[]
    for label in classes:
        likelihood=1.0
        for f in range(n_features):
            likelihood*=conditional_probability(x_train,y_train,f,xtest[f],label)
        # use the `prior` helper; the result is stored under a different name
        # so the helper is not shadowed by a local variable
        class_prior=prior(y_train,label)
        posteriors.append(class_prior*likelihood)
    # argmax yields a position in `classes`; map it back to the actual label so
    # the result is right even when labels are not exactly 0..k-1
    return classes[np.argmax(posteriors)]

def score(x,y,x_test,y_test):
    """Return the accuracy of `predict` on a test set.

    x, y           : training features / labels handed to `predict`
    x_test, y_test : test features (one sample per row) and their true labels

    Each row of x_test is classified individually; the result is the number
    of predictions equal to the matching entry of y_test divided by the
    length of y_test.
    """
    n_samples=x_test.shape[0]
    predicted=np.array([predict(x,y,x_test[row]) for row in range(n_samples)])
    n_correct=np.sum(predicted==y_test)
    return n_correct/y_test.shape[0]
    

In [96]:
# evaluating the classifier: accuracy on the test split, using the train split as the model data
score(x_train,y_train,x_test,y_test)

0.9944615384615385