## Mushroom Classifier - Naive Bayes

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Dataset Processing

In [2]:
# Load the mushroom dataset from a CSV file located in the current working directory.
df = pd.read_csv("mushrooms.csv")

In [6]:
# Preview the first 10 rows; the displayed columns hold single-letter categorical codes.
df.head(n=10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [8]:
# Integer-encode every column (the `type` target included). DataFrame.apply calls
# le.fit_transform once per column, so each column receives its own set of codes.
# NOTE(review): the single `le` instance is refit on every column, so afterwards it
# presumably retains only the mapping of the last column — keep one encoder per
# column if inverse_transform is ever needed.
le = LabelEncoder()
ds = df.apply(le.fit_transform) # Converts Categorical Data into Numerical Data

In [9]:
# Sanity-check the encoding: same column layout as df, now holding integer codes.
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
# Work on the raw NumPy array from here on; show its shape/type and the first five rows.
data = ds.values
print(data.shape, type(data))
print(data[:5])

(8124, 23) <class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


In [12]:
# Column 0 is the encoded `type` target; every remaining column is a feature.
Y = data[:, 0]
X = data[:, 1:]

In [15]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the accuracy computed from it) is identical on every re-run.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [16]:
# Confirm the split sizes: features and labels must have matching row counts per split.
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [17]:
# List the distinct class codes present in the target vector.
np.unique(Y)

array([0, 1])

### Classifier

In [19]:
def priorProbability(Y,label):
    """Return the empirical prior P(y = label).

    Parameters
    ----------
    Y : numpy.ndarray
        1-D array of class labels.
    label : scalar
        The class whose relative frequency is wanted.

    Returns
    -------
    float
        Fraction of entries in ``Y`` that equal ``label``.
    """
    n_samples = float(Y.shape[0])
    n_in_class = np.sum(Y == label)
    return n_in_class / n_samples

In [20]:
def conditionalProbability(X,Y,feature_name,feature_value,label,alpha=0.0):
    """Return the empirical likelihood P(x[feature_name] = feature_value | y = label).

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of encoded features, one row per example.
    Y : numpy.ndarray
        1-D array of class labels aligned with the rows of ``X``.
    feature_name : int
        Column index of the feature in ``X`` (despite the name, it is used
        as a positional index).
    feature_value : scalar
        The feature value whose likelihood is wanted.
    label : scalar
        The class to condition on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. With the default ``0.0`` the
        plain relative frequency is returned. A positive value yields
        ``(count + alpha) / (class_count + alpha * k)`` where ``k`` is the
        number of distinct values the feature takes in ``X``, so a value never
        seen together with ``label`` no longer produces a hard zero.

    Returns
    -------
    float
        The (optionally smoothed) conditional probability.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    column = X[:,feature_name]
    in_class = (Y==label)
    numerator = np.sum(column[in_class]==feature_value)
    denominator = np.sum(in_class)

    if alpha:
        # Spread `alpha` pseudo-counts over every value this feature can take.
        n_values = np.unique(column).shape[0]
        return (numerator + alpha)/float(denominator + alpha*n_values)
    return numerator/float(denominator)

In [24]:
def predict(x_train,y_train,x_test):
    """Classify a single example with categorical Naive Bayes.

    For every class present in ``y_train`` the unnormalised posterior
    ``P(class) * prod_f P(x_f | class)`` is computed from the training data,
    and the class with the largest value is returned.

    Parameters
    ----------
    x_train : numpy.ndarray
        2-D array of encoded training features.
    y_train : numpy.ndarray
        1-D array of training labels aligned with ``x_train``.
    x_test : numpy.ndarray
        1-D feature vector of the example to classify.

    Returns
    -------
    scalar
        The predicted class label (an element of ``np.unique(y_train)``).
    """
    classes = np.unique(y_train)
    feature_size = x_train.shape[1]
    posterior_prob = []

    for c in classes:
        # Naive independence assumption: multiply the per-feature likelihoods.
        likelihood = 1.0
        for feature in range(feature_size):
            cond = conditionalProbability(x_train,y_train,feature,x_test[feature],c)
            likelihood = likelihood*cond

        prior = priorProbability(y_train,c)
        posterior_prob.append(likelihood*prior)

    # np.argmax gives a position within `classes`, not a label; map it back so
    # the result is also correct when the labels are not exactly 0..k-1.
    best = np.argmax(posterior_prob)
    return classes[best]

In [22]:
def score(x_train,y_train,x_test,y_test):
    """Return the classification accuracy on a test set.

    Each row of ``x_test`` is classified with ``predict`` using
    ``x_train`` / ``y_train`` as the training data, and the fraction of
    predictions that equal the corresponding entry of ``y_test`` is returned.
    """
    n_test = x_test.shape[0]
    predictions = np.array([predict(x_train,y_train,x_test[i]) for i in range(n_test)])
    n_correct = np.sum(predictions==y_test)
    return n_correct/y_test.shape[0]

In [25]:
# Accuracy of the hand-written Naive Bayes classifier on the held-out test split.
print(score(x_train,y_train,x_test,y_test))

0.9981538461538462
