# Mushroom Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the mushroom dataset from a CSV file in the current working directory.
df = pd.read_csv("./mushrooms.csv")
# Last expression of the cell: display the first rows of the table.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# View the raw table as a NumPy array and report its (rows, columns) size.
data = df.to_numpy()
print(data.shape)

(8124, 23)


# Encode the categorical data as numerical data

In [8]:
# One encoder instance is reused for every column: `fit_transform` refits it
# each time, so each column gets its own integer codes, and after this cell
# `le` only remembers the mapping of the last column it saw.
le = LabelEncoder()
ds = df.apply(lambda column: le.fit_transform(column))
# Last expression of the cell: display the first encoded rows.
ds.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


# Split the data into training and test sets

In [10]:
# Column 0 of the encoded table is `class` (the label); every remaining
# column is a feature.
data = ds.to_numpy()
y_data, x_data = data[:, 0], data[:, 1:]

array([[5, 2, 4, ..., 2, 3, 5],
       [5, 2, 9, ..., 3, 2, 1],
       [0, 2, 8, ..., 3, 2, 3],
       ...,
       [2, 2, 4, ..., 0, 1, 2],
       [3, 3, 4, ..., 7, 4, 2],
       [5, 2, 4, ..., 4, 1, 2]])

In [12]:
# Hold out 20% of the rows for testing. The fixed `random_state` makes the
# shuffle deterministic, so the split (and every number computed from it
# further down) is the same after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

In [14]:
# Sanity-check the split sizes: feature matrices first, then label vectors.
print(f"{x_train.shape} {x_test.shape}")
print(f"{y_train.shape} {y_test.shape}")

(6499, 22) (1625, 22)
(6499,) (1625,)


In [17]:
# Distinct class labels that occur in the test labels (sorted, as returned by
# np.unique); displayed as the cell output.
np.unique(y_test)

array([0, 1])

# Building the Naive Bayes Classifier

In [18]:
def prior_prob(y, label):
    """Return the empirical prior of ``label``.

    The prior is the fraction of entries in the label array ``y`` that are
    equal to ``label`` (count of matches divided by ``y.shape[0]``).
    """
    n_total = y.shape[0]
    n_matching = np.sum(y == label)
    return n_matching / float(n_total)

In [25]:
def cond_prob(x, y, label, feature_col, feature_val):
    """Estimate P(feature == feature_val | class == label) by counting.

    Among the rows of ``x`` whose entry in ``y`` equals ``label``, return the
    fraction whose column ``feature_col`` equals ``feature_val``.

    No smoothing is applied, so a value never observed for the class yields
    0.0, and there is no guard for a ``label`` that does not occur in ``y``
    (the denominator is then zero).
    """
    in_class = (y == label)
    class_size = np.sum(in_class)
    hits = np.sum(x[in_class][:, feature_col] == feature_val)
    return hits / float(class_size)

In [26]:
def pred(x, y, x_test):
    """Predict the class of a single sample with categorical Naive Bayes.

    For every class found in ``y`` the unnormalised posterior
    ``prior_prob(y, label) * prod_f cond_prob(x, y, label, f, x_test[f])``
    is computed from the training data, and the class with the largest
    posterior is returned.

    Parameters
    ----------
    x : 2-D array of training features, one row per sample.
    y : 1-D array of training labels aligned with the rows of ``x``.
    x_test : one row of the test set (one value per column of ``x``).

    Returns
    -------
    The class label (an element of ``np.unique(y)``) with the highest
    posterior. Ties are resolved in favour of the first class in sorted
    order, because ``np.argmax`` returns the first maximum.
    """
    classes = np.unique(y)
    n_features = x.shape[1]
    post_prob = []
    for label in classes:
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x, y, label, f, x_test[f])
        post_prob.append(likelihood * prior_prob(y, label))

    # np.argmax gives a position inside `classes`, not a label. Map it back so
    # the result is correct even when the labels are not exactly 0..k-1
    # (for labels 0..k-1 the position and the label coincide).
    return classes[np.argmax(post_prob)]
        

In [27]:
# Smoke test: classify one held-out row (index 1 of the test set) using the
# training data.
output = pred(x_train,y_train,x_test[1])

In [31]:
# Compare the prediction for test row 1 with its true label.
print(f"predicted class is {output}")
print(f"actual class is {y_test[1]}")

predicted class is 0
actual class is 0


# Scoring the Predictions

In [34]:
def score(x_train, y_train, x_test, y_test):
    """Count how many rows of ``x_test`` are classified correctly.

    Every row of ``x_test`` is classified with ``pred`` using the training
    data, and the predictions are compared element-wise with ``y_test``.

    Returns the number of matching predictions. This is a count, not a
    ratio: divide it by the number of test rows to obtain the accuracy.
    """
    predictions = np.asarray(
        [pred(x_train, y_train, row) for row in x_test]
    )
    return np.sum(predictions == y_test)

In [36]:
# `score` returns the number of correct predictions, so divide by the size of
# the test set to report the accuracy as a fraction.
s = score(x_train, y_train, x_test, y_test)
print(s / len(y_test))

0.9981538461538462
