# Naive Bayes Classification of Mushrooms

In [1]:
import pandas as pd
import numpy as np

# Fraction of the dataset's rows sampled into the training set; the rows
# that are not sampled form the test set.
TRAIN_FRAC = 0.75

Read the data and split it into training and test sets.

In [150]:
# Our label column is "class"
label = 'class'

# Fixed seed so the random train/test split (and every number derived from
# it further down) comes out the same on each Restart & Run All.
RANDOM_STATE = 42

# Read the dataset and split it into disjoint training and testing frames:
# TRAIN_FRAC of the rows are sampled for training, the rest are for testing.
# NOTE: there might be n/a values; nothing in this cell handles them.
df = pd.read_csv('data/mushrooms.csv')
train_df = df.sample(frac=TRAIN_FRAC, random_state=RANDOM_STATE)
test_df = df.drop(train_df.index)

# Print out a sample (seeded so the preview is reproducible as well)
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
8045,p,k,y,e,f,y,f,c,n,b,...,s,w,p,p,w,o,e,w,v,p
626,e,f,f,y,t,l,f,w,n,w,...,s,w,w,p,w,o,p,u,v,d
7134,e,k,s,g,f,n,f,w,b,p,...,k,w,w,p,w,t,p,w,s,g
4969,p,x,y,y,f,f,f,c,b,h,...,k,b,b,p,w,o,l,h,v,p
5721,p,x,y,n,f,y,f,c,n,b,...,s,p,w,p,w,o,e,w,v,d


Prior probabilities

In [10]:
# Prior probability of each class: the number of training rows carrying
# that label divided by the total number of training rows.
priors = train_df.groupby(label).size().div(len(train_df))
priors

class
e    0.514853
p    0.485147
dtype: float64

Compute likelihood table

In [151]:
# Likelihood tables, one per feature column.
# Each table is a Series indexed by (feature value, class) holding the count
# of training rows with that combination divided by the number of training
# rows in that class. Combinations that never occur have no entry.
feature_columns = [name for name in train_df.columns if name != label]
class_sizes = train_df.groupby(label).size()
likelihoods = {
    feature: train_df.groupby([feature, label]).size() / class_sizes
    for feature in feature_columns
}
likelihoods

{'cap-shape': cap-shape  class
 b          e        0.093859
            p        0.012542
 c          p        0.001017
 f          e        0.380528
            p        0.396610
 k          e        0.052498
            p        0.155932
 s          e        0.007954
 x          e        0.465161
            p        0.433898
 dtype: float64,
 'cap-surface': cap-surface  class
 f            e        0.374165
              p        0.195593
 g            p        0.000678
 s            e        0.266306
              p        0.365424
 y            e        0.359529
              p        0.438305
 dtype: float64,
 'cap-color': cap-color  class
 b          e        0.011454
            p        0.032203
 c          e        0.007318
            p        0.002712
 e          e        0.150811
            p        0.225085
 g          e        0.241807
            p        0.204068
 n          e        0.297486
            p        0.259322
 p          e        0.013999
            p  

In [353]:
# Worked example: apply the likelihood of a single feature to the priors for
# one randomly drawn test row, printing every intermediate step.
column = 'odor'
sample = test_df.sample(1)
print(sample[column])
labels = priors.copy()
print(labels)
# Entries of this feature's likelihood table for the observed value;
# still indexed by (feature value, class).
likelihood = likelihoods[column][sample[column]]
# Collapse the feature-value level so only the class level remains.
likelihood = likelihood.groupby(level=1).sum()
print(likelihood)
# A class that never showed this value in training has no entry. Align to
# the classes known to `priors` and use 0 for the missing ones, instead of
# hard-coding the label names.
likelihood = likelihood.reindex(priors.index, fill_value=0)
print(likelihood)
labels *= likelihood
labels

3940    f
Name: odor, dtype: object
class
e    0.514853
p    0.485147
dtype: float64
class
p    0.553898
dtype: float64
class
p    0.553898
e    0.000000
dtype: float64


class
e    0.000000
p    0.268722
dtype: float64

Create classifier

In [350]:
def naive_bayes_classify(sample):
    """Predict the class label of one observation with Naive Bayes.

    Starts from a copy of the global ``priors`` and, for every column of
    ``sample`` that has a table in the global ``likelihoods``, multiplies in
    the likelihood of the observed value under each class. The class with the
    largest resulting (unnormalised) score is returned.

    Parameters
    ----------
    sample : pandas.DataFrame
        The observation to classify, e.g. ``test_df.sample(1)``. Columns
        without a likelihood table (such as the label column) are ignored.
        NOTE(review): written with a one-row frame in mind; with several rows
        the per-row likelihoods are summed, which is probably not intended.

    Returns
    -------
    The index label of ``priors`` whose score is highest.

    Notes
    -----
    A feature value that has no entry at all in its likelihood table is not
    handled here; the lookup is expected to raise ``KeyError`` in that case.
    """
    posterior = priors.copy()
    for column in sample.columns:
        # The label column (and anything else that is not a trained feature)
        # has no likelihood table and must not take part in the product.
        if column not in likelihoods:
            continue
        # Entries for the observed value, indexed by (feature value, class).
        likelihood = likelihoods[column][sample[column]]
        # Collapse the feature-value level so only the class level remains.
        likelihood = likelihood.groupby(level=1).sum()
        # Classes that never showed this value in training get likelihood 0.
        likelihood = likelihood.reindex(posterior.index, fill_value=0)
        # This is the step the classifier was missing: fold the evidence of
        # this feature into the running per-class score.
        posterior = posterior * likelihood
    return posterior.idxmax()