# Naive Bayes Classification of Mushrooms

In [12]:
import pandas as pd
import numpy as np

TRAIN_FRAC = 0.75

Read data. Split into training and validation data.

In [13]:
df = pd.read_csv('data/mushrooms.csv')
train_df = df.sample(frac=TRAIN_FRAC)
test_df = df.drop(train_df.index)
df.sample(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
7031,p,x,s,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l
3834,p,x,f,g,f,f,f,c,b,g,...,k,p,p,p,w,o,l,h,v,d
3629,e,f,y,g,t,n,f,c,b,n,...,s,g,p,p,w,o,p,k,y,d
5892,p,f,s,w,t,f,f,c,b,h,...,f,w,w,p,w,o,p,h,v,u
3630,p,x,f,g,f,f,f,c,b,g,...,k,n,n,p,w,o,l,h,y,p


In [14]:
labels = ['e', 'p']
features = {
    'cap-shape': ['b', 'c', 'f', 'k', 's', 'x'],
    'cap-surface': ['f', 'g', 's', 'y'],
    'cap-color': ['b', 'c', 'e', 'g', 'n', 'p', 'r', 'u', 'w', 'y'],
    'bruises': ['f', 't'],
    'odor': ['a', 'c', 'f', 'l', 'm', 'n', 'p', 's', 'y'],
    'gill-attachment': ['a', 'f'],
    'gill-spacing': ['c', 'w'],
    'gill-size': ['b', 'n'],
    'gill-color': ['b', 'e', 'g', 'h', 'k', 'n', 'o', 'p', 'r', 'u', 'w', 'y'],
    'stalk-shape': ['e', 't'],
    'stalk-root': ['?', 'b', 'c', 'e', 'r'],
    'stalk-surface-above-ring': ['f', 'k', 's', 'y'],
    'stalk-surface-below-ring': ['f', 'k', 's', 'y'],
    'stalk-color-above-ring': ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y'],
    'stalk-color-below-ring': ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y'],
    'veil-type': ['p'],
    'veil-color': ['n', 'o', 'w', 'y'],
    'ring-number': ['n', 'o', 't'],
    'ring-type': ['e', 'f', 'l', 'n', 'p'],
    'spore-print-color': ['b', 'h', 'k', 'n', 'o', 'r', 'u', 'w', 'y'],
    'population': ['a', 'c', 'n', 's', 'v', 'y'],
    'habitat': ['d', 'g', 'l', 'm', 'p', 'u', 'w']
}

Prior probabilities

In [15]:
# Labels
priors = {
    label: len(train_df[train_df['class'] == label]) / len(train_df)
    for label in labels
}
print(priors)

{'e': 0.517971442639094, 'p': 0.48202855736090594}


Marginal probabilities

In [16]:
marginals = {
    feature: {
        value: len(train_df[train_df[feature] == value]) / len(train_df)
        for value in values
    }
    for feature, values in features.items()
}
marginals

{'cap-shape': {'b': 0.053175775480059084,
  'c': 0.0004923682914820286,
  'f': 0.3888068275069752,
  'k': 0.1019202363367799,
  's': 0.0037748235680288857,
  'x': 0.45182996881667487},
 'cap-surface': {'f': 0.2857377318234039,
  'g': 0.0006564910553093714,
  's': 0.31199737403577876,
  'y': 0.40160840308550794},
 'cap-color': {'b': 0.019366486131626455,
  'c': 0.004923682914820286,
  'e': 0.1852946003610701,
  'g': 0.22796651895617923,
  'n': 0.2791728212703102,
  'p': 0.018053504021007714,
  'r': 0.002133595929755457,
  'u': 0.0022977186935828,
  'w': 0.12801575578532742,
  'y': 0.13277531593632036},
 'bruises': {'f': 0.5870671262104054, 't': 0.4129328737895946},
 'odor': {'a': 0.049565074675857544,
  'c': 0.025767273920892828,
  'f': 0.2639094042343673,
  'l': 0.04841621532906614,
  'm': 0.004923682914820286,
  'n': 0.4345970786148039,
  'p': 0.0298703430165764,
  's': 0.06991629739044805,
  'y': 0.07303462990316757},
 'gill-attachment': {'a': 0.026259642212374857, 'f': 0.97374035778

In [18]:
likelihoods = {
    label: {
        feature: {
            value: len(train_df[(train_df['class'] == label) & (train_df[feature] == value)]) / len(train_df[(train_df['class'] == label)])
            for value in values
        }
        for feature, values in features.items()
    }
    for label in labels
}
likelihoods

{'e': {'cap-shape': {'b': 0.09283903675538656,
   'c': 0.0,
   'f': 0.3805449936628644,
   'k': 0.05418250950570342,
   's': 0.007287705956907478,
   'x': 0.46514575411913817},
  'cap-surface': {'f': 0.36850443599493027,
   'g': 0.0,
   's': 0.2696451204055767,
   'y': 0.36185044359949303},
  'cap-color': {'b': 0.010773130544993664,
   'c': 0.006653992395437262,
   'e': 0.14512040557667935,
   'g': 0.24809885931558937,
   'n': 0.30164765525982257,
   'p': 0.013624841571609633,
   'r': 0.004119138149556401,
   'u': 0.004435994930291508,
   'w': 0.16951837769328262,
   'y': 0.09600760456273764},
  'bruises': {'f': 0.3463244613434727, 't': 0.6536755386565273},
  'odor': {'a': 0.09569074778200254,
   'c': 0.0,
   'f': 0.0,
   'l': 0.09347275031685678,
   'm': 0.0,
   'n': 0.8108365019011406,
   'p': 0.0,
   's': 0.0,
   'y': 0.0},
  'gill-attachment': {'a': 0.04594423320659062, 'f': 0.9540557667934094},
  'gill-spacing': {'c': 0.7160963244613435, 'w': 0.28390367553865653},
  'gill-size': {

In [21]:
def bayes_classify(sample):
    labels = dict(priors)
    for feature, value in sample.items():
        for label, prior in labels.items():
            labels[label] = prior * likelihoods[label][feature][value] / marginals[feature][value]            
    return labels

In [24]:
sample = test_df.sample()
sample_class = sample['class'].values[0]
sample = { feature: sample[feature].values[0] for feature in features.keys() }

print('Sample:', sample)
print()
print('Sample Prediction:', bayes_classify(sample))
print('Sample Expected:', sample_class)

Sample: {'cap-shape': 'f', 'cap-surface': 's', 'cap-color': 'n', 'bruises': 'f', 'odor': 's', 'gill-attachment': 'f', 'gill-spacing': 'c', 'gill-size': 'n', 'gill-color': 'b', 'stalk-shape': 't', 'stalk-root': '?', 'stalk-surface-above-ring': 's', 'stalk-surface-below-ring': 's', 'stalk-color-above-ring': 'p', 'stalk-color-below-ring': 'w', 'veil-type': 'p', 'veil-color': 'w', 'ring-number': 'o', 'ring-type': 'e', 'spore-print-color': 'w', 'population': 'v', 'habitat': 'd'}

Sample Prediction: {'e': 0.0, 'p': 11.716687090856444}
Sample Expected: p
