# Naive Bayes Classification of Mushrooms

In [186]:
import pandas as pd
import numpy as np

# Fraction of rows randomly sampled into the training set; the remaining rows are held out.
TRAIN_FRAC = 0.75

Read the data, then split it into a training set and a held-out test set.

In [187]:
# Load the full mushroom table and split it into training and held-out rows.
# The sampling is seeded so the split -- and every probability estimated from
# it further down -- is identical on each Restart & Run All.
SPLIT_SEED = 42

df = pd.read_csv('data/mushrooms.csv')
train_df = df.sample(frac=TRAIN_FRAC, random_state=SPLIT_SEED)
test_df = df.drop(train_df.index)  # every row that was not drawn for training

# The two subsets must partition the data: nothing lost, nothing shared.
assert len(train_df) + len(test_df) == len(df)

df.sample(5, random_state=SPLIT_SEED)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
276,e,f,y,n,t,a,f,c,b,w,...,y,w,w,p,w,o,p,n,y,g
4425,p,f,f,y,f,f,f,c,b,p,...,k,p,n,p,w,o,l,h,v,d
7738,p,k,y,n,f,s,f,c,n,b,...,k,w,p,p,w,o,e,w,v,p
5607,p,f,y,y,f,f,f,c,b,g,...,k,n,n,p,w,o,l,h,v,g
2188,e,x,f,g,t,n,f,c,b,w,...,s,g,p,p,w,o,p,k,v,d


In [188]:
# Class labels found in the 'class' column.
# (This name was previously misspelled `lables`, which left `labels` undefined
# for the cells below on a fresh kernel.)
labels = ['e', 'p']

# Vocabulary of every categorical feature: column name -> list of the category
# codes that the probability tables below are built over.
features = {
    'cap-shape': ['b', 'c', 'f', 'k', 's', 'x'],
    'cap-surface': ['f', 'g', 's', 'y'],
    'cap-color': ['b', 'c', 'e', 'g', 'n', 'p', 'r', 'u', 'w', 'y'],
    'bruises': ['f', 't'],
    'odor': ['a', 'c', 'f', 'l', 'm', 'n', 'p', 's', 'y'],
    'gill-attachment': ['a', 'f'],
    'gill-spacing': ['c', 'w'],
    'gill-size': ['b', 'n'],
    'gill-color': ['b', 'e', 'g', 'h', 'k', 'n', 'o', 'p', 'r', 'u', 'w', 'y'],
    'stalk-shape': ['e', 't'],
    'stalk-root': ['?', 'b', 'c', 'e', 'r'],
    'stalk-surface-above-ring': ['f', 'k', 's', 'y'],
    'stalk-surface-below-ring': ['f', 'k', 's', 'y'],
    'stalk-color-above-ring': ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y'],
    'stalk-color-below-ring': ['b', 'c', 'e', 'g', 'n', 'o', 'p', 'w', 'y'],
    'veil-type': ['p'],
    'veil-color': ['n', 'o', 'w', 'y'],
    'ring-number': ['n', 'o', 't'],
    'ring-type': ['e', 'f', 'l', 'n', 'p'],
    'spore-print-color': ['b', 'h', 'k', 'n', 'o', 'r', 'u', 'w', 'y'],
    'population': ['a', 'c', 'n', 's', 'v', 'y'],
    'habitat': ['d', 'g', 'l', 'm', 'p', 'u', 'w']
}

Prior probabilities

In [189]:
# Prior P(label): the share of training rows that carry each class label.
priors = {
    label: int(train_df['class'].eq(label).sum()) / len(train_df)
    for label in labels
}
print(priors)

{'e': 0.5181355654029214, 'p': 0.4818644345970786}


Marginal probabilities

In [194]:
# Marginal P(feature = value): the share of training rows showing each category,
# regardless of class.
marginals = {
    feature: {
        value: int(train_df[feature].eq(value).sum()) / len(train_df)
        for value in values
    }
    for feature, values in features.items()
}

# Sanity check: the categories of each feature should account for (about) all rows.
{name: sum(distribution.values()) for name, distribution in marginals.items()}

{'cap-shape': 1.0,
 'cap-surface': 1.0,
 'cap-color': 1.0,
 'bruises': 1.0,
 'odor': 1.0,
 'gill-attachment': 1.0,
 'gill-spacing': 1.0,
 'gill-size': 1.0,
 'gill-color': 1.0,
 'stalk-shape': 1.0,
 'stalk-root': 1.0,
 'stalk-surface-above-ring': 1.0,
 'stalk-surface-below-ring': 1.0,
 'stalk-color-above-ring': 1.0,
 'stalk-color-below-ring': 1.0,
 'veil-type': 1.0,
 'veil-color': 1.0,
 'ring-number': 1.0,
 'ring-type': 1.0,
 'spore-print-color': 1.0,
 'population': 1.0,
 'habitat': 1.0000000000000002}

In [198]:
# Likelihood P(feature = value | label): among the training rows of one class,
# the share that show the given category.
likelihoods = {
    label: {
        feature: {
            value: int((train_df['class'].eq(label) & train_df[feature].eq(value)).sum())
            / int(train_df['class'].eq(label).sum())
            for value in values
        }
        for feature, values in features.items()
    }
    for label in labels
}
likelihoods

{'e': {'cap-shape': {'b': 0.09724421919543871,
   'c': 0.0,
   'f': 0.3864428254672157,
   'k': 0.04909724421919544,
   's': 0.007918910357934749,
   'x': 0.4592968007602154},
  'cap-surface': {'f': 0.3718720304086158,
   'g': 0.0,
   's': 0.26575863161229013,
   'y': 0.3623693379790941},
  'cap-color': {'b': 0.010452961672473868,
   'c': 0.007602153943617358,
   'e': 0.15077605321507762,
   'g': 0.24611973392461198,
   'n': 0.30408615774469433,
   'p': 0.010136205258156478,
   'r': 0.003484320557491289,
   'u': 0.003801076971808679,
   'w': 0.17389927146024706,
   'y': 0.08964206525182135},
  'bruises': {'f': 0.33797909407665505, 't': 0.662020905923345},
  'odor': {'a': 0.09407665505226481,
   'c': 0.0,
   'f': 0.0,
   'l': 0.09471016788089959,
   'm': 0.0,
   'n': 0.8112131770668356,
   'p': 0.0,
   's': 0.0,
   'y': 0.0},
  'gill-attachment': {'a': 0.04529616724738676, 'f': 0.9547038327526133},
  'gill-spacing': {'c': 0.7218878682293316, 'w': 0.27811213177066835},
  'gill-size': {'b

In [192]:
def bayes_classify(sample):
    """Return the normalized naive-Bayes posterior of every class for one sample.

    Starting from the module-level ``priors``, each class score is multiplied
    by ``likelihoods[label][feature][value] / marginals[feature][value]`` for
    every feature in ``sample``; the scores are then rescaled to sum to 1.

    Parameters
    ----------
    sample : dict
        Maps feature name -> observed category code.

    Returns
    -------
    dict
        Maps class label -> posterior probability. When every class score
        collapses to zero (each class has a zero likelihood for some observed
        value), the evidence is contradictory and a copy of ``priors`` is
        returned instead.

    Raises
    ------
    KeyError
        If ``sample`` holds a feature or a category code that is missing from
        the probability tables.
    """
    posteriors = dict(priors)  # work on a copy so the global priors stay untouched
    for feature, value in sample.items():
        evidence = marginals[feature][value]
        if evidence == 0:
            # The category never occurred in the training rows, so it carries
            # no information (and dividing by it would raise ZeroDivisionError).
            continue
        for label in priors:
            posteriors[label] = posteriors[label] * likelihoods[label][feature][value] / evidence
    norm = sum(posteriors.values())
    if norm == 0:
        # No class is compatible with the sample; avoid 0/0 and fall back to the priors.
        return dict(priors)
    return {label: score / norm for label, score in posteriors.items()}

# Draw one held-out mushroom at random and separate its true class from its features.
held_out_row = test_df.sample()
sample_class = held_out_row['class'].values[0]
sample = {name: held_out_row[name].values[0] for name in features}

# Show the feature values, the predicted class probabilities, and the true class.
print(sample)
print(bayes_classify(sample))
print(sample_class)

{'cap-shape': 'f', 'cap-surface': 'y', 'cap-color': 'y', 'bruises': 'f', 'odor': 'f', 'gill-attachment': 'f', 'gill-spacing': 'c', 'gill-size': 'b', 'gill-color': 'p', 'stalk-shape': 'e', 'stalk-root': 'b', 'stalk-surface-above-ring': 'k', 'stalk-surface-below-ring': 'k', 'stalk-color-above-ring': 'b', 'stalk-color-below-ring': 'n', 'veil-type': 'p', 'veil-color': 'w', 'ring-number': 'o', 'ring-type': 'l', 'spore-print-color': 'h', 'population': 'y', 'habitat': 'g'}
{'e': 0.0, 'p': 1.0}
p
