In [1]:
import numpy as np
import pandas as pd

In [2]:
#Importing and loading the datasets

mush=pd.read_csv("datasets/mushroom.csv")
mush=mush.replace('?',np.nan)
mush.dropna(axis=1,inplace=True)

In [3]:
#Initializing target and features

target='class'
features=mush.columns[mush.columns!=target]

In [4]:
#splitting data into training and testing sets

target_class=mush[target].unique()
test=mush.sample(frac=0.3)
mush=mush.drop(test.index)

In [5]:
# Calculating conditional probabilities
cond_prob = {}
target_class_prob = {}

for t in target_class:
    mush_t = mush[mush[target] == t][features]
    target_class_prob[t] = float(len(mush_t) / len(mush))

    class_prob = {}
    for col in mush_t.columns:
        col_prob = {}
        for val, cnt in mush_t[col].value_counts().items(): 
            pr = cnt / len(mush_t)
            col_prob[val] = pr
        class_prob[col] = col_prob
    cond_prob[t] = class_prob


In [6]:
#defining probability calculation and classify function

def calc_probs(x):
    probs={}
    for t in target_class:
        p=target_class_prob[t]
        
        for col,val in x.items():
            try: p*=cond_prob[t][col][val]
            except: p=0
        probs[t]=p
    return probs

def classify(x):
    probs=calc_probs(x)
    max_prob=0
    max_class=''
    
    for cl,pr in probs.items():
        if pr>max_prob:
            max_prob=pr
            max_class=cl
        return max_class

In [7]:
#model evaluation on training sets

b=[]
for i in mush.index:
    b.append(classify(mush.loc[i,features])==mush.loc[i,target])
    print(sum(b)," correct of ",len(mush))
    print("Accuracy: ",sum(b)/len(mush))

1  correct of  45
Accuracy:  0.022222222222222223
1  correct of  45
Accuracy:  0.022222222222222223
1  correct of  45
Accuracy:  0.022222222222222223
2  correct of  45
Accuracy:  0.044444444444444446
2  correct of  45
Accuracy:  0.044444444444444446
2  correct of  45
Accuracy:  0.044444444444444446
3  correct of  45
Accuracy:  0.06666666666666667
3  correct of  45
Accuracy:  0.06666666666666667
3  correct of  45
Accuracy:  0.06666666666666667
3  correct of  45
Accuracy:  0.06666666666666667
4  correct of  45
Accuracy:  0.08888888888888889
4  correct of  45
Accuracy:  0.08888888888888889
4  correct of  45
Accuracy:  0.08888888888888889
5  correct of  45
Accuracy:  0.1111111111111111
5  correct of  45
Accuracy:  0.1111111111111111
5  correct of  45
Accuracy:  0.1111111111111111
5  correct of  45
Accuracy:  0.1111111111111111
5  correct of  45
Accuracy:  0.1111111111111111
5  correct of  45
Accuracy:  0.1111111111111111
6  correct of  45
Accuracy:  0.13333333333333333
6  correct of  45
Ac

In [9]:
#model evaluation of test set
b=[]
for i in test.index:
    b.append(classify(test.loc[i,features])==test.loc[i,target])
    print(sum(b)," correct of ",len(test))
    print("Accuracy: ",sum(b)/len(test))

0  correct of  20
Accuracy:  0.0
0  correct of  20
Accuracy:  0.0
0  correct of  20
Accuracy:  0.0
0  correct of  20
Accuracy:  0.0
1  correct of  20
Accuracy:  0.05
1  correct of  20
Accuracy:  0.05
1  correct of  20
Accuracy:  0.05
2  correct of  20
Accuracy:  0.1
2  correct of  20
Accuracy:  0.1
2  correct of  20
Accuracy:  0.1
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
3  correct of  20
Accuracy:  0.15
