In [345]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.metrics import confusion_matrix, precision_recall_curve, precision_score, recall_score, f1_score

In [3]:
header = ['sepal_length','sepal_width','petal_length','petal_width','label']

In [6]:
feature_names = header[:-1]
feature_names

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [9]:
# Load the iris measurements, naming the columns from `header`.
# NOTE(review): passing names= suggests the file has no header row of its own — confirm.
df = pd.read_csv('../data/iris.data',names=header)

In [10]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [28]:
# Distinct class names, in order of first appearance in df (Series.unique does not sort).
# Later cells index into this array by position, so its order matters.
label_names = df.label.unique()
label_names

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [320]:
mapping = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}

In [321]:
# Preview of the label -> ordinal encoding; the result is only displayed, not stored.
df['label'].apply(lambda x: mapping[x])

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: label, Length: 150, dtype: int64

In [324]:
# Store the ordinal class code next to the string label (adds a column to df in place).
# Series.map with a dict yields NaN for any label that is absent from `mapping`.
df['label_ordinal'] = df['label'].map(mapping)

In [339]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,label_ordinal
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2


In [325]:
# Fit a normal distribution to every feature within every class and collect
# the (label, feature) keys alongside the fitted [mean, std] pairs.
index_arr, stats_arr = [], []

for label in label_names:
    in_class = df['label'] == label
    for feature in feature_names:
        fitted = stats.norm.fit(df.loc[in_class, feature])
        index_arr.append((label, feature))
        stats_arr.append(list(fitted))

In [326]:
index_arr

[('Iris-setosa', 'sepal_length'),
 ('Iris-setosa', 'sepal_width'),
 ('Iris-setosa', 'petal_length'),
 ('Iris-setosa', 'petal_width'),
 ('Iris-versicolor', 'sepal_length'),
 ('Iris-versicolor', 'sepal_width'),
 ('Iris-versicolor', 'petal_length'),
 ('Iris-versicolor', 'petal_width'),
 ('Iris-virginica', 'sepal_length'),
 ('Iris-virginica', 'sepal_width'),
 ('Iris-virginica', 'petal_length'),
 ('Iris-virginica', 'petal_width')]

In [327]:
stats_arr

[[5.006, 0.3489469873777391],
 [3.418, 0.37719490982779713],
 [1.464, 0.17176728442867112],
 [0.244, 0.10613199329137281],
 [5.936, 0.5109833656783751],
 [2.7700000000000005, 0.31064449134018135],
 [4.26, 0.4651881339845203],
 [1.3259999999999998, 0.19576516544063705],
 [6.587999999999998, 0.6294886813914926],
 [2.974, 0.3192553836664309],
 [5.5520000000000005, 0.546347874526844],
 [2.0260000000000002, 0.2718896835115301]]

In [328]:
multi_index = pd.MultiIndex.from_tuples(index_arr, names=['label','feature'])

In [329]:
df_cond = pd.DataFrame(stats_arr,index=multi_index,columns=['mean','std'])

In [330]:
# Reindex to multi_index, the same index df_cond was constructed with.
# NOTE(review): this looks like a no-op — confirm before removing.
df_cond = df_cond.reindex(multi_index)

In [331]:
df_cond

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
label,feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,sepal_length,5.006,0.348947
Iris-setosa,sepal_width,3.418,0.377195
Iris-setosa,petal_length,1.464,0.171767
Iris-setosa,petal_width,0.244,0.106132
Iris-versicolor,sepal_length,5.936,0.510983
Iris-versicolor,sepal_width,2.77,0.310644
Iris-versicolor,petal_length,4.26,0.465188
Iris-versicolor,petal_width,1.326,0.195765
Iris-virginica,sepal_length,6.588,0.629489
Iris-virginica,sepal_width,2.974,0.319255


In [332]:
df_cond.xs('Iris-setosa',level=0)['mean']

feature
sepal_length    5.006
sepal_width     3.418
petal_length    1.464
petal_width     0.244
Name: mean, dtype: float64

In [333]:
# Fit one normal distribution per feature over the whole column (all classes
# together); each value is the (mean, std) tuple returned by the fit.
dic = {feature: stats.norm.fit(df[feature]) for feature in feature_names}
dic

{'sepal_length': (5.843333333333334, 0.8253012917851409),
 'sepal_width': (3.0540000000000003, 0.4321465800705435),
 'petal_length': (3.758666666666666, 1.7585291834055212),
 'petal_width': (1.1986666666666668, 0.7606126185881716)}

In [334]:
df_uncond = pd.DataFrame.from_dict(dic, orient='index', columns=['mean', 'std'])
df_uncond

Unnamed: 0,mean,std
sepal_length,5.843333,0.825301
sepal_width,3.054,0.432147
petal_length,3.758667,1.758529
petal_width,1.198667,0.760613


In [335]:
def predict(vec):
    """Classify one sample with the Gaussian naive Bayes model fitted above.

    For every class C in ``label_names`` the score
    ``log P(C) + sum_i log f(x_i | C)`` is computed, where each
    ``f(x_i | C)`` is a normal density parameterised by the per-class
    mean/std stored in ``df_cond``. The class with the highest score wins.

    Parameters
    ----------
    vec : sequence of float
        Feature values of a single sample, in the same order as the feature
        rows of ``df_cond`` for each class.

    Returns
    -------
    The element of ``label_names`` whose score is largest.
    """
    # Class priors P(C) do not depend on the sample or on the loop variable,
    # so they are computed once rather than once per candidate class.
    priors = df['label'].value_counts() / df.shape[0]

    log_scores = []
    for label in label_names:
        params = df_cond.xs(label, level=0)

        # Sum of log-densities instead of a product of densities: for samples
        # far from the training data the product underflows to 0.0, which
        # makes every class score 0 (or NaN after normalisation) and lets
        # argmax silently fall back to the first class.
        log_likelihood = stats.norm.logpdf(
            vec, loc=params['mean'], scale=params['std']
        ).sum()

        log_scores.append(log_likelihood + np.log(priors[label]))

    # A normalising evidence term would be identical for every class, so it
    # cannot change which class has the largest score and is not computed.
    return label_names[int(np.argmax(log_scores))]

In [336]:
x_vec = [5.9,3.0,5.1,1.8]

In [359]:
predict(x_vec), mapping[predict(x_vec)]

('Iris-virginica', 2)

In [360]:
# Classify every row of the dataset. The feature matrix is taken directly from
# df (columns in feature_names order) so the cell runs on a fresh kernel
# without relying on a variable defined outside the notebook.
predictions = [mapping[predict(sample)] for sample in df[feature_names].values]

In [361]:
predictions

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [362]:
targets = df['label_ordinal'].values

In [363]:
targets

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [369]:
# Macro-averaged F1, precision and recall of the predictions against the
# ordinal targets; all three metrics are called with identical arguments.
f1, precision, recall = (
    metric(y_true=targets, y_pred=predictions, average='macro')
    for metric in (f1_score, precision_score, recall_score)
)

In [370]:
print(f"Scores:\nf1: {f1}\nprecision:{precision}\nrecall:{recall}")

Scores:
f1: 0.96
precision:0.96
recall:0.96


In [371]:
# Confusion matrix over the ordinal class codes (rows: true class, columns: predicted class);
# labels= pins the row/column order to codes 0, 1, 2.
conf_matrix = confusion_matrix(y_true=targets, y_pred=predictions, labels=[0,1,2])

In [372]:
# Name the rows/columns from `mapping`, sorted by ordinal code, so the names
# are guaranteed to line up with the code-ordered confusion matrix. Using
# label_names (order of first appearance in the file) would only match if the
# file happens to list the classes in code order.
ordered_names = sorted(mapping, key=mapping.get)
conf_matrix_df = pd.DataFrame(conf_matrix, columns=ordered_names, index=ordered_names)

In [373]:
conf_matrix_df

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,50,0,0
Iris-versicolor,0,47,3
Iris-virginica,0,3,47
