In [8]:
import numpy as np
import pandas as pd
from math import sqrt, exp, pi
from sklearn.model_selection import train_test_split
from sklearn import metrics

data = pd.read_csv('iris.data.csv')
print('training data set sample')
print(data.sample(5))


training data set sample
     sepal_length  sepal_width  petal_length  petal_width       Class_name
70            5.9          3.2           4.8          1.8  Iris-versicolor
105           7.6          3.0           6.6          2.1   Iris-virginica
18            5.7          3.8           1.7          0.3      Iris-setosa
146           6.3          2.5           5.0          1.9   Iris-virginica
106           4.9          2.5           4.5          1.7   Iris-virginica


In [9]:
def labels_to_num(data):
    # convert strings to numbers
    data.replace(['Iris-versicolor','Iris-setosa','Iris-virginica'], [0,1,2], inplace=True)

print('\nreplaced labels with numbers')
labels_to_num(data)
print(data.sample(5))

# split data by class
def split_by_class(data):
    # split data by class
    classes = dict()
    for row in data:
        if(row[-1] not in classes.keys()):
            classes[row[-1]] = list()
            
        classes[row[-1]].append(row)

    return classes


replaced labels with numbers
     sepal_length  sepal_width  petal_length  petal_width  Class_name
24            4.8          3.4           1.9          0.2           1
101           5.8          2.7           5.1          1.9           2
7             5.0          3.4           1.5          0.2           1
73            6.1          2.8           4.7          1.2           0
125           7.2          3.2           6.0          1.8           2


In [10]:
# get mean, std and size
def get_info(data):
    # get data
    info = [(np.mean(col), np.std(col), len(col)) for col in zip(*data)]
    del info[-1] # remove target coloumn
    return info

# get mean, std and size by class
def get_info_by_class(data):
    classData = split_by_class(data)
    info = dict()
    for classVal, rows in classData.items():
        info[classVal] = get_info(rows)
    return info


In [11]:
def calc_prob(x, mean, stdev):
	exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
	return (1 / (sqrt(2 * pi) * stdev)) * exponent

def predict(info, dataset):
    outProbs = list()

    for data in dataset:
        total_rows = sum([info[label][0][2] for label in info])
        probs = dict()

        for classVal, class_info in info.items():
            probs[classVal] = info[classVal][0][2]/float(total_rows)
            for i in range(len(class_info)):
                mean, stdev, _ = class_info[i]
                probs[classVal] *= calc_prob(data[i], mean, stdev)

        outProbs.append(probs)

    preds = list()
    for prob in outProbs:
        preds.append(max(prob, key=prob.get))
    return preds

In [12]:
# split training and testing data
X_train, X_test = train_test_split(data,test_size=0.2)

info = get_info_by_class(X_train.values)
y_test = X_test.iloc[:,-1].values
X_test = X_test.iloc[:,0:-1]
preds = predict(info, X_test.values)
 
print('accuracy', metrics.accuracy_score(y_test,preds))
print('confusion matrix')
print(metrics.confusion_matrix(y_test, preds))
print('classification report')
print(metrics.classification_report(y_test, preds))

accuracy 0.9
confusion matrix
[[ 7  0  2]
 [ 0 11  0]
 [ 1  0  9]]
classification report
              precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       1.00      1.00      1.00        11
           2       0.82      0.90      0.86        10

    accuracy                           0.90        30
   macro avg       0.90      0.89      0.89        30
weighted avg       0.90      0.90      0.90        30

