In [18]:
import pandas as pd
import numpy as np
import random

class ProcessingData:
    """Stateless helpers that prepare a DataFrame for training and evaluation.

    Every helper returns new frame(s) and leaves its input untouched, so a
    notebook cell calling them can be re-run without compounding effects.
    All randomness comes from the standard ``random`` module; call
    ``random.seed(...)`` beforehand for reproducible results.
    """

    @staticmethod
    def shuffle(data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` with its rows in a uniformly random order.

        Args:
            data: Frame whose rows should be permuted.

        Returns:
            A new frame with the same columns, dtypes and index labels as
            ``data``; only the row contents are reordered.
        """
        # Drawing a full permutation at once is unbiased. Swapping each row
        # with a partner picked from the *whole* range (the naive shuffle)
        # does not produce every ordering with equal probability.
        order = random.sample(range(len(data)), len(data))
        shuffled = data.iloc[order].copy()
        # Index labels stay in their original positions; only values move.
        shuffled.index = data.index
        return shuffled

    @staticmethod
    def normalize(data: pd.DataFrame, columns: list) -> pd.DataFrame:
        """Min-max scale the given columns into the range [0, 1].

        Args:
            data: Source frame (not modified).
            columns: Names of the numeric columns to rescale.

        Returns:
            A copy of ``data`` in which each listed column has been replaced
            by ``(value - min) / (max - min)``. A column whose values are all
            equal is mapped to 0.0 instead of producing NaN from 0/0.
        """
        result = data.copy()
        for column in columns:
            low = result[column].min()
            span = result[column].max() - low
            if span == 0:
                # Constant column: every value equals the minimum, so the
                # subtraction yields zeros (missing values stay missing).
                result[column] = (result[column] - low).astype(float)
            else:
                result[column] = (result[column] - low) / span
        return result

    @staticmethod
    def split(data: pd.DataFrame, train_fraction: float = 0.7) -> tuple:
        """Randomly partition the rows of ``data`` into train and test frames.

        Each row is assigned independently: it goes to the training frame
        with probability ``train_fraction`` and to the test frame otherwise,
        so the resulting sizes vary from call to call.

        Args:
            data: Frame to partition (not modified).
            train_fraction: Per-row probability of landing in the training
                frame. Must lie in [0, 1].

        Returns:
            ``(train_set, test_set)``, two new frames that keep the columns,
            dtypes and index labels of ``data``.

        Raises:
            ValueError: If ``train_fraction`` is outside [0, 1].
        """
        if not 0 <= train_fraction <= 1:
            raise ValueError("train_fraction must be between 0 and 1")
        # Collect row positions first and slice once at the end. Growing a
        # frame row by row is quadratic, and DataFrame.append no longer
        # exists in pandas >= 2.0.
        train_rows = []
        test_rows = []
        for position in range(len(data)):
            if random.random() < train_fraction:
                train_rows.append(position)
            else:
                test_rows.append(position)
        return data.iloc[train_rows].copy(), data.iloc[test_rows].copy()


In [19]:
# Load the iris measurements from the relative path "iris.csv" into a DataFrame.
irisdf =  pd.read_csv("iris.csv")

In [21]:
class NaiveBayes:
    """Gaussian Naive Bayes over frames whose class label is in "variety".

    Every column other than "variety" is treated as a numeric feature that is
    normally distributed within each class. All classes get the same prior
    (1 / number of classes), independent of how often they occur.
    """

    @staticmethod
    def mean(atr):
        """Return ``atr.mean()``, the average of one feature column."""
        return atr.mean()

    @staticmethod
    def stdev(atr):
        """Return ``atr.std()``, the standard deviation of one feature column."""
        return atr.std()

    @staticmethod
    def prob(x, mean, stdev):
        """Evaluate the normal density N(mean, stdev**2) at ``x``.

        Returns 0 when ``stdev`` is zero, negative or NaN (NaN arises for a
        class with a single training row), because no density can be formed.
        """
        # `not stdev > 0` is also true for NaN, which `stdev == 0` would miss.
        if not stdev > 0:
            return 0
        exponent = np.exp(-(x-mean)**2/(2*stdev**2))
        return (1/(np.sqrt(2*np.pi)*stdev))*exponent

    @staticmethod
    def _log_prob(x, mean, stdev):
        """Natural log of ``prob(x, mean, stdev)``; ``-inf`` where prob is 0."""
        if not stdev > 0:
            return -np.inf
        # Computed analytically so that samples far from the mean keep a
        # finite, comparable score instead of underflowing to log(0).
        return -np.log(stdev) - 0.5*np.log(2*np.pi) - ((x-mean)/stdev)**2/2

    @staticmethod
    def _fit(train_set: pd.DataFrame) -> dict:
        """Return ``{variety: {feature: (mean, stdev)}}`` for ``train_set``."""
        features = [atr for atr in train_set.columns if atr != "variety"]
        stats = {}
        # unique() keeps first-appearance order, which decides ties in max().
        for variety in train_set["variety"].unique():
            rows = train_set[train_set["variety"] == variety]
            stats[variety] = {
                atr: (NaiveBayes.mean(rows[atr]), NaiveBayes.stdev(rows[atr]))
                for atr in features
            }
        return stats

    @staticmethod
    def _classify_with(stats: dict, sample):
        """Pick the class with the highest log-posterior for ``sample``."""
        scores = {}
        for variety, per_feature in stats.items():
            # Uniform prior over the classes seen during fitting.
            score = -np.log(len(stats))
            for atr in sample.index:
                if atr != "variety":
                    mu, sigma = per_feature[atr]
                    score += NaiveBayes._log_prob(sample[atr], mu, sigma)
            scores[variety] = score
        return max(scores, key=scores.get)

    @staticmethod
    def classify(train_set: pd.DataFrame, sample):
        """Predict the variety of one sample.

        Args:
            train_set: Labelled rows; must contain a "variety" column.
            sample: Series indexed by feature name. A "variety" entry, if
                present, is ignored.

        Returns:
            The label from ``train_set["variety"]`` with the highest
            posterior. Ties go to the class that appears first in
            ``train_set``.

        Raises:
            ValueError: If ``train_set`` has no rows (no class to choose).
            KeyError: If ``sample`` has a feature that ``train_set`` lacks.
        """
        return NaiveBayes._classify_with(NaiveBayes._fit(train_set), sample)

    @staticmethod
    def accuracy(test_set: pd.DataFrame, train_set: pd.DataFrame):
        """Classify every row of ``test_set`` and return the hit ratio.

        Prints one line per misclassified row and a final count of misses.

        Args:
            test_set: Labelled rows to evaluate.
            train_set: Labelled rows used to estimate the class statistics.

        Returns:
            Correct predictions divided by ``len(test_set)``, or 0.0 when
            ``test_set`` is empty.
        """
        total = len(test_set)
        # Fit once; the statistics depend only on train_set, not on the row.
        stats = NaiveBayes._fit(train_set) if total else {}
        correct = 0
        for i in range(total):
            row = test_set.iloc[i]
            predicted = NaiveBayes._classify_with(stats, row)
            if row["variety"] == predicted:
                correct += 1
            else:
                # str() keeps the report working for non-string labels.
                print("Sample: "+str(row["variety"])+" classified as: "+str(predicted))
        print("Bad predicted: "+str(total-correct))
        return correct/total if total else 0.0




        


        

In [26]:
# Seed Python's RNG so the shuffle and the train/test split below give the
# same result on every Restart & Run All.
SEED = 42
random.seed(SEED)

irisdf=ProcessingData.shuffle(irisdf)
irisdf = ProcessingData.normalize(irisdf,["sepal.length","sepal.width","petal.length","petal.width"])
train_set,test_set = ProcessingData.split(irisdf)

# Every row must land in exactly one of the two partitions.
assert len(train_set) + len(test_set) == len(irisdf)

In [29]:
# Evaluate on the test rows. NaiveBayes.accuracy prints each misclassified
# sample and the miss count itself, so those lines appear before "Accuracy:".
print("Accuracy:",NaiveBayes.accuracy(test_set,train_set))

Sample: Virginica classified as: Versicolor
Sample: Virginica classified as: Versicolor
Bad predicted: 2
Accuracy: 0.9574468085106383
