In [1]:
import pandas as pd
import numpy as np
import random as rd

seedsDF = pd.read_csv('seeds.csv', sep=',')

class ProcessingData:
    @staticmethod
    def ShuffleData(df: pd.DataFrame):
        for i in range(len(df)):
            rand_i=rd.randint(0,len(df)-1)
            #zamiana miejscami liczb
            df.loc[i],df.loc[rand_i] = df.loc[rand_i],df.loc[i]
        return df
    @staticmethod
    def SplitData(df:pd.DataFrame,x:int,y:int):
        #walidacja
        if x+y>100 or x<0 or y<0 or x+y<100:
            print("Podano zbyt dużą/małą ilość danych do podziału")
            return
        train_set,test_set = pd.DataFrame(),pd.DataFrame()
        for i in range(len(df)):
            #uwarunkowanie podzialu
            if rd.random()<x/100:
                train_set = train_set.append(df.loc[i])
            else:
                test_set = test_set.append(df.loc[i])
        return train_set,test_set
    @staticmethod
    def NormalizeData(df:pd.DataFrame):
        for atributte in df.columns[:-1]:
            mean = df[atributte].mean()
            stddev = df[atributte].std()
            #wyliczenie normy
            df[atributte] = (df[atributte]-mean)/stddev
        return df



In [8]:
class NaiveBayes:
    @staticmethod
    def mean(atr):
        return atr.mean()
    @staticmethod
    def stdev(atr):
        return atr.std()
    @staticmethod
    def prob(x,mean,stdev):
        #walidacja wzoru
        if x<mean-(6**(1/2))*stdev:
            return 0
        elif x>mean+(6)**(1/2)*stdev:
            return 0
        elif x>=mean-(6**(1/2)*stdev) and x<mean:
            return (x-mean)/(6*(stdev**2)) + 1/(6**1/2)*stdev
        elif x>mean and x<=mean+(6**1/2)*stdev:
            return (x-mean)/(6*(stdev**2)) - 1/(6**1/2)*stdev
    @staticmethod
    def classify(train_set: pd.DataFrame, sample):
        #podzila na klasy oraz utworzenie slownikow
        varieties = train_set[train_set.columns[-1]].unique()
        mean_std = {}
        probabilities = {}
        #liczenie srednich i odchylen dla kazdej klasy i zapisanie do slownika
        for variety in varieties:
            mean_std[variety] = {}
            for atr in train_set.columns:
                if atr != train_set.columns[-1]:
                    mean_std[variety][atr] = [NaiveBayes.mean(train_set[train_set[train_set.columns[-1]] == variety][atr]),NaiveBayes.stdev(train_set[train_set[train_set.columns[-1]] == variety][atr])]
        #liczenie prawdopodobienstw dla kazdej klasy
        for variety in varieties:
            probabilities[variety] = 1/len(varieties)
            for atr in sample.index:
                if atr != train_set.columns[-1]:
                    probabilities[variety] *= NaiveBayes.prob(sample[atr],mean_std[variety][atr][0],mean_std[variety][atr][1])
        #zwracanie najbardziej prawdopodobnej klasy
        return max(probabilities,key=probabilities.get)
    @staticmethod
    def accuracy(test_set: pd.DataFrame, train_set: pd.DataFrame):
        correct = 0
        # podliczenie poprawnych klas w stosunku do wszystkich
        for i in range(len(test_set)):
            if test_set[train_set.columns[-1]].iloc[i] == NaiveBayes.classify(train_set,test_set.iloc[i]):
                correct += 1
        print("Bad predicted: "+str(len(test_set)-correct))
        return correct/len(test_set)

In [24]:
# dla danych nieznormalizowanych:
seedsDF = pd.read_csv('seeds.csv', sep=',')
seedsDF = ProcessingData.ShuffleData(seedsDF)
train_set,test_set = ProcessingData.SplitData(seedsDF,80,20)
print("Accuracy: "+str(NaiveBayes.accuracy(test_set,train_set)))

Bad predicted: 23
Accuracy: 0.5576923076923077


In [21]:
seeds_normalizedDF = pd.read_csv('seeds.csv', sep=',')
seeds_normalizedDF = ProcessingData.NormalizeData(seeds_normalizedDF)
seeds_normalizedDF = ProcessingData.ShuffleData(seeds_normalizedDF)
train_set2,test_set2 = ProcessingData.SplitData(seeds_normalizedDF,80,20)

print("Accuracy: "+str(NaiveBayes.accuracy(test_set2,train_set2)))

Bad predicted: 21
Accuracy: 0.5531914893617021
