In [1]:
import pandas as pd
import numpy as np
import random as rd
import sklearn as sk


class ProcessingData:
    @staticmethod
    def ShuffleData(df: pd.DataFrame):
        for i in range(len(df)):
            rand_i=rd.randint(0,len(df)-1)
            #zamiana miejscami liczb
            df.loc[i],df.loc[rand_i] = df.loc[rand_i],df.loc[i]
        return df
    @staticmethod
    def SplitData(df:pd.DataFrame,x:int,y:int):
        #walidacja
        if x+y>100 or x<0 or y<0 or x+y<100:
            print("Podano zbyt dużą/małą ilość danych do podziału")
            return
        train_set,test_set = pd.DataFrame(),pd.DataFrame()
        for i in range(len(df)):
            #uwarunkowanie podzialu
            if rd.random()<x/100:
                train_set = train_set.append(df.loc[i])
            else:
                test_set = test_set.append(df.loc[i])
        return train_set,test_set
    @staticmethod
    def NormalizeData(df:pd.DataFrame):
        for atributte in df.columns[:-1]:
            mean = df[atributte].mean()
            stddev = df[atributte].std()
            #wyliczenie normy
            df[atributte] = (df[atributte]-mean)/stddev
        return df


In [3]:
class KNN:
    @staticmethod
    def distance(x: pd.Series, y:pd.Series, m:int) -> float:
              return sum([(abs(xi-yi))
                    for xi,yi in zip(x,y)])
    @staticmethod
    def get_neighbours(df:pd.DataFrame,x:pd.Series,k:int,m:int) -> list:
        distances = []

        # wyliczenie wszystkich odległości od punktu x
        
        for i in range(len(df)):
            distances.append((KNN.distance(x,df.iloc[i],m),df.iloc[i]))
        distances.sort(key=lambda x:x[0])
        return distances[:k]
    @staticmethod
    def get_class(neighbours:list) -> int:
        classes = {}
        # wyliczenie klas dla kazdego sasiada oraz ich ilości a nastepnie wybór klasy
        for i in range(len(neighbours)):
            if neighbours[i][1][-1] not in classes:
                classes[neighbours[i][1][-1]] = 1
            else:
                classes[neighbours[i][1][-1]] += 1
        return max(classes,key=classes.get)
    @staticmethod
    def get_accuracy(df:pd.DataFrame,test_set:pd.DataFrame,m:int,k:int) -> float:
        correct = 0
        for i in range(len(test_set)):
            neighbours = KNN.get_neighbours(df,test_set.iloc[i],k,m)
            if KNN.get_class(neighbours) == test_set.iloc[i][-1]:
                correct += 1
        print("Good predicted: ",correct)
        print("Bad predicted: ",len(test_set)-correct)
        return f"{round((correct/len(test_set))*100,2)}%"
   

In [9]:
seedsDF = pd.read_csv("seeds.csv",sep=",")
seedsDF = ProcessingData.ShuffleData(seedsDF)
seedsDF = ProcessingData.NormalizeData(seedsDF)
train_set,test_set = ProcessingData.SplitData(seedsDF,70,30)

print(KNN.get_accuracy(train_set,test_set,2,4))

Good predicted:  61
Bad predicted:  1
98.39%


In [15]:
seedsDF1 = pd.read_csv("seeds.csv",sep=",")
seedsDF1 = ProcessingData.ShuffleData(seedsDF1)
train_set1,test_set1 = ProcessingData.SplitData(seedsDF1,70,30)

print(KNN.get_accuracy(train_set1,test_set1,2,4))

Good predicted:  59
Bad predicted:  2
96.72%
