In [183]:
import pandas as pd
import numpy as np
import random as rd
import sklearn as sk

wineDF = pd.read_csv('winequality/winequality-red.csv',sep=';')

class ProcessingData:
    @staticmethod
    def ShuffleData(df: pd.DataFrame):
        for i in range(len(df)):
            rand_i=rd.randint(0,len(df)-1)
            df.loc[i],df.loc[rand_i] = df.loc[rand_i],df.loc[i]
        return df
    @staticmethod
    def SplitData(df:pd.DataFrame,x:int,y:int):
        if x+y>100:
            print("Podano zbyt dużą ilość danych do podziału")
            return
        train_set,test_set = pd.DataFrame(),pd.DataFrame()
        for i in range(len(df)):
            if rd.random()<x/100:
                train_set = train_set.append(df.loc[i])
            else:
                test_set = test_set.append(df.loc[i])
        return train_set,test_set
    @staticmethod
    def NormalizeData(df:pd.DataFrame):
        for atributte in df.columns[:-1]:
            mean = df[atributte].mean()
            stddev = df[atributte].std()
            df[atributte] = (df[atributte]-mean)/stddev
        return df


In [None]:
class KNN:
    @staticmethod
    def distance(x: pd.Series, y:pd.Series, m:int) -> float:
        return sum([abs(x-y)**m for x,y in zip(x,y)])**(1/m)
    @staticmethod
    def fit(sample:pd.Series,df:pd.DataFrame,k:int,m:int):
        df = df.drop(df.columns[-1],axis=1)
        distances = [KNN.distance(sample,x,m) for x in df.values]
        distances = pd.Series(distances,index=df.index)
        distances = distances.sort_values()
        return distances.index[1:k+1]

    @staticmethod
    def classify(sample:pd.Series,df:pd.DataFrame,k:int,m:int):
        classes_dict = {x:0 for x in df[df.columns[-1]].unique()}
        for i in KNN.fit(sample,df,k,m):
            classes_dict[df.loc[i][-1]]+=1
        return max(classes_dict,key=classes_dict.get)

    @staticmethod
    def accuracy(train_df:pd.DataFrame,test_df:pd.DataFrame,k:int,m:int):
        correct = 0
        for i in range(len(test_df)):
            try:
                if KNN.classify(test_df.loc[i],train_df,k,m)==test_df.loc[i][-1]:
                    correct+=1
            except:
                print(i)
        print("Good predicted: ",correct)
        print("Bad predicted: ",len(test_df)-correct)
        print("Accuracy: ",round((correct/len(test_df))*100,2)+"%")
         

In [321]:
# TODO: do poprawy i będzie cacy :)


# class KNN:
#     @staticmethod
#     def distance(x: pd.Series, y:pd.Series, m:int) -> float:
#         return sum([abs(x-y)**m for x,y in zip(x,y)])**(1/m)
#     @staticmethod
#     def get_neighbours(df:pd.DataFrame,x:pd.Series,k:int,m:int) -> list:
#         distances = []
#         for i in range(len(df)):
#             distances.append((KNN.distance(x,df.iloc[i],m),df.iloc[i]))
#         distances.sort(key=lambda x:x[0])
#         return distances[:k]
#     @staticmethod
#     def get_class(neighbours:list) -> int:
#         classes = {}
#         for i in range(len(neighbours)):
#             if neighbours[i][1][-1] not in classes:
#                 classes[neighbours[i][1][-1]] = 1
#             else:
#                 classes[neighbours[i][1][-1]] += 1
#         return max(classes,key=classes.get)
#     @staticmethod
#     def get_accuracy(df:pd.DataFrame,test_set:pd.DataFrame,m:int) -> float:
#         correct = 0
#         for i in range(len(test_set)):
#             neighbours = KNN.get_neighbours(df,test_set.iloc[i],5,m)
#             if KNN.get_class(neighbours) == test_set.iloc[i][-1]:
#                 correct += 1
#         return correct/len(test_set)
   

In [319]:
wineDF = ProcessingData.ShuffleData(wineDF)
wineDF = ProcessingData.NormalizeData(wineDF)
train_set,test_set = ProcessingData.SplitData(wineDF,70,20)

In [320]:
print(KNN.get_accuracy(train_set,test_set,2))

0.8428571428571429
