In [3]:
import pandas as pd
import random
iris = pd.read_csv("iris.csv")

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [2]:
class ProcessingData:
    
    @staticmethod
    def shuffle(data: pd.DataFrame) -> pd.DataFrame:
        for i in range(len(data)):
            j = random.randint(0,len(data)-1)
            data.iloc[i],data.iloc[j] = data.iloc[j],data.iloc[i]
        return data
    @staticmethod
    def normalize(data: pd.DataFrame,columns: list) -> pd.DataFrame:
        for column in columns:
            x = data[column].max()
            y = data[column].min()
            data[column] = (data[column]-y)/(x-y)
        return data      
    @staticmethod
    def split(data: pd.DataFrame) -> pd.DataFrame:
        train_set = pd.DataFrame(columns=data.columns)
        test_set = pd.DataFrame(columns=data.columns)
        for i in range(len(data)):
            if random.random() < 0.7:
                train_set = train_set.append(data.iloc[i])
            else:
                test_set = test_set.append(data.iloc[i])
        return train_set,test_set


In [29]:
class KMM:
    @staticmethod
    def distance(x: pd.Series,y: pd.Series,m: int) -> float:
        return sum([abs(x-y)**m for x,y in zip(x,y)])**(1/m)
    
    @staticmethod
    def q(s: pd.Series, dataset: pd.DataFrame, m: int, k: int) -> float:
        types = {}
        for i in pd.unique(dataset["variety"]):
            types[i] = 0
        result = []
        for sample in dataset.values:
            result.append(KMM.distance(s[:-1],sample[:-1],m))
        for i in range(len(result)):
            types[dataset.iloc[i]["variety"]] += result[i]
        return min(types, key=types.get)

    @staticmethod
    def compare_accuracy(dataset: pd.DataFrame, m: int, k: int) -> float:
        correct = 0
        for sample in dataset.values:
            if KMM.q(sample,dataset,m,k) == sample[-1]:
                correct += 1
        return correct/len(dataset)

In [37]:
#   q(s,dataset,m,k) gdzie m to metryka a k to ilosc sasiadow

#   a) odleglosc od kazdego elementu d
#   b) sortujemy po odleglosciach
#   c) do ktorej z klas naleza elementy, jako najliczniejsza dajemy ten najbardziej rozpoznany

#przetestuj dla każdego rekordu ze zbioru walidacyjnego przy uzyciu zbioru treningowego
#nastepnie wyznacz dokladnosc przy uzyciu k=2, k=3, k=4

dataset = ProcessingData.shuffle(iris)
dataset = ProcessingData.normalize(dataset,['sepal.length','sepal.width','petal.length','petal.width'])
train_set,test_set = ProcessingData.split(dataset)

for k in [2,3,4]:
    print("Accuracy for k =",k,":",KMM.compare_accuracy(test_set,4,k))


Accuracy for k = 2 : 0.926829268292683
Accuracy for k = 3 : 0.926829268292683
Accuracy for k = 4 : 0.926829268292683
