In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:

def chargerData(path):
    data = []
    with open(path, "r") as file:
        for line in file:
            data.append(line.split(","))
    return data


def classifyData(data):
    newData = np.array(data)
    #remove the last column
    newData = newData[:, :-1]
    return newData[:-1].astype("float")

In [3]:
data = chargerData("dataset.txt")
data = classifyData(data)
pd.DataFrame(data)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
144,6.7,3.3,5.7,2.5
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0


In [4]:
def manhattan(A, B):
    return np.sum(np.abs(A - B))
manhattan(data[0], data[1])

0.6999999999999993

In [5]:
def euclidean(A, B):
    return np.sqrt(np.sum((A - B) ** 2))
euclidean(data[0], data[1])

0.5385164807134502

In [6]:
def minkowski(A, B,p=3):
    return np.sum(np.abs(A - B) ** p) ** (1 / p)
minkowski(data[0], data[1], 3)

0.5104468722001463

In [7]:
def cosine(A, B):
    nom = np.sum(A * B)
    denom = np.sqrt(np.sum(A ** 2)) * np.sqrt(np.sum(B ** 2))
    return 1 - (nom / denom)
cosine(data[0], data[1])

0.0014208364959781283

In [8]:
def hamming(A,B):
    i = 0
    for x in range(len(A)):
        if A[x] != B[x]:
            i += 1
    return i
hamming(data[0], data[1])

2

In [9]:
def sort(data,new_instance, distance):
    # return sorted(data, key=lambda x: distance(x[0], x[1]))
    d = []
    for i,x in enumerate(data):
        d.append([i, distance(x, new_instance)])
    d = sorted(d, key=lambda x: x[1])
    return d

In [10]:
def knn(data,instance, k, distance):
    res = sort(data, instance, distance)
    return res[:k]

result1 = knn(data, [5.2,2.0,1.41,0.25], 3, manhattan)
for x in result1:
    print(data[x[0]], x[1])
print('-------------------')
result2 = knn(data, [5.2,3.5,1.41,0.25], 3, euclidean)
for x in result2:
    print(data[x[0]], x[1])
print('-------------------')
result3 = knn(data, [5.2,3.5,1.41,0.25], 3, minkowski)
for x in result3:
    print(data[x[0]], x[1])
print('-------------------')
result4 = knn(data, [5.2,3.5,1.41,0.25], 3, hamming)
for x in result4:
    print(data[x[0]], x[1])

[4.5 2.3 1.3 0.3] 1.16
[4.9 3.  1.4 0.2] 1.3599999999999999
[5.  3.  1.6 0.2] 1.4400000000000004
-------------------
[5.2 3.5 1.5 0.2] 0.10295630140987007
[5.2 3.4 1.4 0.2] 0.1122497216032183
[5.1 3.5 1.4 0.2] 0.11224972160321871
-------------------
[5.2 3.5 1.5 0.2] 0.09487518233801294
[5.2 3.4 1.4 0.2] 0.10403499808305595
[5.1 3.5 1.4 0.2] 0.10403499808305637
-------------------
[5.2 3.5 1.5 0.2] 2
[5.1 3.5 1.4 0.2] 3
[5.1 3.5 1.4 0.3] 3


# Apprentissage


In [11]:
def chargerData2(path):
    data = []
    with open(path, "r") as file:
        for line in file:
            data.append(line.split(","))
    return data


def classifyData2(data):
    newData = np.array(data)

    #remove the last column
    # newData = newData[:, :-1]
    for line in newData:
        #remove the \n
        line[-1] = line[-1][:-1]
        if line[-1] == 'Iris-setosa':
            line[-1] = 0
        elif line[-1] == 'Iris-versicolor':
            line[-1] = 1
        elif line[-1] == 'Iris-virginica':
            line[-1] = 2
    return newData[:-1].astype("float")

In [12]:
data2 = chargerData('dataset.txt')
data2 = classifyData2(data2)

In [13]:
#split the data
def splitData(data):
    #shake the data
    classes = {}
    for line in data:
        if line[-1] not in classes:
            classes[line[-1]] = []
        classes[line[-1]].append(line)
    #shuffle every class
    for key in classes:
        np.random.shuffle(classes[key])
    #split the data
    train, test = [], []
    for key in classes:
        train += classes[key][:int(len(classes[key]) * 0.6)]
        test += classes[key][int(len(classes[key]) * 0.6):]
    return train, test
train, test = splitData(data2)


In [14]:
#calculate the accuracy with the test data
def accuracy(train, test, k, distance):
    correct = 0
    for x in test:
        result = knn(train, x, k, distance)
        if x[-1] == train[result[0][0]][-1]:
            # print(x, train[result[0][0]])
            correct += 1
    return correct / len(test)*100


In [16]:
print('manhattan')
for k in range(3, 15):
    acc = accuracy(train, test, k, manhattan)
    print('k=', k, 'accuracy=', acc, '%')
print('-------------------')
print('euclidean')
for k in range(3, 15):
    acc = accuracy(train, test, k, euclidean)
    print('k=', k, 'accuracy=', acc, '%')
print('-------------------')
print('minkowski')
for k in range(3, 15):
    acc = accuracy(train, test, k, minkowski)
    print('k=', k, 'accuracy=', acc, '%')
print('-------------------')
print('cosine')
for k in range(3, 15):
    acc = accuracy(train, test, k, cosine)
    print('k=', k, 'accuracy=', acc, '%')
print('-------------------')
print('hamming')
for k in range(3, 15):
    acc = accuracy(train, test, k, hamming)
    print('k=', k, 'accuracy=', acc, '%')

manhattan
k= 3 accuracy= 100.0 %


k= 4 accuracy= 100.0 %
k= 5 accuracy= 100.0 %
k= 6 accuracy= 100.0 %
k= 7 accuracy= 100.0 %
k= 8 accuracy= 100.0 %
k= 9 accuracy= 100.0 %
k= 10 accuracy= 100.0 %
k= 11 accuracy= 100.0 %
k= 12 accuracy= 100.0 %
k= 13 accuracy= 100.0 %
k= 14 accuracy= 100.0 %
-------------------
euclidean
k= 3 accuracy= 100.0 %
k= 4 accuracy= 100.0 %
k= 5 accuracy= 100.0 %
k= 6 accuracy= 100.0 %
k= 7 accuracy= 100.0 %
k= 8 accuracy= 100.0 %
k= 9 accuracy= 100.0 %
k= 10 accuracy= 100.0 %
k= 11 accuracy= 100.0 %
k= 12 accuracy= 100.0 %
k= 13 accuracy= 100.0 %
k= 14 accuracy= 100.0 %
-------------------
minkowski
k= 3 accuracy= 100.0 %
k= 4 accuracy= 100.0 %
k= 5 accuracy= 100.0 %
k= 6 accuracy= 100.0 %
k= 7 accuracy= 100.0 %
k= 8 accuracy= 100.0 %
k= 9 accuracy= 100.0 %
k= 10 accuracy= 100.0 %
k= 11 accuracy= 100.0 %
k= 12 accuracy= 100.0 %
k= 13 accuracy= 100.0 %
k= 14 accuracy= 100.0 %
-------------------
cosine
k= 3 accuracy= 100.0 %
k= 4 accuracy= 100.0 %
k= 5 accuracy= 100.0 %
k= 6 accuracy= 100.0 %
k