In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [2]:
def distance(instance1, instance2, length):
    d = 0
    for i in range(length):
        d += (instance1[i]-instance2[i])**2
    return math.sqrt(d)

def get_neighbours(instance, X_train, y_train, k):
    distances = []
    for i in range(len(X_train)):
        d = distance(instance,X_train[i],len(X_train[i]))
        distances.append(d)
    idx = np.argsort(distances)[1:k+1]
    neighbours = []
    classes = []
    for index in idx:
        classes.append(y_train[index])
    return idx, classes

def get_votes(neighbours, classes):
    votes = list(np.zeros(classes))
    for i in neighbours:
        votes[i] += 1
    return votes.index(max(votes))

def regression(neighbours):
    return np.mean(neighbours)

def predict(X_test, X_train, y_train, k, alg):
    predictions = []
    for i in X_test:
        idx, classes = get_neighbours(i, X_train, y_train, k)
        if alg=='c':
            prediction = get_votes(classes, len(set(y_train)))
        if alg=='r':
            prediction = regression(classes)
        predictions.append(prediction)
    return predictions, idx

def get_accuracy(predictions, y_test, alg):
    if alg=='c':
        correct = 0
        for i in range(len(predictions)):
            if predictions[i] == y_test[i]:
                correct += 1
        print('Accuracy:',float(correct/len(predictions)))
    if alg=='r':
        absolute_error = 0
        square_error = 0
        for i in range(len(predictions)):
            absolute_error += abs(predictions[i]-y_test[i])
            square_error += (predictions[i]-y_test[i])**2
        print('MAE:', absolute_error/len(predictions))
        print('MSE:', square_error/len(predictions))

def knn(alg):
    if alg=='c':
        df = pd.read_csv('data/knn_classification.csv')
    if alg=='r':
        df = pd.read_csv('data/knn_regression.csv')
    k = 10
    y = df['y']
    X = df.drop(['y'], axis=1)
    rs = 10

    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.33, random_state=rs)

    predictions, idx = predict(X_test, X_train, y_train, k, alg)
    get_accuracy(predictions, y_test, alg)

    task_instance = [list(X.iloc[123])]
    prediction, idx = predict(task_instance, X_train, y_train, k, alg)
    print("Instance 124:", task_instance[0])
    print("Actual class:", y.iloc[123])
    print("Predicted class:", prediction)
    print("Neighbours:")
    for index in idx:
        print(X_train[index], "Class:", y_train[index])

## k-NN classification

In [3]:
knn('c')

Accuracy: 1.0
Instance 124: [6.3, 2.7, 4.9, 1.8]
Actual class: 2
Predicted class: [2]
Neighbours:
[6.1 3.  4.9 1.8] Class: 2
[6.3 2.8 5.1 1.5] Class: 2
[6.  2.7 5.1 1.6] Class: 1
[6.  3.  4.8 1.8] Class: 2
[6.5 2.8 4.6 1.5] Class: 1
[6.5 3.  5.2 2. ] Class: 2
[6.7 3.  5.  1.7] Class: 1
[6.1 2.9 4.7 1.4] Class: 1
[5.9 3.  5.1 1.8] Class: 2
[5.8 2.7 5.1 1.9] Class: 2


## k-NN regression

In [4]:
knn('r') #knn_regression

MAE: 0.1634
MSE: 0.04825799999999998
Instance 124: [6.3, 2.7, 4.9]
Actual class: 1.8
Predicted class: [1.6300000000000001]
Neighbours:
[6.3 2.8 5.1] Class: 1.5
[6.1 2.8 4.7] Class: 1.2
[6.1 2.9 4.7] Class: 1.4
[6.  2.7 5.1] Class: 1.6
[6.1 3.  4.9] Class: 1.8
[6.5 2.8 4.6] Class: 1.5
[6.  3.  4.8] Class: 1.8
[6.5 3.  5.2] Class: 2.0
[6.7 3.  5. ] Class: 1.7
[5.9 3.  5.1] Class: 1.8
