In [1]:
# load libraries
import numpy as np
import heapq
import pickle

In [2]:
# load saved training/validation/test data
# The pickle must contain a 6-item sequence, which is unpacked in this order:
# X_train, X_val, X_test, y_train, y_val, y_test.
# The path is relative, so the file is looked up in the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only run this cell against a file from a trusted source.
with open('train_val_test_data.pkl', 'rb') as file:
    X_train, X_val, X_test, y_train, y_val, y_test = pickle.load(file)

In [3]:
# calculate accuracy given the true labels and the predictions
def accuracy(y_truth, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_truth : iterable
        True labels, one per sample.
    y_pred : iterable
        Predicted labels, in the same order as ``y_truth``.

    Returns
    -------
    float
        Number of positions where the two labels compare equal, divided by
        the number of samples.

    Raises
    ------
    ValueError
        If the inputs have different lengths or are empty.
    """
    # materialize both inputs so lengths can be compared, even for generators
    truth = list(y_truth)
    preds = list(y_pred)
    # zip() would silently drop the unmatched tail and skew the proportion,
    # so a length mismatch is rejected up front
    if len(truth) != len(preds):
        raise ValueError(
            "y_truth and y_pred must have the same length, got {} and {}".format(
                len(truth), len(preds)
            )
        )
    # the proportion is undefined without samples; fail with a clear message
    # instead of a bare ZeroDivisionError
    if not truth:
        raise ValueError("accuracy is undefined for empty input")
    # count the positions where the labels agree
    correct_pred = sum(1 for y_t, y_p in zip(truth, preds) if y_t == y_p)
    # find the proportion by dividing the correct predictions by all the predictions
    return correct_pred / len(truth)

In [4]:
# k nearest neighbors function
def knn(X, num_classes, k, norm_order, X_ref=None, y_ref=None):
    """Predict a class for every point in ``X`` by majority vote of its
    ``k`` nearest reference points.

    Parameters
    ----------
    X : iterable
        Query points; each one is subtracted from every reference point.
    num_classes : int
        Length of the per-query vote vector. Labels are used directly as
        indices into that vector.
    k : int
        Number of neighbors that vote; must be at least 1. If it exceeds the
        number of reference points, all of them vote.
    norm_order
        Passed as ``ord`` to ``np.linalg.norm`` to measure distances.
    X_ref, y_ref : optional
        Reference points and their labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used.

    Returns
    -------
    list
        One ``np.argmax`` result per query point; on a vote tie the lowest
        class index wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # with k <= 0 no neighbor would vote and every query would silently be
    # labelled class 0, so reject it explicitly
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    # fall back to the training split loaded earlier in the notebook
    if X_ref is None:
        X_ref = X_train
    if y_ref is None:
        y_ref = y_train

    # initialize list of predictions
    y_pred = []
    for x in X:
        # distances are negated so the k largest (-distance, index) tuples are
        # the k closest reference points, measured with the given norm order
        nearest = heapq.nlargest(
            k,
            ((-np.linalg.norm(X_ref[i] - x, ord=norm_order), i) for i in range(len(X_ref))),
        )
        # initialize class counts to zero
        class_counts = np.zeros(num_classes)
        # iterate through the k nearest neighbors and find the counts of each label
        for _, index in nearest:
            class_counts[y_ref[index]] += 1
        # append the class with the greatest count in the neighbors as the predicted label for this x
        y_pred.append(np.argmax(class_counts))
    return y_pred

In [None]:
# KNN accuracy on the training split: 5 classes, 20 neighbors, order-2 norm.
# The queries here are the training points themselves, so each point is also
# present in the reference set it is compared against.
# NOTE(review): this cell uses norm_order=2 while the validation and test cells
# use norm_order=5 — confirm the difference is intentional.
train_predictions = knn(X_train, num_classes=5, k=20, norm_order=2)
# flatten the labels to 1-D so they pair element-wise with the prediction list
train_accuracy = accuracy(y_train.reshape(-1), train_predictions)
print("KNN Train Accuracy:", train_accuracy)

KNN Train Accuracy: 0.9876875


In [None]:
# KNN accuracy on the validation split: 5 classes, 20 neighbors, order-5 norm.
val_predictions = knn(X_val, num_classes=5, k=20, norm_order=5)
# flatten the labels to 1-D so they pair element-wise with the prediction list
val_accuracy = accuracy(y_val.reshape(-1), val_predictions)
print("KNN Validation Accuracy:", val_accuracy)

KNN Validation Accuracy: 0.9877


In [None]:
# KNN accuracy on the held-out test split: 5 classes, 20 neighbors, order-5 norm.
test_predictions = knn(X_test, num_classes=5, k=20, norm_order=5)
# flatten the labels to 1-D so they pair element-wise with the prediction list
test_accuracy = accuracy(y_test.reshape(-1), test_predictions)
print("KNN Test Accuracy:", test_accuracy)

KNN Test Accuracy: 0.9865
