In [2]:
import os
import sys
import heapq
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from statistics import mode
import matplotlib.pyplot as plt

# system-agnostic utils file import
root_dir_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(root_dir_path)
import utils

pd.set_option("display.max_colwidth", None)

Prediction Accuracy: 75.00%


In [3]:
# load the dataset
digits_data = load_digits()
X = digits_data.data
y = digits_data.target
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_train[0:2, :]

array([[ 0.,  0.,  5., 13., 13.,  8.,  0.,  0.,  0.,  0., 16., 11., 13.,
        16.,  6.,  0.,  0.,  1., 16.,  5.,  2., 14.,  9.,  0.,  0.,  0.,
         9., 16., 16., 15.,  0.,  0.,  0.,  0., 10., 16., 14., 14.,  0.,
         0.,  0.,  5., 15.,  4.,  0., 16.,  6.,  0.,  0.,  6., 14.,  7.,
         6., 16.,  4.,  0.,  0.,  0.,  7., 15., 16., 10.,  0.,  0.],
       [ 0.,  0.,  3., 14., 16., 14.,  0.,  0.,  0.,  0., 13., 13., 13.,
        16.,  2.,  0.,  0.,  0.,  1.,  0.,  9., 15.,  0.,  0.,  0.,  0.,
         9., 12., 15., 16., 10.,  0.,  0.,  4., 16., 16., 16., 11.,  3.,
         0.,  0.,  0.,  4.,  9., 14.,  2.,  0.,  0.,  0.,  0.,  2., 15.,
         9.,  0.,  0.,  0.,  0.,  0.,  4., 13.,  1.,  0.,  0.,  0.]])

In [4]:
# unique values and their counts
unique_values, counts = np.unique(y_train, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"{value} occurs {count} time(s).")

0 occurs 125 time(s).
1 occurs 132 time(s).
2 occurs 130 time(s).
3 occurs 129 time(s).
4 occurs 121 time(s).
5 occurs 116 time(s).
6 occurs 128 time(s).
7 occurs 124 time(s).
8 occurs 131 time(s).
9 occurs 121 time(s).


In [5]:
normalized_training_features = utils.minmax_normalize_2d_array(X_train)
X_train = None
normalized_test_features = utils.minmax_normalize_2d_array(X_test)
X_test = None
normalized_training_features[0:2, :]

array([[0.        , 0.        , 0.3125    , 0.8125    , 0.8125    ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.6875    , 0.8125    , 1.        , 0.375     ,
        0.        , 0.        , 0.0625    , 1.        , 0.3125    ,
        0.125     , 0.875     , 0.5625    , 0.        , 0.        ,
        0.        , 0.5625    , 1.        , 1.        , 0.9375    ,
        0.        , 0.        , 0.        , 0.        , 0.625     ,
        1.        , 0.875     , 0.875     , 0.        , 0.        ,
        0.        , 0.3125    , 0.9375    , 0.25      , 0.        ,
        1.        , 0.375     , 0.        , 0.        , 0.375     ,
        0.875     , 0.4375    , 0.375     , 1.        , 0.25      ,
        0.        , 0.        , 0.        , 0.4375    , 0.9375    ,
        1.        , 0.625     , 0.        , 0.        ],
       [0.        , 0.        , 0.1875    , 0.875     , 1.        ,
        0.875     , 0.        , 0.        , 0.        , 0. 

In [12]:
class kNN_with_Priority_Queue:
    def __init__(self, training_features, training_labels, k):
        """
        Store the training features and labels.
        """
        self.X_train = training_features
        self.y_train = training_labels
        self.k = k

    def fit(self):
        """
        Since kNN is a lazy learner, there is no fit / training step.
        """
        pass

    def predict(self, test_sample):
        max_heap = []
        for idx, sample in enumerate(self.X_train):
            curr_euclidean_distance = np.linalg.norm(test_sample - sample)
            heapq.heappush(
                max_heap, (-1 * curr_euclidean_distance, self.y_train[idx])
            )  # notice the use of negative value

        # get the top k elements
        nearest_predictions = []
        for idx in range(0, self.k):
            _, nearest_prediction = heapq.heappop(max_heap)
            nearest_predictions.append(nearest_prediction)
        print(nearest_predictions)
        return mode(nearest_predictions)

In [13]:
# predict test samples
k_values = [1, 5, 10]
accuracies = []

for k in k_values:
    kNN = kNN_with_Priority_Queue(normalized_training_features, y_train, k)
    preds = []
    for test_sample_idx in range(0, len(y_test)):
        prediction = kNN.predict(normalized_test_features[test_sample_idx, :])
        preds.append(prediction)
    accuracy = utils.calculate_accuracy(y_test, preds)
    print("\n For k == ", k, ", accuracy = ", accuracy)
    accuracies.append(accuracy)

[3]
[4]
[4]
[2]
[4]
[7]
[4]
[4]
[4]
[4]
[5]
[2]
[1]
[1]
[3]
[4]
[4]
[4]
[4]
[9]
[5]
[4]
[6]
[4]
[1]
[9]
[4]
[1]
[9]
[4]
[9]
[5]
[5]
[3]
[2]
[9]
[4]
[3]
[4]
[9]
[9]
[5]
[1]
[4]
[9]
[5]
[4]
[7]
[9]
[2]
[4]
[0]
[4]
[4]
[9]
[1]
[7]
[7]
[1]
[5]
[6]
[5]
[2]
[7]
[3]
[2]
[4]
[7]
[4]
[2]
[4]
[7]
[2]
[4]
[4]
[7]
[5]
[1]
[2]
[7]
[7]
[7]
[4]
[1]
[4]
[1]
[9]
[3]
[3]
[5]
[4]
[4]
[5]
[4]
[2]
[7]
[4]
[2]
[2]
[7]
[5]
[4]
[2]
[6]
[5]
[4]
[4]
[2]
[5]
[4]
[2]
[4]
[3]
[3]
[5]
[6]
[5]
[4]
[9]
[2]
[4]
[9]
[4]
[2]
[9]
[1]
[4]
[4]
[9]
[5]
[1]
[4]
[4]
[2]
[6]
[9]
[9]
[3]
[4]
[4]
[7]
[7]
[2]
[9]
[5]
[2]
[9]
[5]
[7]
[5]
[2]
[9]
[7]
[7]
[9]
[3]
[4]
[7]
[4]
[4]
[1]
[4]
[7]
[6]
[4]
[2]
[4]
[9]
[4]
[4]
[4]
[4]
[3]
[4]
[1]
[9]
[4]
[4]
[4]
[4]
[4]
[7]
[4]
[1]
[4]
[4]
[5]
[2]
[9]
[7]
[6]
[3]
[1]
[1]
[2]
[4]
[1]
[3]
[3]
[1]
[2]
[4]
[2]
[2]
[5]
[7]
[3]
[4]
[0]
[2]
[3]
[3]
[5]
[9]
[1]
[4]
[9]
[7]
[2]
[1]
[4]
[4]
[5]
[9]
[2]
[1]
[9]
[2]
[1]
[4]
[4]
[9]
[4]
[5]
[7]
[7]
[5]
[1]
[9]
[6]
[4]
[2]
[2]
[5]
[2]
[4]
[4]
[7]
[4]
[4]
