### Import libraries and data

In [1]:
import os
import sys
import math
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# system-agnostic utils file import
root_dir_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(root_dir_path)

In [2]:
import utils

In [3]:
# Load the Iris dataset from scikit-learn
iris = load_iris()

Optionally, store the data as a csv.

In [4]:
data = np.c_[
    iris.data, iris.target
]  # Combine the features and target into a single array

# convert to df to save on disk
columns = iris.feature_names + ["target"]
df = pd.DataFrame(data, columns=columns)

# save to a CSV file
df.to_csv("iris_dataset.csv", index=False)
df.sample(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
62,6.0,2.2,4.0,1.0,1.0
26,5.0,3.4,1.6,0.4,0.0


Normalize the data.

In [5]:
normalized_features = utils.minmax_normalize_2d_array(iris.data)
normalized_features[0:5, :]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666666, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666666, 0.06779661, 0.04166667]])

In [6]:
class K_Nearest_Neighbors:
    def __init__(self, training_features, training_labels, k=1):
        self.k = k
        self.training_features = training_features
        self.training_labels = training_labels
        self.closest_k_training_samples_features = []
        self.closest_k_training_samples_targets = []
        self.closest_k_training_samples_indices = set()

    def fit(self):
        """
        Since kNN is a lazy learner, there is no fit / training step.
        """
        pass

    def predict(self, test_sample):
        plurality_vote_dictionary = {}
        # find the k closest points to the test sample
        while len(self.closest_k_training_samples) < self.k:
            current_closest_distance = 100000
            current_closest_point = None
            current_closest_point_index = None

            for idx in range(0, len(self.training_features)):
                if idx not in self.closest_k_training_samples_indices:
                    distance_between_sample_and_test_sample = np.sqrt(
                        np.sum((test_sample - self.training_features[idx, :]) ** 2)
                    )

                    if (
                        distance_between_sample_and_test_sample
                        < current_closest_distance
                    ):
                        current_closest_distance = (
                            distance_between_sample_and_test_sample
                        )
                        current_closest_point_features = self.training_features[idx, :]
                        current_closest_point_label = self.training_labels[idx]
                        current_closest_point_index = idx

            # add the closest point of this while loop iteration to the list of k closest points
            self.closest_k_training_samples_features.append(
                current_closest_point_features
            )
            self.closest_k_training_samples_targets.append(current_closest_point_label)
            # count this sample as a voter in the final voting
            if (
                self.training_labels[current_closest_point_index]
                not in plurality_vote_dictionary
            ):
                plurality_vote_dictionary[
                    self.training_labels[current_closest_point_index]
                ] = 1
            else:
                plurality_vote_dictionary[
                    self.training_labels[current_closest_point_index]
                ] += 1

            # mark this sample's index so as to skip in the next iteration of the while loop
            self.closest_k_training_samples_indices.add(current_closest_point_index)

            # make prediction for test sample based on the plurality vote from the k closest points
            assert len(self.closest_k_training_samples) == self.k

            current_mode = 0
            prediction = None
            for key, val in plurality_vote_dictionary.keys():
                if val > current_mode:
                    prediction = key

            return prediction

In [8]:
kNN = K_Nearest_Neighbors(normalized_features, iris.target, 10)
kNN.predict(np.array([0.5, 0.2, 0.5, 0.7]))

TypeError: unhashable type: 'numpy.ndarray'