## 1-Nearest Neighbor with KD Tree
This notebook implements vanilla 1-NN, backed by a KD tree, to classify the Iris dataset (available [here](https://archive.ics.uci.edu/dataset/53/iris)).

#### Import libraries and data

In [62]:
import os
import sys
import time
from math import floor

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from statistics import mode
import matplotlib.pyplot as plt

# system-agnostic utils file import
# The directory three levels above the current working directory is added to
# the import path so that the project-level `utils` module can be imported
# (presumably that is where utils.py lives — verify against the repo layout).
root_dir_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if root_dir_path not in sys.path:
    sys.path.append(root_dir_path)
import utils

# never truncate column contents when pandas objects are displayed
pd.set_option("display.max_colwidth", None)

In [63]:
# fetch the Iris bunch and pull out the feature matrix and the class labels
iris_data = load_iris()
X, y = iris_data.data, iris_data.target

# hold out 30% of the samples for testing; the fixed seed keeps the split
# identical across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# peek at the first two training samples
X_train[:2]

array([[5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5]])

In [64]:
# (number of training samples, number of features) of the training split
X_train.shape

(105, 4)

In [65]:
# tally how often each class label appears in the training split
unique_values, counts = np.unique(y_train, return_counts=True)
for position, value in enumerate(unique_values):
    count = counts[position]
    print(f"{value} occurs {count} time(s).")

0 occurs 31 time(s).
1 occurs 37 time(s).
2 occurs 37 time(s).


Scale the training and test data.

In [66]:
# Min-max scale the training features with the project helper.
# NOTE(review): the displayed output suggests the helper rescales every column
# with that column's own min and max — confirm against utils.
normalized_training_features = utils.minmax_normalize_2d_array(X_train)

# Scale the test split with the TRAINING min/max instead of its own, so both
# splits live on the same scale and no test-set statistics leak into the
# preprocessing. Test values may therefore fall slightly outside [0, 1].
train_feature_min = X_train.min(axis=0)
train_feature_range = X_train.max(axis=0) - train_feature_min
# a constant training feature has zero range; divide by 1 so it maps to 0
# instead of producing NaN/inf
train_feature_range[train_feature_range == 0] = 1.0
normalized_test_features = (X_test - train_feature_min) / train_feature_range

# X_train / X_test are intentionally left intact (they were previously set to
# None here): keeping them makes this cell safe to re-run.
normalized_training_features[0:2, :]

array([[0.35294118, 0.18181818, 0.46428571, 0.375     ],
       [0.58823529, 0.36363636, 0.71428571, 0.58333333]])

In [67]:
# TODO remove later
# Temporary stand-in while testing the tree: six hand-picked 2-D points (each
# paired with its label) replace the real training features. The commented
# line below would instead keep the first 10 training samples and the first
# 2 feature dimensions; remove all of this after testing.
# normalized_training_features = normalized_training_features[0:10, 0:2]

toy_samples = [
    ((2, 3), 1),
    ((4, 7), 1),
    ((7, 2), 2),
    ((5, 4), 1),
    ((9, 6), 2),
    ((8, 1), 2),
]
normalized_training_features = np.array([point for point, _ in toy_samples])
training_labels = [label for _, label in toy_samples]


class Tree_Node:
    """One node of a KD tree built over the rows of a 2-D sample array.

    Attributes:
        X: the samples (one per row) belonging to the subtree rooted at this
            node. After ``build_Kd_tree`` runs on a node holding two or more
            samples, the rows are sorted along that node's split feature.
        left: child node built from the samples before the median, or None.
        right: child node built from the samples after the median, or None.
        depth: depth of this node in the tree (the root is built with 0).
        median_idx: row index (in the sorted ``X``) of the sample stored at
            this node; stays None for leaves and empty nodes.
        sample: the sample stored at this node; stays None for an empty node.
        feature_idx_to_split: column used to order the samples at this node;
            stays None for leaves and empty nodes.
    """

    def __init__(self, X, depth):
        """Store the samples and the depth; the tree itself is built lazily.

        Args:
            X: array-like of shape (n_samples, n_features). It is converted
                with ``np.asarray`` so nested lists are accepted as well.
            depth: depth of this node; it selects the split feature.
        """
        self.X = np.asarray(X)
        self.left = None
        self.right = None
        self.depth = depth
        self.median_idx = None
        self.sample = None
        self.feature_idx_to_split = None

    def build_Kd_tree(self, verbose=True):
        """Builds a Kd tree rooted at this node.

        The split feature cycles with the depth (``depth % n_features``). The
        samples are sorted along it, the sample at index ``n_samples // 2`` is
        stored at this node, and the samples before / after it recursively
        form the left / right subtrees.

        Args:
            verbose: when True (the default, matching the previous behaviour)
                every leaf prints the sample it stores.
        """
        n_samples = len(self.X)

        # nothing to store: leave the node empty instead of failing on X[0]
        if n_samples == 0:
            return

        # check if the current node is a leaf node
        if n_samples == 1:
            self.sample = self.X[0, :]
            if verbose:
                print("Leaf node reached. Sample at node: ", self.sample)
            return

        # find out the feature to calculate median
        self.feature_idx_to_split = self.depth % self.X.shape[1]
        # Sort the training samples along that feature. A stable sort keeps
        # the incoming order of samples with equal feature values, so the
        # resulting tree is deterministic even when a feature is constant.
        order = self.X[:, self.feature_idx_to_split].argsort(kind="stable")
        # fancy indexing builds a new array, so the caller's array is untouched
        self.X = self.X[order]
        # upper median for even counts, exact middle for odd counts
        self.median_idx = n_samples // 2

        # store the median point
        self.sample = self.X[self.median_idx, :]
        # separate left and right points
        left_points = self.X[: self.median_idx, :]
        right_points = self.X[self.median_idx + 1 :, :]

        # create and build the KD subtrees; an empty side stays None
        # (with exactly two samples the right side is empty)
        if len(left_points) > 0:
            self.left = Tree_Node(left_points, self.depth + 1)
            self.left.build_Kd_tree(verbose=verbose)
        if len(right_points) > 0:
            self.right = Tree_Node(right_points, self.depth + 1)
            self.right.build_Kd_tree(verbose=verbose)

In [68]:
# display the feature array that the KD tree is built from
# NOTE(review): the saved output shows ten normalized 2-feature rows rather
# than the six-point toy array assigned in the previous cell, so it appears to
# be stale — re-run the cells in order to refresh it.
normalized_training_features

array([[0.35294118, 0.18181818],
       [0.58823529, 0.36363636],
       [0.61764706, 0.5       ],
       [0.67647059, 0.45454545],
       [0.85294117, 0.72727272],
       [0.4117647 , 0.40909091],
       [0.97058823, 0.45454545],
       [0.38235294, 0.45454545],
       [0.23529412, 0.68181818],
       [1.        , 0.36363636]])

In [69]:
# root node at depth 0 holding all training features, then grow the tree
KD_tree = Tree_Node(X=normalized_training_features, depth=0)
KD_tree.build_Kd_tree()

Leaf node reached. Sample at node:  [0.35294118 0.18181818]
Leaf node reached. Sample at node:  [0.23529412 0.68181818]
Leaf node reached. Sample at node:  [0.67647059 0.45454545]
Leaf node reached. Sample at node:  [0.85294117 0.72727272]


# TODO
Investigate why KD tree construction fails on the handwritten digits data, in which the first pixel of every sample is equal to 0.