## 1-Nearest Neighbor with KD Tree
This notebook implements a vanilla 1-nearest-neighbor (1-NN) classifier for the Iris dataset (available [here](https://archive.ics.uci.edu/dataset/53/iris)), using a KD tree to organize the training samples.

#### Import libraries and data

In [66]:
import os
import sys
import time

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from statistics import mode
import matplotlib.pyplot as plt

# Add the directory three levels above the current working directory to
# sys.path so that `import utils` can be resolved. The path is built with
# os.path helpers rather than a hard-coded separator, so it is OS-independent.
root_dir_path = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
sys.path.append(root_dir_path)
import utils

# Do not truncate long cell contents when pandas displays a DataFrame.
pd.set_option("display.max_colwidth", None)

In [67]:
# Fetch the Iris dataset and pull out the feature matrix and the label vector.
iris_data = load_iris()
X, y = iris_data.data, iris_data.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Preview the first two training rows.
X_train[:2, :]

array([[5.5, 2.4, 3.7, 1. ],
       [6.3, 2.8, 5.1, 1.5]])

In [68]:
# (number of training samples, number of features per sample)
X_train.shape

(105, 4)

In [69]:
# Class balance check: list each label present in the training split together
# with the number of training samples that carry it.
unique_values, counts = np.unique(y_train, return_counts=True)
for value, count in zip(unique_values, counts):
    print(f"{value} occurs {count} time(s).")

0 occurs 31 time(s).
1 occurs 37 time(s).
2 occurs 37 time(s).


Scale the training and test data.

In [70]:
# Min-max scale the training and test features with the project helper.
# NOTE(review): the test split is normalized in its own call, separately from
# the training split - presumably with its own min/max. Confirm against
# utils.minmax_normalize_2d_array whether the test data should instead be
# scaled with the training split's statistics.
normalized_training_features = utils.minmax_normalize_2d_array(X_train)
# Drop the reference to the raw array. Re-running this cell afterwards would
# pass None to the helper, so re-run the train/test split cell first.
X_train = None
normalized_test_features = utils.minmax_normalize_2d_array(X_test)
X_test = None
# Preview the first two normalized training rows.
normalized_training_features[0:2, :]

array([[0.35294118, 0.18181818, 0.46428571, 0.375     ],
       [0.58823529, 0.36363636, 0.71428571, 0.58333333]])

In [138]:
# TODO remove after testing
# Temporary debugging aid: keeps only the first 10 normalized training rows,
# overwriting the full array. Re-run the normalization cell to restore it.
normalized_training_features = normalized_training_features[0:10, :]


class Tree_Node:
    """One node of a KD tree built over the rows of a 2-D feature array.

    A node owns a set of row indices into the feature array. Building the
    tree splits those rows on a single feature: rows whose value is
    ``<= median`` go to ``left_child_node`` and rows whose value is
    ``> median`` go to ``right_child_node``. A node that is not split is a
    leaf and has both child attributes set to ``None``.

    Attributes:
        data: The 2-D feature array that ``valid_indices`` refers to.
        valid_indices: Integer array of row indices into ``data`` held by
            this node. These are always indices into the full array, never
            positions relative to the parent node.
        depth: Depth of the node in the tree.
        split_dimension: Column used for the split. Initially
            ``depth % n_features``; ``build_KD_tree`` advances to the next
            column (cyclically) when a column cannot separate the rows.
        median: Split threshold on ``split_dimension``. This is the median
            of that column over the node's rows, except when no value lies
            above the median (ties at the column maximum); in that case it
            is the largest value below the maximum so both sides are
            non-empty.
        left_child_node: Child holding rows ``<= median``, or ``None``.
        right_child_node: Child holding rows ``> median``, or ``None``.
    """

    def __init__(self, indices, depth, data=None):
        """Create an unsplit node.

        Args:
            indices: Row indices into the feature array owned by this node.
            depth: Depth of this node; selects the initial split column.
            data: Optional 2-D feature array. When omitted, the notebook-level
                ``normalized_training_features`` is used, so existing
                ``Tree_Node(indices, depth)`` calls keep working.
        """
        self.data = np.asarray(
            normalized_training_features if data is None else data
        )
        self.valid_indices = np.asarray(indices, dtype=np.intp)
        self.depth = depth
        self.split_dimension = depth % self.data.shape[1]
        self.median = np.median(
            self.data[self.valid_indices, self.split_dimension]
        )
        self.left_child_node = None
        self.right_child_node = None

    def build_KD_tree(self):
        """Grow the subtree rooted at this node.

        Nodes are expanded with an explicit work list instead of recursion,
        so the height of the tree is not limited by the interpreter's
        recursion limit. Returns ``None``.
        """
        pending = [self]
        while pending:
            node = pending.pop()
            pending.extend(node._split())

    def _split(self):
        """Split this node once and return the children that were created.

        Returns an empty list when the node stays a leaf: it holds at most
        one row, or no column can separate its rows (all rows identical, or
        the only differences involve NaN values).
        """
        if len(self.valid_indices) <= 1:
            return []

        rows = self.data[self.valid_indices]
        n_dims = rows.shape[1]
        for offset in range(n_dims):
            # Start with the column implied by the depth, then cycle through
            # the remaining columns until one separates the rows.
            dimension = (self.depth + offset) % n_dims
            column = rows[:, dimension]
            threshold = np.median(column)
            goes_right = column > threshold
            if not goes_right.any():
                # Nothing lies above the median: the column is constant,
                # contains NaN, or its median ties with its maximum.
                below = column[column < threshold]
                if below.size == 0:
                    continue  # constant or NaN column, try the next one
                # Lower the threshold so the tied maxima form the right side.
                threshold = below.max()
                goes_right = column > threshold
            goes_left = column <= threshold
            if goes_left.any() and goes_right.any():
                break
        else:
            # No column separates these rows; keep them together in a leaf.
            return []

        self.split_dimension = dimension
        self.median = threshold
        # Boolean masks select from valid_indices itself, so children receive
        # indices into the full array rather than positions within this node.
        self.left_child_node = Tree_Node(
            self.valid_indices[goes_left], self.depth + 1, self.data
        )
        self.right_child_node = Tree_Node(
            self.valid_indices[goes_right], self.depth + 1, self.data
        )
        return [self.left_child_node, self.right_child_node]

In [139]:
# The root node starts out owning every training-row index at depth 0;
# building then grows the tree from it.
KD_tree = Tree_Node(list(range(normalized_training_features.shape[0])), 0)
KD_tree.build_KD_tree()

# TODO
Investigate why KD tree construction fails on the handwritten digits data, where the first pixel of every sample equals 0.