In [4]:
from statistics import median

import numpy as np
from pandas import read_csv


def get_median(data, dim):
    """Return the median of coordinate ``dim`` across all points in ``data``.

    Args:
        data: Iterable of point records; each record's ``"point_data"``
            entry is indexed with ``dim``.
        dim: Index of the coordinate to summarise.

    Returns:
        Whatever ``statistics.median`` yields for the collected values.
    """
    coordinates = [record.get("point_data")[dim] for record in data]
    return median(coordinates)


def get_dim_larger_data(data, dim, med):
    """Select the points whose coordinate ``dim`` is at least ``med``.

    Args:
        data: Iterable of point records with a ``"point_data"`` entry.
        dim: Index of the coordinate to compare.
        med: Threshold value; points equal to it are included.

    Returns:
        A new list holding the matching records in their original order.
    """
    return [record for record in data if record.get("point_data")[dim] >= med]


def get_dim_smaller_data(data, dim, med):
    """Select the points whose coordinate ``dim`` is strictly below ``med``.

    Args:
        data: Iterable of point records with a ``"point_data"`` entry.
        dim: Index of the coordinate to compare.
        med: Threshold value; points equal to it are excluded.

    Returns:
        A new list holding the matching records in their original order.
    """
    return [record for record in data if record.get("point_data")[dim] < med]


def get_neighbors(current, point):
    """Descend from ``current`` to the leaf that ``point`` falls into.

    A node holding more than ``leave_size`` points is treated as internal:
    the walk goes right when the point's coordinate on the node's split
    axis is at least ``current_median`` and left otherwise.

    Args:
        current: Tree node exposing ``data``, ``leave_size``, ``point_dim``,
            ``current_dim``, ``current_median``, ``left`` and ``right``.
        point: Sequence of coordinates for the query point.

    Returns:
        The ``data`` of the leaf that was reached, or ``None`` when
        ``len(point)`` does not equal the ``point_dim`` of a visited node.
    """
    node = current
    while len(point) == node.point_dim:
        if len(node.data) <= node.leave_size:
            # Small enough to be a leaf: its points are the candidates.
            return node.data
        axis = node.current_dim % node.point_dim
        node = node.right if point[axis] >= node.current_median else node.left
    return None


class OurKdTree:
    """One node of a k-d tree built by median splits over point records.

    Each node keeps the points of its region in ``data``. After
    ``build_tree`` a node with more than ``leave_size`` points also has a
    split value in ``current_median`` and two children: ``right`` (points
    at or above the split value) and ``left`` (points below it).

    NOTE(review): when more than ``leave_size`` points are identical on
    every axis, each split hands all of them to the right child again, so
    ``build_tree`` keeps recursing until Python's recursion limit is hit.
    """

    def __init__(self, data, leave_size, point_dim, current_dim):
        """Store the node's points and split parameters; no splitting yet.

        Args:
            data: List of point records belonging to this node.
            leave_size: Largest number of points a node may hold without
                being split.
            point_dim: Number of coordinates per point.
            current_dim: Depth-like counter; the split axis is
                ``current_dim % point_dim``.
        """
        self.data = data
        self.leave_size = leave_size
        self.point_dim = point_dim
        self.current_dim = current_dim
        self.current_median = 0
        self.left = None
        self.right = None

    def build_tree(self):
        """Recursively split this node until it holds at most ``leave_size`` points.

        Returns:
            ``self`` when the node is small enough to stay a leaf, and
            ``None`` after the node has been split.
        """
        axis = self.current_dim % self.point_dim
        if len(self.data) <= self.leave_size:
            return self

        split_value = get_median(self.data, axis)
        self.current_median = split_value
        next_dim = self.current_dim + 1

        upper_half = get_dim_larger_data(self.data, axis, split_value)
        self.right = OurKdTree(upper_half, self.leave_size, self.point_dim, next_dim)

        lower_half = get_dim_smaller_data(self.data, axis, split_value)
        self.left = OurKdTree(lower_half, self.leave_size, self.point_dim, next_dim)

        # Right subtree is built before the left one; a child is only
        # visited when it holds at least ``leave_size`` points.
        for child, subset in ((self.right, upper_half), (self.left, lower_half)):
            if len(subset) >= self.leave_size:
                child.build_tree()


def get_data(file):
    """Load point records from a CSV source.

    Every row is split into its first four values (stored under
    ``'point_name'``) and the remaining values (stored under
    ``'point_data'``).

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        A list with one ``{'point_name': ..., 'point_data': ...}`` dict per
        CSV row, in file order.
    """
    rows = read_csv(file).values.tolist()
    return [{'point_name': row[:4], 'point_data': row[4:]} for row in rows]


def get_point_dim(data):
    """Return the number of coordinates per point, taken from the first record.

    Args:
        data: Sequence of point records with a ``'point_data'`` entry.

    Returns:
        ``len`` of the first record's ``'point_data'``, or ``0`` when
        ``data`` is empty.
    """
    if len(data) == 0:
        return 0
    first_point = data[0]
    return len(first_point.get('point_data'))


def my_sort_func(e):
    """Sort key: return the ``'distance'`` entry of a candidate record."""
    distance = e['distance']
    return distance


def remove_dublicates(list_to):
    """Remove repeated elements from ``list_to`` in place, keeping first occurrences.

    Equal elements are dropped wherever they appear, not only when they sit
    next to each other. This matters for candidate lists sorted by distance
    alone: when two different points are equally far away and each was
    returned by several trees, a stable sort leaves them interleaved
    (A, B, A, B), so an adjacent-only comparison would keep the repeats.

    Elements are compared with ``==`` and need not be hashable (the
    candidate records are dicts), so the scan is quadratic in the list
    length.

    Args:
        list_to: List to deduplicate; it is modified in place.

    Returns:
        The same list object, now free of duplicates, with the relative
        order of the surviving elements unchanged.
    """
    unique = []
    for item in list_to:
        if item not in unique:
            unique.append(item)
    # Slice assignment keeps the caller's list object, as the original
    # in-place deletion did.
    list_to[:] = unique
    return list_to


def flat_candidates_list(candidates):
    """Concatenate the per-tree candidate lists into one flat list.

    Args:
        candidates: Iterable of iterables of point records.

    Returns:
        A new list with every inner item, in encounter order.
    """
    return [record for group in candidates for record in group]


def get_final_k_neighbors(candidates, point, k):
    """Rank all candidate points by distance to ``point`` and keep the first ``k``.

    Args:
        candidates: One list of point records per tree.
        point: Coordinates of the query point.
        k: Number of neighbours to return.

    Returns:
        Up to ``k`` dicts with ``'candidate_point'`` (the record's
        ``'point_name'``) and ``'distance'`` (Euclidean distance to
        ``point``), ordered by increasing distance after
        ``remove_dublicates`` has been applied.
    """
    scored = [
        {'candidate_point': record.get('point_name'),
         'distance': euclidean_distance(record.get('point_data'), point)}
        for record in flat_candidates_list(candidates)
    ]
    scored.sort(key=my_sort_func)
    return remove_dublicates(scored)[:k]


def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two coordinate sequences.

    Args:
        point1: Coordinates of the first point.
        point2: Coordinates of the second point.

    Returns:
        The norm of the element-wise difference, as computed by
        ``numpy.linalg.norm``.
    """
    difference = np.array(point1) - np.array(point2)
    return np.linalg.norm(difference)


class ANN:
    """Approximate k-nearest-neighbour search over a forest of ``OurKdTree`` trees.

    Attributes are set by the constructor and ``fit``:
        N: Largest number of points a tree node may hold without being split.
        L: Number of trees in the forest.
        k: Number of neighbours returned per query point.
        forest: List of root ``OurKdTree`` nodes (empty until ``fit``).
        data: Point records loaded by ``fit`` (``None`` until then).
    """

    def __init__(self, N, L, k):
        """Store the search parameters; no data is loaded yet."""
        self.N = N
        self.L = L
        self.k = k
        self.forest = []
        self.data = None

    def fit(self, file):
        """Load the points from ``file`` and build a fresh forest of ``L`` trees.

        Tree number ``i`` starts splitting on axis ``i % point_dim``, so the
        trees partition the same data differently.

        The forest is rebuilt from scratch on every call. Appending to the
        existing list would leave trees built on earlier data in the forest
        when ``fit`` is called again.

        Args:
            file: CSV source passed on to ``get_data``.
        """
        self.data = get_data(file)
        # The dimensionality is the same for every tree; compute it once.
        point_dim = get_point_dim(self.data)
        self.forest = [
            OurKdTree(self.data, self.N, point_dim, start_dim)
            for start_dim in range(self.L)
        ]
        for tree in self.forest:
            tree.build_tree()

    def k_neighbors(self, new_points):
        """Find the neighbours of every query row in ``new_points``.

        Args:
            new_points: Iterable of rows; the first four values of a row
                identify the point and the rest are its coordinates.

        Returns:
            One ``{'the_point': ..., 'k_neighbors': ...}`` dict per row.
        """
        return [
            {'the_point': row[:4], 'k_neighbors': self.point_k_neighbors(row[4:])}
            for row in new_points
        ]

    def point_k_neighbors(self, new_point):
        """Return the ``k`` closest candidates for one query point.

        The leaf reached in each tree contributes its points as candidates;
        ``get_final_k_neighbors`` ranks them.

        Args:
            new_point: Coordinates of the query point.
        """
        candidates = [get_neighbors(tree, new_point) for tree in self.forest]
        return get_final_k_neighbors(candidates, new_point, self.k)
    
    
def ratio_method(data):
    """Unfinished placeholder: iterates over ``data`` and does nothing.

    The original stub had no colon and an empty loop body, which is a
    syntax error that prevented the whole cell from running.

    Args:
        data: Iterable of points.

    Returns:
        ``None``.
    """
    for point in data:
        # TODO: the per-point ratio computation was never written.
        pass

In [6]:
'''
    The ANN constructor accepts 3 arguments:
    1-  N: the maximum number of points stored in each leaf of a tree
    2-  L: the number of trees created in the forest
    3-  k: the number of neighbors to return for every query point
'''
# Forest of 20 trees, at most 10 points per leaf, 3 neighbours per query.
p = ANN(10, 20, 3)
p.fit("Hananya1.csv")

# Load the query points from a second CSV file.
data_to_test = read_csv("Hananya3.csv")
# Keep every row and the first 132 columns; k_neighbors treats the first 4
# values of each row as the point's identifier and the rest as coordinates.
# Despite its name this is still a DataFrame; the list is made on the next line.
data_to_test_list = data_to_test.iloc[0:, 0:132]
points_to_test = data_to_test_list.values.tolist()
r = p.k_neighbors(points_to_test)
# Print one result dict per line.
print(*r, sep = "\n")

{'the_point': [193.17, 371.13, 22.83, 1.571], 'k_neighbors': [{'candidate_point': [201.45, 383.78, 15.63, 1.636], 'distance': 124.2014492669067}, {'candidate_point': [191.63, 281.1, 20.79, 1.545], 'distance': 124.39051410778879}, {'candidate_point': [228.71, 565.26, 0.88, 1.48], 'distance': 137.46635952115702}]}
{'the_point': [331.15, 543.32, 18.86, 1.099], 'k_neighbors': [{'candidate_point': [235.13, 331.52, 7.26, 1.358], 'distance': 347.77722754660056}, {'candidate_point': [215.31, 407.03, 1.53, 2.0180000000000002], 'distance': 351.29047809469586}, {'candidate_point': [225.71, 265.7, 3.53, 1.069], 'distance': 363.04545169992144}]}
