#### K-Nearest Neighbors: Find the k points nearest to a given query point, then predict the mean of their values (regression) or the mode of their labels (classification).

#### As k increases from 1, the performance of the model improves up to a certain point and then diminishes.


In [1]:
import numpy as np
import pandas as pd
import heapq
from scipy import stats

In [18]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between points ``a`` and ``b``.

    Both points are converted to float arrays before subtracting, so plain
    lists/tuples are accepted and unsigned-integer arrays cannot wrap around
    when a component of ``a - b`` would be negative.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return np.linalg.norm(a - b)

def knn(data, query, k, distance_fn, choice_fn):
    """Predict a value/label for ``query`` from its ``k`` nearest neighbors.

    Args:
        data: Sequence of rows. In each row the last element is the
            value/label and everything before it is the feature vector.
        query: Feature vector to predict for.
        k: Number of neighbors to use; must be at least 1.
        distance_fn: Callable ``(features, query) -> distance``.
        choice_fn: Callable that reduces the list of the selected labels to
            a single prediction (e.g. a mean for regression, a mode for
            classification).

    Returns:
        A tuple ``(neighbors, prediction)`` where ``neighbors`` is a list of
        ``(distance, row_index)`` pairs sorted by increasing distance and
        ``prediction`` is ``choice_fn`` applied to those rows' labels.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``data`` is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(data) == 0:
        raise ValueError("data must contain at least one row")

    # Distance from the query to every row, stored as (distance, row_index).
    # The caller-supplied distance_fn is used so the metric is pluggable.
    neighbor_distances_and_indices = [
        (distance_fn(row[:-1], query), index) for index, row in enumerate(data)
    ]

    # heapq.nsmallest avoids fully sorting the list when k is small.
    k_nearest_distances_and_indices = heapq.nsmallest(
        k, neighbor_distances_and_indices
    )

    # The label is the LAST column, matching the row[:-1] feature slice above.
    k_nearest_labels = [
        data[index][-1] for _, index in k_nearest_distances_and_indices
    ]

    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)

def mean(labels):
    """Return the arithmetic mean of ``labels`` (regression choice function)."""
    average = np.mean(labels)
    return average

def mode(labels):
    """Return the most frequent value in ``labels`` (classification choice function).

    When several values are tied for most frequent, the smallest one is
    returned.

    Raises:
        ValueError: If ``labels`` is empty.
    """
    if len(labels) == 0:
        raise ValueError("cannot take the mode of an empty sequence")
    # np.unique returns the distinct values in sorted order with their counts;
    # argmax picks the first maximum, i.e. the smallest value among ties.
    # This avoids indexing the result of scipy.stats.mode, whose shape differs
    # between SciPy versions.
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def main():
    """Run the KNN classification demo and print its result.

    Prints the ``(distance, row_index)`` pairs returned by ``knn`` for the
    three nearest rows, followed by the predicted label.  A regression
    variant of the demo is kept below as a disabled (commented-out) example.
    """
    # ---- Regression example (disabled) ----
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    #
    # reg_data = np.array([
    #    [65.75, 112.99],
    #    [71.52, 136.49],
    #    [69.40, 153.03],
    #    [68.22, 142.34],
    #    [67.79, 144.30],
    #    [68.70, 123.30],
    #    [69.80, 141.49],
    #    [70.01, 136.46],
    #    [67.90, 112.37],
    #    [66.49, 127.45],
    # ])
    #
    # Question:
    # Given the data we have, what's the best-guess at someone's weight
    # if they are 60 inches tall?
    # reg_query = np.array([60])
    # reg_k_nearest_neighbors, reg_prediction = knn(
    #     reg_data, reg_query, k=3, distance_fn=euclidean_distance, choice_fn=mean
    # )
    # print(reg_k_nearest_neighbors)
    # print(reg_prediction)

    # ---- Classification example ----
    # Column 0: age
    # Column 1: likes pineapple
    samples = np.array([
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ])

    # Question:
    # Given the data we have, does a 33 year old like pineapples on their pizza?
    target = np.array([33])
    nearest, predicted_label = knn(
        samples,
        target,
        k=3,
        distance_fn=euclidean_distance,
        choice_fn=mode,
    )
    print(nearest)
    print(predicted_label)




In [19]:
# Run the demo only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


[(2.0, 8), (4.0, 7), (6.0, 6)]
0
