## K Nearest Neighbors

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("../data/iris.data")
df = df.reindex(np.random.permutation(df.index))

Algorithm is pretty straightforward- take the closest k neighbors and set your test label equal to the most common label.

In [8]:

def distance(p1, p2):
        
    return np.sqrt(sum(p1 - p2)**2)


def get_neighbors(p1, points, k):
        
    distances = [distance(p1, p[0]) for p in points]
    zipped = zip(distances, points)
    sorted_zipped = sorted(zipped, key = lambda x: x[0])

    return sorted_zipped[:k]

def most_popular_class(points):

    classes = [p[1] for p in points]
    return max(classes)

def evaluate(predictions, test_labels):
    
    sames = [int(predictions[i] == test_labels[i]) for i in range(len(predictions))]
    return np.mean(sames)

In [5]:
data = df.as_matrix(columns = ["sepal_length", "sepal_length", "petal_length", "petal_width"])
labels = df.as_matrix(columns = ["class"])


In [6]:
train_points = zip(data[:100], labels[:100])
test_data = data[100:]; test_labels = labels[100:]


Let's try with different k and see how accurate our results are.

In [9]:
k_options = [1,2,3,4,5]

for k in k_options:
    predictions = []
    for td in test_data:
        neighbors = get_neighbors(td, train_points, k)
        targets = [n[1] for n in neighbors]
        mpc = most_popular_class(targets)
        predictions.append(mpc)
    results = evaluate(predictions, test_labels)
    print "Using {0} neighbors led to {1:.2f} accuracy".format(k, results)


Using 1 neighbors led to 0.86 accuracy
Using 2 neighbors led to 0.86 accuracy
Using 3 neighbors led to 0.84 accuracy
Using 4 neighbors led to 0.84 accuracy
Using 5 neighbors led to 0.84 accuracy
