In [20]:
from scipy.spatial.distance import euclidean as euc
import numpy as np
np.random.seed(0)

In [21]:
# define placeholder for KNN class with 2 methods
class KNN(object):

    def fit():
        pass
    
    def predict():
        pass

In [22]:
# Recall that when "fitting" a KNN classifier, all you're really doing is storing
#the points and their corresponding labels. There's no actual "fitting" involved here, 
#since all you can do is store the data so that you can use it to 
#calculate the nearest neighbors when the predict method is called.

def fit(self, X_train, y_train):
    self.X_train = X_train
    self.y_train = y_train 
    
# This line updates the knn.fit method to point to the function we've just written
KNN.fit = fit

In [23]:
# Next, write two helper functions to make things easier when completing the predict function. 
# The first helper function should return an array containing 
# the distance between a point we pass in and every point inside of X_train.

def _get_distances(self, x):
    distances = []
    for ind, val in enumerate(self.X_train):
        dist_to_i = euc(x, val)
        distances.append((ind, dist_to_i))
    return distances
    

# This line attaches the function we just created as a method to our KNN class.
KNN._get_distances = _get_distances

In [24]:
# The second big challenge in a predict method is getting the indices of the k-nearest points. 
# To keep our coming predict method nice and clean, 
# abstract this functionality into a helper method called _get_k_nearest.

def _get_k_nearest(self, dists, k):
    sorted_dists = sorted(dists, key=lambda x: x[1])
    return sorted_dists[:k]

# This line attaches the function we just created as a method to our KNN class.
KNN._get_k_nearest = _get_k_nearest

In [40]:
# Now, you have helper functions to help you get the distances, and then get the k-nearest neighbors 
# based on those distances. The final helper function you'll create will get the labels that correspond 
# to each of the k-nearest point, and return the class that occurs the most.

def _get_label_prediction(self, k_nearest):
    labels = [self.y_train[i] for i, _ in k_nearest]
    counts = np.bincount(labels)
    return np.argmax(counts)

# This line attaches the function we just created as a method to our KNN class.
KNN._get_label_prediction = _get_label_prediction
    

In [41]:
'''
This method does all the heavy lifting for KNN, so this will be a bit more complex than the fit method. Here's a rough outline of how the method should work:
The function takes in an array of vectors that we want predictions for.
For each vector that we want to make a prediction for: 1a. The classifier calculates the distance between that vector and every other vector in the training set. 1b. The classifier identifies the K nearest vectors to the vector you want a prediction for. 1c. The classifier determines which label the majority of the K nearest neighbors share, and appends this prediction to an array we will output. The index of the prediction in this array should be the same as the index of the point that it corresponds to (e.g. pred[0] is the prediction for X_test[0]).
Once predictions have been generated for every vector in question, return the array of predictions.
This tells us a few things about what our predict function will need to be able to do:
In addition to self, our predict function should take in two arguments:
X_test, the points we want to classify
k, which specifies the number of neighbors we should use to make the classification. We'll set k=3 as a default, but allow the user to update it if they choose.
Your method will need to iterate through every item in X_test. For each item:
Calculate the distance to all points in X_train by using our _get_distances() helper method we created.
Find the k-nearest points in X_train by using the _get_k_nearest() helper method we created
Use the index values contained within the tuples returned by _get_k_nearest() to get the corresponding labels for each of the nearest points.
Determine which class is most represented in these labels and treat that as the prediction for this point. Append the prediction to preds.
Once a prediction has been generated for every item in X_test, return preds'''

def predict(self, X_test, k=3):
    preds = []
    # Iterate through each item in X_test
    for i in X_test:
        # Get distances between i and each item in X_train
        dists = self._get_distances(i)
        k_nearest = self._get_k_nearest(dists, k)
        predicted_label = self._get_label_prediction(k_nearest)
        preds.append(predicted_label)
    return preds
                                       
    
        
KNN.predict = predict

In [42]:
# Tetsing on iris dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
data = iris.data
target = iris.target



In [43]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.25)
knn = KNN()
knn.fit(X_train, y_train)
preds = knn.predict(X_test)

In [44]:
print("Test accuracy: {}".format(accuracy_score(y_test, preds)))

Test accuracy: 0.8947368421052632
