# KNN From Sratch

In this workbook we will guide you through the steps to implement KNN from scratch. Once this is done you'll implement you solution in a class that is tested with the knn_class_tester notebook.

1. Use the ```make_blobs``` function from SKLearn to make a dataset to test your KNN functions.
2. Create helper functions. These will be useful when you go to implement your class.
    - Squaring the difference of two vectors.
    - Summing the square differences and returning the square root.
    - Calculating the euclidian distances
    - An evaluation function to evalaute predictions
3. Create the KNN predcit function

In [15]:
#import libraries
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

### Step 1: Create a sample dataset
1. Use ```make_blobs``` to create a sample set
2. Start with 300 samples, 4 centres, 0.6 standard deviation, and random state 0
3. Plot the samples

In [16]:
#make the dataset with make_blobs
X,y = make_blobs(n_samples=300,centers=4,cluster_std=0.6,random_state=0)


# Step 2: Creating the KNN function
Pseudocode below to help out! Note: **IT IS NOT ACTUAL CODE**

In [27]:
# define train test split
#X_train
#X_test
#y_train
#y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train


array([[ 1.49493180e+00,  3.85848832e+00],
       [ 1.69747910e+00,  8.66123282e-01],
       [-1.50372568e+00,  1.92385320e+00],
       [ 2.22707373e+00,  1.26296996e+00],
       [ 2.74666646e+00,  1.54543482e+00],
       [ 3.35320909e+00,  1.69958043e+00],
       [ 1.13078931e+00,  9.35620856e-01],
       [-1.31454942e+00,  6.83904013e+00],
       [ 7.93137001e-03,  4.17614316e+00],
       [-1.18652985e+00,  2.78427720e+00],
       [ 1.48859977e+00,  6.51633844e-01],
       [ 7.89338559e-01,  4.33748653e+00],
       [ 1.39263752e+00,  9.28962707e-01],
       [-7.85412206e-01,  8.45312331e+00],
       [-5.39428614e-01,  7.45631776e+00],
       [ 5.59529363e-01,  4.21400660e+00],
       [ 2.35151259e+00,  8.28001297e-01],
       [ 7.15177948e-01,  5.41334556e+00],
       [ 1.20212540e+00,  3.64414685e+00],
       [ 1.97553917e+00,  7.18989132e-01],
       [-1.79351372e+00,  7.58086944e+00],
       [ 2.72396035e-01,  5.46996004e+00],
       [ 1.07627418e+00,  4.68480619e+00],
       [ 1.

In [97]:
def get_eucledian_distance(v1,v2):
    distance=0
    for x in range(len(v1)-1):
        distance=distance+(v1[x]-v2[x])**2
    #distance=(v1-v2)**2
    distance=np.sqrt(distance)

 

    return distance

In [98]:
def get_all_distances(X_train,y_train):
    all_distances=[]
    
    
    for x in range(len(X_train)):
        distance=get_eucledian_distance(X_train[x][:-1],y_train)
        all_distances.append((X_train[x],distance))
        #all_distances.append(distance)
    all_distances.sort(key=lambda a: a[1])
    return all_distances
    #all_distances.sort()
    #return all_distances

    

In [99]:
get_all_distances(X_train,y_train)

[(array([1.4949318 , 3.85848832]), 0.0),
 (array([1.6974791 , 0.86612328]), 0.0),
 (array([-1.50372568,  1.9238532 ]), 0.0),
 (array([2.22707373, 1.26296996]), 0.0),
 (array([2.74666646, 1.54543482]), 0.0),
 (array([3.35320909, 1.69958043]), 0.0),
 (array([1.13078931, 0.93562086]), 0.0),
 (array([-1.31454942,  6.83904013]), 0.0),
 (array([0.00793137, 4.17614316]), 0.0),
 (array([-1.18652985,  2.7842772 ]), 0.0),
 (array([1.48859977, 0.65163384]), 0.0),
 (array([0.78933856, 4.33748653]), 0.0),
 (array([1.39263752, 0.92896271]), 0.0),
 (array([-0.78541221,  8.45312331]), 0.0),
 (array([-0.53942861,  7.45631776]), 0.0),
 (array([0.55952936, 4.2140066 ]), 0.0),
 (array([2.35151259, 0.8280013 ]), 0.0),
 (array([0.71517795, 5.41334556]), 0.0),
 (array([1.2021254 , 3.64414685]), 0.0),
 (array([1.97553917, 0.71898913]), 0.0),
 (array([-1.79351372,  7.58086944]), 0.0),
 (array([0.27239604, 5.46996004]), 0.0),
 (array([1.07627418, 4.68480619]), 0.0),
 (array([1.84287117, 0.07269288]), 0.0),
 (ar

In [105]:
def select_neighbours(X_train,y_train, k):
    neigh=[]
    for i in range(k):
        neigh.append(get_all_distances(X_train,y_train)[i][0])
    



    #sl = sort(dist_list, by: distance)


    #return 0 to k elements from list (slice)
    return neigh
    

In [107]:

select_neighbours(X_train,y_train,3)

[array([1.4949318 , 3.85848832]),
 array([1.6974791 , 0.86612328]),
 array([-1.50372568,  1.9238532 ])]

In [108]:
# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
	neighbors = select_neighbours(train, test_row, num_neighbors)
	output_values = [row[-1] for row in neighbors]
	prediction = max(set(output_values), key=output_values.count)
	return prediction
 

Expected 0, Got 0.


In [109]:
def predict(x_train,y_train,k):
    neighbors = select_neighbours(x_train,y_train,k)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

In [130]:
class KNN():
    def __init__(self,k):
        self.k=k
        #self.X_train=X_train
        #self.y_train=y_train
        #self.X_test=X_test
        #self.y_test=y_test
    def get_eucledian_distance(v1,v2):
        distance=0
        for x in range(len(v1)-1):
            distance=distance+(v1[x]-v2[x])**2
        distance=np.sqrt(distance)
        return distance
    
    def predict(self,X_train,y_train):
        all_distances=[]
        for x in range(len(X_train)):
            distance=get_eucledian_distance(X_train[x][:-1],y_train)
            all_distances.append((X_train[x],distance))
        all_distances.sort(key=lambda a: a[1])

        neighbors=[]
        for i in range(self.k):
            neighbors.append(all_distances[i][0])
        
        knn_class={}
        for i in range(len(neighbors)):
            result=neighbors[i][-1]
            if result in knn_class:
                knn_class[result]+=1
            else:
                knn_class[result]=1
        
        sorted_knn=sorted(knn_class.items(),key=lambda x:x[1],reverse=True)
        return sorted_knn
        #output_values = [row[-1] for row in neighbors]
        #prediction = max(set(output_values), key=output_values.count)
        #return prediction
 






In [131]:
knn=KNN(5)
knn.predict(X_train,y_train)

[(3.858488315204525, 1),
 (0.866123282176209, 1),
 (1.9238531999399506, 1),
 (1.2629699606232072, 1),
 (1.5454348151600295, 1)]

In [7]:
def KNN function (X_train, X_test, y_train, y_test, k)
    # given a vector, [2.5, 4.56, 2]
    # sample X_train : [2.5,4.56] = x1,x2
    # search with same index: y_train = 2
    
    # rember to do it for all X_test vectors
    # CASE: ONE SINGLE X_train vector:
    dist_list = []
    for vector in X_train:
        # result = my get_distance function
        dist_list.append(result, index)
    final_n = select_neighbours(dist_list, k)
    ypred = predict(final_n, y_train)
    score... --> using sklearn.metrics
    or comparing 1 by 1 (ypred - ytest)
    plot... do it inside in a separate function or outside. 
    
        

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=796ba814-f632-4502-b0d7-7e2f8f9e546d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>