In [9]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from collections import Counter
#importing necessary libraries 

In [2]:
# Build a synthetic binary classification problem: 1000 rows, 5 features, all of them informative.
# random_state pins the generated data so that Restart & Run All reproduces the same dataset.
x, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_classes=2,
    random_state=42,
)

In [26]:
# Split into train and test sets; random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [5]:
#now we can begin to build the KNN model 

In [47]:
class KNN():
    '''
    Minimal k-nearest-neighbours classifier: euclidean distance + majority vote.

    Call fit() with the training data first, then predict() or score().
    '''

    def __init__(self):
        # Training data is attached by fit(); None marks the model as not fitted yet.
        self.x_train = None
        self.y_train = None

    def __distance(self, a, b):
        '''
        helper function to get euclidean distance between two points

        `a` may also be a 2-D array holding one point per row; in that case the
        distance from every row to `b` is returned as a 1-D array.
        '''
        diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
        # Reduce over the last (feature) axis so both the 1-D and 2-D cases work.
        return np.sqrt(np.sum(diff ** 2, axis=-1))

    def fit(self, x_train, y_train):
        '''
        Store the training data on the instance. No computation happens here;
        all the work is deferred to predict().

        x_train: array-like of training points, one row per point
        y_train: array-like of labels, one per training point

        Raises ValueError if the number of points and labels differ.
        '''
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        if len(x_train) != len(y_train):
            raise ValueError(
                "x_train and y_train must have the same length, got {} and {}".format(
                    len(x_train), len(y_train)
                )
            )

        self.x_train = x_train  # set the training features
        self.y_train = y_train  # set the training labels
        return

    def predict(self, x_test, n_neighbors=3):
        '''
        For each point in the testing array, this function will find the k nearest data points in the train
        array and assign the majority class label. It will return the list of predictions

        x_test: array-like of points to classify, one row per point
        n_neighbors: number of nearest training points that vote on each label

        Raises AttributeError if fit() has not been called, and ValueError if
        n_neighbors is not between 1 and the number of training points.
        '''
        if self.x_train is None:
            raise AttributeError("KNN model is not fitted yet; call fit() before predict()")

        n_train = len(self.x_train)
        if n_neighbors < 1 or n_neighbors > n_train:
            raise ValueError(
                "n_neighbors must be between 1 and {} (the number of training points), got {}".format(
                    n_train, n_neighbors
                )
            )

        predictions = []  # predicted label for each test point, in order

        for point in np.asarray(x_test):
            # Distances are measured against the data stored by fit(), never against
            # whatever happens to be called x_train in the surrounding namespace.
            distances = self.__distance(self.x_train, point)

            # A stable sort keeps equidistant neighbours in training-index order.
            nearest = np.argsort(distances, kind="stable")[:n_neighbors]
            labels = [self.y_train[index] for index in nearest]

            # Label with the most 'votes'; on a tied count the label seen first
            # (i.e. belonging to the closer neighbour) wins.
            winning_label = Counter(labels).most_common(1)[0][0]
            predictions.append(winning_label)

        return predictions

    def score(self, x_test, y_test, n_neighbors=3):
        '''
        Return the accuracy (ratio of correct guesses) of the model on x_test / y_test.

        x_test: array-like of points to classify
        y_test: true labels, one per test point
        n_neighbors: forwarded to predict()
        '''
        predicted_labels = self.predict(x_test, n_neighbors)

        correct = sum(
            1 for i in range(len(y_test)) if predicted_labels[i] == y_test[i]
        )

        return correct / len(y_test)
    
    
            
    
    
            
            
            
            
    

In [48]:
# Instantiate the custom classifier.
knn = KNN()

In [49]:
# Hand the training split to the model.
knn.fit(x_train, y_train)

In [50]:
# Predict a label for every test point from its 8 nearest training neighbours.
knn.predict(x_test, n_neighbors=8)

[0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0]

In [51]:
# Accuracy of the custom model on the held-out split with k=5.
knn.score(x_test, y_test, n_neighbors=5)

0.936

The custom KNN model achieved an accuracy of 93.6% on the test set. This custom model works very well!

Now let's see how this model compares to sklearn's implementation.

In [42]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# Reference model: sklearn's KNN classifier with k=5.
sk_knn = KNeighborsClassifier(n_neighbors=5)

In [45]:
# Fit the sklearn model on the same training split.
sk_knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [46]:
# Accuracy of the sklearn model on the same held-out split.
sk_knn.score(x_test, y_test)

0.936

We were able to achieve the same accuracy as sklearn's implementation. There are numerous ways we could improve upon our custom model. For example, we could use different distance metrics (rather than euclidean distance) to try to improve accuracy even further.