In this notebook, I reimplement the KNN classifier in Python, test it on the Iris dataset, and compare it to the KNN classifier in scikit-learn.

KNN procedure:
1. Initialize the value of K.
2. Calculate the distance between the test sample and each row of the training dataset.
3. Sort the calculated distances in ascending order.
4. Take the top K rows from the sorted array.
5. Find the most frequent class among these rows.
6. Return that class as the prediction.

In [498]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [499]:
# Load the Iris dataset bundled with scikit-learn and split the returned
# container into its feature matrix ('data') and its label vector ('target').
iris_data = datasets.load_iris()
iris_features, iris_labels = iris_data['data'], iris_data['target']

In [500]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    iris_features,
    iris_labels,
    test_size=0.2,
    random_state=42,
)

In [501]:
class KNN():
    """Brute-force k-nearest-neighbours classifier using the Minkowski distance.

    The training set is memorised by ``fit``. ``predict`` computes the distance
    from every query sample to every training row, keeps the ``n_neighbors``
    closest rows and returns the majority label among them.

    Parameters
    ----------
    n_neighbors : int
        Number of nearest training rows that vote on each prediction (>= 1).
    distance_metric : str
        Distance metric name. Only ``'minkowski'`` is supported.
    algorithm_neighbors : str
        Stored and shown by ``__str__`` only; the search is always exhaustive.
    weight : str
        Stored and shown by ``__str__`` only; every neighbour's vote counts
        equally.
    p : int or float
        Order of the Minkowski distance (> 0): 1 is Manhattan, 2 is Euclidean.

    Raises
    ------
    AssertionError
        If ``distance_metric`` is not a supported metric.
    ValueError
        If ``n_neighbors`` is smaller than 1 or ``p`` is not positive.
    """

    def __init__(self, n_neighbors = 5, distance_metric='minkowski', algorithm_neighbors='brute', weight='uniform', p=2):
        assert distance_metric in ['minkowski'], 'Metric not found! Please choose the metric from the list: minkowski'
        if n_neighbors < 1:
            raise ValueError(f'n_neighbors must be at least 1, got {n_neighbors}')
        if p <= 0:
            # 1/p is used as the final exponent, so p must be non-zero and positive.
            raise ValueError(f'p must be positive, got {p}')
        self.n_neighbors = n_neighbors
        self.distance_metric = distance_metric
        self.algorithm_neighbors = algorithm_neighbors
        self.weight = weight
        self.p = p

    def __str__(self):
        """Return a one-line summary of the classifier's hyper-parameters."""
        return (f'KNN(n_neighbors = {self.n_neighbors}, distance_metric = {self.distance_metric}, p = {self.p}, algorithm_neighbors = {self.algorithm_neighbors}, weight = {self.weight})')

    def fit(self, x, y):
        """Memorise the training samples ``x`` (2-D) and their labels ``y`` (1-D).

        Both inputs are converted to plain numpy arrays. Without this, a pandas
        DataFrame/Series would be re-aligned on its own column names or index
        when ``predict`` rebuilds the working frame, silently producing NaN
        features or mismatched labels.
        """
        self.x = np.asarray(x)
        self.y = np.asarray(y)

    def calculate_distance(self, df, test_data, features_name):
        """Return the ``n_neighbors`` rows of ``df`` closest to ``test_data``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training frame holding the columns listed in ``features_name``.
            It is not modified.
        test_data : array-like
            One query sample with one value per feature column.
        features_name : list of str
            Names of the feature columns used for the distance.

        Returns
        -------
        pandas.DataFrame
            Copy of the closest rows, sorted by ascending distance, with an
            extra ``'distance'`` column.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not ``'minkowski'``.
        """
        if self.distance_metric != 'minkowski':
            raise ValueError(f'Unsupported distance metric: {self.distance_metric!r}')
        # The absolute value is required: without it, odd orders (e.g. p=1)
        # let negative differences cancel positive ones.
        deltas = (df[features_name] - test_data).abs()
        distances = deltas.pow(self.p).sum(axis=1).pow(1 / self.p)
        # assign() works on a copy, so the caller's frame is left untouched.
        ranked = df.assign(distance=distances).sort_values(by=['distance'], ascending=True)
        return ranked[:self.n_neighbors]

    def create_label(self, df, unique_label):
        """Return the most frequent value of ``df['label']``.

        Works for any sortable label values (not only integers 0..k-1). Ties
        are resolved in favour of the smallest label, because ``np.unique``
        returns the labels sorted and ``np.argmax`` picks the first maximum.

        ``unique_label`` is unused and kept only so existing callers keep
        working.
        """
        values, counts = np.unique(df['label'].to_numpy(), return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, test_data, verbose=True):
        """Predict a label for every sample in ``test_data``.

        Parameters
        ----------
        test_data : iterable of array-like
            Query samples; each item must have one value per training feature.
        verbose : bool
            When True (the default), print the selected neighbours and the
            predicted label for every sample.

        Returns
        -------
        list
            Predicted labels, in the order of ``test_data``.
        """
        features_name = [f'Feature_{i}' for i in range(self.x.shape[1])]
        df = pd.DataFrame(data=self.x, columns=features_name)
        df['label'] = self.y
        unique_label = df['label'].unique()

        y_pred = []
        for sample in test_data:
            neighbors = self.calculate_distance(df, sample, features_name)
            if verbose:
                print(neighbors)
            prediction = self.create_label(neighbors, unique_label)
            if verbose:
                print(f'Prediction: {prediction}')
            y_pred.append(prediction)

        return y_pred

        

In [502]:
# Hyper-parameters for the hand-written KNN classifier.
# Neighbourhood size and how neighbours are searched / weighted:
n_neighbors = 3
algorithm_neighbors = 'brute'
weight = 'uniform'
# Distance definition: Minkowski distance of order p (p = 2 is Euclidean).
distance_metric = 'minkowski'
p = 2

In [503]:
# Build the classifier from the hyper-parameters chosen above, then let it
# memorise the training split.
knn_classifier = KNN(
    n_neighbors=n_neighbors,
    distance_metric=distance_metric,
    algorithm_neighbors=algorithm_neighbors,
    weight=weight,
    p=p,
)
knn_classifier.fit(x_train, y_train)

In [504]:
# Show the classifier's hyper-parameters as formatted by KNN.__str__.
print(knn_classifier)

KNN(n_neighbors = 3, distance_metric = minkowski, p = 2, algorithm_neighbors = brute, weight = uniform)


In [505]:
# Classify every held-out sample. predict() also prints the selected
# neighbours and the predicted label for each sample (see the output below).
y_pred = knn_classifier.predict(x_test)

    Feature_0  Feature_1  Feature_2  Feature_3  label  distance
79        6.1        2.9        4.7        1.4      1  0.223607
90        6.1        3.0        4.6        1.4      1  0.300000
39        6.2        2.9        4.3        1.3      1  0.435890
Prediction: 1
    Feature_0  Feature_1  Feature_2  Feature_3  label  distance
48        5.4        3.9        1.7        0.4      0  0.331662
14        5.4        3.7        1.5        0.2      0  0.387298
94        5.3        3.7        1.5        0.2      0  0.469042
Prediction: 0
    Feature_0  Feature_1  Feature_2  Feature_3  label  distance
24        7.7        2.8        6.7        2.0      2  0.412311
21        7.6        3.0        6.6        2.1      2  0.547723
64        7.7        3.0        6.1        2.3      2  0.894427
Prediction: 2
    Feature_0  Feature_1  Feature_2  Feature_3  label  distance
90        6.1        3.0        4.6        1.4      1  0.200000
79        6.1        2.9        4.7        1.4      1  0.24494

In [508]:
# Predicted labels, one per test sample, in the order of x_test.
print(y_pred)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [507]:
# Fraction of test samples whose predicted label equals the true label.
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
accuracy_score(y_test, y_pred)

1.0