# Comparison between kNN from Scratch and Sklearn

In [1]:
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
# Species name -> integer class code used while modelling.
plant_encode = {
    'Arctica': 0,
    'Harlequin': 1,
    'Carolinian': 2,
}
# Inverse lookup (code -> species name), derived so the two tables cannot drift apart.
plant_decode = {code: species for species, code in plant_encode.items()}

In [3]:
# Read the training split and replace the species names in `plant` with
# their integer codes from `plant_encode`.
train = (
    pd.read_excel('TrainingSet.xlsx', engine='openpyxl')
    .assign(plant=lambda frame: frame['plant'].map(plant_encode))
)
train.head()

Unnamed: 0,leaf.length,leaf.width,flower.length,flower.width,plant
0,5.4,3.7,1.5,0.2,0
1,4.8,3.4,1.6,0.2,0
2,4.8,3.0,1.4,0.1,0
3,4.3,3.0,1.1,0.1,0
4,5.8,4.0,1.2,0.2,0


In [4]:
# Read the labelled test split and encode `plant` the same way as the
# training data.
test = (
    pd.read_excel('TestSet-withLabels.xlsx', engine='openpyxl')
    .assign(plant=lambda frame: frame['plant'].map(plant_encode))
)
test.head()

Unnamed: 0,leaf.length,leaf.width,flower.length,flower.width,plant
0,4.4,2.9,1.4,0.2,0
1,4.6,3.1,1.5,0.2,0
2,4.6,3.4,1.4,0.3,0
3,4.7,3.2,1.3,0.2,0
4,4.9,3.0,1.4,0.2,0


# kNN from Scratch

In [5]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    The last element of ``row1`` is treated as the class label and is left
    out of the computation, so only the first ``len(row1) - 1`` positions
    of both rows are compared.

    Parameters
    ----------
    row1, row2 : iterable of numbers
        Rows laid out as ``[feature_0, ..., feature_k, label]`` (list,
        tuple, NumPy array or pandas Series). ``row2`` must provide at
        least ``len(row1) - 1`` values.

    Returns
    -------
    float
        Square root of the summed squared feature differences.

    Raises
    ------
    IndexError
        If ``row2`` has fewer than ``len(row1) - 1`` values.
    """
    # Materialise both rows as plain lists so the integer subscripts below
    # are always positional. Subscripting a pandas Series with an int looks
    # the key up as a *label* first; the positional fallback for
    # label-indexed Series is deprecated, and integer labels that are not
    # 0..n-1 would select the wrong element or raise KeyError.
    values1 = list(row1)
    values2 = list(row2)

    distance = 0.0
    for i in range(len(values1) - 1):
        distance += (values1[i] - values2[i]) ** 2
    return sqrt(distance)

In [6]:
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` rows of ``train`` closest to ``test_row``.

    Each row yielded by ``train.iterrows()`` is scored with
    ``euclidean_distance(test_row, row)``. The rows are then ranked by
    ascending distance; the sort is stable, so equally distant rows keep
    their order of appearance in ``train``.

    Parameters
    ----------
    train : object exposing ``iterrows()`` (e.g. a pandas DataFrame)
        Training rows.
    test_row : row to find neighbours for.
    num_neighbors : int
        How many of the closest rows to return.

    Returns
    -------
    list
        The closest training rows, nearest first.

    Raises
    ------
    IndexError
        If ``num_neighbors`` is larger than the number of training rows.
    """
    scored = [
        (candidate, euclidean_distance(test_row, candidate))
        for _, candidate in train.iterrows()
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    # Index explicitly (rather than slice) so that asking for more
    # neighbours than exist still raises IndexError.
    return [ranked[rank][0] for rank in range(num_neighbors)]

In [7]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The ``num_neighbors`` closest training rows are fetched with
    ``get_neighbors``; the last value of each of those rows is taken as its
    class label, and the most frequent label is returned.

    Parameters
    ----------
    train : training data, passed straight through to ``get_neighbors``.
    test_row : row to classify.
    num_neighbors : int
        Number of neighbours taking part in the vote.

    Returns
    -------
    The label with the highest count among the neighbours. When several
    labels share the highest count, the winner is whichever of them
    ``max`` meets first while iterating the set of labels.
    """
    neighbors = get_neighbors(train, test_row, num_neighbors)
    # The neighbours are pandas Series produced by ``train.iterrows()``.
    # ``row[-1]`` would be looked up as a *label* on such a Series (the
    # positional fallback is deprecated), so read the last position
    # explicitly with ``.iloc``.
    output_values = [row.iloc[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

In [8]:
def k_nearest_neighbor(train, test, num_neighbors):
    """Classify every row of ``test`` with k-nearest-neighbours on ``train``.

    Parameters
    ----------
    train : training data handed to ``predict_classification``.
    test : object exposing ``iterrows()`` (e.g. a pandas DataFrame)
        Rows to classify.
    num_neighbors : int
        Number of neighbours used for each vote.

    Returns
    -------
    list
        One predicted label per row of ``test``, in iteration order.
    """
    return [
        predict_classification(train, sample, num_neighbors)
        for _, sample in test.iterrows()
    ]

### knn = 3

In [9]:
%%time
# This is the "knn = 3" run and is later compared against sklearn's
# n_neighbors=3 model, so it must use 3 neighbours (it previously passed 1).
pred3 = k_nearest_neighbor(train, test, 3)
pred3 = np.asarray(pred3)

Wall time: 259 ms


### knn = 5

In [10]:
%%time
# Scratch kNN with 5 neighbours; the list of labels is converted to an
# ndarray in the same step.
pred5 = np.asarray(k_nearest_neighbor(train, test, 5))

Wall time: 277 ms


### knn = 7

In [11]:
%%time
# Scratch kNN with 7 neighbours; the list of labels is converted to an
# ndarray in the same step.
pred7 = np.asarray(k_nearest_neighbor(train, test, 7))

Wall time: 285 ms


# kNN using Sklearn

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
# Split each frame into its feature columns (everything except `plant`)
# and its label column.
X_train, Y_train = train.drop(columns=['plant']), train['plant']
X_test, Y_test = test.drop(columns=['plant']), test['plant']

### knn = 3

In [14]:
%%time
# fit() returns the estimator itself, so construction and fitting are chained.
knn3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
spred3 = knn3.predict(X_test)

Wall time: 8.99 ms


### knn = 5

In [15]:
%%time
# fit() returns the estimator itself, so construction and fitting are chained.
knn5 = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
spred5 = knn5.predict(X_test)

Wall time: 14 ms


### knn = 7

In [16]:
%%time
# fit() returns the estimator itself, so construction and fitting are chained.
knn7 = KNeighborsClassifier(n_neighbors=7).fit(X_train, Y_train)
spred7 = knn7.predict(X_test)

Wall time: 19 ms


# Accuracy Comparison between Scratch KNN and Sklearn

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Agreement between the two implementations: fraction of test rows on which
# the scratch prediction equals the sklearn prediction, for each k.
print(
    "Accuracy between predictions made by Scratch KNN and Sklearn",
    "3NN: " + str(accuracy_score(spred3, pred3)),
    "5NN: " + str(accuracy_score(spred5, pred5)),
    "7NN: " + str(accuracy_score(spred7, pred7)),
    sep="\n",
)

Accuracy between predictions made by Scratch KNN and Sklearn
3NN: 1.0
5NN: 1.0
7NN: 1.0


In [19]:
# Accuracy of the scratch implementation against the true test labels.
print(
    "Accuracy between predictions made by Y_test and Scratch KNN",
    "3NN: " + str(accuracy_score(Y_test, pred3)),
    "5NN: " + str(accuracy_score(Y_test, pred5)),
    "7NN: " + str(accuracy_score(Y_test, pred7)),
    sep="\n",
)

Accuracy between predictions made by Y_test and Scratch KNN
3NN: 0.9666666666666667
5NN: 0.9666666666666667
7NN: 0.9666666666666667


In [20]:
# Accuracy of the sklearn classifiers against the true test labels.
print(
    "Accuracy between predictions made by Y_test and Sklearn",
    "3NN: " + str(accuracy_score(Y_test, spred3)),
    "5NN: " + str(accuracy_score(Y_test, spred5)),
    "7NN: " + str(accuracy_score(Y_test, spred7)),
    sep="\n",
)

Accuracy between predictions made by Y_test and Sklearn
3NN: 0.9666666666666667
5NN: 0.9666666666666667
7NN: 0.9666666666666667


# Conclusion

Both the Scratch and the Sklearn implementations give the same results. Changing the number of nearest neighbors (3, 5, 7) has no effect either: the results are the same in every case. 
Therefore, the final prediction stored is the one from the Scratch implementation with KNN = 5.

Sklearn was also faster: its execution time was approximately 20 times shorter than that of the Scratch implementation.

In [25]:
# Put the 5-NN scratch predictions in a one-column frame and translate the
# numeric codes back to species names via `plant_decode`.
result = pd.DataFrame({'Predictions': pred5}).replace(plant_decode)
result

Unnamed: 0,Predictions
0,Arctica
1,Arctica
2,Arctica
3,Arctica
4,Arctica
5,Arctica
6,Harlequin
7,Harlequin
8,Arctica
9,Arctica
