In [None]:
#--------------------KNN from scratch -----------------------------------------------

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
df = pd.read_csv('iris.csv')

le = LabelEncoder()
df['species_encoded'] = le.fit_transform(df['species'])
df.drop(columns = {'species'},inplace = True)

In [3]:
df.shape

(150, 5)

In [4]:
x = np.array(df.iloc[:,:4])
y = np.array(df['species_encoded'])

In [5]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size = 0.3)

k_list = [3,5,7,15,27,31]
acc_scores = {}

In [8]:
def eucliedian_distance(x1,x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def knn_predict(xtrain , ytrain , xtest , k):

    predictions = []
    for test_point in xtest:
        distances = [eucliedian_distance(test_point ,trainpoint) for trainpoint in xtrain]

        neighbors = np.argsort(distances)[:k]
        labels = [ ytrain[i] for i in neighbors]

        predicted_label = np.bincount(labels).argmax()
        predictions.append(predicted_label)

    return predictions


for k in k_list:
    ypred = knn_predict(xtrain,ytrain,xtest,k)
    acc = accuracy_score(ytest , ypred) * 100
    acc_scores[k] = acc

In [17]:
max_key = max(acc_scores , key =acc_scores.get)
max_value = acc_scores[max_key]

print(f'The optimal k value is {max_key} with Test accuracy {max_value}')

The optimal k value is 3 with Test accuracy 95.55555555555556


In [None]:
#------------------------ KNN using Sklearn ------------------------------------------------- 

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [22]:
lis = [3,5,7,15,27,31]
acc_score = {}

xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size = 0.3)


In [23]:
for i in lis:
    knn = KNeighborsClassifier(i)
    knn.fit(xtrain,ytrain)
    ypred = knn.predict(xtest)
    acc = accuracy_score(ytest,ypred) * 100
    acc_score[i] = acc
    

In [24]:
max_k = max(acc_score , key = acc_score.get)
max_v = acc_score[max_k]

print(f'Using Sklearn best optimal is {max_k} with accuracy {max_v}')

Using Sklearn best optimal is 15 with accuracy 97.77777777777777


In [None]:
#It is observed that building KNN from scratch gives accuracy of 95.5% whereas building knn from 
# Scikit learn gives accuracy of 97.7%