In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Iris.csv').drop('Id',axis=1)
data = data.sample(frac=1)
X = data.iloc[:,0:4].values
Y = data.iloc[:,4].values
X.shape,Y.shape

((150, 4), (150,))

In [3]:
X = (X-X.mean(axis=0))/X.std(axis=0)
X[:10]

array([[ 1.2803405 ,  0.10644536,  0.93335575,  1.1850097 ],
       [-1.50652052,  1.26346019, -1.56873522, -1.31297673],
       [ 0.18982966, -0.8191665 ,  0.76275864,  0.52764485],
       [ 0.55333328, -0.58776353,  0.76275864,  0.39617188],
       [ 0.67450115,  0.33784833,  0.42156442,  0.39617188],
       [ 0.55333328, -1.28197243,  0.64902723,  0.39617188],
       [ 0.4321654 , -0.35636057,  0.30783301,  0.13322594],
       [-1.02184904,  0.80065426, -1.2844067 , -1.31297673],
       [-0.53717756,  1.95766909, -1.39813811, -1.05003079],
       [-0.17367395, -0.35636057,  0.25096731,  0.13322594]])

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.2,stratify=Y)
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

((120, 4), (30, 4), (120,), (30,))

#### Euclidean  
$ d = \sqrt{ \sum_{i=0}^{k} \big(X_i - Y_i \big)^2 } $

#### Manhatten
$ d = \sum_{i=0}^{k} \big| X_i - Y_i \big| $

#### MinKowski
$ d = \bigg( \sum_{i=0}^{k} \big( \big|X_i - Y_i\big| \big)^q  \bigg)^\frac{1}{q} $

In [5]:
def euclidean(X1,X2):
    return np.sqrt(np.sum(np.square(X1-X2),axis=1))

def manhatten(X1,X2):
    return np.sum(np.abs(X1-X2),axis=1)

def minkowski(X1,X2,q):
    return np.power(np.sum(np.power(np.abs(X1-X2),2),axis=1),1/q)

In [6]:
X1,X2 = np.array([[2,5,6]]),np.array([[5,6,9]])
minkowski(X1,X2,4),manhatten(X1,X2),euclidean(X1,X2)

(array([2.08779763]), array([7]), array([4.35889894]))

In [7]:
from scipy.stats import mode
def predict(test,X,Y,k=5,distance='euclidean',q=4):
    """
        test: sample to test
        X : train data to find neighbors
        Y : labels of train
        k : no of neighbors to consider
        distance : choice of distance metric ( 'euclidean', 'manhatten', 'minkowski' ) 
        q : used for minkowski distance metric only
    """
    func = None
    if distance=='euclidean': func= euclidean
    elif distance=='manhatten': func=manhatten
    elif distance=='minkowski': func=lambda x,y: minkowski(x,y,q)
    else: print("Wrong Distance metric!!"); return None
    
    assert test.shape == (1,X.shape[1])
    y = func(X,test)
    ind = y.argsort()[:k]
    return mode(Y[ind]).mode[0]

In [8]:
predictions = []
for sample in X_test:
    sample = np.expand_dims(sample,axis=0) 
    predictions.append(predict(sample,X_train,Y_train,k=5,distance='euclidean'))
predictions = np.array(predictions)
print(f"Accuracy : {(predictions==Y_test).sum()/len(Y_test)}")

Accuracy : 1.0




In [9]:
predictions = []
for sample in X_test:
    sample = np.expand_dims(sample,axis=0) 
    predictions.append(predict(sample,X_train,Y_train,k=5,distance='manhatten'))
predictions = np.array(predictions)
print(f"Accuracy : {(predictions==Y_test).sum()/len(Y_test)}")

Accuracy : 1.0




In [10]:
predictions = []
for sample in X_test:
    sample = np.expand_dims(sample,axis=0) 
    predictions.append(predict(sample,X_train,Y_train,k=5,distance='minkowski',q=4))
predictions = np.array(predictions)
print(f"Accuracy : {(predictions==Y_test).sum()/len(Y_test)}")

Accuracy : 1.0


