In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the student data set and shuffle its rows. The fixed random_state makes
# the shuffle order identical on every Restart Kernel -> Run All.
data = pd.read_csv('student.csv')
data = data.sample(frac=1, random_state=42)
# The first two columns are taken as the features, the third as the target.
X,Y = data.iloc[:,0:2].values,data.iloc[:,2].values
X.shape,Y.shape

((1000, 2), (1000,))

In [3]:
# Standardise each feature column: subtract its mean and divide by its
# standard deviation (z-score).
# NOTE(review): the statistics are computed on the full data set, i.e. before
# the train/test split performed in a later cell, so the test rows influence
# the scaling.
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
X[:5]

array([[ 1.24092492,  1.57873575],
       [ 0.04709015, -0.12778421],
       [ 0.1134143 ,  0.00873738],
       [-0.08555816,  0.35004137],
       [-0.68247555, -0.46908821]])

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. random_state pins which rows land
# in each partition for a given X, Y, so re-running the cell repeats the split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

((800, 2), (200, 2), (800,), (200,))

#### Euclidean  
$ d = \sqrt{ \sum_{i=1}^{n} \big(X_i - Y_i \big)^2 } $

#### Manhattan
$ d = \sum_{i=1}^{n} \big| X_i - Y_i \big| $

#### Minkowski
$ d = \bigg( \sum_{i=1}^{n} \big|X_i - Y_i\big|^q  \bigg)^\frac{1}{q} $

Here $n$ is the number of features; $q = 1$ gives the Manhattan distance and $q = 2$ the Euclidean distance.

In [5]:
def euclidean(X1,X2):
    """Row-wise Euclidean (L2) distance between X1 and X2.

    The operands are subtracted (NumPy broadcasting applies), the squared
    differences are summed along axis 1 and the square root of each sum is
    returned, giving one distance per row.
    """
    delta = X1 - X2
    squared_sum = (delta * delta).sum(axis=1)
    return np.sqrt(squared_sum)

def manhatten(X1,X2):
    """Row-wise Manhattan (L1) distance between X1 and X2.

    The operands are subtracted (NumPy broadcasting applies) and the absolute
    differences are summed along axis 1, giving one distance per row.
    """
    abs_diff = np.absolute(X1 - X2)
    return abs_diff.sum(axis=1)

def minkowski(X1,X2,q):
    """Row-wise Minkowski distance of order q between X1 and X2.

    The absolute differences are raised to the power q, summed along axis 1,
    and the 1/q-th power of each sum is returned, giving one distance per row.
    """
    abs_diff = np.abs(X1 - X2)
    powered_sum = (abs_diff ** q).sum(axis=1)
    return powered_sum ** (1 / q)

In [6]:
def predict(test,X,Y,k=5,distance='euclidean',q=4):
    """
        Predict the target of a single sample as the mean target of its
        k nearest training samples.

        test: sample to test, shape (1, n_features) or (n_features,)
        X : train data to find neighbors, shape (n_samples, n_features)
        Y : target values, one per row of X
        k : number of neighbors to consider (>= 1); if k exceeds the number
            of rows in X, every row is used
        distance : choice of distance metric ( 'euclidean', 'manhatten', 'minkowski' )
        q : used for minkowski distance metric only

        Returns the mean of Y over the selected neighbors.
        Raises ValueError if k < 1, if test does not hold exactly one sample
        with X.shape[1] features, or if distance is not a known metric.
    """
    # Validate eagerly: a slice such as [:0] or [:-1] would otherwise silently
    # average over zero rows or over all-but-the-farthest rows.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    X = np.asarray(X)
    Y = np.asarray(Y)  # guarantees positional indexing even for a pandas Series
    test = np.asarray(test)
    if test.ndim == 1:
        test = test.reshape(1, -1)  # accept a flat feature vector as one sample
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if test.shape != (1, X.shape[1]):
        raise ValueError(
            f"test must have shape (1, {X.shape[1]}) or ({X.shape[1]},), got {test.shape}"
        )

    if distance == 'euclidean':
        func = euclidean
    elif distance == 'manhatten':
        func = manhatten
    elif distance == 'minkowski':
        func = lambda x, y: minkowski(x, y, q)
    else:
        # Fail loudly: returning None here used to surface only later, as a
        # confusing error when the predictions were used in arithmetic.
        raise ValueError(
            f"Wrong Distance metric!! Got {distance!r}; "
            "expected 'euclidean', 'manhatten' or 'minkowski'."
        )

    dists = func(X, test)            # one distance per training row
    nearest = np.argsort(dists)[:k]  # indices of the k closest training rows
    return np.mean(Y[nearest])

In [7]:
# Score k-NN (k=5, Euclidean distance) on the test split: predict every test
# row from the training data, then report mean absolute and mean squared error.
predictions = [
    predict(row[np.newaxis, :], X_train, Y_train, k=5, distance='euclidean')
    for row in X_test
]
mae = np.abs(predictions - Y_test).mean()
mse = ((predictions - Y_test) ** 2).mean()
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}")

MAE: 3.6950, MSE: 21.7670


In [8]:
# Score k-NN (k=5, Manhattan distance) on the test split: predict every test
# row from the training data, then report mean absolute and mean squared error.
predictions = [
    predict(row[np.newaxis, :], X_train, Y_train, k=5, distance='manhatten')
    for row in X_test
]
mae = np.abs(predictions - Y_test).mean()
mse = ((predictions - Y_test) ** 2).mean()
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}")

MAE: 3.7420, MSE: 22.5368


In [9]:
# Score k-NN (k=5, Minkowski distance with q=3) on the test split: predict
# every test row from the training data, then report mean absolute and mean
# squared error.
predictions = [
    predict(row[np.newaxis, :], X_train, Y_train, k=5, distance='minkowski', q=3)
    for row in X_test
]
mae = np.abs(predictions - Y_test).mean()
mse = ((predictions - Y_test) ** 2).mean()
print(f"MAE: {mae:.4f}, MSE: {mse:.4f}")

MAE: 3.6620, MSE: 22.0984
