K-nearest neighbors regression: predicting with the value of the single nearest neighbor instead of the average of the k nearest neighbors.

In [71]:
import numpy as np
import math
import pandas as ps
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

Implementation of K-nearest neighbors regression from scratch

In [72]:
def eucl_d(train_row,test_row,n):
  """Euclidean distance between two rows over their first ``n-1`` entries.

  Parameters
  ----------
  train_row, test_row : indexable sequences of numbers
      Only positions ``0 .. n-2`` are read from each.
  n : int
      Length of a stored training row. The callers in this file pass
      ``shape[1]`` of a matrix whose last column holds the target, so the
      final position is skipped on purpose.

  Returns
  -------
  float
      Square root of the summed squared differences.
  """
  squared_total=0
  for idx in range(n-1):
    gap=train_row[idx]-test_row[idx]
    squared_total+=gap**2
  return math.sqrt(squared_total)

def k_neighbors(train_data,test_data,k):
  """Return the ``k`` training rows closest to ``test_data``.

  Parameters
  ----------
  train_data : 2-D numpy array
      One stored row per sample. The row width is forwarded to ``eucl_d``,
      which ignores the last column when measuring distance.
  test_data : indexable sequence
      The query point (feature values only).
  k : int
      Number of rows to return; must not exceed the number of training rows.

  Returns
  -------
  numpy.ndarray
      The selected rows (all columns kept), ordered by ascending distance.
  """
  # Pair every training row with its distance to the query point.
  scored=[
    (train_data[row,:],eucl_d(train_data[row,:],test_data,train_data.shape[1]))
    for row in range(train_data.shape[0])
  ]
  # sorted() is stable, so rows at equal distance keep their original order.
  ranked=sorted(scored,key=lambda pair: pair[1])
  return np.array([ranked[pos][0] for pos in range(k)])

def predict_ave(train_data, test_row, k_value):
  """Predict a target as the mean of the ``k_value`` nearest neighbours' targets.

  Parameters
  ----------
  train_data : 2-D numpy array
      Training rows; the last column is read as the target value.
  test_row : indexable sequence
      Feature values of the query point.
  k_value : int
      Number of neighbours to average over.

  Returns
  -------
  The arithmetic mean of the last-column values of the selected neighbours.
  """
  nearest=k_neighbors(train_data,test_row,k_value)
  target_total=0
  for row in range(k_value):
    # The target sits in the last column of each neighbour row.
    target_total+=nearest[row,-1]
  return target_total/k_value

def predict_multi_ave(train_data,test_data,k):
  """Apply ``predict_ave`` to every row of ``test_data``.

  Parameters
  ----------
  train_data : 2-D numpy array
      Training rows, passed through unchanged to ``predict_ave``.
  test_data : 2-D numpy array
      One query point per row.
  k : int
      Number of neighbours used for each prediction.

  Returns
  -------
  numpy.ndarray
      Column vector of shape ``(test_data.shape[0], 1)`` with one prediction
      per query row.
  """
  n_queries=test_data.shape[0]
  predictions=np.zeros((n_queries,1))
  for row in range(n_queries):
    predictions[row,0]=predict_ave(train_data,test_data[row,:],k)
  return predictions

Implementation with "nearest neighbor's value"

In [73]:
def predict_bestfit(train_data,test_row,k_value):
  """Predict a target as the value of the closest of the ``k_value`` neighbours.

  The distances of the neighbours returned by ``k_neighbors`` are measured
  again and the neighbour at the minimum distance supplies the prediction.
  ``k_neighbors`` already returns its rows sorted by ascending distance, so
  the minimum is the distance of the first returned row; when several rows
  share that minimum, the one at the highest position is used.

  Parameters
  ----------
  train_data : 2-D numpy array
      Training rows; the last column is read as the target value.
  test_row : indexable sequence
      Feature values of the query point.
  k_value : int
      Number of neighbours retrieved before picking the closest.

  Returns
  -------
  The last-column value of the selected neighbour row.
  """
  nearest=k_neighbors(train_data,test_row,k_value)
  gaps=[eucl_d(nearest[row,:],test_row,nearest.shape[1]) for row in range(k_value)]
  smallest=min(gaps)
  # Scan from the back so that, among equal minima, the last position wins.
  chosen=0
  for pos in range(k_value-1,-1,-1):
    if gaps[pos]==smallest:
      chosen=pos
      break
  return nearest[chosen,-1]

def predict_multi_fit(train_data,test_data,k):
  """Apply ``predict_bestfit`` to every row of ``test_data``.

  Parameters
  ----------
  train_data : 2-D numpy array
      Training rows, passed through unchanged to ``predict_bestfit``.
  test_data : 2-D numpy array
      One query point per row.
  k : int
      Number of neighbours retrieved for each prediction.

  Returns
  -------
  numpy.ndarray
      Column vector of shape ``(test_data.shape[0], 1)`` with one prediction
      per query row.
  """
  n_queries=test_data.shape[0]
  predictions=np.zeros((n_queries,1))
  for row in range(n_queries):
    predictions[row,0]=predict_bestfit(train_data,test_data[row,:],k)
  return predictions

Testing the model on datasets




1. Diabetes dataset

In [74]:
from sklearn.datasets import load_diabetes
# Load the diabetes features/targets and hold out 30% of the samples for testing.
X, y = load_diabetes(return_X_y=True)
X_data, X_test, y_data, y_test = train_test_split(
  X, y, test_size=0.3, random_state=11
)
# Training matrix for the hand-written functions: feature columns first,
# target values in the last column.
Data = np.column_stack((X_data, y_data))

In [75]:
# Hand-written k-NN regression: average of the 20 nearest neighbours.
a = predict_multi_ave(train_data=Data, test_data=X_test, k=20)

# Reference predictions from scikit-learn's KNeighborsRegressor with the same k.
neigh = neighbors.KNeighborsRegressor(n_neighbors=20).fit(X_data, y_data)
b = neigh.predict(X_test)

# Discrepancy between the two sets of predictions (squared, then absolute).
print(mean_squared_error(a, b))
print(mean_absolute_error(a, b))

# A zero discrepancy confirms that the hand-written k-NN matches scikit-learn.

0.0
0.0


In [76]:
# Modified k-NN regression: each prediction is the target of the closest
# of the 20 retrieved neighbours.
mod = predict_multi_fit(train_data=Data, test_data=X_test, k=20)

In [80]:
# Mean absolute error of each predictor against the held-out diabetes targets.
# The ground truth is passed first to match mean_absolute_error(y_true, y_pred);
# the metric is symmetric, so the printed values are the same as before, but the
# call stays correct if it is later swapped for an asymmetric metric.
print('Knn regression error:',mean_absolute_error(y_test,a))
print('Knn sklearn regression error:',mean_absolute_error(y_test,b))
print('Modified Knn regression error:',mean_absolute_error(y_test,mod))

Knn regression error: 51.0093984962406
Knn sklearn regression error: 51.0093984962406
Modified Knn regression error: 63.41353383458647


2. Dataset generated with sklearn.datasets.make_regression

In [82]:
from sklearn.datasets import make_regression
# Generate a synthetic regression problem with 300 samples and 8 features.
# random_state pins the generator so that re-running the notebook rebuilds the
# same dataset and therefore the same error figures; without it every run
# produced different numbers.
A, c = make_regression(n_samples=300, n_features=8, random_state=11)
A_data, A_test,c_data,c_test = train_test_split(A,c,test_size=0.3,random_state=11)
# Training matrix for the hand-written functions: feature columns first,
# target values in the last column.
Aadata=np.column_stack((A_data,c_data))

# scikit-learn reference model with k=15
neighb=neighbors.KNeighborsRegressor(n_neighbors=15)
neighb.fit(A_data,c_data)
Knn1=neighb.predict(A_test)
# hand-written predictions with the same k: average of the neighbours, then
# value of the closest neighbour
Knn2=predict_multi_ave(Aadata,A_test,15)
ModKnn2=predict_multi_fit(Aadata,A_test,15)

In [84]:
# Mean absolute error of each predictor against the held-out synthetic targets.
# The ground truth is passed first to match mean_absolute_error(y_true, y_pred);
# the metric is symmetric, so the values are unaffected by the argument order,
# but the call stays correct if it is later swapped for an asymmetric metric.
print('Knn regression error:',mean_absolute_error(c_test,Knn2))
print('Knn sklearn regression error:',mean_absolute_error(c_test,Knn1))
print('Modified Knn regression error:',mean_absolute_error(c_test,ModKnn2))

Knn regression error: 38.61688624660365
Knn sklearn regression error: 38.61688624660365
Modified Knn regression error: 42.05494003971795
