In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data. Later cells treat the last column as the regression
# target and every preceding column as a feature.
train = pd.read_csv('../data/train.csv')

In [3]:
def rmse(pred, actual):
    """Return the root-mean-squared error between predictions and observations.

    Parameters
    ----------
    pred : array-like
        Predicted target values.
    actual : array-like
        Observed target values, aligned element-wise with ``pred``.

    Returns
    -------
    float
        Square root of the mean squared error (a NumPy floating scalar).
    """
    # scikit-learn documents the signature as (y_true, y_pred). MSE is
    # symmetric, so the value is unchanged, but keeping the documented order
    # avoids surprises if the metric is ever swapped for an asymmetric one.
    return np.sqrt(mean_squared_error(actual, pred))

In [4]:
# Shared 5-fold splitter used by the manual cross-validation loops.
# NOTE(review): KFold does not shuffle unless shuffle=True is passed, so each
# fold is a contiguous range of rows. The recorded per-fold RMSEs differ sharply
# between the first two and the last three folds, which may reflect row ordering
# in train.csv -- consider KFold(n_splits=5, shuffle=True, random_state=<seed>)
# and re-running the notebook.
kfold = KFold(n_splits=5)

### Simple KNeighborsRegressor

In [6]:
# Baseline: score an untuned KNeighborsRegressor by RMSE on every KFold split.
knn = KNeighborsRegressor()
err = []

for train_idx, test_idx in kfold.split(train):
    # Carve out this fold's rows first, then separate the features (all but the
    # last column) from the target (the last column).
    train_rows = train.iloc[train_idx]
    test_rows = train.iloc[test_idx]
    train_x, train_y = train_rows.iloc[:, :-1], train_rows.iloc[:, -1]
    test_x, test_y = test_rows.iloc[:, :-1], test_rows.iloc[:, -1]

    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = knn.fit(train_x, train_y).predict(test_x)
    err.append(rmse(y_pred, test_y))

# Per-fold RMSE followed by the mean across folds.
print(err)
print(np.mean(err))

[98.71587979778246, 85.07751519659098, 246.19921948639916, 251.4279111882935, 246.4838796849063]
185.58088107079448


### Hyperparameter selection for KNeighborsRegressor

In [9]:
# Search grid: neighbourhood size, neighbour weighting scheme, and the
# Minkowski power parameter p.
param = {
    'n_neighbors': np.arange(5, 30, 5),  # 5, 10, 15, 20, 25
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3],
}

# Reuse the notebook's shared `kfold` splitter instead of a separate `cv=5`.
# For a regressor an integer cv is an unshuffled KFold, so the splits are the
# same as before, but tuning and evaluation can no longer drift apart if the
# splitter is changed.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (R^2 for regressors) rather than the RMSE reported
# elsewhere in this notebook -- consider scoring='neg_root_mean_squared_error'
# and re-running if RMSE is the selection criterion.
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid=param, cv=kfold)
grid_search.fit(train.iloc[:, :-1], train.iloc[:, -1])

# With the default refit=True, best_estimator_ has been refit on all rows
# passed to fit() using the winning parameter combination.
knn = grid_search.best_estimator_
best_param = grid_search.best_params_

print(best_param)

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}




In [12]:
# Cross-validated RMSE of the tuned configuration on the shared KFold splits.
err = []

for train_idx, test_idx in kfold.split(train):
    train_x = train.iloc[train_idx, :-1]
    test_x = train.iloc[test_idx, :-1]

    train_y = train.iloc[train_idx, -1]
    test_y = train.iloc[test_idx, -1]

    # Build the per-fold model from the grid-search result (`best_param`)
    # instead of re-typing the values by hand, so this cell cannot go stale if
    # the search is re-run. It gets its own name so that `knn` -- the best
    # estimator refit on the full training set by the grid search -- is not
    # overwritten by a model that has only seen one fold's training rows.
    fold_knn = KNeighborsRegressor(**best_param)
    fold_knn.fit(train_x, train_y)
    y_pred = fold_knn.predict(test_x)
    err.append(rmse(y_pred, test_y))

# Per-fold RMSE followed by the mean across folds.
print(err)
print(np.mean(err))

[78.67688342818211, 62.17543516280433, 216.26837258801183, 218.41305295559536, 212.19835017626386]
157.5464188621715


<h2>RMSE of KNeighborsRegressor on the test set</h2>

In [None]:
# Held-out evaluation: the last column of test.csv is used as the target and
# the preceding columns as features, mirroring how train.csv is sliced.
test = pd.read_csv('../data/test.csv')

# Score the grid search's best estimator, which (refit=True default) was refit
# on the entire training set, rather than relying on whatever model the name
# `knn` was last bound to by an earlier cell.
final_knn = grid_search.best_estimator_

# The label previously read "RF", but the model being scored is the KNN regressor.
print("RMSE for KNN: ", rmse(final_knn.predict(test.iloc[:, :-1]), test.iloc[:, -1]))

## Conclusion
<p>The optimal hyperparameters found by the grid search are:</p>
<ul>
    <li>n_neighbors = 5</li>
    <li>p = 1</li>
    <li>weights = 'distance'</li>
</ul>

<p>With these hyperparameters, the average 5-fold cross-validation RMSE on the training set is 157.546.</p>