In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Silence every Python warning for the remainder of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# data-quality warnings; consider narrowing it to a specific category.
from warnings import filterwarnings

filterwarnings(action='ignore')

In [3]:
# Load the Hitters data set and discard every row containing a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])

# Target: the player's salary.
y = df['Salary']

# Features: every remaining non-categorical column cast to float, plus a single
# indicator column for each of the three categorical variables.
X_ = df.drop(columns=['Salary', 'League', 'Division', 'NewLeague']).astype('float64')
X = pd.concat(
    [X_, dms[['League_N', 'Division_W', 'NewLeague_N']]],
    axis="columns",
)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Baseline: a KNN regressor built with the library's default settings and
# fitted on the training split.
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [8]:
# Sweep k = 1..10: fit one KNN regressor per k on the training split, print its
# RMSE on the test split, and collect the scores in `rmseList`.
# NOTE(review): these scores are measured on the test set, so treat them as a
# diagnostic only; choosing k from them would leak test information.
rmseList = []

# Iterate over the k values directly instead of incrementing the loop variable
# inside the body.
for k in range(1, 11):
    # `knn_model` is rebound on every iteration; after the loop it holds the
    # model fitted with the last k.
    knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE value for k={k} value: {rmse}")
    rmseList.append(rmse)

RMSE value for k=1 value: 455.03925390751965
RMSE value for k=2 value: 415.99629571490965
RMSE value for k=3 value: 420.6765370082348
RMSE value for k=4 value: 428.8564674588792
RMSE value for k=5 value: 426.6570764525201
RMSE value for k=6 value: 423.5071669008732
RMSE value for k=7 value: 414.9361222421057
RMSE value for k=8 value: 413.7094731463598
RMSE value for k=9 value: 417.84419990871265
RMSE value for k=10 value: 421.6252180741266


In [10]:
# GridSearchCV search space: candidate neighbour counts 1 through 29
# (the upper bound of 30 is exclusive).
knn_params = dict(n_neighbors=np.arange(1, 30))

In [11]:
# Fresh, unfitted KNN regressor; this rebinds `knn_model`, replacing whatever
# fitted model the name referred to before.
knn_model = KNeighborsRegressor()

In [16]:
# Exhaustive search over `knn_params` with 10-fold cross-validation on the
# training split.
# NOTE(review): no `scoring` is passed, so the estimator's default score is
# used for model selection rather than RMSE. Confirm that is intended.
knn_cv_model = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    cv=10,
).fit(X_train, y_train)

In [17]:
# Display the hyperparameter combination selected by the grid search.
knn_cv_model.best_params_

{'n_neighbors': 8}

In [18]:
# Final model
# GridSearchCV (with its default refit=True) already refits the winning
# configuration on the full training data, so reuse that estimator instead of
# rebuilding one by hand. This avoids a redundant fit and keeps the final model
# in sync with the search even if more hyperparameters are added to the grid
# (the manual rebuild copied only `n_neighbors`).
knn_tuned_model = knn_cv_model.best_estimator_

In [19]:
# Predict the target for the held-out test features with the tuned model.
y_pred = knn_tuned_model.predict(X_test)

In [20]:
# Test-set RMSE of the tuned model: the square root of the mean squared error
# between the true and predicted targets. Shown as the cell's output.
np.sqrt(mean_squared_error(y_test, y_pred))

413.7094731463598