In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor
from sklearn import neighbors

In [3]:
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): a blanket "ignore" also hides warnings that may matter;
# consider narrowing the filter by category or module.
from warnings import filterwarnings

filterwarnings(action="ignore")

In [4]:
# Data
# Load the Hitters data set and keep only rows without missing values.
df = pd.read_csv("Hitters.csv")
df = df.dropna()

# One-hot encode the three categorical columns. The dtype is pinned to float64
# so the indicator columns match the float64 feature columns built below,
# independent of the pandas version's default dummy dtype.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']], dtype='float64')

# Target is the salary; features are all remaining columns cast to float64
# plus a single indicator column per categorical variable.
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

### Model & Prediction

In [5]:
# Baseline: k-NN regressor with the library's default hyperparameters,
# fitted on the training split.
baseline_knn = KNeighborsRegressor()
knn_model = baseline_knn.fit(X_train, y_train)

In [6]:
# Predict the target for the held-out rows with the baseline model.
y_pred = knn_model.predict(X=X_test)

In [7]:
# Test-set RMSE of the baseline model (square root of the mean squared error).
baseline_mse = mean_squared_error(y_test, y_pred)
np.sqrt(baseline_mse)

426.6570764525201

### Model Tuning

In [13]:
# Exploratory sweep: refit k-NN for k = 1..10 and record the RMSE of each fit.
# NOTE(review): the scores are computed on the test split, so they illustrate
# how k affects the error rather than giving an unbiased basis for choosing k.
# The loop rebinds `knn_model` and `y_pred`; after it finishes they refer to
# the k = 10 fit.
RMSE = []

for k in range(1, 11):
    knn_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    print("k:", k, " RMSE value: ", rmse)

k: 1  RMSE value:  455.03925390751965
k: 2  RMSE value:  415.99629571490965
k: 3  RMSE value:  420.6765370082348
k: 4  RMSE value:  428.8564674588792
k: 5  RMSE value:  426.6570764525201
k: 6  RMSE value:  423.5071669008732
k: 7  RMSE value:  414.9361222421057
k: 8  RMSE value:  413.7094731463598
k: 9  RMSE value:  417.84419990871265
k: 10  RMSE value:  421.6252180741266


In [15]:
# GridSearchCV
# Candidate neighbor counts for the search: the integers 1 through 29.
knn_params = dict(n_neighbors=np.arange(1, 30))

In [16]:
# Unfitted k-NN regressor created with default hyperparameters.
knn = KNeighborsRegressor()

In [17]:
# 10-fold cross-validated grid search over the candidate neighbor counts,
# fitted on the training split only. No `scoring` argument is passed, so
# candidates are ranked by the estimator's own score method.
grid_search = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)
knn_cv_model = grid_search.fit(X_train, y_train)

In [18]:
# Show the hyperparameter combination selected by the cross-validated search.
knn_cv_model.best_params_

{'n_neighbors': 8}

### Final Model

In [19]:
# GridSearchCV (default refit=True) has already refit the best configuration
# on the full training data passed to its fit call, so reuse that estimator
# instead of fitting an identical model a second time.
knn_tuned = knn_cv_model.best_estimator_

In [20]:
# Predict the target for the held-out rows with the tuned model.
y_pred = knn_tuned.predict(X=X_test)

In [21]:
# Test-set RMSE of the tuned model (square root of the mean squared error).
tuned_mse = mean_squared_error(y_test, y_pred)
np.sqrt(tuned_mse)

413.7094731463598