# Estimating baseball players' salaries with K-Nearest Neighbors (KNN), a non-linear machine learning method

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV #
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt 
from sklearn.preprocessing import scale # for standardization
from sklearn.preprocessing import StandardScaler # for standardization
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor##  library for KNN
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [5]:
 # Silence every warning for the remainder of the session.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [6]:
# Load the Hitters data set and discard every row that has a missing value.
df = pd.read_csv("Hitters.csv").dropna()

# One-hot encode the three categorical columns.
categorical_cols = ['League', 'Division', 'NewLeague']
dms = pd.get_dummies(df[categorical_cols])

# Dependent variable (target).
y = df["Salary"]

# Predictors that remain once the target and the raw categorical columns are removed.
X_ = df.drop(columns=['Salary', *categorical_cols])

# Full feature matrix: remaining predictors plus one indicator column per categorical variable.
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [7]:
# Preview the first five rows of the training features.
X_train.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
183,328,91,12,51,43,33,2,342,94,12,51,44,33,145,59,8,1,0,1
229,514,144,0,67,54,79,9,4739,1169,13,583,374,528,229,453,15,1,0,1
286,593,152,23,69,75,53,6,2765,686,133,369,384,321,315,10,6,0,1,0
102,233,49,2,41,23,18,8,1350,336,7,166,122,106,102,132,10,0,0,0
153,341,95,6,48,42,20,10,2964,808,81,379,428,221,158,4,5,1,1,1


## Model

In [8]:
# Fit a KNN regressor with scikit-learn's default hyperparameters on the training split.
# NOTE(review): the features are passed in unscaled; distance-based models are usually
# sensitive to feature scale, so standardizing first may be worth evaluating.
knn_model = KNeighborsRegressor().fit(X=X_train, y=y_train)

In [9]:
# Display the fitted estimator's repr.
knn_model

KNeighborsRegressor()

In [10]:
# Show the distance metric the model is configured with.
knn_model.metric

'minkowski'

In [11]:
dir(knn_model) # list every attribute and method name exposed by the fitted model object

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_algorithm_metric',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_fit',
 '_fit_X',
 '_fit_method',
 '_get_param_names',
 '_get_tags',
 '_kneighbors_reduce_func',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_tree',
 '_validate_data',
 '_y',
 'algorithm',
 'effective_metric_',
 'effective_metric_params_',
 'feature_names_in_',
 'fit',
 'get_params',
 'kneighbors',
 'kneighbors_graph',
 'leaf_size',
 'metric',
 'metric_params',
 'n_features_in_',
 'n_jobs',
 'n_neighbors',
 'n_s

In [12]:
# Peek at the first five test-set predictions.
knn_model.predict(X_test)[:5]

array([ 510.3334,  808.3334,  772.5   ,  125.5   , 1005.    ])

In [13]:
# Predict the dependent variable for every row of the test features.
y_pred = knn_model.predict(X=X_test)

In [14]:
# Test RMSE: square root of the mean squared error between actual and predicted values.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

426.6570764525201

Test RMSE: 426.6570764525201

## Model Tuning

In [15]:
# Test-set RMSE for each neighbor count tried below, in order k = 1..10.
RMSE = []

# Iterate over k = 1..10 directly instead of shifting a zero-based loop variable
# inside the body.
# NOTE(review): these errors are measured on the test split, so picking k from
# this list would leak test information; treat it as exploratory only.
for k in range(1, 11):
    # Loop-local names keep the baseline `knn_model` / `y_pred` from the earlier
    # cells intact, so re-running those cells still reports the default model.
    candidate_model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    candidate_pred = candidate_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, candidate_pred))
    RMSE.append(rmse)
    print("k=", k,"için RMSE değeri:", rmse)
    

k= 1 için RMSE değeri: 455.03925390751965
k= 2 için RMSE değeri: 415.99629571490965
k= 3 için RMSE değeri: 420.6765370082348
k= 4 için RMSE değeri: 428.8564674588792
k= 5 için RMSE değeri: 426.6570764525201
k= 6 için RMSE değeri: 423.5071669008732
k= 7 için RMSE değeri: 414.9361222421057
k= 8 için RMSE değeri: 413.7094731463598
k= 9 için RMSE değeri: 417.84419990871265
k= 10 için RMSE değeri: 421.6252180741266


In [16]:
# GridSearchCV search space: candidate neighbor counts 1 through 29.
knn_params = {"n_neighbors": np.arange(1, 30)}

In [17]:
# Unfitted KNN regressor used as the base estimator for the grid search.
knn=KNeighborsRegressor()

In [18]:
# 10-fold cross-validated grid search over the candidate neighbor counts.
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
knn_cv_model = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    cv=10,
).fit(X_train, y_train)

In [19]:
# Hyperparameter combination selected by the grid search.
knn_cv_model.best_params_

{'n_neighbors': 8}

In [24]:
# Final model: refit KNN on the training split with the neighbor count chosen by the grid search.
best_k = knn_cv_model.best_params_["n_neighbors"]
knn_tuned = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)

In [25]:
# Predict the test targets with the tuned model.
y_pred = knn_tuned.predict(X=X_test)

In [26]:
# Test RMSE of the tuned model.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

413.7094731463598