In [16]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error 
from math import sqrt

In [17]:
nba = pd.read_csv("nba_2013.csv")
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [18]:
nba.isna().sum()  ##fg.,x3p.,x2p.efg.,ft.
       
       
for column in ['fg.','x3p.','x2p.','efg.','ft.']:
       nba[column].fillna(nba[column].mode()[0], inplace=True)
        
nba.isna().sum()
    

player          0
pos             0
age             0
bref_team_id    0
g               0
gs              0
mp              0
fg              0
fga             0
fg.             0
x3p             0
x3pa            0
x3p.            0
x2p             0
x2pa            0
x2p.            0
efg.            0
ft              0
fta             0
ft.             0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
season          0
season_end      0
dtype: int64

In [19]:
X = nba.iloc[:,4:28]
y = nba.iloc[:,28]

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size= 0.25, random_state = 355)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train)

[[ 1.18875881 -0.21189118  1.120928   ... -0.47993522  1.25321498
  -0.20216116]
 [ 0.91662289  1.68328461  1.62833809 ... -0.08463483  2.30271862
   1.41265333]
 [ 0.95549945  0.53264216  0.59340033 ... -0.38871205 -0.16765148
   0.54956283]
 ...
 [ 0.48898073 -0.41494573  0.35981286 ...  0.37148101 -0.08692043
   0.75837504]
 [ 0.83886977  0.60032701  0.57887096 ... -0.4495275  -0.23223632
   0.57740445]
 [ 0.72224009  1.54791491  1.33775081 ... -0.02381938  0.51048933
   0.59132527]]


In [21]:
regressor = KNeighborsRegressor()
regressor.fit(X_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [22]:
y_pred = regressor.predict(X_test)
y_pred

array([  19.4,  304.8,  799.8,  298.6,   84.6,   84.8, 1153.2,    3.4,
        181.2,   22.6,   93.6,   53.6,  652.8, 1257.8,    3.4,  603.6,
       1298.8,  134.2,  110.2,   76.6,  348.8,  600. ,  147.2, 1373.8,
       1238.6, 1012.2, 1204.4,  415.4,    8.8,  371.4,  719.4,  121.6,
        551.8,  340. ,  589. ,   37.2,  342.8, 1106.2,  837.2,  603.4,
        796.2,  114.6,  217.8,  333.2,  368.8,  113.2,  224.4,  401.6,
         63.4,   32. ,  172.8,  873.4,   60.6,  177. ,  992. ,  878.2,
       1221.6,  346.6,  441.8, 1274.8,  374. ,   12.8, 1318. ,  717. ,
        115.8,   96.6, 1089. ,  784.8,    8.8,  713.6,   62.4, 1230.6,
       1687.2,  305.8,  760.6,  591.6,  909.6,  555.6,  169.6, 1161.2,
       1685.4,  696.2, 1022. ,  765.4,   39. ,  180. , 1051. ,  296.4,
        900. ,  263. ,  224.4,  122.4, 1047.4,  165.2,  360.8, 1225.4,
        173. ,  168.4,   49.4,  369.4,   28.4,  291.6,  226.8,   80.8,
        761.6, 1237.2, 1208.2,  342.8,  784. ,  876.6,   88.4,  403.4,
      

In [23]:

    error = sqrt(mean_squared_error(y_test,y_pred)) #calculate rmse
    print(error)

78.69179549156284


In [25]:
rmse_val = [] #to store rmse values for different k
for K in range(20):
    K = K+1
    model = KNeighborsRegressor(n_neighbors = K)

    model.fit(X_train, y_train)  #fit the model
    pred=model.predict(X_test) #make prediction on test set
    error = sqrt(mean_squared_error(y_test,y_pred)) #calculate rmse
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 78.69179549156284
RMSE value for k=  2 is: 78.69179549156284
RMSE value for k=  3 is: 78.69179549156284
RMSE value for k=  4 is: 78.69179549156284
RMSE value for k=  5 is: 78.69179549156284
RMSE value for k=  6 is: 78.69179549156284
RMSE value for k=  7 is: 78.69179549156284
RMSE value for k=  8 is: 78.69179549156284
RMSE value for k=  9 is: 78.69179549156284
RMSE value for k=  10 is: 78.69179549156284
RMSE value for k=  11 is: 78.69179549156284
RMSE value for k=  12 is: 78.69179549156284
RMSE value for k=  13 is: 78.69179549156284
RMSE value for k=  14 is: 78.69179549156284
RMSE value for k=  15 is: 78.69179549156284
RMSE value for k=  16 is: 78.69179549156284
RMSE value for k=  17 is: 78.69179549156284
RMSE value for k=  18 is: 78.69179549156284
RMSE value for k=  19 is: 78.69179549156284
RMSE value for k=  20 is: 78.69179549156284


In [28]:
from sklearn.model_selection import GridSearchCV
params = {'n_neighbors':[2,3,4,5,6,7,8,9]}

knn = KNeighborsRegressor()

model = GridSearchCV(knn, params, cv=5)
model.fit(X_train,y_train)
model.best_params_

{'n_neighbors': 4}

In [29]:

    model = KNeighborsRegressor(n_neighbors = 4)

    model.fit(X_train, y_train)  #fit the model
    pred=model.predict(X_test) #make prediction on test set
    error = sqrt(mean_squared_error(y_test,y_pred)) #calculate rmse
    print(error)

78.69179549156284
