
In this assignment, students will be using the K-nearest neighbors algorithm to predict
how many points NBA players scored in the 2013-2014 season.
A look at the data
Before we dive into the algorithm, letâ€™s take a look at our data. Each row in the data
contains information on how a player performed in the 2013-2014 NBA season.
Download 'nba_2013.csv' file from this link:


In [None]:
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
import pandas as pd
with open("nba_2013.csv", 'r') as csvfile:
    nba = pd.read_csv(csvfile)

In [3]:
nba.head(50)

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013
5,Cole Aldrich,C,25,NYK,46,2,330,33,61,0.541,...,92,129,14,8,30,18,40,92,2013-2014,2013
6,LaMarcus Aldridge,PF,28,POR,69,69,2498,652,1423,0.458,...,599,765,178,63,68,123,147,1603,2013-2014,2013
7,Lavoy Allen,PF,24,TOT,65,2,1072,134,300,0.447,...,192,311,71,24,33,44,126,303,2013-2014,2013
8,Ray Allen,SG,38,MIA,73,9,1936,240,543,0.442,...,182,205,143,54,8,84,115,701,2013-2014,2013
9,Tony Allen,SG,32,MEM,55,28,1278,204,413,0.494,...,129,208,94,90,19,90,121,495,2013-2014,2013


In [4]:
nba.shape

(481, 31)

In [5]:
nba['player'].value_counts()

Khris Middleton          1
Jeff Ayres               1
Jonas Valanciunas        1
Shannon Brown            1
Dwight Howard            1
Jeremy Tyler             1
Scotty Hopson            1
Hamady N'Diaye           1
Miroslav Raduljica       1
Andre Roberson           1
DeAndre Liggins          1
DeMarre Carroll          1
Steve Novak              1
Jeremy Lamb              1
Jae Crowder              1
Nemanja Nedovic          1
Monta Ellis              1
Leandro Barbosa          1
Troy Daniels             1
Toure' Murry             1
Enes Kanter              1
Quincy Pondexter         1
Josh Smith               1
Carl Landry              1
Arnett Moultrie          1
Earl Watson              1
Solomon Hill             1
Mirza Teletovic          1
Mike Miller              1
Robert Sacre             1
                        ..
Sergey Karasev           1
Darius Morris            1
Giannis Antetokounmpo    1
Pablo Prigioni           1
Andrew Bogut             1
Jared Cunningham         1
J

In [6]:
nba['pos'].value_counts()

SG    109
SF     99
PF     96
C      90
PG     85
G       1
F       1
Name: pos, dtype: int64

In [7]:
nba['season'].value_counts()

2013-2014    481
Name: season, dtype: int64

In [8]:
nba['season_end'].value_counts()

2013    481
Name: season_end, dtype: int64

In [9]:
# Remove the "Player" column as the Player Name is unique and will lead to incorrect model if included
#Remove Season which is having same value for all rows
#Remove Season_end which is also having same value for all rows and is object type
nbadata = nba.iloc[:,1:29] 

In [10]:
# nbadata.drop('player', axis=1)
nbadata.head()


Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,SF,23,TOT,63,0,847,66,141,0.468,4,...,0.66,72,144,216,28,23,26,30,122,171
1,C,20,OKC,81,20,1197,93,185,0.503,0,...,0.581,142,190,332,43,40,57,71,203,265
2,PF,27,TOT,53,12,961,143,275,0.52,0,...,0.639,102,204,306,38,24,36,39,108,362
3,SG,28,ORL,73,73,2552,464,1011,0.459,128,...,0.815,32,230,262,248,35,3,146,136,1330
4,C,25,NOP,56,30,951,136,249,0.546,0,...,0.836,94,183,277,40,23,46,63,187,328


In [11]:
charcter_col = nbadata.dtypes.pipe(lambda x: x[x=='object']).index

In [12]:
charcter_col

Index(['pos', 'bref_team_id'], dtype='object')

In [13]:
#Here label_mapping will be having all unique values for each column stored in a dictionary
label_mapping= {}
for c in charcter_col:
    nbadata[c], label_mapping[c] = pd.factorize(nbadata[c])


nbadata.head()

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,0,23,0,63,0,847,66,141,0.468,4,...,0.66,72,144,216,28,23,26,30,122,171
1,1,20,1,81,20,1197,93,185,0.503,0,...,0.581,142,190,332,43,40,57,71,203,265
2,2,27,0,53,12,961,143,275,0.52,0,...,0.639,102,204,306,38,24,36,39,108,362
3,3,28,2,73,73,2552,464,1011,0.459,128,...,0.815,32,230,262,248,35,3,146,136,1330
4,1,25,3,56,30,951,136,249,0.546,0,...,0.836,94,183,277,40,23,46,63,187,328


In [14]:
nbadata.isnull().any() # identify if a column has Null Value or not

pos             False
age             False
bref_team_id    False
g               False
gs              False
mp              False
fg              False
fga             False
fg.              True
x3p             False
x3pa            False
x3p.             True
x2p             False
x2pa            False
x2p.             True
efg.             True
ft              False
fta             False
ft.              True
orb             False
drb             False
trb             False
ast             False
stl             False
blk             False
tov             False
pf              False
pts             False
dtype: bool

In [15]:
nbadata.fillna(nbadata.mean(), inplace=True)

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,0,23,0,63,0,847,66,141,0.468,4,...,0.660000,72,144,216,28,23,26,30,122,171
1,1,20,1,81,20,1197,93,185,0.503,0,...,0.581000,142,190,332,43,40,57,71,203,265
2,2,27,0,53,12,961,143,275,0.520,0,...,0.639000,102,204,306,38,24,36,39,108,362
3,3,28,2,73,73,2552,464,1011,0.459,128,...,0.815000,32,230,262,248,35,3,146,136,1330
4,1,25,3,56,30,951,136,249,0.546,0,...,0.836000,94,183,277,40,23,46,63,187,328
5,1,25,4,46,2,330,33,61,0.541,0,...,0.867000,37,92,129,14,8,30,18,40,92
6,2,28,5,69,69,2498,652,1423,0.458,3,...,0.822000,166,599,765,178,63,68,123,147,1603
7,2,24,0,65,2,1072,134,300,0.447,2,...,0.660000,119,192,311,71,24,33,44,126,303
8,3,38,6,73,9,1936,240,543,0.442,116,...,0.905000,23,182,205,143,54,8,84,115,701
9,3,32,7,55,28,1278,204,413,0.494,11,...,0.628000,79,129,208,94,90,19,90,121,495


In [16]:
nbadata.head(10)

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,0,23,0,63,0,847,66,141,0.468,4,...,0.66,72,144,216,28,23,26,30,122,171
1,1,20,1,81,20,1197,93,185,0.503,0,...,0.581,142,190,332,43,40,57,71,203,265
2,2,27,0,53,12,961,143,275,0.52,0,...,0.639,102,204,306,38,24,36,39,108,362
3,3,28,2,73,73,2552,464,1011,0.459,128,...,0.815,32,230,262,248,35,3,146,136,1330
4,1,25,3,56,30,951,136,249,0.546,0,...,0.836,94,183,277,40,23,46,63,187,328
5,1,25,4,46,2,330,33,61,0.541,0,...,0.867,37,92,129,14,8,30,18,40,92
6,2,28,5,69,69,2498,652,1423,0.458,3,...,0.822,166,599,765,178,63,68,123,147,1603
7,2,24,0,65,2,1072,134,300,0.447,2,...,0.66,119,192,311,71,24,33,44,126,303
8,3,38,6,73,9,1936,240,543,0.442,116,...,0.905,23,182,205,143,54,8,84,115,701
9,3,32,7,55,28,1278,204,413,0.494,11,...,0.628,79,129,208,94,90,19,90,121,495


In [18]:
X = nbadata.drop('pts', axis=1)
Y = nbadata['pts']

In [19]:
X.head()

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
0,0,23,0,63,0,847,66,141,0.468,4,...,53,0.66,72,144,216,28,23,26,30,122
1,1,20,1,81,20,1197,93,185,0.503,0,...,136,0.581,142,190,332,43,40,57,71,203
2,2,27,0,53,12,961,143,275,0.52,0,...,119,0.639,102,204,306,38,24,36,39,108
3,3,28,2,73,73,2552,464,1011,0.459,128,...,336,0.815,32,230,262,248,35,3,146,136
4,1,25,3,56,30,951,136,249,0.546,0,...,67,0.836,94,183,277,40,23,46,63,187


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)

In [21]:
for K in range(25):
    K_value = K+1
    neigh = KNeighborsClassifier(n_neighbors = K_value, weights='uniform', algorithm='auto')
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    print("Accuracy is ", accuracy_score(y_test,y_pred)*100,"% for K-Value:",K_value)

Accuracy is  4.8275862069 % for K-Value: 1
Accuracy is  3.44827586207 % for K-Value: 2
Accuracy is  4.13793103448 % for K-Value: 3
Accuracy is  4.13793103448 % for K-Value: 4
Accuracy is  2.06896551724 % for K-Value: 5
Accuracy is  2.75862068966 % for K-Value: 6
Accuracy is  4.13793103448 % for K-Value: 7
Accuracy is  4.13793103448 % for K-Value: 8
Accuracy is  4.13793103448 % for K-Value: 9
Accuracy is  4.13793103448 % for K-Value: 10
Accuracy is  4.13793103448 % for K-Value: 11
Accuracy is  2.75862068966 % for K-Value: 12
Accuracy is  3.44827586207 % for K-Value: 13
Accuracy is  3.44827586207 % for K-Value: 14
Accuracy is  3.44827586207 % for K-Value: 15
Accuracy is  3.44827586207 % for K-Value: 16
Accuracy is  3.44827586207 % for K-Value: 17
Accuracy is  3.44827586207 % for K-Value: 18
Accuracy is  3.44827586207 % for K-Value: 19
Accuracy is  2.75862068966 % for K-Value: 20
Accuracy is  2.75862068966 % for K-Value: 21
Accuracy is  2.75862068966 % for K-Value: 22
Accuracy is  2.75862

In [22]:
X_train[:1:]

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
328,4,31,2,68,68,2179,294,747,0.394,136,...,112,0.857,29,202,231,476,52,5,166,146


In [23]:
y_test[1]

265

In [24]:
y_pred[1]

490

In [25]:
neigh.predict(X_train[:10:])

array([ 810,  132,  961,  404,  181,   65,    6,   68,  810, 1417], dtype=int64)

In [26]:
y_train.head(10)

328     820
252     226
229    1594
127     339
188     225
268      98
324       6
260      76
80      925
450    1289
Name: pts, dtype: int64

In [27]:
X_train[:1:]

Unnamed: 0,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,x3p,...,fta,ft.,orb,drb,trb,ast,stl,blk,tov,pf
328,4,31,2,68,68,2179,294,747,0.394,136,...,112,0.857,29,202,231,476,52,5,166,146
