In [1]:
import pandas as pd 
# Load the 2013 NBA player statistics. The path is handed straight to pandas,
# which opens and closes the file itself and decodes it as UTF-8 by default,
# so the result does not depend on the platform's locale encoding the way a
# bare open(..., 'r') does.
nba = pd.read_csv("nba_2013.csv")

# The names of all the columns in the data.
print(nba.columns.values)

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']


In [2]:
### Flag every column that contains at least one missing value

missing_mask = nba.isnull()
missing_mask.any()

player          False
pos             False
age             False
bref_team_id    False
g               False
gs              False
mp              False
fg              False
fga             False
fg.              True
x3p             False
x3pa            False
x3p.             True
x2p             False
x2pa            False
x2p.             True
efg.             True
ft              False
fta             False
ft.              True
orb             False
drb             False
trb             False
ast             False
stl             False
blk             False
tov             False
pf              False
pts             False
season          False
season_end      False
dtype: bool

In [3]:
# Fill NaNs: replace each missing value with the mean of its own column.
#
# The filled columns are assigned back to `nba` instead of calling
# `nba[col].fillna(..., inplace=True)`. That form mutates a temporary Series
# selected from the frame, which pandas warns about and which leaves `nba`
# unchanged once Copy-on-Write is enabled.
nan_columns = ["fg.", "x2p.", "efg.", "x3p.", "ft."]

# `mean()` yields one value per column; `fillna` matches them by column label.
nba[nan_columns] = nba[nan_columns].fillna(nba[nan_columns].mean())

In [4]:
# Numeric columns that are rescaled below.
distance_columns = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
    'pts',
]
nba_numeric = nba[distance_columns]


def _min_max_scale(column):
    """Rescale one column linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Bring every numeric column onto the same [0, 1] range.
# NOTE(review): the min/max are taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
nba_normalized = nba_numeric.apply(_min_max_scale)

# Columns carried through without rescaling.
nba_category = nba[['player', 'bref_team_id', 'season']]

# Re-assemble the frame: unscaled columns first, scaled statistics after.
nba = pd.concat([nba_category, nba_normalized], axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# Predictor columns used to make the predictions.
feature_names = [
    'age', 'g', 'gs', 'mp',
    'fg', 'fga', 'fg.',
    'x3p', 'x3pa', 'x3p.',
    'x2p', 'x2pa', 'x2p.',
    'efg.',
    'ft', 'fta', 'ft.',
    'orb', 'drb', 'trb',
    'ast', 'stl', 'blk', 'tov', 'pf',
]
x_columns = nba[feature_names]

# Target column that the model has to predict.
y_column = nba["pts"]

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_columns,
    y_column,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Build k-nearest-neighbours models. A regressor is used because the target
# is a continuous value.

from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics

# Try k = 1..10 and report the mean squared error on the test split for each,
# to see which neighbour count gives the smallest error.

for k_value in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k_value)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print ("Mean Squared Error is:", mse, " for k_value:", k_value)

Mean Squared Error is: 0.001776736371000023  for k_value: 1
Mean Squared Error is: 0.001250043971145984  for k_value: 2
Mean Squared Error is: 0.0008851633940660284  for k_value: 3
Mean Squared Error is: 0.0009051285708615703  for k_value: 4
Mean Squared Error is: 0.0009538708765905599  for k_value: 5
Mean Squared Error is: 0.0010122022610499911  for k_value: 6
Mean Squared Error is: 0.00109706834236204  for k_value: 7
Mean Squared Error is: 0.0011085445637352268  for k_value: 8
Mean Squared Error is: 0.0011985856267422432  for k_value: 9
Mean Squared Error is: 0.0012486277348341143  for k_value: 10


In [24]:
## k = 5 is chosen as a compromise: its MSE is low, and although k = 3 gives the
## lowest MSE, a smaller k increases the chance of overfitting.

chosen_k = 5
knn = KNeighborsRegressor(n_neighbors=chosen_k)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
final_mse = metrics.mean_squared_error(y_test, y_pred)
print ("Mean Squared Error is:", final_mse)

Mean Squared Error is: 0.0009538708765905599


In [25]:
# Side-by-side table of the test-split targets and the model's predictions.
comparison_columns = {
    'Actual Points': y_test.tolist(),
    'Predicted Points': y_pred.tolist(),
}
Test_With_Predicted = pd.DataFrame(comparison_columns)

Test_With_Predicted

Unnamed: 0,Actual Points,Predicted Points
0,0.226379,0.222368
1,0.034323,0.029618
2,0.134979,0.134207
3,0.546471,0.602854
4,0.413035,0.338912
5,0.130351,0.149017
6,0.020825,0.022291
7,0.338990,0.306903
8,0.086772,0.075280
9,0.114925,0.120555
