In [1]:
import pandas as pd

In [31]:
stats = pd.read_csv('complete_stats.csv')
del stats['Unnamed: 0']
pd.isnull(stats).sum() # can see some % are None
stats = stats.fillna(0)

In [32]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [33]:
train = stats[stats['Year'] < 2021]
test = stats[stats['Year'] == 2021]

**Linear Regression (Ridge) Model**

In [34]:
# using linear Ridge regression first
from sklearn.linear_model import Ridge

reg = Ridge(alpha=0.1)
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [35]:
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=['Predictions'], index=test.index)

In [36]:
comb = pd.concat([test[['Player', 'Share']], predictions], axis=1)
comb.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,Predictions
641,Nikola Jokić,0.961,0.154306
8624,Joel Embiid,0.58,0.162713
3651,Stephen Curry,0.449,0.142386
9907,Giannis Antetokounmpo,0.345,0.207436
1389,Chris Paul,0.138,0.072293
10997,Luka Dončić,0.042,0.15143
7464,Damian Lillard,0.038,0.116303
3536,Julius Randle,0.02,0.088877
3531,Derrick Rose,0.01,0.033001
11358,Rudy Gobert,0.008,0.09535


In [37]:
comb = comb.sort_values('Share', ascending=False)
comb['Rk'] = list(range(1, comb.shape[0]+1))

In [38]:
comb = comb.sort_values('Predictions', ascending=False)
comb['Predicted_Rk'] = list(range(1, comb.shape[0]+1))

In [39]:
# gets average precision score for specified year
def find_ap(totStats):
  # get top 5 nba mvp winners
  actual = totStats.sort_values('Share', ascending=False).head(5)
  pred = totStats.sort_values('Predictions', ascending=False)
  ps = []
  found = 0
  seen = 1
  for idx, row in pred.iterrows():
    if row['Player'] in actual['Player'].values: 
      found += 1
      ps.append(found / seen)

    seen += 1
  return sum(ps) / len(ps)

Note: The error metric we are using here is the precision score we have defined above

In [40]:
find_ap(comb)

0.7636363636363636

In [41]:
def add_ranks(preds):
  preds = preds.sort_values('Predictions', ascending=False)
  preds['Predicted_Rk'] = list(range(1, preds.shape[0]+1))
  preds = preds.sort_values('Share', ascending=False)
  preds['Rk'] = list(range(1, preds.shape[0]+1))
  preds['Diff'] = preds['Rk'] - preds['Predicted_Rk'] # adding diff metric to see if rank is accurate
  return preds

In [81]:
def backtest(stats, model, years, predictors):
  aps = []
  all_predictions = []
  for year in years:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    model.fit(train[predictors],train["Share"])
    predictions = model.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["Predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share", 'Year']], predictions], axis=1)
    combination = add_ranks(combination)
    all_predictions.append(combination)
    aps.append(find_ap(combination))
  return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [82]:
years = list(range(1991,2022))
mean_ap, aps, all_preds = backtest(stats, reg, years[5:], predictors)

In [87]:
all_preds[all_preds['Year'] == 2021].sort_values('Predictions', ascending=False)

Unnamed: 0,Player,Share,Year,Predictions,Predicted_Rk,Rk,Diff
9907,Giannis Antetokounmpo,0.345,2021,0.199173,1,4,3
8624,Joel Embiid,0.580,2021,0.150510,2,2,0
641,Nikola Jokić,0.961,2021,0.143975,3,1,-2
10997,Luka Dončić,0.042,2021,0.142971,4,6,2
3651,Stephen Curry,0.449,2021,0.141596,5,3,-2
...,...,...,...,...,...,...,...
5588,Isaac Okoro,0.000,2021,-0.051739,536,185,-351
9915,P.J. Tucker,0.000,2021,-0.054016,537,184,-353
3862,Anžejs Pasečņiks,0.000,2021,-0.057780,538,183,-355
13476,Killian Hayes,0.000,2021,-0.058971,539,182,-357


In [None]:
# diagnoses of ceofficients
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

In [89]:
stat_ratios = stats[['PTS', 'AST', 'STL', 'BLK', '3P', 'Year']].groupby('Year').apply(lambda x: x/x.mean())
stats[['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = stat_ratios[['PTS', 'AST', 'STL', 'BLK', '3P']]
predictors += ['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [90]:
mean_ap, aps, all_preds = backtest(stats, reg, years[5:], predictors)
mean_ap

0.7208380973034985

In [91]:
# changing positions and teams to categorical variables
stats['NPos'] = stats['Pos'].astype('category').cat.codes
stats['NTm'] = stats['Tm'].astype('category').cat.codes

**Random Forest Model**

In [92]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)
mean_ap, aps, all_preds = backtest(stats, rf, years[28:], predictors)

In [98]:
all_preds[all_preds['Year'] == 2018].sort_values('Predictions', ascending=False).head(10)

Unnamed: 0,Player,Share,Year,Predictions,Predicted_Rk,Rk,Diff
