In [1]:
import pandas as pd

In [3]:
stats = pd.read_csv('complete_stats.csv')
del stats['Unnamed: 0']
pd.isnull(stats).sum() # can see some % are None
stats = stats.fillna(0)

In [4]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [6]:
train = stats[stats['Year'] < 2021]
test = stats[stats['Year'] >= 2021]

**Linear Regression (Ridge) Model**

In [7]:
# using linear Ridge regression first
from sklearn.linear_model import Ridge

reg = Ridge(alpha=0.1)
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [8]:
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=['Predictions'], index=test.index)

In [9]:
comb = pd.concat([test[['Player', 'Share']], predictions], axis=1)
comb.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,Predictions
641,Nikola Jokić,0.961,0.154306
663,Nikola Jokić,0.875,0.195133
837,Joel Embiid,0.706,0.193757
11678,Giannis Antetokounmpo,0.595,0.224689
9018,Joel Embiid,0.58,0.162713
3843,Stephen Curry,0.449,0.142386
10338,Giannis Antetokounmpo,0.345,0.207436
907,Devin Booker,0.216,0.095222
11469,Luka Dončić,0.146,0.164077
1499,Chris Paul,0.138,0.072293


In [10]:
comb = comb.sort_values('Share', ascending=False)
comb['Rk'] = list(range(1, comb.shape[0]+1))

In [11]:
comb = comb.sort_values('Predictions', ascending=False)
comb['Predicted_Rk'] = list(range(1, comb.shape[0]+1))

In [12]:
# gets average precision score for specified year
def find_ap(totStats):
  # get top 5 nba mvp winners
  actual = totStats.sort_values('Share', ascending=False).head(5)
  pred = totStats.sort_values('Predictions', ascending=False)
  ps = []
  found = 0
  seen = 1
  for idx, row in pred.iterrows():
    if row['Player'] in actual['Player'].values: 
      found += 1
      ps.append(found / seen)

    seen += 1
  return sum(ps) / len(ps)

Note: The error metric we are using here is the precision score we have defined above

In [13]:
find_ap(comb)

0.9107142857142857

In [14]:
def add_ranks(preds):
  preds = preds.sort_values('Predictions', ascending=False)
  preds['Predicted_Rk'] = list(range(1, preds.shape[0]+1))
  preds = preds.sort_values('Share', ascending=False)
  preds['Rk'] = list(range(1, preds.shape[0]+1))
  preds['Diff'] = preds['Rk'] - preds['Predicted_Rk'] # adding diff metric to see if rank is accurate
  return preds

In [15]:
def backtest(stats, model, years, predictors):
  aps = []
  all_predictions = []
  for year in years:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    model.fit(train[predictors],train["Share"])
    predictions = model.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["Predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Share", 'Year']], predictions], axis=1)
    combination = add_ranks(combination)
    all_predictions.append(combination)
    aps.append(find_ap(combination))
  return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [18]:
years = list(range(1991,2023))
mean_ap, aps, all_preds = backtest(stats, reg, years[5:], predictors)

In [21]:
all_preds[all_preds['Year'] == 2022].sort_values('Predictions', ascending=False)

Unnamed: 0,Player,Share,Year,Predictions,Predicted_Rk,Rk,Diff
11678,Giannis Antetokounmpo,0.595,2022,0.219410,1,3,2
837,Joel Embiid,0.706,2022,0.190462,2,2,0
663,Nikola Jokić,0.875,2022,0.190365,3,1,-2
8241,LeBron James,0.001,2022,0.157828,4,12,8
11469,Luka Dončić,0.146,2022,0.157395,5,5,0
...,...,...,...,...,...,...,...
12512,Trayvon Palmer,0.000,2022,-0.057006,601,212,-389
11821,Wes Iwundu,0.000,2022,-0.066277,602,213,-389
1216,Zavier Simpson,0.000,2022,-0.066415,603,214,-389
12495,Derrick Walton,0.000,2022,-0.066490,604,215,-389


In [22]:
# diagnoses of ceofficients
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.087852,eFG%
18,0.03386,DRB
29,0.023198,W/L%
17,0.020993,ORB
10,0.016456,2P
21,0.01207,STL
22,0.010901,BLK
15,0.010414,FTA
20,0.007113,AST
12,0.007054,2P%


In [23]:
stat_ratios = stats[['PTS', 'AST', 'STL', 'BLK', '3P', 'Year']].groupby('Year').apply(lambda x: x/x.mean())
stats[['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = stat_ratios[['PTS', 'AST', 'STL', 'BLK', '3P']]
predictors += ['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [24]:
mean_ap, aps, all_preds = backtest(stats, reg, years[5:], predictors)
mean_ap

0.726619022474594

In [25]:
# changing positions and teams to categorical variables
stats['NPos'] = stats['Pos'].astype('category').cat.codes
stats['NTm'] = stats['Tm'].astype('category').cat.codes

**Random Forest Model**

In [35]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)
mean_ap, aps, all_preds = backtest(stats, rf, years[28:], predictors)

In [36]:
all_preds[all_preds['Year'] == 2022].sort_values('Predictions', ascending=False).head(10)

Unnamed: 0,Player,Share,Year,Predictions,Predicted_Rk,Rk,Diff
907,Devin Booker,0.216,2022,0.548503,1,4,3
837,Joel Embiid,0.706,2022,0.420495,2,2,0
12226,Ja Morant,0.01,2022,0.387262,3,7,4
663,Nikola Jokić,0.875,2022,0.382979,4,1,-3
11678,Giannis Antetokounmpo,0.595,2022,0.366661,5,3,-2
11469,Luka Dončić,0.146,2022,0.269472,6,5,-1
1179,Jayson Tatum,0.043,2022,0.209107,7,6,-1
6185,Kevin Durant,0.001,2022,0.200081,8,10,2
6398,Stephen Curry,0.004,2022,0.174114,9,8,-1
8241,LeBron James,0.001,2022,0.172289,10,11,1
