In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Make the project folder the working directory so that relative paths used
# later in the notebook (e.g. 'complete_stats.csv') are looked up there
%cd /content/drive/MyDrive/nba-mvp/

Mounted at /content/drive
/content/drive/MyDrive/nba-mvp


In [50]:
import pandas as pd

stats = pd.read_csv('complete_stats.csv')
# Drop the 'Unnamed: 0' column (presumably an index saved with the CSV);
# errors='ignore' keeps the cell re-runnable when the column is absent
stats = stats.drop(columns=['Unnamed: 0'], errors='ignore')
# Per-column null counts, kept in a variable so they can be inspected;
# some of the percentage columns contain nulls
null_counts = pd.isnull(stats).sum()
stats = stats.fillna(0)

# Feature columns passed to the model
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

# Single boundary for the hold-out split. The previous conditions
# (`< 2015` for train, `>= 2016` for test) left the 2015 season in neither
# set; using one constant guarantees every row lands in exactly one of them.
TEST_START_YEAR = 2016
train = stats[stats['Year'] < TEST_START_YEAR]
test = stats[stats['Year'] >= TEST_START_YEAR]

# Seasons 1991 through 2022 inclusive
years = list(range(1991,2023))

In [38]:
def find_ap(totStats):
  """Average precision of the predicted ranking for one frame of players.

  The "relevant" players are those in the five rows with the highest
  'Share'. Rows are then walked in descending 'Predictions' order, and each
  time a relevant player is met the precision so far (hits / rows seen) is
  recorded. The result is the mean of those recorded precisions.

  Parameters:
    totStats: DataFrame with 'Player', 'Share' and 'Predictions' columns.

  Returns:
    The average precision as a float, or 0.0 when no relevant player is
    found (e.g. an empty frame), instead of raising ZeroDivisionError.
  """
  # get top 5 nba mvp winners
  actual = totStats.sort_values('Share', ascending=False).head(5)
  # a set gives constant-time membership checks inside the loop
  top_players = set(actual['Player'])
  pred = totStats.sort_values('Predictions', ascending=False)
  ps = []
  found = 0
  # iterate the column directly; building a Series per row via iterrows()
  # is unnecessary because only the player name is needed
  for seen, player in enumerate(pred['Player'], start=1):
    if player in top_players:
      found += 1
      ps.append(found / seen)

  if not ps:
    return 0.0
  return sum(ps) / len(ps)

def add_ranks(preds):
  """Attach predicted rank, actual rank and their difference to a frame.

  Returns a new frame ordered by descending 'Share' with three extra
  columns: 'Predicted_Rk' (1 = highest 'Predictions'), 'Rk' (1 = highest
  'Share') and 'Diff' (= Rk - Predicted_Rk). The input frame is not modified.
  """
  n_rows = preds.shape[0]
  positions = list(range(1, n_rows + 1))

  # rank by the model output first, then re-order by the true share
  by_prediction = preds.sort_values('Predictions', ascending=False)
  by_prediction = by_prediction.assign(Predicted_Rk=positions)

  by_share = by_prediction.sort_values('Share', ascending=False)
  by_share = by_share.assign(Rk=positions)

  # adding diff metric to see if rank is accurate
  return by_share.assign(Diff=lambda frame: frame['Rk'] - frame['Predicted_Rk'])

def backtest(stats, model, years, predictors):
  """Walk-forward evaluation of `model` over the given years.

  For every year in `years` the model is refit on all rows with an earlier
  'Year' and used to predict 'Share' for the rows of that year. Predictions
  are ranked with add_ranks and scored with find_ap.

  Returns a 3-tuple: (mean of the per-year scores, list of per-year scores,
  all ranked per-year prediction frames concatenated).
  """
  yearly_scores = []
  yearly_frames = []
  for season in years:
    is_past = stats["Year"] < season
    is_current = stats["Year"] == season
    history, holdout = stats[is_past], stats[is_current]

    model.fit(history[predictors], history["Share"])
    # keep the holdout index so predictions line up row-for-row in the concat
    predicted = pd.DataFrame(
        model.predict(holdout[predictors]),
        columns=["Predictions"],
        index=holdout.index,
    )
    ranked = add_ranks(
        pd.concat([holdout[["Player", "Share", 'Year']], predicted], axis=1)
    )
    yearly_frames.append(ranked)
    yearly_scores.append(find_ap(ranked))
  return sum(yearly_scores) / len(yearly_scores), yearly_scores, pd.concat(yearly_frames)

def get_top_five(year, preds=None, n=5):
  """Return the rows with the highest 'Predictions' for one year.

  Parameters:
    year: value matched against the 'Year' column.
    preds: frame of predictions to search; when omitted the notebook-level
      `all_preds` is used, which preserves the original one-argument call.
    n: number of rows to return (default 5).

  Returns:
    Up to `n` rows for `year`, ordered by descending 'Predictions'.
  """
  if preds is None:
    # fall back to the global produced by the backtest cell
    preds = all_preds
  season = preds[preds['Year'] == year]
  return season.sort_values('Predictions', ascending=False).head(n)

In [42]:
# Per-season ratios: each stat divided by the mean of that stat over all rows
# sharing the same 'Year'.
ratio_cols = ['PTS', 'AST', 'STL', 'BLK', '3P']
ratio_names = [col + '_R' for col in ratio_cols]
# transform() returns a frame indexed like `stats`, so the assignment below
# aligns row-for-row; groupby().apply() can return a group-keyed index on
# recent pandas versions and also divides the grouping column itself.
stat_ratios = stats.groupby('Year')[ratio_cols].transform(lambda x: x / x.mean())
stats[ratio_names] = stat_ratios[ratio_cols]
# Extend in place, skipping names already present, so re-running this cell
# does not add duplicate feature columns
predictors += [name for name in ratio_names if name not in predictors]

# changing positions and teams to categorical variables
# NOTE(review): NPos / NTm are created here but are not added to `predictors`
# in this cell - confirm whether they are meant to be model features.
stats['NPos'] = stats['Pos'].astype('category').cat.codes
stats['NTm'] = stats['Tm'].astype('category').cat.codes

In [60]:
from sklearn.ensemble import RandomForestRegressor

# Random forest used for the walk-forward backtest; random_state is fixed so
# repeated runs build the same forest.
rf = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=5,
    oob_score=True,
    random_state=50,
    n_jobs=-1,
)
# With years = 1991..2022, years[25:] is the seasons 2016 through 2022.
mean_ap, aps, all_preds = backtest(
    stats=stats,
    model=rf,
    years=years[25:],
    predictors=predictors,
)

In [61]:
# Show the five rows with the highest 'Predictions' among the 2022 backtest results
get_top_five(2022)

Unnamed: 0,Player,Share,Year,Predictions,Predicted_Rk,Rk,Diff
907,Devin Booker,0.216,2022,0.555175,1,4,3
837,Joel Embiid,0.706,2022,0.407142,2,2,0
12226,Ja Morant,0.01,2022,0.401553,3,7,4
663,Nikola Jokić,0.875,2022,0.36288,4,1,-3
11678,Giannis Antetokounmpo,0.595,2022,0.300648,5,3,-2


In [62]:
# Mean of the per-year average-precision scores returned by backtest
mean_ap

0.7597690591388071