# NBA MVP Prediction Model (WIP)

Features to add in the future: All-Star teams, All-NBA teams, All-Defense teams, DPOY, Sixth man, scoring leaders, etc

## Import everything

In [2]:
import pandas as pd
import numpy as np
import requests
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Initialize the dictionary we're going to store all our player data in

In [111]:
nba_players_list = []
career_stats = {}
league_leaders = {}

Scrape the data and place them into the career_stats dictionary

In [112]:
abc = 'abcdefghijklmnopqrstuvwyz'
driver = webdriver.Chrome(options=chrome_options)
for letter in range(len(abc)):
    url = f'https://www.basketball-reference.com/players/{abc[letter]}/'
    
    driver.get(url)
    table_id = 'players'
    table = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, table_id)))
    links = table.find_elements(By.CSS_SELECTOR, 'strong > a')
    
    for link in links:
        href = link.get_attribute('href')
        
        # Open the link in a new tab
        driver.execute_script('window.open(arguments[0]);', href)
        
        # Switch to the newly opened tab
        driver.switch_to.window(driver.window_handles[-1])

        # scrape the tables off the player website
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table')))
        
        player_name = driver.find_element(By.TAG_NAME, 'h1')

        soup = BeautifulSoup(driver.page_source, 'lxml')
        table_ids = ['per_game', 'per_poss', 'advanced']
        data = {}

        for table_id in table_ids:
            table = soup.find('table', {'id': table_id})
            if table:
                df = pd.read_html(str(table))[0]
                data[table_id] = df
        career_stats[player_name.text] = data
        nba_players_list.append(player_name.text)
        
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        
driver.quit()

Do some data cleaning and dataframe merging

In [113]:
for player in career_stats:
    
    if 'ORtg' in career_stats[player]['per_poss']:
        career_stats[player]['per_poss']['NRtg'] = career_stats[player]['per_poss']['ORtg'] - \
        career_stats[player]['per_poss']['DRtg']
    
    columns_to_delete = ['Lg','Pos','G','GS','MP','FG','FGA','FG%','3P','3PA','3P%',
                        '2P','2PA','2P%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF',
                        'PTS']
    
    for column in columns_to_delete:
        if column in career_stats[player]['per_poss']:
            del career_stats[player]['per_poss'][column]
        if column in career_stats[player]['advanced']:
            del career_stats[player]['advanced'][column]
            
    for dfs in career_stats[player]:
        df = career_stats[player][dfs]
        career_stats[player][dfs].dropna(axis=1,how='all',inplace=True)
        career_stats[player][dfs].dropna(axis=0,inplace=True)
        
        if 'Tm' in career_stats[player][dfs]:
            career_stats[player][dfs] = df[~((df['Season'].duplicated()) & (df['Tm'] != 'TOT'))]
        elif 'Team' in career_stats[player][dfs]:
            career_stats[player][dfs] = df[~((df['Season'].duplicated()) & (df['Team'] != 'TOT'))]
    
    if 'Tm' in career_stats[player]['per_game']:
        career_stats[player]['cum'] = pd.merge(career_stats[player]['per_game'], career_stats[player]['per_poss'], 
                                          on=['Season','Tm','Age'],how='outer')
    elif 'Team' in career_stats[player]['per_game']:
        career_stats[player]['cum'] = pd.merge(career_stats[player]['per_game'], career_stats[player]['per_poss'], 
                                          on=['Season','Team','Age'],how='outer')

Delete unnecessary columns in the cumulative dataframe

In [114]:
for player in career_stats:
    
    if 'Team_y' in career_stats[player]['cum']:
        del career_stats[player]['cum']['Team_y']
        
    if 'PF' in career_stats[player]['cum']:
        del career_stats[player]['cum']['PF']
        
    if 'ORB' in career_stats[player]['cum']:
        del career_stats[player]['cum']['ORB']
        
    if 'DRB' in career_stats[player]['cum']:
        del career_stats[player]['cum']['DRB']
        
    career_stats[player]['cum'] = career_stats[player]['cum'].dropna(axis=1, how='all')

Trim the advanced dataframe and add it on the cumulative dataframe

In [115]:
for player in career_stats:
    df = career_stats[player]['advanced']
    dfc = career_stats[player]['cum']
    
    columns_to_delete = ['Team','Tm','Age','3PAr','FTr','ORB%','DRB%','STL%','BLK%','TOV%']
    
    for column in columns_to_delete:
        if column in career_stats[player]['advanced']:
            del career_stats[player]['advanced'][column]
            
    career_stats[player]['advanced'] = df.dropna(axis=1,how='all')
    
    career_stats[player]['cum'] = pd.merge(career_stats[player]['cum'], career_stats[player]['advanced'],
                                          on='Season',how='outer')
    

Remove DNP years due to injury or whatever else, rename players who have Tm instead of Team to Team, and change season column to ints (the year will represent the year the season ends, not begins)

In [116]:
for player in career_stats:
    career_stats[player]['cum'].dropna(axis=0,inplace=True)
    
    if 'Tm' in career_stats[player]['cum']:
        career_stats[player]['cum'].rename(columns={'Tm':'Team'},inplace=True)
        
    for index in career_stats[player]['cum'].index:
        season = career_stats[player]['cum']['Season'][index]
        career_stats[player]['cum'].at[index, 'Season'] = int(season[:2]+season[5:])

Scrape all award data, like MVP voting, All-NBA, All-Defense (we're not going to use All-Stars as a determining factor)

In [117]:
years = list(range(2004, 2024))
driver = webdriver.Chrome(options=chrome_options)
award_data = {}

for year in years:
    url = f'https://www.basketball-reference.com/awards/awards_{year}.html'

    driver.get(url)
    driver.implicitly_wait(5)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table_ids = ['mvp', 'leading_all_nba', 'leading_all_defense']
    data = {}

    for table_id in table_ids:
        table = soup.find('table', {'id': table_id})
        if table:
            df = pd.read_html(str(table))[0]
            data[table_id] = df

    award_data[year] = data

driver.quit()

Do a little bit of cleaning

In [118]:
for year in award_data:
    for df in award_data[year]:
        dfs = award_data[year][df]
        award_data[year][df].fillna(value=0, inplace=True)
        award_data[year][df] = dfs.rename(columns={'# Tm':'#Tm'})
        
        if '#Tm' in dfs:
            award_data[year][df] = dfs[dfs['#Tm']!='ORV']
            
        award_data[year][df].reset_index(drop=True, inplace=True)
            
    mvp = award_data[year]['mvp']
    
    award_data[year]['mvp']['Rank'] = mvp.index+1
    award_data[year]['mvp'] = mvp[mvp['Rank'] <= 10]

Remove all unnecessary columns

In [119]:
columns_to_keep_mvp = ['Rank','Player','Age','Share']
columns_to_keep_all_nba = ['#Tm','Player','Age','Share']

for year in award_data:
    dfm = award_data[year]['mvp'] 
    dfn = award_data[year]['leading_all_nba']
    dfd = award_data[year]['leading_all_defense']
    
    if award_data[year]['mvp'].columns.nlevels > 1:
        award_data[year]['mvp'].columns = award_data[year]['mvp'].columns.droplevel()

    if award_data[year]['leading_all_nba'].columns.nlevels > 1:
        award_data[year]['leading_all_nba'].columns = award_data[year]['leading_all_nba'].columns.droplevel()

    if award_data[year]['leading_all_defense'].columns.nlevels > 1:
        award_data[year]['leading_all_defense'].columns = award_data[year]['leading_all_defense'].columns.droplevel()
    
    award_data[year]['mvp'] = dfm[columns_to_keep_mvp]
    award_data[year]['leading_all_nba'] = dfn[columns_to_keep_all_nba]
    award_data[year]['leading_all_defense'] = dfd[columns_to_keep_all_nba]

Set up the career_stats dictionaries to contain the necessary columns so we can add the award info to them

In [120]:
for player in career_stats:
    career_stats[player]['cum']['All-NBA'] = None
    career_stats[player]['cum']['All-Defense'] = None
    career_stats[player]['cum']['MVP'] = None
    career_stats[player]['cum']['MVP Rank'] = None

In [121]:
for year in award_data:
    for award in award_data[year]:
        for index in award_data[year][award].index:
            player = award_data[year][award]['Player'][index]
            share = award_data[year][award]['Share'][index]
            
            if player in career_stats:
                if award == 'mvp':
                    for i in career_stats[player]['cum'].index:
                        if year == career_stats[player]['cum']['Season'][i]:
                            career_stats[player]['cum'].at[i, 'MVP'] = share
                            career_stats[player]['cum'].at[i, 'MVP Rank'] = index+1

                if award == 'leading_all_nba':
                    for i in career_stats[player]['cum'].index:
                        if year == career_stats[player]['cum']['Season'][i]:
                            career_stats[player]['cum'].at[i, 'All-NBA'] = share

                if award == 'leading_all_defense':
                    for i in career_stats[player]['cum'].index:
                        if year == career_stats[player]['cum']['Season'][i]:
                            career_stats[player]['cum'].at[i, 'All-Defense'] = share

Save the career_stats dictionary

In [123]:
with open('data.pkl', 'wb') as f:
    pickle.dump(career_stats, f)

Load the career_stats dictionary

In [30]:
with open('data.pkl', 'rb') as f:
    career_stats = pickle.load(f)

We'll make a dataframe that consists of every single player's stats. We'll use this one to train for MVP

In [31]:
all_nba_actives = pd.DataFrame()

for player in career_stats:
    all_nba_actives = pd.concat([all_nba_actives, career_stats[player]['cum']], axis=0)

New dataframe to predict stats

In [32]:
vorp = {}
columns_to_remove = ['Team','Lg','Pos','All-NBA','All-Defense','MVP','MVP Rank']

for player in career_stats:
    df = career_stats[player]['cum']
    
    common_columns = set(df.columns).intersection(columns_to_remove)
    if len(career_stats[player]['cum']) > 1:
        vorp[player] = career_stats[player]['cum'].drop(columns=common_columns)

In [33]:
for player in vorp:
    vorp[player].insert(2, 'Name', player)
    vorp[player]['Next VORP'] = vorp[player]['VORP'].shift(-1)
    vorp[player] = vorp[player].dropna()

Put everything in one dataframe

In [34]:
all_vorp = pd.DataFrame()

for player in vorp:
    all_vorp = pd.concat([all_vorp, vorp[player]], axis=0)

In [35]:
all_vorp.head()

Unnamed: 0,Season,Age,Name,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PTS,ORtg,DRtg,NRtg,PER,TS%,TRB%,AST%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Next VORP
0,2021,21.0,Precious Achiuwa,61.0,4.0,12.1,2.0,3.7,0.544,0.0,0.0,0.0,2.0,3.7,0.546,0.544,0.9,1.8,0.509,3.4,0.5,0.3,0.5,0.7,5.0,107.0,109.0,-2.0,14.2,0.55,16.1,6.1,19.5,0.3,1.0,1.3,0.085,-3.6,-0.5,-4.1,-0.4,-0.2
1,2022,22.0,Precious Achiuwa,73.0,28.0,23.6,3.6,8.3,0.439,0.8,2.1,0.359,2.9,6.1,0.468,0.486,1.1,1.8,0.595,6.5,1.1,0.5,0.6,1.2,9.1,105.0,110.0,-5.0,12.7,0.503,14.9,6.9,18.5,0.4,2.1,2.5,0.07,-2.0,-0.6,-2.6,-0.2,-0.1
0,2015,21.0,Steven Adams,70.0,67.0,25.3,3.1,5.7,0.544,0.0,0.0,0.0,3.1,5.7,0.547,0.544,1.5,2.9,0.502,7.5,0.9,0.5,1.2,1.4,7.7,108.0,104.0,4.0,14.1,0.549,15.8,5.5,14.3,1.9,2.2,4.1,0.111,-1.5,0.2,-1.3,0.3,1.1
1,2017,23.0,Steven Adams,80.0,80.0,29.9,4.7,8.2,0.571,0.0,0.0,0.0,4.7,8.2,0.572,0.571,2.0,3.2,0.611,7.7,1.1,1.1,1.0,1.8,11.3,114.0,107.0,7.0,16.5,0.589,14.2,5.4,16.2,3.3,3.1,6.5,0.13,-0.2,0.0,-0.2,1.1,2.0
2,2018,24.0,Steven Adams,76.0,76.0,32.7,5.9,9.4,0.629,0.0,0.0,0.0,5.9,9.3,0.631,0.629,2.1,3.8,0.559,9.0,1.2,1.2,1.0,1.7,13.9,125.0,107.0,18.0,20.6,0.63,15.3,5.5,16.7,6.7,3.0,9.7,0.187,1.7,-0.6,1.1,2.0,2.1


# Data cleaning finished
Now we train and test our model

In [36]:
rr = Ridge(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=15, direction="forward", cv=split, n_jobs=4)

In [37]:
removed_columns = ['Next VORP','Name','Season','Age']
selected_columns = all_vorp.columns[~all_vorp.columns.isin(removed_columns)]

In [38]:
columns = [col for col in all_vorp.columns if all_vorp[col].dtype == 'object']
columns.remove('Name')
all_vorp[columns] = all_vorp[columns].astype('float64')

In [39]:
scaler = MinMaxScaler()
all_vorp.loc[:, selected_columns] = scaler.fit_transform(all_vorp[selected_columns])
all_vorp.dropna(inplace=True)

In [40]:
all_vorp.reset_index(inplace=True)

In [41]:
sfs.fit(all_vorp[selected_columns], all_vorp['Next VORP'])

In [42]:
predictors = list(selected_columns[sfs.get_support()])

In [43]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(all_vorp['Season'].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data['Season'] < current_year]
        test = data[data['Season'] == current_year]
        
        model.fit(train[predictors], train['Next VORP'])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test['Next VORP'], preds], axis=1)
        combined.columns = ['actual','prediction']
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)

In [44]:
predictions = backtest(all_vorp, rr, predictors)

In [45]:
predictions

Unnamed: 0,actual,prediction
67,1.7,1.824861
132,1.4,0.06719
422,0.9,2.473898
587,0.8,0.396973
620,7.5,5.186376
707,1.7,-0.368495
732,2.0,3.174644
790,0.6,2.176471
854,1.7,2.61196
1018,1.7,1.108541


In [46]:
mean_squared_error(predictions['actual'], predictions['prediction'])

0.983697870112816

In [47]:
all_vorp['Next VORP'].describe()

count    2426.000000
mean        1.145012
std         1.673528
min        -1.300000
25%         0.000000
50%         0.600000
75%         1.700000
max        11.800000
Name: Next VORP, dtype: float64

In [48]:
def player_history(df):
    df = df.sort_values('Season')
    
    df['player_season'] = range(0, df.shape[0])
    df['VORP_corr'] = list(df[['player_season','VORP']].expanding().corr().loc[(slice(None), 'player_season'),'VORP'])
    df['VORP_corr'].fillna(1, inplace=True)
    
    df['VORP_diff'] = df['VORP'] / df['VORP'].shift(1)
    df['VORP_diff'].fillna(1, inplace=True)
    
    df['VORP_diff'][df['VORP_diff'] == np.inf] = 1
    
    return df

all_vorp = all_vorp.groupby('Name', group_keys=False).apply(player_history)

In [49]:
def group_averages(df):
    return df['VORP'] / df['VORP'].mean()

In [50]:
all_vorp['VORP_season'] = all_vorp.groupby('Season', group_keys=False).apply(group_averages)

new_predictors = predictors + ['player_season', 'VORP_corr', 'VORP_season', 'VORP_diff']

In [51]:
predictions = backtest(all_vorp, rr, new_predictors)

In [52]:
mean_squared_error(predictions['actual'], predictions['prediction'])

0.9982379394668776

In [53]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

DRtg            -1.556766
DWS             -1.357847
3P              -0.345529
VORP_diff       -0.300329
VORP_corr       -0.101736
player_season   -0.056947
3PA             -0.020892
G                0.101383
USG%             0.129146
FGA              0.158212
FT%              0.189961
2PA              0.190878
VORP_season      0.258846
AST%             0.323724
DBPM             0.458302
AST              0.558357
MP               0.833688
FT               1.265668
VORP             7.623296
dtype: float64

In [54]:
diff = predictions['actual'] - predictions['prediction']

In [55]:
merged = predictions.merge(all_vorp, left_index=True, right_index=True)
merged['diff'] = (predictions['actual'] - predictions['prediction']).abs()

In [56]:
merged[merged['Season']==2022].sort_values(ascending=False, by='prediction')

Unnamed: 0,actual,prediction,index,Season,Age,Name,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,PTS,ORtg,DRtg,NRtg,PER,TS%,TRB%,AST%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Next VORP,player_season,VORP_corr,VORP_diff,VORP_season,diff
1257,8.8,7.70133,6,2022.0,26.0,Nikola Jokić,0.9,0.902439,0.772152,0.903509,0.716667,0.583,0.245283,0.295455,0.337,0.882353,0.715789,0.652,0.62,0.5,0.533898,0.81,0.861635,0.675214,0.535714,0.243243,0.666667,0.746479,0.763158,0.384615,0.714286,1.0,0.665278,0.870079,0.743455,0.716763,0.779006,0.714286,0.772321,0.943182,0.954167,0.731884,1.0,0.855072,8.8,6,0.9296233,1.092593,4.259967,1.09867
49,5.4,6.305158,8,2022.0,27.0,Giannis Antetokounmpo,0.8125,0.817073,0.756962,0.903509,0.754167,0.553,0.207547,0.272727,0.293,0.901961,0.778947,0.616,0.582,0.813725,0.966102,0.722,0.72327,0.495726,0.392857,0.378378,0.578947,0.825352,0.745614,0.423077,0.690476,0.981818,0.626389,0.700787,0.553229,0.803468,0.690608,0.6,0.669643,0.918831,0.8875,0.65942,0.914384,0.681159,5.4,8,0.8383171,1.236842,3.393533,0.905158
644,6.4,5.838065,7,2022.0,27.0,Joel Embiid,0.825,0.829268,0.779747,0.859649,0.795833,0.499,0.264151,0.280303,0.371,0.823529,0.826316,0.529,0.534,0.941176,1.0,0.814,0.72956,0.358974,0.392857,0.405405,0.54386,0.84507,0.701754,0.384615,0.65873,0.958442,0.602778,0.728346,0.410122,0.869942,0.618785,0.657143,0.629464,0.871753,0.870833,0.550725,0.84589,0.615942,6.4,5,0.8512738,1.465517,3.06862,0.561935
575,6.6,5.327724,3,2022.0,22.0,Luka Dončić,0.7875,0.792683,0.820253,0.868421,0.879167,0.457,0.584906,0.666667,0.353,0.666667,0.663158,0.528,0.529,0.54902,0.635593,0.744,0.566038,0.74359,0.428571,0.162162,0.789474,0.783099,0.622807,0.461538,0.571429,0.8,0.540278,0.527559,0.802792,0.875723,0.392265,0.614286,0.433036,0.720779,0.8375,0.536232,0.811644,0.572464,6.6,3,0.8569,1.112676,2.852012,1.272276
2411,3.4,4.479096,3,2022.0,23.0,Trae Young,0.925,0.926829,0.807595,0.824561,0.825,0.46,0.584906,0.606061,0.382,0.617647,0.636842,0.512,0.536,0.647059,0.618644,0.904,0.226415,0.82906,0.321429,0.027027,0.701754,0.783099,0.701754,0.884615,0.555556,0.807792,0.584722,0.19685,0.815009,0.789017,0.679558,0.214286,0.540179,0.756494,0.866667,0.26087,0.708904,0.492754,3.4,3,0.9378044,1.36,2.454896,1.079096
632,3.9,4.106184,14,2022.0,33.0,Kevin Durant,0.6625,0.670732,0.865823,0.921053,0.825,0.518,0.396226,0.416667,0.383,0.823529,0.768421,0.568,0.57,0.666667,0.627119,0.91,0.459119,0.547009,0.321429,0.243243,0.614035,0.825352,0.719298,0.653846,0.619048,0.812987,0.627778,0.389764,0.507853,0.696532,0.535912,0.357143,0.46875,0.784091,0.8375,0.456522,0.777397,0.492754,3.9,13,0.01001059,1.446809,2.454896,0.206184
2141,5.1,4.013824,4,2022.0,23.0,Jayson Tatum,0.925,0.926829,0.832911,0.815789,0.8375,0.453,0.566038,0.651515,0.353,0.617647,0.621053,0.524,0.526,0.519608,0.525424,0.853,0.496855,0.376068,0.357143,0.162162,0.508772,0.740845,0.649123,0.423077,0.603175,0.714286,0.55,0.440945,0.368237,0.722543,0.458564,0.728571,0.522321,0.737013,0.75,0.449275,0.69863,0.492754,5.1,4,0.8749737,1.259259,2.454896,1.086176
336,5.8,3.866477,10,2022.0,32.0,Jimmy Butler,0.6875,0.695122,0.782278,0.614035,0.583333,0.48,0.09434,0.151515,0.233,0.637255,0.647368,0.52,0.496,0.676471,0.677966,0.87,0.36478,0.470085,0.571429,0.135135,0.368421,0.585915,0.745614,0.5,0.674603,0.761039,0.569444,0.354331,0.460733,0.560694,0.524862,0.485714,0.504464,0.832792,0.758333,0.543478,0.746575,0.434783,5.8,10,0.5759829,0.952381,2.166085,1.933523
1692,2.1,3.826742,5,2022.0,25.0,Dejounte Murray,0.825,0.829268,0.805063,0.736842,0.741667,0.462,0.264151,0.325758,0.327,0.686275,0.721053,0.504,0.5,0.284314,0.305085,0.794,0.515723,0.786325,0.714286,0.081081,0.45614,0.577465,0.649123,0.5,0.587302,0.727273,0.4875,0.456693,0.708551,0.583815,0.39779,0.557143,0.419643,0.704545,0.725,0.528986,0.715753,0.463768,2.1,4,0.8878239,1.828571,2.310491,1.726742
1207,4.0,3.766975,18,2022.0,37.0,LeBron James,0.675,0.682927,0.865823,1.0,0.8875,0.524,0.54717,0.606061,0.359,0.843137,0.715789,0.62,0.59,0.441176,0.508475,0.756,0.509434,0.529915,0.464286,0.297297,0.614035,0.83662,0.684211,0.615385,0.595238,0.828571,0.606944,0.429134,0.534031,0.728324,0.469613,0.4,0.428571,0.741883,0.858333,0.463768,0.794521,0.514493,4.0,18,-0.4470981,1.224138,2.5632,0.233025
