In [1]:
import pandas as pd


In [2]:
stats = pd.read_csv('player_mvp_stats.csv')

In [3]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,14692,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14693,14693,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14694,14694,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14695,14695,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


In [4]:
#clean the final data once more
del stats['Unnamed: 0']

In [6]:
#find nulls 
pd.isna(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [9]:
#check the percentage data - probably null because they didn't attempt
#can replace with zero
stats[pd.isna(stats['FT%'])][['Player', 'FTA']]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
14556,Mark McNamara,0.0
14584,Luke Zeller,0.0
14637,Myron Brown,0.0
14659,Malcolm Lee,0.0


In [10]:
stats = stats.fillna(0)

In [11]:
#training the models
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [14]:
#want to use the numeric columns
#need to remove Share because that is the prediction, Pts Max and Pts Won since those directly correlated to Share
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
        'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [15]:
#set up training and testing dataframes
train = stats[stats['Year'] < 2022]

In [16]:
test = stats[stats['Year'] == 2022]

In [17]:
#use ridge regression, skrinking coefficients to avoid overfitting
from sklearn.linear_model import Ridge

In [18]:
reg = Ridge(alpha=.1)

In [19]:
#fit the model to the training data (training data with the predictors list, generating a Share column in train)
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [20]:
#model is fit, generate predictions using the test data with the predictors list
predictions = reg.predict(test[predictors])

In [21]:
#turn the array into a df, the column name is predictions, the index is the same as test data
predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)

In [22]:
predictions

Unnamed: 0,predictions
648,0.012934
649,-0.028142
650,-0.006163
651,0.016564
652,-0.004820
...,...
12508,-0.019380
12509,-0.010196
12510,0.003810
12511,0.001162


In [24]:
#combine the two dataframes want the test Player and Share, and then predictions made, combined on column axis
combination = pd.concat([test[['Player', 'Share']], predictions], axis =1)

In [26]:
combination

Unnamed: 0,Player,Share,predictions
648,Aaron Gordon,0.0,0.012934
649,Austin Rivers,0.0,-0.028142
650,Bol Bol,0.0,-0.006163
651,Bones Hyland,0.0,0.016564
652,Bryn Forbes,0.0,-0.004820
...,...,...,...
12508,Micah Potter,0.0,-0.019380
12509,Rodney McGruder,0.0,-0.010196
12510,Saben Lee,0.0,0.003810
12511,Saddiq Bey,0.0,0.001162


In [28]:
#want to sort by share to see who won MVP 
combination.sort_values('Share', ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
663,Nikola Jokić,0.875,0.190365
837,Joel Embiid,0.706,0.190462
11678,Giannis Antetokounmpo,0.595,0.21941
907,Devin Booker,0.216,0.091309
11469,Luka Dončić,0.146,0.157395
1179,Jayson Tatum,0.043,0.095902
12226,Ja Morant,0.01,0.120508
6398,Stephen Curry,0.004,0.093138
905,Chris Paul,0.002,0.078329
8241,LeBron James,0.001,0.157828


In [29]:
#indentifying an error metric to see if algorithm did a good job
from sklearn.metrics import mean_squared_error as MSE

In [32]:
MSE(combination['Share'], combination['predictions'])
#mean difference between prediction and actual value is 0.0022
#MSE is not a good error metric since many players actually get 0 Share votes

0.0022402416025650695

In [35]:
#sort the data in order where the highest has the highest vote share 
combination = combination.sort_values('Share', ascending = False)
#Make a rank column to match the sorted data 
combination['Rk'] = list(range(1, combination.shape[0]+1))

In [36]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
663,Nikola Jokić,0.875,0.190365,1
837,Joel Embiid,0.706,0.190462,2
11678,Giannis Antetokounmpo,0.595,0.21941,3
907,Devin Booker,0.216,0.091309,4
11469,Luka Dončić,0.146,0.157395,5
1179,Jayson Tatum,0.043,0.095902,6
12226,Ja Morant,0.01,0.120508,7
6398,Stephen Curry,0.004,0.093138,8
905,Chris Paul,0.002,0.078329,9
8241,LeBron James,0.001,0.157828,10


In [37]:
#data now has Rk column based on the MVP voting 
combination = combination.sort_values('predictions', ascending = False)
combination['Predicted_Rk'] = list(range(1, combination.shape[0]+1))

In [39]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11678,Giannis Antetokounmpo,0.595,0.21941,3,1
837,Joel Embiid,0.706,0.190462,2,2
663,Nikola Jokić,0.875,0.190365,1,3
8241,LeBron James,0.001,0.157828,10,4
11469,Luka Dončić,0.146,0.157395,5,5
6185,Kevin Durant,0.001,0.140627,12,6
12226,Ja Morant,0.01,0.120508,7,7
11820,Trae Young,0.0,0.109246,289,8
8231,Anthony Davis,0.0,0.107306,112,9
836,James Harden,0.0,0.103584,393,10


In [40]:
combination.sort_values('Share', ascending = False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
663,Nikola Jokić,0.875,0.190365,1,3
837,Joel Embiid,0.706,0.190462,2,2
11678,Giannis Antetokounmpo,0.595,0.21941,3,1
907,Devin Booker,0.216,0.091309,4,17
11469,Luka Dončić,0.146,0.157395,5,5
1179,Jayson Tatum,0.043,0.095902,6,13
12226,Ja Morant,0.01,0.120508,7,7
6398,Stephen Curry,0.004,0.093138,8,15
905,Chris Paul,0.002,0.078329,9,21
3938,DeMar DeRozan,0.001,0.099241,11,11


In [42]:
#model didnt predict very well need to find a good error metric - find average precision
def find_ap(combination):
    actual = combination.sort_values('Share', ascending = False).head(5) #sort by share and take top 5
    predicted = combination.sort_values('predictions', ascending = False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            ps.append (found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [43]:
find_ap(combination)

0.8188235294117646

In [46]:
years = list(range(1991, 2023))

In [49]:
aps = []
all_predictions = []
for year in years[5:]:
    train = stats[stats['Year'] < year]
    test = stats[stats['Year'] == year]
#use this to make predictions from 1996 to 2022, able to get error metric for every single year
    reg.fit(train[predictors], train['Share']) #fit the model
    predictions = reg.predict(test[predictors]) #define predictions
    predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index) #make into df
    combination = pd.concat([test[['Player', 'Share']], predictions], axis = 1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [50]:
#that was backtesting, now want to get mean average precision
sum(aps) / len(aps)

0.7152712173135063

In [58]:
#model predicted the top players 0.71 well for all the data
#now sort the df by the share and add predictions and rank
def add_ranks(combination):
    combination = combination.sort_values('Share', ascending = False)
    combination['Rk'] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values('predictions', ascending = False)
    combination['Predicted_Rk'] = list(range(1, combination.shape[0]+1))
    combination['Diff'] = combination['Rk'] - combination['Predicted_Rk']
    return combination

In [59]:
add_ranks(all_predictions[1])[Rk <.sort_values('Diff', ascending = False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
13154,Shaquille O'Neal,0.006,0.198202,10,1,9
1710,Karl Malone,0.857,0.192318,1,2,-1
10976,Michael Jordan,0.832,0.167629,2,3,-1
12591,Hakeem Olajuwon,0.083,0.136357,7,4,3
10274,David Robinson,0.000,0.131089,36,5,31
...,...,...,...,...,...,...
13748,Eric Montross,0.000,-0.043824,164,437,-273
5171,Pervis Ellison,0.000,-0.047037,423,438,-15
10841,Popeye Jones,0.000,-0.048904,199,439,-240
10970,Dennis Rodman,0.000,-0.054609,191,440,-249


In [60]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats['Year'] < year]
        test = stats[stats['Year'] == year]
#use this to make predictions from 1996 to 2022, able to get error metric for every single year
        model.fit(train[predictors], train['Share']) #fit the model
        predictions = reg.predict(test[predictors]) #define predictions
        predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index) #make into df
        combination = pd.concat([test[['Player', 'Share']], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions )

In [62]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [63]:
mean_ap

0.7152712173135063

In [65]:
all_predictions[all_predictions['Rk'] <= 5].sort_values('Diff').head(10)
#view players who were misranked by the algorithm the most

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1334,Jason Kidd,0.712,0.02821,2,52,-50
8642,Glen Rice,0.117,0.03311,5,53,-48
5420,Steve Nash,0.839,0.0341,1,45,-44
8910,Peja Stojaković,0.228,0.03627,4,38,-34
13331,Joakim Noah,0.258,0.046968,4,37,-33
5438,Steve Nash,0.739,0.054129,1,34,-33
3849,Chauncey Billups,0.344,0.052696,5,35,-30
1499,Chris Paul,0.138,0.072293,5,33,-28
5453,Steve Nash,0.785,0.074421,2,21,-19
4912,Tim Hardaway,0.207,0.059984,4,20,-16


In [69]:
#diagnose algorithm by looking at coefficients of the regression
#determine which feature that the model finds important based on coefficients
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis = 1).sort_values(0, ascending = False)

Unnamed: 0,0,1
13,0.087852,eFG%
18,0.03386,DRB
29,0.023198,W/L%
17,0.020993,ORB
10,0.016456,2P
21,0.01207,STL
22,0.010901,BLK
15,0.010414,FTA
20,0.007113,AST
12,0.007054,2P%


In [70]:
#Adding more predictors  
stat_ratios = stats[['PTS', 'AST', 'STL', 'BLK', '3P','Year']].groupby('Year').apply(lambda x: x/x.mean())

In [71]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14692,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
14693,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
14694,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
14695,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [74]:
stats[['PTS_T', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = stat_ratios[['PTS', 'AST', 'STL', 'BLK', '3P']]

In [75]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3_PR,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,1.52576


In [76]:
predictors +=['PTS_T', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [77]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [78]:
mean_ap

0.726619022474594

In [83]:
stats['NPos'] = stats['Pos'].astype('category').cat.codes

In [84]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3_PR,3P_R,NPos
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587,0.508587,2
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279,4.577279,12
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0,0.0,2
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0,0.0,2
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576,1.52576,8


In [85]:
stats['NTm'] = stats['Tm'].astype('category').cat.codes

In [86]:
#using random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=50, random_state = 1, min_samples_split=5)

mean_ap, aps, all_preidcitons = backtest(stats, rf, years[28:], predictors)

In [87]:
mean_ap

0.7479028168624604

In [88]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [89]:
mean_ap

0.726619022474594

In [None]:
#more accuracy using random forest regression on the data