In [71]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
stats = pd.read_csv("./files/player_mvp_stats.csv")

stats.head()

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,36,LAL,82,82,23.5,2.1,4.7,...,0.0,0.0,Los Angeles Lakers,67.0,15.0,0.817,0.0,100.8,92.3,8.41
1,1,Brian Shaw,SG,33,LAL,74,2,16.9,1.7,4.4,...,0.0,0.0,Los Angeles Lakers,67.0,15.0,0.817,0.0,100.8,92.3,8.41
2,2,Derek Fisher,PG,25,LAL,78,22,23.1,2.1,6.2,...,0.0,0.0,Los Angeles Lakers,67.0,15.0,0.817,0.0,100.8,92.3,8.41
3,3,Devean George,SF,22,LAL,49,1,7.0,1.1,2.9,...,0.0,0.0,Los Angeles Lakers,67.0,15.0,0.817,0.0,100.8,92.3,8.41
4,4,Glen Rice,SF,32,LAL,80,80,31.6,5.3,12.3,...,0.0,0.0,Los Angeles Lakers,67.0,15.0,0.817,0.0,100.8,92.3,8.41


In [3]:
del stats["Unnamed: 0"]

In [4]:
stats.isnull().sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          34
3P            0
3PA           0
3P%        1424
2P            0
2PA           0
2P%          66
eFG%         34
FT            0
FTA           0
FT%         363
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team       1235
W          1235
L          1235
W/L%       1235
GB         1235
PS/G       1235
PA/G       1235
SRS        1235
dtype: int64

In [5]:
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
6,John Salley,0.0
12,Travis Knight,0.0
17,Anthony Mason,0.0
22,Duane Causwell,0.0
28,Todd Fuller,0.0
...,...,...
10335,Michael Cage,0.0
10344,John Henson,0.0
10345,Johnny O'Bryant,0.0
10347,Kenyon Martin,0.0


In [6]:
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]]

Unnamed: 0,Player,FTA
25,Jamal Robinson,0.0
29,A.J. Bramlett,0.0
32,Benoit Benjamin,0.0
70,A.J. Guyton,0.0
80,Guy Rucker,0.0
...,...,...
10133,Jason Hart,0.0
10176,George King,0.0
10246,Luke Zeller,0.0
10296,Malcolm Lee,0.0


In [7]:
stats = stats.fillna(0)

In [8]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [9]:
#Removed the pts won, pts max, and share columns since they have high correlation with MVP title. This will overfit the algorithm

predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
        'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [10]:
train = stats[stats["Year"] < 2021]

In [11]:
test = stats[stats["Year"] == 2021]

In [12]:
#Initialized the Ridge module with a alpha value of .1

reg = Ridge(alpha=.1)

In [13]:
reg.fit(train[predictors], train["Share"])

Ridge(alpha=0.1)

In [14]:
predictions = reg.predict(test[predictors])

In [15]:
predictions = pd.DataFrame(predictions, columns = ['predictions'], index= test.index)
predictions

Unnamed: 0,predictions
632,0.019474
633,-0.012718
634,0.008972
635,0.046641
636,-0.009253
...,...
10223,-0.015157
10224,-0.002406
10225,-0.004678
10226,-0.027176


In [16]:
combine = pd.concat([test[['Player', 'Share']], predictions], axis=1)
combine

Unnamed: 0,Player,Share,predictions
632,Aaron Gordon,0.0,0.019474
633,Al-Farouq Aminu,0.0,-0.012718
634,Alex Len,0.0,0.008972
635,Andre Drummond,0.0,0.046641
636,Austin Rivers,0.0,-0.009253
...,...,...,...
10223,Rodney McGruder,0.0,-0.015157
10224,Saben Lee,0.0,-0.002406
10225,Saddiq Bey,0.0,-0.004678
10226,Sekou Doumbouya,0.0,-0.027176


In [17]:
combine.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
8174,Nikola Jokić,0.961,0.152242
6500,Joel Embiid,0.58,0.153164
3110,Stephen Curry,0.449,0.151958
7305,Giannis Antetokounmpo,0.345,0.203462
1360,Chris Paul,0.138,0.074704
7902,Luka Dončić,0.042,0.160501
5788,Damian Lillard,0.038,0.126849
3042,Julius Randle,0.02,0.087323
656,Derrick Rose,0.01,0.027348
8150,Rudy Gobert,0.008,0.092071


## Identify an error metric

In [18]:
mean_squared_error(combine["Share"], combine["predictions"])

0.0026558903832069707

In [19]:
combine["Share"].value_counts()

0.000    525
0.001      3
0.042      1
0.580      1
0.038      1
0.449      1
0.008      1
0.961      1
0.345      1
0.010      1
0.003      1
0.020      1
0.138      1
0.005      1
Name: Share, dtype: int64

In [23]:
#Assign a rank colume based on Share 
combine = combine.sort_values("Share", ascending=False)
combine

Unnamed: 0,Player,Share,predictions
8174,Nikola Jokić,0.961,0.152242
6500,Joel Embiid,0.580,0.153164
3110,Stephen Curry,0.449,0.151958
7305,Giannis Antetokounmpo,0.345,0.203462
1360,Chris Paul,0.138,0.074704
...,...,...,...
3101,James Wiseman,0.000,0.005521
3100,Gary Payton II,0.000,-0.004722
3099,Eric Paschall,0.000,0.008649
3098,Draymond Green,0.000,0.032403


In [25]:
combine["Rank"] = list(range(1,combine.shape[0] + 1))

In [26]:
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank
8174,Nikola Jokić,0.961,0.152242,1
6500,Joel Embiid,0.58,0.153164,2
3110,Stephen Curry,0.449,0.151958,3
7305,Giannis Antetokounmpo,0.345,0.203462,4
1360,Chris Paul,0.138,0.074704,5
7902,Luka Dončić,0.042,0.160501,6
5788,Damian Lillard,0.038,0.126849,7
3042,Julius Randle,0.02,0.087323,8
656,Derrick Rose,0.01,0.027348,9
8150,Rudy Gobert,0.008,0.092071,10


In [27]:
combine = combine.sort_values("predictions", ascending = False)
combine["Predicted_Rank"] = list(range(1,combine.shape[0] + 1))

In [29]:
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank
7305,Giannis Antetokounmpo,0.345,0.203462,4,1
7902,Luka Dončić,0.042,0.160501,6,2
669,James Harden,0.001,0.154968,15,3
6500,Joel Embiid,0.58,0.153164,2,4
8174,Nikola Jokić,0.961,0.152242,1,5
3110,Stephen Curry,0.449,0.151958,3,6
3183,LeBron James,0.001,0.14969,13,7
3521,Kevin Durant,0.0,0.144611,439,8
6099,Russell Westbrook,0.005,0.127107,11,9
5788,Damian Lillard,0.038,0.126849,7,10


In [32]:
combine.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank
8174,Nikola Jokić,0.961,0.152242,1,5
6500,Joel Embiid,0.58,0.153164,2,4
3110,Stephen Curry,0.449,0.151958,3,6
7305,Giannis Antetokounmpo,0.345,0.203462,4,1
1360,Chris Paul,0.138,0.074704,5,33
7902,Luka Dončić,0.042,0.160501,6,2
5788,Damian Lillard,0.038,0.126849,7,10
3042,Julius Randle,0.02,0.087323,8,23
656,Derrick Rose,0.01,0.027348,9,82
8150,Rudy Gobert,0.008,0.092071,10,20


In [37]:
def find_ap(combine):
    actual = combine.sort_values("Share", ascending = False).head(5)
    
    predicted = combine.sort_values("predictions", ascending = False)
    ps = []
    found = 0
    seen = 1
    
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1 
            ps.append(found/seen)
            
        seen += 1
    return sum(ps)/len(ps)

In [38]:
find_ap(combine)

0.5836363636363636

In [39]:
years = list(range(2000, 2022))

In [40]:
aps = []
all_predictions = []

for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [41]:
sum(aps)/len(aps)

0.6721247807896416

In [45]:
def add_ranks(combine):
    combine = combine.sort_values("Share", ascending= False)
    combine["Rank"] = list(range(1,combine.shape[0] + 1))
    
    combine = combine.sort_values("predictions", ascending= False)
    combine["Predicted_Rank"] = list(range(1,combine.shape[0] + 1))
    combine["Difference"] = combine["Rank"] - combine["Predicted_Rank"]
    
    return combine

In [51]:
ranking = add_ranks(all_predictions[1])

ranking[ranking["Rank"] < 6].sort_values("Difference", ascending=False)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank,Difference
2709,LeBron James,0.55,0.190683,2,2,0
1790,Dirk Nowitzki,0.435,0.096274,3,16,-13
1036,Kobe Bryant,0.386,0.091477,4,18,-14
3967,Steve Nash,0.739,0.075788,1,24,-23
3116,Chauncey Billups,0.344,0.048343,5,39,-34


In [52]:
def back_test(stats, model, year, predictors):
    aps = []
    all_predictions = []

    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        reg.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
        
        
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [53]:
mean_ap, aps, all_predictions = back_test(stats, reg, years[5:], predictors)

In [54]:
mean_ap

0.6721247807896416

In [55]:
all_predictions[all_predictions["Rank"] <= 5].sort_values("Difference").head(10)

Unnamed: 0,Player,Share,predictions,Rank,Predicted_Rank,Difference
3116,Chauncey Billups,0.344,0.048343,5,39,-34
9231,Joakim Noah,0.258,0.044913,4,37,-33
3952,Steve Nash,0.839,0.051677,1,32,-31
1360,Chris Paul,0.138,0.074704,5,33,-28
3967,Steve Nash,0.739,0.075788,1,24,-23
1036,Kobe Bryant,0.386,0.091477,4,18,-14
6609,Kevin Durant,0.495,0.072627,2,16,-14
1020,Allen Iverson,0.189,0.075844,5,19,-14
1790,Dirk Nowitzki,0.435,0.096274,3,16,-13
5336,Kobe Bryant,0.873,0.08251,1,13,-12


In [57]:
#Diagnose regression model - see which values correlates well with the MVP votes
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis = 1).sort_values(0, ascending=False)

Unnamed: 0,0,1
29,0.112984,W/L%
13,0.048205,eFG%
18,0.039156,DRB
17,0.027842,ORB
15,0.013146,FTA
4,0.012152,FG
22,0.008813,BLK
20,0.008142,AST
25,0.007738,PTS
21,0.007343,STL


In [59]:
stats_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [60]:
stats_ratios 

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,0.627089,0.540640,0.894701,0.467519,0.000000,1.0
1,0.514213,1.459729,0.745584,0.467519,0.512850,1.0
2,0.790132,1.513793,1.491168,0.000000,1.794977,1.0
3,0.401337,0.108128,0.298234,0.233759,0.769276,1.0
4,1.994143,1.189409,0.894701,0.467519,2.820678,1.0
...,...,...,...,...,...,...
10370,0.870914,2.183373,1.768693,0.250592,1.062036,1.0
10371,0.985508,0.831761,1.768693,1.002369,0.743425,1.0
10372,3.036739,2.287343,1.447112,3.007106,3.504718,1.0
10373,0.733401,0.363895,0.321581,0.751776,0.849629,1.0


In [61]:
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stats_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [62]:
predictors += ["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [63]:
mean_ap, aps, all_predictions = back_test(stats, reg, years[5:], predictors)

In [65]:
mean_ap

0.7008001703620635

In [67]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [68]:
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [70]:
stats["Pos"].unique()

array(['PF', 'SG', 'PG', 'SF', 'C', 'PG-SG', 'SG-SF', 'SG-PG', 'PF-SF',
       'SF-SG', 'SF-PF', 'C-PF', 'PF-C', 'SG-PF', 'SF-C', 'PG-SF'],
      dtype=object)

In [79]:
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

In [80]:
mean_ap, aps, all_predictions = back_test(stats, rf, years[18:], predictors)

In [81]:
mean_ap

0.7008001703620635