In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [113]:
stats = pd.read_csv("player_mvp_stats.csv")

In [114]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27.0,LAL,82.0,21.0,26.4,3.1,6.6,...,0.0,0.000,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29.0,LAL,82.0,82.0,32.1,6.1,12.8,...,0.0,0.000,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22.0,LAL,52.0,0.0,7.3,1.1,2.4,...,0.0,0.000,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25.0,LAL,26.0,0.0,4.2,0.7,1.9,...,0.0,0.000,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29.0,LAL,78.0,74.0,38.6,9.2,18.7,...,0.0,0.000,Los Angeles Lakers,58.0,24.0,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12508,12508,Vince Carter,,,,,,,,,...,1270.0,0.002,,,,,,,,
12509,12509,Chauncey Billups,,,,,,,,,...,1210.0,0.027,,,,,,,,
12510,12510,Stephen Jackson,,,,,,,,,...,1230.0,0.001,,,,,,,,
12511,12511,Derrick Rose,,,,,,,,,...,1010.0,0.010,,,,,,,,


In [115]:
del stats["Unnamed: 0"]

In [116]:
pd.isnull(stats).sum()

Player        0
Pos           7
Age           7
Tm            7
G             7
GS            7
MP            7
FG            7
FGA           7
FG%          57
3P            7
3PA           7
3P%        1828
2P            7
2PA           7
2P%          90
eFG%         57
FT            7
FTA           7
FT%         439
ORB           7
DRB           7
TRB           7
AST           7
STL           7
BLK           7
TOV           7
PF            7
PTS           7
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          7
W             7
L             7
W/L%          7
GB            7
PS/G          7
PA/G          7
SRS           7
dtype: int64

In [117]:
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
17,Jack Haley,0.0
19,Keith Owens,0.0
32,James Edwards,0.0
...,...,...
12508,Vince Carter,
12509,Chauncey Billups,
12510,Stephen Jackson,
12511,Derrick Rose,


In [118]:
stats = stats.fillna(0)

In [119]:
stats.columns


Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [120]:
stats['Age'] = stats['Age'].astype(int)

In [123]:
# Numeric columns are the ones we want to use as predictors. Don't use pts won/pts max/share - these are very related with what we are trying to predict (overfitting)
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']
scaler = StandardScaler()
stats[predictors] = scaler.fit_transform(stats[predictors])

In [125]:
stats['Year']

0        1991
1        1991
2        1991
3        1991
4        1991
         ... 
12508    2005
12509    2009
12510    2010
12511    2021
12512    2021
Name: Year, Length: 12513, dtype: int64

In [126]:
train = stats[stats["Year"] < 2021]

In [127]:
train

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,0.074016,LAL,1.135924,-0.192221,0.555965,-0.030408,-0.088413,0.382642,...,0.0,0.000,Los Angeles Lakers,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
1,Byron Scott,SG,0.535827,LAL,1.135924,1.867104,1.112382,1.304199,1.224947,0.392767,...,0.0,0.000,Los Angeles Lakers,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
2,Elden Campbell,PF,-1.080511,LAL,-0.047404,-0.901169,-1.308521,-0.920146,-0.978109,0.170035,...,0.0,0.000,Los Angeles Lakers,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
3,Irving Thomas,PF,-0.387795,LAL,-1.072955,-0.901169,-1.611134,-1.098094,-1.084025,-0.994245,...,0.0,0.000,Los Angeles Lakers,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
4,James Worthy,SF,0.535827,LAL,0.978147,1.597029,1.746893,2.683292,2.474757,0.544629,...,0.0,0.000,Los Angeles Lakers,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12506,Dominique Wilkins,0,-6.160430,0,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,...,1010.0,0.001,0,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12507,Clyde Drexler,0,-6.160430,0,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,...,1050.0,0.003,0,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12508,Vince Carter,0,-6.160430,0,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,...,1270.0,0.002,0,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12509,Chauncey Billups,0,-6.160430,0,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,...,1210.0,0.027,0,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164


In [128]:
# Don't test on data that is before data you're training on. That's information the algo wouldn't have had in the real world. This causes overfitting.
test = stats[stats["Year"] >= 2021]

In [129]:
stats['Year'].value_counts()

2018    481
2020    469
2021    463
2019    444
2012    442
2017    433
2007    429
2016    426
2014    417
2013    417
2015    416
2000    411
2002    410
1999    407
2006    407
2005    406
2003    401
2001    394
1998    386
1995    384
2008    382
1997    378
2010    377
2009    377
2004    374
1996    372
2011    369
1994    367
1993    362
1991    359
1992    353
Name: Year, dtype: int64

In [130]:
def find_average_precision(combination):
    # Take the top 5 MVP winners
    actual = combination.sort_values("Share", ascending=False).head(5)
    predicted = combination.sort_values("predictions", ascending = False)
    ps = []
    found = 0
    seen = 1
    # if predicted player is in top 5, we get 100%, but if not, then penalize based on how far off
    # biased towards top of the ranking (rank in top 5 a lot more important than rank outside)
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [133]:
train[predictors]

Unnamed: 0,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,TOV,PF,PTS,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0.074016,1.135924,-0.192221,0.555965,-0.030408,-0.088413,0.382642,-0.646992,-0.474662,-0.192238,...,-0.031299,-0.598424,0.107244,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
1,0.535827,1.135924,1.867104,1.112382,1.304199,1.224947,0.392767,0.520456,0.636377,0.496874,...,-0.275926,-0.120143,0.990964,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
2,-1.080511,-0.047404,-0.901169,-1.308521,-0.920146,-0.978109,0.170035,-0.792923,-0.863526,-1.303708,...,-1.132121,-0.598424,-0.923762,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
3,-0.387795,-1.072955,-0.901169,-1.611134,-1.098094,-1.084025,-0.994245,-0.792923,-0.863526,-1.303708,...,-0.887494,-1.196275,-1.087414,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
4,0.535827,0.978147,1.597029,1.746893,2.683292,2.474757,0.544629,-0.355130,-0.196902,0.302366,...,0.457955,-0.478854,2.120162,1.403295,-1.220766,1.325371,-0.791765,0.790780,-0.140668,1.471122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12506,-6.160430,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,-0.792923,-0.863526,-1.303708,...,-1.499062,-2.272406,-1.381988,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12507,-6.160430,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,-0.792923,-0.863526,-1.303708,...,-1.499062,-2.272406,-1.381988,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12508,-6.160430,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,-0.792923,-0.863526,-1.303708,...,-1.499062,-2.272406,-1.381988,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164
12509,-6.160430,-2.098505,-0.901169,-2.021126,-1.409502,-1.486506,-4.436464,-0.792923,-0.863526,-1.303708,...,-1.499062,-2.272406,-1.381988,-3.082248,-3.086574,-3.212359,-1.174644,-14.071773,-13.879068,-0.005164


In [140]:
from sklearn.linear_model import Ridge
reg = Ridge(alpha = 0.1)
reg.fit(train[predictors], train["Share"])
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)
combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
combination = combination.sort_values("Share", ascending=False)
# adding a rank for each entry in the dataframe
combination["Rk"] = list(range(1, combination.shape[0]+1))
combination = combination.sort_values("predictions", ascending = False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))
combination.sort_values('Predicted_Rk', ascending = True).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
8130,Giannis Antetokounmpo,0.345,0.213927,4,1
7065,Joel Embiid,0.58,0.170881,2,2
9644,Nikola Jokić,0.961,0.161073,1,3
9268,Luka Dončić,0.042,0.156706,6,4
2956,LeBron James,0.001,0.153422,15,5
2883,Stephen Curry,0.449,0.148897,3,6
3272,Kevin Durant,0.0,0.147357,425,7
10054,Zion Williamson,0.0,0.131408,211,8
6573,Russell Westbrook,0.005,0.126112,11,9
4935,Jimmy Butler,0.0,0.124175,352,10


In [135]:
find_average_precision(combination)

0.7655913978494623

In [45]:
# Use these predictors to try to predict share
# SHOULD SCALE FEATURES BEFORE RUNNING RIDGE REGRESSION
reg.fit(train[predictors], train["Share"])

NameError: name 'reg' is not defined

In [16]:
reg.fit(train[predictors], train["Share"])
predictions = reg.predict(test[predictors])

In [17]:
predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)

In [18]:
predictions

Unnamed: 0,predictions
651,-0.016694
652,-0.014088
653,-0.005959
654,-0.011945
655,0.091248
...,...
12231,-0.005740
12232,-0.028248
12233,-0.009648
12511,0.057212


In [19]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)

In [20]:
combination

Unnamed: 0,Player,Share,predictions
651,Aaron Holiday,0.000,-0.016694
652,Amida Brimah,0.000,-0.014088
653,Brian Bowen,0.000,-0.005959
654,Cassius Stanley,0.000,-0.011945
655,Domantas Sabonis,0.000,0.091248
...,...,...,...
12231,Saddiq Bey,0.000,-0.005740
12232,Sekou Doumbouya,0.000,-0.028248
12233,Wayne Ellington,0.000,-0.009648
12511,Derrick Rose,0.010,0.057212


In [21]:
# Jokic won MVP but did not have the highest predicted amount - 2021 is the year before this year (last year we had full data for)
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
9644,Nikola Jokić,0.961,0.160097
7065,Joel Embiid,0.58,0.169816
2883,Stephen Curry,0.449,0.149148
8130,Giannis Antetokounmpo,0.345,0.215053
1080,Chris Paul,0.138,0.076427
9268,Luka Dončić,0.042,0.158128
6100,Damian Lillard,0.038,0.122486
2777,Julius Randle,0.02,0.092736
12511,Derrick Rose,0.01,0.057212
9620,Rudy Gobert,0.008,0.098534


In [22]:
from sklearn.metrics import mean_squared_error

# not meaningful since many of these values are 0
mean_squared_error(combination["Share"], combination["predictions"])

0.003032037638974017

In [23]:
combination = combination.sort_values("Share", ascending=False)
# adding a rank for each entry in the dataframe
combination["Rk"] = list(range(1, combination.shape[0]+1))

In [24]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
9644,Nikola Jokić,0.961,0.160097,1
7065,Joel Embiid,0.58,0.169816,2
2883,Stephen Curry,0.449,0.149148,3
8130,Giannis Antetokounmpo,0.345,0.215053,4
1080,Chris Paul,0.138,0.076427,5
9268,Luka Dončić,0.042,0.158128,6
6100,Damian Lillard,0.038,0.122486,7
2777,Julius Randle,0.02,0.092736,8
12511,Derrick Rose,0.01,0.057212,9
9620,Rudy Gobert,0.008,0.098534,10


In [25]:
combination = combination.sort_values("predictions", ascending = False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))

In [26]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
8130,Giannis Antetokounmpo,0.345,0.215053,4,1
7065,Joel Embiid,0.58,0.169816,2,2
9644,Nikola Jokić,0.961,0.160097,1,3
9268,Luka Dončić,0.042,0.158128,6,4
2956,LeBron James,0.001,0.153478,15,5
2883,Stephen Curry,0.449,0.149148,3,6
3272,Kevin Durant,0.0,0.147318,425,7
10054,Zion Williamson,0.0,0.129441,211,8
6573,Russell Westbrook,0.005,0.126329,11,9
6100,Damian Lillard,0.038,0.122486,7,10


In [27]:
# Error metric - we really care about who is in the top 5; of all the final vote getters in the top 5, how many did we actually place in the top 5?
# We will use average precision - not commonly used for regression, because it deals w/ ranking, but this problem requires ranking

combination.sort_values("Share", ascending=False).head(10)


Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
9644,Nikola Jokić,0.961,0.160097,1,3
7065,Joel Embiid,0.58,0.169816,2,2
2883,Stephen Curry,0.449,0.149148,3,6
8130,Giannis Antetokounmpo,0.345,0.215053,4,1
1080,Chris Paul,0.138,0.076427,5,31
9268,Luka Dončić,0.042,0.158128,6,4
6100,Damian Lillard,0.038,0.122486,7,10
2777,Julius Randle,0.02,0.092736,8,22
12511,Derrick Rose,0.01,0.057212,9,40
9620,Rudy Gobert,0.008,0.098534,10,18


# Error Metric

In [28]:
def find_average_precision(combination):
    # Take the top 5 MVP winners
    actual = combination.sort_values("Share", ascending=False).head(5)
    predicted = combination.sort_values("predictions", ascending = False)
    ps = []
    found = 0
    seen = 1
    # if predicted player is in top 5, we get 100%, but if not, then penalize based on how far off
    # biased towards top of the ranking (rank in top 5 a lot more important than rank outside)
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [29]:
# compares my top 5 to the actual top 5 i.e., chris paul is ranked 5 but we picked him at 33, so we get 0.15151515 for that
find_average_precision(combination)

0.7655913978494623

In [30]:
years = list(range(1991, 2022))

# Backtesting

In [31]:
# Backtesting is a way of testing if a model's predictions are in line with realised data - if historical is very different from predicted, model is not good
aps = []
all_predictions = []

# start on 5th year since we need some data to make predictions with
# train on all years immediately before current year, test on current year for each iteration on for loop
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    # make predictions into a dataframe instead of just a bunch of numbers
    predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)
    # joins tables (columns) length wise, i.e., || -> |||||
    combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
    all_predictions.append(combination)
    aps.append(find_average_precision(combination))

In [32]:
# mean average precision
sum(aps) / len(aps)

0.7174660306488415

In [33]:
# helpful for diagnosing if there is a common issue with our model - with what features it is thrown off the most by (ex. position)
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending = False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [34]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] <= 5].sort_values("Diff", ascending = False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1218,Karl Malone,0.857,0.197241,1,2,-1
8775,Michael Jordan,0.832,0.174466,2,3,-1
12352,Grant Hill,0.327,0.130701,3,7,-4
3683,Tim Hardaway,0.207,0.064258,4,19,-15
6750,Glen Rice,0.117,0.035158,5,48,-43


In [35]:
def backtest(stats, model, year, predictors):
    # Backtesting is a way of testing if a model's predictions are in line with realised data - if historical is very different from predicted, model is not good
    aps = []
    all_predictions = []

    # start on 5th year since we need some data to make predictions with
    # train on all years immediately before current year, test on current year for each iteration on for loop
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        # make predictions into a dataframe instead of just a bunch of numbers
        predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)
        # joins tables (columns) length wise, i.e., || -> |||||
        combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_average_precision(combination))
        
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [36]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [37]:
mean_ap

0.7174660306488415

In [38]:
# If we have time - where does Jason Kidd differ from other candidates in terms of stats, why would he be placed like this?
all_predictions[all_predictions["Rk"] <= 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
956,Jason Kidd,0.712,0.028791,2,50,-48
6750,Glen Rice,0.117,0.035158,5,48,-43
4086,Steve Nash,0.839,0.036037,1,40,-39
10927,Joakim Noah,0.258,0.048103,4,39,-35
4101,Steve Nash,0.739,0.056404,1,34,-33
6971,Peja Stojaković,0.228,0.0374,4,37,-33
2889,Chauncey Billups,0.344,0.05511,5,35,-30
1080,Chris Paul,0.138,0.076427,5,31,-26
4115,Steve Nash,0.785,0.077297,2,22,-20
3951,Jason Kidd,0.135,0.001625,5,23,-18


In [39]:
# highest weighted features in the regression
reg.coef_

array([ 3.76124530e-04,  1.00860509e-04, -1.04796398e-05, -4.37761699e-03,
        5.15145437e-03,  6.52736169e-03, -1.33938098e-01,  5.17526818e-03,
       -1.08014064e-02, -1.06557650e-02,  1.81516419e-02, -1.83174266e-02,
        2.40326009e-03,  7.17700877e-02, -6.50049960e-03,  1.18264083e-02,
       -4.92078629e-03,  2.42032672e-02,  3.80848335e-02, -3.02309185e-02,
        7.75790673e-03,  1.23949778e-02,  1.16725638e-02, -1.00220753e-02,
       -2.58255148e-03,  5.77281288e-03, -1.97664721e-04, -2.26541990e-04,
        4.78944023e-05,  9.56249133e-02,  3.02684663e-04, -6.74533489e-04,
       -1.03452919e-04, -7.71949888e-04])

In [40]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis = 1).sort_values(0, ascending = False)

Unnamed: 0,0,1
29,0.095625,W/L%
13,0.07177,eFG%
18,0.038085,DRB
17,0.024203,ORB
10,0.018152,2P
21,0.012395,STL
15,0.011826,FTA
22,0.011673,BLK
20,0.007758,AST
5,0.006527,FGA


In [41]:
# how much above the mean are we?
stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [42]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.010830,0.408729,0.951534,0.661548,0.502098,1.0
1,1.610662,0.999114,1.631200,0.661548,4.518881,1.0
2,0.311024,0.090829,0.271867,1.543612,0.000000,1.0
3,0.199944,0.181657,0.271867,0.000000,0.000000,1.0
4,2.377116,1.589500,1.767134,0.882064,1.506294,1.0
...,...,...,...,...,...,...
12508,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
12509,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
12510,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
12511,0.000000,0.000000,0.000000,0.000000,0.000000,1.0


In [43]:
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[["PTS", "AST", "STL", "BLK", "3P"]]

In [44]:
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27.0,LAL,82.0,21.0,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.01083,0.408729,0.951534,0.661548,0.502098
1,Byron Scott,SG,29.0,LAL,82.0,82.0,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.610662,0.999114,1.6312,0.661548,4.518881
2,Elden Campbell,PF,22.0,LAL,52.0,0.0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311024,0.090829,0.271867,1.543612,0.0
3,Irving Thomas,PF,25.0,LAL,26.0,0.0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.199944,0.181657,0.271867,0.0,0.0
4,James Worthy,SF,29.0,LAL,78.0,74.0,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.377116,1.5895,1.767134,0.882064,1.506294


In [45]:
predictors += ["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [46]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [47]:
mean_ap

0.7269513700649745

In [48]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [49]:
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [50]:
stats["Pos"].unique()

array(['PF', 'SG', 'SF', 'PG', 'C', 0], dtype=object)

In [51]:
stats["NTm"]

0        16
1        16
2        16
3        16
4        16
         ..
12508     0
12509     0
12510     0
12511     0
12512     0
Name: NTm, Length: 12513, dtype: int8

In [52]:
# Linear regression can't find relationship between categorical numbers, even when they are made to be 1, 2,3, etc.
# Need to use something else - in this case, we'll use random forest

In [53]:
# Random forest makes a series of decision trees and averages predictions from those trees

In [54]:
from sklearn.ensemble import RandomForestRegressor

#n_estimators is number of trees, random state = 1 means repeat the same outcome twice, min_samples_split means min number of values at a node to be split
rf = RandomForestRegressor(n_estimators = 100, random_state = 1, min_samples_split = 5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)

KeyboardInterrupt: 

In [None]:
mean_ap

In [None]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [None]:
mean_ap

In [55]:
train = stats[stats["Year"] < 2021]
test = stats[stats["Year"] == 2021]

In [56]:
rf.fit(train[predictors], train["Share"])
predictions = rf.predict(test[predictors])

In [57]:
predictions = pd.DataFrame(predictions, columns = ['predictions'], index = test.index)

In [58]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)

In [59]:
combination.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
9644,Nikola Jokić,0.961,0.179951
7065,Joel Embiid,0.58,0.223306
2883,Stephen Curry,0.449,0.237937
8130,Giannis Antetokounmpo,0.345,0.327272
1080,Chris Paul,0.138,0.004116
9268,Luka Dončić,0.042,0.155833
6100,Damian Lillard,0.038,0.150906
2777,Julius Randle,0.02,0.028304
12511,Derrick Rose,0.01,0.007809
9620,Rudy Gobert,0.008,0.014236


In [73]:
from sklearn.linear_model import Lasso

las = Lasso(alpha = 0.01)
mean_ap, aps, all_predictions = backtest(stats, las, years[:], predictors)

In [84]:
all_predictions.sort_values('Predicted_Rk', ascending = True).head(30)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
8761,Michael Jordan,0.986,0.126125,1,1,0
823,Kobe Bryant,0.404,0.09784,3,1,2
4427,LeBron James,0.98,0.10158,1,1,0
8775,Michael Jordan,0.832,0.11911,2,1,1
2883,Stephen Curry,0.449,0.110048,3,1,2
809,Kobe Bryant,0.386,0.121234,4,1,3
10753,Kevin Durant,0.157,0.088017,5,1,4
11651,Kevin Durant,0.735,0.092941,2,1,1
7956,James Harden,0.363,0.123934,3,1,2
11687,Shaquille O'Neal,0.268,0.122886,4,1,3


In [88]:
test

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,PS/G,PA/G,SRS,PTS_T,AST_R,STL_R,BLK_R,3P_R,NPos,NTm
651,Aaron Holiday,PG,24.0,IND,66.0,8.0,17.8,2.6,6.6,0.390,...,115.3,115.3,-0.13,0.791754,0.945507,1.135599,0.484563,0.975147,3,14
652,Amida Brimah,C,26.0,IND,5.0,0.0,5.8,1.0,1.6,0.625,...,115.3,115.3,-0.13,0.285911,0.099527,0.000000,2.422815,0.000000,1,14
653,Brian Bowen,SF,22.0,IND,6.0,0.0,2.5,0.2,0.7,0.250,...,115.3,115.3,-0.13,0.054983,0.000000,0.000000,0.000000,0.000000,4,14
654,Cassius Stanley,SG,21.0,IND,24.0,0.0,3.9,0.5,1.8,0.302,...,115.3,115.3,-0.13,0.164949,0.000000,0.000000,0.242282,0.097515,5,14
655,Domantas Sabonis,PF,24.0,IND,62.0,62.0,36.0,7.8,14.6,0.535,...,115.3,115.3,-0.13,2.232306,3.334157,1.946741,1.211408,0.780118,2,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12231,Saddiq Bey,SF,21.0,DET,70.0,53.0,27.3,4.0,9.9,0.404,...,106.6,111.1,-4.38,1.341583,0.696690,1.135599,0.484563,2.437869,4,11
12232,Sekou Doumbouya,PF,20.0,DET,56.0,11.0,15.5,1.9,5.0,0.379,...,106.6,111.1,-4.38,0.560826,0.398108,0.648914,0.484563,0.487574,2,11
12233,Wayne Ellington,SG,33.0,DET,46.0,31.0,22.0,3.2,7.3,0.441,...,106.6,111.1,-4.38,1.055672,0.746453,0.648914,0.484563,2.437869,5,11
12511,Derrick Rose,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.000,...,0.0,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
