In [209]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [176]:
stats = pd.read_csv("mvp-stats.csv")

del stats["Unnamed: 0"]

Unnamed: 0,Player,Tm,Pos,G,GS,Cmp,PassAtt,PassYds,PassTD,PassInt,...,L,W-L%,PF,PA,PD,MoV,SoS,SRS,OSRS,DSRS
0,Aeneas Williams,ARI,LCB,16,16,0.0,0.0,0.0,0.0,0.0,...,13,0.188,210,443,-233,-14.6,-0.7,-15.2,-7.2,-8.1
1,Andre Wadsworth,ARI,LDE,9,8,0.0,0.0,0.0,0.0,0.0,...,13,0.188,210,443,-233,-14.6,-0.7,-15.2,-7.2,-8.1
2,Anthony Clement,ARI,RT,16,16,0.0,0.0,0.0,0.0,0.0,...,13,0.188,210,443,-233,-14.6,-0.7,-15.2,-7.2,-8.1
3,Barron Tanner,ARI,DT,4,0,0.0,0.0,0.0,0.0,0.0,...,13,0.188,210,443,-233,-14.6,-0.7,-15.2,-7.2,-8.1
4,Brad Ottis,ARI,LDE,15,11,0.0,0.0,0.0,0.0,0.0,...,13,0.188,210,443,-233,-14.6,-0.7,-15.2,-7.2,-8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33634,Trey Quinn,WAS,WR,12,6,0.0,0.0,0.0,0.0,0.0,...,13,0.188,266,435,-169,-10.6,-0.2,-10.8,-6.3,-4.5
33635,Treyvon Hester,WAS,DT,15,0,0.0,0.0,0.0,0.0,0.0,...,13,0.188,266,435,-169,-10.6,-0.2,-10.8,-6.3,-4.5
33636,Troy Apke,WAS,DB,15,2,0.0,0.0,0.0,0.0,0.0,...,13,0.188,266,435,-169,-10.6,-0.2,-10.8,-6.3,-4.5
33637,Vernon Davis,WAS,TE,4,1,0.0,0.0,0.0,0.0,0.0,...,13,0.188,266,435,-169,-10.6,-0.2,-10.8,-6.3,-4.5


In [139]:
predictors = ['G', 'GS', 'Cmp', 'PassAtt', 'PassYds', 'PassTD',
       'PassInt', 'Year', 'Rec', 'RecYds', 'RecTD', 'RushAtt', 'RushYds',
       'RushTD', 'Solo', 'Sk', 'Int','W', 'L','W-L%', 'PF', 'PA', 'PD', 
        'MoV', 'SoS', 'SRS', 'OSRS', 'DSRS']

In [140]:
train = stats[stats["Year"] < 2023]
test = stats[stats["Year"] == 2023]

In [141]:
reg = Ridge(alpha=.1)

reg.fit(train[predictors], train["Share"])

In [142]:
predictions = reg.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=["prediction"], index=test.index)

In [143]:
predictions

Unnamed: 0,prediction
1020,-0.043078
1021,-0.017719
1022,-0.024420
1023,0.033148
1024,0.020640
...,...
32647,-1.339384
32648,0.105534
32649,0.067414
32650,0.042513


In [144]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
combination

Unnamed: 0,Player,Share,prediction
1020,Andre Chachere,0.0,-0.043078
1021,Antonio Hamilton,0.0,-0.017719
1022,BJ Ojulari,0.0,-0.024420
1023,Ben Stille,0.0,0.033148
1024,Bobby Price,0.0,0.020640
...,...,...,...
32647,Sam Howell,0.0,-1.339384
32648,Sean Chandler,0.0,0.105534
32649,Tariq Castro-Fields,0.0,0.067414
32650,Terrell Burgess,0.0,0.042513


In [145]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,prediction
3099,Lamar Jackson,98.0,6.834603
15772,Josh Allen,2.0,0.090246
4135,Josh Allen,2.0,8.019737
1020,Andre Chachere,0.0,-0.043078
20777,Jeremiah Pharms,0.0,-0.060439
20776,Jalen Reagor,0.0,-0.069637
20775,Jalen Mills,0.0,-0.08264
20774,Jahlani Tavai,0.0,-0.063009
20773,Jabrill Peppers,0.0,-0.051884
20772,Ja'Whaun Bentley,0.0,-0.039268


In [146]:
mean_squared_error(combination["Share"], combination["prediction"])

6.368533656109217

In [147]:
combination["Share"].value_counts()

Share
0.0     1464
2.0        2
98.0       1
Name: count, dtype: int64

In [148]:
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1, combination.shape[0] + 1))

combination.head(10)

Unnamed: 0,Player,Share,prediction,Rk
3099,Lamar Jackson,98.0,6.834603,1
15772,Josh Allen,2.0,0.090246,2
4135,Josh Allen,2.0,8.019737,3
1020,Andre Chachere,0.0,-0.043078,4
20777,Jeremiah Pharms,0.0,-0.060439,5
20776,Jalen Reagor,0.0,-0.069637,6
20775,Jalen Mills,0.0,-0.08264,7
20774,Jahlani Tavai,0.0,-0.063009,8
20773,Jabrill Peppers,0.0,-0.051884,9
20772,Ja'Whaun Bentley,0.0,-0.039268,10


In [149]:
combination = combination.sort_values("prediction", ascending=False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))
combination = combination.sort_values("Share", ascending=False)

combination.head(10)

Unnamed: 0,Player,Share,prediction,Rk,Predicted_Rk
3099,Lamar Jackson,98.0,6.834603,1,6
4135,Josh Allen,2.0,8.019737,3,5
15772,Josh Allen,2.0,0.090246,2,164
9386,Dak Prescott,0.0,12.63473,1345,1
16792,Derrick Nnadi,0.0,-0.039354,1211,977
16994,Marcus Peters,0.0,-0.040516,273,986
21816,Carl Granderson,0.0,-0.040505,160,985
12602,Colby Wooden,0.0,-0.040471,1394,984
18695,Brandon Jones,0.0,-0.040424,299,983
17000,Robert Spillane,0.0,-0.04031,267,982


In [150]:
def find_ap(combination):
    actual = combination.sort_values("Share", ascending=False).head(5)
    predicted = combination.sort_values("prediction", ascending=False)
    ps = []
    found = 0
    seen = 1

    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [151]:
find_ap(combination)

0.25359810353750356

In [152]:
years = list(range(2000, 2024))
aps = []
all_predictions = []

for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])

    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["prediction"], index=test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [153]:
sum(aps) / len(aps)

0.3789656022535348

In [154]:
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1, combination.shape[0] + 1))

    combination = combination.sort_values("prediction", ascending=False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))

    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [155]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending=False)

Unnamed: 0,Player,Share,prediction,Rk,Predicted_Rk,Diff
13890,Peyton Manning,4.0,10.80922,3,1,2
27196,LaDainian Tomlinson,88.0,7.228963,1,2,-1
21056,Drew Brees,8.0,5.475316,2,4,-2
23179,Chad Pennington,0.0,1.117086,4,38,-34
23177,C.J. Mosley,0.0,0.000376,5,591,-586


In [156]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    
    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
    
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["prediction"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
    
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [157]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [158]:
mean_ap

0.3952210729464899

In [159]:
all_predictions[all_predictions["Rk"] < 5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,prediction,Rk,Predicted_Rk,Diff
5559,Adrian Peterson,6.0,-0.250775,4,1308,-1304
15638,Josh Allen,8.0,-0.12256,2,1202,-1200
13164,J.J. Watt,26.0,-0.100913,2,1135,-1133
23227,Dewayne Robertson,0.0,-0.107854,4,1048,-1044
1020,Andre Chachere,0.0,-0.043078,4,1009,-1005
23392,Darrelle Revis,0.0,-0.085827,4,944,-940
782,Antoine Bethea,0.0,-0.064713,3,936,-933
23474,Alex Green,0.0,-0.098967,4,933,-929
15730,Josh Allen,2.0,-0.040011,3,890,-887
647,Alani Fua,0.0,-0.053006,4,848,-844


In [160]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
5,0.913367,PassTD
25,0.2395,SRS
13,0.227302,RushTD
17,0.192012,W
10,0.021015,RecTD
26,0.013354,OSRS
15,0.006032,Sk
21,0.003481,PA
12,0.003252,RushYds
16,0.001673,Int


In [194]:
stat_ratios = stats[["PassYds", "PassTD", "PassInt", "RushYds", "RushTD", "RecYds", "RecTD", "Solo", "Sk", "Int", "Year"]]\
                .groupby("Year").apply(lambda x: x/x.mean())

stat_ratios = stat_ratios.reset_index(drop=True)

stats[["PassYds_R", "PassTD_R", "PassInt_R", "RushYds_R", "RushTD_R", "RecYds_R", "RecTD_R", "Solo_R", "Sk_R", "Int_R"]] = \
    stat_ratios.reset_index()[["PassYds", "PassTD", "PassInt", "RushYds", "RushTD", "RecYds", "RecTD", "Solo", "Sk", "Int"]]

  .groupby("Year").apply(lambda x: x/x.mean())


In [195]:
predictors += ["PassYds_R", "PassTD_R", "PassInt_R", "RushYds_R", "RushTD_R", "RecYds_R", "RecTD_R", "Solo_R", "Sk_R", "Int_R"]

In [196]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [199]:
mean_ap

0.4036947648336097

In [206]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

Unnamed: 0,Player,Tm,Pos,G,GS,Cmp,PassAtt,PassYds,PassTD,PassInt,...,PassInt_R,RushYds_R,RushTD_R,RecYds_R,RecTD_R,Solo_R,Sk_R,Int_R,NPos,NTm
0,Aeneas Williams,ARI,LCB,16,16,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,2.494890,0.000000,11.707547,23,0
1,Andre Wadsworth,ARI,LDE,9,8,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.675699,1.014303,0.000000,28,0
2,Anthony Clement,ARI,RT,16,16,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100,0
3,Barron Tanner,ARI,DT,4,0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.259884,0.000000,0.000000,10,0
4,Brad Ottis,ARI,LDE,15,11,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.715237,1.014303,0.000000,28,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33634,Trey Quinn,WAS,WR,12,6,0.0,0.0,0.0,0.0,0.0,...,71.811189,6.722323,16.900922,0.000000,0.000000,0.000000,0.000000,0.000000,112,34
33635,Treyvon Hester,WAS,DT,15,0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.135991,0.000000,0.000000,10,34
33636,Troy Apke,WAS,DB,15,2,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.203986,0.000000,0.000000,4,34
33637,Vernon Davis,WAS,TE,4,1,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.155921,0.000000,0.000000,109,34


In [216]:
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[20:], predictors)

In [217]:
mean_ap

0.39489193670547784

In [218]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[20:], predictors)

In [219]:
mean_ap

0.4036947648336097