# National League

## Reading the Data Into Pandas and Cleaning It

In [1]:
import pandas as pd

In [2]:
#Opening up the player stat database with stats for 1990-2021
nl_stats = pd.read_csv("nl_stats.csv")

In [3]:
nl_stats

Unnamed: 0.1,Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,0,A.J. Burnett,22,1999,Florida Marlins,NL,4,2,3.48,7,...,7.2,1.32,0.0,0.0,0.0,64,98,0.395,39.0,NL East
1,59,A.J. Burnett,25,2002,Florida Marlins,NL,12,9,3.30,31,...,8.9,2.26,0.0,0.0,0.0,79,83,0.488,23.0,NL East
2,79,A.J. Burnett,26,2003,Florida Marlins,NL,0,2,4.70,4,...,8.2,1.17,0.0,0.0,0.0,91,71,0.562,10.0,NL East
3,101,A.J. Burnett,27,2004,Florida Marlins,NL,7,6,3.68,20,...,8.5,2.97,0.0,0.0,0.0,83,79,0.512,13.0,NL East
4,123,A.J. Burnett,28,2005,Florida Marlins,NL,12,12,3.44,32,...,8.5,2.51,0.0,0.0,0.0,83,79,0.512,7.0,NL East
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819,8145,Zane Smith,30,1991,Pittsburgh Pirates,NL,16,10,3.20,35,...,4.7,4.14,0.0,0.0,0.0,98,64,0.605,0.0,NL East
10820,8129,Zane Smith,29,1990,Pittsburgh Pirates,NL,12,9,2.55,33,...,5.4,2.60,0.0,0.0,0.0,95,67,0.586,0.0,NL East
10821,8730,Zane Smith,32,1993,Pittsburgh Pirates,NL,3,7,4.55,14,...,3.5,1.45,0.0,0.0,0.0,75,87,0.463,22.0,NL East
10822,3725,Zeke Spruill,24,2014,Arizona Diamondbacks,NL,1,1,3.57,6,...,5.6,3.50,0.0,0.0,0.0,64,98,0.395,30.0,NL West


In [4]:
del nl_stats['Unnamed: 0']

In [5]:
nl_stats

Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,GS,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,A.J. Burnett,22,1999,Florida Marlins,NL,4,2,3.48,7,7,...,7.2,1.32,0.0,0.0,0.0,64,98,0.395,39.0,NL East
1,A.J. Burnett,25,2002,Florida Marlins,NL,12,9,3.30,31,29,...,8.9,2.26,0.0,0.0,0.0,79,83,0.488,23.0,NL East
2,A.J. Burnett,26,2003,Florida Marlins,NL,0,2,4.70,4,4,...,8.2,1.17,0.0,0.0,0.0,91,71,0.562,10.0,NL East
3,A.J. Burnett,27,2004,Florida Marlins,NL,7,6,3.68,20,19,...,8.5,2.97,0.0,0.0,0.0,83,79,0.512,13.0,NL East
4,A.J. Burnett,28,2005,Florida Marlins,NL,12,12,3.44,32,32,...,8.5,2.51,0.0,0.0,0.0,83,79,0.512,7.0,NL East
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819,Zane Smith,30,1991,Pittsburgh Pirates,NL,16,10,3.20,35,35,...,4.7,4.14,0.0,0.0,0.0,98,64,0.605,0.0,NL East
10820,Zane Smith,29,1990,Pittsburgh Pirates,NL,12,9,2.55,33,31,...,5.4,2.60,0.0,0.0,0.0,95,67,0.586,0.0,NL East
10821,Zane Smith,32,1993,Pittsburgh Pirates,NL,3,7,4.55,14,14,...,3.5,1.45,0.0,0.0,0.0,75,87,0.463,22.0,NL East
10822,Zeke Spruill,24,2014,Arizona Diamondbacks,NL,1,1,3.57,6,1,...,5.6,3.50,0.0,0.0,0.0,64,98,0.395,30.0,NL West


In [6]:
#See if we have any missing data
pd.isnull(nl_stats).sum()

Player         0
Age            0
Year           0
Tm             0
Lg             0
W_p            0
L_p            0
ERA            4
G              0
GS             0
GF             0
CG             0
SHO            0
SV             0
IP             0
H              0
R              0
ER             0
HR             0
BB             0
IBB            0
SO             0
HBP            0
BK             0
WP             0
BF             0
FIP           14
WHIP          14
H9            14
HR9           14
BB9           14
SO9           14
SO/W         340
Vote Pts       0
1st Place      0
Share          0
W              0
L              0
W-L%           0
GB             0
Division       0
dtype: int64

In [7]:
# 'ERA' column has 5 NaN values. This is possible if a pitcher enters a game, doesn't record an out or ER
nl_stats[nl_stats['ERA'].isnull()][['Player', 'ERA', 'IP', 'BF', 'ER']]

Unnamed: 0,Player,ERA,IP,BF,ER
4140,Jake Esch,,0.0,2,0
8203,Randy Choate,,0.0,3,0
9112,Scott Radinsky,,0.0,1,0
9368,Shawn Tolleson,,0.0,2,0


In [8]:
# 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9' all have 14 Nan values. Some of these players are causing NaN values in ERA
# The common denominator seems to be players with 0 IP. Since there are only 14 players with 0 IP, we can feel comfortable removing them from the dataframe
nl_stats[nl_stats['FIP'].isnull()][['Player', 'IP', 'ER', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9']]

Unnamed: 0,Player,IP,ER,BF,FIP,WHIP,H9,HR9,BB9,SO9
722,Bernardo Flores Jr.,0.0,1,3,,,,,,
1402,Bryan Harvey,0.0,3,3,,,,,,
2438,Daniel McCutchen,0.0,2,2,,,,,,
2782,Dennis Tankersley,0.0,7,7,,,,,,
3605,Gerardo Parra,0.0,5,5,,,,,,
4140,Jake Esch,0.0,0,2,,,,,,
6991,Matt Koch,0.0,3,3,,,,,,
7672,Nick Greenwood,0.0,2,2,,,,,,
8203,Randy Choate,0.0,0,3,,,,,,
8329,Rex Brothers,0.0,1,2,,,,,,


In [9]:
remove = nl_stats[nl_stats['IP'] == 0].index

In [10]:
remove

Int64Index([  722,  1402,  2438,  2782,  3605,  4140,  6991,  7672,  8203,
             8329,  9112,  9368,  9443, 10807],
           dtype='int64')

In [11]:
nl_stats = nl_stats.drop(index=(remove))

In [12]:
nl_stats

Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,GS,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,A.J. Burnett,22,1999,Florida Marlins,NL,4,2,3.48,7,7,...,7.2,1.32,0.0,0.0,0.0,64,98,0.395,39.0,NL East
1,A.J. Burnett,25,2002,Florida Marlins,NL,12,9,3.30,31,29,...,8.9,2.26,0.0,0.0,0.0,79,83,0.488,23.0,NL East
2,A.J. Burnett,26,2003,Florida Marlins,NL,0,2,4.70,4,4,...,8.2,1.17,0.0,0.0,0.0,91,71,0.562,10.0,NL East
3,A.J. Burnett,27,2004,Florida Marlins,NL,7,6,3.68,20,19,...,8.5,2.97,0.0,0.0,0.0,83,79,0.512,13.0,NL East
4,A.J. Burnett,28,2005,Florida Marlins,NL,12,12,3.44,32,32,...,8.5,2.51,0.0,0.0,0.0,83,79,0.512,7.0,NL East
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819,Zane Smith,30,1991,Pittsburgh Pirates,NL,16,10,3.20,35,35,...,4.7,4.14,0.0,0.0,0.0,98,64,0.605,0.0,NL East
10820,Zane Smith,29,1990,Pittsburgh Pirates,NL,12,9,2.55,33,31,...,5.4,2.60,0.0,0.0,0.0,95,67,0.586,0.0,NL East
10821,Zane Smith,32,1993,Pittsburgh Pirates,NL,3,7,4.55,14,14,...,3.5,1.45,0.0,0.0,0.0,75,87,0.463,22.0,NL East
10822,Zeke Spruill,24,2014,Arizona Diamondbacks,NL,1,1,3.57,6,1,...,5.6,3.50,0.0,0.0,0.0,64,98,0.395,30.0,NL West


In [13]:
#See if we have any missing data
pd.isnull(nl_stats).sum()

Player         0
Age            0
Year           0
Tm             0
Lg             0
W_p            0
L_p            0
ERA            0
G              0
GS             0
GF             0
CG             0
SHO            0
SV             0
IP             0
H              0
R              0
ER             0
HR             0
BB             0
IBB            0
SO             0
HBP            0
BK             0
WP             0
BF             0
FIP            0
WHIP           0
H9             0
HR9            0
BB9            0
SO9            0
SO/W         338
Vote Pts       0
1st Place      0
Share          0
W              0
L              0
W-L%           0
GB             0
Division       0
dtype: int64

In [14]:
# We can remove the 'SO/W' column because we already have the 'SO' and 'BB' columns
del nl_stats['SO/W']

In [15]:
#See if we have any missing data
pd.isnull(nl_stats).sum()

Player       0
Age          0
Year         0
Tm           0
Lg           0
W_p          0
L_p          0
ERA          0
G            0
GS           0
GF           0
CG           0
SHO          0
SV           0
IP           0
H            0
R            0
ER           0
HR           0
BB           0
IBB          0
SO           0
HBP          0
BK           0
WP           0
BF           0
FIP          0
WHIP         0
H9           0
HR9          0
BB9          0
SO9          0
Vote Pts     0
1st Place    0
Share        0
W            0
L            0
W-L%         0
GB           0
Division     0
dtype: int64

## Training a Machine Learning Model

In [16]:
#Seeing what columns we would like to use for our predictions
nl_stats.columns
#In this case we will use all of the numeric values to make predictions

Index(['Player', 'Age', 'Year', 'Tm', 'Lg', 'W_p', 'L_p', 'ERA', 'G', 'GS',
       'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO',
       'HBP', 'BK', 'WP', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9',
       'Vote Pts', '1st Place', 'Share', 'W', 'L', 'W-L%', 'GB', 'Division'],
      dtype='object')

In [17]:
predictors = ['Age', 'Year', 'W_p', 'L_p', 'ERA', 'G', 'GS',
       'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO',
       'HBP', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'W-L%', 'GB']

In [18]:
train = nl_stats[nl_stats["Year"] < 2021]

In [19]:
test = nl_stats[nl_stats["Year"] == 2021]

In [20]:
#Ridge is a from of linear regression that is designed to prevent overfitting. Shrinks the linear regression coeffecients to prevent overfitting
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)   #Alpha controllers

In [21]:
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [22]:
predictions = reg.predict(test[predictors])

In [23]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [24]:
combination = pd.concat([test[["Player", "Age", "Year", "Share"]], predictions], axis=1)

In [25]:
combination

Unnamed: 0,Player,Age,Year,Share,predictions
16,A.J. Minter,27,2021,0.00,-0.002730
36,Aaron Ashby,23,2021,0.00,0.007926
86,Aaron Loup,33,2021,0.00,0.036835
94,Aaron Nola,28,2021,0.00,0.093284
99,Aaron Northcraft,31,2021,0.00,0.009693
...,...,...,...,...,...
10785,Zach Pop,24,2021,0.00,0.004035
10788,Zach Thompson,27,2021,0.00,-0.000553
10792,Zack Godley,31,2021,0.00,-0.002647
10802,Zack Littell,25,2021,0.00,0.009134


In [26]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Age,Year,Share,predictions
2174,Corbin Burnes,26,2021,0.72,0.133042
10810,Zack Wheeler,31,2021,0.67,0.180838
7105,Max Scherzer,36,2021,0.54,0.158124
10473,Walker Buehler,26,2021,0.33,0.108012
1128,Brandon Woodruff,28,2021,0.1,0.080206
6107,Kevin Gausman,30,2021,0.03,0.100417
166,Adam Wainwright,39,2021,0.01,0.129662
5839,Julio Urias,24,2021,0.01,0.113414
8184,Ramon Rosso,25,2021,0.0,6e-05
7159,Michael Lorenzen,29,2021,0.0,-0.004088


## Identifying an Error Metric

In [27]:
#Mean Squared Error
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"], combination["predictions"])

0.0021755411112128428

In [28]:
combination["Share"].value_counts()

0.00    466
0.01      2
0.10      1
0.72      1
0.03      1
0.54      1
0.33      1
0.67      1
Name: Share, dtype: int64

In [29]:
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))

In [30]:
combination.head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk
2174,Corbin Burnes,26,2021,0.72,0.133042,1
10810,Zack Wheeler,31,2021,0.67,0.180838,2
7105,Max Scherzer,36,2021,0.54,0.158124,3
10473,Walker Buehler,26,2021,0.33,0.108012,4
1128,Brandon Woodruff,28,2021,0.1,0.080206,5
6107,Kevin Gausman,30,2021,0.03,0.100417,6
166,Adam Wainwright,39,2021,0.01,0.129662,7
5839,Julio Urias,24,2021,0.01,0.113414,8
8184,Ramon Rosso,25,2021,0.0,6e-05,9
7159,Michael Lorenzen,29,2021,0.0,-0.004088,10


In [31]:
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [32]:
#Compare our predictions to our actual values
combination.head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
10810,Zack Wheeler,31,2021,0.67,0.180838,2,1
7105,Max Scherzer,36,2021,0.54,0.158124,3,2
2174,Corbin Burnes,26,2021,0.72,0.133042,1,3
4077,Jacob deGrom,33,2021,0.0,0.13119,383,4
166,Adam Wainwright,39,2021,0.01,0.129662,7,5
4971,Joe Musgrove,28,2021,0.0,0.117607,408,6
7087,Max Fried,27,2021,0.0,0.114974,27,7
5839,Julio Urias,24,2021,0.01,0.113414,8,8
1787,Charlie Morton,37,2021,0.0,0.113261,246,9
10473,Walker Buehler,26,2021,0.33,0.108012,4,10


In [33]:
#Average Precision: Error metric to measure our rank accuracy
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
2174,Corbin Burnes,26,2021,0.72,0.133042,1,3
10810,Zack Wheeler,31,2021,0.67,0.180838,2,1
7105,Max Scherzer,36,2021,0.54,0.158124,3,2
10473,Walker Buehler,26,2021,0.33,0.108012,4,10
1128,Brandon Woodruff,28,2021,0.1,0.080206,5,17
6107,Kevin Gausman,30,2021,0.03,0.100417,6,11
5839,Julio Urias,24,2021,0.01,0.113414,8,8
166,Adam Wainwright,39,2021,0.01,0.129662,7,5
8872,Ryan Meisinger,27,2021,0.0,-0.001911,192,326
5418,Jordan Holloway,25,2021,0.0,-0.001902,395,325


In [34]:
#Will determine accuracy based on if we predicted correctly in the top 3, top 20, etc
def find_ap(combination):
    actual = combination.sort_values("Share", ascending=False).head(3)
    predicted = combination.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual ["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [35]:
#The lower the result, the longer it took our predictions to find the correct MVPs
find_ap(combination)

1.0

# Implementing Backtesting to Predict Each Year

In [36]:
years = list(range(1990,2022))

In [37]:
aps = []
all_predictions = []
for year in years[5:]:      #Starting with our 5th year since we need some data to work with
    train = nl_stats[nl_stats["Year"] < year]
    test = nl_stats[nl_stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Age", "Year", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [38]:
sum(aps) / len(aps)

0.7413479345699743

In [39]:
# Use the Ranks as diagnostics to improve our algorithm
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1,combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending=False)
    combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))
    combination['Diff'] = combination['Rk'] - combination['Predicted_Rk']
    return combination

In [40]:
ranking = add_ranks(all_predictions[1])
ranking[ranking['Rk'] < 4].sort_values('Diff', ascending=False)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk,Diff
5256,John Smoltz,29,1996,0.97,0.222828,1,1,0
6083,Kevin Brown,31,1996,0.63,0.181483,2,2,0
445,Andy Benes,28,1996,0.06,0.072237,3,10,-7


In [41]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    for year in years[5:]:      #Starting with our 5th year since we need some data to work with
        train = nl_stats[nl_stats["Year"] < year]
        test = nl_stats[nl_stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Year", "Player", "Age", "Share", "IP"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [42]:
mean_ap, aps, all_predictions = backtest(nl_stats, reg, years[5:], predictors)

In [43]:
mean_ap

0.7441537707091439

In [44]:
#Looking at the largest difference between the actual rankings and our predictions
all_predictions[all_predictions['Rk'] <= 3].sort_values('Diff').head(10)

Unnamed: 0,Year,Player,Age,Share,IP,predictions,Rk,Predicted_Rk,Diff
10148,2006,Trevor Hoffman,38,0.48,63.0,0.018791,2,49,-47
5263,2002,John Smoltz,35,0.13,80.1,0.043259,3,29,-26
10146,1998,Trevor Hoffman,30,0.55,73.0,0.051464,2,24,-22
3232,2003,Eric Gagne,27,0.91,82.1,0.068135,1,13,-12
5531,2013,Jose Fernandez,20,0.3,172.2,0.073594,3,13,-10
985,2007,Brad Penny,29,0.09,208.0,0.04523,3,12,-9
3859,1995,Hideo Nomo,26,0.21,191.1,0.073243,3,11,-8
9947,1998,Tom Glavine,32,0.62,229.1,0.111078,1,9,-8
8657,2005,Roger Clemens,42,0.25,211.1,0.079874,3,10,-7
445,1996,Andy Benes,28,0.06,230.1,0.072237,3,10,-7


In [45]:
#Once again looking at our top 3 predictions compared to actual rankings
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [46]:
combination.sort_values("Share", ascending=False).head(3)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
2174,Corbin Burnes,26,2021,0.72,0.133042,1,3
10810,Zack Wheeler,31,2021,0.67,0.180838,2,1
7105,Max Scherzer,36,2021,0.54,0.158124,3,2


In [47]:
#Tells you which variables are the most important to the algorithm
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
8,0.013139,CG
9,0.008628,SHO
21,0.005088,FIP
2,0.003698,W_p
11,0.002509,IP
23,0.001749,H9
19,0.001122,HBP
18,0.000832,SO
0,0.000605,Age
13,0.000517,R


# Adding More Predictors

In [48]:
nl_stats['BB%'] = nl_stats['BB'] / nl_stats['BF']
nl_stats['SO%'] = nl_stats['SO'] / nl_stats['BF']
predictors += ['BB%', 'SO%']

In [49]:
mean_ap, aps, all_predictions = backtest(nl_stats, reg, years[5:], predictors)

In [50]:
mean_ap

0.7450353965427178

In [51]:
#A measure that compares a players stat to the league average
stat_ratios = nl_stats[['ERA', 'SO', 'IP', 'WHIP', 'W-L%', 'Year']].groupby('Year').apply(lambda x: x/x.mean())

In [52]:
stat_ratios

Unnamed: 0,ERA,SO,IP,WHIP,W-L%,Year
0,0.582833,0.573168,0.532932,0.877362,0.793904,1.0
1,0.648585,3.750924,2.844837,0.751798,0.984690,1.0
2,0.864207,0.405220,0.328009,1.009446,1.123351,1.0
3,0.662006,2.166982,1.726856,0.733176,1.033595,1.0
4,0.587480,3.859053,2.992178,0.778316,1.032702,1.0
...,...,...,...,...,...,...
10819,0.705279,2.208263,2.772863,0.774504,1.211713,1.0
10820,0.526067,2.577301,2.745438,0.720180,1.170282,1.0
10821,0.883803,0.587111,1.000784,0.942721,0.934480,1.0
10822,0.752792,0.245755,0.336246,0.950741,0.803170,1.0


In [53]:
nl_stats[['ERA-', 'SO+', 'IP+', 'WHIP-', 'W-L%+']] = stat_ratios[['ERA', 'SO', 'IP', 'WHIP', 'W-L%']]

In [54]:
nl_stats.head()

Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,GS,...,W-L%,GB,Division,BB%,SO%,ERA-,SO+,IP+,WHIP-,W-L%+
0,A.J. Burnett,22,1999,Florida Marlins,NL,4,2,3.48,7,7,...,0.395,39.0,NL East,0.137363,0.181319,0.582833,0.573168,0.532932,0.877362,0.793904
1,A.J. Burnett,25,2002,Florida Marlins,NL,12,9,3.3,31,29,...,0.488,23.0,NL East,0.106635,0.240521,0.648585,3.750924,2.844837,0.751798,0.98469
2,A.J. Burnett,26,2003,Florida Marlins,NL,0,2,4.7,4,4,...,0.562,10.0,NL East,0.169811,0.198113,0.864207,0.40522,0.328009,1.009446,1.123351
3,A.J. Burnett,27,2004,Florida Marlins,NL,7,6,3.68,20,19,...,0.512,13.0,NL East,0.077551,0.230612,0.662006,2.166982,1.726856,0.733176,1.033595
4,A.J. Burnett,28,2005,Florida Marlins,NL,12,12,3.44,32,32,...,0.512,7.0,NL East,0.090493,0.226804,0.58748,3.859053,2.992178,0.778316,1.032702


In [55]:
predictors += ['ERA-', 'SO+', 'IP+', 'WHIP-', 'W-L%+']

In [56]:
mean_ap, aps, all_predictions = backtest(nl_stats, reg, years[5:], predictors)

In [57]:
mean_ap

0.7462791049828087

In [58]:
# The inclusion of additional predictors has shown an increase in our Mean Precision metric. Satisfied with the results, we can now begin predicting the 2022 NL Cy Young winner

## 2022 NL Cy Young Prediction

In [59]:
train = nl_stats[nl_stats["Year"] < 2022]

In [60]:
test = nl_stats[nl_stats["Year"] == 2022]

In [61]:
#Ridge is a from of linear regression that is designed to prevent overfitting. Shrinks the linear regression coeffecients to prevent overfitting
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)   #Alpha controllers

In [62]:
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [63]:
predictions = reg.predict(test[predictors])

In [64]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [65]:
combination = pd.concat([test[["Player", "Age", "Year"]], predictions], axis=1)

In [66]:
combination

Unnamed: 0,Player,Age,Year,predictions
14,A.J. Ladwig,29,2022,0.012410
15,A.J. Minter,28,2022,0.038241
37,Aaron Ashby,24,2022,-0.000233
44,Aaron Brooks,32,2022,-0.001632
55,Aaron Fletcher,26,2022,-0.000985
...,...,...,...,...
10772,Zach Eflin,28,2022,0.008668
10789,Zach Thompson,28,2022,-0.036277
10803,Zack Littell,26,2022,-0.000924
10806,Zack Thompson,24,2022,-0.002036


In [67]:
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [80]:
#Our final predictions for the NL Cy Young Award
combination.head(5)

Unnamed: 0,Player,Age,Year,predictions,Predicted_Rk
8991,Sandy Alcantara,26,2022,0.211544,1
92,Aaron Nola,29,2022,0.154936,2
1594,Carlos Rodon,29,2022,0.149925,3
2170,Corbin Burnes,27,2022,0.132127,4
10731,Zac Gallen,26,2022,0.117616,5
