# American League

## Reading the Data Into Pandas and Cleaning It

In [1]:
import pandas as pd

In [2]:
#Opening up the player stat database with stats for 1990-2022
al_stats = pd.read_csv("al_stats.csv")

In [3]:
al_stats

Unnamed: 0.1,Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,0,A.J. Achter,25,2014,Minnesota Twins,AL,1,0,3.27,7,...,4.1,1.67,0.0,0.0,0.0,70,92,0.432,20.0,AL Central
1,44,A.J. Achter,27,2016,Los Angeles Angels,AL,1,0,3.11,27,...,3.3,1.17,0.0,0.0,0.0,74,88,0.457,21.0,AL West
2,21,A.J. Achter,26,2015,Minnesota Twins,AL,0,1,6.75,11,...,9.5,2.33,0.0,0.0,0.0,83,79,0.512,12.0,AL Central
3,71,A.J. Alexy,23,2021,Texas Rangers,AL,3,1,4.70,5,...,6.7,1.00,0.0,0.0,0.0,60,102,0.370,35.0,AL West
4,98,A.J. Alexy,24,2022,Texas Rangers,AL,1,1,11.57,4,...,7.7,0.67,0.0,0.0,0.0,68,94,0.420,38.0,AL West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10358,10185,Zack Littell,24,2020,Minnesota Twins,AL,0,0,9.95,6,...,4.3,1.00,0.0,0.0,0.0,36,24,0.600,0.0,AL Central
10359,2148,Zack Littell,22,2018,Minnesota Twins,AL,0,2,6.20,8,...,6.2,1.27,0.0,0.0,0.0,78,84,0.481,13.0,AL Central
10360,1543,Zack Weiss,30,2022,Los Angeles Angels,AL,0,1,3.38,12,...,12.2,2.57,0.0,0.0,0.0,73,89,0.451,33.0,AL West
10361,8688,Zak Shinall,24,1993,Seattle Mariners,AL,0,0,3.38,1,...,0.0,0.00,0.0,0.0,0.0,82,80,0.506,12.0,AL West


In [4]:
del al_stats['Unnamed: 0']

In [5]:
al_stats

Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,GS,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,A.J. Achter,25,2014,Minnesota Twins,AL,1,0,3.27,7,0,...,4.1,1.67,0.0,0.0,0.0,70,92,0.432,20.0,AL Central
1,A.J. Achter,27,2016,Los Angeles Angels,AL,1,0,3.11,27,0,...,3.3,1.17,0.0,0.0,0.0,74,88,0.457,21.0,AL West
2,A.J. Achter,26,2015,Minnesota Twins,AL,0,1,6.75,11,0,...,9.5,2.33,0.0,0.0,0.0,83,79,0.512,12.0,AL Central
3,A.J. Alexy,23,2021,Texas Rangers,AL,3,1,4.70,5,4,...,6.7,1.00,0.0,0.0,0.0,60,102,0.370,35.0,AL West
4,A.J. Alexy,24,2022,Texas Rangers,AL,1,1,11.57,4,0,...,7.7,0.67,0.0,0.0,0.0,68,94,0.420,38.0,AL West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10358,Zack Littell,24,2020,Minnesota Twins,AL,0,0,9.95,6,0,...,4.3,1.00,0.0,0.0,0.0,36,24,0.600,0.0,AL Central
10359,Zack Littell,22,2018,Minnesota Twins,AL,0,2,6.20,8,2,...,6.2,1.27,0.0,0.0,0.0,78,84,0.481,13.0,AL Central
10360,Zack Weiss,30,2022,Los Angeles Angels,AL,0,1,3.38,12,0,...,12.2,2.57,0.0,0.0,0.0,73,89,0.451,33.0,AL West
10361,Zak Shinall,24,1993,Seattle Mariners,AL,0,0,3.38,1,0,...,0.0,0.00,0.0,0.0,0.0,82,80,0.506,12.0,AL West


In [6]:
#See if we have any missing data
pd.isnull(al_stats).sum()

Player         0
Age            0
Year           0
Tm             0
Lg             0
W_p            0
L_p            0
ERA            1
G              0
GS             0
GF             0
CG             0
SHO            0
SV             0
IP             0
H              0
R              0
ER             0
HR             0
BB             0
IBB            0
SO             0
HBP            0
BK             0
WP             0
BF             0
FIP            7
WHIP           7
H9             7
HR9            7
BB9            7
SO9            7
SO/W         311
Vote Pts       0
1st Place      0
Share          0
W              0
L              0
W-L%           0
GB             0
Division       0
dtype: int64

In [7]:
# 'ERA' column has 1 NaN value. This is possible if a pitcher enters a game and doesn't record an out or allow an earned run
al_stats[al_stats['ERA'].isnull()][['Player', 'ERA', 'IP', 'BF', 'ER']]

Unnamed: 0,Player,ERA,IP,BF,ER
9167,Shane Halter,,0.0,1,0


In [8]:
# 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9' all have 7 NaN values. One of these players (Shane Halter) is also the source of the NaN value in ERA
# The common denominator is players with 0 IP. Since there are only 7 players with 0 IP, we can feel comfortable removing them from the dataframe
al_stats[al_stats['FIP'].isnull()][['Player', 'IP', 'ER', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9']]

Unnamed: 0,Player,IP,ER,BF,FIP,WHIP,H9,HR9,BB9,SO9
444,Andrew Vasquez,0.0,3,3,,,,,,
1019,Brad Pennington,0.0,1,4,,,,,,
3361,Erik Sabel,0.0,2,2,,,,,,
5486,Jonathan Stiever,0.0,3,4,,,,,,
6495,Lino Urdaneta,0.0,6,6,,,,,,
7580,Nate Jones,0.0,4,5,,,,,,
9167,Shane Halter,0.0,0,1,,,,,,


In [9]:
remove = al_stats[al_stats['IP'] == 0].index

In [10]:
remove

Int64Index([444, 1019, 3361, 5486, 6495, 7580, 9167], dtype='int64')

In [11]:
al_stats = al_stats.drop(index=(remove))

In [12]:
al_stats

Unnamed: 0,Player,Age,Year,Tm,Lg,W_p,L_p,ERA,G,GS,...,SO9,SO/W,Vote Pts,1st Place,Share,W,L,W-L%,GB,Division
0,A.J. Achter,25,2014,Minnesota Twins,AL,1,0,3.27,7,0,...,4.1,1.67,0.0,0.0,0.0,70,92,0.432,20.0,AL Central
1,A.J. Achter,27,2016,Los Angeles Angels,AL,1,0,3.11,27,0,...,3.3,1.17,0.0,0.0,0.0,74,88,0.457,21.0,AL West
2,A.J. Achter,26,2015,Minnesota Twins,AL,0,1,6.75,11,0,...,9.5,2.33,0.0,0.0,0.0,83,79,0.512,12.0,AL Central
3,A.J. Alexy,23,2021,Texas Rangers,AL,3,1,4.70,5,4,...,6.7,1.00,0.0,0.0,0.0,60,102,0.370,35.0,AL West
4,A.J. Alexy,24,2022,Texas Rangers,AL,1,1,11.57,4,0,...,7.7,0.67,0.0,0.0,0.0,68,94,0.420,38.0,AL West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10358,Zack Littell,24,2020,Minnesota Twins,AL,0,0,9.95,6,0,...,4.3,1.00,0.0,0.0,0.0,36,24,0.600,0.0,AL Central
10359,Zack Littell,22,2018,Minnesota Twins,AL,0,2,6.20,8,2,...,6.2,1.27,0.0,0.0,0.0,78,84,0.481,13.0,AL Central
10360,Zack Weiss,30,2022,Los Angeles Angels,AL,0,1,3.38,12,0,...,12.2,2.57,0.0,0.0,0.0,73,89,0.451,33.0,AL West
10361,Zak Shinall,24,1993,Seattle Mariners,AL,0,0,3.38,1,0,...,0.0,0.00,0.0,0.0,0.0,82,80,0.506,12.0,AL West


In [13]:
#See if we have any missing data
pd.isnull(al_stats).sum()

Player         0
Age            0
Year           0
Tm             0
Lg             0
W_p            0
L_p            0
ERA            0
G              0
GS             0
GF             0
CG             0
SHO            0
SV             0
IP             0
H              0
R              0
ER             0
HR             0
BB             0
IBB            0
SO             0
HBP            0
BK             0
WP             0
BF             0
FIP            0
WHIP           0
H9             0
HR9            0
BB9            0
SO9            0
SO/W         309
Vote Pts       0
1st Place      0
Share          0
W              0
L              0
W-L%           0
GB             0
Division       0
dtype: int64

In [14]:
# We can remove the 'SO/W' column because we already have the 'SO' and 'BB' columns
del al_stats['SO/W']

In [15]:
#See if we have any missing data
pd.isnull(al_stats).sum()

Player       0
Age          0
Year         0
Tm           0
Lg           0
W_p          0
L_p          0
ERA          0
G            0
GS           0
GF           0
CG           0
SHO          0
SV           0
IP           0
H            0
R            0
ER           0
HR           0
BB           0
IBB          0
SO           0
HBP          0
BK           0
WP           0
BF           0
FIP          0
WHIP         0
H9           0
HR9          0
BB9          0
SO9          0
Vote Pts     0
1st Place    0
Share        0
W            0
L            0
W-L%         0
GB           0
Division     0
dtype: int64

In [16]:
al_stats.to_csv('al_stats2.csv')

## Training a Machine Learning Model

In [17]:
#Seeing what columns we would like to use for our predictions
al_stats.columns
#In this case we will use most of the numeric columns to make predictions (the vote columns are excluded because 'Share' is the value we are predicting)

Index(['Player', 'Age', 'Year', 'Tm', 'Lg', 'W_p', 'L_p', 'ERA', 'G', 'GS',
       'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO',
       'HBP', 'BK', 'WP', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'BB9', 'SO9',
       'Vote Pts', '1st Place', 'Share', 'W', 'L', 'W-L%', 'GB', 'Division'],
      dtype='object')

In [18]:
predictors = ['Age', 'Year', 'W_p', 'L_p', 'ERA', 'G', 'GS',
       'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB', 'SO',
       'HBP', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'W-L%', 'GB']

In [19]:
train = al_stats[al_stats["Year"] < 2021]

In [20]:
test = al_stats[al_stats["Year"] == 2021]

In [21]:
# Ridge is a form of linear regression designed to reduce overfitting: it shrinks
# the regression coefficients toward zero.
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)   # alpha controls the strength of the shrinkage (larger alpha = more shrinkage)

In [22]:
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [23]:
predictions = reg.predict(test[predictors])

In [24]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [25]:
combination = pd.concat([test[["Player", "Age", "Year", "Share"]], predictions], axis=1)

In [26]:
combination

Unnamed: 0,Player,Age,Year,Share,predictions
3,A.J. Alexy,23,2021,0.0,0.008097
14,A.J. Cole,29,2021,0.0,-0.002362
22,A.J. Puk,26,2021,0.0,-0.018180
27,AJ Ramos,34,2021,0.0,0.002314
38,Aaron Bummer,27,2021,0.0,0.009111
...,...,...,...,...,...
10289,Zac Lowther,25,2021,0.0,-0.002801
10319,Zach Plesac,26,2021,0.0,0.019992
10328,Zack Britton,33,2021,0.0,-0.012552
10340,Zack Burdi,26,2021,0.0,0.007423


In [27]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Age,Year,Share,predictions
8406,Robbie Ray,29,2021,0.99,0.140716
3722,Gerrit Cole,30,2021,0.59,0.186117
6427,Lance Lynn,34,2021,0.23,0.109806
7609,Nathan Eovaldi,31,2021,0.2,0.085595
1612,Carlos Rodon,28,2021,0.16,0.143163
3591,Frankie Montas,28,2021,0.1,0.09789
6435,Lance McCullers Jr.,27,2021,0.07,0.082495
6488,Liam Hendriks,32,2021,0.05,0.078604
5570,Jose Berrios,27,2021,0.04,0.102375
1859,Chris Bassitt,32,2021,0.01,0.106246


## Identifying an Error Metric

In [28]:
#Mean Squared Error
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"], combination["predictions"])

0.0025710310097370264

In [29]:
combination["Share"].value_counts()

0.00    433
0.16      1
0.01      1
0.10      1
0.59      1
0.04      1
0.23      1
0.07      1
0.05      1
0.20      1
0.99      1
Name: Share, dtype: int64

In [30]:
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))

In [31]:
combination.head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk
8406,Robbie Ray,29,2021,0.99,0.140716,1
3722,Gerrit Cole,30,2021,0.59,0.186117,2
6427,Lance Lynn,34,2021,0.23,0.109806,3
7609,Nathan Eovaldi,31,2021,0.2,0.085595,4
1612,Carlos Rodon,28,2021,0.16,0.143163,5
3591,Frankie Montas,28,2021,0.1,0.09789,6
6435,Lance McCullers Jr.,27,2021,0.07,0.082495,7
6488,Liam Hendriks,32,2021,0.05,0.078604,8
5570,Jose Berrios,27,2021,0.04,0.102375,9
1859,Chris Bassitt,32,2021,0.01,0.106246,10


In [32]:
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [33]:
#Compare our predictions to our actual values
combination.head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
3722,Gerrit Cole,30,2021,0.59,0.186117,2,1
1612,Carlos Rodon,28,2021,0.16,0.143163,5,2
8406,Robbie Ray,29,2021,0.99,0.140716,1,3
3129,Dylan Cease,25,2021,0.0,0.125525,416,4
6427,Lance Lynn,34,2021,0.23,0.109806,3,5
9092,Sean Manaea,29,2021,0.0,0.1064,205,6
1859,Chris Bassitt,32,2021,0.01,0.106246,10,7
5570,Jose Berrios,27,2021,0.04,0.102375,9,8
3591,Frankie Montas,28,2021,0.1,0.09789,6,9
6535,Lucas Giolito,26,2021,0.0,0.093479,74,10


In [34]:
#Average Precision: Error metric to measure our rank accuracy
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
8406,Robbie Ray,29,2021,0.99,0.140716,1,3
3722,Gerrit Cole,30,2021,0.59,0.186117,2,1
6427,Lance Lynn,34,2021,0.23,0.109806,3,5
7609,Nathan Eovaldi,31,2021,0.2,0.085595,4,12
1612,Carlos Rodon,28,2021,0.16,0.143163,5,2
3591,Frankie Montas,28,2021,0.1,0.09789,6,9
6435,Lance McCullers Jr.,27,2021,0.07,0.082495,7,13
6488,Liam Hendriks,32,2021,0.05,0.078604,8,15
5570,Jose Berrios,27,2021,0.04,0.102375,9,8
1859,Chris Bassitt,32,2021,0.01,0.106246,10,7


In [35]:
# Average precision: measures how high in our predicted ranking the actual
# top vote-getters (top 3 by default, configurable via `top_n`) appear.
def find_ap(combination, top_n=3):
    """Return the average precision of the predicted ranking.

    The `top_n` rows with the highest "Share" are treated as the players we
    want to find. The rows are then walked in descending "predictions" order;
    every time one of those players is encountered, the precision so far
    (players found / rows seen) is recorded. The result is the mean of the
    recorded precisions.

    Parameters
    ----------
    combination : pandas.DataFrame
        Must contain the columns "Player", "Share" and "predictions".
    top_n : int, default 3
        How many of the highest-"Share" rows count as correct answers.

    Returns
    -------
    float
        Mean precision in [0, 1]. Returns 0.0 when no target player is found
        (for example when `combination` is empty) instead of dividing by zero.

    Notes
    -----
    Matching is done on the "Player" name, so every row carrying a target
    player's name counts as a hit.
    """
    actual = combination.sort_values("Share", ascending=False).head(top_n)
    predicted = combination.sort_values("predictions", ascending=False)

    # A set gives constant-time membership checks while walking the ranking.
    target_players = set(actual["Player"])

    precisions = []
    found = 0
    for seen, player in enumerate(predicted["Player"], start=1):
        if player in target_players:
            found += 1
            precisions.append(found / seen)

    # Nothing matched: report the worst score rather than raising.
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [36]:
#The lower the result, the further down our ranked predictions we had to go to find the actual top-3 Cy Young vote-getters
find_ap(combination)

0.7555555555555555

# Implementing Backtesting to Predict Each Year

In [37]:
years = list(range(1990,2022))

In [38]:
# Walk-forward backtest: for every test season, fit on all earlier seasons only,
# predict that season, and record its average precision.
# Note: `train`, `test`, `predictions` and `combination` are notebook globals, so
# after the loop they hold the values from the last season in `years`.
aps = []
all_predictions = []
for year in years[5:]:      #Skip the first 5 entries of `years` (1990-1994) so the first test season, 1995, has earlier seasons to train on
    train = al_stats[al_stats["Year"] < year]
    test = al_stats[al_stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
    combination = pd.concat([test[["Player", "Age", "Year", "Share"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [39]:
sum(aps) / len(aps)

0.692848604974508

In [40]:
# Use the Ranks as diagnostics to improve our algorithm
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending=False)
    combination["Rk"] = list(range(1,combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending=False)
    combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))
    combination['Diff'] = combination['Rk'] - combination['Predicted_Rk']
    return combination

In [41]:
ranking = add_ranks(all_predictions[1])
ranking[ranking['Rk'] < 4].sort_values('Diff', ascending=False)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk,Diff
7823,Pat Hentgen,27,1996,0.79,0.138712,1,1,0
464,Andy Pettitte,24,1996,0.74,0.084788,2,8,-6
6705,Mariano Rivera,26,1996,0.13,0.060823,3,12,-9


In [42]:
def backtest(stats, model, year, predictors):
    """Walk-forward backtest of `model` over a sequence of seasons.

    For each test season the model is fitted on every row of `stats` from
    earlier seasons, used to predict that season, and scored with `find_ap`.
    Rank diagnostics are attached to each season's predictions via `add_ranks`.

    Parameters
    ----------
    stats : pandas.DataFrame
        Player-season rows. Must contain "Year", "Player", "Age", "Share",
        "IP" and every column named in `predictors`.
    model : object
        Estimator exposing `fit(X, y)` and `predict(X)`.
    year : iterable of int, or a single int
        Season(s) to test on. The parameter keeps its singular name for
        backward compatibility with existing calls, which pass a list.
    predictors : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        `(mean_ap, aps, all_predictions)`: the mean average precision, the
        per-season average precisions, and one DataFrame with every season's
        ranked predictions concatenated.

    Raises
    ------
    ValueError
        If `year` contains no seasons.
    """
    # Accept either an iterable of seasons or a single season.
    try:
        seasons = list(year)
    except TypeError:
        seasons = [year]
    if not seasons:
        raise ValueError("backtest needs at least one season to test on")

    aps = []
    all_predictions = []
    for season in seasons:
        # Use the arguments that were passed in, not the notebook globals.
        train = stats[stats["Year"] < season]
        test = stats[stats["Year"] == season]
        model.fit(train[predictors], train["Share"])
        # Predict with the model that was just fitted.
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Year", "Player", "Age", "Share", "IP"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

In [43]:
mean_ap, aps, all_predictions = backtest(al_stats, reg, years[5:], predictors)

In [44]:
mean_ap

0.7049137003729368

In [45]:
#Looking at the largest difference between the actual rankings and our predictions
all_predictions[all_predictions['Rk'] <= 3].sort_values('Diff').head(10)

Unnamed: 0,Year,Player,Age,Share,IP,predictions,Rk,Predicted_Rk,Diff
3547,2008,Francisco Rodriguez,26,0.23,68.1,0.017494,3,57,-54
1839,2006,Chien-Ming Wang,26,0.36,218.0,0.038563,2,36,-34
6703,2004,Mariano Rivera,34,0.19,78.2,0.03549,3,34,-31
6704,1999,Mariano Rivera,29,0.19,69.0,0.030904,3,24,-21
6702,2005,Mariano Rivera,35,0.49,78.1,0.05029,2,21,-19
5623,1995,Jose Mesa,29,0.39,64.0,0.065154,2,12,-10
6705,1996,Mariano Rivera,26,0.13,107.2,0.060823,3,12,-9
4021,2020,Hyun Jin Ryu,33,0.24,67.0,0.036008,3,11,-8
9644,1995,Tim Wakefield,28,0.21,195.1,0.068603,3,10,-7
8606,2006,Roy Halladay,29,0.34,220.0,0.089522,3,10,-7


In [46]:
#Once again looking at our top 3 predictions compared to actual rankings
combination = combination.sort_values("Share", ascending=False)
combination["Rk"] = list(range(1,combination.shape[0]+1))
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [47]:
combination.sort_values("Share", ascending=False).head(3)

Unnamed: 0,Player,Age,Year,Share,predictions,Rk,Predicted_Rk
8406,Robbie Ray,29,2021,0.99,0.140716,1,3
3722,Gerrit Cole,30,2021,0.59,0.186117,2,1
6427,Lance Lynn,34,2021,0.23,0.109806,3,5


In [48]:
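#Lists the predictors sorted by Ridge coefficient, largest first. The predictors are not standardized, so a coefficient's size also reflects its column's scale, not only its importance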
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
9,0.013007,SHO
21,0.00651,FIP
8,0.005606,CG
2,0.004323,W_p
23,0.001962,H9
11,0.001502,IP
13,0.001016,R
18,0.001006,SO
17,0.000826,IBB
0,0.00033,Age


# Adding More Predictors

In [49]:
# Rate stats: walks and strikeouts per batter faced.
al_stats['BB%'] = al_stats['BB'] / al_stats['BF']
al_stats['SO%'] = al_stats['SO'] / al_stats['BF']
# Rebind instead of using `+=`, and skip names that are already present, so that
# re-running this cell does not append duplicate predictor columns.
predictors = predictors + [col for col in ('BB%', 'SO%') if col not in predictors]

In [50]:
mean_ap, aps, all_predictions = backtest(al_stats, reg, years[5:], predictors)

In [51]:
mean_ap

0.7207034805536471

In [52]:
list(predictors)

['Age',
 'Year',
 'W_p',
 'L_p',
 'ERA',
 'G',
 'GS',
 'GF',
 'CG',
 'SHO',
 'SV',
 'IP',
 'H',
 'R',
 'ER',
 'HR',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'BF',
 'FIP',
 'WHIP',
 'H9',
 'HR9',
 'W-L%',
 'GB',
 'BB%',
 'SO%']

In [53]:
predictors = ['Age', 'Year', 'W_p', 'L_p', 'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'IBB', 'HBP', 'BF', 'FIP', 'WHIP', 'H9', 'HR9', 'W-L%', 'GB', 'BB%', 'SO%']

In [54]:
mean_ap, aps, all_predictions = backtest(al_stats, reg, years[5:], predictors)

In [55]:
mean_ap

0.7508577156725305

In [56]:
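#Lists the predictors sorted by Ridge coefficient, largest first. The predictors are not standardized, so a coefficient's size also reflects its column's scale, not only its importance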
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
25,0.069048,BB%
26,0.026024,SO%
9,0.014091,SHO
2,0.004786,W_p
8,0.003704,CG
11,0.002969,IP
22,0.001646,HR9
21,0.001236,H9
13,0.000809,R
17,0.000401,HBP


In [57]:
# Unlike with the NL Cy Young, adding more predictors does not necessarily benefit our model. In this case, substituting similar predictors (the 'BB%' and 'SO%' rates in place of the raw 'BB' and 'SO' counts) improves our algorithm. Satisfied with the results, we can now begin predicting the 2022 AL Cy Young winner

## 2022 AL Cy Young Prediction

In [58]:
train = al_stats[al_stats["Year"] < 2022]

In [59]:
test = al_stats[al_stats["Year"] == 2022]

In [60]:
# Ridge is a form of linear regression designed to reduce overfitting: it shrinks
# the regression coefficients toward zero.
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)   # alpha controls the strength of the shrinkage (larger alpha = more shrinkage)

In [61]:
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [62]:
predictions = reg.predict(test[predictors])

In [63]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [64]:
combination = pd.concat([test[["Player", "Age", "Year"]], predictions], axis=1)

In [65]:
combination

Unnamed: 0,Player,Age,Year,predictions
4,A.J. Alexy,24,2022,0.007516
23,A.J. Puk,27,2022,0.014149
37,Aaron Bummer,28,2022,0.003499
42,Aaron Civale,27,2022,0.007668
69,Aaron Loup,34,2022,-0.021989
...,...,...,...,...
10320,Zach Pop,25,2022,0.014699
10332,Zack Britton,34,2022,-0.015859
10353,Zack Greinke,38,2022,-0.008710
10356,Zack Kelly,27,2022,0.004285


In [66]:
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1,combination.shape[0]+1))

In [67]:
#Our final predictions for the AL Cy Young Award
combination.head(3)

Unnamed: 0,Player,Age,Year,predictions,Predicted_Rk
5951,Justin Verlander,39,2022,0.150167,1
3513,Framber Valdez,28,2022,0.122555,2
3128,Dylan Cease,26,2022,0.114732,3
