In [5]:
import os 
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [6]:
START = 2004
END = 2024

In [7]:
batting = batting_stats(START, END, qual=200)

In [8]:
batting.to_csv("batting.csv")

In [9]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [11]:
# This data frame is all of the batting data for players between 2004-2024 who have had at least 200 plate appeareances
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
4,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.6
10,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
70,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,0.0,,0,0.200,0.266,,,,10.1
157,10155,2012,Mike Trout,LAA,20,139,559,639,182,117,...,,0.0,,0,0.221,0.293,,,,10.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6452,8585,2009,Yuniesky Betancourt,- - -,27,134,470,508,115,83,...,,0.0,,0,0.181,0.232,,,,-2.4
7006,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
6474,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.6
6620,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9


In [12]:
def next_season(player):
    player = player.sort_values("Season")
    player["NEXT_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting = batting.groupby("IDfg", group_keys=False).apply(next_season)


In [13]:
batting[["Name", "Season", "WAR", "NEXT_WAR"]]

Unnamed: 0,Name,Season,WAR,NEXT_WAR
5484,Alfredo Amezaga,2006,1.1,2.0
4909,Alfredo Amezaga,2007,2.0,1.2
5157,Alfredo Amezaga,2008,1.2,
2478,Garret Anderson,2004,0.8,-0.2
4069,Garret Anderson,2005,-0.2,0.1
...,...,...,...,...
1215,Seiya Suzuki,2024,2.3,
5081,Zach Neto,2023,1.1,2.2
3004,Zach Neto,2024,2.2,
2421,Masataka Yoshida,2023,0.6,0.1


In [14]:
null_count = batting.isnull().sum()

In [15]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6744
xSLG        6744
xwOBA       6744
L-WAR          0
NEXT_WAR    1207
Length: 321, dtype: int64

In [16]:
complete_cols = list(batting.columns[null_count == 0])

In [17]:
complete_cols

['IDfg',
 'Season',
 'Name',
 'Team',
 'Age',
 'G',
 'AB',
 'PA',
 'H',
 '1B',
 '2B',
 '3B',
 'HR',
 'R',
 'RBI',
 'BB',
 'IBB',
 'SO',
 'HBP',
 'SF',
 'SH',
 'GDP',
 'SB',
 'CS',
 'AVG',
 'GB',
 'FB',
 'LD',
 'IFFB',
 'Pitches',
 'Balls',
 'Strikes',
 'IFH',
 'BU',
 'BUH',
 'BB%',
 'K%',
 'BB/K',
 'OBP',
 'SLG',
 'OPS',
 'ISO',
 'BABIP',
 'GB/FB',
 'LD%',
 'GB%',
 'FB%',
 'IFFB%',
 'HR/FB',
 'IFH%',
 'BUH%',
 'wOBA',
 'wRAA',
 'wRC',
 'Bat',
 'Rep',
 'Pos',
 'RAR',
 'WAR',
 'Dol',
 'Spd',
 'wRC+',
 'WPA',
 '-WPA',
 '+WPA',
 'RE24',
 'REW',
 'pLI',
 'PH',
 'WPA/LI',
 'Clutch',
 'FB% (Pitch)',
 'FBv',
 'SL%',
 'SLv',
 'CB%',
 'CBv',
 'CH%',
 'CHv',
 'wFB',
 'wSL',
 'wCB',
 'wCH',
 'wFB/C',
 'wSL/C',
 'wCB/C',
 'wCH/C',
 'O-Swing%',
 'Z-Swing%',
 'Swing%',
 'O-Contact%',
 'Z-Contact%',
 'Contact%',
 'Zone%',
 'F-Strike%',
 'SwStr%',
 'BsR',
 'Def',
 'wSB',
 'Age Rng',
 'Off',
 'Lg',
 'Pull%',
 'Cent%',
 'Oppo%',
 'Soft%',
 'Med%',
 'Hard%',
 'TTO%',
 'AVG+',
 'BB%+',
 'K%+',
 'OBP+',
 'S

In [18]:
batting = batting[complete_cols + ["NEXT_WAR"]].copy()

In [19]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,NEXT_WAR
5484,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
4909,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5157,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
2478,2,2004,Garret Anderson,ANA,32,112,442,475,133,98,...,103,75,78,106,98,0,0.176,0.270,0.8,-0.2
4069,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,78,83,100,97,106,0,0.150,0.258,-0.2,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1215,30116,2024,Seiya Suzuki,CHC,29,88,340,383,93,54,...,125,86,75,96,120,238,0.205,0.296,2.3,
5081,31347,2023,Zach Neto,LAA,22,84,289,329,65,39,...,82,103,81,108,97,216,0.161,0.290,1.1,2.2
3004,31347,2024,Zach Neto,LAA,23,109,367,406,94,56,...,89,89,91,100,106,281,0.160,0.286,2.2,
2421,31837,2023,Masataka Yoshida,BOS,29,140,537,580,155,104,...,102,117,123,99,91,458,0.212,0.285,0.6,0.1


In [20]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
NEXT_WAR    float64
Length: 131, dtype: object

In [21]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [22]:
batting["Dol"]

5484      $5.5
4909     $11.2
5157      $7.2
2478      $3.4
4069    ($1.2)
         ...  
1215     $18.7
5081      $8.9
3004     $17.9
2421      $4.8
3785      $0.5
Name: Dol, Length: 6744, dtype: object

In [23]:
del batting["Dol"]

In [24]:
batting["Age Rng"]

5484    28 - 28
4909    29 - 29
5157    30 - 30
2478    32 - 32
4069    33 - 33
         ...   
1215    29 - 29
5081    22 - 22
3004    23 - 23
2421    29 - 29
3785    30 - 30
Name: Age Rng, Length: 6744, dtype: object

In [25]:
del batting["Age Rng"]

In [30]:
batting["Team_Code"] =  batting["Team"].astype("category").cat.codes

In [35]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [36]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [37]:
removed_cols = ["NEXT_WAR", "Name", "Team", "IDfg", "Season"]
selected_cols = batting.columns[~batting.columns.isin(removed_cols)]

In [39]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])


  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transform(batting[selected_cols])
  batting.loc[:, selected_cols] = scaler.fit_transfo

In [41]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,NEXT_WAR,Team_Code
count,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,...,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0,5537.0
mean,6978.167058,2013.163627,0.356143,0.649077,0.476371,0.477838,0.360908,0.285923,0.38883,0.100172,...,0.402198,0.411122,0.509725,0.468299,0.227437,0.497911,0.55183,0.339416,1.758154,0.479039
std,6137.835556,5.673558,0.145616,0.255848,0.241316,0.260889,0.181194,0.137847,0.167153,0.104292,...,0.130916,0.121268,0.132891,0.133415,0.293529,0.136366,0.121706,0.127327,1.942325,0.305139
min,1.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,1677.0,2008.0,0.230769,0.470085,0.27518,0.256055,0.211207,0.174528,0.254237,0.043478,...,0.315789,0.331461,0.42029,0.375,0.0,0.408511,0.471366,0.246667,0.4,0.205882
50%,5222.0,2013.0,0.346154,0.709402,0.498201,0.50346,0.362069,0.278302,0.372881,0.086957,...,0.398496,0.404494,0.507246,0.472222,0.0,0.493617,0.550661,0.32,1.4,0.470588
75%,11265.0,2018.0,0.461538,0.871795,0.685252,0.705882,0.50431,0.386792,0.508475,0.130435,...,0.488722,0.488764,0.594203,0.555556,0.477462,0.591489,0.634361,0.406667,2.8,0.735294
max,31837.0,2023.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.2,1.0


In [44]:
sfs.fit(batting[selected_cols], batting["NEXT_WAR"])

In [48]:
predictors = list(selected_cols[sfs.get_support()])

In [53]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []

    years = sorted(data["Season"].unique())

    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["NEXT_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["NEXT_WAR"], preds], axis=1)
        combined.columns = ["Actual", "Prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [54]:
predictions = backtest(batting, rr, predictors)

In [55]:
predictions

Unnamed: 0,Actual,Prediction
5446,1.6,0.353958
2674,0.3,1.349977
4624,-1.1,0.389569
6098,0.2,0.605145
2312,0.4,1.009840
...,...,...
4541,1.4,1.936450
1373,0.6,2.569413
1310,2.3,2.386686
5081,2.2,2.247648


In [57]:
from sklearn.metrics import mean_squared_error

In [59]:
mean_squared_error(predictions["Actual"], predictions["Prediction"])

2.5929308447187367

In [62]:
# Rule of Thumb is to get the mean_squared_error lower than the std dev value to indicate our model is better than just random guesses
batting["NEXT_WAR"].describe()

count    5537.000000
mean        1.758154
std         1.942325
min        -3.100000
25%         0.400000
50%         1.400000
75%         2.800000
max        11.200000
Name: NEXT_WAR, dtype: float64

In [64]:
2.5929308447187367 ** .5

1.6102580056372136

In [79]:
def player_history(df):
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
    df["war_corr"].fillna(1, inplace=True)

    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    
    df["war_diff"][df["war_diff"] == np.inf] = 1

    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["war_corr"].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["war_diff"].fillna(1, inplace=True)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default beha

In [80]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [84]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [85]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [86]:
predictions = backtest(batting, rr, new_predictors)

In [87]:
mean_squared_error(predictions["Actual"], predictions["Prediction"]) 

2.5504135719796452

In [88]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.932391
G               -1.783551
ISO             -1.718438
WAR             -1.610597
BABIP           -1.507009
O-Swing%        -1.492796
BU              -1.136711
Soft%+          -0.775183
LD+%            -0.490703
BABIP+          -0.433974
war_diff        -0.343573
war_corr        -0.150458
player_season    0.012615
PH               0.335667
GDP              0.460070
CS               0.527972
Oppo%            0.629570
O-Contact%       0.762220
Swing%           0.934272
Spd              1.137245
Strikes          1.558588
IBB              1.931718
Hard%+           2.308678
war_season       3.218566
dtype: float64

In [90]:
diff = predictions["Actual"] - predictions["Prediction"]

In [91]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [92]:
merged["diff"] = (predictions["Actual"] - predictions["Prediction"]).abs()

In [94]:
merged[["IDfg", "Season", "Name", "WAR", "NEXT_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,NEXT_WAR,diff
7047,15082,2017,Adam Engel,0.220000,1.1,0.000077
5748,6609,2013,Freddy Galvis,0.220000,1.6,0.000485
2264,14566,2020,Franmil Reyes,0.260000,1.7,0.000899
2686,3531,2015,Troy Tulowitzki,0.366667,2.3,0.000996
1835,5361,2011,Freddie Freeman,0.246667,1.7,0.001939
...,...,...,...,...,...,...
3705,1875,2009,Josh Hamilton,0.293333,8.4,6.379830
798,9166,2010,Buster Posey,0.466667,9.8,6.386642
409,15640,2021,Aaron Judge,0.573333,11.2,7.444170
2427,11579,2014,Bryce Harper,0.313333,9.3,7.500714
