In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [2]:
START = 2002
END = 2022

In [3]:
### SCRAPING DATA ###

In [4]:
batting = batting_stats(START, END, qual=200)

In [5]:
# grouped by player
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [6]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,0,0.127,0.191,,,,12.7
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
8,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.2
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,0,0.135,0.223,,,,10.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6885,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,0.0,,0,0.166,0.252,,,,-2.4
7042,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-3.1
6673,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9
6988,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,0,0.130,0.187,,,,-2.9


In [7]:
### CREATING ML TARGET ###

In [8]:
# for every player create value in next_AVG column
def next_season(player):
    player = player.sort_values("Season")
    player["Next_AVG"] = player["AVG"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [9]:
batting[["Name", "Season", "AVG", "Next_AVG"]]

Unnamed: 0,Name,Season,AVG,Next_AVG
5562,Alfredo Amezaga,2006,0.260,0.263
5006,Alfredo Amezaga,2007,0.263,0.264
5252,Alfredo Amezaga,2008,0.264,
1169,Garret Anderson,2002,0.306,0.315
864,Garret Anderson,2003,0.315,0.301
...,...,...,...,...
6002,Owen Miller,2022,0.243,
4881,Andrew Vaughn,2021,0.235,0.271
3377,Andrew Vaughn,2022,0.271,
6620,Ha-seong Kim,2021,0.202,0.251


In [10]:
### CLEANING THE DATA ###

In [11]:
null_count = batting.isnull().sum()
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6754
xSLG        6754
xwOBA       6754
L-WAR          0
Next_AVG    1179
Length: 321, dtype: int64

In [12]:
# columns with 0 null values
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_AVG"]].copy()

In [13]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [14]:
# deleting object types
del batting["Dol"]
del batting["Age Rng"]

In [15]:
# creating team codes as numeric values
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [16]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [17]:
### SELECTING FEATURES ###

In [18]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit


# creating ridge, timeseries split, and sequential feature selector from sklearn
rr = Ridge(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)

In [19]:
# select the correct columns
removed_columns = ["Next_AVG", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [20]:
# scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [21]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_AVG,team_code
count,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,...,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0,5575.0
mean,5366.78583,2011.163229,0.3606,0.652755,0.478666,0.480943,0.365973,0.290481,0.399279,0.103459,...,0.403164,0.410923,0.511026,0.478646,0.172991,0.498932,0.545898,0.334663,0.261648,0.474128
std,5133.255295,5.612014,0.147476,0.255929,0.242481,0.26229,0.182585,0.138786,0.171732,0.105891,...,0.131213,0.121082,0.130359,0.133992,0.273858,0.13718,0.120701,0.120013,0.03212,0.305105
min,1.0,2002.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144,0.0
25%,1131.5,2006.0,0.269231,0.478632,0.27518,0.257785,0.211207,0.179245,0.258621,0.043478,...,0.315789,0.331461,0.42029,0.387755,0.0,0.408511,0.46696,0.248447,0.241,0.205882
50%,3531.0,2011.0,0.346154,0.709402,0.505396,0.508651,0.37069,0.283019,0.37931,0.086957,...,0.398496,0.404494,0.507246,0.489796,0.0,0.493617,0.546256,0.31677,0.262,0.470588
75%,9015.0,2016.0,0.461538,0.871795,0.688849,0.710208,0.508621,0.391509,0.517241,0.130435,...,0.488722,0.483146,0.594203,0.564626,0.346411,0.591489,0.625551,0.403727,0.284,0.735294
max,27506.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.372,1.0


In [22]:
sfs.fit(batting[selected_columns], batting["Next_AVG"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=4)

In [23]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['Age',
 'IBB',
 'SO',
 'SH',
 'AVG',
 'GB',
 'IFFB',
 'K%',
 'Pos',
 'Spd',
 'CB%',
 'CH%',
 'CHv',
 'Swing%',
 'Contact%',
 'Oppo%',
 'Soft%',
 'HR/FB%+',
 'Cent%+',
 'Hard%+']

In [24]:
### MAKING THE PREDICTIONS ###

In [25]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_AVG"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_AVG"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [26]:
predictions = backtest(batting, rr, predictors)
predictions

Unnamed: 0,actual,prediction
5006,0.264,0.270711
1925,0.293,0.272631
3102,0.265,0.278703
5797,0.276,0.263152
1109,0.270,0.257763
...,...,...
1914,0.277,0.275658
5875,0.208,0.227587
7032,0.243,0.255145
4881,0.271,0.263407


In [27]:
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"], predictions["prediction"])

0.0007501977592075455

In [None]:
# show that picking the right stat and scale to predict is very important