In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats
import time
import glob

In [3]:
# First and last season requested from FanGraphs.
START = 1973
END = 2023

# Local copy of the download. Delete this file to force a fresh fetch, e.g.
# after changing START, END or the qual threshold.
RAW_CSV = "fiddy.csv"

if os.path.exists(RAW_CSV):
    # Reuse the saved download so a full re-run does not depend on the
    # FanGraphs endpoint answering (it has returned HTTP 500 for this request).
    batting = pd.read_csv(RAW_CSV, index_col=0)
else:
    # NOTE(review): qual=200 is presumably the minimum plate appearances per
    # season -- confirm against the pybaseball documentation.
    batting = batting_stats(START, END, qual=200)
    # Keep only players (IDfg) that have more than one row.
    batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

    batting.to_csv(RAW_CSV)

# Disabled: stitches separately saved bat*.csv files into all_batting.csv.
#extension = 'csv'
#all_batting = glob.glob('bat*.{}'.format(extension))

#concat_bat = pd.concat([pd.read_csv(f) for f in all_batting ], ignore_index=True)
#print(concat_bat)

#concat_bat.to_csv('all_batting.csv', index=False)

HTTPError: Error accessing 'https://www.fangraphs.com/leaders.aspx'. Received status code 500

In [None]:
# Load the combined batting table.
# NOTE(review): no active cell writes all_batting.csv; the only code that
# does is the commented-out concat in the download cell -- confirm the file
# is present before running.
allbat = pd.read_csv('all_batting.csv', low_memory=False)

In [None]:

def next_season(player):
    """Attach the following row's WAR to each season of a single player.

    ``player`` holds one player's rows. The result is a copy sorted by
    "Season" with a "Next_WAR" column equal to the "WAR" of the next row;
    the last row gets NaN because nothing follows it.
    """
    ordered = player.sort_values("Season")
    upcoming_war = ordered["WAR"].shift(-1)
    return ordered.assign(Next_WAR=upcoming_war)

# Run next_season once per player (IDfg); group_keys=False keeps IDfg from
# being added to the result index.
allbat = allbat.groupby("IDfg", group_keys=False).apply(next_season)
# Spot-check the new target column next to the current-season value.
allbat[["Name", "Season", "WAR", "Next_WAR"]]

In [None]:
# Number of missing values in each column.
null_count = allbat.isnull().sum()

null_count

In [None]:
# Names of the columns that contain no missing values at all.
complete_col = list(allbat.columns[null_count == 0])

In [None]:
# Keep the fully populated columns plus the target. Next_WAR is added back
# explicitly: shift(-1) leaves it empty on each player's last row, so it is
# presumably absent from complete_col.
allbat = allbat[complete_col + ["Next_WAR"]].copy()
allbat

In [None]:
# Inspect column dtypes to spot non-numeric columns before modelling.
allbat.dtypes

In [None]:
# Remove columns the model cannot use. "Unnamed: 0" is presumably an index
# column left over from a CSV round trip.
# drop(..., errors="ignore") keeps this cell re-runnable: `del` raises
# KeyError once a column has already been removed or was never present.
allbat = allbat.drop(columns=["Age Rng", "Unnamed: 0"], errors="ignore")

In [None]:
# Encode Team as an integer category code, one code per distinct Team value.
allbat["team_code"] = allbat["Team"].astype("category").cat.codes

In [None]:
# Snapshot of the table taken before any rows are dropped.
batting_full = allbat.copy()

# Drop every row that still has a missing value; .copy() detaches the result
# from the pre-drop frame.
allbat = allbat.dropna().copy()

In [None]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Ridge regression with regularisation strength alpha=1.
rr = Ridge(alpha=1)

# Three-fold time-series cross-validation used to score candidate feature sets.
split = TimeSeriesSplit(n_splits=3)

# Forward selection: add one feature at a time until 15 are chosen, scoring
# each candidate with `split` on up to 8 parallel jobs.
sfs = SequentialFeatureSelector(rr, n_features_to_select=15, direction="forward", cv=split, n_jobs=8)

In [None]:
# Columns kept out of the feature matrix: the target plus Name, Team, IDfg
# and Season.
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
# Every other column is a candidate predictor.
selected_columns = allbat.columns[~allbat.columns.isin(removed_columns)]

In [None]:
# Ridge's penalty depends on feature magnitude, so the candidate predictors
# must share a common scale. Rather than standardising to mean 0 / std 1,
# each column is rescaled with MinMaxScaler.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Replace the columns outright instead of writing through .loc[:, cols].
# The .loc form tries to keep each column's existing dtype, so storing float
# results in integer columns (e.g. the int-coded team_code) is deprecated
# upcasting in pandas 2.x and is slated to become an error.
allbat[selected_columns] = scaler.fit_transform(allbat[selected_columns])

In [None]:
# Summary statistics of the numeric columns.
allbat.describe()

In [None]:
# TimeSeriesSplit builds its folds from row position, so "train on earlier
# rows, validate on later rows" only means past-vs-future when the rows are in
# chronological order. Nothing upstream orders allbat by Season across
# players, so sort before fitting (stable sort keeps ties in their
# existing order).
chronological = allbat.sort_values("Season", kind="stable")
sfs.fit(chronological[selected_columns], chronological["Next_WAR"])

In [None]:
# Names of the columns the selector kept; get_support() is used as a mask
# over selected_columns.
predictors = list(selected_columns[sfs.get_support()])

In [None]:
def backtest(data, model, predictors, start=5, step=1):
    """Walk-forward evaluation of ``model`` over the seasons in ``data``.

    The distinct "Season" values are taken in ascending order. Beginning with
    the season at position ``start`` and advancing ``step`` positions at a
    time, ``model`` is refit on every row from strictly earlier seasons and
    then predicts "Next_WAR" for that season's rows.

    Returns one DataFrame holding all predicted rows (original row index) with
    the columns "actual" and "prediction". ``model`` is left fitted on the
    last training window.
    """
    seasons = sorted(data["Season"].unique())
    folds = []

    for position in range(start, len(seasons), step):
        season = seasons[position]

        history = data[data["Season"] < season]
        holdout = data[data["Season"] == season]

        model.fit(history[predictors], history["Next_WAR"])

        forecast = pd.Series(model.predict(holdout[predictors]), index=holdout.index)
        fold = pd.concat([holdout["Next_WAR"], forecast], axis=1)
        fold.columns = ["actual", "prediction"]
        folds.append(fold)

    return pd.concat(folds)

In [None]:
# Backtest the ridge model using the selected predictors.
predictions = backtest(allbat, rr, predictors)

predictions

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the backtest predictions.
mse = mean_squared_error(predictions["actual"], predictions["prediction"])

In [None]:
# Distribution of the target, as a yardstick for the size of the error.
allbat["Next_WAR"].describe()

In [None]:
# Root mean squared error: the square root of the MSE computed above.
mse ** 0.5

The baseline is an okay model, but it could be better. Let's improve it by adding features that describe each player's history.

In [None]:
def player_history(df):
    """Add career-trajectory features for one player's seasons.

    Intended for ``groupby("IDfg").apply``: ``df`` holds a single player's
    rows and must contain "Season" and "WAR". The input is not modified.

    Returns a copy sorted by "Season" with three extra columns:

    - ``player_season``: 0-based position of the row within the player's
      sorted seasons.
    - ``war_corr``: expanding correlation between ``player_season`` and
      "WAR"; 1 where it is undefined (e.g. the first season).
    - ``war_diff``: "WAR" divided by the previous row's "WAR" (a ratio,
      despite the name); 1 where it is undefined or not finite, i.e. the
      first season or a previous "WAR" of 0.
    """
    df = df.sort_values("Season")

    df["player_season"] = range(0, df.shape[0])

    # Assign the filled result back instead of calling fillna(inplace=True) on
    # df["war_corr"]: that form may act on a temporary and leave df unchanged.
    trend = df["player_season"].expanding().corr(df["WAR"])
    df["war_corr"] = trend.fillna(1)

    # x/0 yields +inf or -inf and 0/0 yields NaN. Map all of them to the
    # neutral ratio 1 so no non-finite value reaches the model; the previous
    # code only caught +inf.
    ratio = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"] = ratio.replace([np.inf, -np.inf], np.nan).fillna(1)

    return df

# Build the history features one player (IDfg) at a time.
allbat = allbat.groupby("IDfg", group_keys=False).apply(player_history)

In [None]:
def group_averages(df):
    """Express each row's "WAR" as a multiple of the mean "WAR" in ``df``.

    Returns a Series indexed like ``df``.
    """
    war = df["WAR"]
    group_mean = war.mean()
    return war / group_mean

In [None]:
# Each row's WAR relative to the mean WAR of all rows in the same Season.
allbat["war_season"] = allbat.groupby("Season", group_keys=False).apply(group_averages)

In [None]:
# Extend the selected predictors with the engineered history features.
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [None]:
# Eyeball 50 random rows of the new features. No random_state is set, so the
# sample differs on every run.
allbat[["Name", "Season", "WAR", "Next_WAR","player_season", "war_corr", "war_season", "war_diff"]].sample(50)

In [None]:
# Column dtypes and non-null counts after feature engineering.
allbat.info()

In [None]:
# Re-run the backtest with the extended predictor list. This overwrites the
# baseline `predictions`.
predictions = backtest(allbat, rr, new_predictors)
predictions

In [None]:
# Report RMSE rather than raw MSE so this number is directly comparable with
# the baseline figure shown earlier (`mse ** 0.5`).
new_mse = mean_squared_error(predictions["actual"], predictions["prediction"])
new_mse ** 0.5

In [None]:
# Coefficients from rr's most recent fit (the last backtest iteration),
# labelled by predictor and sorted ascending.
pd.Series(rr.coef_, index=new_predictors).sort_values()

In [None]:
# Signed error per row: actual minus predicted.
diff = predictions["actual"] - predictions["prediction"]
diff

In [None]:
# Join the predictions to the player rows on the shared row index.
merged = predictions.merge(allbat, left_index=True, right_index=True)
# Absolute error; the assignment aligns on the row index.
merged["diff"] = (diff.abs())
merged

In [None]:
# Rank rows from the smallest to the largest absolute error.
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])