In [1]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("..")

from analysis.utils import describe_endpoint, compile_average_player_values, monetary_string_to_numeric
from data_sources import PyBaseball, MLBStatsAPI, Salary
from analysis.batter_data_structure import KEEP_RENAME_MAP

from dotenv import load_dotenv
load_dotenv()

py_baseball = PyBaseball()
mlb_api = MLBStatsAPI()

payroll_source_paths = {
    "historical": os.getenv("MLB_PAYROLLS"),
    "recent": os.getenv("MLB_PAYROLLS_2025")
}

salary_source_paths = {
    "historical": os.getenv("MLB_PLAYER_SALARY_DATA")
}

salary = Salary(payroll_source_paths=payroll_source_paths, salary_source_paths=salary_source_paths)
payrolls = salary.payroll()

# Contains matching keys between data sources
# chadwick = py_baseball.player_search.chadwick()

import logging
logging.basicConfig(level=logging.WARNING, force=True)  # force=True resets handlers in Jupyter (Py3.8+)

for name in ("urllib3", "urllib3.connectionpool", "requests"):
    logging.getLogger(name).setLevel(logging.ERROR)
    logging.getLogger(name).propagate = False

### Utils

In [14]:
def _filter_and_rename(df: pd.DataFrame, rename_map: dict, rename_id: str):
    df = df[list(rename_map[rename_id].keys())].rename(rename_map[rename_id])
    return df

def _reformat_statcast_name(name: str):
    name_split = name.split(", ")
    return f"{name_split[-1]} {name_split[0]}"

### Batters

In [None]:
def standard_batter_stats_data_preprocessing(season: int, batter_stats_collection: dict):
    batter_stats = py_baseball.batter.stats(start_season=2025)
    batter_stats = _filter_and_rename(batter_stats, KEEP_RENAME_MAP, rename_id="stats")
    # Used to obtain the player salaries
    batter_bwar = py_baseball.batter.bref_war(season)
    batter_bwar = _filter_and_rename(batter_bwar, KEEP_RENAME_MAP, rename_id="bref_war")
    

    batter_stats = (
        batter_stats
        .merge(
            batter_bwar, 
            how="left", 
            on=["player_name", "team", "season"]
            )
    )
    
    # For now assume that na values are league minimum
    league_minimum = salary.league_minimum_salaries(season)
    batter_stats["salary"] = batter_stats["salary"].fillna(np.float64(league_minimum))
    
    return batter_stats_collection | {season: batter_stats}

start_season, end_season = 2009, 2025

batter_stats = {}
for season in range(start_season, end_season + 1):
    batter_stats = standard_batter_stats_data_preprocessing(season, batter_stats)

NameError: name 'standard_batter_stats_data_cleanup' is not defined

In [18]:
def standard_batter_statcast_preprocessing(season: int, batter_statcast_collection: dict):
    batter_statcast_expected = py_baseball.batter.statcast_expected_stats(season)
    batter_statcast_expected = _filter_and_rename(batter_statcast_expected, KEEP_RENAME_MAP, "statcast_exp")


    batter_statcast_percentile = py_baseball.batter.statcast_percentile_ranks(season)
    batter_statcast_percentile = _filter_and_rename(batter_statcast_percentile, KEEP_RENAME_MAP, "statcast_pct")

    statcast = batter_statcast_expected.merge(batter_statcast_percentile, how="left", on=["player_id"])

    statcast = statcast.dropna(subset="player_name")

    statcast["player_name"] = statcast["player_name"].apply(lambda player_name: _reformat_statcast_name(player_name))

    statcast = statcast.reset_index(drop=True)
    return batter_statcast_collection | {season: statcast}


start_season, end_season = 2015, 2025

batter_statcast = {}
for season in range(start_season, end_season + 1):
    batter_statcast = standard_batter_statcast_preprocessing(season, batter_statcast)

In [None]:
%%capture

def standard_batter_stats_data_preprocessing(season: int, batter_stats_collection: dict):
    batter_stats = py_baseball.batter.stats(start_season=2025)
    # Used to obtain the player salaries
    bwar_keep_cols = ["name_common", "mlb_ID", "year_ID", "team_ID", "salary"]
    batter_bwar = py_baseball.batter.bref_war(season)
    batter_bwar = batter_bwar[bwar_keep_cols]

    batter_stats = (
        batter_stats
        .merge(
            batter_bwar, 
            how="left", 
            left_on=["Name", "Team", "Season"], 
            right_on=["name_common", "team_ID", "year_ID"]
            )
        .drop(
            ["name_common", "year_ID", "team_ID"], 
            axis=1
            )
    )
    
    # For now assume that na values are league minimum
    league_minimum = salary.league_minimum_salaries(season)
    batter_stats["salary"] = batter_stats["salary"].fillna(np.float64(league_minimum))
    
    return batter_stats_collection | {season: batter_stats}

start_season, end_season = 2009, 2025

batter_stats = {}
for season in range(start_season, end_season + 1):
    batter_stats = standard_batter_stats_data_cleanup(season, batter_stats)

2025-11-12 11:23:24,199 -    DEBUG - urllib3.connectionpool(7748) - Starting new HTTPS connection (1): www.fangraphs.com:443


2025-11-12 11:23:26,127 -    DEBUG - urllib3.connectionpool(7748) - https://www.fangraphs.com:443 "GET /leaders-legacy.aspx?pos=all&stats=bat&lg=all&qual=1&type=c%2C-1%2C3%2C4%2C5%2C6%2C7%2C8%2C9%2C10%2C11%2C12%2C13%2C14%2C15%2C16%2C17%2C18%2C19%2C20%2C21%2C22%2C23%2C24%2C25%2C26%2C27%2C28%2C29%2C30%2C31%2C32%2C33%2C34%2C35%2C36%2C37%2C38%2C39%2C40%2C41%2C42%2C43%2C44%2C45%2C46%2C47%2C48%2C49%2C50%2C51%2C52%2C53%2C54%2C55%2C56%2C57%2C58%2C59%2C60%2C61%2C62%2C63%2C64%2C65%2C66%2C67%2C68%2C69%2C70%2C71%2C72%2C73%2C74%2C75%2C76%2C77%2C78%2C79%2C80%2C81%2C82%2C83%2C84%2C85%2C86%2C87%2C88%2C89%2C90%2C91%2C92%2C93%2C94%2C95%2C96%2C97%2C98%2C99%2C100%2C101%2C102%2C103%2C104%2C105%2C106%2C107%2C108%2C109%2C110%2C111%2C112%2C113%2C114%2C115%2C116%2C117%2C118%2C119%2C120%2C121%2C122%2C123%2C124%2C125%2C126%2C127%2C128%2C129%2C130%2C131%2C132%2C133%2C134%2C135%2C136%2C137%2C138%2C139%2C140%2C141%2C142%2C143%2C144%2C145%2C146%2C147%2C148%2C149%2C150%2C151%2C152%2C153%2C154%2C155%2C156%2C157%2C158%

In [None]:
free_agent_dfs = {}
free_agents_path = os.getenv("MLB_FREE_AGENTS")
cols = [c.split(" ")[0] for c in free_agents_2012.columns]
free_agents_2012.columns = cols
free_agents_2012

Unnamed: 0,From,Player,Pos,Yrs,Value,AAV
0,LAA,Albert Pujols,DH/1B,10,"$240,000,000","$24,000,000"
1,DET,Prince Fielder,DH/1B,9,"$214,000,000","$23,777,778"
2,MIA,Jose Reyes,SS/2B,6,"$106,000,000","$17,666,667"
3,LAA,C.J. Wilson,SP,5,"$77,500,000","$15,500,000"
4,MIA,Mark Buehrle,SP/SP4,4,"$58,000,000","$14,500,000"
...,...,...,...,...,...,...
130,HOU,Jack Cust,DH,0,$0,
131,CHW,Kosuke Fukudome,RF,0,$0,
132,CHW,Orlando Hudson,3B/2B,0,$0,
133,CIN,Mike Wuertz,RP,0,$0,


In [None]:
# @TODO 
"""
Clean up free agent data to fill in missing values.

Check the next year in the bref_war dataframe to see if the player played and if a salary is available for that player.
If the player did not play, remove the row from the free agents dataframe.
If the player did play but does not have a salary from bref_war, use the league minimum salary as a safe assumption.
"""

Unnamed: 0,name_common,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,pitcher,G,PA,salary,runs_above_avg,runs_above_avg_off,runs_above_avg_def,WAR_rep,WAA,WAR
37754,Kosuke Fukudome,493120.0,fukudko01,2008,CHC,1,NL,N,150,590.0,7000000.0,-9.8,-9.8,-5.0,1.76,-1.2,0.56
37755,Kosuke Fukudome,493120.0,fukudko01,2009,CHC,1,NL,N,146,603.0,12500000.0,3.3,5.3,-1.2,1.84,0.25,2.09
37756,Kosuke Fukudome,493120.0,fukudko01,2010,CHC,1,NL,N,130,429.0,14000000.0,0.4,1.4,-4.6,1.29,-0.05,1.24
37757,Kosuke Fukudome,493120.0,fukudko01,2011,CHC,1,NL,N,87,345.0,14500000.0,1.6,0.6,-1.8,1.06,0.14,1.2
37758,Kosuke Fukudome,493120.0,fukudko01,2011,CLE,2,AL,N,59,258.0,,-13.2,-10.2,-4.6,0.89,-1.39,-0.5
37759,Kosuke Fukudome,493120.0,fukudko01,2012,CHW,1,AL,N,24,51.0,500000.0,-3.7,-4.7,0.3,0.18,-0.4,-0.22


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import MissingIndicator
from xgboost import XGBRegressor

base_feats = ["age","position","team","PA","WAR_prev","wRC_plus_prev"]
statcast_feats = ["avg_ev","max_ev","la","hardhit_rate","spins","chase_rate"]

pre = ColumnTransformer([
    ("base_num", SimpleImputer(strategy="median"), ["age","PA","WAR_prev","wRC_plus_prev"]),
    ("base_cat", OneHotEncoder(handle_unknown="ignore"), ["position","team"]),
    ("sc_num", SimpleImputer(strategy="median", add_indicator=True), statcast_feats),
])

model = Pipeline([("pre", pre), ("est", XGBRegressor(n_estimators=800, max_depth=6))])
