In [None]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("..")

from analysis.utils import describe_endpoint, compile_average_player_values, monetary_string_to_numeric
from data_sources import PyBaseball, MLBStatsAPI, Salary
from analysis.batter_data_structure import KEEP_RENAME_MAP

from dotenv import load_dotenv
load_dotenv()

import logging

# Client objects for the external data sources used by the cells below.
py_baseball = PyBaseball()
mlb_api = MLBStatsAPI()

# Source file locations are read from environment variables (load_dotenv() ran above);
# os.getenv returns None for any variable that is not set.
payroll_source_paths = dict(
    historical=os.getenv("MLB_PAYROLLS"),
    recent=os.getenv("MLB_PAYROLLS_2025"),
)
salary_source_paths = dict(historical=os.getenv("MLB_PLAYER_SALARY_DATA"))

salary = Salary(
    payroll_source_paths=payroll_source_paths,
    salary_source_paths=salary_source_paths,
)
payrolls = salary.payroll()

# Contains matching keys between data sources
# May be helpful down the road
# chadwick = py_baseball.player_search.chadwick()

# force=True resets handlers in Jupyter (Py3.8+)
logging.basicConfig(level=logging.WARNING, force=True)

# Quiet the HTTP client loggers: only ERROR and above, and nothing forwarded to the root logger.
for name in ("urllib3", "urllib3.connectionpool", "requests"):
    quiet_logger = logging.getLogger(name)
    quiet_logger.setLevel(logging.ERROR)
    quiet_logger.propagate = False

### Utils

In [None]:
def _filter_and_rename(df: pd.DataFrame, rename_map: dict, rename_id: str):
    df = df[list(rename_map[rename_id].keys())].rename(rename_map[rename_id])
    return df

def _reformat_statcast_name(name: str):
    name_split = name.split(", ")
    return f"{name_split[-1]} {name_split[0]}"

### Batters

In [None]:
def standard_batter_stats_data_preprocessing(season: int, batter_stats_collection: dict):
    """Build one season's batter stats table with salaries and add it to a collection.

    Args:
        season: Season to fetch stats, bWAR rows and the league minimum salary for.
        batter_stats_collection: Mapping of season -> DataFrame built so far.
            It is not mutated.

    Returns:
        A new dict: ``batter_stats_collection`` plus ``{season: batter_stats}``.
    """
    # Request the season being processed so each collection entry holds that
    # year's stats rather than a fixed year's.
    batter_stats = py_baseball.batter.stats(start_season=season)
    batter_stats = _filter_and_rename(batter_stats, KEEP_RENAME_MAP, rename_id="stats")

    # Used to obtain the player salaries
    batter_bwar = py_baseball.batter.bref_war(season)
    batter_bwar = _filter_and_rename(batter_bwar, KEEP_RENAME_MAP, rename_id="bref_war")

    # Left join keeps every batter from the stats table even without a bWAR row.
    # NOTE(review): assumes both KEEP_RENAME_MAP entries produce the
    # player_name/team/season columns and that "bref_war" yields "salary" - confirm.
    batter_stats = batter_stats.merge(
        batter_bwar,
        how="left",
        on=["player_name", "team", "season"],
    )

    # For now assume that na values are league minimum
    league_minimum = salary.league_minimum_salaries(season)
    batter_stats["salary"] = batter_stats["salary"].fillna(np.float64(league_minimum))

    return batter_stats_collection | {season: batter_stats}

# Inclusive range of seasons for the standard batting stats.
start_season = 2009
end_season = 2025

# season -> preprocessed batter stats frame; rebuilt from scratch on every run of this cell.
batter_stats = {}
for season in range(start_season, end_season + 1):
    batter_stats = standard_batter_stats_data_preprocessing(
        season=season,
        batter_stats_collection=batter_stats,
    )

In [None]:
def standard_batter_statcast_preprocessing(season: int, batter_statcast_collection: dict):
    """Combine one season's Statcast expected stats and percentile ranks.

    The two tables are filtered/renamed through ``KEEP_RENAME_MAP`` (ids
    ``"statcast_exp"`` and ``"statcast_pct"``), left-joined on ``player_id``,
    stripped of rows without a ``player_name``, and the names are passed
    through ``_reformat_statcast_name``.

    Args:
        season: Season to fetch.
        batter_statcast_collection: Mapping of season -> DataFrame built so far.
            It is not mutated.

    Returns:
        A new dict: the input collection plus ``{season: statcast}``, where
        ``statcast`` has a fresh 0..n-1 index.
    """
    expected = _filter_and_rename(
        py_baseball.batter.statcast_expected_stats(season),
        KEEP_RENAME_MAP,
        "statcast_exp",
    )
    percentile = _filter_and_rename(
        py_baseball.batter.statcast_percentile_ranks(season),
        KEEP_RENAME_MAP,
        "statcast_pct",
    )

    statcast = (
        expected
        .merge(percentile, how="left", on=["player_id"])
        .dropna(subset="player_name")
        .assign(player_name=lambda frame: frame["player_name"].apply(_reformat_statcast_name))
        .reset_index(drop=True)
    )
    return batter_statcast_collection | {season: statcast}


# Inclusive range of seasons for the Statcast tables.
start_season = 2015
end_season = 2025

# season -> merged Statcast frame; rebuilt from scratch on every run of this cell.
batter_statcast = {}
for season in range(start_season, end_season + 1):
    batter_statcast = standard_batter_statcast_preprocessing(
        season=season,
        batter_statcast_collection=batter_statcast,
    )

In [None]:
%%capture

def standard_batter_stats_data_preprocessing(season: int, batter_stats_collection: dict):
    """Build one season's batter stats table with salaries, using the sources' raw column names.

    NOTE(review): a function with this same name is defined in an earlier
    cell of this notebook; running this cell rebinds the name to this
    variant, which merges on the unrenamed ``Name``/``Team``/``Season``
    columns and also keeps ``mlb_ID``. Confirm which version is intended.

    Args:
        season: Season to fetch stats, bWAR rows and the league minimum salary for.
        batter_stats_collection: Mapping of season -> DataFrame built so far.
            It is not mutated.

    Returns:
        A new dict: ``batter_stats_collection`` plus ``{season: batter_stats}``.
    """
    # Request the season being processed so each collection entry holds that
    # year's stats rather than a fixed year's.
    batter_stats = py_baseball.batter.stats(start_season=season)

    # Used to obtain the player salaries
    bwar_keep_cols = ["name_common", "mlb_ID", "year_ID", "team_ID", "salary"]
    batter_bwar = py_baseball.batter.bref_war(season)
    batter_bwar = batter_bwar[bwar_keep_cols]

    batter_stats = (
        batter_stats
        .merge(
            batter_bwar,
            how="left",
            left_on=["Name", "Team", "Season"],
            right_on=["name_common", "team_ID", "year_ID"],
        )
        # The right-hand join keys duplicate Name/Team/Season after the merge.
        .drop(["name_common", "year_ID", "team_ID"], axis=1)
    )

    # For now assume that na values are league minimum
    league_minimum = salary.league_minimum_salaries(season)
    batter_stats["salary"] = batter_stats["salary"].fillna(np.float64(league_minimum))

    return batter_stats_collection | {season: batter_stats}

# Inclusive range of seasons to collect.
start_season, end_season = 2009, 2025

# season -> batter stats frame; rebuilt from scratch on every run of this cell.
batter_stats = {}
for season in range(start_season, end_season + 1):
    # Call the function defined above in this cell; no function named
    # `standard_batter_stats_data_cleanup` exists in this notebook.
    batter_stats = standard_batter_stats_data_preprocessing(season, batter_stats)

In [None]:
# Container for free-agent tables; nothing is stored in it within this cell.
free_agent_dfs = {}
free_agents_path = os.getenv("MLB_FREE_AGENTS")

# NOTE(review): `free_agents_2012` is not assigned in any visible cell, so this
# fails on a fresh kernel - presumably it should be loaded from
# `free_agents_path` first; confirm and add the load step.
# Keep only the text before the first space in each column label.
cols = [label.split(" ", 1)[0] for label in free_agents_2012.columns]
free_agents_2012.columns = cols
free_agents_2012

In [None]:
# @TODO 
"""
Clean up free agent data to fill in missing values.

Check the next year in the bref_war dataframe to see if the player played and if a salary is available for that player.
If the player did not play, remove the row from the free agents dataframe.
If the player did play but does not have a salary from bref_war, use the league minimum salary as a safe assumption.
"""

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import MissingIndicator
from xgboost import XGBRegressor

# Feature name lists. `base_feats` is not referenced by the transformer below,
# which spells out its numeric and categorical columns explicitly.
base_feats = ["age", "position", "team", "PA", "WAR_prev", "wRC_plus_prev"]
statcast_feats = ["avg_ev", "max_ev", "la", "hardhit_rate", "spins", "chase_rate"]

# Preprocessing: median-impute the numeric base columns, one-hot encode the
# categorical ones (unknown categories are ignored at transform time), and
# median-impute the Statcast columns while appending missing-value indicators.
pre = ColumnTransformer(
    transformers=[
        ("base_num", SimpleImputer(strategy="median"), ["age", "PA", "WAR_prev", "wRC_plus_prev"]),
        ("base_cat", OneHotEncoder(handle_unknown="ignore"), ["position", "team"]),
        ("sc_num", SimpleImputer(strategy="median", add_indicator=True), statcast_feats),
    ]
)

# Preprocessing followed by the gradient-boosted regressor.
model = Pipeline(
    steps=[
        ("pre", pre),
        ("est", XGBRegressor(n_estimators=800, max_depth=6)),
    ]
)
