In [1]:
# Add src to sys.path
import sys
import os
sys.path.append(os.path.abspath("../src"))

import importlib
import data_utils, model_utils, shap_utils, ensemble_utils
importlib.reload(data_utils)
importlib.reload(model_utils)
importlib.reload(shap_utils)
importlib.reload(ensemble_utils)

from config import TARGET_VAR, set_seed, device
from data_utils import load_batting_years, build_feature_dataset
from model_utils import PlayerMLP
from shap_utils import explain_shap, get_top_shap_features
from ensemble_utils import load_ensemble_and_predict

import torch
import torch.nn as nn
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# set_seed and device are imported from config above; set_seed presumably seeds
# the random number generators used below — TODO confirm against config.py.
set_seed()
# Show which torch device config selected.
print("Device:", device)


Device: mps


# Don't run every time

In [None]:
from pybaseball import statcast
from datetime import datetime, timedelta
import pandas as pd
import time
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO


def fetch_statcast_data(start_date="2023-03-30", end_date=None, verbose=True):
    """
    Download Statcast data in 7-day chunks and keep only rows with a recorded event.

    Parameters:
    - start_date: first date to fetch, as a "YYYY-MM-DD" string
    - end_date: last date to fetch (inclusive), as "YYYY-MM-DD"; defaults to today
    - verbose: when True, `statcast` writes its own output and the loop pauses one
      second between requests; when False, the library's stdout/stderr is captured
      and discarded (the per-chunk "Fetching" line is printed either way)

    Returns:
    - DataFrame concatenating every successfully downloaded, non-empty chunk,
      filtered to rows whose 'events' value is not null

    Raises:
    - ValueError: if start_date is after end_date, or if no chunk returned any rows
    """
    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')

    if verbose:
        print(f"Downloading Statcast data from {start_date} to {end_date}...")

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    if start > end:
        raise ValueError(f"start_date {start_date} is after end_date {end_date}")

    # statcast() treats both bounds as inclusive, so each chunk covers 7 calendar
    # days (start + 6) and the next chunk starts on the following day. Sharing a
    # boundary day between chunks would download that day's rows twice.
    chunk_span = timedelta(days=6)
    one_day = timedelta(days=1)

    all_data = []
    chunk_count = 0
    skipped = 0
    current = start

    # `<=` so that a single-day range (start == end) is still fetched.
    while current <= end:
        chunk_end = min(current + chunk_span, end)
        chunk_start_str = current.strftime('%Y-%m-%d')
        chunk_end_str = chunk_end.strftime('%Y-%m-%d')
        print(f"Fetching {current.date()} → {chunk_end.date()}...")
        try:
            if verbose:
                chunk = statcast(start_dt=chunk_start_str, end_dt=chunk_end_str)
            else:
                # Silence the library's progress output without hiding our own prints.
                with redirect_stdout(StringIO()), redirect_stderr(StringIO()):
                    chunk = statcast(start_dt=chunk_start_str, end_dt=chunk_end_str)
            # Keep only chunks that actually contain rows; an empty frame may lack
            # the 'events' column and would break the filter below.
            if chunk is not None and len(chunk) > 0:
                all_data.append(chunk)
            chunk_count += 1
        except Exception as e:
            # Best effort: report the failed range and carry on with the next one.
            skipped += 1
            print(f"{current.date()} → {chunk_end.date()} failed: {e}")
        current = chunk_end + one_day
        if verbose:
            time.sleep(1)

    if not all_data:
        raise ValueError(
            f"No Statcast data was downloaded for {start_date} to {end_date} "
            f"({chunk_count} chunks fetched, {skipped} failed)"
        )

    df_statcast = pd.concat(all_data, ignore_index=True)
    # Rows without an event are pitches that did not end a plate appearance.
    df_statcast = df_statcast[df_statcast['events'].notna()]
    print(f"Download complete: {len(df_statcast)} events from {chunk_count} chunks ({skipped} skipped)")
    return df_statcast


In [None]:
def compute_total_bases(row):
    """Return the total bases credited for one row's 'events' value (0 for anything that is not a hit)."""
    bases_by_event = {'single': 1, 'double': 2, 'triple': 3, 'home_run': 4}
    return bases_by_event.get(row['events'], 0)



In [None]:
from tqdm import tqdm

In [None]:
def build_pitcher_rolling_features(df_statcast, rolling_windows=[1, 3, 7, 14, 30], min_bf=10):
    """
    Builds rolling stats for each pitcher prior to each game date.

    For every game date (after skipping the first max(rolling_windows) distinct
    dates) and every pitcher appearing on that date, aggregates that pitcher's
    rows from the preceding `w` calendar days, for each `w` in rolling_windows.

    Parameters:
    - df_statcast: Statcast DataFrame with 'pitcher', 'game_date', 'launch_speed'
      and 'events' columns; it is not modified
    - rolling_windows: list of day ranges for feature aggregation
    - min_bf: minimum number of rows the pitcher must have inside a window for
      that window's features to be emitted (rows are counted as batters faced)

    Returns:
    - pitcher_features: DataFrame with one row per (pitcher, game_date); windows
      with fewer than min_bf rows leave their columns as NaN
    """
    # Work on a slim private copy so the caller's frame is not changed as a side
    # effect of the datetime conversion.
    stats = df_statcast[['pitcher', 'game_date', 'launch_speed', 'events']].copy()
    stats['game_date'] = pd.to_datetime(stats['game_date'])
    game_dates = stats['game_date']

    dates = sorted(game_dates.unique())
    longest_window = max(rolling_windows)
    pitcher_rows = []

    for i in tqdm(range(longest_window, len(dates)), desc="Processing pitcher dates"):
        current_date = pd.Timestamp(dates[i])
        todays_pitchers = stats.loc[game_dates == current_date, 'pitcher'].unique()

        # Every window looks back at most `longest_window` days, so slice the
        # history once per date instead of rescanning it per pitcher and window.
        recent = stats[
            (game_dates < current_date) &
            (game_dates >= current_date - pd.Timedelta(days=longest_window))
        ]

        for pitcher in todays_pitchers:
            row = {'pitcher': pitcher, 'game_date': current_date}
            pitcher_recent = recent[recent['pitcher'] == pitcher]

            for w in rolling_windows:
                window_df = pitcher_recent[
                    pitcher_recent['game_date'] >= current_date - pd.Timedelta(days=w)
                ]
                if len(window_df) < min_bf:
                    continue

                prefix = f"{w}d_"
                pa = len(window_df)
                row[prefix + 'bf'] = pa
                row[prefix + 'avg_exit_velo'] = window_df['launch_speed'].mean()
                # The 1e-5 keeps the rates finite even if min_bf is set to 0.
                row[prefix + 'hard_hit_rate'] = (window_df['launch_speed'] >= 95).sum() / (pa + 1e-5)
                row[prefix + 'hr_allowed'] = (window_df['events'] == 'home_run').sum()
                row[prefix + 'bb_rate'] = (window_df['events'] == 'walk').sum() / (pa + 1e-5)
                row[prefix + 'k_rate'] = (window_df['events'] == 'strikeout').sum() / (pa + 1e-5)

            pitcher_rows.append(row)

    if not pitcher_rows:
        # Keep the merge keys present so callers can still join on them.
        return pd.DataFrame(columns=['pitcher', 'game_date'])

    return pd.DataFrame(pitcher_rows)


In [None]:
def add_pitcher_context_features(df_games, df_statcast):
    """
    Enhances df_games with contextual pitcher features:
    - pitcher, p_throws (pitcher id and handedness)
    - stand (batter handedness)
    - handed_matchup (e.g., L_vs_R) plus one-hot 'matchup_*' columns
    - is_home flag based on inning_topbot (only added if df_games lacks it)

    The context is taken from the distinct
    (game_date, player_name, pitcher, p_throws, stand, inning_topbot) combinations
    in df_statcast and joined in a single merge, so a df_games row is repeated
    once per pitcher/stand combination that actually occurred for that player on
    that date.

    Parameters:
        df_games: DataFrame with 'game_date' and 'player_name' columns
        df_statcast: Statcast DataFrame providing the context columns

    Returns:
        df_games with additional features
    """
    keys = ['game_date', 'player_name']

    # One table keeps pitcher hand and batter stand tied to the same plate
    # appearances. Merging them separately on (date, player) pairs every pitcher
    # with every stand and invents matchups that never happened.
    context = df_statcast[keys + ['pitcher', 'p_throws', 'stand', 'inning_topbot']].drop_duplicates()
    context = context.assign(is_home=context['inning_topbot'] == 'Bot').drop(columns='inning_topbot')

    # Skip columns df_games already carries (e.g. is_home); re-merging them would
    # create is_home_x / is_home_y and break later lookups of df_games['is_home'].
    new_columns = [
        col for col in context.columns
        if col not in keys and col not in df_games.columns
    ]
    context = context[keys + new_columns].drop_duplicates()

    df_games = df_games.merge(context, on=keys, how='left')

    # Create matchup feature (e.g., L_vs_R)
    df_games['handed_matchup'] = df_games['stand'] + '_vs_' + df_games['p_throws']

    # One-hot encode matchup
    matchup_dummies = pd.get_dummies(df_games['handed_matchup'], prefix='matchup')
    df_games = pd.concat([df_games, matchup_dummies], axis=1)

    return df_games


In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

def build_rolling_feature_dataset(df_statcast, rolling_windows=[1, 3, 7, 14, 30], min_pa=5):
    """
    Build one row per (player_name, game_date) with that day's total bases and
    rolling statistics computed from the player's earlier rows.

    A window `w` covers the `w` distinct game dates that precede the current
    date in df_statcast (dates[i - w] up to, but not including, dates[i]); it is
    counted in game dates present in the data, not calendar days. The first
    max(rolling_windows) dates are skipped because they lack a full history.

    Parameters:
    - df_statcast: Statcast DataFrame with a 'total_bases' column already added
    - rolling_windows: list of window lengths, in game dates
    - min_pa: minimum number of rows a player needs inside a window for that
      window's features to be emitted

    Returns:
    - DataFrame of player-date rows with rolling features, home/away team codes,
      'inning_topbot' and an 'is_home' flag. Rows missing any window's features
      are dropped by the dropna() step.

    Raises:
    - ValueError: if there are not enough distinct game dates to produce a row
    """
    dates = sorted(df_statcast['game_date'].unique())
    longest_window = max(rolling_windows)

    def process_day(i):
        """Return the feature rows for every player appearing on dates[i]."""
        current_date = dates[i]
        game_dates = df_statcast['game_date']
        today_games = df_statcast[game_dates == current_date]

        # No window reaches further back than dates[i - longest_window], so slice
        # the history once per day instead of rescanning it per player and window.
        history = df_statcast[
            (game_dates >= dates[i - longest_window]) &
            (game_dates < current_date)
        ]
        rows = []

        for player in today_games['player_name'].unique():
            player_history = history[history['player_name'] == player]
            row = {
                'player_name': player,
                'game_date': current_date,
                'total_bases': today_games[today_games['player_name'] == player]['total_bases'].sum()
            }

            for w in rolling_windows:
                window_data = player_history[player_history['game_date'] >= dates[i - w]]

                if len(window_data) < min_pa:
                    continue

                prefix = f"{w}d_"
                pa = len(window_data)
                # NOTE(review): this counts rows with a non-null at_bat_number, not
                # official at-bats, so 'slg' below is total bases per counted row.
                at_bats = window_data['at_bat_number'].count()

                row[prefix + 'pa'] = pa
                row[prefix + 'launch_speed'] = window_data['launch_speed'].mean()
                row[prefix + 'launch_angle'] = window_data['launch_angle'].mean()
                row[prefix + 'woba'] = window_data['woba_value'].sum() / (pa + 1e-5)
                row[prefix + 'slg'] = window_data['total_bases'].sum() / (at_bats + 1e-5)
                row[prefix + 'hard_hits'] = (window_data['launch_speed'] >= 95).sum()
                row[prefix + 'hard_hit_rate'] = row[prefix + 'hard_hits'] / pa
                # NOTE(review): 'barrels' counts rows where
                # estimated_ba_using_speedangle is present — confirm this is intended.
                row[prefix + 'barrels'] = window_data['estimated_ba_using_speedangle'].notna().sum()
                row[prefix + 'barrel_rate'] = row[prefix + 'barrels'] / pa
                row[prefix + 'singles'] = (window_data['events'] == 'single').sum()
                row[prefix + 'doubles'] = (window_data['events'] == 'double').sum()
                row[prefix + 'triples'] = (window_data['events'] == 'triple').sum()
                row[prefix + 'hr'] = (window_data['events'] == 'home_run').sum()
                row[prefix + 'walks'] = (window_data['events'] == 'walk').sum()
                row[prefix + 'strikeouts'] = (window_data['events'] == 'strikeout').sum()
                row[prefix + 'hbp'] = (window_data['events'] == 'hit_by_pitch').sum()
                row[prefix + 'tb_per_pa'] = window_data['total_bases'].sum() / (pa + 1e-5)
                row[prefix + 'hr_per_pa'] = row[prefix + 'hr'] / (pa + 1e-5)
                row[prefix + 'bb_rate'] = row[prefix + 'walks'] / (pa + 1e-5)
                row[prefix + 'k_rate'] = row[prefix + 'strikeouts'] / (pa + 1e-5)
                row[prefix + 'batted_balls'] = window_data['bb_type'].count()
                row[prefix + 'events'] = window_data['events'].count()

            rows.append(row)
        return rows

    print("Building rolling feature dataset in parallel...")
    # Iterate through the final date as well; stopping at len(dates) - 1 would
    # silently leave the most recent game date out of the dataset.
    all_rows = Parallel(n_jobs=-1)(
        delayed(process_day)(i) for i in tqdm(range(longest_window, len(dates)))
    )
    player_games = [row for sublist in all_rows for row in sublist]
    if not player_games:
        raise ValueError(
            f"Need more than {longest_window} distinct game dates to build rolling "
            f"features; got {len(dates)}"
        )

    df_games = pd.DataFrame(player_games)
    # A window that did not reach min_pa leaves NaN in its columns; such rows go.
    df_games = df_games.dropna()

    venue_data = df_statcast[['game_date', 'player_name', 'home_team', 'away_team', 'inning_topbot']].drop_duplicates()
    df_games = df_games.merge(venue_data, on=['game_date', 'player_name'], how='left')
    df_games['is_home'] = df_games['inning_topbot'] == 'Bot'

    print("✅ Built enhanced rolling feature dataset:", df_games.shape)
    return df_games


In [None]:
# Only do this once
# NOTE(review): the start date is 2000-03-01 — confirm the earliest date the
# Statcast service actually returns rows for; earlier chunks only add request time.
df_statcast = fetch_statcast_data(start_date="2000-03-01", verbose=False)
df_statcast['game_date'] = pd.to_datetime(df_statcast['game_date'])
# Vectorized form of applying compute_total_bases to every row: hits map to
# 1-4 bases and every other event to 0, without a Python call per row.
df_statcast['total_bases'] = (
    df_statcast['events']
    .astype(object)
    .map({'single': 1, 'double': 2, 'triple': 3, 'home_run': 4})
    .fillna(0)
    .astype('int64')
)

In [None]:
# Cache the downloaded events on disk so the download does not have to be repeated.
df_statcast.to_parquet("../data/statcast_2000_2025.parquet", index=False)

In [None]:


# Static lookup from the team code stored in 'home_team' to a ballpark name.
# The same name is used for every season in the data; team codes missing from
# this dict map to NaN.
# NOTE(review): presumably these names must match the 'venue' values in
# ../data/park_factors.csv — confirm against that file.
team_code_to_venue = {
    "ARI": "Chase Field",
    "ATL": "Truist Park",
    "BAL": "Oriole Park at Camden Yards",
    "BOS": "Fenway Park",
    "CHC": "Wrigley Field",
    "CIN": "Great American Ball Park",
    "CLE": "Progressive Field",
    "COL": "Coors Field",
    "CWS": "Rate Field",
    "DET": "Comerica Park",
    "HOU": "Daikin Park",
    "KC": "Kauffman Stadium",
    "LAA": "Angel Stadium",
    "LAD": "Dodger Stadium",
    "MIA": "loanDepot park",
    "MIL": "American Family Field",
    "MIN": "Target Field",
    "NYM": "Citi Field",
    "NYY": "Yankee Stadium",
    "OAK": "Oakland Coliseum",  # Not in park factors but okay
    "PHI": "Citizens Bank Park",
    "PIT": "PNC Park",
    "SD": "Petco Park",
    "SEA": "T-Mobile Park",
    "SF": "Oracle Park",
    "STL": "Busch Stadium",
    "TB": "Tropicana Field",    # Not in park factors but okay
    "TEX": "Globe Life Field",
    "TOR": "Rogers Centre",
    "WSH": "Nationals Park"
}

# Add venue column to df_statcast for merging
df_statcast['venue'] = df_statcast['home_team'].map(team_code_to_venue)



In [None]:
# Save a second copy of the events that includes the 'venue' column.
df_statcast.to_parquet("../data/statcast_2000_2025_venues.parquet", index=False)

In [None]:
# Checkpoint of a partially built df_games.
# NOTE(review): in notebook order df_games is only assigned in later cells, so
# this cell needs df_games to exist in the kernel already.
df_games.to_parquet("df_games_partial.parquet", index=False)


In [None]:
# Reload the checkpoint written to df_games_partial.parquet and list its columns.
df_games = pd.read_parquet("df_games_partial.parquet")
print(list(df_games.columns))

In [None]:
# A merge that carried 'is_home' on both sides leaves is_home_x / is_home_y.
# Collapse them back into a single 'is_home'. Guarded so the cell can be re-run,
# or run on a frame that never had the suffixed columns, without a KeyError.
if 'is_home_x' in df_games.columns:
    df_games['is_home'] = df_games['is_home_x']
df_games = df_games.drop(columns=['is_home_x', 'is_home_y'], errors='ignore')
print(list(df_games.columns))


In [None]:
# The batter's own team is the home team when is_home is True, otherwise the away team.
df_games["team_name"] = np.where(df_games["is_home"], df_games["home_team"], df_games["away_team"])
df_games["opponent_team"] = np.where(df_games["is_home"], df_games["away_team"], df_games["home_team"])

park_factors = pd.read_csv("../data/park_factors.csv")
# 'home_team' holds team codes (e.g. "ARI"), so translate it to a ballpark name
# with team_code_to_venue before joining; joining the raw code against 'venue'
# leaves every park-factor column NaN.
# NOTE(review): assumes park_factors['venue'] holds ballpark names with one row
# per venue — confirm against the CSV.
df_games["venue"] = df_games["home_team"].map(team_code_to_venue)
df_games = df_games.merge(park_factors, on="venue", how="left")

In [None]:
df_games = build_rolling_feature_dataset(df_statcast)
df_games = add_pitcher_context_features(df_games, df_statcast)  # Handedness, venue, etc.
pitcher_df = build_pitcher_rolling_features(df_statcast)        # Pitcher rolling stats
# Batter and pitcher tables share some column names (e.g. '14d_bb_rate'); pandas
# suffixes those with _x (batter) and _y (pitcher) in the merged frame.
df_games = df_games.merge(pitcher_df, on=["game_date", "pitcher"], how="left")  # Merge in

# The batter's own team is the home team when is_home is True, otherwise the away team.
df_games["team_name"] = np.where(df_games["is_home"], df_games["home_team"], df_games["away_team"])
df_games["opponent_team"] = np.where(df_games["is_home"], df_games["away_team"], df_games["home_team"])

park_factors = pd.read_csv("../data/park_factors.csv")
# 'home_team' holds team codes (e.g. "ARI"), so translate it to a ballpark name
# with team_code_to_venue before joining; joining the raw code against 'venue'
# leaves every park-factor column NaN.
# NOTE(review): assumes park_factors['venue'] holds ballpark names with one row
# per venue — confirm against the CSV.
df_games["venue"] = df_games["home_team"].map(team_code_to_venue)
df_games = df_games.merge(park_factors, on="venue", how="left")

In [None]:
# Persist the finished feature table; parquet is what later cells read back.
df_games.to_parquet("../data/rolling_features_2000_2025.parquet", index=False)

# Same table written as CSV.
df_games.to_csv("../data/rolling_features_2000_2025.csv", index=False)

# Run every time

In [None]:
# Start from the cached feature table instead of rebuilding it.
df_games = pd.read_parquet("../data/rolling_features_2000_2025.parquet")

In [4]:
# Derive the numeric feature list here so this cell does not depend on a later
# cell having been run first (numeric_features is otherwise defined further down).
exclude_cols = ['player_name', 'game_date', 'total_bases']
features = df_games.columns.difference(exclude_cols).tolist()
numeric_features = df_games[features].select_dtypes(include=[np.number]).columns.tolist()

# Rolling-window columns are recognised by their window prefix.
window_prefixes = ['1d_', '3d_', '7d_', '14d_', '30d_']
rolling_features = [col for col in numeric_features if any(prefix in col for prefix in window_prefixes)]

# Fill each rolling column's gaps with that column's own median in one pass.
if rolling_features:
    df_games[rolling_features] = df_games[rolling_features].fillna(df_games[rolling_features].median())


In [5]:
# Missing park factors are replaced by 1.0, used here as the neutral value.
if 'ParkFactor' in df_games.columns:
    df_games['ParkFactor'] = df_games['ParkFactor'].fillna(1.0)  # neutral value

# Contact-quality columns fall back to their own column mean. If a column is
# entirely NaN its mean is NaN too, and this fill changes nothing.
for col in ('wOBACon', 'xBACON', 'xwOBACon'):
    if col not in df_games.columns:
        continue
    league_avg = df_games[col].mean()
    df_games[col] = df_games[col].fillna(league_avg)


In [6]:
# Per-column count of remaining missing values.
# NOTE(review): numeric_features is assigned in a later cell of this notebook,
# so on a fresh kernel this cell needs that cell to have run first.
print(df_games[numeric_features].isnull().sum())


14d_avg_exit_velo        0
14d_barrel_rate          0
14d_barrels              0
14d_batted_balls         0
14d_bb_rate_x            0
                     ...  
ParkFactor               0
pitcher                  0
wOBACon              14732
xBACON               14732
xwOBACon             14732
Length: 146, dtype: int64


In [7]:
# Second pass over the contact-quality columns: use the column mean, or 0.0 when
# the mean itself is NaN (i.e. the column holds no values at all).
# NOTE(review): an all-NaN column suggests the park-factor merge matched nothing;
# a 0.0 fill hides that — confirm this is intended.
for col in ('wOBACon', 'xBACON', 'xwOBACon', 'BACON'):
    if col not in df_games.columns:
        continue
    league_avg = df_games[col].mean()
    league_avg = 0.0 if pd.isna(league_avg) else league_avg  # or a neutral value
    df_games[col] = df_games[col].fillna(league_avg)


In [8]:
# Re-check the per-column missing-value counts after the fallback fill.
# NOTE(review): numeric_features is assigned in a later cell of this notebook,
# so on a fresh kernel this cell needs that cell to have run first.
print(df_games[numeric_features].isnull().sum())


14d_avg_exit_velo    0
14d_barrel_rate      0
14d_barrels          0
14d_batted_balls     0
14d_bb_rate_x        0
                    ..
ParkFactor           0
pitcher              0
wOBACon              0
xBACON               0
xwOBACon             0
Length: 146, dtype: int64


In [9]:
# Rebinds the TARGET_VAR name imported from config at the top of the notebook.
TARGET_VAR = "total_bases"
# Directory the grid-search result files are read from when results are combined.
search_dir = f"{TARGET_VAR}_gridsearch_chunks"


In [16]:
# Every numeric column except the identifiers and the target is used as a feature.
# NOTE(review): the null-count output earlier in this notebook lists 'pitcher'
# among these columns — confirm the pitcher id is wanted as a numeric input.
exclude_cols = ['player_name', 'game_date', 'total_bases']
features = df_games.columns.difference(exclude_cols).tolist()
numeric_features = df_games[features].select_dtypes(include=[np.number]).columns.tolist()


X = df_games[numeric_features].values
y = df_games['total_bases'].values

# Split first, then fit the scaler on the training rows only. Fitting it on all
# rows before splitting lets test-set means and variances leak into training.
# NOTE(review): the split is random rather than by date, so games from the same
# period land on both sides — confirm this is acceptable.
X_train_raw, X_test_raw, y_train_np, y_test_np = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train_np = scaler.fit_transform(X_train_raw)
X_test_np = scaler.transform(X_test_raw)

# Tensors for use in training
X_train = torch.tensor(X_train_np, dtype=torch.float32)
y_train = torch.tensor(y_train_np, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test_np, dtype=torch.float32)
y_test = torch.tensor(y_test_np, dtype=torch.float32).view(-1, 1)


In [17]:
def train_mlp_model_with_valsplit(config):
    """
    Train one PlayerMLP for a single hyperparameter configuration.

    Uses the notebook-level X_train / y_train tensors, carving a validation split
    out of them for early stopping (patience of 5 epochs), and scores the
    restored best checkpoint on the notebook-level X_test / y_test.

    Parameters:
    - config: tuple of (hidden_dims, lr, epochs, batch_size, dropout, val_split,
      activation, scheduler_type); scheduler_type is 'cosine', 'plateau', or
      anything else for no scheduler

    Returns:
    - (model, (config, mae, rmse)): the trained model moved to CPU, and the
      configuration with its test-set MAE and RMSE
    """
    hidden_dims, lr, epochs, batch_size, dropout, val_split, activation, scheduler_type = config

    model = PlayerMLP(
        input_dim=X_train.shape[1],
        hidden_dims=hidden_dims,
        dropout=dropout,
        activation=activation
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
    if scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=2, eta_min=lr * 0.1
        )
    elif scheduler_type == 'plateau':
        # The `verbose` keyword is deprecated in PyTorch; silent is the default.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=5
        )
    else:
        scheduler = None

    loss_fn = nn.MSELoss()

    X_subtrain, X_val, y_subtrain, y_val = train_test_split(
        X_train.cpu(), y_train.cpu(), test_size=val_split, random_state=42
    )
    X_subtrain, X_val = X_subtrain.to(device), X_val.to(device)
    y_subtrain, y_val = y_subtrain.to(device), y_val.to(device)

    train_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(X_subtrain, y_subtrain),
        batch_size=batch_size, shuffle=True
    )

    best_loss = float("inf")
    patience = 5
    epochs_no_improve = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            optimizer.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_pred = model(X_val)
            val_loss = loss_fn(val_pred, y_val).item()
            if scheduler_type == 'plateau':
                scheduler.step(val_loss)  # for ReduceLROnPlateau, pass val_loss
            elif scheduler_type == 'cosine':
                scheduler.step(epoch)     # for CosineAnnealing, pass epoch

        if val_loss < best_loss:
            best_loss = val_loss
            epochs_no_improve = 0
            # state_dict() hands back the live parameter tensors; clone them so
            # later optimizer steps cannot overwrite the saved best checkpoint.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # NOTE(review): these test-set metrics are what the grid search ranks
    # configurations by, so the reported scores are optimistic — confirm a
    # separate hold-out is not needed.
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test.to(device)).cpu().numpy().flatten()
        y_true = y_test.cpu().numpy().flatten()
        mae = mean_absolute_error(y_true, y_pred)
        rmse = mean_squared_error(y_true, y_pred) ** 0.5

    model = model.to('cpu')
    torch.cuda.empty_cache()

    return model, (config, mae, rmse)


In [18]:
import pickle
from joblib import Parallel, delayed
import random
import itertools
import glob

In [19]:
# Hyperparameter grid
# NOTE(review): [128, 64, 32, 16, 8] is listed twice below, so every
# configuration using it is generated (and trained) twice.
hidden_dim_options = [
    [64, 32], [128, 64], [256, 128], [512, 256],
    [64, 32, 16], [128, 64, 32], [256, 128, 64], [512, 256, 128],
    [64, 32, 16, 8], [128, 64, 32, 16], [256, 128, 64, 32], [128, 64, 32, 16, 8],
    [64, 32, 16, 8, 4], [128, 64, 32, 16, 8], [256, 128, 64, 32, 16],
    [512, 256, 128, 64], [1024, 512, 256],
    [64, 32, 16, 8, 4, 2], [128, 64, 32, 16, 8, 4], [256, 128, 64, 32, 16, 8],
    [512, 256, 128, 64, 32], [1024, 512, 256, 128],
]
lr_options = [0.01, 0.005, 0.001, 0.0005, 0.0001]
epoch_options = [50, 100, 150]
batch_size_options = [32, 64, 128]
dropout_options = [0.2, 0.3, 0.4, 0.5]
val_split_options = [0.1, 0.2, 0.3, 0.4, 0.5]
activation_options = ['relu', 'gelu', 'tanh', 'leaky_relu', 'sigmoid', 'swish']
scheduler_options = ['plateau', 'cosine']
random.seed(42)

# Generate the full grid as the Cartesian product of all options.
# NOTE(review): the grid is seeded above but never shuffled, so configurations
# stay in product order and each chunk covers a single hidden_dims entry. Adding
# a shuffle would change which configs fall in which chunk — decide before resuming.
grid_configs = list(itertools.product(
    hidden_dim_options, lr_options, epoch_options,
    batch_size_options, dropout_options, val_split_options,
    activation_options, scheduler_options
))
# Chunk setup
chunk_size = 500
chunks = [grid_configs[i:i + chunk_size] for i in range(0, len(grid_configs), chunk_size)]
print(f"Split {len(grid_configs)} total configs into {len(chunks)} chunks (size={chunk_size})")


Split 237600 total configs into 476 chunks (size=500)


In [20]:
# Total count of missing values across the numeric feature columns.
print("Checking for any NaNs in numeric features:")
print(df_games[numeric_features].isnull().sum().sum())
# Replaces every remaining NaN in df_games (all columns) with 0. Arrays already
# built from df_games, such as X, are not affected by this reassignment.
df_games = df_games.fillna(0)


Checking for any NaNs in numeric features:
0


In [None]:
# Zero-based index of the first chunk to run; chunks before it are taken as
# already done. Set to 0 to run the whole grid.
START_CHUNK = 22

chunk_dir = f"{TARGET_VAR}_gridsearch_chunks"
os.makedirs(chunk_dir, exist_ok=True)

# Slice the chunk list so the numbering and the data stay in step. Adding an
# offset to the index while iterating from the first chunk would rerun
# chunks 1, 2, ... and save them under the names of chunks 23, 24, ...
# NOTE(review): chunk files numbered above START_CHUNK that were written by such
# a run hold results for the wrong chunks and should be regenerated.
for chunk_id, chunk in enumerate(chunks[START_CHUNK:], start=START_CHUNK):
    print(f"\nRunning Grid Search Chunk {chunk_id+1}/{len(chunks)} with {len(chunk)} configs...")

    results = Parallel(n_jobs=-1)(
        delayed(train_mlp_model_with_valsplit)(cfg)
        for cfg in chunk
    )

    # Each result is (model, (config, mae, rmse)); rank by rmse and keep the best 5.
    trained = [res for res in results if res[0] is not None]
    top5 = sorted(trained, key=lambda res: res[1][2])[:5]
    top5_cpu = [(m.to('cpu'), metrics) for m, metrics in top5]

    chunk_path = f"{chunk_dir}/top5_chunk_{chunk_id+1}.pkl"
    with open(chunk_path, "wb") as f:
        pickle.dump(top5_cpu, f)

    print(f"Saved top 5 from chunk {chunk_id+1} to {chunk_path}")



Running Grid Search Chunk 23/476 with 500 configs...
Saved top 5 from chunk 23 to total_bases_gridsearch_chunks/top5_chunk_23.pkl

Running Grid Search Chunk 24/476 with 500 configs...
Saved top 5 from chunk 24 to total_bases_gridsearch_chunks/top5_chunk_24.pkl

Running Grid Search Chunk 25/476 with 500 configs...
Saved top 5 from chunk 25 to total_bases_gridsearch_chunks/top5_chunk_25.pkl

Running Grid Search Chunk 26/476 with 500 configs...
Saved top 5 from chunk 26 to total_bases_gridsearch_chunks/top5_chunk_26.pkl

Running Grid Search Chunk 27/476 with 500 configs...
Saved top 5 from chunk 27 to total_bases_gridsearch_chunks/top5_chunk_27.pkl

Running Grid Search Chunk 28/476 with 500 configs...
Saved top 5 from chunk 28 to total_bases_gridsearch_chunks/top5_chunk_28.pkl

Running Grid Search Chunk 29/476 with 500 configs...
Saved top 5 from chunk 29 to total_bases_gridsearch_chunks/top5_chunk_29.pkl

Running Grid Search Chunk 30/476 with 500 configs...
Saved top 5 from chunk 30 to 

In [None]:
# Gather the saved top-5 lists from every chunk file.
# NOTE: pickle.load executes code embedded in the file; only load chunk files
# produced by this notebook.
combined_top = []
chunk_files = sorted(glob.glob(os.path.join(search_dir, "top5_chunk_*.pkl")))
for path in chunk_files:
    try:
        with open(path, "rb") as f:
            combined_top.extend(pickle.load(f))
    except Exception as e1:
        print(f"Failed to load {path}: {e1}")

# Sort and print final top 5
ranked = list(combined_top)
ranked.sort(key=lambda entry: entry[1][2])  # entry = (model, (config, mae, rmse))
final_top5 = ranked[:5]

print("\n✅ Final Top 5 Grid Search Results:")
for rank, (model, (cfg, mae, rmse)) in enumerate(final_top5, 1):
    print(f"Top {rank}: {cfg} → MAE: {mae:.4f}, RMSE: {rmse:.4f}")


In [None]:
import pandas as pd

# One summary row per finalist: its rank, the eight config fields in the order
# they appear in the config tuple, then the two metrics.
summary_columns = (
    'rank', 'hidden_dims', 'lr', 'epochs', 'batch_size',
    'dropout', 'val_split', 'activation', 'scheduler', 'mae', 'rmse',
)
summary_df = pd.DataFrame([
    dict(zip(summary_columns, (position, *cfg[:8], mae, rmse)))
    for position, (_, (cfg, mae, rmse)) in enumerate(final_top5, start=1)
])
summary_path = f"{TARGET_VAR}_top5_summary.csv"
summary_df.to_csv(summary_path, index=False)
print(f"\nSaved summary to {TARGET_VAR}_top5_summary.csv")


In [None]:
# final_top5 is sorted best-first, so the first entry's model is the winner.
best_model = final_top5[0][0]
# Only the weights are saved; the winning architecture (hidden_dims, activation,
# dropout) is not stored alongside them and must be known when loading.
torch.save(best_model.state_dict(), f"{TARGET_VAR}_best_model.pt")
print("Best model saved.")


## load full dataset

In [None]:
df_statcast = pd.read_parquet("../data/statcast_2000_2025.parquet")
df_statcast['game_date'] = pd.to_datetime(df_statcast['game_date'])
# Vectorized form of applying compute_total_bases to every row: hits map to
# 1-4 bases and every other event to 0, without a Python call per row.
df_statcast['total_bases'] = (
    df_statcast['events']
    .astype(object)
    .map({'single': 1, 'double': 2, 'triple': 3, 'home_run': 4})
    .fillna(0)
    .astype('int64')
)

In [None]:
# Static lookup from the team code stored in 'home_team' to a ballpark name
# (repeats the mapping defined in the data-preparation section above).
# Team codes missing from this dict map to NaN.
team_code_to_venue = {
    "ARI": "Chase Field",
    "ATL": "Truist Park",
    "BAL": "Oriole Park at Camden Yards",
    "BOS": "Fenway Park",
    "CHC": "Wrigley Field",
    "CIN": "Great American Ball Park",
    "CLE": "Progressive Field",
    "COL": "Coors Field",
    "CWS": "Rate Field",
    "DET": "Comerica Park",
    "HOU": "Daikin Park",
    "KC": "Kauffman Stadium",
    "LAA": "Angel Stadium",
    "LAD": "Dodger Stadium",
    "MIA": "loanDepot park",
    "MIL": "American Family Field",
    "MIN": "Target Field",
    "NYM": "Citi Field",
    "NYY": "Yankee Stadium",
    "OAK": "Oakland Coliseum",  # Not in park factors but okay
    "PHI": "Citizens Bank Park",
    "PIT": "PNC Park",
    "SD": "Petco Park",
    "SEA": "T-Mobile Park",
    "SF": "Oracle Park",
    "STL": "Busch Stadium",
    "TB": "Tropicana Field",    # Not in park factors but okay
    "TEX": "Globe Life Field",
    "TOR": "Rogers Centre",
    "WSH": "Nationals Park"
}

# Add venue column to df_statcast for merging
df_statcast['venue'] = df_statcast['home_team'].map(team_code_to_venue)


## generate rolling features up to today

In [None]:
from datetime import datetime
today = pd.to_datetime(datetime.today().date())

# Only use statcast data *before today* to avoid leakage
df_statcast = pd.read_parquet("../data/statcast_2000_2025.parquet")
df_statcast['game_date'] = pd.to_datetime(df_statcast['game_date'])
# .copy() makes the filtered frame independent, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
df_statcast = df_statcast[df_statcast['game_date'] < today].copy()
# Vectorized form of applying compute_total_bases to every row.
df_statcast['total_bases'] = (
    df_statcast['events']
    .astype(object)
    .map({'single': 1, 'double': 2, 'triple': 3, 'home_run': 4})
    .fillna(0)
    .astype('int64')
)

# Use your existing function to build rolling features
df_rolling = build_rolling_feature_dataset(df_statcast)
# "Today" here means the most recent game date present in df_rolling.
df_today = df_rolling[df_rolling['game_date'] == df_rolling['game_date'].max()].copy()


## load model and assets

In [None]:
import torch
import joblib

# Load saved model, scaler, and feature list
# The scaler and feature list come first because the network's input width has
# to equal the number of feature columns it will be fed at prediction time.
# NOTE: joblib.load unpickles the file; only load artefacts produced by this project.
scaler = joblib.load("ensemble_engineered/scaler_total_bases.joblib")
features = joblib.load("ensemble_engineered/features_total_bases.joblib")

# NOTE(review): hidden_dims / dropout / activation are hard-coded here and must
# match the configuration the saved weights were trained with — confirm against
# the grid-search summary.
model = PlayerMLP(input_dim=len(features), hidden_dims=[128, 64], dropout=0.3, activation='relu')
# map_location keeps loading independent of the device the weights were saved from.
model.load_state_dict(torch.load("total_bases_best_model.pt", map_location="cpu"))
model.eval()


## predict with saved model

In [None]:
# Feature matrix for the latest game date: missing values become 0, then the
# saved scaler is applied before converting to a float32 tensor.
X_today = df_today[features].fillna(0).values
X_today_scaled = scaler.transform(X_today)
X_today_tensor = torch.tensor(X_today_scaled, dtype=torch.float32)

# Inference only, so gradients are switched off.
with torch.no_grad():
    today_output = model(X_today_tensor)
y_pred_today = today_output.cpu().numpy().flatten()

# Attach the predictions and list the highest predicted totals first.
df_today = df_today.assign(Pred_TB=y_pred_today).sort_values(by="Pred_TB", ascending=False)


## evaluate thresholds historically

In [None]:
# Score the whole history with the same `features` list that was loaded with the
# scaler. Rebuilding the list from df_rolling's columns would pull in string
# columns (home_team, away_team, inning_topbot) and a column count the scaler was
# not fitted on.
# NOTE(review): presumably the model was trained on much of this same history, so
# these are in-sample predictions — confirm before trusting the threshold results.
X = scaler.transform(df_rolling[features].fillna(0).values)
y = df_rolling['total_bases'].values

model.eval()
with torch.no_grad():
    y_pred = model(torch.tensor(X, dtype=torch.float32)).cpu().numpy().flatten()

df_rolling["Pred_TB"] = y_pred


In [None]:
# Sweep prediction cut-offs: a bet is placed when Pred_TB exceeds the cut-off and
# wins when the actual total bases exceed 1.5. Each bet stakes 10 and a win
# returns 10 * 0.91.
thresholds = np.arange(1.5, 4.1, 0.2)
results = []

for t in thresholds:
    df_rolling["Bet"] = df_rolling["Pred_TB"] > t
    df_rolling["Won"] = df_rolling["total_bases"] > 1.5
    subset = df_rolling.loc[df_rolling["Bet"]]
    n_bets = len(subset)

    # Nothing to score if no prediction clears this cut-off.
    if n_bets == 0:
        continue

    win_rate = subset["Won"].mean()
    profit = np.where(subset["Won"], 10 * 0.91, -10)
    roi = profit.sum() / (10 * n_bets)

    results.append(dict(threshold=t, win_rate=win_rate, roi=roi, bets=n_bets))

# Best return first.
df_thresholds = pd.DataFrame(results).sort_values(by="roi", ascending=False)
print(df_thresholds.head(10))


In [None]:
# df_thresholds is sorted by ROI descending, so row 0 holds the best cut-off.
best_threshold = df_thresholds.iloc[0]["threshold"]
# Flag today's players whose prediction clears that cut-off.
df_today["Bet"] = df_today["Pred_TB"] > best_threshold


## add betting threshold today

In [None]:
# Recompute the bet flag from best_threshold, then export today's predictions.
df_today["Bet"] = df_today["Pred_TB"] > best_threshold
df_today[["player_name", "game_date", "Pred_TB", "Bet"]].to_csv("today_predictions.csv", index=False)

# Preview the first ten rows of the export.
print(df_today[["player_name", "Pred_TB", "Bet"]].head(10))


# Before

In [None]:
# NOTE(review): this section expects columns ('plate_appearances', 'woba_value',
# 'event_count', ...) that the rolling-feature builders above do not create —
# presumably it targets an earlier per-game table; confirm before running.

# Drop games with very low plate appearances (e.g., pinch hits)
df_games = df_games[df_games['plate_appearances'] >= 2].copy()

# Drop nulls if any
df_games = df_games.dropna()

# Final feature list
features = ['batted_balls', 'launch_speed', 'launch_angle', 'woba_value', 'event_count', 'plate_appearances']
X = df_games[features].values
y = df_games['total_bases'].values


In [None]:
# 80/20 random split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The scaler learns its statistics from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Convert the scaled arrays to float32 tensors; targets become column vectors.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)

model = PlayerMLP(X_train_tensor.shape[1], [64, 32], dropout=0.2, activation='relu').to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

# Full-batch training: every epoch is a single optimizer step on all training
# rows, so the device copies are made once up front.
train_inputs = X_train_tensor.to(device)
train_targets = y_train_tensor.to(device)
for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    pred = model(train_inputs)
    loss = loss_fn(pred, train_targets)
    loss.backward()
    optimizer.step()

# Score the held-out rows.
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor.to(device)).cpu().numpy().flatten()
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f"Daily MLP → MAE: {mae:.4f}, RMSE: {rmse:.4f}")


In [None]:
# Predict on test set and compute binary classification
# Turn the regression output into an over/under call at 1.5 total bases and
# compare it with the same call made on the true values.
preds = y_pred
binary_true = (y_test > 1.5).astype(int)
binary_pred = (preds > 1.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Over 1.5 TB Accuracy:", accuracy_score(binary_true, binary_pred))
print("Precision:", precision_score(binary_true, binary_pred))
print("Recall:", recall_score(binary_true, binary_pred))


In [None]:
# Split the player names with the same test_size and random_state as the feature
# split so X_test_names lines up row-for-row with X_test / y_test. This only
# holds while df_games has the same rows as when X and y were built.
player_names = df_games['player_name'].values
X_train_names, X_test_names = train_test_split(player_names, test_size=0.2, random_state=42)


In [None]:
# Test-set predictions next to the true values, one row per test sample.
df_preds = pd.DataFrame({
    "Player": X_test_names,
    "True_TB": y_test,
    "Pred_TB": y_pred
})
# Flag rows predicted above 1.5 total bases, highest predictions first.
df_preds["Over_1.5"] = df_preds["Pred_TB"] > 1.5
df_preds = df_preds.sort_values(by="Pred_TB", ascending=False)
print(df_preds.head(10))


In [None]:
# Bet only when the prediction exceeds 2.0; the bet wins if the true total
# bases exceed 1.5.
df_preds["Bet"] = df_preds["Pred_TB"] > 2.0
df_preds["Won"] = df_preds["True_TB"] > 1.5

# Share of placed bets that won (NaN if no bet was placed).
bet_results = df_preds[df_preds["Bet"]]
win_rate = bet_results["Won"].mean()
print(f"Win rate on high-confidence bets: {win_rate:.2%}")


In [None]:
# Sweep bet thresholds on the test predictions. A bet wins when the true total
# bases exceed 1.5; each bet stakes 10 and a win returns 10 * 0.91.
for threshold in [1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0, 3.2, 3.4, 3.6, 3.8, 4.0, 5.0]:
    df_preds["Bet"] = df_preds["Pred_TB"] > threshold
    df_preds["Won"] = df_preds["True_TB"] > 1.5
    # Independent copy so the Profit column can be added without pandas'
    # SettingWithCopyWarning.
    bet_results = df_preds[df_preds["Bet"]].copy()
    if bet_results.empty:
        # No prediction clears this threshold; win rate and ROI are undefined.
        print(f"Threshold {threshold:.1f} → no bets placed")
        continue
    win_rate = bet_results["Won"].mean()
    print(f"Threshold {threshold:.1f} → Win Rate: {win_rate:.2%}")
    bet_results["Profit"] = np.where(bet_results["Won"], 10 * 0.91, -10)
    roi = bet_results["Profit"].sum() / (10 * len(bet_results))
    print(f"Simulated ROI: {roi:.2%}")
