In [3]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm_

from pybaseball import statcast
from datetime import datetime, timedelta

def load_statcast_range(start_date, end_date, step_days=7):
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end   = datetime.strptime(end_date, "%Y-%m-%d")

    all_dfs = []
    current = start

    while current <= end:
        chunk_start = current.strftime("%Y-%m-%d")
        chunk_end = min(current + timedelta(days=step_days), end).strftime("%Y-%m-%d")
        print(f"Downloading: {chunk_start} → {chunk_end}")

        try:
            chunk = statcast(start_dt=chunk_start, end_dt=chunk_end)
            if chunk is not None and not chunk.empty:
                all_dfs.append(chunk)
        except Exception as e:
            print(f"Failed on {chunk_start} → {chunk_end}: {e}")

        current += timedelta(days=step_days + 1)

    if len(all_dfs) == 0:
        raise ValueError("No data downloaded — check date ranges.")
    return pd.concat(all_dfs, ignore_index=True)

df = load_statcast_range("2023-03-28", "2025-09-28", step_days=30)

Downloading: 2023-03-28 → 2023-04-27
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:21<00:00,  1.42it/s]


Downloading: 2023-04-28 → 2023-05-28
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:37<00:00,  1.22s/it]


Downloading: 2023-05-29 → 2023-06-28
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:30<00:00,  1.01it/s]


Downloading: 2023-06-29 → 2023-07-29
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:28<00:00,  1.08it/s]


Downloading: 2023-07-30 → 2023-08-29
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:31<00:00,  1.01s/it]


Downloading: 2023-08-30 → 2023-09-29
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:32<00:00,  1.06s/it]


Downloading: 2023-09-30 → 2023-10-30
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:07<00:00,  3.97it/s]


Downloading: 2023-10-31 → 2023-11-30
This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.47it/s]

Downloading: 2023-12-01 → 2023-12-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2024-01-01 → 2024-01-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2024-02-01 → 2024-03-02
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2024-03-03 → 2024-04-02
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:12<00:00,  1.46it/s]


Downloading: 2024-04-03 → 2024-05-03
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:33<00:00,  1.08s/it]


Downloading: 2024-05-04 → 2024-06-03
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:28<00:00,  1.11it/s]


Downloading: 2024-06-04 → 2024-07-04
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:30<00:00,  1.02it/s]


Downloading: 2024-07-05 → 2024-08-04
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:29<00:00,  1.04it/s]


Downloading: 2024-08-05 → 2024-09-04
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:33<00:00,  1.07s/it]


Downloading: 2024-09-05 → 2024-10-05
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:27<00:00,  1.13it/s]


Downloading: 2024-10-06 → 2024-11-05
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:07<00:00,  4.17it/s]

Downloading: 2024-11-06 → 2024-12-06
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  4.18it/s]

Downloading: 2024-12-07 → 2025-01-06
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2025-01-07 → 2025-02-06
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2025-02-07 → 2025-03-09
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Downloading: 2025-03-10 → 2025-04-09
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████████████████████████████████████████████████████████████████████████████| 26/26 [00:21<00:00,  1.19it/s]


Downloading: 2025-04-10 → 2025-05-10
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:26<00:00,  1.18it/s]


Downloading: 2025-05-11 → 2025-06-10
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:29<00:00,  1.05it/s]


Downloading: 2025-06-11 → 2025-07-11
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:25<00:00,  1.20it/s]


Downloading: 2025-07-12 → 2025-08-11
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:20<00:00,  1.55it/s]


Downloading: 2025-08-12 → 2025-09-11
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:33<00:00,  1.07s/it]


Downloading: 2025-09-12 → 2025-09-28
This is a large query, it may take a moment to complete


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:18<00:00,  1.06s/it]


In [4]:
df = df[df["game_type"] == "R"]
df = df[df['description'] == 'hit_into_play']
df = df.dropna(subset=["launch_angle", "launch_speed"])

df = df.dropna(subset=["bat_speed", "swing_length", "attack_angle"])

numeric_features = [
    "release_speed", "release_spin_rate", "pfx_x", "pfx_z",
    "plate_x", "plate_z", "vx0", "vy0", "vz0",
    "ax", "ay", "az",
    "bat_speed", "swing_length", "attack_angle", "attack_direction",
    "estimated_slg_using_speedangle", "estimated_ba_using_speedangle",
    "estimated_woba_using_speedangle", "woba_value", "babip_value", "iso_value",
    "bat_score", "bat_score_diff", "bat_win_exp", "age_bat", 
    "batter_days_since_prev_game", "n_priorpa_thisgame_player_at_bat"
]

categorical_features = ["pitch_type", "stand", "p_throws", "home_team", "away_team"]

numeric_features = [f for f in numeric_features if f in df.columns]
categorical_features = [f for f in categorical_features if f in df.columns]

df["bat_speed_sq"] = df["bat_speed"] ** 2
df["plane_match"] = df["attack_angle"] * df["pfx_z"]
df["vertical_offset"] = df["plate_z"] - (df["attack_angle"] / 8)
df["timing_offset"] = df["vx0"] / df["release_speed"]

numeric_features += ["bat_speed_sq", "plane_match", "vertical_offset", "timing_offset"]

In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

TARGET_LA = "launch_angle"
TARGET_EV = "launch_speed"

X_train = train_df[numeric_features + categorical_features].copy()
X_test  = test_df[numeric_features + categorical_features].copy()

y_train_la = train_df[TARGET_LA]
y_train_ev = train_df[TARGET_EV]
y_test_la = test_df[TARGET_LA]
y_test_ev = test_df[TARGET_EV]

X_train = pd.get_dummies(X_train, columns=categorical_features)
X_test = pd.get_dummies(X_test, columns=categorical_features)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

imputer = SimpleImputer(strategy="median")
X_train[numeric_features] = imputer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = imputer.transform(X_test[numeric_features])

scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

In [6]:
def to_tensor(features, target):
    X = torch.tensor(features.values, dtype=torch.float32)
    y = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)
    return X, y

X_train_tensor, y_train_la_tensor = to_tensor(X_train, y_train_la)
_, y_train_ev_tensor = to_tensor(X_train, y_train_ev)
X_test_tensor, y_test_la_tensor = to_tensor(X_test, y_test_la)
_, y_test_ev_tensor = to_tensor(X_test, y_test_ev)

In [7]:
class MLPRegression(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.model(x)

def train_model(model, X, y, lr=1e-5, epochs=50, batch_size=1024):
    loader = DataLoader(TensorDataset(X, y), batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    model.train()

    for epoch in range(epochs):
        running_loss = 0
        for xb, yb in loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}: {running_loss/len(loader):.3f}")
    return model

input_dim = X_train_tensor.shape[1]
model_la = MLPRegression(input_dim)
model_ev = MLPRegression(input_dim)

model_la = train_model(model_la, X_train_tensor, y_train_la_tensor)
model_ev = train_model(model_ev, X_train_tensor, y_train_ev_tensor)

Epoch 1/50: 1008.561
Epoch 2/50: 1001.831
Epoch 3/50: 990.783
Epoch 4/50: 973.803
Epoch 5/50: 952.506
Epoch 6/50: 926.908
Epoch 7/50: 897.678
Epoch 8/50: 864.358
Epoch 9/50: 829.403
Epoch 10/50: 794.436
Epoch 11/50: 762.051
Epoch 12/50: 735.532
Epoch 13/50: 715.739
Epoch 14/50: 702.712
Epoch 15/50: 692.636
Epoch 16/50: 684.340
Epoch 17/50: 677.524
Epoch 18/50: 671.182
Epoch 19/50: 664.766
Epoch 20/50: 659.884
Epoch 21/50: 654.759
Epoch 22/50: 649.591
Epoch 23/50: 644.740
Epoch 24/50: 639.176
Epoch 25/50: 634.248
Epoch 26/50: 629.801
Epoch 27/50: 624.820
Epoch 28/50: 619.869
Epoch 29/50: 614.702
Epoch 30/50: 610.346
Epoch 31/50: 605.477
Epoch 32/50: 600.940
Epoch 33/50: 595.873
Epoch 34/50: 590.764
Epoch 35/50: 586.607
Epoch 36/50: 582.099
Epoch 37/50: 577.822
Epoch 38/50: 573.751
Epoch 39/50: 569.111
Epoch 40/50: 564.478
Epoch 41/50: 560.137
Epoch 42/50: 555.794
Epoch 43/50: 551.235
Epoch 44/50: 547.430
Epoch 45/50: 543.067
Epoch 46/50: 539.103
Epoch 47/50: 534.901
Epoch 48/50: 530.266

In [8]:
def predict_with_uncertainty(model, X, n_samples=30):
    model.train()
    preds_list = []
    with torch.no_grad():
        for _ in range(n_samples):
            preds_list.append(model(X).cpu().numpy())
    preds_array = np.array(preds_list)
    mean_preds = preds_array.mean(axis=0).flatten()
    std_preds = preds_array.std(axis=0).flatten()
    return mean_preds, std_preds

pred_la_mean, pred_la_std = predict_with_uncertainty(model_la, X_test_tensor)
pred_ev_mean, pred_ev_std = predict_with_uncertainty(model_ev, X_test_tensor)

rmse_la = np.sqrt(mean_squared_error(y_test_la, pred_la_mean))
rmse_ev = np.sqrt(mean_squared_error(y_test_ev, pred_ev_mean))
print(f"\nLaunch Angle RMSE: {rmse_la:.2f}")
print(f"Exit Velocity RMSE: {rmse_ev:.2f}")


Launch Angle RMSE: 22.57
Exit Velocity RMSE: 12.16


In [9]:
predicted_df = test_df.copy()
predicted_df["launch_angle_pred"] = pred_la_mean
predicted_df["launch_angle_std"] = pred_la_std
predicted_df["launch_speed_pred"] = pred_ev_mean
predicted_df["launch_speed_std"] = pred_ev_std

predicted_df.to_excel("predicted_results_pytorch.xlsx", index=False)
print("\nPredictions saved to 'predicted_results_pytorch.xlsx'")


Predictions saved to 'predicted_results_pytorch.xlsx'


In [10]:
def player_season_summary(batter_id, df, X_df, model_la, model_ev, filter_bip=True):

    player_df = df[df['batter'] == batter_id].copy()
    
    if filter_bip:
        player_df = player_df[player_df['description'] == 'hit_into_play']
    
    if player_df.empty:
        print(f"No data found for batter ID {batter_id} with filter_bip={filter_bip}")
        return None
    
    actual_la = player_df['launch_angle'].mean()
    actual_ev = player_df['launch_speed'].mean()
    
    X_player = X_df.loc[player_df.index].copy().astype(np.float32)
    X_player_tensor = torch.tensor(X_player.values, dtype=torch.float32)
    
    pred_la_mean, pred_la_std = predict_with_uncertainty(model_la, X_player_tensor)
    pred_ev_mean, pred_ev_std = predict_with_uncertainty(model_ev, X_player_tensor)
    
    predicted_la = pred_la_mean.mean()
    predicted_ev = pred_ev_mean.mean()
    
    summary = pd.DataFrame({
        "Metric": ["Launch Angle", "Exit Velocity"],
        "Actual": [actual_la, actual_ev],
        "Predicted": [predicted_la, predicted_ev],
        "Pred Std (avg)": [pred_la_std.mean(), pred_ev_std.mean()]
    })
    
    return summary

In [11]:
freeman_summary = player_season_summary(
    batter_id=518692,  # Freddie Freeman
    df=test_df,
    X_df=X_test,
    model_la=model_la,
    model_ev=model_ev,
    filter_bip=True
)

print(freeman_summary)

          Metric     Actual  Predicted  Pred Std (avg)
0   Launch Angle  15.739130  14.553982        3.870971
1  Exit Velocity  89.502609  89.087524        7.427976
