Data Collection:

- Using the pybaseball package to collect Statcast pitch data for the 2021–2024 seasons (training set) and for 2025 through August 31 (test set)

In [5]:
from datetime import date, timedelta
import os
import time
from typing import List, Tuple
import io
import sys
import contextlib
import warnings

import numpy as np
import pandas as pd
from pybaseball import statcast


# -------------------------------
# Quiet mode
# -------------------------------
# Disable tqdm progress bars used internally
# NOTE(review): this assignment runs after `from pybaseball import statcast`
# above. If tqdm reads TQDM_* environment variables at import time (it appears
# to in recent releases), setting the variable here may have no effect --
# confirm, and consider setting it before the pybaseball import.
os.environ["TQDM_DISABLE"] = "1"
# Silence pybaseball/pandas FutureWarnings etc.
# The first filter ignores every warning category raised from modules whose
# fully-qualified name starts with "pybaseball"; the second ignores
# FutureWarning regardless of which module raises it.
warnings.filterwarnings("ignore", module="pybaseball")
warnings.filterwarnings("ignore", category=FutureWarning)

@contextlib.contextmanager
def _suppress_stdout_stderr():
    """Context manager that discards everything written to stdout/stderr.

    While the ``with`` body runs, ``sys.stdout`` and ``sys.stderr`` both point
    at a handle opened on ``os.devnull`` (tqdm writes its bars to stderr).
    The previous stream objects are put back on exit, including when the body
    raises, and the devnull handle is closed afterwards.
    """
    with open(os.devnull, 'w') as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        yield


# -------------------------------
# Config
# -------------------------------
# Inclusive (start, end) date windows downloaded for the training set.
# NOTE(review): these look like regular-season opening/closing dates for
# 2021-2024 -- confirm against the official schedule.
TRAIN_RANGES: List[Tuple[date, date]] = [
    (date(2021, 4, 1),  date(2021, 10, 3)),
    (date(2022, 4, 7),  date(2022, 10, 5)),
    (date(2023, 3, 30), date(2023, 10, 1)),
    (date(2024, 3, 28), date(2024, 9, 29)),
]
# Inclusive (start, end) date windows downloaded for the held-out test set.
TEST_RANGES: List[Tuple[date, date]] = [
    (date(2025, 3, 27), date(2025, 8, 31)),
]

# Number of calendar days covered by each statcast() request window.
CHUNK_DAYS = 5
# Pause (seconds) taken after every request window in collect_ranges().
REQUEST_SLEEP_SEC = 1.0
# Total number of attempts per window in fetch_statcast_window()
# (the first try counts, so this is attempts rather than extra retries).
MAX_RETRIES = 3
# Pause (seconds) taken after a failed attempt before trying again.
RETRY_SLEEP_SEC = 3.0

# Destination directory and CSV paths written by main().
OUTPUT_DIR = "output"
TRAIN_OUT = os.path.join(OUTPUT_DIR, "train.csv")
TEST_OUT  = os.path.join(OUTPUT_DIR, "test.csv")


# -------------------------------
# Utilities
# -------------------------------
def generate_chunks(start_date: date, end_date: date, span_days: int = CHUNK_DAYS) -> List[Tuple[date, date]]:
    """Split an inclusive date range into consecutive, non-overlapping windows.

    Args:
        start_date: First day to cover (inclusive).
        end_date: Last day to cover (inclusive).
        span_days: Maximum number of calendar days per window; must be >= 1.

    Returns:
        ``(window_start, window_end)`` pairs in chronological order, both ends
        inclusive. The last window is truncated at ``end_date``. The list is
        empty when ``start_date`` is later than ``end_date``.

    Raises:
        ValueError: If ``span_days`` is less than 1. Without this guard the
            cursor would never move forward and the loop would append
            windows forever.
    """
    if span_days < 1:
        raise ValueError(f"span_days must be >= 1, got {span_days}")

    one_day = timedelta(days=1)
    # A window of N days ends N-1 days after it starts (both ends inclusive).
    delta = timedelta(days=span_days - 1)

    chunks = []
    cur = start_date
    while cur <= end_date:
        nxt = min(cur + delta, end_date)
        chunks.append((cur, nxt))
        # The next window starts the day after this one ends.
        cur = nxt + one_day
    return chunks


def fetch_statcast_window(start_dt: str, end_dt: str) -> pd.DataFrame:
    """Download Statcast rows for one date window, retrying on failure.

    The call to ``statcast`` runs with stdout/stderr suppressed. Up to
    ``MAX_RETRIES`` attempts are made in total, pausing ``RETRY_SLEEP_SEC``
    seconds between attempts.

    Args:
        start_dt: Window start, passed through to ``statcast`` (callers in
            this file format it as ``YYYY-MM-DD``).
        end_dt: Window end, same format as ``start_dt``.

    Returns:
        Whatever ``statcast`` returns on the first successful attempt, or an
        empty ``DataFrame`` if every attempt raised. The function stays
        best-effort and never propagates the download error, but a
        ``RuntimeWarning`` naming the window and the last error is emitted so
        that gaps in the collected data are not invisible.
    """
    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with _suppress_stdout_stderr():
                return statcast(start_dt=start_dt, end_dt=end_dt)
        except Exception as e:  # deliberate best-effort boundary around the network call
            last_err = e
            # No point waiting after the final attempt; nothing follows it.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP_SEC)
    # Permanent failure: keep returning an empty frame, but say so.
    warnings.warn(
        f"statcast fetch for {start_dt}..{end_dt} failed after "
        f"{MAX_RETRIES} attempt(s); returning an empty frame. "
        f"Last error: {last_err!r}",
        RuntimeWarning,
        stacklevel=2,
    )
    return pd.DataFrame()


def collect_ranges(ranges: List[Tuple[date, date]]) -> pd.DataFrame:
    """Fetch every chunked window inside ``ranges`` and stack the results.

    Each (start, end) pair is split with ``generate_chunks`` into windows of
    ``CHUNK_DAYS`` days; every window is fetched with
    ``fetch_statcast_window`` and followed by a ``REQUEST_SLEEP_SEC`` pause.
    Windows that come back as ``None`` or empty are skipped.

    Returns:
        One concatenated frame with a fresh integer index, or an empty
        ``DataFrame`` when no window produced rows. If a ``game_date`` column
        is present it is converted with ``pd.to_datetime`` (unparseable
        values become ``NaT``).
    """
    # Lazily flatten ranges -> windows so chunks for a range are only
    # generated once the previous range has been fully fetched.
    windows = (
        window
        for range_start, range_end in ranges
        for window in generate_chunks(range_start, range_end, CHUNK_DAYS)
    )

    collected = []
    for win_start, win_end in windows:
        window_df = fetch_statcast_window(
            win_start.strftime("%Y-%m-%d"),
            win_end.strftime("%Y-%m-%d"),
        )
        has_rows = window_df is not None and not window_df.empty
        if has_rows:
            collected.append(window_df)
        # Pause after every request, successful or not.
        time.sleep(REQUEST_SLEEP_SEC)

    if len(collected) == 0:
        return pd.DataFrame()

    combined = pd.concat(collected, ignore_index=True)
    if "game_date" in combined.columns:
        combined["game_date"] = pd.to_datetime(combined["game_date"], errors="coerce")
    return combined


def compute_vaa_row(row: pd.Series) -> float:
    """Return the approach angle, in degrees, for a single pitch row.

    Reads ``vy0``, ``ay``, ``vz0`` and ``az`` from ``row``. Under constant
    acceleration, the y-velocity at ``y = 17/12`` is derived from the state
    at ``y = 50``, the travel time follows from it, and the angle is
    ``-degrees(arctan(vz_f / vy_f))``.
    NOTE(review): 50 and 17/12 are presumably feet (the Statcast reference
    distance and the 17-inch front edge of home plate) -- confirm.

    Returns:
        The angle as a Python float, or NaN when any input is missing/NaN,
        ``ay`` is zero, the squared final y-velocity is negative, the travel
        time is non-finite or not positive, or the final y-velocity is zero.
    """
    components = [row.get(name) for name in ("vy0", "ay", "vz0", "az")]
    if any(pd.isna(value) for value in components):
        return np.nan

    vy0, ay, vz0, az = components
    if ay == 0:
        return np.nan

    y_start = 50.0
    y_end = 17.0 / 12.0
    # v_f^2 = v_0^2 - 2*a*(y_start - y_end)
    final_vy_squared = vy0**2 - 2.0 * ay * (y_start - y_end)
    if final_vy_squared < 0:
        return np.nan

    # Negative root: the y-velocity is taken as pointing toward decreasing y.
    final_vy = -np.sqrt(final_vy_squared)
    if final_vy == 0:
        return np.nan

    flight_time = (final_vy - vy0) / ay
    if t_is_bad := (not np.isfinite(flight_time) or flight_time <= 0):
        return np.nan

    final_vz = vz0 + az * flight_time
    angle_rad = np.arctan(final_vz / final_vy)
    return float(-np.degrees(angle_rad))


def add_vaa(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``vaa`` column (approach angle in degrees) to ``df`` in place.

    Applies the same formula and validity rules as ``compute_vaa_row``, but
    evaluates whole columns with NumPy instead of calling a Python function
    once per row, which matters for frames with millions of pitches.

    Args:
        df: Pitch-level frame. Reads ``vy0``, ``ay``, ``vz0`` and ``az``.

    Returns:
        The same ``df`` object. An empty frame is returned untouched (no
        column is added). If any of the four input columns is missing, or a
        row has a missing/non-numeric input, ``ay == 0``, a negative squared
        final y-velocity, a non-finite or non-positive travel time, or a zero
        final y-velocity, ``vaa`` is NaN for the affected rows.
    """
    if df.empty:
        return df

    needed = ("vy0", "ay", "vz0", "az")
    if any(col not in df.columns for col in needed):
        # Row-wise evaluation would yield NaN for every row in this case.
        df["vaa"] = np.nan
        return df

    # Coerce to plain float64 arrays so nullable/object columns behave the
    # same as ordinary float columns (missing values become NaN).
    vy0, ay, vz0, az = (
        pd.to_numeric(df[col], errors="coerce").to_numpy(dtype="float64", na_value=np.nan)
        for col in needed
    )

    # NOTE(review): presumably feet -- y=50 reference distance and the
    # 17-inch front edge of home plate; confirm.
    y0, yf = 50.0, 17.0 / 12.0

    # Invalid rows (sqrt of a negative, division by zero) are masked below,
    # so the floating-point warnings they trigger are just noise here.
    with np.errstate(all="ignore"):
        vy_f_sq = vy0**2 - 2.0 * ay * (y0 - yf)
        vy_f = -np.sqrt(vy_f_sq)
        t = (vy_f - vy0) / ay
        vz_f = vz0 + az * t
        vaa = -np.degrees(np.arctan(vz_f / vy_f))

    # Comparisons against NaN are False, so rows with missing inputs either
    # fail this mask or already carry NaN through the arithmetic.
    usable = (ay != 0) & (vy_f_sq >= 0) & np.isfinite(t) & (t > 0) & (vy_f != 0)
    df["vaa"] = np.where(usable, vaa, np.nan)
    return df


def sort_and_dedup(df: pd.DataFrame) -> pd.DataFrame:
    """Order pitches chronologically and drop repeated pitch records.

    Rows are stably sorted by whichever of ``game_date``, ``game_pk``,
    ``at_bat_number`` and ``pitch_number`` exist. Duplicates are then removed,
    keeping the first occurrence: they are identified by whichever of
    ``game_pk``, ``at_bat_number`` and ``pitch_number`` exist, or by the full
    row when none of those columns is present.

    Returns:
        The input object itself when it is empty; otherwise a new frame with
        a fresh 0..n-1 index.
    """
    if df.empty:
        return df

    order_by = [
        col
        for col in ("game_date", "game_pk", "at_bat_number", "pitch_number")
        if col in df.columns
    ]
    # mergesort is stable, so ties keep their original relative order.
    ordered = df.sort_values(order_by, kind="mergesort") if order_by else df

    identity = [
        col
        for col in ("game_pk", "at_bat_number", "pitch_number")
        if col in df.columns
    ]
    # subset=None makes pandas compare entire rows.
    unique_rows = ordered.drop_duplicates(subset=identity or None, keep="first")
    return unique_rows.reset_index(drop=True)


def ensure_dir(path: str) -> None:
    """Create the parent directory of ``path`` if it does not exist yet.

    Args:
        path: A file path. Only its directory component is created; the file
            itself is not touched.

    A path with no directory component (e.g. ``"train.csv"``) refers to the
    current working directory, so there is nothing to create.
    ``os.makedirs("")`` would raise ``FileNotFoundError`` in that case, hence
    the guard.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


# -------------------------------
# Main
# -------------------------------
def main():
    """Build the train and test CSVs.

    Makes sure both output directories exist, then for each split (train
    first, then test): download its date ranges, add the ``vaa`` column, sort
    and de-duplicate, write the CSV without the index, and print the row
    count.
    """
    for target in (TRAIN_OUT, TEST_OUT):
        ensure_dir(target)

    # (date ranges, destination) per split, processed in this order.
    splits = (
        (TRAIN_RANGES, TRAIN_OUT),
        (TEST_RANGES, TEST_OUT),
    )
    for ranges, target in splits:
        frame = sort_and_dedup(add_vaa(collect_ranges(ranges)))
        frame.to_csv(target, index=False)
        print(f"✅ Wrote {len(frame):,} rows to {target}")


# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


✅ Wrote 2,844,586 rows to output\train.csv
✅ Wrote 599,491 rows to output\test.csv
