# Pitcher Year Start Dataset

Uses the pybaseball API to collect per-game statistics for every start made by each pitcher during the specified season.

In [9]:
from pybaseball import pitching_stats_range
from pybaseball import statcast_pitcher
import pandas as pd

# Season to pull; every date range and output file name below is derived from it.
YEAR = 2024

In [10]:
# Pull pitching totals for the March 1 - November 30 window of the season
season_start = f'{YEAR}-03-01'
season_end = f'{YEAR}-11-30'
raw_pitcher_stats = pitching_stats_range(season_start, season_end)

# Keep an untouched copy of the download on disk
raw_stats_path = f'../../data/historical/raw_pitcher_stats_{YEAR}.csv'
raw_pitcher_stats.to_csv(raw_stats_path, index=False)

In [11]:
# Narrow the season totals to pitchers with at least one game started (GS >= 1),
# keeping only their name, MLB ID and start count.
made_a_start = raw_pitcher_stats['GS'] >= 1
starting_pitchers = raw_pitcher_stats.loc[made_a_start, ['Name', 'mlbID', 'GS']].reset_index(drop=True)

# Surface every row whose MLB ID appears more than once
repeated_id_mask = starting_pitchers.duplicated(subset=['mlbID'], keep=False)
duplicate_ids = starting_pitchers[repeated_id_mask]
if not duplicate_ids.empty:
    print("Warning: Duplicate MLB IDs found:")
    print(duplicate_ids)

In [None]:
def fetch_raw_pitcher_stats(pid, name):
    """
    Download one pitcher's pitch-level data for the YEAR season.

    pid is passed to statcast_pitcher as the player id; name is used only in
    the failure message. Any exception is reported with print and an empty
    DataFrame is returned in its place, so callers never see the error.
    """
    try:
        # Built inside the try so that any failure here is reported the same way.
        season_start = f'{YEAR}-03-01'
        season_end = f'{YEAR}-11-30'
        return statcast_pitcher(season_start, season_end, pid)
    except Exception as err:
        print(f"Failed to fetch logs for {name} ({pid}): {err}")
        return pd.DataFrame()

def filter_starts_only(df):
    """
    Keep only the pitches from games in which this pitcher started.

    A game counts as a start when the pitcher's earliest plate appearance in
    that game (the lowest at_bat_number among the rows of df for that game_pk)
    took place in inning 1. For a home pitcher that is the top of the 1st; for
    an away pitcher it is the bottom of the 1st.

    NOTE(review): df is expected to hold a single pitcher's pitches, so a
    reliever who enters during the 1st inning also passes this check.
    Confirm whether that case needs to be excluded.

    Returns the rows of df belonging to the qualifying games, with the same
    columns and index labels as the input. An empty df is returned unchanged.
    """
    if df.empty:
        return df

    # Lowest at_bat_number per game, broadcast back onto every row. Using
    # transform (rather than merging a helper frame) keeps the output free of
    # temporary columns and leaves the original index intact.
    first_ab = df.groupby('game_pk')['at_bat_number'].transform('min')

    # Rows that are part of the pitcher's first plate appearance AND in inning 1
    opened_game = (df['inning'] == 1) & (df['at_bat_number'] == first_ab)
    start_games = df.loc[opened_game, 'game_pk'].unique()

    # Keep every pitch from the games identified as starts
    return df[df['game_pk'].isin(start_games)]

def infer_pitcher_team(df):
    """
    Infer the pitcher's own team for each game from the half-inning pitched.

    For every game_pk the first available inning_topbot, home_team and
    away_team values are taken. A pitcher working the 'Top' half is assigned
    home_team; any other value is assigned away_team.

    Returns a DataFrame with one row per game and the columns
    ['game_pk', 'team']. An empty input yields an empty frame with those
    columns.
    """
    per_game = (
        df.groupby('game_pk')[['inning_topbot', 'home_team', 'away_team']]
        .first()
        .reset_index()
    )
    # Vectorised selection: home_team where the half-inning is 'Top',
    # away_team everywhere else. Unlike a row-wise apply, this also works
    # when there are no rows.
    per_game['team'] = per_game['home_team'].where(
        per_game['inning_topbot'] == 'Top', per_game['away_team']
    )
    return per_game[['game_pk', 'team']]

def calculate_game_stats(df):
    """
    Aggregate pitch-level rows into one row of stats per game.

    Output columns: game_pk, game_date, player_name, pitcher, home_team,
    away_team, total_pitches, strikeouts, csw_count, batters_faced, team, opp.

    - total_pitches: number of non-null pitch_number values in the game
    - strikeouts: pitches whose events value is a strikeout outcome
    - csw_count: pitches described as a called strike or a swinging strike
    - batters_faced: distinct at_bat_number values in the game
    - team / opp: the pitcher's team (via infer_pitcher_team) and the other side

    Returns an empty DataFrame when df is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # 'strikeout_double_play' is a separate events value for a strikeout that
    # also records a second out; it must be counted as a strikeout too.
    strikeout_events = ['strikeout', 'strikeout_double_play']
    csw_descriptions = ['called_strike', 'swinging_strike', 'swinging_strike_blocked']

    batters_faced = df.groupby('game_pk')['at_bat_number'].nunique()

    game_stats = df.groupby(['game_pk', 'game_date', 'player_name']).agg({
        'pitcher': 'first',
        'home_team': 'first',
        'away_team': 'first',
        'pitch_number': 'count',
        'events': lambda x: x.isin(strikeout_events).sum(),
        'description': lambda x: x.isin(csw_descriptions).sum()
    }).reset_index()

    game_stats['batters_faced'] = game_stats['game_pk'].map(batters_faced)

    inferred_teams = infer_pitcher_team(df)
    game_stats = pd.merge(game_stats, inferred_teams, on='game_pk', how='left')

    # Opponent is whichever side the pitcher's team is not: away_team when the
    # pitcher is on the home team, home_team otherwise.
    game_stats['opp'] = game_stats['away_team'].where(
        game_stats['team'] == game_stats['home_team'], game_stats['home_team']
    )

    return game_stats.rename(columns={
        'pitch_number': 'total_pitches',
        'events': 'strikeouts',
        'description': 'csw_count'
    })

# --- Checkpointing ---
# The accumulated per-game stats are rewritten to the checkpoint CSV after each
# pitcher that contributes games, so an interrupted run can resume and skip the
# pitchers already recorded there.
checkpoint_file = f'../../data/interim/pitcher_game_stats_{YEAR}_checkpoint.csv'

try:
    all_game_stats = pd.read_csv(checkpoint_file)
    all_game_stats['pitcher'] = all_game_stats['pitcher'].astype(int)
    processed_pitcher_ids = set(all_game_stats['pitcher'].unique())
    print(f"Loaded {len(processed_pitcher_ids)} pitchers from checkpoint")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # A zero-byte checkpoint (e.g. left by an interrupted write) is treated the
    # same as a missing one instead of aborting the run.
    all_game_stats = pd.DataFrame()
    processed_pitcher_ids = set()
    print("Starting fresh - no checkpoint found")

# Ensure integer IDs
starting_pitchers['mlbID'] = starting_pitchers['mlbID'].astype(int)

# Only the ID and name are needed per pitcher, so iterate those two columns
# directly rather than building a Series for every row.
for pitcher_id, name in zip(starting_pitchers['mlbID'], starting_pitchers['Name']):
    pitcher_id = int(pitcher_id)

    if pitcher_id in processed_pitcher_ids:
        print(f"Skipping {pitcher_id} - already processed")
        continue

    print(f"\nProcessing pitcher ID {pitcher_id} - {name}")

    # Games already stored for this pitcher, used below to avoid duplicate rows
    existing_games = (
        all_game_stats[all_game_stats['pitcher'] == pitcher_id]['game_pk'].unique()
        if not all_game_stats.empty else []
    )

    # Fetch pitch-level data, keep only starts, then roll up to one row per game
    raw_stats = fetch_raw_pitcher_stats(pitcher_id, name)
    raw_stats = filter_starts_only(raw_stats)
    game_stats = calculate_game_stats(raw_stats)

    if not game_stats.empty:
        new_games = game_stats[~game_stats['game_pk'].isin(existing_games)]

        if not new_games.empty:
            all_game_stats = pd.concat([all_game_stats, new_games], ignore_index=True)
            all_game_stats.to_csv(checkpoint_file, index=False)
            print(f"Saved {len(new_games)} new games for pitcher ID {pitcher_id}")
        else:
            print(f"No new games found for pitcher ID {pitcher_id}")

        processed_pitcher_ids.add(pitcher_id)
    else:
        # Not marked as processed, so a later run will try this pitcher again
        print(f"No data found for pitcher ID {pitcher_id}")

print("\n✅ Processing complete!")
print(f"Total pitchers processed: {len(processed_pitcher_ids)}")
print(f"Total games recorded: {len(all_game_stats)}")
