In [None]:
# pip install MLB-StatsAPI
# pip install pandas

In [1]:
import os
import pickle
from datetime import datetime, timedelta

import pandas as pd
import statsapi

## Collect all games over a specific date range

In [58]:
def get_run_differential(game):
    """Summarise one schedule entry as a flat record with the score margin.

    Args:
        game: mapping with the keys 'game_id', 'game_date', 'home_name',
            'away_name', 'home_id', 'away_id', 'home_score' and 'away_score'.

    Returns:
        dict with the identifying fields copied over (renamed to 'date',
        'home_team_name', 'away_team_name'), both scores,
        'run_differential' (home minus away) and 'home_win'
        (1 when the home score is strictly greater, otherwise 0).
    """
    home_score, away_score = game['home_score'], game['away_score']
    margin = home_score - away_score
    # A tie or an away win both count as "home did not win".
    home_won = 1 if home_score > away_score else 0

    record = {
        'game_id': game['game_id'],
        'date': game['game_date'],
        'home_team_name': game['home_name'],
        'away_team_name': game['away_name'],
        'home_id': game['home_id'],
        'away_id': game['away_id'],
        'home_score': home_score,
        'away_score': away_score,
        'run_differential': margin,
        'home_win': home_won,
    }
    return record

In [59]:
def fetch_run_differentials(start_date, end_date):
    """Collect a run-differential record for every 'Final' game in a date range.

    The range is walked in consecutive 7-day windows, one
    ``statsapi.schedule`` request per window. The last window is clamped to
    ``end_date`` so that no games after the requested range are returned
    (previously the final window could extend up to six days past it).

    Args:
        start_date: first day to query (object supporting ``strftime`` and
            ``timedelta`` arithmetic, e.g. ``datetime``).
        end_date: last day to query, inclusive.

    Returns:
        list of dicts as produced by ``get_run_differential``, in the order
        the schedule requests returned them. Empty when
        ``start_date > end_date``.
    """
    games_data = []
    window_start = start_date
    while window_start <= end_date:
        # Never ask for days beyond the caller's end_date.
        window_end = min(window_start + timedelta(days=6), end_date)
        schedule = statsapi.schedule(
            start_date=window_start.strftime('%Y-%m-%d'),
            end_date=window_end.strftime('%Y-%m-%d'),
        )
        # Only completed games have scores worth recording.
        games_data.extend(
            get_run_differential(game) for game in schedule if game['status'] == 'Final'
        )
        window_start += timedelta(days=7)
    return games_data


In [60]:
# Build the base table of completed games: one row per 'Final' game
# returned for the configured date range.
start_date = datetime(2019, 3, 1)
end_date = datetime(2024, 10, 1)

games_data = fetch_run_differentials(start_date, end_date)
games_df = pd.DataFrame(games_data)

## Create Team Stat Season Data by Day

In [45]:
def create_stat_dataframe(team_ids, season):
    """Create an empty team-by-day table for one season.

    Args:
        team_ids: row labels (one row per team id).
        season: year used to build the column range.

    Returns:
        DataFrame with index named 'team_id', one column per calendar day
        from {season}-03-28 through {season}-10-01 inclusive, and every
        cell missing.
    """
    season_days = pd.date_range(start=f'{season}-03-28', end=f'{season}-10-01')  # alter for opening day each season
    table = pd.DataFrame(index=team_ids, columns=season_days)
    table.index.name = 'team_id'
    return table

In [46]:
def populate_stat_dataframes(df_dict, team_id, season, stat_types):
    """Fill one team's row in every stat table with its running per-game mean.

    For each game with status 'Final' in the team's schedule between
    {season}-03-28 and {season}-10-01, the team's boxscore value for every
    requested stat is added to a running total. The running mean (total
    divided by games counted so far) is then written to that game's date
    and every later date; a later game overwrites from its own date on.
    Each column therefore holds the mean over games played up to and
    including that day, and days before the team's first game stay missing.

    The previous implementation filled still-empty days on or before the
    game date instead. An off-day directly before a game then received a
    value that already included that game, so a "day before" lookup leaked
    the game's own result.

    Args:
        df_dict: mapping "{stat_type}_{stat}" -> DataFrame indexed by team id
            with datetime columns; modified in place.
        team_id: team whose row is filled.
        season: year used for the schedule request.
        stat_types: mapping stat type (e.g. 'batting') -> list of stat names.

    Returns:
        None.

    Note:
        Assumes ``statsapi.schedule`` returns games in chronological order;
        TODO confirm against the library.
    """
    print(f"Populating stats for Team {team_id}")

    stat_keys = [f"{stat_type}_{stat}" for stat_type, stat_list in stat_types.items() for stat in stat_list]
    totals = {key: 0.0 for key in stat_keys}
    games_played = {key: 0 for key in stat_keys}
    schedule = statsapi.schedule(team=team_id, start_date=f'{season}-03-28', end_date=f'{season}-10-01') # alter for opening day each season

    for game in schedule:
        if game['status'] != 'Final':
            continue

        game_date = pd.to_datetime(game['game_date'])
        # One boxscore request per completed game.
        boxscore = statsapi.boxscore_data(game['game_id'])
        team_side = 'home' if team_id == game['home_id'] else 'away'
        team_stats = boxscore[team_side]['teamStats']

        for stat_type, stat_list in stat_types.items():
            for stat in stat_list:
                stat_key = f"{stat_type}_{stat}"
                # A stat missing from the boxscore counts as 0 for that game.
                stat_value = team_stats[stat_type].get(stat, 0)

                totals[stat_key] += float(stat_value)
                games_played[stat_key] += 1
                current_stat_value = totals[stat_key] / games_played[stat_key]

                # Carry the new mean forward from the game date; earlier
                # days keep what was known at the time.
                stat_df = df_dict[stat_key]
                stat_df.loc[team_id, stat_df.columns >= game_date] = current_stat_value

In [47]:
# Stat names looked up under teamStats['batting'] of each boxscore.
# Order matters: it fixes the order of the generated stat tables/columns.
batting_stats = [
    'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls',
    'hits', 'avg', 'atBats',
    'obp', 'slg', 'ops',
    'stolenBases', 'rbi', 'leftOnBase',
]

# Stat names looked up under teamStats['pitching'] of each boxscore.
pitching_stats = [
    'runs', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls',
    'hits', 'atBats', 'obp', 'stolenBases',
    'numberOfPitches', 'era', 'inningsPitched',
    'earnedRuns', 'pitchesThrown', 'strikes', 'rbi',
]

# Stat type -> stat names; keys become the "{stat_type}_" prefix of each table.
stat_types = {
    'batting': batting_stats,
    'pitching': pitching_stats,
}

# Ids of every team returned by an empty-string lookup.
team_ids = [team['id'] for team in statsapi.lookup_team('')]

In [48]:
def get_season_stats(season, stat_types, team_ids):
    """Build every per-day stat table for a season and pickle the result.

    One empty table is created per "{stat_type}_{stat}" key, each team's row
    is filled by ``populate_stat_dataframes``, and the resulting dict is
    written to ``stats/season_{season}_stats``.

    The ``stats`` directory is created before any work starts, so a missing
    directory can no longer make the final write fail after all teams have
    been processed.

    Args:
        season: year to build.
        stat_types: mapping stat type -> list of stat names.
        team_ids: team ids to include (one row per id in every table).

    Returns:
        None. The tables are only persisted to disk.
    """
    os.makedirs('stats', exist_ok=True)

    season_stats = {
        f"{stat_type}_{stat}": create_stat_dataframe(team_ids, season)
        for stat_type, stat_list in stat_types.items()
        for stat in stat_list
    }
    for team_id in team_ids:
        populate_stat_dataframes(season_stats, team_id, season, stat_types)

    with open(f'stats/season_{season}_stats', 'wb') as file:
        pickle.dump(season_stats, file)

In [None]:
# Build the 2019 per-day team stat tables and pickle them to stats/season_2019_stats.
# Slow: one boxscore request is made per completed game of every team.
get_season_stats(2019, stat_types, team_ids)

In [None]:
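# NOTE(review): 2020 is left disabled, so no stats/season_2020_stats file is built or merged.
# Games from 2020 therefore receive no stat columns and are removed by the later dropna.
# get_season_stats(2020, stat_types, team_ids)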

In [None]:
# Build the 2021 per-day team stat tables and pickle them to stats/season_2021_stats.
get_season_stats(2021, stat_types, team_ids)

In [None]:
# Build the 2022 per-day team stat tables and pickle them to stats/season_2022_stats.
get_season_stats(2022, stat_types, team_ids)

In [None]:
# Build the 2023 per-day team stat tables and pickle them to stats/season_2023_stats.
get_season_stats(2023, stat_types, team_ids)

In [None]:
# Build the 2024 per-day team stat tables and pickle them to stats/season_2024_stats.
get_season_stats(2024, stat_types, team_ids)

## Merge stats with games

In [37]:
def get_home_stat(row, df2):
    """Look up the home team's stat for the day before a game.

    Args:
        row: game record providing 'date' and 'home_id'.
        df2: stat table indexed by team id with datetime columns.

    Returns:
        The cell at (home_id, game date minus one day), or None when either
        the team id or that day is not present in the table.
    """
    prior_day = pd.to_datetime(row['date']) - pd.Timedelta(days=1)
    team_key = row['home_id']
    # Guard: anything outside the table has no value to report.
    if team_key not in df2.index or prior_day not in df2.columns:
        return None
    return df2.loc[team_key, prior_day]

def get_away_stat(row, df2):
    """Look up the away team's stat for the day before a game.

    Args:
        row: game record providing 'date' and 'away_id'.
        df2: stat table indexed by team id with datetime columns.

    Returns:
        The cell at (away_id, game date minus one day), or None when either
        the team id or that day is not present in the table.
    """
    prior_day = pd.to_datetime(row['date']) - pd.Timedelta(days=1)
    team_key = row['away_id']
    # Guard: anything outside the table has no value to report.
    if team_key not in df2.index or prior_day not in df2.columns:
        return None
    return df2.loc[team_key, prior_day]

In [62]:
# Merge the pickled 2019 stat tables into games_df.
# NOTE: pickle.load executes code from the file; only load files this notebook wrote.
with open('stats/season_2019_stats', 'rb') as stats_file:
    season_2019_stats = pickle.load(stats_file)

# Home columns are processed first, then away columns, so the column order
# of games_df is the same as with two separate loops.
for side, lookup in (('home', get_home_stat), ('away', get_away_stat)):
    for stat_name, stat_df in season_2019_stats.items():
        column = f'{side}_{stat_name}'
        season_values = games_df.apply(lambda row: lookup(row, stat_df), axis=1)
        if column not in games_df.columns:
            games_df[column] = pd.NA
        # Keep values already filled in; only gaps take this season's lookup.
        existing = games_df[column]
        games_df[column] = existing.where(existing.notna(), season_values)

In [63]:
# Merge the pickled 2021 stat tables into games_df.
# NOTE: pickle.load executes code from the file; only load files this notebook wrote.
with open('stats/season_2021_stats', 'rb') as stats_file:
    season_2021_stats = pickle.load(stats_file)

# Home columns are processed first, then away columns, so the column order
# of games_df is the same as with two separate loops.
for side, lookup in (('home', get_home_stat), ('away', get_away_stat)):
    for stat_name, stat_df in season_2021_stats.items():
        column = f'{side}_{stat_name}'
        season_values = games_df.apply(lambda row: lookup(row, stat_df), axis=1)
        if column not in games_df.columns:
            games_df[column] = pd.NA
        # Keep values already filled in; only gaps take this season's lookup.
        existing = games_df[column]
        games_df[column] = existing.where(existing.notna(), season_values)

In [64]:
# Merge the pickled 2022 stat tables into games_df.
# NOTE: pickle.load executes code from the file; only load files this notebook wrote.
with open('stats/season_2022_stats', 'rb') as stats_file:
    season_2022_stats = pickle.load(stats_file)

# Home columns are processed first, then away columns, so the column order
# of games_df is the same as with two separate loops.
for side, lookup in (('home', get_home_stat), ('away', get_away_stat)):
    for stat_name, stat_df in season_2022_stats.items():
        column = f'{side}_{stat_name}'
        season_values = games_df.apply(lambda row: lookup(row, stat_df), axis=1)
        if column not in games_df.columns:
            games_df[column] = pd.NA
        # Keep values already filled in; only gaps take this season's lookup.
        existing = games_df[column]
        games_df[column] = existing.where(existing.notna(), season_values)

In [65]:
# Merge the pickled 2023 stat tables into games_df.
# NOTE: pickle.load executes code from the file; only load files this notebook wrote.
with open('stats/season_2023_stats', 'rb') as stats_file:
    season_2023_stats = pickle.load(stats_file)

# Home columns are processed first, then away columns, so the column order
# of games_df is the same as with two separate loops.
for side, lookup in (('home', get_home_stat), ('away', get_away_stat)):
    for stat_name, stat_df in season_2023_stats.items():
        column = f'{side}_{stat_name}'
        season_values = games_df.apply(lambda row: lookup(row, stat_df), axis=1)
        if column not in games_df.columns:
            games_df[column] = pd.NA
        # Keep values already filled in; only gaps take this season's lookup.
        existing = games_df[column]
        games_df[column] = existing.where(existing.notna(), season_values)

In [66]:
# Merge the pickled 2024 stat tables into games_df.
# NOTE: pickle.load executes code from the file; only load files this notebook wrote.
with open('stats/season_2024_stats', 'rb') as stats_file:
    season_2024_stats = pickle.load(stats_file)

# Home columns are processed first, then away columns, so the column order
# of games_df is the same as with two separate loops.
for side, lookup in (('home', get_home_stat), ('away', get_away_stat)):
    for stat_name, stat_df in season_2024_stats.items():
        column = f'{side}_{stat_name}'
        season_values = games_df.apply(lambda row: lookup(row, stat_df), axis=1)
        if column not in games_df.columns:
            games_df[column] = pd.NA
        # Keep values already filled in; only gaps take this season's lookup.
        existing = games_df[column]
        games_df[column] = existing.where(existing.notna(), season_values)

In [67]:
# Drop every game whose 'away_pitching_rbi' value is missing.
# NOTE(review): presumably this one column stands in for "no season stats were
# merged for this game" - confirm that the other stat columns are missing on
# the same rows. Rebinding games_df makes this cell non-reversible without
# re-running the merge cells.
games_df = games_df.dropna(subset=['away_pitching_rbi'])

In [69]:
# Stat tables for which a home-minus-away difference column is produced.
stats_to_compare = [
    # batting
    'batting_doubles', 'batting_triples', 'batting_homeRuns',
    'batting_strikeOuts', 'batting_baseOnBalls', 'batting_hits',
    'batting_avg', 'batting_atBats', 'batting_obp', 'batting_slg',
    'batting_ops', 'batting_stolenBases', 'batting_rbi', 'batting_leftOnBase',
    # pitching
    'pitching_runs', 'pitching_doubles', 'pitching_triples',
    'pitching_homeRuns', 'pitching_strikeOuts', 'pitching_baseOnBalls',
    'pitching_hits', 'pitching_atBats', 'pitching_obp', 'pitching_stolenBases',
    'pitching_numberOfPitches', 'pitching_era', 'pitching_inningsPitched',
    'pitching_earnedRuns', 'pitching_pitchesThrown', 'pitching_strikes',
    'pitching_rbi',
]

# Start from the identifying and score columns of each game.
identity_columns = [
    'game_id', 'date', 'home_team_name', 'away_team_name',
    'home_id', 'away_id', 'home_score', 'away_score',
]
df_differential = games_df[identity_columns].copy()

for stat in stats_to_compare:
    home_col, away_col = f'home_{stat}', f'away_{stat}'
    # Skip stats that were never merged for both sides.
    if home_col not in games_df.columns or away_col not in games_df.columns:
        continue
    df_differential[f'diff_{stat}'] = games_df[home_col] - games_df[away_col]


In [70]:
# Persist both training tables: the per-side stats and the home-minus-away
# differences. The output directory is created first because to_pickle does
# not create missing parent directories.
os.makedirs('training_data', exist_ok=True)
games_df.to_pickle('training_data/split_stats_training_data.pkl')
df_differential.to_pickle('training_data/differential_stats_training_data.pkl')