In [None]:
# Dependencies — uncomment and run once to install into this kernel's environment:
# %pip install MLB-StatsAPI
# %pip install pandas

In [41]:
import statsapi
import pandas as pd
from datetime import datetime, timedelta

## Collect all games over a specific date range

In [2]:
def get_run_differential(game):
    """Summarise one game as a flat record with its run differential.

    Args:
        game: Mapping that provides the keys 'game_id', 'game_date',
            'home_name', 'away_name', 'home_id', 'away_id', 'home_score'
            and 'away_score'.

    Returns:
        dict: Identifying fields copied from ``game`` plus
        'run_differential' (home score minus away score) and 'home_win'
        (1 when the home score is strictly greater, otherwise 0).
    """
    home_score, away_score = game['home_score'], game['away_score']
    margin = home_score - away_score

    # Anything other than a strict home lead (including a tie) counts as 0.
    home_won = 0
    if home_score > away_score:
        home_won = 1

    # (output field, key in ``game``) pairs, in output order.
    copied_fields = (
        ('game_id', 'game_id'),
        ('date', 'game_date'),
        ('home_team_name', 'home_name'),
        ('away_team_name', 'away_name'),
        ('home_id', 'home_id'),
        ('away_id', 'away_id'),
    )
    record = {field: game[source] for field, source in copied_fields}
    record.update(
        home_score=home_score,
        away_score=away_score,
        run_differential=margin,
        home_win=home_won,
    )
    return record

In [3]:
def fetch_run_differentials(start_date, end_date):
    """Collect run-differential records for every 'Final' game in a date range.

    The range is queried from ``statsapi.schedule`` in consecutive windows of
    at most seven days. The last window is clamped to ``end_date`` so that no
    game dated after ``end_date`` is ever requested or returned.

    Args:
        start_date: First day to query (inclusive). Must support ``strftime``
            and ``timedelta`` arithmetic, e.g. ``datetime`` or ``date``.
        end_date: Last day to query (inclusive); same type as ``start_date``.

    Returns:
        list[dict]: One ``get_run_differential`` record per game whose
        'status' is exactly 'Final', in the order the schedule returned them.
        Empty when ``start_date`` is after ``end_date``.
    """
    games_data = []
    current_date = start_date
    while current_date <= end_date:
        # Clamp the window: an unclamped final window would reach up to six
        # days past end_date and pull in games outside the requested range.
        window_end = min(current_date + timedelta(days=6), end_date)
        schedule = statsapi.schedule(
            start_date=current_date.strftime('%Y-%m-%d'),
            end_date=window_end.strftime('%Y-%m-%d'),
        )
        games_data.extend(
            get_run_differential(game)
            for game in schedule
            if game['status'] == 'Final'
        )
        # Windows are back to back (7-day stride, 7-day span), so no day is
        # queried twice.
        current_date += timedelta(days=7)
    return games_data


In [4]:
# Query window: the 30 days ending at the moment this cell runs.
end_date = datetime.today()
start_date = end_date - timedelta(days=30) # change `days` to widen or narrow the look-back window
# One record per game whose status is 'Final' inside the window.
games_data = fetch_run_differentials(start_date, end_date)
# Tabular view: one row per game, one column per key of each record.
games_df = pd.DataFrame(games_data)

## Incorporate specific stats

In [67]:
def create_stat_dataframe(team_ids, season):
    """Build an empty team-by-date frame for one statistic.

    Args:
        team_ids: Row labels for the frame, one per team.
        season: Year used to build the date range; it is interpolated into
            the start date '<season>-03-01' and end date '<season>-10-01'.

    Returns:
        pandas.DataFrame: All-missing frame whose index is ``team_ids``
        (index name 'team_id') and whose columns are the daily dates from
        March 1 through October 1 of ``season``, both inclusive.
    """
    season_days = pd.date_range(start=f'{season}-03-01', end=f'{season}-10-01')
    frame = pd.DataFrame(columns=season_days, index=team_ids)
    frame.index.name = 'team_id'
    return frame

In [68]:
def _coerce_stat(value):
    """Return ``value`` as a float, or ``None`` when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def populate_stat_dataframes(df_dict, team_id, season, stat_types):
    """Fill one team's row in every stat frame with running per-game averages.

    The team's schedule between '<season>-03-01' and '<season>-10-01' is
    fetched, and for each game whose status is 'Final' the boxscore is
    downloaded. For every tracked stat, the running average (sum of per-game
    values divided by the number of games that supplied a numeric value) is
    written into the team's row of ``df_dict['<stat_type>_<stat>']``:

    * the cell for the game's own date is always set, so the second game of a
      same-day doubleheader refreshes that date instead of being ignored;
    * every still-empty cell dated before the game is back-filled with the
      same value.

    Dates after the team's last 'Final' game are left untouched.

    NOTE(review): stats that already look like rates (e.g. 'avg', 'era') are
    averaged per game exactly like counting stats — confirm that is intended.

    Args:
        df_dict: Mapping from '<stat_type>_<stat>' to a DataFrame indexed by
            team id with date columns; the frames are modified in place.
        team_id: Team whose row is filled; also passed to the schedule query.
        season: Year interpolated into the schedule start and end dates.
        stat_types: Mapping from stat type (a key under the boxscore's
            'teamStats') to the list of stat names to track for that type.

    Returns:
        None.
    """
    print(f"Populating stats for Team {team_id}")

    stat_keys = [
        f"{stat_type}_{stat}"
        for stat_type, stat_list in stat_types.items()
        for stat in stat_list
    ]
    totals = dict.fromkeys(stat_keys, 0.0)
    games_played = dict.fromkeys(stat_keys, 0)
    schedule = statsapi.schedule(team=team_id, start_date=f'{season}-03-01', end_date=f'{season}-10-01')

    for game in schedule:
        if game['status'] != 'Final':
            continue

        game_date = pd.to_datetime(game['game_date'])
        boxscore = statsapi.boxscore_data(game['game_id'])
        team_side = 'home' if team_id == game['home_id'] else 'away'
        team_stats = boxscore[team_side]['teamStats']

        for stat_type, stat_list in stat_types.items():
            for stat in stat_list:
                stat_key = f"{stat_type}_{stat}"
                # A stat missing from the boxscore is read as 0.
                stat_value = _coerce_stat(team_stats[stat_type].get(stat, 0))
                if stat_value is None:
                    # Non-numeric value (e.g. None or a placeholder string):
                    # skip it for this game rather than abort the whole run.
                    continue

                totals[stat_key] += stat_value
                games_played[stat_key] += 1
                running_average = totals[stat_key] / games_played[stat_key]

                frame = df_dict[stat_key]
                dates = frame.columns
                # Refresh the game-date cell unconditionally and back-fill
                # any earlier dates that have no value yet.
                needs_value = frame.loc[team_id].isna() | (dates == game_date)
                mask = needs_value & (dates <= game_date)
                frame.loc[team_id, mask] = running_average

In [69]:
# Team batting stats to track. Each name is used as a key into the boxscore's
# teamStats['batting'] mapping by populate_stat_dataframes.
batting_stats = [
    'doubles', 'triples', 'homeRuns', 'strikeOuts', 'baseOnBalls',
    'hits', 'avg', 'atBats', 'obp', 'slg', 'ops', 'stolenBases', 'rbi', 'leftOnBase',
]

# Team pitching stats to track, keyed the same way under teamStats['pitching'].
# NOTE(review): 'numberOfPitches' and 'pitchesThrown' look like they may
# describe the same quantity — confirm both keys exist in the boxscore data.
pitching_stats = [
    'runs', 'doubles', 'triples', 'homeRuns', 'strikeOuts', 'baseOnBalls',
    'hits', 'atBats', 'obp', 'stolenBases', 'numberOfPitches', 'era',
    'inningsPitched', 'earnedRuns', 'pitchesThrown', 'strikes', 'rbi',
]


# Stat type -> list of stat names; together they define the
# '<stat_type>_<stat>' keys used for the per-stat DataFrames.
stat_types = {'batting': batting_stats, 'pitching': pitching_stats}
# Presumably an empty search string matches every team — verify against the
# MLB-StatsAPI documentation for lookup_team.
team_ids = [team['id'] for team in statsapi.lookup_team('')]

In [None]:
season = 2024
# One empty team-by-date frame per '<stat_type>_<stat>' key.
season_2024_stats = {f"{stat_type}_{stat}": create_stat_dataframe(team_ids, season) 
                     for stat_type, stat_list in stat_types.items() for stat in stat_list}
# Fill the frames in place, one team at a time. populate_stat_dataframes
# requests a boxscore for every 'Final' game, so expect this cell to be slow.
for team_id in team_ids:
    populate_stat_dataframes(season_2024_stats, team_id, season, stat_types)

In [62]:
# Persist each stat frame as '<season>_<stat key>.pkl'; the path is relative,
# so files land in the current working directory.
for key, val in season_2024_stats.items():
    val.to_pickle(f"{season}_{key}.pkl")