In [None]:
# pip install MLB-StatsAPI
# pip install pandas

In [1]:
import statsapi
import pandas as pd
from datetime import datetime, timedelta

## Collect run differential for all games over a specific date range

In [16]:
# Function to calculate run differential for a game
def get_run_differential(game):
    """Flatten one game mapping into a record with its run differential.

    Args:
        game: Mapping for a single game. It must provide the keys
            'game_id', 'game_date', 'home_name', 'away_name', 'home_id',
            'away_id', 'home_score' and 'away_score'.

    Returns:
        dict: The identifying fields copied from ``game`` plus
        'run_differential' (home score minus away score) and 'home_win'
        (1 when the home score is strictly greater, otherwise 0).
    """
    home_score, away_score = game['home_score'], game['away_score']

    # A tie counts as "not a home win".
    if home_score > away_score:
        home_win_flag = 1
    else:
        home_win_flag = 0

    record = {
        'game_id': game['game_id'],
        'date': game['game_date'],
        'home_team_name': game['home_name'],
        'away_team_name': game['away_name'],
        'home_id': game['home_id'],
        'away_id': game['away_id'],
    }
    record['home_score'] = home_score
    record['away_score'] = away_score
    record['run_differential'] = home_score - away_score
    record['home_win'] = home_win_flag
    return record

In [17]:
# Function to fetch game data between a date range
def fetch_run_differentials(start_date, end_date):
    """Collect run-differential records for completed games in a date range.

    The range is walked in consecutive 7-day windows, issuing one
    ``statsapi.schedule`` request per window.

    Args:
        start_date: First day to query. Must support ``strftime`` and
            ``timedelta`` arithmetic (e.g. ``datetime.datetime``).
        end_date: Last day to query, inclusive; same type as ``start_date``.

    Returns:
        list: One ``get_run_differential`` record per game whose status is
        'Final'. Empty when ``start_date`` is later than ``end_date``.
    """
    games_data = []
    window_start = start_date
    while window_start <= end_date:
        # Clamp the window so the last request never reaches past end_date;
        # an unclamped 7-day window would pull in games outside the range.
        window_end = min(window_start + timedelta(days=6), end_date)
        schedule = statsapi.schedule(
            start_date=window_start.strftime('%Y-%m-%d'),
            end_date=window_end.strftime('%Y-%m-%d'),
        )
        games_data.extend(
            get_run_differential(game)
            for game in schedule
            if game['status'] == 'Final'
        )
        window_start += timedelta(days=7)
    return games_data


In [18]:
# Build a table of completed games for a short window ending today.
# The window end is "now" (local time, as returned by datetime.today()).
end_date = datetime.today()
# The window starts two days before end_date.
start_date = end_date - timedelta(days=2) # change days for a different date range
# List of per-game records returned by fetch_run_differentials for the window.
games_data = fetch_run_differentials(start_date, end_date)
# Tabular view of the same records.
games_df = pd.DataFrame(games_data)

## Incorporate specific stats

In [None]:
# Names of the batting stats to collect.
# NOTE(review): presumably these are keys of the boxscore
# teamStats['batting'] mapping -- confirm against the API response.
batting_stats = [
  'doubles',
  'triples',
  'homeRuns',
  'strikeOuts',
  'baseOnBalls',
  'hits',  # the comma matters: without it 'hits' 'avg' fuse into 'hitsavg'
  'avg',
  'atBats',
  'obp',
  'slg',
  'ops',
  'stolenBases',
  'rbi',
  'leftOnBase',
]

# Names of the pitching stats to collect (order is preserved).
# NOTE(review): presumably these are keys of the boxscore
# teamStats['pitching'] mapping -- confirm against the API response.
pitching_stats = [
    'runs', 'doubles', 'triples', 'homeRuns',
    'strikeOuts', 'baseOnBalls', 'hits', 'atBats',
    'obp', 'stolenBases', 'numberOfPitches', 'era',
    'inningsPitched', 'earnedRuns', 'pitchesThrown', 'strikes',
    'rbi',
]

In [26]:
# TODO: generalize this to work with any stat, not just OBP
# TODO: figure out a method for handling early-season dates
# TODO: only pull dates that are in-season dates
# TODO: ignore playoff games
# TODO: export the DataFrames to a pickle file so they can be reused between sessions

# Step 1: Initialize DataFrame with team IDs and dates
def create_stat_dataframe(team_ids, start_date, end_date):
    """Create an empty team-by-date table for a single stat.

    Args:
        team_ids: Row labels, one per team.
        start_date: First calendar day of the column range (anything
            ``pd.date_range`` accepts as ``start``).
        end_date: Last calendar day of the column range, inclusive.

    Returns:
        pandas.DataFrame: Rows labelled by ``team_ids`` (index named
        'team_id'), one column per day, with every cell missing.
    """
    daily_columns = pd.date_range(start=start_date, end=end_date)
    return pd.DataFrame(index=team_ids, columns=daily_columns).rename_axis('team_id')

# Step 2: Populate OBP DataFrame using prefix sums
def populate_obp_dataframe(df, team_ids, start_date, end_date):
    """Fill ``df`` in place with each team's running mean of per-game OBP.

    For every team, the schedule between ``start_date`` and ``end_date`` is
    fetched and each game with status 'Final' contributes the team's
    boxscore batting 'obp' value to a running mean. After each such game,
    every still-empty cell in the team's row dated on or before the game is
    set to the current running mean. Cells that already hold a value are
    never overwritten, and dates after the last completed game stay empty.

    Args:
        df: Table indexed by team id with date columns; modified in place.
        team_ids: Team ids to process; each must be a row label of ``df``.
        start_date: Start of the schedule query, passed to
            ``statsapi.schedule``.
        end_date: End of the schedule query, passed to ``statsapi.schedule``.

    Returns:
        None. Progress is printed once per team.
    """
    for team_id in team_ids:
        print(f"Populating OBP for Team {team_id}")

        # Running numerator / denominator of the mean for this team.
        running_sum = 0.0
        completed_games = 0

        team_games = statsapi.schedule(team=team_id, start_date=start_date, end_date=end_date)

        for game in team_games:
            played_on = pd.to_datetime(game['game_date'])

            # Only completed games contribute.
            if game['status'] != 'Final':
                continue

            box = statsapi.boxscore_data(game['game_id'])

            # Read the batting line for whichever side this team was on.
            side = 'home' if team_id == game['home_id'] else 'away'
            game_obp = box[side]['teamStats']['batting']['obp']

            running_sum += float(game_obp)
            completed_games += 1
            mean_obp = running_sum / completed_games

            # Back-fill every still-empty date up to and including this game.
            on_or_before = df.columns <= played_on
            still_empty = df.loc[team_id].isna()
            df.loc[team_id, on_or_before & still_empty] = mean_obp

# Step 3: Retrieve OBP for a specific team and date
def get_obp_for_game(df, team_id, game_date):
    """Look up the stored value for one team on one date.

    Args:
        df: Table indexed by team id with date columns.
        team_id: Row label to read.
        game_date: Column label to read.

    Returns:
        Whatever ``df.loc`` yields for that (row, column) pair.
    """
    cell = (team_id, game_date)
    return df.loc[cell]

# Example usage
team_ids = [108, 147]  # Example team IDs (Angels, Yankees)
start_date = '2023-03-01'
end_date = '2023-05-01'

# Create and populate the DataFrame.
# Populating issues one boxscore request per completed game, so it is slow;
# the table is therefore built only when this kernel does not already hold
# one. Leaving these calls commented out made the cell fail with NameError
# on a fresh kernel (Restart & Run All).
# NOTE: an existing obp_df is reused as-is; delete it (`del obp_df`) after
# changing team_ids or the date range to force a rebuild.
if 'obp_df' not in globals():
    obp_df = create_stat_dataframe(team_ids, start_date, end_date)
    populate_obp_dataframe(obp_df, team_ids, start_date, end_date)

# Example retrieval for a game on April 15, 2023
game_date = pd.to_datetime('2023-04-15')
team_obp = get_obp_for_game(obp_df, 108, game_date)
print(f"Team OBP on {game_date}: {team_obp}")




Team OBP on 2023-04-15 00:00:00: 0.34492500000000004
