In [14]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import concurrent.futures
import re
import statsapi
import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

pd.set_option('display.max_columns', None)


In [15]:
team_df = []


team_url = "https://sports.core.api.espn.com/v2/sports/baseball/leagues/mlb/teams"
team_data = requests.get(team_url).json()['items']

for key in team_data:
    team_url = key['$ref']
    team_info = requests.get(team_url).json()
    team_df.append({
        'id' : team_info['id'],
        'Team' : team_info['displayName']
    })

team_df = pd.DataFrame(team_df)

In [16]:
roster_df = []
record_df = []
next_game_df = []

game_id_list = []


for team_id in team_df['id'].to_list():


    team_url = f'https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/teams/{team_id}?enable=roster,projection,stats'
    team_response = requests.get(team_url)
    team_data = team_response.json()

    team_name = team_data['team']['displayName']
    team_abbr = team_data['team']['abbreviation']
    team_id = team_data['team']['id']

    for athlete in team_data['team']['athletes']:
        roster_df.append(
            {
                'Team' : team_name,
                'Abbreviation' : team_abbr,
                'espn_team_id' : team_id,
                'espn_player_id' : athlete.get('id'),
                'Name' : athlete.get('displayName'),
                'weight' : athlete.get('weight'),
                'height' : athlete.get('height'),
                'age' : athlete.get('age'),
                'rookieYear' : athlete.get('debutYear'),
                'jersey' : athlete.get('jersey'),
                'position' : athlete.get('position', {}).get('abbreviation'),
                'experience' : athlete.get('experience', {}).get('years'),
                'isActive' : athlete.get('active'),
                'Bats' : athlete.get('bats', {}).get('abbreviation'),
                'Throws' : athlete.get('throws', {}).get('abbreviation'),
                'Injuries' : athlete.get('injuries')
            }
        )
    
    record_data = team_data['team']['record']
    team_dict = {}
    team_dict['Team'] = team_name
    team_dict['Abbreviation'] = team_abbr
    team_dict['espn_team_id'] = team_id
    for record_type in record_data['items']:
        game_type = record_type['type']
        for stat in record_type['stats']:
            team_dict[f"{game_type}_{stat['name']}"] = stat['value'] 
    team_dict['standingSummary'] = team_data['team']['standingSummary']
    record_df.append(team_dict)

    next_event = team_data['team']['nextEvent'][0]

    next_event_dict = {
            'Team' : team_name,
            'Abbreviation' :  team_abbr,
            'espn_team_id' : team_id,
            'espn_game_id' : next_event.get('id')
        }
    game_data = next_event['competitions'][0]

    next_event_dict['isNeutralSite'] = game_data['neutralSite']
    next_event_dict['game_time'] = game_data['status']['type']['shortDetail']
    next_event_dict['venue'] = game_data['venue']['fullName']

    for team in game_data['competitors']:
        
        if team['team']['displayName'] == team_name:
            next_event_dict['home_or_away'] = team['homeAway']
            next_event_dict['team_probable_starting_pitcher'] = team.get('probables',[{}])[0].get('athlete',{}).get('displayName')
            next_event_dict['team_probable_starting_pitcher_id'] = team.get('probables', [{}])[0].get('athlete',{}).get('id')
        else:
            next_event_dict['opponent'] = team['team']['displayName']
            next_event_dict['opponent_espn_team_id'] = team['team']['id']
            next_event_dict['opponent_probable_starting_pitcher'] = team.get('probables',[{}])[0].get('athlete',{}).get('displayName')
            next_event_dict['opponent_probable_starting_pitcher_id'] = team.get('probables', [{}])[0].get('athlete',{}).get('id')
    next_game_df.append(next_event_dict)




    




roster_df = pd.DataFrame(roster_df)
record_df = pd.DataFrame(record_df)
next_game_df = pd.DataFrame(next_game_df)

In [17]:
# Assuming `team_df` is a DataFrame with a column 'id' containing team IDs.
team_ids = team_df['id'].to_list()
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

# Function to fetch game IDs for a given team and year
def fetch_game_ids(team_id, year):
    game_id_list = []
    game_id_url = f"https://sports.core.api.espn.com/v2/sports/baseball/leagues/mlb/seasons/{year}/teams/{team_id}/events?limit=300"
    try:
        response = requests.get(game_id_url)
        response.raise_for_status()  # Raises HTTPError for bad responses
        game_id_data = response.json().get('items', [])

        for key in game_id_data:
            game_id = key['$ref'].split('?')[0].split('/')
            game_id_list.append(game_id[-1])
    except HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Python 3.6+
    except Exception as err:
        print(f"Other error occurred: {err}")  # Python 3.6+
    return game_id_list

# Main function to use multithreading for fetching game IDs
def grab_all_game_ids():
    # Use a ThreadPoolExecutor to manage a pool of threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        # Prepare a list of all tasks (each task is a team-year combination)
        tasks = [executor.submit(fetch_game_ids, team_id, year) for team_id in team_ids for year in years]
        
        # Initialize an empty list to collect all game IDs
        all_game_ids = []
        
        # Wait for the futures to complete and collect results
        for future in concurrent.futures.as_completed(tasks):
            all_game_ids.extend(future.result())

    return all_game_ids

# Execute the main function and get all game IDs
espn_game_id_list = grab_all_game_ids()

espn_game_id_list = list(set(espn_game_id_list))

In [37]:
all_game_score_df = []
all_batting_boxscore_df = []
all_pitching_boxscore_df = []
all_roster_boxscore_df = []
all_odds_df = []
all_play_by_play_df = []

game_score = []
batting_boxscore = []
pitching_boxscore = []
roster_boxscore = []
odds_and_predictions_df = []
play_by_play_df = []

game_url = f'https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/summary?event={game_id}'
print(game_url)
game_response = requests.get(game_url)
game_data = game_response.json()

game_boxscore = game_data.get('boxscore')
game_boxscore




for team in game_boxscore['teams']:
    row = {}
    row['espn_game_id'] = game_id
    row['id'] = team.get('team',{}).get('id')
    row['Team'] = team.get('team',{}).get('displayName')
    row['abbr'] = team.get('team',{}).get('abbreviation')

    stat_dict = team['statistics']

    for batorpitch in stat_dict:
        displayName = batorpitch.get('name')

        for stat in batorpitch['stats']:
            row[f"{displayName}_{stat.get('name')}"] = stat.get('value')
    game_score.append(row)




for player in game_boxscore['players']:
    player_row = {}
    player_row['espn_game_id'] = game_id
    team_id = player.get('team',{}).get('id')
    team_name = player.get('team',{}).get('displayName')

    statistics = player['statistics']
    for stat in statistics:
        if stat['type'] == 'batting':
            batting_statistics = stat
        else:
            pitching_statistics = stat

    batting_labels = batting_statistics['names']
    pitching_labels = pitching_statistics['names']

    for athlete in batting_statistics['athletes']:
        player_row = {}
        player_row['Name'] = athlete['athlete']['displayName']
        player_row['id'] = athlete['athlete']['id']
        player_row['jersey'] = athlete.get('athlete',{}).get('jersey')
        player_row['position'] = athlete['position']['abbreviation']
        player_row['isStarter'] = athlete['starter']
        player_row['battingOrder'] = athlete['batOrder']

        for index, item in enumerate(athlete['stats']):
            player_row[batting_labels[index]] = item

        batting_boxscore.append(player_row)
    
    for athlete in pitching_statistics['athletes']:
        player_row = {}
        player_row['Name'] = athlete['athlete']['displayName']
        player_row['id'] = athlete['athlete']['id']
        player_row['jersey'] = athlete.get('athlete',{}).get('jersey')
        player_row['position'] = athlete['athlete']['position']['abbreviation']
        player_row['isStarter'] = athlete['starter']
        player_row['current_position'] = athlete['position']['abbreviation']

        for index, item in enumerate(athlete['stats']):
            player_row[pitching_labels[index]] = item


        pitching_boxscore.append(player_row)


for team in game_data['rosters']:

    #print(team)
    isHome = team['homeAway']
    isWinner = team.get('winner')
    team_name = team['team']['displayName']
    team_id = team['team']['id']
    team_abbr = team['team']['abbreviation']

    for player in team.get('roster',[]):
        player_dict = {
                'game_id' : game_id,
                'Name' : player['athlete']['displayName'],
                'player_id' : player['athlete']['id'],
                'homeOrAway' : isHome,
                'isWinner' : isWinner,
                'Team' : team_name,
                'team_id' : team_id,
                'team_abbr' : team_abbr,
                'Bats' : player['athlete']['bats']['abbreviation'],
                'Throws' : player['athlete']['throws']['abbreviation'],
                'Position' : player['position']['abbreviation'],
                'batOrder' : player['batOrder'],
                'subbedIn' : player['subbedIn'],
                'subbedOut' : player['subbedOut']
            }
        # for stat in player['stats']:
        #     player_dict[stat['abbreviation']] = stat['value']
        roster_boxscore.append(player_dict)


prediction = game_data.get('predictor')

odds = game_data.get('pickcenter')
for odd in odds:
    if odd['provider']['name']=='teamrankings':
        print(f'{game_id} good')
        game_odds = odd
    elif odd['provider']['name'] == 'consensus':
        consensus_odds = odd
if prediction:
    odds_and_predictions_df.append(
        {
            'espn_game_id' : game_id,
            'espn_away_team_win_prediction' : prediction['awayTeam']['gameProjection'],
            'espn_home_team_win_prediction' : prediction['homeTeam']['gameProjection'],
            'awayTeamOdds' : game_odds['awayTeamOdds']['moneyLine'],
            'consensus_away_team_win_prediction' : consensus_odds['awayTeamOdds']['winPercentage'],
            'awayTeamIsFavorite' : game_odds['awayTeamOdds']['favorite'],
            'homeTeamOdds' : game_odds['homeTeamOdds']['moneyLine'],
            'consensus_home_team_win_prediction' : consensus_odds['homeTeamOdds']['winPercentage'],
            'homeTeamIsFavorite' : game_odds['homeTeamOdds']['favorite'],


        }
    )


play_by_play = game_data.get('plays', [])
if len(play_by_play) > 0:
    for play in play_by_play:
        if play['type']['text'] not in ['End Inning', 'start batter/pitcher', 'Start Inning', 'Play Result', 'end batter/pitcher']:
            play_dict = {
                    'game_id' : game_id,
                    'play_type' : play['type']['text'],
                    'awayScore' : play.get('awayScore'),
                    'homeScore' : play.get('homeScore'),
                    'inningState' : play.get('period', {}).get('type'),
                    'inning' : play.get('period', {}).get('number'),
                    'isScoringPlay' : play.get('scoringPlay'),
                    'scoringValue' : play.get('scoreValue'),
                    'battingOrder' : play.get('batOrder'),
                    'batter_hands' : play.get('bats', {}).get('abbreviation'),
                    'atBatPitchNumber' : play.get('atBatPitchNumber'),
                    'pitchXCoordinate' : play.get('pitchCoordinate',{}).get('x'),
                    'pitchYCoordinate' : play.get('pitchCoordinate',{}).get('y'),
                    'pre_pitch_count_balls' : play.get('pitchCount', {}).get('balls'),
                    'pre_pitch_count_strikes' : play.get('pitchCount', {}).get('strikes'),
                    'post_pitch_count_balls' : play.get('resultCount', {}).get('balls'),
                    'post_pitch_count_strikes' : play.get('resultCount', {}).get('strikes'),

                    
                                }
            for player in play.get('participants',[]):
                if player['type'] == 'pitcher':
                    play_dict['pitcher'] = player['athlete']['id']
                elif player['type'] == 'batter':
                    play_dict['batter'] = player['athlete']['id']
                elif 'batter' not in play_dict.keys():
                    play_dict['batter'] = None
            play_by_play_df.append(play_dict)
    play_by_play_df = pd.DataFrame(play_by_play_df)


odds_and_predictions_df = pd.DataFrame(odds_and_predictions_df)
game_score = pd.DataFrame(game_score)
batting_boxscore = pd.DataFrame(batting_boxscore)
pitching_boxscore = pd.DataFrame(pitching_boxscore)
roster_boxscore = pd.DataFrame(roster_boxscore)






if len(play_by_play_df) > 0:
    batter_map = batting_boxscore.set_index('id')['Name']
    pitcher_map = pitching_boxscore.set_index('id')['Name']
    # Map 'player_id' in df_new to 'player_name' using the mapping
    play_by_play_df['batter_name'] = play_by_play_df['batter'].map(batter_map)
    play_by_play_df['pitcher_name'] = play_by_play_df['pitcher'].map(pitcher_map)

    pitch_counts = play_by_play_df.groupby(['game_id', 'inning', 'battingOrder', 'batter_name']).transform('count')['play_type']
    play_by_play_df['pitches_for_atBat'] = pitch_counts


    # Group by pitcher, game_id, and inning, and count the number of pitches
    pitch_counts_per_inning = play_by_play_df.groupby(['pitcher', 'pitcher_name', 'game_id', 'inning']).size().reset_index(name='pitch_count')

    # Determine if the pitch was a strike or a ball based on the change in counts
    play_by_play_df['pitchType'] = play_by_play_df.apply(lambda row: 'Ball' if row['post_pitch_count_balls'] > row['pre_pitch_count_balls'] 
                            else ('Strike' if row['post_pitch_count_strikes'] > row['pre_pitch_count_strikes'] 
                                    else row['play_type']), axis=1)
    
    try:
        all_play_by_play_df = pd.concat([all_play_by_play_df, play_by_play_df])
    except:
        all_play_by_play_df = play_by_play_df




try:
    all_game_score_df = pd.concat([all_game_score_df, game_score])
    all_batting_boxscore_df = pd.concat([all_batting_boxscore_df, batter_boxscore])
    all_pitching_boxscore_df = pd.concat([all_pitching_boxscore_df, pitcher_boxscore])
    all_roster_boxscore_df = pd.concat([all_roster_boxscore_df, roster_boxscore])
    all_odds_df = pd.concat([all_odds_df, odds_and_predictions_df])
except:
    all_game_score_df = game_score
    all_batting_boxscore_df = batting_boxscore
    all_pitcher_boxscores = pitching_boxscore
    all_roster_boxscore_df = roster_boxscore
    all_odds_df = odds_and_predictions_df

all_batting_boxscore_df


https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/summary?event=401076301
401076301 good


Unnamed: 0,Name,id,jersey,position,isStarter,battingOrder,H-AB,AB,R,H,RBI,HR,BB,K,#P,AVG,OBP,SLG
0,Max Kepler,31870,26,RF,True,1,0-3,3,0,0,0,0,0,0,13,0.26,0.333,0.531
1,Jorge Polanco,32525,7,SS,True,2,0-3,3,0,0,0,0,1,0,16,0.304,0.362,0.506
2,Nelson Cruz,6242,32,DH,True,3,0-3,3,0,0,0,0,1,2,19,0.282,0.375,0.605
3,Eddie Rosario,31944,8,LF,True,4,0-4,4,0,0,0,0,0,0,7,0.28,0.307,0.514
4,Mitch Garver,33667,18,C,True,5,0-4,4,0,0,0,0,0,2,17,0.285,0.371,0.632
5,Marwin Gonzalez,30327,14,1B,True,6,1-3,3,0,1,0,0,0,0,17,0.257,0.324,0.417
6,Miguel Sano,31260,22,3B,True,7,0-3,3,0,0,0,0,0,1,17,0.237,0.327,0.561
7,Jonathan Schoop,31988,7,2B,True,8,0-3,3,0,0,0,0,0,1,9,0.256,0.303,0.457
8,Byron Buxton,32655,25,CF,True,9,2-3,3,1,2,0,0,0,1,11,0.255,0.31,0.498
9,Leury Garcia,30664,63,LF,True,1,2-5,5,1,2,0,0,0,1,12,0.29,0.326,0.386


In [33]:
all_odds_df

Unnamed: 0,espn_game_id,espn_away_team_win_prediction,espn_home_team_win_prediction,awayTeamOdds,consensus_away_team_win_prediction,awayTeamIsFavorite,homeTeamOdds,consensus_home_team_win_prediction,homeTeamIsFavorite
0,401473229,60.6,39.4,-119,57.0,True,110,43.0,False


In [None]:
pitching_boxscore[pitching_boxscore['isStarter']]

odds_and_predictions_df = []

for game_id in espn_game_id_list[-10:]:
    game_url = f'https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/summary?event={game_id}'
    print(game_url)
    game_response = requests.get(game_url)
    game_data = game_response.json()
    prediction = game_data.get('predictor')

    odds = game_data.get('pickcenter')
    for odd in odds:
        if odd['provider']['name']=='teamrankings':
            print(f'{game_id} good')
            game_odds = odd
        elif odd['provider']['name'] == 'consensus':
            consensus_odds = odd
    if prediction:
        odds_and_predictions_df.append(
            {
                'espn_game_id' : game_id,
                'espn_away_team_win_prediction' : prediction['awayTeam']['gameProjection'],
                'espn_home_team_win_prediction' : prediction['homeTeam']['gameProjection'],
                'awayTeamOdds' : game_odds['awayTeamOdds']['moneyLine'],
                'consensus_away_team_win_prediction' : consensus_odds['awayTeamOdds']['winPercentage'],
                'awayTeamIsFavorite' : game_odds['awayTeamOdds']['favorite'],
                'homeTeamOdds' : game_odds['homeTeamOdds']['moneyLine'],
                'consensus_home_team_win_prediction' : consensus_odds['homeTeamOdds']['winPercentage'],
                'homeTeamIsFavorite' : game_odds['homeTeamOdds']['favorite'],


            }
        )
pd.DataFrame(odds_and_predictions_df)