In [315]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import concurrent.futures
import re
import statsapi
import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np
pd.set_option('display.max_columns', None)

In [316]:
# Download the MLB team list from ESPN's site API and save one logo URL per team.
url = 'https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/teams'
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status reports an HTTP error directly instead of letting an error
# payload fail later with an unrelated KeyError.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

mlb_teams = data['sports'][0]['leagues'][0]['teams']

# One record per team: display name plus the first entry of its logo list.
mlb_team_logo_df = []

for team in mlb_teams:
    mlb_team_logo_df.append({
        "Team" : f"{team['team']['displayName']}",
        "Logos" : team['team']['logos'][0]['href']
    })

pd.DataFrame(mlb_team_logo_df).to_csv('datasets/MLB_Team_Logos.csv', index=False)

In [317]:
# Load the saved lines table from disk (schema not visible in this notebook section).
all_lines = pd.read_csv('datasets/prize_picks_lines.csv')

In [318]:
# Schedule-level game log accumulated by earlier runs of this notebook.
basic_gamelogs = pd.read_csv('datasets/basicGameLogs.csv')
# Latest logged game date; evaluated mid-cell, so the value is not displayed.
max(set(basic_gamelogs['game_date']))

def findTeamKeys():
    """Fetch the team list from the MLB Stats API as a DataFrame.

    The endpoint is queried without any filter; callers narrow the result
    through the ``sport`` column (presumably because non-MLB teams are
    returned as well -- confirm against the API).

    Returns:
        pd.DataFrame with columns ``Team``, ``team_id``, ``team_abbr``,
        ``league``, ``division`` and ``sport``. The columns are present even
        when the API returns no teams.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    url = "https://statsapi.mlb.com/api/v1/teams/"
    response = requests.get(url, timeout=30)
    # Report an HTTP failure here rather than as a KeyError on 'teams' below.
    response.raise_for_status()

    data = response.json()
    mlbTeamKeys = [
        {
            'Team' : team.get('name'),
            'team_id' : team.get('id'),
            'team_abbr' : team.get('abbreviation'),
            'league' : team.get('league',{}).get('name'),
            'division' : team.get('division',{}).get('name'),
            'sport' : team.get('sport', {}).get('name')
        }
        for team in data['teams']
    ]
    # Explicit columns keep the frame filterable when the team list is empty.
    return pd.DataFrame(
        mlbTeamKeys,
        columns=['Team', 'team_id', 'team_abbr', 'league', 'division', 'sport'],
    )

# Pull the team table, then keep only rows whose sport is Major League Baseball.
statsApiTeamKeys = findTeamKeys()
mlbTeamKeys = statsApiTeamKeys.loc[statsApiTeamKeys['sport'] == 'Major League Baseball']


# Refresh window: from the latest date already in the game log up to now.
start_date = datetime.strptime(max(set(basic_gamelogs['game_date'])), '%Y-%m-%d')
end_date = datetime.now()

# Window boundaries 30 days apart (formatted %m/%d/%Y), closed off with end_date.
date_ranges = []
current_date = start_date
while current_date < end_date:
    date_ranges.append(current_date.strftime('%m/%d/%Y'))
    current_date = current_date + timedelta(days=30)
date_ranges.append(end_date.strftime('%m/%d/%Y'))

# Function to fetch schedules for a given team ID and date range
def fetch_schedule_for_team_date_range(team_id, start_date, end_date):
    """Return ``statsapi.schedule`` output for one team between two dates.

    Args:
        team_id: team identifier forwarded as ``team``.
        start_date: forwarded unchanged as ``start_date``.
        end_date: forwarded unchanged as ``end_date``.
    """
    return statsapi.schedule(start_date=start_date, end_date=end_date, team=team_id)

# Main function to fetch schedules using multithreading
def fetch_schedules_multithreaded(date_ranges, team_ids):
    """Fetch schedules for every (date window, team) pair on a thread pool.

    Args:
        date_ranges: ordered boundary dates; each consecutive pair forms one
            request window.
        team_ids: team identifiers to query for every window.

    Returns:
        A flat list of all rows returned by the individual requests, in
        completion order. Duplicates are not removed here.
    """
    fullSched = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        # Submit one task per window/team combination, remembering which is which.
        future_to_sched = {}
        for index in range(1, len(date_ranges)):
            for team_id in team_ids:
                task = executor.submit(
                    fetch_schedule_for_team_date_range,
                    team_id,
                    date_ranges[index - 1],
                    date_ranges[index],
                )
                future_to_sched[task] = (team_id, index)

        # Drain results as they finish; empty responses contribute nothing.
        for future in concurrent.futures.as_completed(future_to_sched):
            sched = future.result()
            if sched:
                fullSched.extend(sched)
    return fullSched

# Example usage
# Refresh the schedule for every MLB team across the stale date windows.
team_ids = mlbTeamKeys['team_id'].to_list()
fullSched = fetch_schedules_multithreaded(date_ranges, team_ids)
fullSched = pd.DataFrame(fullSched)
# With no rows returned the frame has no columns, so the column-based steps
# below would raise KeyError; skip them and fall through to the merge.
if not fullSched.empty:
    # The same game can come back from more than one request; keep one copy
    # and only games whose status is 'Final'.
    fullSched = fullSched.drop_duplicates(subset=['game_id'], keep='first')
    fullSched = fullSched[fullSched['status'] == 'Final']

# Append to the stored log, then re-apply the same rules to the combined table.
basic_gamelogs = pd.concat([basic_gamelogs, fullSched])
basic_gamelogs = basic_gamelogs.drop_duplicates(subset=['game_id'], keep='first')
basic_gamelogs = basic_gamelogs[basic_gamelogs['status'] == 'Final']
basic_gamelogs = basic_gamelogs.sort_values(by='game_date')

basic_gamelogs.to_csv('datasets/basicGameLogs.csv', index=False)

# Previously saved per-inning tables and the accumulated pitch-level table.
strikeouts_per_inning = pd.read_csv('datasets/k_per_inning.csv')
walks_per_inning = pd.read_csv('datasets/bb_per_inning.csv')
hits_allowed_per_inning = pd.read_csv('datasets/h_per_inning.csv')
all_play_by_play = pd.read_csv('datasets/all_play_by_play.csv')

In [319]:
# Accumulator for the slate's games; converted to a DataFrame once the schedules are fetched.
daySlate = []
# Slate date; its formatted value becomes the default argument of fetch_schedule below.
current_date = datetime.now()
# current_date = datetime.now() + timedelta(days=1)

def fetch_schedule(team_id, current_date=current_date.strftime('%Y-%m-%d')):
    """Return ``statsapi.schedule`` output for one team on a single date.

    The default date is computed once, when this ``def`` executes, from the
    module-level ``current_date``; re-run the definition to pick up a new date.
    """
    return statsapi.schedule(date=current_date, team=team_id)

# Request every team's schedule for the slate date in parallel.
with ThreadPoolExecutor(max_workers=30) as executor:
    future_to_team = {}
    for team in team_ids:
        future_to_team[executor.submit(fetch_schedule, team)] = team

    for future in as_completed(future_to_team):
        sched = future.result()
        for game in sched:
            # Duplicates are detected by comparing each game dict against the list so far.
            if game in daySlate:
                continue
            daySlate.append(game)

daySlate = pd.DataFrame(daySlate)
daySlate.to_csv('datasets/daySlate.csv', index = False)

In [320]:
# Previously saved per-player boxscore tables; low_memory=False makes pandas
# read each file in one pass rather than in chunks.
all_batter_boxscores = pd.read_csv('datasets/batterBoxscores.csv', low_memory=False)
all_pitcher_boxscores = pd.read_csv('datasets/pitcherBoxscores.csv', low_memory=False)

In [321]:
# Number of distinct games already present in the pitcher boxscore table.
all_pitcher_boxscores['game_id'].unique().size


21871

In [322]:
# Games with game_type 'R' (presumably regular season, per the variable name) in the log.
regular_games = basic_gamelogs[basic_gamelogs['game_type'] == 'R']
game_id_list = regular_games['game_id'].to_list()
# Hash the stored ids once so each membership test is O(1), instead of
# recomputing .unique() and scanning the array for every candidate.
known_game_ids = set(all_pitcher_boxscores['game_id'].unique())
# Logged games with no pitcher boxscore rows yet, in log order.
difference = [item for item in game_id_list if item not in known_game_ids]

In [323]:
# Number of distinct game ids that still need boxscores.
len(set(difference))
#difference[-1]

31

In [324]:
# Schedule rows for the games whose boxscores are still missing.
updateSched = basic_gamelogs.loc[basic_gamelogs['game_id'].isin(difference)]

In [325]:





# From here on game_id_list means "games still to fetch" (same list object as difference).
game_id_list = difference
#game_id_list = list(set(game_id_list))


def fetch_and_process_game_data(game_id, game_date):
    """Fetch one game's boxscore and split it into batter and pitcher tables.

    Up to three attempts are made; any exception triggers a retry after an
    exponentially growing pause (1s, then 2s).

    Args:
        game_id: identifier passed to ``statsapi.boxscore_data``.
        game_date: value stored in the ``date`` column of both tables.

    Returns:
        ``(batter_boxscore_df, pitcher_boxscore_df)``. Every row is a boxscore
        line prefixed with ``Team``/``team_id`` and extended with ``game_id``,
        ``date`` and ``Name``; tables that carry ``doubles``, ``triples``,
        ``hr`` and ``h`` also get ``TB``. Two empty DataFrames are returned
        when all attempts fail.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            boxscore = statsapi.boxscore_data(game_id, timecode=None)

            # (display name, id) per side, keyed by the prefix of the boxscore lists.
            teams = {}
            for side in ('away', 'home'):
                info = boxscore['teamInfo'][side]
                teams[side] = (f"{info['shortName']} {info['teamName']}", info['id'])

            def tagged_rows(role):
                # Tag each line by the list it comes from instead of testing
                # membership in the away list, which was quadratic and relied
                # on dict equality. Element 0 of each list is skipped
                # (presumably a header row -- confirm against statsapi).
                rows = []
                for side in ('away', 'home'):
                    team_name, team_id = teams[side]
                    for line in boxscore[side + role][1:]:
                        rows.append({'Team': team_name, 'team_id': team_id, **line})
                return rows

            batter_boxscore_df = pd.DataFrame(tagged_rows('Batters'))
            pitcher_boxscore_df = pd.DataFrame(tagged_rows('Pitchers'))

            for df in [batter_boxscore_df, pitcher_boxscore_df]:
                df['game_id'] = game_id
                df['date'] = game_date

                # Total bases, only where all four hit columns exist.
                if {'doubles', 'triples', 'hr', 'h'}.issubset(df.columns):
                    doubles = df['doubles'].astype(int)
                    triples = df['triples'].astype(int)
                    home_runs = df['hr'].astype(int)
                    hits = df['h'].astype(int)
                    df['TB'] = (doubles * 2 +
                                triples * 3 +
                                home_runs * 4 +
                                (hits - doubles - triples - home_runs))

            # personId -> full name lookup built from the boxscore's player registry.
            id_df = pd.DataFrame([
                {'personId': info['id'], 'Name': info['fullName']}
                for info in boxscore['playerInfo'].values()
            ])

            batter_boxscore_df = batter_boxscore_df.merge(id_df, how='left', on='personId')
            pitcher_boxscore_df = pitcher_boxscore_df.merge(id_df, how='left', on='personId')

            return batter_boxscore_df, pitcher_boxscore_df
        except Exception as e:
            print(f'Error processing game ID {game_id}: {e} on attempt {attempt}')
            # Back off only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2**attempt)

    # Every attempt failed: return empty frames so callers can still unpack the result.
    print(f"Failed to fetch data after maximum attempts for {game_id}")
    return pd.DataFrame(), pd.DataFrame()





def process_all_games(df):
    """Fetch boxscores for every game in ``df`` concurrently and stack them.

    Args:
        df: DataFrame whose rows provide ``game_id`` and ``game_date``.

    Returns:
        ``(batter_boxscores, pitcher_boxscores)`` with a fresh 0..n-1 index.
        Either frame is an empty DataFrame when nothing was fetched.
    """
    batter_frames = []
    pitcher_frames = []

    with ThreadPoolExecutor(max_workers=200) as executor:
        futures = [executor.submit(fetch_and_process_game_data, row['game_id'], row['game_date']) for index, row in df.iterrows()]

        for future in as_completed(futures):
            try:
                batter_boxscore, pitcher_boxscore = future.result()
            except Exception as e:
                # Keep going: one failed game must not discard the rest.
                print(f'An error occurred: {e}')
                continue
            batter_frames.append(batter_boxscore)
            pitcher_frames.append(pitcher_boxscore)

    def stack(frames):
        # Concatenate once at the end; growing a frame inside the loop
        # re-copies every accumulated row on each iteration.
        non_empty = [frame for frame in frames if not frame.empty]
        if not non_empty:
            return pd.DataFrame()
        return pd.concat(non_empty, ignore_index=True)

    return stack(batter_frames), stack(pitcher_frames)

batter_boxscores, pitcher_boxscores = process_all_games(updateSched)

# When no new game was fetched the returned frames have no columns, so merging
# on 'game_id' would raise KeyError. Enrich and append only when rows exist.
if not batter_boxscores.empty:
    # Attach the schedule columns (teams, scores, probable pitchers, ...) to each line.
    batter_boxscores = batter_boxscores.merge(updateSched, how='left', on='game_id')
    batter_boxscores['isWinner'] = batter_boxscores['Team'] == batter_boxscores['winning_team']
    # Flag rows whose battingOrder is an exact multiple of 100.
    batter_boxscores['isStarter'] = (batter_boxscores['battingOrder'].astype(int) % 100 == 0)
    all_batter_boxscores = pd.concat([all_batter_boxscores,batter_boxscores])

if not pitcher_boxscores.empty:
    pitcher_boxscores = pitcher_boxscores.merge(updateSched,how='left', on='game_id')
    pitcher_boxscores['isWinner'] = pitcher_boxscores['Team'] == pitcher_boxscores['winning_team']
    all_pitcher_boxscores = pd.concat([all_pitcher_boxscores, pitcher_boxscores])

# One row per (game, player); the stored row wins over a re-fetched one.
all_batter_boxscores = all_batter_boxscores.drop_duplicates(subset=['game_id', 'Name', 'winning_pitcher'], keep='first')
all_pitcher_boxscores = all_pitcher_boxscores.drop_duplicates(subset=['game_id', 'Name', 'winning_pitcher'], keep='first')


In [326]:
def calculatePitcherFantasyScore(row):
    """Score one pitcher line from its boxscore fields.

    Weights: 6 for a win (``PitcherIsWinBool``), 4 for a quality start
    (at least 18 outs with at most 3 earned runs), -3 per earned run,
    3 per strikeout and 1 per out recorded.
    """
    outs = row['pitching_outs']
    earned_runs = row['er']
    qualityStart = 1 if (outs >= 18 and earned_runs <= 3) else 0
    return (row['PitcherIsWinBool'] * 6) + (qualityStart * 4) + (earned_runs * -3) + (row['k'] * 3) + (outs * 1)


In [327]:
# Alias, not a copy: column assignments made through complex_pitcher_box_scores
# also appear on all_pitcher_boxscores until the name is rebound.
complex_pitcher_box_scores = all_pitcher_boxscores
def innings_to_outs(ip):
    """Convert an innings-pitched value into a count of outs.

    The integer part counts full innings (3 outs each); the first decimal
    digit counts additional outs, so 5.2 becomes 5 * 3 + 2 = 17.
    """
    innings = float(ip)
    full_innings = int(innings // 1)
    # Scale the fractional part so .1 -> 1 and .2 -> 2; round() absorbs float noise.
    extra_outs = round((innings % 1) * 10)
    return full_innings * 3 + extra_outs
def correct_team_name(row):
    """Return the schedule's name for the row's team, matched by id.

    Falls back to the row's existing ``Team`` value when ``team_id`` matches
    neither ``away_id`` nor ``home_id``.
    """
    team_id = row['team_id']
    if team_id == row['away_id']:
        return row['away_name']
    if team_id == row['home_id']:
        return row['home_name']
    return row['Team']
    
def find_opponent(row):
    """Return the other team's name: the home name for the away side, otherwise the away name."""
    is_away_side = row['team_id'] == row['away_id']
    return row['home_name'] if is_away_side else row['away_name']

def find_opposing_pitcher(row):
    """Return the probable pitcher of the other side (home's for the away team, else away's)."""
    is_away_side = row['team_id'] == row['away_id']
    return row['home_probable_pitcher'] if is_away_side else row['away_probable_pitcher']

# Same object as all_batter_boxscores: the assignments below also show up there.
complex_batter_box_scores = all_batter_boxscores
for derived_column, row_function in (
    ('Team', correct_team_name),
    ('Opponent', find_opponent),
    ('Opposing_Pitcher', find_opposing_pitcher),
):
    complex_batter_box_scores[derived_column] = complex_batter_box_scores.apply(row_function, axis=1)

# Counting stats must be integers before they are summed or averaged.
for stat in ('k', 'bb', 'h', 'r'):
    complex_batter_box_scores[stat] = complex_batter_box_scores[stat].astype(int)

for stat in ('k', 'bb', 'h', 'p', 'er'):
    complex_pitcher_box_scores[stat] = complex_pitcher_box_scores[stat].astype(int)

complex_pitcher_box_scores['Team'] = complex_pitcher_box_scores.apply(correct_team_name, axis=1)
complex_pitcher_box_scores['Opponent'] = complex_pitcher_box_scores.apply(find_opponent, axis=1)
complex_pitcher_box_scores['isWinner'] = complex_pitcher_box_scores['Team'] == complex_pitcher_box_scores['winning_team']
# A pitcher counts as the starter when named as either side's probable pitcher.
complex_pitcher_box_scores['isStarter'] = complex_pitcher_box_scores.apply(lambda row: row['Name'] in [row['home_probable_pitcher'], row['away_probable_pitcher']], axis=1)
complex_pitcher_box_scores['pitcherIsWinner'] = complex_pitcher_box_scores.apply(lambda row: row['Name'] == row['winning_pitcher'], axis=1)

# Per-pitcher mean of the three preceding rows (the current row is excluded
# by shift(1)); computed in the frame's current row order.
for ma_column, stat in (
    ('pitch_count_MA3', 'p'),
    ('strikeout_MA3', 'k'),
    ('walks_MA3', 'bb'),
    ('h_MA3', 'h'),
):
    complex_pitcher_box_scores[ma_column] = round(
        complex_pitcher_box_scores.groupby('personId')[stat].transform(lambda x: x.shift(1).rolling(window=3).mean()),
        2,
    )

complex_pitcher_box_scores['pitching_outs'] = complex_pitcher_box_scores['ip'].apply(innings_to_outs).astype(int)

# Season label: the calendar year of the game date.
complex_batter_box_scores['seasonNumber'] = pd.to_datetime(complex_batter_box_scores['game_date']).dt.year
complex_pitcher_box_scores['seasonNumber'] = pd.to_datetime(complex_pitcher_box_scores['game_date']).dt.year

# Game score: 40 + 2*outs + K - 2*BB - 2*H - 3*ER.
complex_pitcher_box_scores['game_score'] = (
    40
    + 2 * complex_pitcher_box_scores['pitching_outs']
    + complex_pitcher_box_scores['k']
    - 2 * complex_pitcher_box_scores['bb']
    - 2 * complex_pitcher_box_scores['h']
    - 3 * complex_pitcher_box_scores['er']
)

def categorize_performance(score):
    """Map a game score to a label using descending cut-offs at 80, 60, 40 and 20.

    Anything below 20 (or not comparable as >= any cut-off, e.g. NaN) is 'Poor'.
    """
    bands = (
        (80, 'Excellent'),
        (60, 'Good'),
        (40, 'Average'),
        (20, 'Below Average'),
    )
    for floor, label in bands:
        if score >= floor:
            return label
    return 'Poor'

complex_pitcher_box_scores['performance_category'] = complex_pitcher_box_scores['game_score'].apply(categorize_performance)


# For each stat below: total it per (game, team), merge the total back onto
# every batter row, compute the batter's share of the total, then take the
# per-batter mean of the three preceding shares. The merges use the default
# inner join. Shares for k, bb and h are not guarded against a zero team total.
team_strikeouts = complex_batter_box_scores.groupby(['game_id', 'Team'])['k'].sum().reset_index()
team_strikeouts = team_strikeouts.rename(columns={'k': 'team_strikeouts'})
complex_batter_box_scores = complex_batter_box_scores.merge(team_strikeouts, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_strikeouts'] = complex_batter_box_scores['k'] / complex_batter_box_scores['team_strikeouts']
complex_batter_box_scores['average_team_strikeouts_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_strikeouts'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)


team_walks = complex_batter_box_scores.groupby(['game_id', 'Team'])['bb'].sum().reset_index()
team_walks = team_walks.rename(columns={'bb': 'team_walks'})
complex_batter_box_scores = complex_batter_box_scores.merge(team_walks, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_walks'] = complex_batter_box_scores['bb'] / complex_batter_box_scores['team_walks']
complex_batter_box_scores['average_team_walks_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_walks'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)

team_hits = complex_batter_box_scores.groupby(['game_id', 'Team'])['h'].sum().reset_index()
team_hits = team_hits.rename(columns={'h': 'team_hits'})
complex_batter_box_scores = complex_batter_box_scores.merge(team_hits, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_hits'] = complex_batter_box_scores['h'] / complex_batter_box_scores['team_hits']
complex_batter_box_scores['average_team_hits_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_hits'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)

team_runs = complex_batter_box_scores.groupby(['game_id', 'Team'])['r'].sum().reset_index()
team_runs = team_runs.rename(columns={'r': 'team_runs'})
complex_batter_box_scores = complex_batter_box_scores.merge(team_runs, on=['game_id', 'Team'])
print(complex_batter_box_scores['team_runs'].iloc[0])
# The guard must be evaluated per row: a scalar test on the first row alone
# would apply one branch to the whole column, leaving NaN where a team scored
# no runs (or zeroing every share when the first row's team was shut out).
complex_batter_box_scores['average_of_team_runs'] = np.where(
    complex_batter_box_scores['team_runs'] != 0,
    complex_batter_box_scores['r'] / complex_batter_box_scores['team_runs'],
    0  # Or np.nan, depending on how you want to handle division by zero
)
complex_batter_box_scores['average_team_runs_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_runs'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)

# Integer-cast every batting stat that feeds the sums below.
for stat in ('h', 'hr', 'doubles', 'triples', 'r', 'rbi', 'bb', 'sb'):
    complex_batter_box_scores[stat] = complex_batter_box_scores[stat].astype(int)

# Pitcher score needs the win flag as 0/1.
complex_pitcher_box_scores['PitcherIsWinBool'] = complex_pitcher_box_scores['pitcherIsWinner'].astype(int)
complex_pitcher_box_scores['Pitcher Fantasy Score'] = complex_pitcher_box_scores.apply(calculatePitcherFantasyScore, axis=1)

# Singles are the hits that were not extra-base hits.
complex_batter_box_scores['singles'] = (
    complex_batter_box_scores['h']
    - complex_batter_box_scores['hr']
    - complex_batter_box_scores['doubles']
    - complex_batter_box_scores['triples']
)
complex_batter_box_scores['H+R+RBIs'] = (
    complex_batter_box_scores['h']
    + complex_batter_box_scores['r']
    + complex_batter_box_scores['rbi']
)

# Points awarded per event for the batter fantasy score.
batter_fantasy_weights = {
    'singles': 3,
    'doubles': 5,
    'triples': 8,
    'hr': 10,
    'r': 2,
    'rbi': 2,
    'bb': 2,
    'sb': 5,
}
complex_batter_box_scores['Batter Fantasy Score'] = sum(
    complex_batter_box_scores[stat] * points
    for stat, points in batter_fantasy_weights.items()
)




# Columns carried into the bullpen (non-starter) view.
bullpen_columns = [
    'Team', 'team_id', 'ip', 'h', 'r', 'er', 'bb', 'k', 'hr', 'p', 's', 'era',
    'personId', 'game_id', 'date', 'Name', 'game_date', 'away_name', 'home_name',
    'away_id', 'home_id', 'doubleheader', 'home_probable_pitcher',
    'away_probable_pitcher', 'away_score', 'home_score', 'winning_team',
    'losing_team', 'winning_pitcher', 'losing_pitcher', 'save_pitcher',
    'isWinner', 'pitching_outs', 'game_score', 'PitcherIsWinBool',
]
is_reliever = complex_pitcher_box_scores['isStarter'] == False
bullpen_boxscores = complex_pitcher_box_scores[is_reliever][bullpen_columns]

# Per (team, game) bullpen totals. NOTE(review): 'ip' is summed as stored,
# not converted to outs first -- confirm before using it in a rate stat.
grouped = (
    bullpen_boxscores
    .groupby(['Team', 'game_id'])
    .agg({'er':'sum', 'ip':'sum', 'bb':'sum', 'h': 'sum'})
    .reset_index()
)

# grouped['bullpen_WHIP'] = round((grouped['bb'].astype(int) + grouped['h'].astype(int)) / grouped['ip'],3)
# grouped['bullpen_ERA'] = round((grouped['er'].astype(int)/ grouped['ip']) * 9,3)
# complex_pitcher_box_scores = complex_pitcher_box_scores.merge(grouped[['Team', 'game_id', 'bullpen_ERA', 'bullpen_WHIP']], on=['Team', 'game_id'])

# Parse game_date so the rows can be ordered and differenced as dates.
complex_pitcher_box_scores['game_date'] = pd.to_datetime(complex_pitcher_box_scores['game_date'])


# Order each pitcher's appearances chronologically, then measure the gap in
# days to his previous appearance within the same seasonNumber (NaN for the
# first appearance of each group).
# NOTE(review): the *_MA3 columns were computed before this sort, so they use
# the pre-sort row order -- confirm that order was already chronological.
complex_pitcher_box_scores = complex_pitcher_box_scores.sort_values(by=['personId', 'game_date'])
complex_pitcher_box_scores['days_between_games'] = complex_pitcher_box_scores.groupby(['personId','seasonNumber'])['game_date'].diff().dt.days




# Keep only the time-of-day portion of game_datetime as the starting time.
complex_pitcher_box_scores['game_datetime'] = pd.to_datetime(complex_pitcher_box_scores['game_datetime'])
complex_pitcher_box_scores['starting_time'] = complex_pitcher_box_scores['game_datetime'].dt.time


# Save both enriched tables.
complex_pitcher_box_scores.to_csv('datasets/complex_pitchers.csv', index=False)
complex_batter_box_scores.to_csv('datasets/complex_batters.csv', index=False)

9


In [328]:
# Work on an explicit copy: adding a column to a filtered slice triggers
# SettingWithCopyWarning and leaves it unclear which frame was modified.
starting_pitchers = complex_pitcher_box_scores[complex_pitcher_box_scores['isStarter']].copy()

# NOTE(review): the flag is named "more than MA3" but the comparison is
# against MA3 - 3 -- confirm which threshold is intended.
starting_pitchers['k is more than MA3'] = (starting_pitchers['k'] > starting_pitchers['strikeout_MA3'] - 3)
# Rows lacking any of the four values (e.g. no MA3 yet) are dropped before counting.
res = starting_pitchers[['Name', 'k','strikeout_MA3', 'k is more than MA3']].dropna()['k is more than MA3'].value_counts()
res[True]


np.int64(36526)

In [329]:

#for game_id in temp:
def _pitch_records(game_id, all_plays):
    """Flatten play-by-play plays into one dict per pitch event."""
    records = []
    for play in all_plays:
        matchup = play.get('matchup', {})
        about = play.get('about', {})
        result = play.get('result', {})
        batter = matchup.get('batter', {})
        pitcher = matchup.get('pitcher', {})

        # Missing keys are tolerated: a play without events, or an event
        # without an isPitch flag, simply contributes no rows.
        for pitch in play.get('playEvents', []):
            if not pitch.get('isPitch'):
                continue

            details = pitch.get('details', {})
            count = pitch.get('count', {})
            pitch_data = pitch.get('pitchData', {})

            # Key order defines the column order of the resulting table.
            record = {
                'game_ID' : game_id,
                'atBatIndex' : about.get('atBatIndex'),
                'atBatResult' : result.get('event'),
                'halfInning' : about.get('halfInning'),
                'inning' : about.get('inning'),
                'batter_id' : batter.get('id'),
                'pitcher_id' : pitcher.get('id'),
                'batter_name' : batter.get('fullName'),
                'pitcher_name' : pitcher.get('fullName'),
                'batter_hand' : matchup.get('batSide', {}).get('code'),
                'pitcher_hand' : matchup.get('pitchHand', {}).get('code'),
                'play_description' : details.get('description'),
                'isInPlay' : details.get('isInPlay'),
                'isStrike' : details.get('isStrike'),
                'isBall' : details.get('isBall'),
                'pitchNumber' : pitch.get('pitchNumber'),
                'pitch_type' : details.get('type', {}).get('description'),
                'isOut' : details.get('isOut'),
                'current_balls' : count.get('balls'),
                'current_strikes' : count.get('strikes'),
                'current_outs' : count.get('outs'),
                'pitchStartSpeed' : pitch_data.get('startSpeed'),
                'pitchEndSpeed' : pitch_data.get('endSpeed'),
                'pitchStrikeZoneTop' : pitch_data.get('strikeZoneTop'),
                # Key spelling kept as-is: it is the stored column name.
                'pitchStikeZoneBottom' : pitch_data.get('strikeZoneBottom'),
                'pitchZone' : pitch_data.get('zone'),
                'pitchTypeConfidence' : pitch_data.get('typeConfidence'),
                'isScoringPlay' : about.get('isScoringPlay'),
                'rbiOnPlay' : result.get('rbi'),
                'home_score' : result.get('homeScore'),
                'away_score' : result.get('awayScore')
            }
            # Every coordinate and break measurement becomes its own column.
            record.update(pitch_data.get('coordinates', {}))
            record.update(pitch_data.get('breaks', {}))
            records.append(record)
    return records


def _add_pitcher_counters(play_by_play):
    """Add per-pitcher running counts and the at-bat outcome category in place."""
    play_by_play['pitchesThrownByCurrentPitcher'] = play_by_play.groupby('pitcher_id').cumcount() + 1

    # Running ball/strike totals are written only on the rows that are balls
    # or strikes; every other row keeps 0.
    play_by_play['cumulativeBalls'] = 0
    play_by_play['cumulativeStrikes'] = 0
    ball_indices = play_by_play[play_by_play['isBall'] == True].index
    strike_indices = play_by_play[play_by_play['isStrike'] == True].index
    play_by_play.loc[ball_indices, 'cumulativeBalls'] = play_by_play.loc[ball_indices].groupby('pitcher_id').cumcount() + 1
    play_by_play.loc[strike_indices, 'cumulativeStrikes'] = play_by_play.loc[strike_indices].groupby('pitcher_id').cumcount() + 1

    # Each pitch is labelled with the outcome of the at-bat it belongs to.
    outcome_category = {
        'Strikeout': 'Strikeout',
        'Single': 'Hit',
        'Double': 'Hit',
        'Triple': 'Hit',
        'Home Run': 'Hit',
    }
    play_by_play['stat_category'] = play_by_play['atBatResult'].map(outcome_category).fillna('Other')
    return play_by_play


def grabMatrices(game_id):
    """Download one game's play-by-play and return it as one row per pitch.

    Up to five attempts are made; any exception triggers a retry after an
    exponentially growing pause (1s, 2s, 4s, 8s).

    Args:
        game_id: value sent as ``gamePk`` to the ``game_playByPlay`` endpoint.

    Returns:
        A DataFrame of pitch rows with per-pitcher counters and a
        ``stat_category`` column. An empty DataFrame is returned when the game
        has no pitch events or when every attempt fails.
    """
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            play_by_play = statsapi.get('game_playByPlay', {'gamePk' : game_id})
            records = _pitch_records(game_id, play_by_play['allPlays'])
            # A game without pitch rows has no 'pitcher_id' column to group
            # on; return the empty result directly rather than failing and
            # retrying a deterministic condition.
            if not records:
                return pd.DataFrame()
            return _add_pitcher_counters(pd.DataFrame(records))
        except Exception as e:
            print(f'Error processing game ID {game_id}: {e} on attempt {attempt}')
            # Back off only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2**attempt)

    print(f"Failed to fetch data after maximum attempts for {game_id}")
    return pd.DataFrame()
        



def process_games(game_ids, max_workers=250):
    """Fetch play-by-play data for many games concurrently and stack the results.

    Parameters
    ----------
    game_ids : iterable
        Game identifiers; each one is passed to ``grabMatrices`` on a worker thread.
    max_workers : int, optional
        Size of the thread pool. Defaults to 250, the value previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The per-game frames concatenated in submission order, or an empty
        DataFrame when there are no games or every game came back empty.

    Raises
    ------
    Exception
        Whatever ``grabMatrices`` raised for a game is re-raised by ``future.result()``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(grabMatrices, game_id) for game_id in game_ids]
        # Iterate the futures in submission order so the row order is deterministic.
        frames = [future.result() for future in futures]

    # Skip placeholders (None or empty frames) so they cannot influence the result,
    # and concatenate once instead of re-copying the accumulated frame per game.
    frames = [frame for frame in frames if frame is not None and not frame.empty]
    if not frames:
        return pd.DataFrame([])
    return pd.concat(frames)


In [330]:
# Fetch play-by-play rows for the pending games, append them to the running
# table, and drop exact duplicate rows (keeping the first occurrence).
play_by_play = process_games(game_id_list)
all_play_by_play = pd.concat([all_play_by_play, play_by_play]).drop_duplicates(keep='first')


In [331]:
# Persist the merged play-by-play table to CSV (row index not written).
all_play_by_play.to_csv('datasets/all_play_by_play.csv', index=False)

In [332]:
# Reload the play-by-play table from its CSV file, replacing the in-memory frame.
all_play_by_play = pd.read_csv('datasets/all_play_by_play.csv')

In [333]:
# Distinct game IDs present in the play-by-play table.
game_id_list = all_play_by_play['game_ID'].unique()

# Load the saved per-inning pitcher tables, one CSV per statistic
# (read in the same order as the names on the left).
(
    pitcher_runs_per_inning,
    pitcher_k_per_inning,
    pitcher_pitches_per_inning,
    pitcher_hits_per_inning,
    pitcher_walks_per_inning,
    pitcher_singles_per_inning,
    pitcher_doubles_per_inning,
    pitcher_triples_per_inning,
    pitcher_homeRuns_per_inning,
) = map(pd.read_csv, (
    'datasets/r_per_inning.csv',
    'datasets/k_per_inning.csv',
    'datasets/pitches_per_inning.csv',
    'datasets/h_per_inning.csv',
    'datasets/bb_per_inning.csv',
    'datasets/singles_per_inning.csv',
    'datasets/doubles_per_inning.csv',
    'datasets/triples_per_inning.csv',
    'datasets/hr_per_inning.csv',
))

In [334]:
# Games that have play-by-play rows but no row yet in the runs-per-inning table.
games_with_plays = set(all_play_by_play['game_ID'].to_list())
games_already_aggregated = set(pitcher_runs_per_inning['game_id'].to_list())
game_id_list = games_with_plays - games_already_aggregated


In [335]:






# Build one row per (game, pitcher) for each of the nine per-inning tables.
# Every row holds the identifying columns, the values for innings '1'..'9'
# (string keys) and a 'Total' equal to the sum of those nine values.
#
# Rows are buffered in plain lists and appended to each table once, after the
# loop: concatenating inside the loop re-copies the whole table per pitcher.

def _append_stat_rows(existing, rows):
    """Return `existing` with `rows` (a list of dicts) appended and exact duplicates removed (last kept)."""
    if rows:
        existing = pd.concat([existing, pd.DataFrame(rows)])
    return existing.drop_duplicates(keep='last')


# One buffer of new rows per table.
new_stat_rows = {
    'runs': [], 'k': [], 'pc': [], 'h': [], 'bb': [],
    'single': [], 'double': [], 'triple': [], 'hr': [],
}

# Tables that are a plain count of one 'atBatResult' value.
at_bat_result_by_stat = {
    'k': 'Strikeout',
    'bb': 'Walk',
    'single': 'Single',
    'double': 'Double',
    'triple': 'Triple',
    'hr': 'Home Run',
}

for game_id in game_id_list:

    # Keep the last row of every at-bat, then only innings 1-9.
    sample_game = all_play_by_play[all_play_by_play['game_ID'] == game_id].drop_duplicates(subset=['atBatIndex'], keep='last')
    sample_game = sample_game[sample_game['inning'] <= 9]
    # Last row per (inning, pitcher): its 'pitchesThrownByCurrentPitcher' is the
    # running pitch count used to derive per-inning pitch counts below.
    pc = sample_game.drop_duplicates(subset=['inning', 'pitcher_name'], keep='last')

    for pitcher_name in sample_game['pitcher_name'].unique():
        # Filter once per pitcher instead of once per statistic and inning.
        pitcher_rows = sample_game[sample_game['pitcher_name'] == pitcher_name]
        pitcher_pc = pc[pc['pitcher_name'] == pitcher_name]

        # A pitcher whose first appearance is in a 'top' half is labelled 'home'.
        if pitcher_rows['halfInning'].iloc[0] == 'top':
            homeAway = 'home'
        else:
            homeAway = 'away'

        pitcher_dict = {
            'game_id' : game_id,
            'Name' : pitcher_name,
            'pitcher_id' : pitcher_rows['pitcher_id'].iloc[0],
            'homeOrAway' : homeAway
            }

        # Independent copy of the identifying columns for each table's row.
        stat_rows = {stat_name: pitcher_dict.copy() for stat_name in new_stat_rows}

        last_inning = 0
        for inning in range(1, 10):
            inning_key = str(inning)
            inning_rows = pitcher_rows[pitcher_rows['inning'] == inning]

            # NOTE(review): the "runs" table is the sum of 'rbiOnPlay' - confirm that is the intended measure.
            stat_rows['runs'][inning_key] = inning_rows['rbiOnPlay'].sum()
            stat_rows['h'][inning_key] = int((inning_rows['stat_category'] == 'Hit').sum())
            for stat_name, at_bat_result in at_bat_result_by_stat.items():
                stat_rows[stat_name][inning_key] = int((inning_rows['atBatResult'] == at_bat_result).sum())

            # Pitches in this inning = running count at the end of this inning
            # minus the running count at the end of the previous inning.
            pitches_through_inning = pitcher_pc[pitcher_pc['inning'] == inning]['pitchesThrownByCurrentPitcher'].sum()
            if pitches_through_inning > 0:
                stat_rows['pc'][inning_key] = pitches_through_inning - last_inning
            else:
                stat_rows['pc'][inning_key] = 0
            last_inning = pitches_through_inning

        for stat_name, stat_row in stat_rows.items():
            stat_row['Total'] = sum(stat_row[str(inning_number)] for inning_number in range(1, 10))
            new_stat_rows[stat_name].append(stat_row)

pitcher_runs_per_inning = _append_stat_rows(pitcher_runs_per_inning, new_stat_rows['runs'])
pitcher_k_per_inning = _append_stat_rows(pitcher_k_per_inning, new_stat_rows['k'])
pitcher_pitches_per_inning = _append_stat_rows(pitcher_pitches_per_inning, new_stat_rows['pc'])
pitcher_hits_per_inning = _append_stat_rows(pitcher_hits_per_inning, new_stat_rows['h'])
pitcher_walks_per_inning = _append_stat_rows(pitcher_walks_per_inning, new_stat_rows['bb'])
pitcher_singles_per_inning = _append_stat_rows(pitcher_singles_per_inning, new_stat_rows['single'])
pitcher_doubles_per_inning = _append_stat_rows(pitcher_doubles_per_inning, new_stat_rows['double'])
pitcher_triples_per_inning = _append_stat_rows(pitcher_triples_per_inning, new_stat_rows['triple'])
pitcher_homeRuns_per_inning = _append_stat_rows(pitcher_homeRuns_per_inning, new_stat_rows['hr'])




In [336]:
# Swap the 'home' and 'away' labels in the home-run table.
# NOTE(review): this flips every row of the table, including rows that were read
# back from hr_per_inning.csv, so labels saved by an earlier run get flipped again
# on the next run. None of the other per-inning tables receive this swap -
# confirm whether this cell is a leftover one-off correction.
pitcher_homeRuns_per_inning['homeOrAway'] = pitcher_homeRuns_per_inning['homeOrAway'].replace({'home': 'away', 'away': 'home'})


In [337]:
# Write every per-inning pitcher table back to its CSV file (row index not written).
for per_inning_table, per_inning_csv in (
    (pitcher_pitches_per_inning, 'datasets/pitches_per_inning.csv'),
    (pitcher_runs_per_inning, 'datasets/r_per_inning.csv'),
    (pitcher_k_per_inning, 'datasets/k_per_inning.csv'),
    (pitcher_hits_per_inning, 'datasets/h_per_inning.csv'),
    (pitcher_walks_per_inning, 'datasets/bb_per_inning.csv'),
    (pitcher_singles_per_inning, 'datasets/singles_per_inning.csv'),
    (pitcher_doubles_per_inning, 'datasets/doubles_per_inning.csv'),
    (pitcher_triples_per_inning, 'datasets/triples_per_inning.csv'),
    (pitcher_homeRuns_per_inning, 'datasets/hr_per_inning.csv'),
):
    per_inning_table.to_csv(per_inning_csv, index=False)

In [338]:
# Collect the id and display name of every team listed on the two pages of
# the ESPN core-API team index, following each item's '$ref' link.
team_records = []

for page_url in (
    "https://sports.core.api.espn.com/v2/sports/baseball/leagues/mlb/teams",
    "https://sports.core.api.espn.com/v2/sports/baseball/leagues/mlb/teams?page=2",
):
    team_data = requests.get(page_url).json()['items']

    for key in team_data:
        # Each index item only carries a link; the team details live behind it.
        team_url = key['$ref']
        team_info = requests.get(team_url).json()
        team_records.append({
            'id' : team_info['id'],
            'Team' : team_info['displayName']
        })

team_id_df = pd.DataFrame(team_records)


In [339]:
# Accumulators for the three tables built from each team's ESPN payload:
# one row per rostered athlete, one row per team record, one row per next game.
roster_df = []
record_df = []
next_game_df = []

# NOTE(review): this resets game_id_list, which this cell never reads - confirm it is still needed.
game_id_list = []


def _probable_pitcher(competitor):
    """Return (displayName, id) of a competitor's first probable pitcher, or (None, None).

    Handles both a missing 'probables' key and a present-but-empty list.
    """
    probables = competitor.get('probables') or [{}]
    athlete = probables[0].get('athlete', {})
    return athlete.get('displayName'), athlete.get('id')


for team_id in team_id_df['id'].to_list():

    team_url = f'https://site.api.espn.com/apis/site/v2/sports/baseball/mlb/teams/{team_id}?enable=roster,projection,stats'
    team_response = requests.get(team_url)
    team_data = team_response.json()

    team_name = team_data['team']['displayName']
    team_abbr = team_data['team']['abbreviation']
    # From here on use the id reported by the payload itself.
    team_id = team_data['team']['id']
    team_logo = team_data['team']['logos'][0]['href']

    # --- Roster: one row per athlete -------------------------------------
    for athlete in team_data['team']['athletes']:
        roster_df.append(
            {
                'Team' : team_name,
                'Team_Logo' : team_logo,
                'Abbreviation' : team_abbr,
                'espn_team_id' : team_id,
                'espn_player_id' : athlete.get('id'),
                'Name' : athlete.get('displayName'),
                'weight' : athlete.get('weight'),
                'height' : athlete.get('height'),
                'age' : athlete.get('age'),
                'rookieYear' : athlete.get('debutYear'),
                'jersey' : athlete.get('jersey'),
                'position' : athlete.get('position', {}).get('abbreviation'),
                'experience' : athlete.get('experience', {}).get('years'),
                'isActive' : athlete.get('active'),
                'Bats' : athlete.get('bats', {}).get('abbreviation'),
                'Throws' : athlete.get('throws', {}).get('abbreviation'),
                'Injuries' : athlete.get('injuries'),
                'headShot' : athlete.get('headshot',{}).get('href')

            }
        )

    # --- Record: flatten every stat as '<record type>_<stat name>' --------
    record_data = team_data['team']['record']
    team_dict = {}
    team_dict['Team'] = team_name
    team_dict['Abbreviation'] = team_abbr
    team_dict['espn_team_id'] = team_id
    for record_type in record_data['items']:
        game_type = record_type['type']
        for stat in record_type['stats']:
            team_dict[f"{game_type}_{stat['name']}"] = stat['value']
    team_dict['standingSummary'] = team_data['team']['standingSummary']
    record_df.append(team_dict)

    # --- Next game ---------------------------------------------------------
    next_events = team_data['team'].get('nextEvent') or []
    if not next_events:
        # No upcoming event listed for this team: there is nothing to describe,
        # so no next-game row is added (indexing [0] here used to raise IndexError).
        continue
    next_event = next_events[0]

    next_event_dict = {
            'Team' : team_name,
            'Abbreviation' :  team_abbr,
            'espn_team_id' : team_id,
            'espn_game_id' : next_event.get('id')
        }
    game_data = next_event['competitions'][0]

    next_event_dict['isNeutralSite'] = game_data['neutralSite']
    next_event_dict['game_time'] = game_data['status']['type']['shortDetail']
    next_event_dict['venue'] = game_data['venue']['fullName']

    for competitor in game_data['competitors']:
        pitcher_name, pitcher_id = _probable_pitcher(competitor)

        if competitor['team']['displayName'] == team_name:
            next_event_dict['home_or_away'] = competitor['homeAway']
            next_event_dict['team_probable_starting_pitcher'] = pitcher_name
            next_event_dict['team_probable_starting_pitcher_id'] = pitcher_id
        else:
            next_event_dict['opponent'] = competitor['team']['displayName']
            next_event_dict['opponent_espn_team_id'] = competitor['team']['id']
            next_event_dict['opponent_probable_starting_pitcher'] = pitcher_name
            next_event_dict['opponent_probable_starting_pitcher_id'] = pitcher_id
    next_game_df.append(next_event_dict)


roster_df = pd.DataFrame(roster_df)
record_df = pd.DataFrame(record_df)
next_game_df = pd.DataFrame(next_game_df)

In [340]:
def changeInjuries(row):
    """Collapse a roster row's 'Injuries' value to its first entry.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Injuries' (a DataFrame row or a dict).

    Returns
    -------
    The first element when 'Injuries' is a non-empty list, the value itself
    when it is already a single injury dict (so re-running the cell does not
    fail or lose data), and ``False`` otherwise (empty list, ``None``, NaN, ...).
    """
    injuries = row['Injuries']
    if isinstance(injuries, dict):
        # Already collapsed by an earlier application of this function.
        return injuries
    if isinstance(injuries, (list, tuple)) and len(injuries) > 0:
        return injuries[0]
    return False

# Replace each player's 'Injuries' value with the result of changeInjuries, applied row by row.
roster_df['Injuries'] = roster_df.apply(changeInjuries,axis=1)

In [341]:
# Save the roster, record and next-game tables to CSV (row index not written).
for snapshot_frame, snapshot_csv in (
    (roster_df, 'datasets/roster_df.csv'),
    (record_df, 'datasets/record_df.csv'),
    (next_game_df, 'datasets/next_game_df.csv'),
):
    snapshot_frame.to_csv(snapshot_csv, index=False)

In [342]:
# Build the injury report: one row per player whose 'Injuries' value is not False.
injured = roster_df[roster_df['Injuries'] != False]
injury_records = []
for _, row in injured.iterrows():
    # Read the injury mapping once instead of once per field.
    injury = row.get('Injuries', {})
    # Keep only the date part of the timestamp (text before 'T'); a record
    # without a date yields None rather than raising AttributeError.
    injury_date = injury.get('date')
    injury_records.append({
        'Team' : row.get('Team'),
        'team_abbr' : row.get('Abbreviation'),
        'espn_team_id' : row.get('espn_team_id'),
        'espn_player_id' : row.get('espn_player_id'),
        'Name' : row.get('Name'),
        'Position' : row.get('position'),
        'longComment' : injury.get('longComment'),
        'shortComment' : injury.get('shortComment'),
        'status' : injury.get('status'),
        'injuryDate' : injury_date.split('T')[0] if injury_date else None
    })
injury_df = pd.DataFrame(injury_records)

In [343]:
# Save the injury report to CSV (row index not written).
injury_df.to_csv('datasets/injuryReport.csv', index=False)

In [344]:
# WRITE TO CSV
# RERUN WITH NEW VALUES

In [345]:
# Save the batter and pitcher boxscore tables to CSV (row index not written).
# NOTE(review): neither frame is defined in the nearby cells - confirm both
# exist when the notebook is run top to bottom on a fresh kernel.
all_batter_boxscores.to_csv('datasets/batterBoxscores.csv', index= False)
all_pitcher_boxscores.to_csv('datasets/pitcherBoxscores.csv', index=False)

In [346]:
# Load the pitcher dataset from CSV.
pitcher_data = pd.read_csv('datasets/complex_pitchers.csv')

In [347]:
#pitcher_data = pitcher_data.rename(columns={'k': 'Pitcher Strikeouts', 'er': 'Earned Runs Allowed', 'h': 'Hits Allowed', 'pitching_outs' : 'Pitching Outs', 'p': 'Pitches Thrown'})

In [348]:
# Restrict the pitcher data to the 2024 season.
pitcher_data = pitcher_data[pitcher_data['seasonNumber'] == 2024]

teams = list(pitcher_data['Opponent'].unique())
print(teams)

# Output label -> source column. Each statistic is the column total across all
# rows against an opponent, divided by the number of distinct games.
stat_columns = {
    'Pitcher Strikeouts' : 'k',
    'Earned Runs Allowed' : 'er',
    'Hits Allowed' : 'h',
    'Pitching Outs' : 'pitching_outs',
    'Pitcher Fantasy Score' : 'Pitcher Fantasy Score',
}

stat_df = []

for team in teams:
    gameLog = pitcher_data[pitcher_data['Opponent'] == team].copy()
    number_of_games = len(gameLog['game_id'].unique())

    stat_dict = {'Team' : team}
    for stat_label, source_column in stat_columns.items():
        column_total = gameLog[source_column].astype(int).sum()
        stat_dict[stat_label] = round(column_total / number_of_games, 2)
    stat_df.append(stat_dict)

pd.DataFrame(stat_df)

#gameLog

['Washington Nationals', 'Chicago Cubs', 'Cleveland Guardians', 'New York Yankees', 'Detroit Tigers', 'Milwaukee Brewers', 'Oakland Athletics', 'Seattle Mariners', 'St. Louis Cardinals', 'Los Angeles Angels', 'Boston Red Sox', 'Philadelphia Phillies', 'Baltimore Orioles', 'Pittsburgh Pirates', 'Minnesota Twins', 'Tampa Bay Rays', 'Chicago White Sox', 'Toronto Blue Jays', 'Cincinnati Reds', 'San Diego Padres', 'Miami Marlins', 'Kansas City Royals', 'Los Angeles Dodgers', 'Texas Rangers', 'Houston Astros', 'Arizona Diamondbacks', 'New York Mets', 'San Francisco Giants', 'Colorado Rockies', 'Atlanta Braves']


Unnamed: 0,Team,Pitcher Strikeouts,Earned Runs Allowed,Hits Allowed,Pitching Outs,Pitcher Fantasy Score
0,Washington Nationals,7.48,3.92,8.14,26.68,42.57
1,Chicago Cubs,8.53,4.13,8.0,26.51,43.88
2,Cleveland Guardians,7.28,4.07,7.84,26.46,40.16
3,New York Yankees,8.03,4.74,8.44,26.62,39.99
4,Detroit Tigers,8.76,3.86,7.95,26.71,45.93
5,Milwaukee Brewers,8.88,4.53,8.59,26.45,43.22
6,Oakland Athletics,9.15,3.87,7.73,26.65,47.56
7,Seattle Mariners,10.2,3.56,7.02,26.5,51.08
8,St. Louis Cardinals,8.13,3.75,8.41,26.6,44.22
9,Los Angeles Angels,8.5,3.48,7.56,26.74,47.09


In [349]:
# Turn the per-team stat records into a DataFrame and save it to CSV (row index not written).
stat_df = pd.DataFrame(stat_df)
stat_df.to_csv('datasets/team_stats.csv', index=False)