In [1]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import concurrent.futures
import re
import statsapi
import pprint
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
basic_gamelogs = pd.read_csv('datasets/basicGameLogs.csv')
max(list(set(basic_gamelogs['game_date'])))

def findTeamKeys():
    mlbTeamKeys = []
    url = "https://statsapi.mlb.com/api/v1/teams/"
    response = requests.get(url)



    data = response.json()
    for team in data['teams']:
        #if team.get('sport', {}).get('name', '') == "Major League Baseball":
        mlbTeamKeys.append(
            {
            'Team' : team.get('name'),
            'team_id' : team.get('id'),
            'team_abbr' : team.get('abbreviation'),
            'league' : team.get('league',{}).get('name'),
            'division' : team.get('division',{}).get('name'),
            'sport' : team.get('sport', {}).get('name')
            }
        )
    return pd.DataFrame(mlbTeamKeys)

statsApiTeamKeys = findTeamKeys()
mlbTeamKeys = statsApiTeamKeys[statsApiTeamKeys['sport'] == 'Major League Baseball']


# Define the start and end dates
start_date = datetime.strptime(max(list(set(basic_gamelogs['game_date']))), '%Y-%m-%d')
end_date = datetime.now()

# Initialize the list to hold the date ranges
date_ranges = []

# Loop to generate the dates in 30 day increments
current_date = start_date
while current_date < end_date:
    # Add the current date to the list
    date_ranges.append(current_date.strftime('%m/%d/%Y'))
    # Increment the current date by 30 days
    current_date += timedelta(days=30)
date_ranges.append(end_date.strftime('%m/%d/%Y'))

# Function to fetch schedules for a given team ID and date range
def fetch_schedule_for_team_date_range(team_id, start_date, end_date):
    sched = statsapi.schedule(start_date=start_date, end_date=end_date, team=team_id)
    return sched

# Main function to fetch schedules using multithreading
def fetch_schedules_multithreaded(date_ranges, team_ids):
    fullSched = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        # Prepare future tasks for each combination of team ID and date range
        future_to_sched = {
            executor.submit(fetch_schedule_for_team_date_range, team_id, date_ranges[index-1], date_ranges[index]): (team_id, index)
            for index in range(1, len(date_ranges))
            for team_id in team_ids
        }
        # As each future completes, append its result to fullSched
        for future in concurrent.futures.as_completed(future_to_sched):
            sched = future.result()
            if sched:
                for row in sched:
                    fullSched.append(row)
    return fullSched

# Example usage
team_ids = mlbTeamKeys['team_id'].to_list()  # Assuming this list is available
fullSched = fetch_schedules_multithreaded(date_ranges, team_ids)
fullSched = pd.DataFrame(fullSched)
fullSched = fullSched.drop_duplicates(subset=['game_id'], keep='first')
fullSched = fullSched[fullSched['status'] == 'Final']

basic_gamelogs = pd.concat([basic_gamelogs, fullSched])
basic_gamelogs = basic_gamelogs.drop_duplicates(subset=['game_id'], keep='first')
basic_gamelogs = basic_gamelogs[basic_gamelogs['status'] == 'Final']
basic_gamelogs = basic_gamelogs.sort_values(by='game_date')

basic_gamelogs.to_csv('datasets/basicGameLogs.csv', index=False)

strikeouts_per_inning = pd.read_csv('datasets/k_per_inning.csv')
walks_per_inning = pd.read_csv('datasets/bb_per_inning.csv')
hits_allowed_per_inning = pd.read_csv('datasets/h_per_inning.csv')
all_play_by_play = pd.read_csv('datasets/all_play_by_play.csv')

In [3]:
daySlate = []
current_date = datetime.now()
# current_date = datetime.now() + timedelta(days=1)

def fetch_schedule(team_id, current_date=current_date.strftime('%Y-%m-%d')):
    sched = statsapi.schedule(date=current_date,team=team_id)
    return sched

with ThreadPoolExecutor(max_workers=30) as executor:
    future_to_team = {executor.submit(fetch_schedule, team): team for team in team_ids}

    for future in as_completed(future_to_team):
        sched = future.result()
        for game in sched:
            # Convert game dict to a hashable form (tuple of items) for storing in a set
            if game not in daySlate:
                daySlate.append(game)

daySlate = pd.DataFrame(daySlate)
daySlate.to_csv('datasets/daySlate.csv', index = False)

In [4]:
all_batter_boxscores = pd.read_csv('datasets/batterBoxscores.csv', low_memory=False)
all_pitcher_boxscores = pd.read_csv('datasets/pitcherBoxscores.csv', low_memory=False)

In [5]:
len(all_pitcher_boxscores['game_id'].unique())


20428

In [6]:
regular_games = basic_gamelogs[basic_gamelogs['game_type'] == 'R']
game_id_list = regular_games['game_id'].to_list()
len(game_id_list)
difference = [item for item in game_id_list if item not in all_pitcher_boxscores['game_id'].unique()]

In [7]:
len(set(difference))
#difference[-1]

12

In [8]:
updateSched = basic_gamelogs[basic_gamelogs['game_id'].isin(difference)]

In [9]:





game_id_list = difference
#game_id_list = list(set(game_id_list))


def fetch_and_process_game_data(game_id, game_date):
    max_attempts = 3
    attempt = 0
    while attempt< max_attempts:
        try:
            # Fetch the boxscore data
            boxscore = statsapi.boxscore_data(game_id, timecode=None)

            # Initialize lists to store data
            batter_boxscore = []
            pitcher_boxscore = []

            # Extract team information
            home_team = f"{boxscore['teamInfo']['home']['shortName']} {boxscore['teamInfo']['home']['teamName']}"
            away_team = f"{boxscore['teamInfo']['away']['shortName']} {boxscore['teamInfo']['away']['teamName']}"
            home_id = boxscore['teamInfo']['home']['id']
            away_id = boxscore['teamInfo']['away']['id']

            # Process batters and pitchers for both teams
            for batter in boxscore['awayBatters'][1:] + boxscore['homeBatters'][1:]:
                batter_boxscore.append({
                    'Team': away_team if batter in boxscore['awayBatters'][1:] else home_team,
                    'team_id': away_id if batter in boxscore['awayBatters'][1:] else home_id,
                    **batter
                })

            for pitcher in boxscore['awayPitchers'][1:] + boxscore['homePitchers'][1:]:
                pitcher_boxscore.append({
                    'Team': away_team if pitcher in boxscore['awayPitchers'][1:] else home_team,
                    'team_id': away_id if pitcher in boxscore['awayPitchers'][1:] else home_id,
                    **pitcher
                })

            # Convert lists to DataFrames
            batter_boxscore_df = pd.DataFrame(batter_boxscore)
            pitcher_boxscore_df = pd.DataFrame(pitcher_boxscore)

            # Add game_id and game_date to both DataFrames
            for df in [batter_boxscore_df, pitcher_boxscore_df]:
                df['game_id'] = game_id
                df['date'] = game_date

                # Calculate 'TB' for batters
                if 'doubles' in df.columns and 'triples' in df.columns and 'hr' in df.columns and 'h' in df.columns:
                    df['TB'] = (df['doubles'].astype(int) * 2 + 
                                df['triples'].astype(int) * 3 + 
                                df['hr'].astype(int) * 4 + 
                                (df['h'].astype(int) - df['doubles'].astype(int) - df['triples'].astype(int) - df['hr'].astype(int)))

                # Example merging or additional processing could go here
            id_df = []
            for player_id in boxscore['playerInfo'].keys():
                id_df.append( {
                    'personId' : boxscore['playerInfo'][player_id]['id'],
                    'Name' : boxscore['playerInfo'][player_id]['fullName']
                })

            id_df = pd.DataFrame(id_df)

            batter_boxscore_df = batter_boxscore_df.merge(id_df, how='left', on='personId')
            pitcher_boxscore_df = pitcher_boxscore_df.merge(id_df, how='left', on='personId')
                
            # Return processed DataFrames
            return batter_boxscore_df, pitcher_boxscore_df
        except Exception as e:
            print(f'Error processing game ID {game_id}: {e} on attempt {attempt}')
            time.sleep(2**attempt)
            attempt += 1
            # Return empty DataFrames in case of an error to maintain consistency
    if attempt == max_attempts:
        print(f"Failed to fetch data after maximum attempts for {game_id}")
        return pd.DataFrame(), pd.DataFrame()





def process_all_games(df):
    process_batter_boxscores = pd.DataFrame()
    process_pitcher_boxscores = pd.DataFrame()

    with ThreadPoolExecutor(max_workers=200) as executor:
        futures = [executor.submit(fetch_and_process_game_data, row['game_id'], row['game_date']) for index, row in df.iterrows()]

        for future in as_completed(futures):
            try:
                batter_boxscore, pitcher_boxscore = future.result()
                process_batter_boxscores = pd.concat([process_batter_boxscores, batter_boxscore], ignore_index=True)
                process_pitcher_boxscores = pd.concat([process_pitcher_boxscores, pitcher_boxscore], ignore_index=True)
            except Exception as e:
                # Handle exception, perhaps logging the game_id that caused it
                print(f'An error occurred: {e}')

    return process_batter_boxscores, process_pitcher_boxscores

batter_boxscores, pitcher_boxscores = process_all_games(updateSched)


#all_batter_boxscores = all_batter_boxscores.merge(id_df, how='left', on='personId')
batter_boxscores = batter_boxscores.merge(updateSched, how='left', on='game_id')

pitcher_boxscores = pitcher_boxscores.merge(updateSched,how='left', on='game_id')
#all_pitcher_boxscores = all_pitcher_boxscores.merge(id_df, how='left', on='personId')

batter_boxscores['isWinner'] = batter_boxscores['Team'] == batter_boxscores['winning_team']
pitcher_boxscores['isWinner'] = pitcher_boxscores['Team'] == pitcher_boxscores['winning_team']
batter_boxscores['isStarter'] = (batter_boxscores['battingOrder'].astype(int) % 100 == 0)


all_batter_boxscores = pd.concat([all_batter_boxscores,batter_boxscores])
all_pitcher_boxscores = pd.concat([all_pitcher_boxscores, pitcher_boxscores])
all_batter_boxscores = all_batter_boxscores.drop_duplicates(subset=['game_id', 'Name', 'winning_pitcher'], keep='first')
all_pitcher_boxscores = all_pitcher_boxscores.drop_duplicates(subset=['game_id', 'Name', 'winning_pitcher'], keep='first')


In [10]:
complex_pitcher_box_scores = all_pitcher_boxscores
def innings_to_outs(ip):

    whole, fraction = divmod(float(ip), 1)
    # Convert the fractional part to a proportion of an inning and multiply by 3 to get the outs
    fraction_outs = round(fraction * 10)  # Convert .1, .2, etc. to 1, 2, etc.
    total_outs = int(whole) * 3 + fraction_outs
    return total_outs
def correct_team_name(row):
    if row['team_id'] == row['away_id']:
        return row['away_name']
    elif row['team_id'] == row['home_id']:
        return row['home_name']
    else:
        return row['Team']  # Return the original value if no match found
    
def find_opponent(row):
    if row['team_id'] == row['away_id']:
        return row['home_name']
    else:
        return row['away_name']

def find_opposing_pitcher(row):
    if row['team_id'] == row['away_id']:
        return row['home_probable_pitcher']
    else:
        return row['away_probable_pitcher']

complex_batter_box_scores = all_batter_boxscores
complex_batter_box_scores['Team'] = complex_batter_box_scores.apply(correct_team_name, axis=1)
complex_batter_box_scores['Opponent'] = complex_batter_box_scores.apply(find_opponent, axis=1)
complex_batter_box_scores['Opposing_Pitcher'] = complex_batter_box_scores.apply(find_opposing_pitcher, axis=1)

complex_batter_box_scores['k'] = complex_batter_box_scores['k'].astype(int)
complex_batter_box_scores['bb'] = complex_batter_box_scores['bb'].astype(int)
complex_batter_box_scores['h'] = complex_batter_box_scores['h'].astype(int)
complex_batter_box_scores['r'] = complex_batter_box_scores['r'].astype(int)


complex_pitcher_box_scores['k'] = complex_pitcher_box_scores['k'].astype(int)
complex_pitcher_box_scores['bb'] = complex_pitcher_box_scores['bb'].astype(int)
complex_pitcher_box_scores['h'] = complex_pitcher_box_scores['h'].astype(int)
complex_pitcher_box_scores['p'] = complex_pitcher_box_scores['p'].astype(int)
complex_pitcher_box_scores['er'] = complex_pitcher_box_scores['er'].astype(int)


complex_pitcher_box_scores['Team'] = complex_pitcher_box_scores.apply(correct_team_name, axis=1)
complex_pitcher_box_scores['Opponent'] = complex_pitcher_box_scores.apply(find_opponent, axis=1)
complex_pitcher_box_scores['isWinner'] = complex_pitcher_box_scores['Team'] == complex_pitcher_box_scores['winning_team']
complex_pitcher_box_scores['isStarter'] = complex_pitcher_box_scores.apply(lambda row: row['Name'] in [row['home_probable_pitcher'], row['away_probable_pitcher']], axis=1)
complex_pitcher_box_scores['pitcherIsWinner'] = complex_pitcher_box_scores.apply(lambda row: row['Name'] == row['winning_pitcher'], axis=1)
complex_pitcher_box_scores['pitch_count_MA3'] = round(complex_pitcher_box_scores.groupby('personId')['p'].transform(lambda x: x.shift(1).rolling(window=3).mean()),2)
complex_pitcher_box_scores['strikeout_MA3'] = round(complex_pitcher_box_scores.groupby('personId')['k'].transform(lambda x: x.shift(1).rolling(window=3).mean()),2)
complex_pitcher_box_scores['walks_MA3'] = round(complex_pitcher_box_scores.groupby('personId')['bb'].transform(lambda x: x.shift(1).rolling(window=3).mean()),2)
complex_pitcher_box_scores['h_MA3'] = round(complex_pitcher_box_scores.groupby('personId')['h'].transform(lambda x: x.shift(1).rolling(window=3).mean()),2)
complex_pitcher_box_scores['pitching_outs'] = complex_pitcher_box_scores['ip'].apply(innings_to_outs)
complex_pitcher_box_scores['pitching_outs'] = complex_pitcher_box_scores['pitching_outs'].astype(int)



complex_batter_box_scores['seasonNumber'] = pd.to_datetime(complex_batter_box_scores['game_date']).dt.year
complex_pitcher_box_scores['seasonNumber'] = pd.to_datetime(complex_pitcher_box_scores['game_date']).dt.year

complex_pitcher_box_scores['game_score'] = 40 + 2 * complex_pitcher_box_scores['pitching_outs'] + complex_pitcher_box_scores['k'] - 2 * complex_pitcher_box_scores['bb'] - 2 * complex_pitcher_box_scores['h'] - 3 * complex_pitcher_box_scores['er']

def categorize_performance(score):
    if score >= 80:
        return 'Excellent'
    elif score >= 60:
        return 'Good'
    elif score >= 40:
        return 'Average'
    elif score >= 20:
        return 'Below Average'
    else:
        return 'Poor'

complex_pitcher_box_scores['performance_category'] = complex_pitcher_box_scores['game_score'].apply(categorize_performance)



team_strikeouts = complex_batter_box_scores.groupby(['game_id', 'Team'])['k'].sum().reset_index()
team_strikeouts.rename(columns={'k': 'team_strikeouts'}, inplace=True)
complex_batter_box_scores = complex_batter_box_scores.merge(team_strikeouts, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_strikeouts'] = complex_batter_box_scores['k'] / complex_batter_box_scores['team_strikeouts']
complex_batter_box_scores['average_team_strikeouts_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_strikeouts'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)


team_walks = complex_batter_box_scores.groupby(['game_id', 'Team'])['bb'].sum().reset_index()
team_walks.rename(columns={'bb': 'team_walks'}, inplace=True)
complex_batter_box_scores = complex_batter_box_scores.merge(team_walks, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_walks'] = complex_batter_box_scores['bb'] / complex_batter_box_scores['team_walks']
complex_batter_box_scores['average_team_walks_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_walks'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)

team_hits = complex_batter_box_scores.groupby(['game_id', 'Team'])['h'].sum().reset_index()
team_hits.rename(columns={'h': 'team_hits'}, inplace=True)
complex_batter_box_scores = complex_batter_box_scores.merge(team_hits, on=['game_id', 'Team'])
complex_batter_box_scores['average_of_team_hits'] = complex_batter_box_scores['h'] / complex_batter_box_scores['team_hits']
complex_batter_box_scores['average_team_hits_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_hits'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)

team_runs = complex_batter_box_scores.groupby(['game_id', 'Team'])['r'].sum().reset_index()
team_runs.rename(columns={'r': 'team_runs'}, inplace=True)
complex_batter_box_scores = complex_batter_box_scores.merge(team_runs, on=['game_id', 'Team'])
print(complex_batter_box_scores['team_runs'].iloc[0])
complex_batter_box_scores['average_of_team_runs'] = np.where(
    complex_batter_box_scores['team_runs'].iloc[0] != 0,
    complex_batter_box_scores['r'] / complex_batter_box_scores['team_runs'],
    0  # Or np.nan, depending on how you want to handle division by zero
)
complex_batter_box_scores['average_team_runs_MA3'] = round(complex_batter_box_scores.groupby('personId')['average_of_team_runs'].transform(lambda x: x.shift(1).rolling(window=3).mean()),3)




complex_pitcher_box_scores.to_csv('datasets/complex_pitchers.csv', index=False)
complex_batter_box_scores.to_csv('datasets/complex_batters.csv', index=False)

9


In [11]:
complex_pitcher_box_scores[complex_pitcher_box_scores['Team'] == 'Philadelphia Phillies'].tail(20)

Unnamed: 0,Team,team_id,namefield,ip,h,r,er,bb,k,hr,p,s,era,name,personId,note,game_id,date,Name,game_datetime,game_date,game_type,status,away_name,home_name,away_id,home_id,doubleheader,game_num,home_probable_pitcher,away_probable_pitcher,home_pitcher_note,away_pitcher_note,away_score,home_score,current_inning,inning_state,venue_id,venue_name,national_broadcasts,series_status,winning_team,losing_team,winning_pitcher,losing_pitcher,save_pitcher,summary,losing_Team,isWinner,Opponent,isStarter,pitcherIsWinner,pitch_count_MA3,strikeout_MA3,walks_MA3,h_MA3,pitching_outs,seasonNumber,game_score,performance_category
174839,Philadelphia Phillies,143,"Suárez, R (W, 6-0)",6.0,7,3,3,1,6,1,94,66,1.72,"Suárez, R",624133,"(W, 6-0)",745583,2024-05-04,Ranger Suárez,2024-05-04T22:05:00Z,2024-05-04,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Ranger Suárez,Keaton Winn,,,3,14,9.0,Top,2681,Citizens Bank Park,[],PHI leads 2-0,Philadelphia Phillies,San Francisco Giants,Ranger Suárez,Keaton Winn,,2024-05-04 - San Francisco Giants (3) @ Philad...,,True,San Francisco Giants,True,True,98.67,7.0,0.67,4.0,18,2024,57,Average
174840,Philadelphia Phillies,143,"Ruiz, J",2.0,2,0,0,2,1,0,33,19,0.0,"Ruiz, J",614179,,745583,2024-05-04,José Ruiz,2024-05-04T22:05:00Z,2024-05-04,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Ranger Suárez,Keaton Winn,,,3,14,9.0,Top,2681,Citizens Bank Park,[],PHI leads 2-0,Philadelphia Phillies,San Francisco Giants,Ranger Suárez,Keaton Winn,,2024-05-04 - San Francisco Giants (3) @ Philad...,,True,San Francisco Giants,False,False,25.33,1.0,0.67,2.0,6,2024,45,Average
174841,Philadelphia Phillies,143,Domínguez,1.0,1,0,0,0,1,0,11,8,8.03,Domínguez,622554,,745583,2024-05-04,Seranthony Domínguez,2024-05-04T22:05:00Z,2024-05-04,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Ranger Suárez,Keaton Winn,,,3,14,9.0,Top,2681,Citizens Bank Park,[],PHI leads 2-0,Philadelphia Phillies,San Francisco Giants,Ranger Suárez,Keaton Winn,,2024-05-04 - San Francisco Giants (3) @ Philad...,,True,San Francisco Giants,False,False,20.33,1.0,0.67,0.67,3,2024,45,Average
174918,Philadelphia Phillies,143,"Walker, T (W, 2-0)",6.1,5,3,3,1,7,1,90,58,6.39,"Walker, T",592836,"(W, 2-0)",745587,2024-05-05,Taijuan Walker,2024-05-05T23:10:00Z,2024-05-05,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Taijuan Walker,Logan Webb,,,4,5,9.0,Top,2681,Citizens Bank Park,['ESPN'],PHI wins 3-0,Philadelphia Phillies,San Francisco Giants,Taijuan Walker,Logan Webb,José Alvarado,2024-05-05 - San Francisco Giants (4) @ Philad...,,True,San Francisco Giants,True,True,100.33,3.67,2.33,5.67,19,2024,64,Good
174919,Philadelphia Phillies,143,"Soto, G (H, 4)",0.2,0,0,0,1,0,0,16,10,6.1,"Soto, G",642397,"(H, 4)",745587,2024-05-05,Gregory Soto,2024-05-05T23:10:00Z,2024-05-05,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Taijuan Walker,Logan Webb,,,4,5,9.0,Top,2681,Citizens Bank Park,['ESPN'],PHI wins 3-0,Philadelphia Phillies,San Francisco Giants,Taijuan Walker,Logan Webb,José Alvarado,2024-05-05 - San Francisco Giants (4) @ Philad...,,True,San Francisco Giants,False,False,19.67,1.33,0.67,1.33,2,2024,42,Average
174920,Philadelphia Phillies,143,"Hoffman (H, 5)",1.0,1,0,0,0,3,0,19,12,1.13,Hoffman,656546,"(H, 5)",745587,2024-05-05,Jeff Hoffman,2024-05-05T23:10:00Z,2024-05-05,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Taijuan Walker,Logan Webb,,,4,5,9.0,Top,2681,Citizens Bank Park,['ESPN'],PHI wins 3-0,Philadelphia Phillies,San Francisco Giants,Taijuan Walker,Logan Webb,José Alvarado,2024-05-05 - San Francisco Giants (4) @ Philad...,,True,San Francisco Giants,False,False,16.67,3.0,0.0,0.33,3,2024,47,Average
174921,Philadelphia Phillies,143,"Alvarado (S, 7)",1.0,1,1,1,0,1,1,17,11,4.3,Alvarado,621237,"(S, 7)",745587,2024-05-05,José Alvarado,2024-05-05T23:10:00Z,2024-05-05,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Taijuan Walker,Logan Webb,,,4,5,9.0,Top,2681,Citizens Bank Park,['ESPN'],PHI wins 3-0,Philadelphia Phillies,San Francisco Giants,Taijuan Walker,Logan Webb,José Alvarado,2024-05-05 - San Francisco Giants (4) @ Philad...,,True,San Francisco Giants,False,False,19.33,0.67,0.33,1.67,3,2024,42,Average
175036,Philadelphia Phillies,143,"Wheeler (W, 4-3)",7.0,4,1,0,1,11,0,102,66,1.64,Wheeler,554430,"(W, 4-3)",745585,2024-05-06,Zack Wheeler,2024-05-06T20:05:00Z,2024-05-06,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Zack Wheeler,Mason Black,,,1,6,9.0,Top,2681,Citizens Bank Park,[],PHI wins 4-0,Philadelphia Phillies,San Francisco Giants,Zack Wheeler,Mason Black,,2024-05-06 - San Francisco Giants (1) @ Philad...,,True,San Francisco Giants,True,True,105.0,7.33,2.33,2.33,21,2024,83,Excellent
175037,Philadelphia Phillies,143,Strahm,1.0,1,0,0,0,2,0,17,12,1.29,Strahm,621381,,745585,2024-05-06,Matt Strahm,2024-05-06T20:05:00Z,2024-05-06,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Zack Wheeler,Mason Black,,,1,6,9.0,Top,2681,Citizens Bank Park,[],PHI wins 4-0,Philadelphia Phillies,San Francisco Giants,Zack Wheeler,Mason Black,,2024-05-06 - San Francisco Giants (1) @ Philad...,,True,San Francisco Giants,False,False,11.67,1.67,0.0,0.67,3,2024,46,Average
175038,Philadelphia Phillies,143,Kerkering,1.0,0,0,0,0,0,0,14,8,1.04,Kerkering,689147,,745585,2024-05-06,Orion Kerkering,2024-05-06T20:05:00Z,2024-05-06,R,Final,San Francisco Giants,Philadelphia Phillies,137,143,N,1,Zack Wheeler,Mason Black,,,1,6,9.0,Top,2681,Citizens Bank Park,[],PHI wins 4-0,Philadelphia Phillies,San Francisco Giants,Zack Wheeler,Mason Black,,2024-05-06 - San Francisco Giants (1) @ Philad...,,True,San Francisco Giants,False,False,27.0,1.33,0.67,1.33,3,2024,46,Average


In [12]:
complex_pitcher_box_scores[['k','strikeout_MA3']]

starting_pitchers = complex_pitcher_box_scores[complex_pitcher_box_scores['isStarter']]

starting_pitchers['k is more than MA3'] = (starting_pitchers['k'] > starting_pitchers['strikeout_MA3'] - 3)
res = starting_pitchers[['Name', 'k','strikeout_MA3', 'k is more than MA3']].dropna()['k is more than MA3'].value_counts()
res[True]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  starting_pitchers['k is more than MA3'] = (starting_pitchers['k'] > starting_pitchers['strikeout_MA3'] - 3)


34066

In [13]:
def decimal_to_implied_probability(decimal_odds):
    """
    Convert decimal odds to implied probability.
    
    Args:
    decimal_odds (float): The decimal odds from which to calculate probability.
    
    Returns:
    float: The implied probability as a percentage.
    """
    return 100 / decimal_odds
def decimal_to_american(decimal_odds):
    """
    Convert decimal odds to American odds format.
    
    Args:
    decimal_odds (float): The decimal odds to convert.
    
    Returns:
    str: The American odds as a string.
    """
    if decimal_odds >= 2.0:
        # Positive American odds
        american_odds = (decimal_odds - 1) * 100
        return f"+{int(american_odds)}"
    else:
        # Negative American odds
        american_odds = -100 / (decimal_odds - 1)
        return f"{int(american_odds)}"
def main():
    decimal_odds = res[True]/res[False]  # Example decimal odds input
    implied_prob = decimal_to_implied_probability(decimal_odds)
    american_odds = decimal_to_american(decimal_odds)
    
    print("True implied odds")
    print(f"Decimal Odds: {decimal_odds}")
    print(f"Implied Probability: {implied_prob:.2f}%")
    print(f"American Odds: {american_odds}")

    decimal_odds = res[False]/res[True]  # Example decimal odds input
    implied_prob = decimal_to_implied_probability(decimal_odds)
    american_odds = decimal_to_american(decimal_odds)
    
    print("False implied odds")
    print(f"Decimal Odds: {decimal_odds}")
    print(f"Implied Probability: {implied_prob:.2f}%")
    print(f"American Odds: {american_odds}")

# Call the main function to execute the example
main()


True implied odds
Decimal Odds: 6.898744430943702
Implied Probability: 14.50%
American Odds: +589
False implied odds
Decimal Odds: 0.14495391299242646
Implied Probability: 689.87%
American Odds: 116


In [14]:

#for game_id in temp:
def grabMatrices(game_id):
    max_attempts = 5
    attempt = 0
    while attempt< max_attempts:
        try:
            def getPlayByPlay(game_id):
                
                play_by_play = statsapi.get('game_playByPlay', {'gamePk' : game_id})
                all_plays = play_by_play['allPlays']

                game_play_by_play_df = []
                simple_game_play_by_play_df = []
                for play in all_plays:
                    batter_id = play.get('matchup',{}).get('batter',{}).get('id')
                    pitcher_id = play.get('matchup',{}).get('pitcher',{}).get('id')
                    batter_name = play.get('matchup',{}).get('batter',{}).get('fullName')
                    pitcher_name = play.get('matchup',{}).get('pitcher',{}).get('fullName')

                    simple_game_play_by_play_df.append(
                        {
                            'game_ID' : game_id,
                            'atBatResult' : play.get('result',{}).get('event'),
                            'halfInning' : play.get('about', {}).get('halfInning'),
                            'inning' : play.get('about',{}).get('inning'),
                            'batter_id' : batter_id,
                            'pitcher_id' : pitcher_id,
                            'batter_name' : batter_name,
                            'pitcher_name' : pitcher_name,
                            'batter_hand' : play.get('matchup',{}).get('batSide',{}).get('code'),
                            'pitcher_hand' : play.get('matchup',{}).get('pitchHand',{}).get('code')
                            
                        }
                    )

                    for pitch in play['playEvents']:

                        if pitch['isPitch']:

                            pitch_dict = {
                                'game_ID' : game_id,
                                'atBatResult' : play.get('result',{}).get('event'),
                                'halfInning' : play.get('about', {}).get('halfInning'),
                                'inning' : play.get('about',{}).get('inning'),
                                'batter_id' : batter_id,
                                'pitcher_id' : pitcher_id,
                                'batter_name' : batter_name,
                                'pitcher_name' : pitcher_name,
                                'batter_hand' : play.get('matchup',{}).get('batSide',{}).get('code'),
                                'pitcher_hand' : play.get('matchup',{}).get('pitchHand',{}).get('code'),
                                'play_description' : pitch.get('details', {}).get('description'),
                                'isInPlay' : pitch.get('details', {}).get('isInPlay'),
                                'isStrike' : pitch.get('details', {}).get('isStrike'),
                                'isBall' : pitch.get('details', {}).get('isBall'),
                                'pitchNumber' : pitch.get('pitchNumber'),
                                'pitch_type' : pitch.get('details', {}).get('type',{}).get('description'),
                                'isOut' : pitch.get('details', {}).get('isOut'),
                                'current_balls' : pitch.get('count',{}).get('balls'),
                                'current_strikes' : pitch.get('count', {}).get('strikes'),
                                'current_outs' : pitch.get('count', {}).get('outs'),
                                'pitchStartSpeed' : pitch.get('pitchData',{}).get('startSpeed'),
                                'pitchEndSpeed' : pitch.get('pitchData', {}).get('endSpeed'),
                                'pitchStrikeZoneTop' : pitch.get('pitchData', {}).get('strikeZoneTop'),
                                'pitchStikeZoneBottom' : pitch.get('pitchData', {}).get('strikeZoneBottom'),
                                'pitchZone' : pitch.get('pitchData',{}).get('zone'),
                                'pitchTypeConfidence' : pitch.get('pitchData', {}).get('typeConfidence')

                            }

                            for coord in pitch.get('pitchData',{}).get('coordinates',{}).keys():
                                pitch_dict[coord] = pitch.get('pitchData',{}).get('coordinates',{}).get(coord)
                            for breaks in pitch.get('pitchData',{}).get('breaks',{}).keys():
                                pitch_dict[breaks] = pitch.get('pitchData',{}).get('breaks',{}).get(breaks)

                            game_play_by_play_df.append(pitch_dict)    


                play_by_play = pd.DataFrame(game_play_by_play_df)
                simple_game_play_by_play_df = pd.DataFrame(simple_game_play_by_play_df)

                play_by_play['pitchesThrownByCurrentPitcher'] = play_by_play.groupby('pitcher_id').cumcount() + 1
                # Initialize columns to zero in the play_by_play DataFrame
                play_by_play['cumulativeBalls'] = 0
                play_by_play['cumulativeStrikes'] = 0

                # Find indices of balls and strikes in the play_by_play DataFrame
                ball_indices = play_by_play[play_by_play['isBall'] == True].index
                strike_indices = play_by_play[play_by_play['isStrike'] == True].index

                # Update columns with correct cumulative counts in the play_by_play DataFrame
                play_by_play.loc[ball_indices, 'cumulativeBalls'] = play_by_play.loc[ball_indices].groupby('pitcher_id').cumcount() + 1
                play_by_play.loc[strike_indices, 'cumulativeStrikes'] = play_by_play.loc[strike_indices].groupby('pitcher_id').cumcount() + 1

                # Define a function to count strikeouts and hits allowed
                def count_stats(row):
                    if row['atBatResult'] == 'Strikeout':
                        return 'Strikeout'
                    elif row['atBatResult'] in ['Single', 'Double', 'Triple', 'Home Run']:
                        return 'Hit'
                    else:
                        return 'Other'

                # Apply the function to categorize each pitch
                play_by_play['stat_category'] = play_by_play.apply(count_stats, axis=1)

                return play_by_play, simple_game_play_by_play_df

            play_by_play, simple_game_play_by_play_df = getPlayByPlay(game_id)

            return play_by_play
        except Exception as e:
            print(f'Error processing game ID {game_id}: {e} on attempt {attempt}')
            time.sleep(2**attempt)
            attempt += 1
    if attempt == max_attempts:
        print(f"Failed to fetch data after maximum attempts for {game_id}")
        return pd.DataFrame()
        



def process_games(game_ids):
    all_play_by_play = pd.DataFrame([])

    with ThreadPoolExecutor(max_workers=250) as executor:
        futures = [executor.submit(grabMatrices, game_id) for game_id in game_ids]
        for future in futures:
            play_by_play_df= future.result()
            all_play_by_play = pd.concat([all_play_by_play, play_by_play_df])
    
    return all_play_by_play


In [15]:
len(all_play_by_play)

5988834

In [16]:
play_by_play = process_games(game_id_list)
all_play_by_play = pd.concat([all_play_by_play, play_by_play])
all_play_by_play = all_play_by_play.drop_duplicates(keep='first')


In [17]:
len(play_by_play['game_ID'].unique())
play_by_play['game_ID'].unique()[-1]
play_by_play = statsapi.get('game_playByPlay', {'gamePk' : 746395})
play_by_play

{'copyright': 'Copyright 2024 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt',
 'allPlays': [{'result': {'type': 'atBat',
    'event': 'Double',
    'eventType': 'double',
    'description': 'Josh Rojas doubles (4) on a sharp line drive to center fielder Joey Loperfido.',
    'rbi': 0,
    'awayScore': 0,
    'homeScore': 0,
    'isOut': False},
   'about': {'atBatIndex': 0,
    'halfInning': 'top',
    'isTopInning': True,
    'inning': 1,
    'startTime': '2024-05-05T18:10:59.677Z',
    'endTime': '2024-05-05T18:11:32.371Z',
    'isComplete': True,
    'isScoringPlay': False,
    'hasReview': False,
    'hasOut': False,
    'captivatingIndex': 34},
   'count': {'balls': 0, 'strikes': 1, 'outs': 0},
   'matchup': {'batter': {'id': 668942,
     'fullName': 'Josh Rojas',
     'link': '/api/v1/people/668942'},
    'batSide': {'code': 'L', 'description': 'Left'},
    'pitcher': {'id': 68

In [18]:
# game_id_list = all_play_by_play['game_ID'].unique()


# #game_id = game_id_list[-1]


# game_strikeouts_matrix = []
# game_walks_matrix = []
# game_hits_matrix = []
# game_singles_matrix = []
# game_doubles_matrix = []
# game_triples_matrix = []
# game_home_runs_matrix = []

# pitches_per_inning_matrix = []

# for game_id in game_id_list[:20]:
#     game_play_by_play = all_play_by_play[all_play_by_play['game_ID'] == game_id]



#     end_of_batter_df = game_play_by_play.copy().reset_index()
#     new_df = pd.DataFrame(columns=end_of_batter_df.columns).copy()

#     # Iterate through the DataFrame, starting from the second row
#     for i in range(1, len(end_of_batter_df)):
#         # Check if the current row's batter_name is different from the previous row's batter_name
#         if end_of_batter_df.loc[i, 'batter_name'] != end_of_batter_df.loc[i - 1, 'batter_name']:
#             # Add the previous row to the new DataFrame
#             sample = end_of_batter_df.iloc[i - 1]
#             new_row = {}
#             for col in sample.keys():
#                 new_row[col] = sample[col]
#             new_row = pd.DataFrame([new_row])
#             if not new_df.empty:
#                 new_df = pd.concat([new_df,new_row])
#             else:
#                 new_df = new_row
    
#     end_of_batter_df = new_df


#     for pitcher_name in game_play_by_play['pitcher_name'].unique():

#         for stat in [['Strikeout'], ['Walk'], ['Single', 'Double', 'Triple', 'Home Run'], ['Single'], ['Double'], ['Triple'], ['Home Run']]:
#             stat_pitcher_dict = {'game_id' : game_id, 'Name' : pitcher_name, 'pitcher_id' : game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]['pitcher_id'].iloc[0]}

#             stat_total = 0
#             for inning in range(1,10):
#                 stat_df = end_of_batter_df[(end_of_batter_df['atBatResult'].isin(stat))]
#                 #stat_df = stat_df.drop_duplicates(subset=['game_ID', 'atBatResult', 'inning', 'batter_id', 'pitcher_id'], keep='last')

#                 inning_df = stat_df[(stat_df['pitcher_name'] == pitcher_name) & (stat_df['inning'] == inning)]
                
#                 #inning_df = inning_df.drop_duplicates(subset=['inning','batter_id','pitcher_id','atBatResult'], keep='last')
#                 stat_pitcher_dict[inning] = len(inning_df)
#                 stat_total += len(inning_df)
            
#             stat_pitcher_dict['Total'] = stat_total



#             if stat == ['Strikeout']:
#                 game_strikeouts_matrix.append(stat_pitcher_dict)
#             elif stat == ['Walk']:
#                 game_walks_matrix.append(stat_pitcher_dict)
#             elif stat == ['Single']:
#                 game_singles_matrix.append(stat_pitcher_dict)
#             elif stat == ['Double']:
#                 game_doubles_matrix.append(stat_pitcher_dict)
#             elif stat == ['Triple']:
#                 game_triples_matrix.append(stat_pitcher_dict)
#             elif stat == ['Home Run']:
#                 game_home_runs_matrix.append(stat_pitcher_dict)
#             else:
#                 game_hits_matrix.append(stat_pitcher_dict)




#         pitch_count_df = game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]
#         pitch_count_df = pitch_count_df.drop_duplicates(subset=['game_ID', 'inning', 'pitcher_id'], keep='last')
#         last = 0
#         pitcher_pitch_dict = {'game_id' : game_id, 'Name' : pitcher_name, 'pitcher_id' : game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]['pitcher_id'].iloc[0]}
#         pitch_total = 0
#         for inning in range(1,10):
            
#             inning_df = pitch_count_df[(pitch_count_df['pitcher_name'] == pitcher_name) & (pitch_count_df['inning'] == inning)]
#             #inning_df = inning_df.drop_duplicates(subset=['inning','batter_id','pitcher_id','atBatResult'], keep='last')
#             try:
#                 pc  = inning_df['pitchesThrownByCurrentPitcher'].iloc[0] - last
#             except:
#                 pc = 0
#             pitcher_pitch_dict[inning] = pc

#             pitch_total += pc
#             last = pitch_total
#         pitcher_pitch_dict['Total'] = pitch_total

#         pitches_per_inning_matrix.append(pitcher_pitch_dict)      

        




# game_strikeouts_matrix = pd.DataFrame(game_strikeouts_matrix)
# game_walks_matrix = pd.DataFrame(game_walks_matrix)
# game_hits_matrix = pd.DataFrame(game_hits_matrix)
# game_singles_matrix = pd.DataFrame(game_singles_matrix)
# game_doubles_matrix = pd.DataFrame(game_doubles_matrix)
# game_triples_matrix = pd.DataFrame(game_triples_matrix)
# game_home_runs_matrix = pd.DataFrame(game_home_runs_matrix)

# pitches_per_inning_matrix = pd.DataFrame(pitches_per_inning_matrix)

# #end_of_batter_df

In [19]:
all_strikeouts_matrix = pd.read_csv('datasets/k_per_inning.csv')
all_walks_matrix = pd.read_csv('datasets/bb_per_inning.csv')
all_hits_matrix = pd.read_csv('datasets/h_per_inning.csv')
all_singles_matrix = pd.read_csv('datasets/singles_per_inning.csv')
all_doubles_matrix = pd.read_csv('datasets/doubles_per_inning.csv')
all_triples_matrix = pd.read_csv('datasets/triples_per_inning.csv')
all_home_runs_matrix = pd.read_csv('datasets/hr_per_inning.csv')

In [20]:
len(all_strikeouts_matrix['game_id'].unique())

20298

In [21]:
len(game_id_list)

12

In [22]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

#game_id_list = all_play_by_play['game_ID'].unique()


def process_game(game_id):
    # Extract play-by-play data for the current game
    game_play_by_play = all_play_by_play[all_play_by_play['game_ID'] == game_id]

    # Initialize data structures to store results
    strikeouts_matrix = []
    walks_matrix = []
    hits_matrix = []
    singles_matrix = []
    doubles_matrix = []
    triples_matrix = []
    home_runs_matrix = []
    pitches_per_inning_matrix = []

    # Copy DataFrame to manipulate and detect end of batter
    end_of_batter_df = game_play_by_play.copy().reset_index()
    new_df = pd.DataFrame(columns=end_of_batter_df.columns)

    # Iterate through the DataFrame to detect changes in batter
    for i in range(1, len(end_of_batter_df)):
        if end_of_batter_df.loc[i, 'batter_name'] != end_of_batter_df.loc[i - 1, 'batter_name']:
            # Add the previous row to the new DataFrame
            if not new_df.empty:
                new_df = pd.concat([new_df, pd.DataFrame([end_of_batter_df.iloc[i - 1]])])
            else:
                new_df = pd.DataFrame([end_of_batter_df.iloc[i - 1]])

    # Use the updated DataFrame for further analysis
    end_of_batter_df = new_df

    # Iterate through each unique pitcher in the game
    for pitcher_name in game_play_by_play['pitcher_name'].unique():
        pitcher_data = game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]
        pitcher_id = pitcher_data['pitcher_id'].iloc[0]

        # Define the outcomes to summarize
        outcomes = [
            (['Strikeout'], strikeouts_matrix),
            (['Walk'], walks_matrix),
            (['Single', 'Double', 'Triple', 'Home Run'], hits_matrix),
            (['Single'], singles_matrix),
            (['Double'], doubles_matrix),
            (['Triple'], triples_matrix),
            (['Home Run'], home_runs_matrix),
        ]

        # Process each type of outcome
        for outcome, matrix in outcomes:
            stat_pitcher_dict = {'game_id': game_id, 'Name': pitcher_name, 'pitcher_id': pitcher_id}
            stat_total = 0

            # Summarize outcomes by inning
            for inning in range(1, 10):
                inning_data = end_of_batter_df[(end_of_batter_df['atBatResult'].isin(outcome)) &
                                               (end_of_batter_df['pitcher_name'] == pitcher_name) &
                                               (end_of_batter_df['inning'] == inning)]
                stat_pitcher_dict[str(inning)] = len(inning_data)
                stat_total += len(inning_data)

            stat_pitcher_dict['Total'] = stat_total
            matrix.append(stat_pitcher_dict)

    # Return a dictionary containing all matrices for this game
    return {
        "strikeouts": strikeouts_matrix,
        "walks": walks_matrix,
        "hits": hits_matrix,
        "singles": singles_matrix,
        "doubles": doubles_matrix,
        "triples": triples_matrix,
        "home_runs": home_runs_matrix
            }

def collect_results():
    # Define the global matrices to collect results
    game_strikeouts_matrix = []
    game_walks_matrix = []
    game_hits_matrix = []
    game_singles_matrix = []
    game_doubles_matrix = []
    game_triples_matrix = []
    game_home_runs_matrix = []


    # Use ThreadPoolExecutor to parallelize the processing of each game
    with ThreadPoolExecutor(max_workers=250) as executor:
        results = list(executor.map(process_game, game_id_list))

    # Combine results from each game
    for result in results:
        game_strikeouts_matrix += result["strikeouts"]
        game_walks_matrix += result["walks"]
        game_hits_matrix += result["hits"]
        game_singles_matrix += result["singles"]
        game_doubles_matrix += result["doubles"]
        game_triples_matrix += result["triples"]
        game_home_runs_matrix += result["home_runs"]

    # Convert lists of dictionaries to DataFrames
    game_strikeouts_matrix = pd.DataFrame(game_strikeouts_matrix)
    game_walks_matrix = pd.DataFrame(game_walks_matrix)
    game_hits_matrix = pd.DataFrame(game_hits_matrix)
    game_singles_matrix = pd.DataFrame(game_singles_matrix)
    game_doubles_matrix = pd.DataFrame(game_doubles_matrix)
    game_triples_matrix = pd.DataFrame(game_triples_matrix)
    game_home_runs_matrix = pd.DataFrame(game_home_runs_matrix)

    return game_strikeouts_matrix, game_walks_matrix, game_hits_matrix, game_singles_matrix, game_doubles_matrix, game_triples_matrix, game_home_runs_matrix

# Call the function to start processing
game_strikeouts_matrix, game_walks_matrix, game_hits_matrix, game_singles_matrix, game_doubles_matrix, game_triples_matrix, game_home_runs_matrix = collect_results()


In [23]:
if len(all_strikeouts_matrix) > 0:
    all_strikeouts_matrix = pd.concat([all_strikeouts_matrix, game_strikeouts_matrix])
    all_walks_matrix = pd.concat([all_walks_matrix, game_walks_matrix])
    all_hits_matrix = pd.concat([all_hits_matrix, game_hits_matrix])
    all_singles_matrix = pd.concat([all_singles_matrix, game_singles_matrix])
    all_doubles_matrix = pd.concat([all_walks_matrix, game_doubles_matrix])
    all_triples_matrix = pd.concat([all_triples_matrix, game_triples_matrix])
    all_home_runs_matrix = pd.concat([all_home_runs_matrix, game_home_runs_matrix])
else:
    all_strikeouts_matrix = game_strikeouts_matrix
    all_walks_matrix = game_walks_matrix
    all_hits_matrix = game_hits_matrix
    all_singles_matrix = game_singles_matrix
    all_doubles_matrix = game_doubles_matrix
    all_triples_matrix = game_triples_matrix
    all_home_runs_matrix = game_home_runs_matrix



all_strikeouts_matrix.drop_duplicates(keep='first')
all_walks_matrix.drop_duplicates(keep='first')
all_hits_matrix.drop_duplicates(keep='first')
all_singles_matrix.drop_duplicates(keep='first')
all_doubles_matrix.drop_duplicates(keep='first')
all_triples_matrix.drop_duplicates(keep='first')
all_home_runs_matrix.drop_duplicates(keep='first')

Unnamed: 0,game_id,Name,pitcher_id,1,2,3,4,5,6,7,8,9,Total
0,529406,Chris Archer,502042,0,1,0,0,0,0,0,0,0,1
1,529406,Chris Sale,519242,0,0,0,0,0,0,0,0,0,0
2,529406,Austin Pruitt,643493,0,0,0,0,0,0,0,0,0,0
3,529406,Matt Barnes,598264,0,0,0,0,0,0,0,0,0,0
4,529406,Joe Kelly,523260,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,745993,Lance Lynn,458681,0,0,0,0,0,0,0,0,0,0
84,745993,Matthew Liberatore,669461,0,0,0,0,0,0,0,0,0,0
85,745993,Kyle Leahy,681517,0,0,0,0,0,0,0,0,0,0
86,745993,Kevin Herget,643361,0,0,0,0,0,0,0,0,1,1


In [24]:
all_strikeouts_matrix.to_csv('datasets/k_per_inning.csv', index=False)
all_walks_matrix.to_csv('datasets/bb_per_inning.csv', index=False)
all_hits_matrix.to_csv('datasets/h_per_inning.csv', index=False)
all_singles_matrix.to_csv('datasets/singles_per_inning.csv', index=False)
all_doubles_matrix.to_csv('datasets/doubles_per_inning.csv', index=False)
all_triples_matrix.to_csv('datasets/triples_per_inning.csv', index=False)
all_home_runs_matrix.to_csv('datasets/hr_per_inning.csv', index=False)

In [25]:
all_play_by_play[all_play_by_play['atBatResult'] == 'Home Run']

Unnamed: 0,game_ID,atBatResult,halfInning,inning,batter_id,pitcher_id,batter_name,pitcher_name,batter_hand,pitcher_hand,play_description,isInPlay,isStrike,isBall,pitchNumber,pitch_type,isOut,current_balls,current_strikes,current_outs,pitchStartSpeed,pitchEndSpeed,pitchStrikeZoneTop,pitchStikeZoneBottom,pitchZone,pitchTypeConfidence,aY,aZ,pfxX,pfxZ,pX,pZ,vX0,vY0,vZ0,x,y,x0,y0,z0,aX,breakAngle,breakLength,breakY,breakVertical,breakVerticalInduced,breakHorizontal,spinRate,spinDirection,pitchesThrownByCurrentPitcher,cumulativeBalls,cumulativeStrikes,stat_category
40,529406,Home Run,top,2,456488,502042,Eduardo Núñez,Chris Archer,R,R,"In play, run(s)",True,False,False,1,Slider,False,0,0,1,87.3,81.0,3.330474,1.501941,8.0,2.00,24.457531,-31.290023,2.924446,0.538590,0.115494,1.434052,4.224346,-126.920293,-5.903835,112.60,200.06,-1.946294,50.000000,6.262973,4.800094,9.6,8.4,24.0,-37.0,-0.0,-6.4,2461.0,90.0,28,0,0,Hit
277,529407,Home Run,top,1,664023,570632,Ian Happ,José Ureña,L,R,"In play, run(s)",True,False,False,1,Sinker,False,0,0,0,95.5,87.1,3.600602,1.689697,5.0,2.00,31.500366,-16.732196,-9.880247,7.938027,0.242677,2.533997,7.766284,-138.716117,-6.034276,107.75,170.38,-1.312016,50.000000,5.855881,-19.220056,44.4,4.8,24.0,-17.2,13.6,16.6,2168.0,230.0,1,0,0,Hit
346,529407,Home Run,top,2,519203,570632,Anthony Rizzo,José Ureña,L,R,"In play, run(s)",True,False,False,1,Sinker,False,0,0,2,95.8,86.0,3.558426,1.589525,3.0,2.00,36.611748,-18.279148,-12.122561,7.234955,0.305946,3.018241,7.616279,-139.025589,-4.576837,105.35,157.37,-0.921140,50.000000,5.931326,-23.281661,48.0,6.0,24.0,-18.5,12.2,20.2,2083.0,238.0,49,0,0,Hit
502,529407,Home Run,top,7,656941,594027,Kyle Schwarber,Tayron Guerrero,L,R,Ball,False,False,True,1,Four-Seam Fastball,False,1,0,1,96.2,86.5,3.569637,1.606576,11.0,2.00,36.681060,-14.297889,-6.071075,9.172642,-1.245201,3.899938,4.266498,-139.946166,-3.902308,164.47,133.52,-2.014358,50.000000,6.272837,-11.831651,34.8,3.6,24.0,-14.0,15.6,9.8,2015.0,212.0,24,10,0,Hit
503,529407,Home Run,top,7,656941,594027,Kyle Schwarber,Tayron Guerrero,L,R,Ball,False,False,True,2,Four-Seam Fastball,False,2,0,1,96.5,87.2,3.445518,1.544491,14.0,2.00,34.752401,-11.995772,-8.697727,10.321076,1.540580,1.894824,11.301392,-139.729338,-9.713041,58.26,187.75,-1.447581,50.000000,6.227741,-17.004538,44.4,4.8,24.0,-13.2,16.4,13.0,2126.0,218.0,25,11,0,Hit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,745827,Home Run,bottom,7,596019,450203,Francisco Lindor,Charlie Morton,L,R,"In play, run(s)",True,False,False,1,Four-Seam Fastball,False,0,0,0,90.7,83.3,3.380000,1.530000,2.0,0.86,26.428946,-19.491112,-7.874889,7.154851,0.140873,2.984710,7.891589,-131.914219,-2.474638,111.63,158.19,-1.857505,50.005131,5.362021,-13.966583,32.4,4.8,24.0,-20.5,12.6,12.7,2279.0,239.0,91,0,0,Hit
230,745094,Home Run,top,9,519203,542888,Anthony Rizzo,Shawn Armstrong,L,R,"In play, run(s)",True,False,False,1,Cutter,False,0,0,0,89.4,82.0,3.730000,1.780000,6.0,0.90,27.797554,-23.516589,3.076838,5.052810,0.355277,2.427313,3.725309,-130.223539,-3.996662,103.46,173.24,-1.494642,50.005484,5.764729,5.275925,14.4,6.0,24.0,-25.7,8.5,-6.4,2572.0,164.0,1,0,0,Hit
289,745993,Home Run,top,9,669357,643361,Nolan Gorman,Kevin Herget,L,R,Swinging Strike,False,True,False,1,Changeup,False,0,1,1,82.5,75.6,3.400000,1.610000,4.0,0.90,23.681748,-23.157424,-8.994338,6.201363,-0.361933,2.216361,5.072456,-119.967152,-4.040791,130.80,178.94,-1.337495,50.004719,5.992087,-13.083038,26.4,7.2,24.0,-29.9,10.4,15.0,1893.0,238.0,39,0,20,Hit
290,745993,Home Run,top,9,669357,643361,Nolan Gorman,Kevin Herget,L,R,Ball,False,False,True,2,Changeup,False,1,1,1,82.2,75.1,3.400000,1.610000,11.0,0.90,23.847933,-23.748031,-10.307456,5.833763,-0.819324,3.565991,4.438908,-119.646554,-0.814480,148.23,142.50,-1.362995,50.004163,6.045796,-14.892092,30.0,7.2,24.0,-30.0,10.7,17.5,2225.0,240.0,40,11,0,Hit


In [26]:
# WRITE TO CSV
# RERUN WITH NEW VALUES

## Test Matrix puller

In [27]:
game_id_list = all_play_by_play['game_ID'].unique()


game_id = game_id_list[-1]


game_strikeouts_matrix = []
game_walks_matrix = []
game_hits_matrix = []


for game_id in game_id_list[-2:]:
    game_play_by_play = all_play_by_play[all_play_by_play['game_ID'] == game_id]

    strikeout_df = game_play_by_play[(game_play_by_play['atBatResult'] == 'Strikeout')]
    strikeout_df = strikeout_df.drop_duplicates(subset=['game_ID', 'atBatResult', 'inning', 'batter_id', 'pitcher_id'], keep='last')

    walks_df = game_play_by_play[(game_play_by_play['atBatResult'] == 'Walk')]
    walks_df = walks_df.drop_duplicates(subset=['game_ID', 'atBatResult', 'inning', 'batter_id', 'pitcher_id'], keep='last')

    hits_df = game_play_by_play[(game_play_by_play['atBatResult'].isin(['Single', 'Double', 'Triple', 'Homerun']))]
    hits_df = hits_df.drop_duplicates(subset=['game_ID', 'atBatResult', 'inning', 'batter_id', 'pitcher_id'], keep='last')


    for pitcher_name in game_play_by_play['pitcher_name'].unique():
        k_total = 0
        bb_total = 0
        hit_total = 0


        k_pitcher_dict = {'game_id' : game_id, 'Name' : pitcher_name, 'pitcher_id' : game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]['pitcher_id'].iloc[0]}
        bb_pitcher_dict = {'game_id' : game_id, 'Name' : pitcher_name, 'pitcher_id' : game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]['pitcher_id'].iloc[0]}
        hit_pitcher_dict = {'game_id' : game_id, 'Name' : pitcher_name, 'pitcher_id' : game_play_by_play[game_play_by_play['pitcher_name'] == pitcher_name]['pitcher_id'].iloc[0]}

        for inning in range(1,10):
            inning_df = strikeout_df[(strikeout_df['pitcher_name'] == pitcher_name) & (strikeout_df['inning'] == inning)]
            #inning_df = inning_df.drop_duplicates(subset=['inning','batter_id','pitcher_id','atBatResult'], keep='last')
            k_pitcher_dict[inning] = len(inning_df)
            k_total += len(inning_df)

            inning_df = walks_df[(walks_df['pitcher_name'] == pitcher_name) & (walks_df['inning'] == inning)]
            bb_pitcher_dict[inning] = len(inning_df)
            bb_total += len(inning_df)

            inning_df = hits_df[(hits_df['pitcher_name'] == pitcher_name) & (hits_df['inning'] == inning)]
            hit_pitcher_dict[inning] = len(inning_df)
            hit_total += len(inning_df)

            

            
        
        k_pitcher_dict['Total'] = k_total
        bb_pitcher_dict['Total'] = bb_total
        hit_pitcher_dict['Total'] = hit_total


        game_strikeouts_matrix.append(k_pitcher_dict)
        game_walks_matrix.append(bb_pitcher_dict)
        game_hits_matrix.append(hit_pitcher_dict)

pd.DataFrame(game_hits_matrix)


Unnamed: 0,game_id,Name,pitcher_id,1,2,3,4,5,6,7,8,9,Total
0,745094,Taj Bradley,671737,0,0,0,2,1,1,0,0,0,4
1,745094,Clarke Schmidt,657376,1,0,1,1,1,1,0,0,0,5
2,745094,Kevin Kelly,687330,0,0,0,0,0,0,2,0,0,2
3,745094,Garrett Cleavinger,664076,0,0,0,0,0,0,0,0,0,0
4,745094,Nick Burdi,595897,0,0,0,0,0,0,0,0,0,0
5,745094,Jason Adam,592094,0,0,0,0,0,0,0,0,0,0
6,745094,Luke Weaver,596133,0,0,0,0,0,0,0,0,0,0
7,745094,Shawn Armstrong,542888,0,0,0,0,0,0,0,0,2,2
8,745094,Clay Holmes,605280,0,0,0,0,0,0,0,0,1,1
9,745993,Robert Gasser,688107,0,1,1,0,0,0,0,0,0,2


In [28]:
all_batter_boxscores.to_csv('datasets/batterBoxscores.csv', index= False)
all_pitcher_boxscores.to_csv('datasets/pitcherBoxscores.csv', index=False)

In [29]:
all_strikeouts_matrix

Unnamed: 0,game_id,Name,pitcher_id,1,2,3,4,5,6,7,8,9,Total
0,529406,Chris Archer,502042,1,0,1,1,1,2,0,0,0,6
1,529406,Chris Sale,519242,2,2,1,1,1,2,0,0,0,9
2,529406,Austin Pruitt,643493,0,0,0,0,0,0,0,0,0,0
3,529406,Matt Barnes,598264,0,0,0,0,0,0,0,0,0,0
4,529406,Joe Kelly,523260,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,745993,Lance Lynn,458681,2,1,0,0,2,0,0,0,0,5
84,745993,Matthew Liberatore,669461,0,0,0,0,1,0,0,0,0,1
85,745993,Kyle Leahy,681517,0,0,0,0,0,1,1,0,0,2
86,745993,Kevin Herget,643361,0,0,0,0,0,0,0,0,0,0
