In [47]:
## Courtesy of Robert Frey's GitHub
## https://github.com/robert-frey/YouTube/blob/master/Minor%20League%20Statcast%20Data%20in%20Python/sc_search_minor_leagues.py

import numpy as np
import pandas as pd
import requests
import datetime
import io

def validate_datestring(date_text):
    """
    Check that date_text is a date string in YYYY-MM-DD form.

    Returns None when the string parses; raises ValueError otherwise.
    """
    expected_format = '%Y-%m-%d'
    try:
        # Only the parse attempt matters; the parsed value is discarded.
        datetime.datetime.strptime(date_text, expected_format)
    except ValueError:
        raise ValueError("Incorrect data format, should be YYYY-MM-DD")

def sanitize_input(start_dt, end_dt):
    """
    Fill in missing date bounds and validate both as YYYY-MM-DD strings.

    - Neither date given: start becomes yesterday, end becomes today, and a
      warning is printed.
    - Exactly one date given: the other bound is set to the same date, so the
      query covers that single day.

    Returns the (start_dt, end_dt) pair; raises ValueError (from
    validate_datestring) if either one is not a valid date string.
    """
    if start_dt is None and end_dt is None:
        now = datetime.datetime.today()
        yesterday = now - datetime.timedelta(1)
        start_dt = yesterday.strftime("%Y-%m-%d")
        end_dt = now.strftime("%Y-%m-%d")
        print("Warning: no date range supplied. Returning yesterday's Statcast data. For a different date range, try get_statcast(start_dt, end_dt).")
    else:
        # At least one bound was supplied; mirror it onto the missing one.
        start_dt = end_dt if start_dt is None else start_dt
        end_dt = start_dt if end_dt is None else end_dt

    # Both bounds are set at this point; check each is well-formed.
    for date_text in (start_dt, end_dt):
        validate_datestring(date_text)
    return start_dt, end_dt

def single_game_request(game_pk):
    """
    Download the minor-league Statcast CSV for one game.

    game_pk is substituted into the Baseball Savant CSV URL; the response body
    is decoded as UTF-8 and parsed with pandas. Returns the resulting DataFrame.
    """
    endpoint = "https://baseballsavant.mlb.com/statcast-search-minors/csv?all=true&type=details&minors=true&game_pk={game_pk}"
    # timeout=None: wait indefinitely for the server to answer.
    response = requests.get(endpoint.format(game_pk=game_pk), timeout=None)
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))

def small_request(start_dt,end_dt, player_type, Level):
    """
    Run one minor-league Statcast search and return the result as a DataFrame.

    start_dt / end_dt fill the game_date_gt / game_date_lt query parameters,
    player_type fills player_type, and Level fills hfLevel. The CSV response is
    decoded as UTF-8 and parsed with pandas.
    """
    query_url = f"https://baseballsavant.mlb.com/statcast-search-minors/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type={player_type}&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt={start_dt}&game_date_lt={end_dt}&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&hfLevel={Level}&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details&minors=true&"
    # Sample search-page URL kept from the original author for reference:
    #https://baseballsavant.mlb.com/statcast-search-minors?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2025%7C&hfSit=&player_type=batter&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInn=&hfBBT=&hfFlag=is%5C.%5C.tracked%7C&hfLevel=AAA%7C&metric_1=&hfTeamAffiliate=&hfOpponentAffiliate=&group_by=name&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&chk_is..tracked=on#results
    # timeout=None: wait indefinitely for the server to answer.
    response = requests.get(query_url, timeout=None)
    csv_text = response.content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))

def large_request(start_dt,end_dt,d1,d2,player_type, Level, step,verbose):
    """
    Fetch a long date range as a series of short sub-queries and combine them.

    The range d1..d2 is walked in consecutive windows; each window is fetched
    with small_request and every successful result is concatenated into one
    DataFrame.

    INPUTS:
    start_dt: YYYY-MM-DD string; overwritten with each window's start date
    end_dt: YYYY-MM-DD string for the final day of the whole query
    d1, d2: datetime objects for the first and last day of the query, used for
            date arithmetic (d1 is advanced as windows are consumed)
    player_type, Level: passed straight through to small_request
    step: days added to a window's start date to get its end date
    verbose: when true, print a line for each completed window

    A result counts as successful when it has more than one row (per the
    original author's note, a failed request comes back as a single row saying
    "Error: Query Timeout"). A failing window is retried up to three times and
    is then split into "all but the last day" and "the last day"; pieces that
    still fail are reported and skipped.

    Returns the concatenated DataFrame. Raises ValueError when no sub-query
    returned any data.
    """
    max_retries = 3  # extra attempts per window before it is split in two
    print("This is a large query, it may take a moment to complete")
    dataframe_list = []
    # d is the end date of the current window; d1 is its start date.
    d = d1 + datetime.timedelta(days=step)
    while d <= d2: # while intermediate query end_dt <= global query end_dt, keep looping
        # dates before 3/15 and after 11/15 will always be offseason
        # if these dates are detected, check if the next season is within the user's query
        # if yes, fast-forward to the next season to avoid empty requests
        # if no, break the loop. all useful data has been pulled.
        # NOTE(review): this test only matches days 1-14 of Jan-Mar (on d) and
        # days 15+ of Nov-Dec (on d1), which is narrower than the intent stated
        # above, and the fast-forward always jumps to d1.year+1. Left unchanged
        # here -- confirm the intended season bounds before tightening it.
        if ((d.month < 4 and d.day < 15) or (d1.month > 10 and d1.day > 14)):
            if d2.year > d.year:
                print('Skipping offseason dates')
                d1 = d1.replace(month=3,day=15,year=d1.year+1)
                d = d1 + datetime.timedelta(days=step+1)
            else:
                break

        start_dt = d1.strftime('%Y-%m-%d')
        intermediate_end_dt = d.strftime('%Y-%m-%d')
        data = small_request(start_dt,intermediate_end_dt, player_type, Level)

        # Retry a failed window. The counter is local to this window so an
        # earlier window's failures cannot eat into this window's retries.
        retries = 0
        while data.shape[0] <= 1 and retries < max_retries:
            data = small_request(start_dt,intermediate_end_dt, player_type, Level)
            retries += 1

        if data.shape[0] > 1:
            dataframe_list.append(data)
            if verbose:
                print("Completed sub-query from {} to {}".format(start_dt,intermediate_end_dt))
        else:
            # this request is probably too large. Cut a day off of this request and make that its own separate request.
            # For each, append to dataframe list if successful, skip and print error message if failed
            tmp_end = (d - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
            pieces = [(start_dt, tmp_end), (intermediate_end_dt, intermediate_end_dt)]
            for piece_start, piece_end in pieces:
                piece = small_request(piece_start, piece_end, player_type, Level)
                if piece.shape[0] > 1:
                    dataframe_list.append(piece)
                    print("Completed sub-query from {} to {}".format(piece_start,piece_end))
                else:
                    print("Query unsuccessful for data from {} to {}. Skipping these dates.".format(piece_start,piece_end))

        # increment dates
        d1 = d + datetime.timedelta(days=1)
        d = d + datetime.timedelta(days=step+1)

    # if start date <= end date after the loop, a few leftover dates remain
    # (fewer than a full window, or everything after an offseason break).
    if d1 <= d2:
        # the window start carries on from the loop, but the end is now the original end_dt
        start_dt = d1.strftime('%Y-%m-%d')
        # Level must be forwarded here too; small_request requires it.
        data = small_request(start_dt,end_dt, player_type, Level)
        # Apply the same success test as the main loop so an error row is
        # never concatenated into the result.
        if data.shape[0] > 1:
            dataframe_list.append(data)
            if verbose:
                print("Completed sub-query from {} to {}".format(start_dt,end_dt))
        else:
            print("Query unsuccessful for data from {} to {}. Skipping these dates.".format(start_dt,end_dt))

    if not dataframe_list:
        # pd.concat would raise a ValueError here anyway; raise the same type
        # with a message that says what actually went wrong.
        raise ValueError("No data was returned for any sub-query ending {}.".format(end_dt))

    # concatenate all dataframes into final result set
    final_data = pd.concat(dataframe_list, axis=0)
    return final_data

def postprocessing(data, team):
    """
    Clean a raw Statcast DataFrame and optionally restrict it to one team.

    Steps, in order:
    1. Replace blank strings and the literal string 'null' with NaN.
    2. Cast the known numeric columns that are present to float.
    3. Parse game_date as a datetime and sort descending by game_date,
       game_pk, at_bat_number and pitch_number.
    4. If team is given, keep only rows where it is the home or away team.
    5. Reset the index (the old index is kept as an 'index' column).

    INPUTS:
    data: DataFrame as returned by one of the request functions. It is not
          modified; a new frame is returned.
    team: None for no filtering, or a team abbreviation. An abbreviation is
          accepted when it is in the built-in MLB list or appears in the
          home_team / away_team columns of data (minor-league codes differ
          from the MLB ones).

    Raises ValueError for a team abbreviation that is neither in the built-in
    list nor present in the data.
    """
    # Work on new frames so the caller's DataFrame is left untouched.
    data = data.replace(r'^\s*$', np.nan, regex=True)
    data = data.replace(r'^null$', np.nan, regex=True)

    numeric_cols = ['release_speed','release_pos_x','release_pos_z','batter','pitcher','zone','hit_location','balls',
                    'strikes','game_year','pfx_x','pfx_z','plate_x','plate_z','on_3b','on_2b','on_1b','outs_when_up','inning',
                    'hc_x','hc_y','fielder_2','vx0','vy0','vz0','ax','ay','az','sz_top','sz_bot',
                    'hit_distance_sc','launch_speed','launch_angle','effective_speed','release_spin_rate','release_extension',
                    'game_pk','fielder_3','fielder_4','fielder_5',
                    'fielder_6','fielder_7','fielder_8','fielder_9','release_pos_y',
                    'estimated_ba_using_speedangle','estimated_woba_using_speedangle','woba_value','woba_denom','babip_value',
                    'iso_value','launch_speed_angle','at_bat_number','pitch_number','home_score','away_score','bat_score',
                    'fld_score','post_away_score','post_home_score','post_bat_score','post_fld_score']

    # Only convert the columns this response actually contains, so a missing
    # column does not abort the whole post-processing step with a KeyError.
    present_numeric = [col for col in numeric_cols if col in data.columns]
    if present_numeric:
        data = data.astype({col: float for col in present_numeric})

    # convert date col to datetime data type and sort so that this returns in an order that makes sense (by date and game)
    data['game_date'] = pd.to_datetime(data['game_date'], format='%Y-%m-%d')
    data = data.sort_values(['game_date', 'game_pk', 'at_bat_number', 'pitch_number'], ascending=False)

    # select only pitches from a particular team
    if team is not None:
        valid_teams = ['MIN', 'PHI', 'BAL', 'NYY', 'LAD', 'OAK', 'SEA', 'TB', 'MIL', 'MIA',
           'KC', 'TEX', 'CHC', 'ATL', 'COL', 'HOU', 'CIN', 'LAA', 'DET', 'TOR',
           'PIT', 'NYM', 'CLE', 'CWS', 'STL', 'WSH', 'SF', 'SD', 'BOS','ARI','ANA','WAS']
        # Abbreviations that occur in this data set are accepted as well as
        # the MLB ones above.
        teams_in_data = list(set(data['home_team'].dropna()) | set(data['away_team'].dropna()))
        if team not in valid_teams and team not in teams_in_data:
            raise ValueError('Error: invalid team abbreviation. Valid team names are: {}'.format(valid_teams))
        data = data.loc[(data['home_team']==team)|(data['away_team']==team)]

    data = data.reset_index()
    return data

def statcast_minor_leagues(start_dt=None, end_dt=None, team=None, verbose=True, player_type = 'pitcher', Level = 'AAA'):
    """
    Pull minor-league Statcast play-level data from Baseball Savant for a date range.

    INPUTS:
    start_dt: YYYY-MM-DD : first date of the query
    end_dt: YYYY-MM-DD : last date of the query
    team: optional (defaults to None) : team abbreviation to filter on, handed to postprocessing
    verbose: optional (defaults to True) : forwarded to large_request to control per-chunk progress messages
    player_type: optional (defaults to 'pitcher') : value for the search's player_type parameter
    Level: optional (defaults to 'AAA') : value for the search's hfLevel parameter

    Missing dates are filled in by sanitize_input: with no dates the range is
    yesterday through today; with one date the query covers just that day.
    Ranges spanning more than 3 days are fetched in chunks by large_request,
    shorter ones with a single small_request.

    Returns the post-processed DataFrame.
    """
    start_dt, end_dt = sanitize_input(start_dt, end_dt)
    if not (start_dt and end_dt):
        return None

    # Longest span (in days) that is still fetched with one request.
    max_single_request_days = 3

    date_format = "%Y-%m-%d"
    first_day = datetime.datetime.strptime(start_dt, date_format)
    last_day = datetime.datetime.strptime(end_dt, date_format)
    span_days = (last_day - first_day).days

    if span_days > max_single_request_days:
        raw = large_request(start_dt,end_dt,first_day,last_day, player_type, Level, step=max_single_request_days,verbose=verbose)
    else:
        raw = small_request(start_dt,end_dt, player_type, Level)

    return postprocessing(raw, team)

def statcast_minor_league_single_game(game_pk, team=None):
    """
    Pull minor-league Statcast play-level data from Baseball Savant for one game.

    INPUTS:
    game_pk : MLB game ID of the game to retrieve (game_pk in statcast data)
    team : optional (defaults to None) : team abbreviation handed to postprocessing

    Returns the post-processed DataFrame for that game.
    """
    return postprocessing(single_game_request(game_pk), team)

In [48]:
# Fetch AAA data with player_type='batter' for 2024-03-20 through 2024-10-01.
# The range is longer than 3 days, so it is pulled in chunks by large_request.
minor_league_data_AAA = statcast_minor_leagues('2024-03-20', '2024-10-01', 
                                               player_type='batter', Level ='AAA')

This is a large query, it may take a moment to complete
Query unsuccessful for data from 2024-03-20 to 2024-03-22. Skipping these dates.
Query unsuccessful for data from 2024-03-23 to 2024-03-23. Skipping these dates.
Query unsuccessful for data from 2024-03-24 to 2024-03-26. Skipping these dates.
Query unsuccessful for data from 2024-03-27 to 2024-03-27. Skipping these dates.
Completed sub-query from 2024-03-28 to 2024-03-31
Completed sub-query from 2024-04-01 to 2024-04-04
Completed sub-query from 2024-04-05 to 2024-04-08
Completed sub-query from 2024-04-09 to 2024-04-12
Completed sub-query from 2024-04-13 to 2024-04-16
Completed sub-query from 2024-04-17 to 2024-04-20
Completed sub-query from 2024-04-21 to 2024-04-24
Completed sub-query from 2024-04-25 to 2024-04-28
Completed sub-query from 2024-04-29 to 2024-05-02
Completed sub-query from 2024-05-03 to 2024-05-06
Completed sub-query from 2024-05-07 to 2024-05-10
Completed sub-query from 2024-05-11 to 2024-05-14
Completed sub-query 

In [23]:
# Remove the column display limit so every column is shown when a DataFrame is printed.
pd.set_option('display.max_columns', None)

In [50]:
# Display the fetched DataFrame as the cell output.
minor_league_data_AAA

Unnamed: 0,index,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,fielder_2,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,bat_speed,swing_length,estimated_slg_using_speedangle,delta_pitcher_run_exp,hyper_speed,home_score_diff,bat_score_diff,home_win_exp,bat_win_exp,age_pit_legacy,age_bat_legacy,age_pit,age_bat,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,134,FF,2024-09-26,97.1,-2.17,5.13,"Straw, Myles",664702.0,656186.0,field_out,hit_into_play,,,,,12.0,Myles Straw flies out to center fielder Drew W...,W,R,R,OMA,COL,X,8.0,fly_ball,1.0,2.0,2024.0,-0.91,1.29,0.06,3.57,,,672356.0,2.0,9.0,Top,137.03,73.20,,,,,7.990519,-141.163496,-1.549005,-14.072148,33.728822,-14.826361,3.05,1.54,314.0,89.6,37.0,96.0,2349.0,5.8,769022.0,657247.0,668472.0,686475.0,681987.0,687478.0,670231.0,671221.0,694355.0,54.65,0.006,0.005,0.0,1.0,0.0,0.0,3.0,78.0,4.0,4-Seam Fastball,7.0,3.0,3.0,7.0,3.0,7.0,3.0,7.0,Standard,Standard,226.0,0.003,-0.196,,,0.006,0.196,89.6,4,-4,0.997,0.003,31,29,32,30,1,4,,,,,1.14,0.91,0.91,
1,145,FF,2024-09-26,98.1,-2.31,5.07,"Straw, Myles",664702.0,656186.0,,foul,,,,,11.0,Myles Straw flies out to center fielder Drew W...,W,R,R,OMA,COL,S,,,1.0,1.0,2024.0,-0.81,1.07,-1.39,2.30,,,672356.0,2.0,9.0,Top,,,,,,,4.305881,-142.807576,-4.437999,-12.077641,32.827829,-16.767032,3.05,1.54,3.0,70.4,-38.0,97.0,2371.0,5.6,769022.0,657247.0,668472.0,686475.0,681987.0,687478.0,670231.0,671221.0,694355.0,54.86,,,,,,,,78.0,3.0,4-Seam Fastball,7.0,3.0,3.0,7.0,3.0,7.0,3.0,7.0,Standard,Standard,220.0,0.000,-0.071,,,,0.071,88.0,4,-4,0.997,0.003,31,29,32,30,1,4,,,,,1.30,0.81,0.81,
2,148,FF,2024-09-26,96.8,-2.30,5.11,"Straw, Myles",664702.0,656186.0,,foul,,,,,2.0,Myles Straw flies out to center fielder Drew W...,W,R,R,OMA,COL,S,,,1.0,0.0,2024.0,-0.69,1.38,0.12,2.83,,,672356.0,2.0,9.0,Top,,,,,,,7.960194,-140.742264,-3.665808,-10.942581,30.692805,-13.096391,3.05,1.54,215.0,75.3,59.0,96.0,2351.0,5.8,769022.0,657247.0,668472.0,686475.0,681987.0,687478.0,670231.0,671221.0,694355.0,54.72,,,,,,,,78.0,2.0,4-Seam Fastball,7.0,3.0,3.0,7.0,3.0,7.0,3.0,7.0,Standard,Standard,221.0,0.000,-0.067,,,,0.067,88.0,4,-4,0.997,0.003,31,29,32,30,1,4,,,,,1.04,0.69,0.69,
3,159,FF,2024-09-26,97.5,-2.32,5.04,"Straw, Myles",664702.0,656186.0,,ball,,,,,13.0,Myles Straw flies out to center fielder Drew W...,W,R,R,OMA,COL,B,,,0.0,0.0,2024.0,-0.79,1.18,-0.21,0.90,,,672356.0,2.0,9.0,Top,,,,,,,7.386315,-141.680415,-8.244513,-12.283029,30.882955,-14.580450,3.05,1.54,,,,96.6,2327.0,5.7,769022.0,657247.0,668472.0,686475.0,681987.0,687478.0,670231.0,671221.0,694355.0,54.77,,,,,,,,78.0,1.0,4-Seam Fastball,7.0,3.0,3.0,7.0,3.0,7.0,3.0,7.0,Standard,Standard,220.0,0.000,0.049,,,,-0.049,,4,-4,0.997,0.003,31,29,32,30,1,4,,,,,1.22,0.79,0.79,
4,165,SL,2024-09-26,89.1,-2.41,5.16,"Arias, Gabriel",672356.0,656186.0,single,hit_into_play,,,,,4.0,Gabriel Arias singles on a ground ball to shor...,W,R,R,OMA,COL,X,6.0,ground_ball,2.0,2.0,2024.0,0.47,0.01,-0.70,2.59,,,,2.0,9.0,Top,136.61,170.43,,,,,3.167562,-129.848487,-0.405380,4.787109,23.794867,-32.214603,3.25,1.64,1.0,77.4,-63.0,88.5,2724.0,5.6,769022.0,657247.0,668472.0,686475.0,681987.0,687478.0,670231.0,671221.0,694355.0,54.92,0.303,0.294,0.9,1.0,1.0,0.0,2.0,77.0,5.0,Slider,7.0,3.0,3.0,7.0,3.0,7.0,3.0,7.0,Standard,Standard,99.0,-0.002,0.388,,,0.306,-0.388,88.0,4,-4,0.999,0.001,31,24,32,24,1,4,,,,,2.83,-0.47,-0.47,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
675320,12547,SI,2024-03-29,89.1,3.19,5.70,"Pages, Andy",681624.0,646242.0,field_out,hit_into_play,,,,,6.0,Andy Pages flies out to center fielder Jonatan...,R,R,L,TAC,OKC,X,8.0,fly_ball,0.0,0.0,2024.0,1.20,0.62,0.35,2.46,,,,1.0,1.0,Top,125.16,39.37,,,,,-9.448799,-129.388677,-3.242367,15.431153,25.629513,-24.675726,3.23,1.63,409.0,107.3,27.0,88.8,2285.0,6.1,751442.0,657709.0,613564.0,656308.0,605119.0,669208.0,669392.0,682729.0,687799.0,54.39,0.963,1.942,0.0,1.0,0.0,0.0,6.0,2.0,1.0,Sinker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Standard,Standard,128.0,0.015,-0.290,,,3.766,0.290,107.3,0,0,0.522,0.478,27,23,28,24,1,0,,,,,2.25,1.20,-1.20,
675321,12710,SL,2024-03-29,79.8,3.18,5.78,"Owings, Chris",572008.0,646242.0,strikeout,swinging_strike,,,,,8.0,Chris Owings strikes out swinging.,R,R,L,TAC,OKC,S,2.0,,1.0,2.0,2024.0,-1.38,0.10,-0.19,1.89,,,,0.0,1.0,Top,,,,,,,-4.749370,-116.068777,-1.912888,-11.365154,24.417321,-31.139187,3.12,1.57,,,,78.4,2825.0,5.6,751442.0,657709.0,613564.0,656308.0,605119.0,669208.0,669392.0,682729.0,687799.0,54.86,,0.000,0.0,1.0,0.0,0.0,,1.0,4.0,Slider,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Standard,Standard,294.0,0.022,-0.210,,,,0.210,,0,0,0.500,0.500,27,32,28,33,1,0,,,,,3.52,-1.38,1.38,
675322,12836,SI,2024-03-29,88.4,3.28,5.71,"Owings, Chris",572008.0,646242.0,,called_strike,,,,,7.0,Chris Owings strikes out swinging.,R,R,L,TAC,OKC,S,,,1.0,1.0,2024.0,0.97,0.46,-0.33,1.61,,,,0.0,1.0,Top,,,,,,,-10.698955,-128.343949,-4.854997,12.806192,25.043508,-26.213066,3.12,1.57,,,,87.7,2314.0,5.8,751442.0,657709.0,613564.0,656308.0,605119.0,669208.0,669392.0,682729.0,687799.0,54.67,,,,,,,,1.0,3.0,Sinker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Standard,Standard,129.0,0.000,-0.077,,,,0.077,,0,0,0.500,0.500,27,32,28,33,1,0,,,,,2.45,0.97,-0.97,
675323,13030,SI,2024-03-29,88.1,3.25,5.61,"Owings, Chris",572008.0,646242.0,,foul,,,,,14.0,Chris Owings strikes out swinging.,R,R,L,TAC,OKC,S,,,1.0,0.0,2024.0,1.33,0.35,1.08,2.09,,,,0.0,1.0,Top,,,,,,,-7.945536,-128.051163,-3.114438,16.283526,27.726376,-27.899179,3.12,1.57,232.0,76.4,37.0,87.1,2085.0,5.8,751442.0,657709.0,613564.0,656308.0,605119.0,669208.0,669392.0,682729.0,687799.0,54.67,,,,,,,,1.0,2.0,Sinker,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Standard,Standard,117.0,0.000,-0.085,,,,0.085,88.0,0,0,0.500,0.500,27,32,28,33,1,0,,,,,2.61,1.33,-1.33,


In [51]:
# Save the result to CSV; index=False leaves the DataFrame's row index out of the file.
minor_league_data_AAA.to_csv('minor_league_data_AAA.csv', index=False)