In [1]:
import pandas as pd
import pyarrow.dataset as pads
import os

In [2]:
data_path = "/Users/pranavrajaram/SMT-Data-Challenge-2025"

In [7]:
def readDataSubset(table_type, data_path):
    """
    Loads a specified SMT data subset as a PyArrow dataset.
    """
    valid_tables = ['ball_pos', 'game_events', 'game_info', 'player_pos', 'rosters']
    if table_type not in valid_tables:
        print("Invalid data subset name. Please try again with a valid data subset.")
        return None

    if table_type == 'rosters':
        return pads.dataset(source=os.path.join(data_path, 'rosters.csv'), format='csv')
    else:
        
        return pads.dataset(
            source=os.path.join(data_path, table_type),
            format='csv'
        )

In [11]:
game_info_ds = readDataSubset('game_info', data_path)
game_events_ds = readDataSubset('game_events', data_path)
ball_pos_ds = readDataSubset('ball_pos', data_path)
player_pos_ds = readDataSubset('player_pos', data_path)
rosters_ds = readDataSubset('rosters', data_path)

In [12]:
import pyarrow as pa

filter_criteria = (
    (pads.field("home_team") == "QEA")
)

game_info_df = game_info_ds.to_table().to_pandas()
game_events_df = game_events_ds.to_table().to_pandas()

In [14]:
# some keys
position_key = pd.DataFrame({
    "code": [*range(1, 14), 255, 14, 15, 16, 17, 18, 19],
    "position": [
        "pitcher", "catcher", "first baseman", "second baseman", "third baseman",
        "shortstop", "left field", "center field", "right field", "batter",
        "runner on first base", "runner on second base", "runner on third base",
        "ball event with no player (e.g., ball bounce)", "home plate umpire",
        "field umpire", "field umpire", "field umpire",
        "first base coach", "third base coach"
    ]
})

event_key = pd.DataFrame({
    "code": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16],
    "play_type": [
        "pitch", "ball acquired", "throw (ball-in-play)", "ball hit into play", 
        "end of play", "pickoff throw", "ball acquired - unknown field position", 
        "throw (ball-in-play) - unknown field position", "ball deflection", 
        "ball deflection off of wall", "home run", "ball bounce"
    ]
})

In [15]:
big_ie = pd.merge(
    game_events_df, 
    game_info_df, 
    on=['game_str', 'play_per_game'], 
    suffixes=('', '_dup')
)

big_ie = big_ie.loc[:, ~big_ie.columns.str.endswith('_dup')]

big_ie['player_position'] = pd.to_numeric(big_ie['player_position'], errors='coerce').astype('Int64')
big_ie['event_code'] = pd.to_numeric(big_ie['event_code'], errors='coerce').astype('Int64')

big_ie = big_ie.merge(position_key, how='left', left_on='player_position', right_on='code').drop('code', axis=1)

big_ie = big_ie.merge(event_key, how='left', left_on='event_code', right_on='code').drop('code', axis=1)

In [119]:
big_ie.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch,True
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired,True
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play,True
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch,True
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired,True


In [118]:
prev2 = big_ie['event_code'].shift(2)
prev1 = big_ie['event_code'].shift(1)
curr  = big_ie['event_code']
next1 = big_ie['event_code'].shift(-1)
next2 = big_ie['event_code'].shift(-2)

pattern1 = (curr == 5) & (prev1 == 2) & (prev2 == 1)  # non hit pitch
pattern2 = (curr == 1) & (next1 == 2) & (next2 == 5)   
pattern3 = (curr == 2) & (prev1 == 1) & (next1 == 5) 
pattern4 = (curr == 5) & (prev1 == 4) # foul 
pattern5 = (curr == 4) & (next1 == 5) 
pattern6 = (curr == 1) # don't care about pitches

big_ie['non_play'] = pattern1 | pattern2 | pattern3 | pattern4 | pattern5 | pattern6

balls_in_play = big_ie[big_ie['non_play'] == False]
balls_in_play.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play
34,y1_d069_ACN_QEA,141,71,141,5668844,10,4,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1276,ACN-2472,,,batter,ball hit into play,False
35,y1_d069_ACN_QEA,141,71,141,5674144,255,11,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1276,ACN-2472,,,"ball event with no player (e.g., ball bounce)",home run,False
36,y1_d069_ACN_QEA,141,71,141,5674694,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1276,ACN-2472,,,,end of play,False
50,y1_d069_ACN_QEA,146,72,146,5795794,10,4,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1433,,ACN-1623,,batter,ball hit into play,False
51,y1_d069_ACN_QEA,146,72,146,5795894,255,16,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1433,,ACN-1623,,"ball event with no player (e.g., ball bounce)",ball bounce,False


In [34]:
ball_pos_df = ball_pos_ds.to_table().to_pandas()
player_pos_df = player_pos_ds.to_table().to_pandas()

In [None]:
candidate_plays = balls_in_play[
    (balls_in_play['first_baserunner'] != 'NA') | (balls_in_play['second_baserunner'] != 'NA')
]

candidate_plays_small = candidate_plays[candidate_plays['home_team'] == 'QEA']

In [None]:
# Pivot player positions
player_pos_pivot = player_pos_df.pivot_table(
    index=['game_str', 'play_id', 'timestamp'],
    columns='player_position',
    values=['field_x', 'field_y']
)

player_pos_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_x,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y,field_y
Unnamed: 0_level_1,Unnamed: 1_level_1,player_position,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
game_str,play_id,timestamp,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2
y1_d001_CGA_QEA,1,8699,0.1728,0.0555,69.6597,48.7089,-43.3815,-32.7906,-138.6321,-31.1307,109.1400,3.2805,,,,,,,-84.9531,87.1338,-77.6565,56.1612,-5.9649,87.1266,132.0027,70.1199,144.9864,230.9421,307.0320,258.7404,1.0482,,,,,,,83.5506,86.5839,55.5597
y1_d001_CGA_QEA,1,8749,0.1926,0.0270,69.5571,48.6915,-43.3473,-32.7612,-138.5343,-31.1985,109.1367,3.2715,,,,,,,-84.9054,87.0300,-77.6559,55.9401,-5.9316,87.0279,131.9445,70.0149,144.7236,230.9007,306.8640,258.6951,1.0749,,,,,,,83.4954,86.4639,55.5600
y1_d001_CGA_QEA,1,8799,0.2136,-0.0012,69.4509,48.6777,-43.3158,-32.7306,-138.4380,-31.2663,109.1358,3.2625,,,,,,,-84.8586,86.9466,-77.6556,55.7199,-5.8986,86.9268,131.8860,69.9072,144.4596,230.8596,306.6960,258.6498,1.1013,,,,,,,83.4417,86.3532,55.5600
y1_d001_CGA_QEA,1,8849,0.2355,-0.0297,69.3420,48.6672,-43.2861,-32.6991,-138.3426,-31.3341,109.1370,3.2535,,,,,,,-84.8124,86.8710,-77.6550,55.5003,-5.8653,86.8236,131.8281,69.7974,144.1950,230.8182,306.5250,258.6045,1.1280,,,,,,,83.3889,86.2644,55.5603
y1_d001_CGA_QEA,1,8899,0.2580,-0.0582,69.2310,48.6591,-43.2585,-32.6676,-138.2490,-31.4019,109.1400,3.2445,,,,,,,-84.7671,86.8119,-77.6547,55.2816,-5.8323,86.7192,131.7699,69.6861,143.9307,230.7768,306.3570,258.5595,1.1544,,,,,,,83.3376,86.1855,55.5600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
y2_d099_YJD_RZQ,292,10147948,1.0521,0.0000,71.2845,52.4973,-48.1095,-12.5124,-111.0312,-15.5784,118.3521,2.9841,,,,0.0,,,,89.5809,-82.3752,55.5234,-4.5000,94.5525,140.0055,110.9124,153.1509,268.1436,308.7033,257.0847,-0.4713,,,,-6.6,,,,76.5105,80.6157
y2_d099_YJD_RZQ,292,10147998,1.1016,0.0000,71.2005,52.5603,-48.1713,-12.5040,-111.0039,-15.6582,118.3509,3.0297,,,,0.0,,,,89.6301,-82.4028,55.4679,-4.5000,94.5054,139.9809,110.8392,153.1242,268.1094,308.6394,257.0739,-0.4560,,,,-6.6,,,,76.5072,80.6184
y2_d099_YJD_RZQ,292,10148048,1.1517,0.0000,71.1162,52.6233,-48.2328,-12.4956,-110.9769,-15.7377,118.3494,3.0753,,,,0.0,,,,89.6787,-82.4304,55.4112,-4.5000,94.4586,139.9563,110.7657,153.0978,268.0752,308.5755,257.0628,-0.4404,,,,-6.6,,,,76.5042,80.6208
y2_d099_YJD_RZQ,292,10148098,1.2018,0.0000,71.0316,52.6863,-48.2943,-12.4869,-110.9499,-15.8169,118.3482,3.1206,,,,0.0,,,,89.7270,-82.4580,55.3542,-4.5000,94.4121,139.9317,110.6922,153.0711,268.0416,308.5116,257.0520,-0.4248,,,,-6.6,,,,76.5012,80.6229


In [74]:
player_pos_pivot.columns = [f"{coord}_{pos}" for coord, pos in player_pos_pivot.columns]
player_pos_pivot = player_pos_pivot.reset_index()
player_pos_pivot

Unnamed: 0,game_str,play_id,timestamp,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19
0,y1_d001_CGA_QEA,1,8699,0.1728,0.0555,69.6597,48.7089,-43.3815,-32.7906,-138.6321,-31.1307,109.1400,3.2805,,,,,,,-84.9531,87.1338,-77.6565,56.1612,-5.9649,87.1266,132.0027,70.1199,144.9864,230.9421,307.0320,258.7404,1.0482,,,,,,,83.5506,86.5839,55.5597
1,y1_d001_CGA_QEA,1,8749,0.1926,0.0270,69.5571,48.6915,-43.3473,-32.7612,-138.5343,-31.1985,109.1367,3.2715,,,,,,,-84.9054,87.0300,-77.6559,55.9401,-5.9316,87.0279,131.9445,70.0149,144.7236,230.9007,306.8640,258.6951,1.0749,,,,,,,83.4954,86.4639,55.5600
2,y1_d001_CGA_QEA,1,8799,0.2136,-0.0012,69.4509,48.6777,-43.3158,-32.7306,-138.4380,-31.2663,109.1358,3.2625,,,,,,,-84.8586,86.9466,-77.6556,55.7199,-5.8986,86.9268,131.8860,69.9072,144.4596,230.8596,306.6960,258.6498,1.1013,,,,,,,83.4417,86.3532,55.5600
3,y1_d001_CGA_QEA,1,8849,0.2355,-0.0297,69.3420,48.6672,-43.2861,-32.6991,-138.3426,-31.3341,109.1370,3.2535,,,,,,,-84.8124,86.8710,-77.6550,55.5003,-5.8653,86.8236,131.8281,69.7974,144.1950,230.8182,306.5250,258.6045,1.1280,,,,,,,83.3889,86.2644,55.5603
4,y1_d001_CGA_QEA,1,8899,0.2580,-0.0582,69.2310,48.6591,-43.2585,-32.6676,-138.2490,-31.4019,109.1400,3.2445,,,,,,,-84.7671,86.8119,-77.6547,55.2816,-5.8323,86.7192,131.7699,69.6861,143.9307,230.7768,306.3570,258.5595,1.1544,,,,,,,83.3376,86.1855,55.5600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6058959,y2_d099_YJD_RZQ,292,10147948,1.0521,0.0000,71.2845,52.4973,-48.1095,-12.5124,-111.0312,-15.5784,118.3521,2.9841,,,,0.0,,,,89.5809,-82.3752,55.5234,-4.5000,94.5525,140.0055,110.9124,153.1509,268.1436,308.7033,257.0847,-0.4713,,,,-6.6,,,,76.5105,80.6157
6058960,y2_d099_YJD_RZQ,292,10147998,1.1016,0.0000,71.2005,52.5603,-48.1713,-12.5040,-111.0039,-15.6582,118.3509,3.0297,,,,0.0,,,,89.6301,-82.4028,55.4679,-4.5000,94.5054,139.9809,110.8392,153.1242,268.1094,308.6394,257.0739,-0.4560,,,,-6.6,,,,76.5072,80.6184
6058961,y2_d099_YJD_RZQ,292,10148048,1.1517,0.0000,71.1162,52.6233,-48.2328,-12.4956,-110.9769,-15.7377,118.3494,3.0753,,,,0.0,,,,89.6787,-82.4304,55.4112,-4.5000,94.4586,139.9563,110.7657,153.0978,268.0752,308.5755,257.0628,-0.4404,,,,-6.6,,,,76.5042,80.6208
6058962,y2_d099_YJD_RZQ,292,10148098,1.2018,0.0000,71.0316,52.6863,-48.2943,-12.4869,-110.9499,-15.8169,118.3482,3.1206,,,,0.0,,,,89.7270,-82.4580,55.3542,-4.5000,94.4121,139.9317,110.6922,153.0711,268.0416,308.5116,257.0520,-0.4248,,,,-6.6,,,,76.5012,80.6229


In [78]:

cp_pos = candidate_plays_small.merge(
    player_pos_pivot,
    on=['game_str', 'play_id', 'timestamp'],
    how='left'
).sort_values(by=['game_str', 'timestamp', 'play_id'])


In [81]:
cp_pos.head(5)

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19
1049,y1_d001_CGA_QEA,4,2,4,84049,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,batter,ball hit into play,False,0.8787,-1.8429,56.7573,25.7118,-44.52,-25.2048,-133.4985,-32.6625,95.8002,2.4033,49.7067,,,,72.3738,8.6856,,,-81.6675,54.2682,-4.4274,64.0506,122.0712,70.7724,131.5413,230.9931,288.4542,262.5258,-0.0645,78.2334,,,,70.4334,103.8762,,,65.9304
1050,y1_d001_CGA_QEA,4,2,4,86099,255,16,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,"ball event with no player (e.g., ball bounce)",ball bounce,False,0.8067,-0.4992,54.9321,17.034,-47.4159,-40.9239,-126.5103,-32.5383,94.5087,20.697,29.0952,,,,71.6985,8.0838,,,-79.1304,49.8015,-4.2675,65.1348,124.4832,67.9965,125.271,215.0952,285.0882,260.6424,12.4002,99.885,,,,67.0239,103.7217,,,64.14
1051,y1_d001_CGA_QEA,4,2,4,86799,7,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,ball acquired,False,-3.2481,-0.4107,54.2241,14.1813,-50.2191,-50.6505,-123.78,-32.4504,94.2384,33.8289,19.0095,,,,70.6788,7.6758,,,-78.9333,44.5842,-1.7121,64.6944,125.2791,66.3126,123.8514,206.6874,282.5754,259.1673,20.8035,109.614,,,,63.4512,104.1897,,,63.447
1052,y1_d001_CGA_QEA,4,2,4,88399,7,3,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,throw (ball-in-play),False,-12.588,-1.0491,52.9533,8.4936,-55.7001,-70.686,-119.7978,-30.3444,94.3872,56.6424,-0.4104,,,,69.2358,5.9955,,,-79.7931,36.9105,3.2631,63.8367,126.6075,64.5138,121.1397,196.4517,277.6065,255.4932,48.1422,125.7387,,,,57.6384,104.7717,,,63.4905
1053,y1_d001_CGA_QEA,4,2,4,89699,6,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,shortstop,ball acquired,False,-17.7561,-1.4526,54.0573,4.6386,-58.6998,-80.6802,-120.1602,-27.5835,95.9751,60.4419,-4.0221,,,,69.1008,2.5425,,,-79.6893,37.0581,6.8436,63.0855,126.7554,64.5243,121.6113,197.3796,274.5747,252.306,70.8816,126.3249,,,,55.8636,104.7033,,,63.612


In [None]:
ball_pos_pivot = ball_pos_df.pivot_table(
    index=['game_str', 'play_id', 'timestamp'],
    values=['ball_position_x', 'ball_position_y', 'ball_position_z']
).reset_index()

full_plays = cp_pos.merge(
    ball_pos_pivot,
    on=['game_str', 'play_id', 'timestamp'],
    how='left'
)

full_plays.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19,ball_position_x,ball_position_y,ball_position_z
0,y1_d001_CGA_QEA,4,2,4,84049,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,batter,ball hit into play,False,0.8787,-1.8429,56.7573,25.7118,-44.52,-25.2048,-133.4985,-32.6625,95.8002,2.4033,49.7067,,,,72.3738,8.6856,,,-81.6675,54.2682,-4.4274,64.0506,122.0712,70.7724,131.5413,230.9931,288.4542,262.5258,-0.0645,78.2334,,,,70.4334,103.8762,,,65.9304,-0.080938,-1.274727,2.062173
1,y1_d001_CGA_QEA,4,2,4,86099,255,16,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,"ball event with no player (e.g., ball bounce)",ball bounce,False,0.8067,-0.4992,54.9321,17.034,-47.4159,-40.9239,-126.5103,-32.5383,94.5087,20.697,29.0952,,,,71.6985,8.0838,,,-79.1304,49.8015,-4.2675,65.1348,124.4832,67.9965,125.271,215.0952,285.0882,260.6424,12.4002,99.885,,,,67.0239,103.7217,,,64.14,-100.9464,177.8661,-0.328377
2,y1_d001_CGA_QEA,4,2,4,86799,7,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,ball acquired,False,-3.2481,-0.4107,54.2241,14.1813,-50.2191,-50.6505,-123.78,-32.4504,94.2384,33.8289,19.0095,,,,70.6788,7.6758,,,-78.9333,44.5842,-1.7121,64.6944,125.2791,66.3126,123.8514,206.6874,282.5754,259.1673,20.8035,109.614,,,,63.4512,104.1897,,,63.447,-123.9168,206.0034,-0.3963
3,y1_d001_CGA_QEA,4,2,4,88399,7,3,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,throw (ball-in-play),False,-12.588,-1.0491,52.9533,8.4936,-55.7001,-70.686,-119.7978,-30.3444,94.3872,56.6424,-0.4104,,,,69.2358,5.9955,,,-79.7931,36.9105,3.2631,63.8367,126.6075,64.5138,121.1397,196.4517,277.6065,255.4932,48.1422,125.7387,,,,57.6384,104.7717,,,63.4905,-116.8014,196.8126,5.0715
4,y1_d001_CGA_QEA,4,2,4,89699,6,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,shortstop,ball acquired,False,-17.7561,-1.4526,54.0573,4.6386,-58.6998,-80.6802,-120.1602,-27.5835,95.9751,60.4419,-4.0221,,,,69.1008,2.5425,,,-79.6893,37.0581,6.8436,63.0855,126.7554,64.5243,121.6113,197.3796,274.5747,252.306,70.8816,126.3249,,,,55.8636,104.7033,,,63.612,-82.1835,123.7863,2.159709


In [None]:
# logic to detect base running attempts
def detect_attempts(df):

    home_x_thresh = -35
    home_y_thresh = 35

    third_x_thresh = -35
    third_y_thresh = 98

    attempts = []

    for (game_str, play_id), group in df.groupby(['game_str', 'play_id']):

        group_sorted = group.sort_values('timestamp')

        # Hitter positions
        f10_x = group_sorted['field_x_10']
        f10_y = group_sorted['field_y_10']
        f10_second_attempt = (f10_x > 40) & (f10_y > 90)
        f10_third_attempt = (f10_x < -35) & (f10_y < 98)

        # 1B runner positions
        f11_x = group_sorted['field_x_11']
        f11_y = group_sorted['field_y_11']
        f11_home_attempt = (f11_x > home_x_thresh) & (f11_y < home_y_thresh)
        f11_third_attempt = (f11_x < third_x_thresh) & (f11_y < third_y_thresh)

        # 2B runner positions
        f12_x = group_sorted['field_x_12']
        f12_y = group_sorted['field_y_12']
        f12_home_attempt = (f12_x > home_x_thresh) & (f12_y < home_y_thresh)

        if f11_home_attempt.any():
            attempts.append({
                'game_str': game_str,
                'play_id': play_id,
                'player_position': 11,
                'from_base': '1B',
                'to_base': 'Home',
                'attempted': True
            })

        if f11_third_attempt.any():
            attempts.append({
                'game_str': game_str,
                'play_id': play_id,
                'player_position': 11,
                'from_base': '1B',
                'to_base': '3B',
                'attempted': True
            })

        if f12_home_attempt.any():
            attempts.append({
                'game_str': game_str,
                'play_id': play_id,
                'player_position': 12,
                'from_base': '2B',
                'to_base': 'Home',
                'attempted': True
            })

        if f10_second_attempt.any():
            attempts.append({
                'game_str': game_str,
                'play_id': play_id,
                'player_position': 10,
                'from_base': 'Home',
                'to_base': '2B',
                'attempted': True
            })


        if f10_third_attempt.any():
            attempts.append({
                'game_str': game_str,
                'play_id': play_id,
                'player_position': 10,
                'from_base': 'Home',
                'to_base': '3B',
                'attempted': True
            })

    return pd.DataFrame(attempts)


In [None]:
attempts_df = detect_attempts(full_plays)

attempts_df['attempt_label'] = (
    'runner_' + attempts_df['player_position'].astype(str) +
    '_attempt_' + attempts_df['to_base']
)

# Pivot wider
attempts_wide = attempts_df.pivot_table(
    index=['game_str', 'play_id'],
    columns='attempt_label',
    values='attempted',
    aggfunc='first'
).reset_index()

attempts_wide = attempts_wide.fillna(False)

attempts_wide

  attempts_wide = attempts_wide.fillna(False)


attempt_label,game_str,play_id,runner_10_attempt_2B,runner_10_attempt_3B,runner_11_attempt_3B,runner_11_attempt_Home,runner_12_attempt_Home
0,y1_d001_CGA_QEA,17,False,False,False,False,True
1,y1_d001_CGA_QEA,21,True,False,False,False,False
2,y1_d001_CGA_QEA,108,False,True,True,True,True
3,y1_d001_CGA_QEA,114,True,False,False,False,False
4,y1_d001_CGA_QEA,216,False,False,False,False,True
...,...,...,...,...,...,...,...
176,y1_d096_XAX_QEA,16,False,False,True,False,True
177,y1_d096_XAX_QEA,82,False,True,True,True,False
178,y1_d096_XAX_QEA,106,False,False,True,False,False
179,y1_d096_XAX_QEA,153,False,False,True,False,False


In [114]:
# Merge score_attemps with full_plays on game_str, play_id, and player_position
full_plays_att = full_plays.merge(
    attempts_wide,
    on=['game_str', 'play_id'],
    how='left'
)


full_plays_att.head(15)

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19,ball_position_x,ball_position_y,ball_position_z,runner_10_attempt_2B,runner_10_attempt_3B,runner_11_attempt_3B,runner_11_attempt_Home,runner_12_attempt_Home
0,y1_d001_CGA_QEA,4,2,4,84049,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,batter,ball hit into play,False,0.8787,-1.8429,56.7573,25.7118,-44.52,-25.2048,-133.4985,-32.6625,95.8002,2.4033,49.7067,,,,72.3738,8.6856,,,-81.6675,54.2682,-4.4274,64.0506,122.0712,70.7724,131.5413,230.9931,288.4542,262.5258,-0.0645,78.2334,,,,70.4334,103.8762,,,65.9304,-0.080938,-1.274727,2.062173,,,,,
1,y1_d001_CGA_QEA,4,2,4,86099,255,16,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,"ball event with no player (e.g., ball bounce)",ball bounce,False,0.8067,-0.4992,54.9321,17.034,-47.4159,-40.9239,-126.5103,-32.5383,94.5087,20.697,29.0952,,,,71.6985,8.0838,,,-79.1304,49.8015,-4.2675,65.1348,124.4832,67.9965,125.271,215.0952,285.0882,260.6424,12.4002,99.885,,,,67.0239,103.7217,,,64.14,-100.9464,177.8661,-0.328377,,,,,
2,y1_d001_CGA_QEA,4,2,4,86799,7,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,ball acquired,False,-3.2481,-0.4107,54.2241,14.1813,-50.2191,-50.6505,-123.78,-32.4504,94.2384,33.8289,19.0095,,,,70.6788,7.6758,,,-78.9333,44.5842,-1.7121,64.6944,125.2791,66.3126,123.8514,206.6874,282.5754,259.1673,20.8035,109.614,,,,63.4512,104.1897,,,63.447,-123.9168,206.0034,-0.3963,,,,,
3,y1_d001_CGA_QEA,4,2,4,88399,7,3,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,left field,throw (ball-in-play),False,-12.588,-1.0491,52.9533,8.4936,-55.7001,-70.686,-119.7978,-30.3444,94.3872,56.6424,-0.4104,,,,69.2358,5.9955,,,-79.7931,36.9105,3.2631,63.8367,126.6075,64.5138,121.1397,196.4517,277.6065,255.4932,48.1422,125.7387,,,,57.6384,104.7717,,,63.4905,-116.8014,196.8126,5.0715,,,,,
4,y1_d001_CGA_QEA,4,2,4,89699,6,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,shortstop,ball acquired,False,-17.7561,-1.4526,54.0573,4.6386,-58.6998,-80.6802,-120.1602,-27.5835,95.9751,60.4419,-4.0221,,,,69.1008,2.5425,,,-79.6893,37.0581,6.8436,63.0855,126.7554,64.5243,121.6113,197.3796,274.5747,252.306,70.8816,126.3249,,,,55.8636,104.7033,,,63.612,-82.1835,123.7863,2.159709,,,,,
5,y1_d001_CGA_QEA,4,2,4,89699,0,5,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-2010,CGA-1458,,,,end of play,False,-17.7561,-1.4526,54.0573,4.6386,-58.6998,-80.6802,-120.1602,-27.5835,95.9751,60.4419,-4.0221,,,,69.1008,2.5425,,,-79.6893,37.0581,6.8436,63.0855,126.7554,64.5243,121.6113,197.3796,274.5747,252.306,70.8816,126.3249,,,,55.8636,104.7033,,,63.612,-82.1835,123.7863,2.159709,,,,,
6,y1_d001_CGA_QEA,17,5,17,438749,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1547,CGA-2010,CGA-1458,batter,ball hit into play,False,0.8127,0.1545,70.2456,27.7263,-51.6372,-26.406,-139.9302,-29.2776,110.8389,3.1611,52.0722,-17.4315,-51.5502,,,,,,-69.3339,54.8061,-6.2772,89.2827,123.5244,83.7081,135.1806,239.3487,308.316,261.8859,0.3978,74.8515,116.5311,50.6724,,,,,,42.759,0.001318,4.20123,2.656302,False,False,False,False,True
7,y1_d001_CGA_QEA,17,5,17,439699,255,16,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1547,CGA-2010,CGA-1458,"ball event with no player (e.g., ball bounce)",ball bounce,False,1.9611,-0.6603,70.3329,20.8701,-52.9572,-30.4836,-137.2755,-31.1706,109.4964,6.3696,45.9129,-24.4422,-48.1548,,,,,,-69.1776,53.7042,-5.9154,85.3905,125.6103,79.3296,130.9578,242.7009,306.36,261.7455,4.5879,81.8331,110.1876,48.3375,,,,,,42.8961,-27.73875,106.7703,-0.755106,False,False,False,False,True
8,y1_d001_CGA_QEA,17,5,17,439999,6,9,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1547,CGA-2010,CGA-1458,shortstop,ball deflection,False,2.2626,-1.4703,70.1523,16.4997,-54.1851,-32.9415,-134.6727,-33.5718,108.9075,8.0256,43.3485,-28.2003,-45.2673,,,,,,-68.88,53.5815,-6.2217,82.2015,126.7134,76.5495,129.6528,245.1171,305.178,261.3375,6.4131,85.1283,107.3433,45.2163,,,,,,42.9477,-33.9477,129.4128,2.939976,False,False,False,False,True
9,y1_d001_CGA_QEA,17,5,17,440949,255,16,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1547,CGA-2010,CGA-1458,"ball event with no player (e.g., ball bounce)",ball bounce,False,2.9022,0.2559,67.1091,3.3681,-60.0213,-36.6162,-121.7199,-44.1438,106.9923,21.3693,28.7232,-43.9185,-31.4778,,,,,,-66.0339,53.7039,-5.2851,69.8679,128.5554,67.6185,127.1757,251.6037,297.4026,257.7852,20.7225,100.6758,94.7724,32.8458,,,,,,43.0122,-22.37685,140.6385,-0.087786,False,False,False,False,True


In [None]:
# filter for moment that the ball is acquired
ball_acquired_df = full_plays_att[(full_plays_att['play_type'] == 'ball acquired') & (full_plays_att['player_position'].isin([7,8,9]))]
attempt_cols = [col for col in ball_acquired_df.columns if col.startswith('runner_')]
ball_acquired_df = ball_acquired_df[ball_acquired_df[attempt_cols].any(axis=1)]
ball_acquired_df

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19,ball_position_x,ball_position_y,ball_position_z,runner_10_attempt_2B,runner_10_attempt_3B,runner_11_attempt_3B,runner_11_attempt_Home,runner_12_attempt_Home
56,y1_d001_CGA_QEA,108,30,108,4098455,8,2,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,center field,ball acquired,False,10.6242,1.8981,14.3628,41.7405,-54.0183,15.2175,-110.8791,94.4808,65.4084,35.7615,-44.5356,-62.0019,5.1159,,,-14.2134,,,,-8.5950,3.9579,90.6048,194.3859,61.8522,145.7340,230.5518,338.4810,291.4254,110.5899,100.8936,65.0769,-6.6138,,,82.6446,,,,95.3448,339.4440,0.000000,False,True,True,True,True
63,y1_d001_CGA_QEA,114,31,114,4231155,9,2,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1353,CGA-1824,CGA-1929,CGA-1592,right field,ball acquired,False,10.5693,1.1388,37.5054,25.8921,-49.8090,-15.0987,-121.0008,28.4658,139.5966,61.1802,,,-62.1426,,105.7353,,,80.1735,,29.4282,-4.7436,67.8333,137.6235,75.4221,101.4882,236.3949,327.7230,299.0814,61.2882,,,61.9074,,136.4676,,,61.2753,,138.1734,295.9152,5.194530,True,False,False,False,False
116,y1_d001_CGA_QEA,216,60,216,8333280,9,2,QEA,CGA,top,QEA-0152,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1592,CGA-1547,,right field,ball acquired,False,-11.5893,0.7209,54.6249,86.6613,-62.0220,11.6007,-119.2062,-29.9289,188.1207,56.9868,,-50.5641,,,,,,,,42.6054,-2.2872,67.4337,163.7433,80.8035,132.4140,233.3106,274.1229,245.4093,46.3326,,38.5989,,,,,,,,188.0088,246.3627,0.000000,False,False,False,False,True
219,y1_d002_CGA_QEA,92,21,92,3087421,7,2,QEA,CGA,bottom,CGA-2074,CGA-1127,CGA-1592,CGA-1824,CGA-1198,CGA-2010,CGA-1353,CGA-1458,CGA-1547,QEA-0252,QEA-0235,QEA-0120,,left field,ball acquired,False,-13.9494,-1.2843,52.2426,-4.0035,-54.6465,-55.6794,-200.5506,-61.6011,113.8962,57.1884,-6.3957,-57.1140,,,,-18.6291,,69.7533,-59.1864,44.5128,-2.1459,67.1742,129.3156,74.8638,140.5182,236.1000,276.9882,207.5649,47.2554,122.2593,67.5888,,,,92.0280,,61.1097,41.8479,-201.4455,237.1629,0.000000,True,False,True,False,True
234,y1_d002_CGA_QEA,99,22,99,3366171,9,2,QEA,CGA,bottom,CGA-2074,CGA-1127,CGA-1592,CGA-1824,CGA-1198,CGA-2010,CGA-1353,CGA-1458,CGA-1547,QEA-0263,,QEA-0252,QEA-0235,right field,ball acquired,False,-1.6434,-0.3699,45.3174,55.3470,-61.3548,-21.3045,-132.2520,2.4126,150.7599,36.5151,12.8382,,-20.8230,,53.6865,-18.7200,-112.3923,68.8656,-64.2873,32.5035,-6.1887,63.3072,133.9908,65.0082,116.0889,244.0428,290.4192,178.6944,27.3090,121.6398,,18.2217,,71.4021,87.4467,93.5823,55.6809,54.0930,150.4056,177.5004,0.000000,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7150,y1_d096_XAX_QEA,16,6,16,517708,8,2,QEA,XAX,top,QEA-0410,QEA-0218,QEA-0263,QEA-0201,QEA-0369,QEA-0027,QEA-0365,QEA-0182,QEA-0249,XAX-1158,XAX-1967,XAX-1882,,center field,ball acquired,False,10.8729,0.0555,28.0512,12.3585,-67.3782,-21.3207,-135.7350,28.8465,93.2925,37.2735,12.6909,-57.8232,,,,-15.8868,,75.5175,-67.1820,40.8555,0.7176,82.4541,145.0977,79.9695,132.5874,237.6507,263.6898,280.2213,30.1611,119.0442,74.6697,,,,81.0759,,62.6472,35.4822,30.7827,262.2435,1.301292,False,False,True,False,True
7170,y1_d096_XAX_QEA,82,23,82,2738858,9,2,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0365,QEA-0182,,,right field,ball acquired,False,-14.3493,-0.3183,35.9952,11.6181,-50.5962,-0.6792,-108.2844,44.3814,42.8040,15.3114,-62.1264,,,,,-5.0397,,76.1874,-64.4556,39.0981,3.1077,65.3283,228.6246,63.7503,165.1398,135.7005,376.9410,367.4379,122.3778,66.5913,,,,,86.0295,,56.1012,47.9121,43.4127,367.8690,0.000000,False,True,True,True,False
7187,y1_d096_XAX_QEA,106,30,106,3597808,9,2,QEA,XAX,top,QEA-0410,QEA-0218,QEA-0263,QEA-0201,QEA-0369,QEA-0027,QEA-0365,QEA-0182,QEA-0249,XAX-2586,XAX-2387,,,right field,ball acquired,False,2.9391,0.0099,53.4825,27.8034,-57.6813,-11.0307,-129.2172,25.2063,86.2305,57.1692,-14.7330,,,,,5.0358,,,,52.8351,-2.4807,64.0320,155.9769,77.6814,134.3184,233.3280,299.2389,304.4487,49.3320,123.7893,,,,,89.5296,,,,86.2086,305.1780,4.551600,False,False,True,False,False
7204,y1_d096_XAX_QEA,153,43,153,5349108,7,2,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0290,,QEA-0365,QEA-0027,left field,ball acquired,False,-13.5468,-1.8369,53.1681,16.4568,-57.7380,-55.2630,-81.4059,-48.1110,117.2007,56.4918,-7.0050,,,,,2.9982,,73.4526,-63.6624,31.3920,-3.1890,65.4249,136.0935,77.2779,164.6367,306.2886,342.5718,241.0203,47.1294,126.2043,,,,,111.5700,,55.2615,50.1099,-81.1731,305.3100,0.000000,False,False,True,False,False


In [171]:
import numpy as np
# Silences pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

# Distance threshold around a base, in the same units as the field/ball coordinates.
safe_radius = 5

# Base coordinates used for the distance checks below.
third_base_x, third_base_y = -63.64, 63.64
home_x, home_y = 0, 0
second_base_x, second_base_y = 0, 127

# Columns that identify a single play; used to join tags back onto ball_acquired_df.
merge_keys = ['game_str', 'play_id']


def _tag_runner_at_base(plays, runner_id, base, fielder, base_xy, radius):
    """
    Flag whether a runner counts as safe on each ball acquisition by `fielder`.

    Parameters
    ----------
    plays : DataFrame
        Must contain `play_type`, `position`, `ball_position_x`,
        `ball_position_y`, `field_x_<runner_id>`, `field_y_<runner_id>` and
        `runner_<runner_id>_attempt_<base>`.
    runner_id : int
        Numeric suffix of the runner's coordinate columns (10, 11 or 12 here).
    base : str
        Base label used in the column names ('2B', '3B' or 'Home').
    fielder : str
        Value of `position` for the fielder whose acquisitions are examined.
    base_xy : tuple of (x, y)
        Coordinates of the base.
    radius : float
        Distance threshold around the base.

    Returns
    -------
    DataFrame
        A copy of the 'ball acquired' rows for `fielder` on which the attempt
        flag is True, with three added columns:
        `runner_<id>_dist_to_<base>`, `ball_dist_to_<base>` and the boolean
        `runner_<id>_safe_<base>`. The safe flag is True when the runner is
        closer than `radius` to the base OR the ball is farther than `radius`
        from it. Missing coordinates make the corresponding comparison False.
    """
    attempt_col = f'runner_{runner_id}_attempt_{base}'
    safe_col = f'runner_{runner_id}_safe_{base}'
    runner_dist_col = f'runner_{runner_id}_dist_to_{base}'
    ball_dist_col = f'ball_dist_to_{base}'
    base_x, base_y = base_xy

    tagged = plays[
        (plays['play_type'] == 'ball acquired') &
        (plays['position'] == fielder) &
        (plays[attempt_col] == True)
    ].copy()

    tagged[runner_dist_col] = np.sqrt(
        (tagged[f'field_x_{runner_id}'] - base_x) ** 2 +
        (tagged[f'field_y_{runner_id}'] - base_y) ** 2
    )
    tagged[ball_dist_col] = np.sqrt(
        (tagged['ball_position_x'] - base_x) ** 2 +
        (tagged['ball_position_y'] - base_y) ** 2
    )
    tagged[safe_col] = (tagged[runner_dist_col] < radius) | (tagged[ball_dist_col] > radius)
    return tagged


### -------- 1. Runner 11 to 3B -------- ###
tag_11_3B = _tag_runner_at_base(
    full_plays_att, 11, '3B', 'third baseman', (third_base_x, third_base_y), safe_radius
)

# Force safe if runner also attempted home
# (vectorized; also works when tag_11_3B is empty, unlike a row-wise apply).
tag_11_3B['runner_11_safe_3B'] = (
    tag_11_3B['runner_11_safe_3B'] | tag_11_3B['runner_11_attempt_Home'].eq(True)
)

### -------- 2. Runner 11 to Home -------- ###
tag_11_H = _tag_runner_at_base(
    full_plays_att, 11, 'Home', 'catcher', (home_x, home_y), safe_radius
)

### -------- 3. Runner 12 to Home -------- ###
tag_12_H = _tag_runner_at_base(
    full_plays_att, 12, 'Home', 'catcher', (home_x, home_y), safe_radius
)

### -------- 4. Runner 10 to 2B -------- ###
tag_10_2B = _tag_runner_at_base(
    full_plays_att, 10, '2B', 'second baseman', (second_base_x, second_base_y), safe_radius
)

### -------- 5. Runner 10 to 3B -------- ###
tag_10_3B = _tag_runner_at_base(
    full_plays_att, 10, '3B', 'third baseman', (third_base_x, third_base_y), safe_radius
)

### -------- Merge all tags into ball_acquired_df -------- ###

# Insertion order fixes the order of the safe columns in ball_acquired_df.
safe_tags = {
    'runner_10_safe_2B': tag_10_2B,
    'runner_10_safe_3B': tag_10_3B,
    'runner_11_safe_3B': tag_11_3B,
    'runner_11_safe_Home': tag_11_H,
    'runner_12_safe_Home': tag_12_H,
}

# Drop safe columns left over from a previous run of this cell. Without this,
# re-running would merge them a second time, produce `_x`/`_y` suffixed
# columns, and let the fill step below mark every attempt as safe.
ball_acquired_df = ball_acquired_df.drop(columns=list(safe_tags), errors='ignore')

for safe_col, tag in safe_tags.items():
    # A fielder can acquire the ball more than once in a play. Collapse to one
    # row per play (safe only if safe at every acquisition) so the left merge
    # cannot duplicate rows of ball_acquired_df.
    per_play = tag.groupby(merge_keys, sort=False)[safe_col].all().reset_index()
    ball_acquired_df = ball_acquired_df.merge(per_play, on=merge_keys, how='left')


# Fill missing safes with True if runner attempted but wasn't targeted
fill_safe_cols = [
    ('runner_11_attempt_3B', 'runner_11_safe_3B'),
    ('runner_11_attempt_Home', 'runner_11_safe_Home'),
    ('runner_12_attempt_Home', 'runner_12_safe_Home'),
    ('runner_10_attempt_2B', 'runner_10_safe_2B'),
    ('runner_10_attempt_3B', 'runner_10_safe_3B')
]

for attempt_col, safe_col in fill_safe_cols:
    attempted_untagged = (
        ball_acquired_df[attempt_col].eq(True) & ball_acquired_df[safe_col].isna()
    )
    # Object dtype lets the column hold True/False alongside missing values.
    ball_acquired_df[safe_col] = ball_acquired_df[safe_col].astype(object)
    ball_acquired_df.loc[attempted_untagged, safe_col] = True


ball_acquired_df


Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,non_play,field_x_1,field_x_2,field_x_3,field_x_4,field_x_5,field_x_6,field_x_7,field_x_8,field_x_9,field_x_10,field_x_11,field_x_12,field_x_13,field_x_14,field_x_15,field_x_16,field_x_17,field_x_18,field_x_19,field_y_1,field_y_2,field_y_3,field_y_4,field_y_5,field_y_6,field_y_7,field_y_8,field_y_9,field_y_10,field_y_11,field_y_12,field_y_13,field_y_14,field_y_15,field_y_16,field_y_17,field_y_18,field_y_19,ball_position_x,ball_position_y,ball_position_z,runner_10_attempt_2B,runner_10_attempt_3B,runner_11_attempt_3B,runner_11_attempt_Home,runner_12_attempt_Home,runner_10_safe_2B,runner_10_safe_3B,runner_11_safe_3B,runner_11_safe_Home,runner_12_safe_Home
0,y1_d001_CGA_QEA,108,30,108,4098455,8,2,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,center field,ball acquired,False,10.6242,1.8981,14.3628,41.7405,-54.0183,15.2175,-110.8791,94.4808,65.4084,35.7615,-44.5356,-62.0019,5.1159,,,-14.2134,,,,-8.5950,3.9579,90.6048,194.3859,61.8522,145.7340,230.5518,338.4810,291.4254,110.5899,100.8936,65.0769,-6.6138,,,82.6446,,,,95.3448,339.4440,0.000000,False,True,True,True,True,,True,True,True,True
1,y1_d001_CGA_QEA,114,31,114,4231155,9,2,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1353,CGA-1824,CGA-1929,CGA-1592,right field,ball acquired,False,10.5693,1.1388,37.5054,25.8921,-49.8090,-15.0987,-121.0008,28.4658,139.5966,61.1802,,,-62.1426,,105.7353,,,80.1735,,29.4282,-4.7436,67.8333,137.6235,75.4221,101.4882,236.3949,327.7230,299.0814,61.2882,,,61.9074,,136.4676,,,61.2753,,138.1734,295.9152,5.194530,True,False,False,False,False,True,,,,
2,y1_d001_CGA_QEA,216,60,216,8333280,9,2,QEA,CGA,top,QEA-0152,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1592,CGA-1547,,right field,ball acquired,False,-11.5893,0.7209,54.6249,86.6613,-62.0220,11.6007,-119.2062,-29.9289,188.1207,56.9868,,-50.5641,,,,,,,,42.6054,-2.2872,67.4337,163.7433,80.8035,132.4140,233.3106,274.1229,245.4093,46.3326,,38.5989,,,,,,,,188.0088,246.3627,0.000000,False,False,False,False,True,,,,,True
3,y1_d002_CGA_QEA,92,21,92,3087421,7,2,QEA,CGA,bottom,CGA-2074,CGA-1127,CGA-1592,CGA-1824,CGA-1198,CGA-2010,CGA-1353,CGA-1458,CGA-1547,QEA-0252,QEA-0235,QEA-0120,,left field,ball acquired,False,-13.9494,-1.2843,52.2426,-4.0035,-54.6465,-55.6794,-200.5506,-61.6011,113.8962,57.1884,-6.3957,-57.1140,,,,-18.6291,,69.7533,-59.1864,44.5128,-2.1459,67.1742,129.3156,74.8638,140.5182,236.1000,276.9882,207.5649,47.2554,122.2593,67.5888,,,,92.0280,,61.1097,41.8479,-201.4455,237.1629,0.000000,True,False,True,False,True,True,,True,,True
4,y1_d002_CGA_QEA,99,22,99,3366171,9,2,QEA,CGA,bottom,CGA-2074,CGA-1127,CGA-1592,CGA-1824,CGA-1198,CGA-2010,CGA-1353,CGA-1458,CGA-1547,QEA-0263,,QEA-0252,QEA-0235,right field,ball acquired,False,-1.6434,-0.3699,45.3174,55.3470,-61.3548,-21.3045,-132.2520,2.4126,150.7599,36.5151,12.8382,,-20.8230,,53.6865,-18.7200,-112.3923,68.8656,-64.2873,32.5035,-6.1887,63.3072,133.9908,65.0082,116.0889,244.0428,290.4192,178.6944,27.3090,121.6398,,18.2217,,71.4021,87.4467,93.5823,55.6809,54.0930,150.4056,177.5004,0.000000,True,False,True,False,False,True,,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,y1_d096_XAX_QEA,16,6,16,517708,8,2,QEA,XAX,top,QEA-0410,QEA-0218,QEA-0263,QEA-0201,QEA-0369,QEA-0027,QEA-0365,QEA-0182,QEA-0249,XAX-1158,XAX-1967,XAX-1882,,center field,ball acquired,False,10.8729,0.0555,28.0512,12.3585,-67.3782,-21.3207,-135.7350,28.8465,93.2925,37.2735,12.6909,-57.8232,,,,-15.8868,,75.5175,-67.1820,40.8555,0.7176,82.4541,145.0977,79.9695,132.5874,237.6507,263.6898,280.2213,30.1611,119.0442,74.6697,,,,81.0759,,62.6472,35.4822,30.7827,262.2435,1.301292,False,False,True,False,True,,,True,,True
156,y1_d096_XAX_QEA,82,23,82,2738858,9,2,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0365,QEA-0182,,,right field,ball acquired,False,-14.3493,-0.3183,35.9952,11.6181,-50.5962,-0.6792,-108.2844,44.3814,42.8040,15.3114,-62.1264,,,,,-5.0397,,76.1874,-64.4556,39.0981,3.1077,65.3283,228.6246,63.7503,165.1398,135.7005,376.9410,367.4379,122.3778,66.5913,,,,,86.0295,,56.1012,47.9121,43.4127,367.8690,0.000000,False,True,True,True,False,,True,True,True,
157,y1_d096_XAX_QEA,106,30,106,3597808,9,2,QEA,XAX,top,QEA-0410,QEA-0218,QEA-0263,QEA-0201,QEA-0369,QEA-0027,QEA-0365,QEA-0182,QEA-0249,XAX-2586,XAX-2387,,,right field,ball acquired,False,2.9391,0.0099,53.4825,27.8034,-57.6813,-11.0307,-129.2172,25.2063,86.2305,57.1692,-14.7330,,,,,5.0358,,,,52.8351,-2.4807,64.0320,155.9769,77.6814,134.3184,233.3280,299.2389,304.4487,49.3320,123.7893,,,,,89.5296,,,,86.2086,305.1780,4.551600,False,False,True,False,False,,,True,,
158,y1_d096_XAX_QEA,153,43,153,5349108,7,2,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0290,,QEA-0365,QEA-0027,left field,ball acquired,False,-13.5468,-1.8369,53.1681,16.4568,-57.7380,-55.2630,-81.4059,-48.1110,117.2007,56.4918,-7.0050,,,,,2.9982,,73.4526,-63.6624,31.3920,-3.1890,65.4249,136.0935,77.2779,164.6367,306.2886,342.5718,241.0203,47.1294,126.2043,,,,,111.5700,,55.2615,50.1099,-81.1731,305.3100,0.000000,False,False,True,False,False,,,True,,


In [175]:
# Reshape ball_acquired_df from one row per play into one row per
# (play, runner attempt), carrying the attempt's safe flag along.

# Play-level columns copied unchanged onto every long-format row.
id_vars = [
    'game_str', 'play_id', 'timestamp',
    'player_position', 'position', 'play_type',
    'home_team', 'away_team', 'top_bottom_inning',
    'pitcher', 'catcher',
    'first_base', 'second_base', 'third_base', 'shortstop',
    'left_field', 'center_field', 'right_field',
    'batter', 'first_baserunner', 'second_baserunner', 'third_baserunner',
    'field_x_10', 'field_y_10',
    'field_x_11', 'field_y_11',
    'field_x_12', 'field_y_12',
    'ball_position_x', 'ball_position_y', 'ball_position_z',
]

# One entry per tracked attempt:
# (attempt column, safe column, runner id, from_base label, to_base label).
# NOTE(review): the notebook's position key maps code 10 to the batter, yet
# runner 10 is labelled from_base '1B' here — confirm this is intended.
runner_cols = [
    ('runner_10_attempt_2B', 'runner_10_safe_2B', 10, '1B', '2B'),
    ('runner_10_attempt_3B', 'runner_10_safe_3B', 10, '1B', '3B'),
    ('runner_11_attempt_3B', 'runner_11_safe_3B', 11, '1B', '3B'),
    ('runner_11_attempt_Home', 'runner_11_safe_Home', 11, '1B', 'Home'),
    ('runner_12_attempt_Home', 'runner_12_safe_Home', 12, '2B', 'Home'),
]

# Emit a record for every (play, attempt) pair whose attempt flag is exactly
# True; plays are visited in frame order, attempts in runner_cols order.
long_rows = [
    {
        **{col: play[col] for col in id_vars},
        'runner_id': runner_id,
        'from_base': from_base,
        'to_base': to_base,
        'attempted': play[attempt_col],
        'was_safe': play.get(safe_col, None),
        'runner_x': play.get(f'field_x_{runner_id}', None),
        'runner_y': play.get(f'field_y_{runner_id}', None),
    }
    for play in ball_acquired_df.to_dict('records')
    for attempt_col, safe_col, runner_id, from_base, to_base in runner_cols
    if play.get(attempt_col) is True
]

ball_acquired_long = pd.DataFrame(long_rows)
ball_acquired_long

Unnamed: 0,game_str,play_id,timestamp,player_position,position,play_type,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,field_x_10,field_y_10,field_x_11,field_y_11,field_x_12,field_y_12,ball_position_x,ball_position_y,ball_position_z,runner_id,from_base,to_base,attempted,was_safe,runner_x,runner_y
0,y1_d001_CGA_QEA,108,4098455,8,center field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,35.7615,110.5899,-44.5356,100.8936,-62.0019,65.0769,95.3448,339.4440,0.00000,10,1B,3B,True,True,35.7615,110.5899
1,y1_d001_CGA_QEA,108,4098455,8,center field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,35.7615,110.5899,-44.5356,100.8936,-62.0019,65.0769,95.3448,339.4440,0.00000,11,1B,3B,True,True,-44.5356,100.8936
2,y1_d001_CGA_QEA,108,4098455,8,center field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,35.7615,110.5899,-44.5356,100.8936,-62.0019,65.0769,95.3448,339.4440,0.00000,11,1B,Home,True,True,-44.5356,100.8936
3,y1_d001_CGA_QEA,108,4098455,8,center field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,35.7615,110.5899,-44.5356,100.8936,-62.0019,65.0769,95.3448,339.4440,0.00000,12,2B,Home,True,True,-62.0019,65.0769
4,y1_d001_CGA_QEA,114,4231155,9,right field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1353,CGA-1824,CGA-1929,CGA-1592,61.1802,61.2882,,,,,138.1734,295.9152,5.19453,10,1B,2B,True,True,61.1802,61.2882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,y1_d096_XAX_QEA,82,2738858,9,right field,ball acquired,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0365,QEA-0182,,,15.3114,122.3778,-62.1264,66.5913,,,43.4127,367.8690,0.00000,11,1B,3B,True,True,-62.1264,66.5913
254,y1_d096_XAX_QEA,82,2738858,9,right field,ball acquired,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0365,QEA-0182,,,15.3114,122.3778,-62.1264,66.5913,,,43.4127,367.8690,0.00000,11,1B,Home,True,True,-62.1264,66.5913
255,y1_d096_XAX_QEA,106,3597808,9,right field,ball acquired,QEA,XAX,top,QEA-0410,QEA-0218,QEA-0263,QEA-0201,QEA-0369,QEA-0027,QEA-0365,QEA-0182,QEA-0249,XAX-2586,XAX-2387,,,57.1692,49.3320,-14.7330,123.7893,,,86.2086,305.1780,4.55160,11,1B,3B,True,True,-14.7330,123.7893
256,y1_d096_XAX_QEA,153,5349108,7,left field,ball acquired,QEA,XAX,bottom,XAX-1396,XAX-1967,XAX-1882,XAX-1217,XAX-1799,XAX-2586,XAX-2378,XAX-2387,XAX-1460,QEA-0290,,QEA-0365,QEA-0027,56.4918,47.1294,-7.0050,126.2043,,,-81.1731,305.3100,0.00000,11,1B,3B,True,True,-7.0050,126.2043


In [None]:
# Tally how many runner attempts were flagged safe (True) versus not (False).
outcome_counts = ball_acquired_long['was_safe'].value_counts()
outcome_counts

was_safe
True     254
False      4
Name: count, dtype: int64