In [4]:
import pandas as pd
import pyarrow.dataset as pads
import os
import numpy as np
pd.set_option('display.max_columns', None)

data_path = "/Users/alexfrederick/Desktop/SMT-Data-Challenge-2025/"

In [5]:
def readDataSubset(table_type, data_path):
    """
    Loads a specified SMT data subset as a PyArrow dataset.
    """
    valid_tables = ['ball_pos', 'game_events', 'game_info', 'player_pos', 'rosters']
    if table_type not in valid_tables:
        print("Invalid data subset name. Please try again with a valid data subset.")
        return None

    if table_type == 'rosters':
        return pads.dataset(source=os.path.join(data_path, 'rosters.csv'), format='csv')
    else:
        
        return pads.dataset(
            source=os.path.join(data_path, table_type),
            format='csv'
        )

In [6]:
game_info_ds = readDataSubset('game_info', data_path)
game_events_ds = readDataSubset('game_events', data_path)
ball_pos_ds = readDataSubset('ball_pos', data_path)
player_pos_ds = readDataSubset('player_pos', data_path)
rosters_ds = readDataSubset('rosters', data_path)

ValueError: No objects to concatenate

In [9]:
import pyarrow as pa

filter_criteria = (
    (pads.field("home_team") == "QEA")
)

import pandas as pd
import glob

# Define where your game_info CSVs are stored
game_info_path = "/Users/alexfrederick/Desktop/SMT-Data-Challenge-2025/game_info"
csv_files = glob.glob(f"{game_info_path}/**/*.csv", recursive=True)

# Specify null indicators
na_values = ["", "NA", "NULL", "\\N"]

# Load all CSVs safely using pandas
game_info_df = pd.concat(
    [pd.read_csv(f, na_values=na_values) for f in csv_files],
    ignore_index=True
)

game_events_df = game_events_ds.to_table().to_pandas()

In [10]:
game_info_df

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,y1_d081_FBP_QEA,QEA,FBP,1.0,1.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
1,y1_d081_FBP_QEA,QEA,FBP,1.0,2.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
2,y1_d081_FBP_QEA,QEA,FBP,1.0,3.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
3,y1_d081_FBP_QEA,QEA,FBP,1.0,4.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
4,y1_d081_FBP_QEA,QEA,FBP,2.0,5.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1154,,FBP-1349,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72561,y1_d074_PHS_RZQ,RZQ,PHS,,257.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1928,,,
72562,y1_d074_PHS_RZQ,RZQ,PHS,,258.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,
72563,y1_d074_PHS_RZQ,RZQ,PHS,,259.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,
72564,y1_d074_PHS_RZQ,RZQ,PHS,,260.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,


In [11]:
# some keys
position_key = pd.DataFrame({
    "code": [*range(1, 14), 255, 14, 15, 16, 17, 18, 19],
    "position": [
        "pitcher", "catcher", "first baseman", "second baseman", "third baseman",
        "shortstop", "left field", "center field", "right field", "batter",
        "runner on first base", "runner on second base", "runner on third base",
        "ball event with no player (e.g., ball bounce)", "home plate umpire",
        "field umpire", "field umpire", "field umpire",
        "first base coach", "third base coach"
    ]
})

event_key = pd.DataFrame({
    "code": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16],
    "play_type": [
        "pitch", "ball acquired", "throw (ball-in-play)", "ball hit into play", 
        "end of play", "pickoff throw", "ball acquired - unknown field position", 
        "throw (ball-in-play) - unknown field position", "ball deflection", 
        "ball deflection off of wall", "home run", "ball bounce"
    ]
})

In [14]:
big_ie = pd.merge(
    game_events_df, 
    game_info_df, 
    on=['game_str', 'play_per_game'], 
    suffixes=('', '_dup')
)

big_ie = big_ie.loc[:, ~big_ie.columns.str.endswith('_dup')]

big_ie['player_position'] = pd.to_numeric(big_ie['player_position'], errors='coerce').astype('Int64')
big_ie['event_code'] = pd.to_numeric(big_ie['event_code'], errors='coerce').astype('Int64')

big_ie = big_ie.merge(position_key, how='left', left_on='player_position', right_on='code').drop('code', axis=1)

big_ie = big_ie.merge(event_key, how='left', left_on='event_code', right_on='code').drop('code', axis=1)

big_ie.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired


In [13]:
small_ie = big_ie.get(['game_str', 'play_id', 'at_bat', 'play_per_game', 'event_code', 'top_bottom_inning', 'first_baserunner', 'second_baserunner', 'third_baserunner', 'play_type'])
small_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,event_code,top_bottom_inning,first_baserunner,second_baserunner,third_baserunner,play_type
0,y1_d069_ACN_QEA,1,1,1,1,top,,,,pitch
1,y1_d069_ACN_QEA,1,1,1,2,top,,,,ball acquired
2,y1_d069_ACN_QEA,1,1,1,5,top,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,1,top,,,,pitch
4,y1_d069_ACN_QEA,2,1,2,2,top,,,,ball acquired
...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273773,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273774,y1_d058_WZR_YJD,330,93,330,2,top,WZR-1285,,,ball acquired
273775,y1_d058_WZR_YJD,330,93,330,3,top,WZR-1285,,,throw (ball-in-play)


## Fix at bat col

In [16]:
at_bat = list()
game_info_df_sub = big_ie[big_ie['at_bat'].notna()]
for row in range(len(game_info_df_sub)):
    if row==0:
        at_bat = at_bat + [1]
    elif game_info_df_sub.loc[row, 'game_str'] != game_info_df_sub.loc[row-1, 'game_str']:
        at_bat = at_bat + [1]
    elif game_info_df_sub.loc[row, 'batter'] == game_info_df_sub.loc[row-1, 'batter']:
        at_bat = at_bat + [at_bat[row-1]]
    else:
        at_bat = at_bat + [at_bat[row-1] + 1]

In [17]:
game_info_df_sub['at_bat'] = at_bat
final_ie = game_info_df_sub
final_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,82,330,12027662,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce
273773,y1_d058_WZR_YJD,330,82,330,12027959,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce
273774,y1_d058_WZR_YJD,330,82,330,12029972,8,2,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,ball acquired
273775,y1_d058_WZR_YJD,330,82,330,12031028,8,3,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,throw (ball-in-play)


---
calculate outs and assign them to a new column

logic:

1. current flyout logic
    why is that logic right? ball acquired means ball was caught in the air, not possibly on the ground?
2. solo homer has no impact
    if event code is 11 then in next AB, nobody should be on base and we don't count out
3. ball hit into play but runner not on base after
    if in [AB number], play id is 4 and in next AB number first pitch (play id 2) the previous hitter's code is not on any of the three bases, then it was an out
    (maybe have to think about clear fielder's choice plays)
4. at least 3 pitches in the AB but runner doesn't get on (assume strikeout)


In [22]:
def mark_outs_by_event_pattern(df):
    """
    Add an 'is_out' column: True if event_code 4 is immediately followed by event_code 2 (within same game_str, top_bottom_inning).
    """
    df = df.copy()
    df.sort_values(['game_str', 'top_bottom_inning', 'at_bat', 'play_id'], inplace=True)
    df['next_event_code'] = df.groupby(['game_str', 'top_bottom_inning'])['event_code'].shift(-1)
    df['is_out'] = (df['event_code'] == 4) & (df['next_event_code'] == 2)
    df['is_out'] = df['is_out'].fillna(False)  # Ensures no NA remains after execution
    return df.drop(columns='next_event_code')

def calculate_outs(df):
    """
    Add a cumulative 'outs' column that resets at each new half-inning, counting only when is_out is True.
    """
    df = df.copy()
    df['outs'] = 0
    group_cols = ['game_str', 'top_bottom_inning']

    def count_outs(subdf):
        outs = 0
        outs_list = []
        for is_out in subdf['is_out']:
            # Make sure is_out is always a bool
            if bool(is_out):  # Will be safe because of fillna above
                outs += 1
            outs_list.append(outs)
            if outs >= 3:
                outs = 0
        subdf['outs'] = outs_list
        return subdf

    return df.groupby(group_cols, group_keys=False).apply(count_outs)

marked = mark_outs_by_event_pattern(final_ie)
with_outs = calculate_outs(marked).sort_values(['game_str', 'play_id'])
with_outs

  return df.groupby(group_cols, group_keys=False).apply(count_outs)


Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,is_out,outs
7894,y1_d001_CGA_QEA,1,1,1,8699,1,1,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,pitcher,pitch,False,0
7895,y1_d001_CGA_QEA,1,1,1,9199,2,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,catcher,ball acquired,False,0
7896,y1_d001_CGA_QEA,1,1,1,9199,0,5,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,,end of play,False,0
7897,y1_d001_CGA_QEA,2,1,2,24149,1,1,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,pitcher,pitch,False,0
7898,y1_d001_CGA_QEA,2,1,2,24599,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,batter,ball hit into play,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161260,y2_d099_YJD_RZQ,291,78,291,1537052091220,2,2,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,catcher,ball acquired,False,1
161261,y2_d099_YJD_RZQ,291,78,291,1537052091220,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,False,1
161262,y2_d099_YJD_RZQ,292,78,292,47698,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,False,1
161263,y2_d099_YJD_RZQ,292,78,292,1537052107870,1,1,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,pitcher,pitch,False,1


In [19]:
pd.set_option('display.max_rows', 50)

In [23]:
with_outs.groupby(['game_str', 'top_bottom_inning', 'outs']).count()[['play_id']].rename(columns={'play_id': 'count'}).reset_index().sort_values(['game_str', 'top_bottom_inning', 'outs']).head(50)

Unnamed: 0,game_str,top_bottom_inning,outs,count
0,y1_d001_CGA_QEA,bottom,0,156
1,y1_d001_CGA_QEA,bottom,1,97
2,y1_d001_CGA_QEA,bottom,2,68
3,y1_d001_CGA_QEA,bottom,3,1
4,y1_d001_CGA_QEA,top,0,340
5,y1_d001_CGA_QEA,top,1,100
6,y1_d001_CGA_QEA,top,2,245
7,y1_d001_CGA_QEA,top,3,2
8,y1_d002_CGA_QEA,bottom,0,122
9,y1_d002_CGA_QEA,bottom,1,205


In [24]:
with_outs.to_csv('outs.csv', index=False)

In [25]:
with_outs.groupby(['outs']).count().reset_index()

Unnamed: 0,outs,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,is_out
0,0,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102780,102709,32077,19980,5087,75469,102780,102780
1,1,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91779,91756,30077,18567,4845,67880,91779,91779
2,2,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,78042,77998,25976,15494,4232,57568,78042,78042
3,3,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,345,219,52,1176,1176,1176
