In [34]:
import pandas as pd
import pyarrow.dataset as pads
import os
import numpy as np
pd.set_option('display.max_columns', None)

data_path = "/Users/andrewzaletski/Desktop/SMT/SMT-Data-Challenge-2025-Updated"

In [35]:
def readDataSubset(table_type, data_path):
    """
    Loads a specified SMT data subset as a PyArrow dataset.
    """
    valid_tables = ['ball_pos', 'game_events', 'game_info', 'player_pos', 'rosters']
    if table_type not in valid_tables:
        print("Invalid data subset name. Please try again with a valid data subset.")
        return None

    if table_type == 'rosters':
        return pads.dataset(source=os.path.join(data_path, 'rosters.csv'), format='csv')
    else:
        
        return pads.dataset(
            source=os.path.join(data_path, table_type),
            format='csv'
        )

In [36]:
game_info_ds = readDataSubset('game_info', data_path)
game_events_ds = readDataSubset('game_events', data_path)
ball_pos_ds = readDataSubset('ball_pos', data_path)
player_pos_ds = readDataSubset('player_pos', data_path)
rosters_ds = readDataSubset('rosters', data_path)

In [37]:
import pyarrow as pa

filter_criteria = (
    (pads.field("home_team") == "QEA")
)

game_info_df = game_info_ds.to_table().to_pandas()
game_events_df = game_events_ds.to_table().to_pandas()

In [38]:
# some keys
position_key = pd.DataFrame({
    "code": [*range(1, 14), 255, 14, 15, 16, 17, 18, 19],
    "position": [
        "pitcher", "catcher", "first baseman", "second baseman", "third baseman",
        "shortstop", "left field", "center field", "right field", "batter",
        "runner on first base", "runner on second base", "runner on third base",
        "ball event with no player (e.g., ball bounce)", "home plate umpire",
        "field umpire", "field umpire", "field umpire",
        "first base coach", "third base coach"
    ]
})

event_key = pd.DataFrame({
    "code": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16],
    "play_type": [
        "pitch", "ball acquired", "throw (ball-in-play)", "ball hit into play", 
        "end of play", "pickoff throw", "ball acquired - unknown field position", 
        "throw (ball-in-play) - unknown field position", "ball deflection", 
        "ball deflection off of wall", "home run", "ball bounce"
    ]
})

In [39]:
big_ie = pd.merge(
    game_events_df, 
    game_info_df, 
    on=['game_str', 'play_per_game'], 
    suffixes=('', '_dup')
)

big_ie = big_ie.loc[:, ~big_ie.columns.str.endswith('_dup')]

big_ie['player_position'] = pd.to_numeric(big_ie['player_position'], errors='coerce').astype('Int64')
big_ie['event_code'] = pd.to_numeric(big_ie['event_code'], errors='coerce').astype('Int64')

big_ie = big_ie.merge(position_key, how='left', left_on='player_position', right_on='code').drop('code', axis=1)

big_ie = big_ie.merge(event_key, how='left', left_on='event_code', right_on='code').drop('code', axis=1)

big_ie.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired


In [40]:
big_ie = big_ie[~big_ie['batter'].isna()]

In [41]:
at_bat = list()
for row in range(len(big_ie)):
    if row == 0:
        at_bat = at_bat + [1]
    elif big_ie.loc[row, 'game_str'] != big_ie.loc[row-1, 'game_str']:
        at_bat = at_bat + [1]
    elif big_ie.loc[row, 'batter'] == big_ie.loc[row-1, 'batter']:
        at_bat = at_bat + [at_bat[row-1]]
    else:
        at_bat = at_bat + [at_bat[row-1]] + [1]

In [44]:
big_ie['at_bat'] = at_bat

ValueError: Length of values (295324) does not match length of index (273777)

calculate outs and assign them to a new column

In [45]:
def mark_outs_by_event_pattern(df):
    """
    Add an 'is_out' column: True if event_code 4 is immediately followed by event_code 2 (within same game_str, top_bottom_inning).
    """
    df = df.copy()
    df.sort_values(['game_str', 'top_bottom_inning', 'at_bat', 'play_id'], inplace=True)
    df['next_event_code'] = df.groupby(['game_str', 'top_bottom_inning'])['event_code'].shift(-1)
    df['is_out'] = (df['event_code'] == 4) & (df['next_event_code'] == 2)
    df['is_out'] = df['is_out'].fillna(False)  # Ensures no NA remains after execution
    return df.drop(columns='next_event_code')

def calculate_outs(df):
    """
    Add a cumulative 'outs' column that resets at each new half-inning, counting only when is_out is True.
    """
    df = df.copy()
    df['outs'] = 0
    group_cols = ['game_str', 'top_bottom_inning']

    def count_outs(subdf):
        outs = 0
        outs_list = []
        for is_out in subdf['is_out']:
            # Make sure is_out is always a bool
            if bool(is_out):  # Will be safe because of fillna above
                outs += 1
            outs_list.append(outs)
            if outs >= 3:
                outs = 0
        subdf['outs'] = outs_list
        return subdf

    return df.groupby(group_cols, group_keys=False).apply(count_outs)

small_ie = mark_outs_by_event_pattern(big_ie)
small_ie = calculate_outs(small_ie)
small_ie

  return df.groupby(group_cols, group_keys=False).apply(count_outs)


Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,is_out,outs
8061,y1_d001_CGA_QEA,66,19,66,2418699,1,1,QEA,CGA,bottom,CGA-2321,CGA-2223,CGA-1592,CGA-1458,CGA-1824,CGA-2010,CGA-1198,CGA-1547,CGA-1929,QEA-0365,,,,pitcher,pitch,False,0
8062,y1_d001_CGA_QEA,66,19,66,2419199,2,2,QEA,CGA,bottom,CGA-2321,CGA-2223,CGA-1592,CGA-1458,CGA-1824,CGA-2010,CGA-1198,CGA-1547,CGA-1929,QEA-0365,,,,catcher,ball acquired,False,0
8063,y1_d001_CGA_QEA,66,19,66,2419199,0,5,QEA,CGA,bottom,CGA-2321,CGA-2223,CGA-1592,CGA-1458,CGA-1824,CGA-2010,CGA-1198,CGA-1547,CGA-1929,QEA-0365,,,,,end of play,False,0
8064,y1_d001_CGA_QEA,67,19,67,2439249,1,1,QEA,CGA,bottom,CGA-2321,CGA-2223,CGA-1592,CGA-1458,CGA-1824,CGA-2010,CGA-1198,CGA-1547,CGA-1929,QEA-0365,,,,pitcher,pitch,False,0
8065,y1_d001_CGA_QEA,67,19,67,2439699,2,2,QEA,CGA,bottom,CGA-2321,CGA-2223,CGA-1592,CGA-1458,CGA-1824,CGA-2010,CGA-1198,CGA-1547,CGA-1929,QEA-0365,,,,catcher,ball acquired,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161260,y2_d099_YJD_RZQ,291,74,291,1537052091220,2,2,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,catcher,ball acquired,False,1
161261,y2_d099_YJD_RZQ,291,74,291,1537052091220,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,False,1
161262,y2_d099_YJD_RZQ,292,74,292,47698,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,False,1
161263,y2_d099_YJD_RZQ,292,74,292,1537052107870,1,1,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,pitcher,pitch,False,1
