In [32]:
import pandas as pd
import pyarrow.dataset as pads
import os
import numpy as np
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Root folder that holds the SMT Data Challenge files.
# Set the SMT_DATA_PATH environment variable to use a different location;
# when it is not set, the original local path is used unchanged.
data_path = os.environ.get(
    "SMT_DATA_PATH",
    "/Users/pranavrajaram/SMT-Data-Challenge-2025/",
)

In [33]:
def readDataSubset(table_type, data_path):
    """
    Open one SMT data subset as a PyArrow CSV dataset.

    table_type must be one of 'ball_pos', 'game_events', 'game_info',
    'player_pos' or 'rosters'. For any other value a message is printed and
    None is returned.

    'rosters' is read from the single file rosters.csv under data_path; every
    other subset is read from the entry named after the subset under data_path.
    """
    if table_type not in ('ball_pos', 'game_events', 'game_info', 'player_pos', 'rosters'):
        print("Invalid data subset name. Please try again with a valid data subset.")
        return None

    # Only the last path component differs between rosters and the other subsets.
    relative_name = 'rosters.csv' if table_type == 'rosters' else table_type
    return pads.dataset(source=os.path.join(data_path, relative_name), format='csv')

In [34]:
# Open each SMT subset through readDataSubset, all from the same data_path.
# Each name holds whatever readDataSubset returns: a PyArrow dataset, or None
# if the subset name were not recognised.
game_info_ds = readDataSubset('game_info', data_path)
game_events_ds = readDataSubset('game_events', data_path)
ball_pos_ds = readDataSubset('ball_pos', data_path)
player_pos_ds = readDataSubset('player_pos', data_path)
rosters_ds = readDataSubset('rosters', data_path)

In [35]:
import pyarrow as pa

# Dataset expression that matches rows whose home_team is "QEA".
# NOTE(review): filter_criteria is not passed to any call in the cells shown
# here, so the tables below are loaded unfiltered - confirm this is intended.
filter_criteria = (
    (pads.field("home_team") == "QEA")
)

# Materialise the complete game_info and game_events datasets as pandas DataFrames.
game_info_df = game_info_ds.to_table().to_pandas()
game_events_df = game_events_ds.to_table().to_pandas()

In [36]:
# Display the loaded game_info table as the cell output.
game_info_df

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,y1_d069_ACN_QEA,QEA,ACN,1.0,1.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,
1,y1_d069_ACN_QEA,QEA,ACN,1.0,2.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,
2,y1_d069_ACN_QEA,QEA,ACN,1.0,3.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,
3,y1_d069_ACN_QEA,QEA,ACN,2.0,4.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1416,,,
4,y1_d069_ACN_QEA,QEA,ACN,69.0,134.0,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-1147,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72561,y1_d058_WZR_YJD,YJD,WZR,91.0,325.0,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1285,,,
72562,y1_d058_WZR_YJD,YJD,WZR,93.0,326.0,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1285,,,
72563,y1_d058_WZR_YJD,YJD,WZR,93.0,327.0,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1285,,,
72564,y1_d058_WZR_YJD,YJD,WZR,93.0,329.0,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,


In [37]:
# Lookup tables that translate the numeric codes in game_events into labels.

# player_position code -> description (row order kept as originally listed).
position_key = pd.DataFrame(
    [
        (1, "pitcher"),
        (2, "catcher"),
        (3, "first baseman"),
        (4, "second baseman"),
        (5, "third baseman"),
        (6, "shortstop"),
        (7, "left field"),
        (8, "center field"),
        (9, "right field"),
        (10, "batter"),
        (11, "runner on first base"),
        (12, "runner on second base"),
        (13, "runner on third base"),
        (255, "ball event with no player (e.g., ball bounce)"),
        (14, "home plate umpire"),
        (15, "field umpire"),
        (16, "field umpire"),
        (17, "field umpire"),
        (18, "first base coach"),
        (19, "third base coach"),
    ],
    columns=["code", "position"],
)

# event_code -> description of the play event.
event_key = pd.DataFrame(
    [
        (1, "pitch"),
        (2, "ball acquired"),
        (3, "throw (ball-in-play)"),
        (4, "ball hit into play"),
        (5, "end of play"),
        (6, "pickoff throw"),
        (7, "ball acquired - unknown field position"),
        (8, "throw (ball-in-play) - unknown field position"),
        (9, "ball deflection"),
        (10, "ball deflection off of wall"),
        (11, "home run"),
        (16, "ball bounce"),
    ],
    columns=["code", "play_type"],
)

In [38]:
# Build one wide table: every game event joined to the game_info row for the
# same (game_str, play_per_game), then labelled with readable position and
# play-type names.
big_ie = (
    game_events_df
    # Columns present in both frames keep the game_events version; the
    # game_info copies get a '_dup' suffix and are dropped in the next step.
    .merge(game_info_df, on=['game_str', 'play_per_game'], suffixes=('', '_dup'))
    .loc[:, lambda frame: ~frame.columns.str.endswith('_dup')]
    # Coerce the code columns to nullable integers so they can be matched
    # against the integer codes of the lookup tables.
    .assign(
        player_position=lambda frame: pd.to_numeric(frame['player_position'], errors='coerce').astype('Int64'),
        event_code=lambda frame: pd.to_numeric(frame['event_code'], errors='coerce').astype('Int64'),
    )
    .merge(position_key, how='left', left_on='player_position', right_on='code')
    .drop('code', axis=1)
    .merge(event_key, how='left', left_on='event_code', right_on='code')
    .drop('code', axis=1)
)

big_ie.head()

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired


In [39]:
# Narrow the merged table to the identifying columns, the event code/label and
# the base-runner columns.
# Bracket selection is used instead of DataFrame.get: .get returns None
# silently when any requested column is missing, whereas [[...]] raises a
# KeyError that names the missing column.
small_ie = big_ie[[
    'game_str',
    'play_id',
    'at_bat',
    'play_per_game',
    'event_code',
    'top_bottom_inning',
    'first_baserunner',
    'second_baserunner',
    'third_baserunner',
    'play_type',
]]
small_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,event_code,top_bottom_inning,first_baserunner,second_baserunner,third_baserunner,play_type
0,y1_d069_ACN_QEA,1,1,1,1,top,,,,pitch
1,y1_d069_ACN_QEA,1,1,1,2,top,,,,ball acquired
2,y1_d069_ACN_QEA,1,1,1,5,top,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,1,top,,,,pitch
4,y1_d069_ACN_QEA,2,1,2,2,top,,,,ball acquired
...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273773,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273774,y1_d058_WZR_YJD,330,93,330,2,top,WZR-1285,,,ball acquired
273775,y1_d058_WZR_YJD,330,93,330,3,top,WZR-1285,,,throw (ball-in-play)


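## Fix the `at_bat` column

Renumber at-bats within each game: the counter starts at 1 and increases by one whenever the batter changes.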

In [None]:
# Keep only the events that carry an at_bat value. The index is rebuilt as
# 0..n-1 so the filtered frame is an independent copy whose labels are
# contiguous; the previous loop looked rows up with .loc[row] using positions,
# which raises KeyError (or reads the wrong row) as soon as the filter removes
# any row.
game_info_df_sub = big_ie[big_ie['at_bat'].notna()].reset_index(drop=True)

# Renumber at-bats row by row:
#   * the first row, and the first row of every new game_str, is at-bat 1;
#   * a row with the same batter as the previous row keeps the previous number;
#   * any other row starts the next at-bat.
# Values are appended to one list (linear time) instead of rebuilding the
# list with `+` on every row.
at_bat = []
prev_game = prev_batter = None
for game, batter in zip(game_info_df_sub['game_str'], game_info_df_sub['batter']):
    if not at_bat or game != prev_game:
        at_bat.append(1)
    elif batter == prev_batter:
        at_bat.append(at_bat[-1])
    else:
        at_bat.append(at_bat[-1] + 1)
    prev_game, prev_batter = game, batter

In [None]:
# Replace the at_bat column with the renumbered values.
# assign() returns a new frame (the column keeps its position) instead of
# writing into a frame that was produced by filtering big_ie, which can raise
# pandas' SettingWithCopyWarning. The name is rebound so game_info_df_sub and
# final_ie both refer to the updated frame, as before.
game_info_df_sub = game_info_df_sub.assign(at_bat=at_bat)
final_ie = game_info_df_sub
final_ie

---
Calculate outs and store the running count in a new `outs` column.

In [None]:
def mark_outs_by_event_pattern(df):
    """
    Return a sorted copy of df with a boolean 'is_out' column.

    Rows are ordered by game_str, top_bottom_inning, at_bat and play_id. A row
    is flagged when its event_code is 4 and the next row in the same
    (game_str, top_bottom_inning) group has event_code 2. Rows with no
    following row in their group, or with a missing code, are flagged False.
    The input frame is not modified.
    """
    ordered = df.sort_values(['game_str', 'top_bottom_inning', 'at_bat', 'play_id'])

    # Code of the following event in the same half-inning group (missing on the
    # last row of each group). Held in a scratch column that is removed below.
    ordered['next_event_code'] = (
        ordered.groupby(['game_str', 'top_bottom_inning'])['event_code'].shift(-1)
    )

    hit_into_play = ordered['event_code'].eq(4)
    acquired_next = ordered['next_event_code'].eq(2)
    # With nullable dtypes the combination can be NA; treat that as "not an out".
    ordered['is_out'] = (hit_into_play & acquired_next).fillna(False)

    del ordered['next_event_code']
    return ordered

def calculate_outs(df):
    """
    Add a running 'outs' column counted within each (game_str, top_bottom_inning) group.

    Within a group the count goes up by one on every row whose 'is_out' is
    true. The row that records the third out shows 3, and the count starts
    again from 0 on the next row of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'game_str', 'top_bottom_inning' and 'is_out'. Rows are
        counted in their existing order, so sort the frame beforehand.

    Returns
    -------
    pandas.DataFrame
        A copy of df in the same row order with an integer 'outs' column.
        Rows whose game_str or top_bottom_inning is missing are left out,
        which is what groupby does with null keys by default. The input frame
        is not modified.
    """
    group_cols = ['game_str', 'top_bottom_inning']
    df = df.dropna(subset=group_cols).copy()

    # Missing flags count as "not an out".
    is_out = df['is_out'].fillna(False).astype(bool)

    # Running number of outs per group, computed without groupby.apply:
    # applying a function that touches the grouping columns is deprecated in
    # pandas, and the per-row Python loop it wrapped is slow on large frames.
    total_outs = (
        is_out.astype('int64')
        .groupby([df[col] for col in group_cols])
        .cumsum()
    )

    # Fold the running total into 0..3: an out row that lands on a multiple of
    # three is the third out (shown as 3); every other row shows total mod 3,
    # so the rows after a third out start again at 0.
    outs = total_outs % 3
    outs = outs.mask(is_out & (outs == 0), 3)

    df['outs'] = outs.to_numpy()
    return df

# Flag the out-pattern rows on the renumbered events, then add the running
# outs count and order the result by game and play for display/export.
marked = mark_outs_by_event_pattern(final_ie)
with_outs = calculate_outs(marked).sort_values(['game_str', 'play_id'])
with_outs

  return df.groupby(group_cols, group_keys=False).apply(count_outs)


Unnamed: 0,game_str,play_id,at_bat,play_per_game,event_code,top_bottom_inning,first_baserunner,second_baserunner,third_baserunner,play_type,is_out,outs
7894,y1_d001_CGA_QEA,1,1,1,1,top,,,,pitch,False,0
7895,y1_d001_CGA_QEA,1,1,1,2,top,,,,ball acquired,False,0
7896,y1_d001_CGA_QEA,1,1,1,5,top,,,,end of play,False,0
7897,y1_d001_CGA_QEA,2,1,2,1,top,,,,pitch,False,0
7898,y1_d001_CGA_QEA,2,1,2,4,top,,,,ball hit into play,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...
161260,y2_d099_YJD_RZQ,291,74,291,2,top,,,,ball acquired,False,1
161261,y2_d099_YJD_RZQ,291,74,291,5,top,,,,end of play,False,1
161262,y2_d099_YJD_RZQ,292,74,292,5,top,,,,end of play,False,1
161263,y2_d099_YJD_RZQ,292,74,292,1,top,,,,pitch,False,1


In [43]:
# Limit DataFrame displays to 50 rows.
pd.set_option('display.max_rows', 50)

In [44]:
# Sanity check: number of rows (non-null play_id values) at each outs value
# for every game / half-inning combination, first 20 combinations only.
(
    with_outs
    .groupby(['game_str', 'top_bottom_inning', 'outs'])
    .count()[['play_id']]
    .rename(columns={'play_id': 'count'})
    .reset_index()
    .sort_values(['game_str', 'top_bottom_inning', 'outs'])
    .head(20)
)

Unnamed: 0,game_str,top_bottom_inning,outs,count
0,y1_d001_CGA_QEA,bottom,0,146
1,y1_d001_CGA_QEA,bottom,1,89
2,y1_d001_CGA_QEA,bottom,2,86
3,y1_d001_CGA_QEA,bottom,3,1
4,y1_d001_CGA_QEA,top,0,226
5,y1_d001_CGA_QEA,top,1,142
6,y1_d001_CGA_QEA,top,2,317
7,y1_d001_CGA_QEA,top,3,2
8,y1_d002_CGA_QEA,bottom,0,218
9,y1_d002_CGA_QEA,bottom,1,129


In [45]:
# Write the events with their outs columns to outs.csv in the current working
# directory, without the DataFrame index.
with_outs.to_csv('outs.csv', index=False)

In [46]:
# Count the non-null values in every column for each value of outs.
with_outs.groupby(['outs']).count().reset_index()

Unnamed: 0,outs,game_str,play_id,at_bat,play_per_game,event_code,top_bottom_inning,first_baserunner,second_baserunner,third_baserunner,play_type,is_out
0,0,100868,100868,100868,100868,100868,100868,100868,100868,100868,100868,100868
1,1,96552,96552,96552,96552,96552,96552,96552,96552,96552,96552,96552
2,2,75181,75181,75181,75181,75181,75181,75181,75181,75181,75181,75181
3,3,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176,1176
