In [2]:
import pandas as pd
import pyarrow.dataset as pads
import os
import numpy as np
pd.set_option('display.max_columns', None)

data_path = "/Users/alexfrederick/Desktop/SMT-Data-Challenge-2025/"

In [3]:
def readDataSubset(table_type, data_path):
    """
    Loads a specified SMT data subset as a PyArrow dataset.
    """
    valid_tables = ['ball_pos', 'game_events', 'game_info', 'player_pos', 'rosters']
    if table_type not in valid_tables:
        print("Invalid data subset name. Please try again with a valid data subset.")
        return None

    if table_type == 'rosters':
        return pads.dataset(source=os.path.join(data_path, 'rosters.csv'), format='csv')
    else:
        
        return pads.dataset(
            source=os.path.join(data_path, table_type),
            format='csv'
        )

In [4]:
game_info_ds = readDataSubset('game_info', data_path)
game_events_ds = readDataSubset('game_events', data_path)
ball_pos_ds = readDataSubset('ball_pos', data_path)
player_pos_ds = readDataSubset('player_pos', data_path)
rosters_ds = readDataSubset('rosters', data_path)

In [5]:
import pyarrow as pa

filter_criteria = (
    (pads.field("home_team") == "QEA")
)

import pandas as pd
import glob

# Define where your game_info CSVs are stored
game_info_path = "/Users/alexfrederick/Desktop/SMT-Data-Challenge-2025/game_info"
csv_files = glob.glob(f"{game_info_path}/**/*.csv", recursive=True)

# Specify null indicators
na_values = ["", "NA", "NULL", "\\N"]

# Load all CSVs safely using pandas
game_info_df = pd.concat(
    [pd.read_csv(f, na_values=na_values) for f in csv_files],
    ignore_index=True
)

game_events_df = game_events_ds.to_table().to_pandas()

In [6]:
game_info_df

Unnamed: 0,game_str,home_team,away_team,at_bat,play_per_game,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner
0,y1_d081_FBP_QEA,QEA,FBP,1.0,1.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
1,y1_d081_FBP_QEA,QEA,FBP,1.0,2.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
2,y1_d081_FBP_QEA,QEA,FBP,1.0,3.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
3,y1_d081_FBP_QEA,QEA,FBP,1.0,4.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1349,,,
4,y1_d081_FBP_QEA,QEA,FBP,2.0,5.0,top,QEA-0394,QEA-0218,QEA-0263,QEA-0365,QEA-0277,QEA-0027,QEA-0364,QEA-0421,QEA-0249,FBP-1154,,FBP-1349,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72561,y1_d074_PHS_RZQ,RZQ,PHS,,257.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1928,,,
72562,y1_d074_PHS_RZQ,RZQ,PHS,,258.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,
72563,y1_d074_PHS_RZQ,RZQ,PHS,,259.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,
72564,y1_d074_PHS_RZQ,RZQ,PHS,,260.0,top,RZQ-0044,RZQ-0059,RZQ-0333,RZQ-0347,RZQ-0258,RZQ-0287,RZQ-0014,RZQ-0274,RZQ-0214,PHS-1359,,,


In [7]:
# some keys
position_key = pd.DataFrame({
    "code": [*range(1, 14), 255, 14, 15, 16, 17, 18, 19],
    "position": [
        "pitcher", "catcher", "first baseman", "second baseman", "third baseman",
        "shortstop", "left field", "center field", "right field", "batter",
        "runner on first base", "runner on second base", "runner on third base",
        "ball event with no player (e.g., ball bounce)", "home plate umpire",
        "field umpire", "field umpire", "field umpire",
        "first base coach", "third base coach"
    ]
})

event_key = pd.DataFrame({
    "code": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16],
    "play_type": [
        "pitch", "ball acquired", "throw (ball-in-play)", "ball hit into play", 
        "end of play", "pickoff throw", "ball acquired - unknown field position", 
        "throw (ball-in-play) - unknown field position", "ball deflection", 
        "ball deflection off of wall", "home run", "ball bounce"
    ]
})

In [8]:
import pandas as pd

big_ie = pd.merge(
    game_events_df, 
    game_info_df, 
    on=['game_str', 'play_per_game'], 
    suffixes=('', '_dup')
)

big_ie = big_ie.loc[:, ~big_ie.columns.str.endswith('_dup')]

big_ie['player_position'] = pd.to_numeric(big_ie['player_position'], errors='coerce').astype('Int64')
big_ie['event_code'] = pd.to_numeric(big_ie['event_code'], errors='coerce').astype('Int64')

big_ie = big_ie.merge(position_key, how='left', left_on='player_position', right_on='code').drop('code', axis=1)

big_ie = big_ie.merge(event_key, how='left', left_on='event_code', right_on='code').drop('code', axis=1)

big_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,93,330,12027662,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce
273773,y1_d058_WZR_YJD,330,93,330,12027959,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce
273774,y1_d058_WZR_YJD,330,93,330,12029972,8,2,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,ball acquired
273775,y1_d058_WZR_YJD,330,93,330,12031028,8,3,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,throw (ball-in-play)


In [9]:
small_ie = big_ie.get(['game_str', 'play_id', 'at_bat', 'play_per_game', 'event_code', 'top_bottom_inning', 'first_baserunner', 'second_baserunner', 'third_baserunner', 'play_type'])
small_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,event_code,top_bottom_inning,first_baserunner,second_baserunner,third_baserunner,play_type
0,y1_d069_ACN_QEA,1,1,1,1,top,,,,pitch
1,y1_d069_ACN_QEA,1,1,1,2,top,,,,ball acquired
2,y1_d069_ACN_QEA,1,1,1,5,top,,,,end of play
3,y1_d069_ACN_QEA,2,1,2,1,top,,,,pitch
4,y1_d069_ACN_QEA,2,1,2,2,top,,,,ball acquired
...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273773,y1_d058_WZR_YJD,330,93,330,16,top,WZR-1285,,,ball bounce
273774,y1_d058_WZR_YJD,330,93,330,2,top,WZR-1285,,,ball acquired
273775,y1_d058_WZR_YJD,330,93,330,3,top,WZR-1285,,,throw (ball-in-play)


## Fix at bat col

In [10]:
at_bat = list()
game_info_df_sub = big_ie[big_ie['at_bat'].notna()]
for row in range(len(game_info_df_sub)):
    if row==0:
        at_bat = at_bat + [1]
    elif game_info_df_sub.loc[row, 'game_str'] != game_info_df_sub.loc[row-1, 'game_str']:
        at_bat = at_bat + [1]
    elif game_info_df_sub.loc[row, 'batter'] == game_info_df_sub.loc[row-1, 'batter']:
        at_bat = at_bat + [at_bat[row-1]]
    else:
        at_bat = at_bat + [at_bat[row-1] + 1]

In [165]:
game_info_df_sub['at_bat'] = at_bat
final_ie = game_info_df_sub
final_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,batter_num_inning,active_baserunners,new_baserunner,baserunners_inning,batter_on_base,new_atbat,is_out,outs_inning,runs_inning
0,y1_d069_ACN_QEA,1,1,1,14853,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch,1,0,0,0,False,True,0,0,0
1,y1_d069_ACN_QEA,1,1,1,15303,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired,1,0,0,0,False,False,0,0,0
2,y1_d069_ACN_QEA,1,1,1,15303,0,5,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,,end of play,1,0,0,0,False,False,0,0,0
3,y1_d069_ACN_QEA,2,1,2,27753,1,1,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,pitcher,pitch,1,0,0,0,False,False,0,0,0
4,y1_d069_ACN_QEA,2,1,2,28253,2,2,QEA,ACN,top,QEA-0410,QEA-0071,QEA-0263,QEA-0277,QEA-0232,QEA-0027,QEA-0364,QEA-0365,QEA-0249,ACN-2455,,,,catcher,ball acquired,1,0,0,0,False,False,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273772,y1_d058_WZR_YJD,330,82,330,12027662,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce,41,1,0,10,False,False,0,40,9
273773,y1_d058_WZR_YJD,330,82,330,12027959,255,16,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,"ball event with no player (e.g., ball bounce)",ball bounce,41,1,0,10,False,False,0,40,9
273774,y1_d058_WZR_YJD,330,82,330,12029972,8,2,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,ball acquired,41,1,0,10,False,False,0,40,9
273775,y1_d058_WZR_YJD,330,82,330,12031028,8,3,YJD,WZR,top,YJD-0157,YJD-0007,YJD-0373,YJD-0060,YJD-0284,YJD-0172,YJD-0398,YJD-0185,YJD-0340,WZR-1942,WZR-1285,,,center field,throw (ball-in-play),41,1,0,10,False,False,0,40,9


In [12]:
value_counts = final_ie['play_type'].value_counts(dropna=False)
value_counts

play_type
end of play                                      71684
pitch                                            71227
ball acquired                                    69108
ball bounce                                      24500
ball hit into play                               24220
throw (ball-in-play)                              9298
pickoff throw                                     1335
ball deflection                                   1183
ball deflection off of wall                        732
home run                                           437
ball acquired - unknown field position              50
throw (ball-in-play) - unknown field position        3
Name: count, dtype: int64

In [24]:
with_outs.to_csv('outs.csv', index=False)

Calculating number of outs by tracking runners around the bases and keeping note of the inning

In [162]:
final_ie = (
    final_ie
      .sort_values(['game_str', 'play_id'])
      .reset_index(drop=True)
)

# builds a half-inning ID within each game
final_ie['half_inning_id'] = (
    final_ie
      .groupby('game_str')['top_bottom_inning']
      .transform(lambda x: x.ne(x.shift()).cumsum())
)

# batter number of the half inning
final_ie['batter_num_inning'] = (
    final_ie
      .groupby(['game_str', 'half_inning_id'])['at_bat']
      .transform(lambda x: x.ne(x.shift()).cumsum())
)
final_ie['batter_num_inning'] = final_ie['batter_num_inning'].clip(upper=12)

# number of baserunners on base
final_ie['active_baserunners'] = (
    final_ie[['first_baserunner','second_baserunner','third_baserunner']]
      .notna().sum(axis=1)
)

# tracking runners
for base in ['first_baserunner','second_baserunner','third_baserunner']:
    final_ie[f'prev_{base}'] = (
        final_ie
          .groupby(['game_str', 'half_inning_id'])[base]
          .shift()
    )

# counting homers as baserunners not outs
final_ie['atbat_home_run'] = (
    final_ie['play_type'].eq('home run')
    .groupby([final_ie['game_str'], final_ie['half_inning_id'], final_ie['at_bat']])
    .transform('max')
)

def count_new_codes(row):
    curr = {row['first_baserunner'], row['second_baserunner'], row['third_baserunner']}
    prev = {row['prev_first_baserunner'], row['prev_second_baserunner'], row['prev_third_baserunner']}
    curr = {x for x in curr if pd.notna(x)}
    prev = {x for x in prev if pd.notna(x)}
    new = len(curr - prev)
    return new + (1 if row['atbat_home_run'] else 0)

final_ie['new_baserunner'] = final_ie.apply(count_new_codes, axis=1)

# total baserunners
final_ie['baserunners_inning'] = (
    final_ie
      .groupby(['game_str', 'half_inning_id'])['new_baserunner']
      .cumsum()
)
final_ie['baserunners_inning'] = final_ie['baserunners_inning'].clip(upper=10)

# tracking batter
final_ie['batter_on_base'] = (
    final_ie['batter'].eq(final_ie['first_baserunner']) |
    final_ie['batter'].eq(final_ie['second_baserunner']) |
    final_ie['batter'].eq(final_ie['third_baserunner'])
)
final_ie['batter_on_base_any'] = (
    final_ie
      .groupby(['game_str','half_inning_id','at_bat'])['batter_on_base']
      .transform('max')
)

# tracking batter
final_ie['prev_batter_on_base'] = (
    final_ie
      .groupby(['game_str', 'half_inning_id'])['batter_on_base_any']
      .shift(fill_value=False)
)

# sequencing of at bats
final_ie['prev_at_bat'] = (
    final_ie
      .groupby(['game_str', 'half_inning_id'])['at_bat']
      .shift()
)
final_ie['new_atbat'] = final_ie['at_bat'].ne(final_ie['prev_at_bat'])

bat_on_base_any = (
    final_ie['batter_on_base']
      .groupby([final_ie['game_str'],
                final_ie['half_inning_id'],
                final_ie['at_bat']])
      .transform('max')
      .astype(bool)
)

hr_any = (
    final_ie['play_type'].eq('home run')
      .groupby([final_ie['game_str'],
                final_ie['half_inning_id'],
                final_ie['at_bat']])
      .transform('max')
      .astype(bool)
)

# runs scored in the half inning
final_ie['runs_inning'] = (
    final_ie['baserunners_inning']
    - final_ie['active_baserunners']
)

final_ie['outs_inning'] = (
    final_ie['batter_num_inning']
    - final_ie['active_baserunners']
    - final_ie['runs_inning'] - 1
)

final_ie['outs_inning'] = final_ie['outs_inning'].where(
    final_ie['outs_inning'].isin([0, 1]),other=2)


final_ie = final_ie.drop(columns=[
    'prev_first_baserunner',
    'prev_second_baserunner',
    'prev_third_baserunner',
    'prev_at_bat',
    'batter_on_base_any',
    'prev_batter_on_base',
    'atbat_home_run'
])

final_ie

Unnamed: 0,game_str,play_id,at_bat,play_per_game,timestamp,player_position,event_code,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,position,play_type,batter_num_inning,active_baserunners,new_baserunner,baserunners_inning,batter_on_base,new_atbat,outs_inning,runs_inning,half_inning_id
0,y1_d001_CGA_QEA,1,1,1,8699,1,1,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,pitcher,pitch,1,0,0,0,False,True,0,0,1
1,y1_d001_CGA_QEA,1,1,1,9199,2,2,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,catcher,ball acquired,1,0,0,0,False,False,0,0,1
2,y1_d001_CGA_QEA,1,1,1,9199,0,5,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,,end of play,1,0,0,0,False,False,0,0,1
3,y1_d001_CGA_QEA,2,1,2,24149,1,1,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,pitcher,pitch,1,0,0,0,False,False,0,0,1
4,y1_d001_CGA_QEA,2,1,2,24599,10,4,QEA,CGA,top,QEA-0110,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1458,,,,batter,ball hit into play,1,0,0,0,False,False,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273772,y2_d099_YJD_RZQ,291,78,291,1537052091220,2,2,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,catcher,ball acquired,3,0,0,0,False,False,2,0,17
273773,y2_d099_YJD_RZQ,291,78,291,1537052091220,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,3,0,0,0,False,False,2,0,17
273774,y2_d099_YJD_RZQ,292,78,292,47698,0,5,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,,end of play,3,0,0,0,False,False,2,0,17
273775,y2_d099_YJD_RZQ,292,78,292,1537052107870,1,1,RZQ,YJD,top,RZQ-0368,RZQ-0395,RZQ-0414,RZQ-0347,RZQ-0040,RZQ-0326,RZQ-0409,RZQ-0328,RZQ-0279,YJD-0007,,,,pitcher,pitch,3,0,0,0,False,False,2,0,17


In [None]:
distribution = (
    final_ie['outs_inning']
    .value_counts(normalize=True) 
    .sort_index() 
    .mul(100)  
    .round(2) 
)

distribution

outs_inning
0    33.97
1    32.18
2    33.86
Name: proportion, dtype: float64

In [164]:
outs_counts = final_ie['outs_inning'] \
    .value_counts() \
    .sort_index()

outs_counts

outs_inning
0    92997
1    88090
2    92690
Name: count, dtype: int64