In [1]:
import os
import csv
import numpy as np
import pandas as pd

In [2]:
# Folder whose files are iterated as per-game play-by-play inputs.
input_directory = '/'.join([os.pardir, 'output_data', 'play_by_play'])
# CSV path for the stolen-base data (presumably the final output; confirm where it is written).
sb_output_file = '/'.join([os.pardir, 'output_data', 'stolen_base_data.csv'])

In [3]:
# constants
# Single-digit fielder position codes ('1' through '9'); used to recognise
# plays whose description starts with fielder numbers.
FIELDER_NUMBERS = [str(position) for position in range(1, 10)]
# Characters of a pitch-sequence string that are counted as actual pitches.
PITCH_LABELS = list('BCFHIKLMOPQRSTUVXY')

In [4]:
# Empty result table; one row is added per recorded play.
output_df = pd.DataFrame(
    columns=[
        # game identification
        'game_id', 'home_team', 'away_team', 'date_time',
        # the play and its stolen-base classification
        'play', 'is_stolen_base_attempt', 'is_successful',
        # situation before the play
        'inning', 'home_half', 'outs',
        'pitcher', 'catcher', 'batter',
        'on_first', 'on_second', 'on_third',
        'runner_on_first', 'runner_on_second', 'runner_on_third',
        # pitch sequence and count
        'pitches', 'num_pitches', 'balls', 'strikes',
        # description of the pitch on which the event occurred
        'pitch_num_on_event', 'strike_on_event', 'swing_on_event',
        'pitchout_on_event', 'blocked_on_event',
        # tallies taken from the pitch sequence
        'pickoffs_to_first', 'pickoffs_to_second', 'pickoffs_to_third',
        'pitchouts', 'pitches_run_on',
        'total_outs',
    ]
)

In [5]:
for file in os.listdir(input_directory):
    game_id = file.split('.')[0]
    input_file = input_directory + '/' + file
    
    with open(input_file, 'r', encoding='utf8') as fin:
        pbp = fin.readlines()
    num_lines = len(pbp)  
    
    current_line = 0

    home_batting_order = ['N/A'] * 10
    away_batting_order = ['N/A'] * 10
    #home_score = 0
    #away_score = 0
    prev_inning = 0
    outs = 0
    home_half = False
    on_first = False
    runner_on_first = 'N/A'
    on_second = False
    runner_on_second = 'N/A'
    on_third = False
    runner_on_third = 'N/A'


    while(current_line < num_lines):
        record = pbp[current_line].strip('\n').split(',')
        
        if(record[0] == 'info'):
            if(record[1] == 'visteam'):
                away_team = record[2]
            elif(record[1] == 'hometeam'):
                home_team = record[2]
            elif(record[1] == 'date'):
                date = record[2]
            elif(record[1] == 'starttime'):
                start_time = record[2]

                
        # note any pitching or catching changes or pinch runners
        elif(record[0] == 'start' or record[0] == 'sub'):
            new_player_id = record[1]
            batting_position = int(record[4])
            fielding_position = int(record[5])
            if(record[3] == '0'):
                # away team
                if(fielding_position == 1):
                    away_pitcher = new_player_id
                elif(fielding_position == 2):
                    away_catcher = new_player_id
                elif(fielding_position == 12):
                    # pinch runner
                    if(away_batting_order[batting_position] == runner_on_first):
                        runner_on_first = new_player_id
                    elif(away_batting_order[batting_position] == runner_on_second):
                        runner_on_second = new_player_id
                    elif(away_batting_order[batting_position] == runner_on_third):
                        runner_on_third = new_player_id
                away_batting_order[batting_position] = new_player_id
            else:
                # home team
                if(fielding_position == 1):
                    home_pitcher = new_player_id
                elif(fielding_position == 2):
                    home_catcher = new_player_id
                elif(fielding_position == 12):
                    # pinch runner
                    if(home_batting_order[batting_position] == runner_on_first):
                        runner_on_first = new_player_id
                    elif(home_batting_order[batting_position] == runner_on_second):
                        runner_on_second = new_player_id
                    elif(home_batting_order[batting_position] == runner_on_third):
                        runner_on_third = new_player_id
                home_batting_order[batting_position] = new_player_id



        # analyze the plays
        elif(record[0] == 'play'):

            curr_inning = int(record[1])
            
            if(curr_inning != prev_inning):
                # change in inning
                outs = 0
                on_first = False
                runner_on_first = 'N/A'
                on_second = False
                runner_on_second = 'N/A'
                on_third = False
                runner_on_third = 'N/A'
                prev_inning = curr_inning

            # home_half is true if the home team is batting; false otherwise
            if(record[2] == '0'):
                # away team batting (top)
                if(home_half):
                    # change in half-inning
                    outs = 0
                    on_first = False
                    runner_on_first = 'N/A'
                    on_second = False
                    runner_on_second = 'N/A'
                    on_third = False
                    runner_on_third = 'N/A'
                home_half = False
                pitcher = home_pitcher
                catcher = home_catcher

            else:
                # home team batting (bottom)
                if(not home_half):
                    # change in half-inning
                    outs = 0
                    on_first = False
                    runner_on_first = 'N/A'
                    on_second = False
                    runner_on_second = 'N/A'
                    on_third = False
                    runner_on_third = 'N/A'
                home_half = True
                pitcher = away_pitcher
                catcher = away_catcher
                
            if(home_half):
                total_outs = 6 * (curr_inning - 1) + 3 + outs
            else:
                total_outs = 6 * (curr_inning - 1) + outs


            batter = record[3]


            count_on_event = record[4]
            num_balls = int(count_on_event[0])
            num_strikes = int(count_on_event[1])


            pitches = record[5]

            strike_on_event = False
            swing_on_event = False
            pitchout_on_event = False
            blocked_on_event = False
            
            num_pitches = 0
            if(len(pitches) > 0):
                for i in range(len(pitches) - 1):
                    if(pitches[i] in PITCH_LABELS):
                        num_pitches = num_pitches + 1
                if(pitches[-1] in PITCH_LABELS):
                    pitch_num_on_event = num_pitches + 1
                else:
                    pitch_num_on_event = -1
            else:
                pitch_num_on_event = -1
            
            pickoffs_to_first = pitches.count('1')
            pickoffs_to_second = pitches.count('2')
            pickoffs_to_third = pitches.count('3')
            pitchouts = pitches.count('P') + pitches.count('Q') + pitches.count('R')
            pitches_run_on = pitches.count('>')


            # The pitch on the event is the last character of the pitch sequence.
            # With an empty sequence fall back to '' so none of the flags below fire;
            # otherwise the previous play's value would be reused (or a NameError
            # raised on the very first play processed).
            pitch_on_event = pitches[-1] if len(pitches) > 0 else ''

            # describe the pitch that occurred on the event - pitchout, blocked, ball/strike, swinging, etc.
            # a '*' directly before the final character flags the event pitch as blocked
            if(len(pitches) > 1 and pitches[-2] == '*'):
                blocked_on_event = True

            if(pitch_on_event in ['P', 'Q', 'R']):
                pitchout_on_event = True

            if(pitch_on_event in ['F', 'Q', 'R', 'S', 'T', 'X', 'Y']):
                swing_on_event = True

            if(pitch_on_event in ['C', 'F', 'K', 'L', 'M', 'O', 'Q', 'R', 'S', 'T', 'X', 'Y']):
                strike_on_event = True


            # count outs and keep track of runners reaching base and advancing
            play = record[6]

            batter_play = play.split('.')[0]
            basic_play = batter_play.split('/')[0]

            if('CS' in basic_play and 'POCS' not in basic_play):
                is_stolen_base_attempt = True
                if('E' not in basic_play[basic_play.find('CS'):]):
                    is_successful = False
                else:
                    is_successful = True
            elif('SB' in basic_play):
                is_stolen_base_attempt = True
                is_successful = True
            elif('POCS' in basic_play):
                is_stolen_base_attempt = True
                if('E' not in basic_play[basic_play.find('POCS'):]):
                    is_successful = False
                else:
                    is_successful = True
            else:
                is_stolen_base_attempt = False
                is_successful = False

                
                
            # columns of output_df, for reference while filling in the row below
            # (home_score / away_score are not recorded: score tracking is commented out):
            #columns=['game_id', 'home_team', 'away_team', 'date_time', 'play', 'is_stolen_base_attempt',
            #                      'is_successful', 'inning', 'home_half', 'outs', 'pitcher', 'catcher', 'batter', 'on_first',
            #                      'on_second', 'on_third', 'runner_on_first', 'runner_on_second', 'runner_on_third', 'pitches',
            #                      'num_pitches', 'balls', 'strikes', 'pitch_num_on_event', 'strike_on_event', 'swing_on_event',
            #                      'pitchout_on_event', 'blocked_on_event', 'pickoffs_to_first', 'pickoffs_to_second', 
            #                      'pickoffs_to_third', 'pitchouts', 'pitches_run_on', 'total_outs']

            ### record game situation BEFORE each play and pitch information for the at-bat producing that play
            # 'NP' (no play) records are not written to the output table
            if(batter_play != 'NP'):
                # DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
                # one-row frame and concatenated instead (works on old and new pandas).
                # Column dtypes may be inferred per column rather than all being object.
                play_row = {'game_id':game_id, 'home_team':home_team, 'away_team':away_team,
                            'date_time':pd.to_datetime(date + ' ' + start_time),
                            'play':play, 'is_stolen_base_attempt':is_stolen_base_attempt,
                            'is_successful':is_successful, 'inning':curr_inning, 'home_half':home_half,
                            'outs':outs, 'pitcher':pitcher, 'catcher':catcher, 'batter':batter,
                            'on_first':on_first, 'on_second':on_second, 'on_third':on_third,
                            'runner_on_first':runner_on_first, 'runner_on_second':runner_on_second,
                            'runner_on_third':runner_on_third, 'pitches':pitches, 'num_pitches':num_pitches,
                            'balls':num_balls, 'strikes':num_strikes, 'pitch_num_on_event':pitch_num_on_event,
                            'strike_on_event':strike_on_event, 'swing_on_event':swing_on_event,
                            'pitchout_on_event':pitchout_on_event, 'blocked_on_event':blocked_on_event,
                            'pickoffs_to_first':pickoffs_to_first,
                            'pickoffs_to_second':pickoffs_to_second, 'pickoffs_to_third':pickoffs_to_third,
                            'pitchouts':pitchouts, 'pitches_run_on':pitches_run_on, 'total_outs':total_outs}
                output_df = pd.concat([output_df, pd.DataFrame([play_row])], ignore_index=True)



            # record the movements of runners on base
            # advances are ordered starting with runner on third and ending with batter
            # different advances are separated by a semicolon
            batter_advance_noted = False
            first_advance_noted = False
            second_advance_noted = False
            third_advance_noted = False

            first_base_taken = False
            second_base_taken = False
            third_base_taken = False


            if(not on_first):
                first_advance_noted = True
            if(not on_second):
                second_advance_noted = True
            if(not on_third):
                third_advance_noted = True

            # check if there any advances noted in the play description
            # (basic play is separated from advances by a period)

            # note that an error indicator may negate an out in the advance

            if(len(play.split('.')) > 1):
                advances = play.split('.')[1].split(';')
                num_advances = len(advances)

                for i in range(num_advances):
                    current_advance = advances[i]
                    starting_base = current_advance[0]
                    if(current_advance[1] == '-'):
                        safe = True
                    elif(current_advance[1] == 'X'):
                        if('E' in current_advance):
                            safe = True
                        else:
                            safe = False
                    ending_base = current_advance[2]

                    ### change the value of "safe" based on whether an error negates the out

                    if(starting_base == '3'):
                        third_advance_noted = True
                        if(not safe):
                            outs = outs + 1
                            on_third = False
                            runner_on_third = 'N/A'
                        else:
                            if(ending_base == 'H'):
                                #if(home_half):
                                #    home_score = home_score + 1
                                #else:
                                #    away_score = away_score + 1
                                on_third = False
                                runner_on_third = 'N/A'

                    elif(starting_base == '2'):
                        second_advance_noted = True
                        if(not safe):
                            outs = outs + 1
                            on_second = False
                            runner_on_second = 'N/A'
                        else:
                            if(ending_base == '3'):
                                on_third = True
                                third_base_taken = True
                                runner_on_third = runner_on_second
                                on_second = False
                                runner_on_second = 'N/A'
                            elif(ending_base == 'H'):
                                #if(home_half):
                                #    home_score = home_score + 1
                                #else:
                                #    away_score = away_score + 1
                                on_second = False
                                runner_on_second = 'N/A'

                    elif(starting_base == '1'):
                        first_advance_noted = True
                        if(not safe):
                            outs = outs + 1
                            on_first = False
                            runner_on_first = 'N/A'
                        else:
                            if(ending_base == '2'):
                                on_second = True
                                second_base_taken = True
                                runner_on_second = runner_on_first
                                on_first = False
                                runner_on_first = 'N/A'
                            elif(ending_base == '3'):
                                on_third = True
                                third_base_taken = True
                                runner_on_third = runner_on_first
                                on_first = False
                                runner_on_first = 'N/A'
                            elif(ending_base == 'H'):
                                #if(home_half):
                                #    home_score = home_score + 1
                                #else:
                                #    away_score = away_score + 1
                                on_first = False
                                runner_on_first = 'N/A'

                    elif(starting_base == 'B'):
                        batter_advance_noted = True
                        if(not safe):
                            outs = outs + 1
                        else:
                            if(ending_base == '1'):
                                on_first = True
                                first_base_taken = True
                                runner_on_first = batter
                            elif(ending_base == '2'):
                                on_second = True
                                second_base_taken = True
                                runner_on_second = batter
                            elif(ending_base == '3'):
                                on_third = True
                                third_base_taken = True
                                runner_on_third = batter
                            #elif(ending_base == 'H'):
                            #    if(home_half):
                            #        home_score = home_score + 1
                            #    else:
                            #        away_score = away_score + 1


            # now record the at-bat outcome
            # do this after recording advances because you want to be able to update the base variables
            # if you record the batter's outcome before the advances, might get a situation where "on_first" is set to false
            # even though the batter made it to first

            # interference
            if(batter_play in ['C/E2', 'C/E1', 'C/E3']):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter

            # single
            elif(basic_play[0] == 'S' and basic_play[0:2] != 'SB'):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter

            # double or ground rule double
            elif(basic_play[0] == 'D'):
                if(not batter_advance_noted):
                    on_second = True
                    runner_on_second = batter

            # triple
            elif(basic_play[0] == 'T'):
                if(not batter_advance_noted):
                    on_third = True
                    runner_on_third = batter

            # error
            elif(basic_play[0] == 'E'):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter

            # fielder's choice
            # in this data, fielder's choices are not forceouts
            elif(basic_play[0:2] == 'FC'):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter
                # any outs would be recorded in the advances section

            # home run
            #elif(basic_play[0] == 'H' and basic_play[0:2] != 'HP'):
            #    if(not batter_advance_noted):
            #        if(home_half):
            #            home_score = home_score + 1
            #        else:
            #            away_score = away_score + 1

            # strikeout
            elif(basic_play[0] == 'K'):
                # need to check for batter reaching on passed ball or wild pitch
                if(not batter_advance_noted):
                    # no batter advances means it is a simple strikeout
                    outs = outs + 1
                # otherwise, the advance has already been noted by the previous section of code

            # walks, intentional walks, hit by pitch
            elif( (basic_play[0] in ['I', 'W'] and basic_play[0:2] != 'WP') or (basic_play[0:2] == 'HP') ):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter


            ### putouts

            # forceout
            elif('FO' in batter_play):
                # a forceout means the base behind the runner must be occupied
                # and the batter reaches first - otherwise it would be recorded simply as a groundout
                #if(not batter_advance_noted):
                #    on_first = True
                #    runner_on_first = batter
                if('(1)' in basic_play):
                    if(not first_advance_noted):
                        outs = outs + 1
                    if(not first_base_taken):
                        on_first = False
                        runner_on_first = 'N/A'
                elif('(2)' in basic_play):
                    if(not second_advance_noted):
                        outs = outs + 1
                    if(not second_base_taken):
                        on_second = False
                        runner_on_second = 'N/A'
                elif('(3)' in basic_play):
                    if(not third_advance_noted):
                        outs = outs + 1
                    if(not third_base_taken):
                        on_third = False
                        runner_on_third = 'N/A'
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter

            # grounded into double play
            elif('GDP' in batter_play):
                if((basic_play.count('(') == 2) and ('(B)' not in basic_play)):
                    # the double play does not get the batter out
                    if(not batter_advance_noted):
                        on_first = True
                        runner_on_first = batter
                    if(('(3)' in basic_play) and ('(2)' in basic_play)):
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'
                    elif(('(3)' in basic_play) and ('(1)' in basic_play)):
                        if(not first_advance_noted):
                            outs = outs + 1
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'
                        # first base is taken by the batter
                        # if second base is empty, it will remain empty
                        # if second base is occupied, the advances section will tell us what happens
                    elif(('(2)' in basic_play) and ('(1)' in basic_play)):
                        if(not first_advance_noted):
                            outs = outs + 1
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                else:
                    if(not batter_advance_noted):
                        outs = outs + 1  # the batter is out as part of the double play
                    # but is first base necessarily empty? if it was empty beforehand, it is still empty
                    # if there was a runner on first beforehand, either his advance was noted, he stayed on first, or he was part of the double play
                    if('(3)' in basic_play):
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'
                    elif('(2)' in basic_play):
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                    elif('(1)' in basic_play):
                        if(not first_advance_noted):
                            outs = outs + 1
                        on_first = False
                        runner_on_first = 'N/A'
                        # first will be empty because the runner leaving first is out and the batter is out              
            # does not explicitly tell you if the batter reaches
            # if someone is already on base, either they are involved in the double play or their advancement will be noted
            # can ignore the base in front of the lead runner
            # but need to worry about the bases the runners are leaving
            # runners on 1st and 2nd, 5-4 double play. Is second base necessarily empty now? 
            # In all likelihood it is, but perhaps the batter advanced to second for some reason.
            # runners on 1st and 2nd, 5-3 double play. 

            # lined into double play
            # batter is definitely out, but first base may not be empty afterward
            elif('LDP' in batter_play):
                if(not batter_advance_noted):
                    outs = outs + 1
                if('(1)' in basic_play):
                    if(not first_advance_noted):
                        outs = outs + 1
                    on_first = False
                    runner_on_first = 'N/A'
                elif('(2)' in basic_play):
                    if(not second_advance_noted):
                        outs = outs + 1
                    if(not second_base_taken):
                        on_second = False
                        runner_on_second = 'N/A'
                elif('(3)' in basic_play):
                    if(not third_advance_noted):
                        outs = outs + 1
                    if(not third_base_taken):
                        on_third = False
                        runner_on_third = 'N/A'
                        
            # unspecified double play (?)

            # grounded into triple play
            elif('GTP' in batter_play):
                if((basic_play.count('(') == 3) and ('(B)' not in basic_play)):
                    # the triple play does not get the batter out - so it gets the runners on third, second, and first
                    if(not third_advance_noted):
                        outs = outs + 1
                    if(not second_advance_noted):
                        outs = outs + 1
                    if(not third_advance_noted):
                        outs = outs + 1
                else:
                    # the triple play gets the batter and two other runners
                    if(not batter_advance_noted):
                        outs = outs + 1
                    if(('(3)' in basic_play) and ('(2)' in basic_play)):
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not second_advance_noted):
                            outs = outs + 1
                    elif(('(3)' in basic_play) and ('(1)' in basic_play)):
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not second_advance_noted):
                            outs = outs + 1
                    elif(('(2)' in basic_play) and ('(1)' in basic_play)):
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not first_advance_noted):
                            outs = outs + 1                 

            # lined into triple play
            elif('LTP' in batter_play):
                if(not batter_advance_noted):
                    outs = outs + 1
                if(('(3)' in basic_play) and ('(2)' in basic_play)):
                    if(not third_advance_noted):
                        outs = outs + 1
                    if(not second_advance_noted):
                        outs = outs + 1
                elif(('(3)' in basic_play) and ('(1)' in basic_play)):
                    if(not third_advance_noted):
                        outs = outs + 1
                    if(not second_advance_noted):
                        outs = outs + 1
                elif(('(2)' in basic_play) and ('(1)' in basic_play)):
                    if(not second_advance_noted):
                        outs = outs + 1
                    if(not first_advance_noted):
                        outs = outs + 1
            # for triple plays, don't need to empty the bases; this will be done with the teams changing sides

            # unassisted putout
            elif(basic_play in FIELDER_NUMBERS):
                if(not batter_advance_noted):
                    outs = outs + 1

            # (assisted) groundout
            # is a groundout if the basic play is a string of at least two numbers 
            elif(basic_play[0] in FIELDER_NUMBERS and basic_play[1] in FIELDER_NUMBERS):
                # Groundouts always result in the batter out at first
                # If batter weren't out at first, it would be denoted as a fielder's choice or forceout
                # And if the batter were out at second, there would either be a hit or an error
                if(not batter_advance_noted):
                    outs = outs + 1

            # Note: events like 'GDP' and 'FO' need to come before unassisted putouts and groundouts
            # because 'GDP' will probably include two fielder numbers in the first two characters of the play
            
            # fielder error
            elif(basic_play[0] in FIELDER_NUMBERS and basic_play[1] == 'E'):
                if(not batter_advance_noted):
                    on_first = True
                    runner_on_first = batter  


            ### baserunning events not involving the batter

            # balk: don't need to do anything; everything covered by advances

            ### TODO: make sure that if baserunning events occur alongside strikeouts or walks,
            ### recording the baserunning event does not mess up the information provided by the at-bat outcome (or vice versa)
            ### I think we're good on this

            # caught stealing:
            if('POCS' not in basic_play and 'CS' in basic_play):
                # plain caught stealing (POCS is handled separately further down);
                # the character right after 'CS' is the base the runner was trying to reach
                cs_position = basic_play.find('CS')
                base_number = basic_play[cs_position + 2]
                # an 'E' from the CS code onward is treated as an error that negates the out,
                # in which case the runner is safe at the base he was stealing
                cs_out_recorded = 'E' not in basic_play[cs_position:]

                if(base_number == '2'):
                    # skip the out / advance if the advances section already covered this runner
                    if(not first_advance_noted):
                        if(cs_out_recorded):
                            outs += 1
                        else:
                            on_second = True
                            runner_on_second = runner_on_first
                    # vacate first unless another runner has taken it on this play
                    if(not first_base_taken):
                        on_first = False
                        runner_on_first = 'N/A'
                elif(base_number == '3'):
                    if(not second_advance_noted):
                        if(cs_out_recorded):
                            outs += 1
                        else:
                            on_third = True
                            runner_on_third = runner_on_second
                    if(not second_base_taken):
                        on_second = False
                        runner_on_second = 'N/A'
                elif(base_number == 'H'):
                    # on an error the runner scores; score tracking is currently disabled,
                    # so only the out (when there is one) has to be counted here
                    if(cs_out_recorded and not third_advance_noted):
                        outs += 1
                    if(not third_base_taken):
                        on_third = False
                        runner_on_third = 'N/A'

            # defensive indifference: don't need to do anything

            # OA (other advance): don't need to do anything

            # passed ball / wild pitch: don't need to do anything

            # picked off:
            if('PO' in basic_play and 'POCS' not in basic_play):
                # plain pickoff: the character right after 'PO' is the base the runner was picked off of
                base_number = basic_play[basic_play.find('PO') + 2]
                # error may negate out
                if('E' not in basic_play[basic_play.find('PO'):]):
                    # no error, so an out occurred
                    if(base_number == '1'):
                        # only count the out if the advances section has not already handled this runner
                        if(not first_advance_noted):
                            outs = outs + 1
                        # guard the clear like every other branch does, so that a runner who
                        # took first on this same play is not wiped out
                        if(not first_base_taken):
                            on_first = False
                            runner_on_first = 'N/A'
                    elif(base_number == '2'):
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                    elif(base_number == '3'):
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'
                # if an error did occur, no out was recorded, and the runner's advance would be included in the advances section
                # if an error did occur, no out was recorded, and the runner's advance would be included in the advances section

            # picked off caught stealing:
            if('POCS' in basic_play):
                # the character right after 'POCS' is the base the runner was heading for
                base_number = basic_play[basic_play.find('POCS') + 4]
                # an 'E' from the POCS code onward means an error negated the out;
                # in that case nothing is changed here
                if('E' not in basic_play[basic_play.find('POCS'):]):
                    if(base_number == '2'):
                        # only count the out if the advances section has not already handled this runner
                        if(not first_advance_noted):
                            outs = outs + 1
                        # guard the clear like the other branches do, so that a runner who
                        # took first on this same play is not wiped out
                        if(not first_base_taken):
                            on_first = False
                            runner_on_first = 'N/A'
                    elif(base_number == '3'):
                        if(not second_advance_noted):
                            outs = outs + 1
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                    elif(base_number == 'H'):
                        if(not third_advance_noted):
                            outs = outs + 1
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'

            # stolen base:
            # what if the runner advances an extra base due to another throw or something?
            # then the stolen base play would have a one-base advance, but the advances section would have a two-base advance
            # check if(not first_advance_noted)
            if('SB' in basic_play):
                # A double (or triple) steal lists several SB codes separated by ';'.
                # The runner hand-off below is only correct when the lead runner is moved first:
                # moving the runner from first onto second before the runner from second has been
                # moved to third would overwrite runner_on_second. So rather than relying on the
                # order in which the codes happen to be written, process them explicitly in
                # home -> third -> second order.
                steal_order = {'H': 0, '3': 1, '2': 2}
                stolen_bases = basic_play.split(';')
                # the character right after 'SB' is the base being stolen
                steal_targets = [steal[steal.find('SB') + 2] for steal in stolen_bases if 'SB' in steal]
                # unrecognised base characters sort last; they are ignored by the dispatch below
                steal_targets.sort(key=lambda base: steal_order.get(base, len(steal_order)))

                for base_number in steal_targets:
                    if(base_number == '2'):
                        # skip the advance if the advances section already moved this runner
                        if(not first_advance_noted):
                            on_second = True
                            runner_on_second = runner_on_first
                        # vacate first unless another runner has taken it on this play
                        if(not first_base_taken):
                            on_first = False
                            runner_on_first = 'N/A'
                    elif(base_number == '3'):
                        if(not second_advance_noted):
                            on_third = True
                            runner_on_third = runner_on_second
                        if(not second_base_taken):
                            on_second = False
                            runner_on_second = 'N/A'
                    elif(base_number == 'H'):
                        # the runner scores; score tracking is currently disabled,
                        # so only third base has to be vacated
                        if(not third_base_taken):
                            on_third = False
                            runner_on_third = 'N/A'


        # move on to the next play-by-play record
        current_line += 1

In [7]:
# keep only the rows flagged as stolen base attempts and write them out without the index
sb_attempt_mask = output_df['is_stolen_base_attempt'].eq(True)
sb_df = output_df.loc[sb_attempt_mask]
sb_df.to_csv(sb_output_file, index=False)

Unnamed: 0,game_id,home_team,away_team,date_time,play,is_stolen_base_attempt,is_successful,inning,home_half,outs,...,strike_on_event,swing_on_event,pitchout_on_event,blocked_on_event,pickoffs_to_first,pickoffs_to_second,pickoffs_to_third,pitchouts,pitches_run_on,total_outs
2,ANA201804020,ANA,CLE,2018-04-02 19:07:00,SB2,True,True,3,False,1,...,False,False,False,False,1,0,0,0,1,13
9,ANA201804040,ANA,CLE,2018-04-04 13:08:00,SB2,True,True,2,False,2,...,True,False,False,False,0,0,0,0,1,8
14,ANA201804040,ANA,CLE,2018-04-04 13:08:00,SB2,True,True,5,False,2,...,False,False,False,False,4,0,0,0,1,26
18,ANA201804060,ANA,OAK,2018-04-06 19:07:00,POCS2(1361),True,False,4,False,0,...,False,False,False,False,1,0,0,0,0,18
22,ANA201804060,ANA,OAK,2018-04-06 19:07:00,SB2,True,True,8,True,0,...,False,False,False,False,0,0,0,0,1,45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18554,WAS201809230,WAS,NYN,2018-09-23 14:00:00,SB2,True,True,3,True,0,...,False,False,False,False,1,0,0,0,1,15
18565,WAS201809240,WAS,MIA,2018-09-24 19:06:00,SB2.1-3(E2/TH),True,True,4,False,2,...,True,True,False,False,0,0,0,0,1,20
18571,WAS201809240,WAS,MIA,2018-09-24 19:06:00,SB2,True,True,8,True,2,...,False,False,False,False,0,0,0,0,1,47
18575,WAS201809250,WAS,MIA,2018-09-25 19:05:00,SB2,True,True,1,True,1,...,True,False,False,False,0,0,0,0,1,4
