In [1]:
import os
import csv
import pandas as pd
import numpy as np

In [2]:
# Input files: the merged stolen-base/Kaggle table and the per-player reference tables.
sb_kaggle_file = f'{os.pardir}/output_data/merged_sb_kaggle_data.csv'
statcast_pitchers_file = f'{os.pardir}/input_data/statcast_data/statcast_pitchers.csv'
statcast_runners_file = f'{os.pardir}/input_data/statcast_data/statcast_runners.csv'
bbref_catchers_2017_file = f'{os.pardir}/input_data/statcast_data/bbref_catchers_2017.csv'
statcast_catchers_2018_file = f'{os.pardir}/input_data/statcast_data/statcast_catchers_2018.csv'

# Output files written by the final cell of the notebook.
full_output_file = f'{os.pardir}/r_code_and_data/full_data.csv'
pitchless_output_file = f'{os.pardir}/r_code_and_data/pitchless_data.csv'

In [3]:
# Load the merged table. The default NA markers are switched off, so only the
# literal string 'NaN' is parsed as missing. The indicator column left over from
# the earlier merge is renamed so later merges can add their own indicators.
sb_kaggle_df = (
    pd.read_csv(sb_kaggle_file, keep_default_na=False, na_values='NaN')
    .rename(columns={'_merge': 'orig_merge'})
)

In [4]:
# add a variable tracking the target base(s) for a stolen base attempt
# Each entry pairs a base label with the play codes that mark an attempt on that
# base. A play matching several bases gets every matching label in 2, 3, H order,
# each followed by ';'. A play matching none gets the empty string.
attempt_codes_by_base = (
    ('2', ('SB2', 'CS2', 'POCS2')),
    ('3', ('SB3', 'CS3', 'POCS3')),
    ('H', ('SBH', 'CSH', 'POCSH')),
)

sb_kaggle_df['target_base'] = [
    ''.join(
        base + ';'
        for base, codes in attempt_codes_by_base
        if any(code in play for code in codes)
    )
    for play in sb_kaggle_df['play']
]

In [5]:
# only consider steals of second
targets_second_only = sb_kaggle_df['target_base'].eq('2;')

# and exclude all of the cases where a runner is on second (there are 11 of these);
# after this filter the 'on_second' variable is constant and can be dropped
second_base_empty = sb_kaggle_df['on_second'].eq(False)

sb_kaggle_df = sb_kaggle_df.loc[targets_second_only & second_base_empty]

# no catcher is left-handed, so drop the handedness variable
print(sb_kaggle_df['catcher_throws'].eq('L').sum())

0


In [6]:
# convert data about balls and strikes to a categorical variable indicating the type of count:
# hitter's (H) count, pitcher's (P) count, or neutral (N)
counts_by_type = (
    ('H', ('2-0', '3-0', '3-1')),
    ('P', ('0-2', '1-2', '2-2')),
    ('N', ('0-0', '0-1', '1-0', '1-1', '2-1', '3-2', '4-2')),
)
count_dict = {count: kind for kind, counts in counts_by_type for count in counts}

# dummy-variable lookups; a count type that is missing stays missing in the dummies
hitters_count_dict = {kind: kind == 'H' for kind, _ in counts_by_type}
pitchers_count_dict = {kind: kind == 'P' for kind, _ in counts_by_type}

sb_kaggle_df = (
    sb_kaggle_df
    .astype({'b_count': str, 's_count': str})
    .assign(count=lambda plays: plays['b_count'] + '-' + plays['s_count'])
    .assign(count_type=lambda plays: plays['count'].map(count_dict))
    .assign(is_hitters_count=lambda plays: plays['count_type'].map(hitters_count_dict))
    .assign(is_pitchers_count=lambda plays: plays['count_type'].map(pitchers_count_dict))
)

In [7]:
# Recode the boolean success flag as a text label and expose it as 'outcome'.
# The column keeps its position in the frame.
outcome_labels = sb_kaggle_df['is_successful'].map({True: 'SUCCESS', False: 'FAILURE'})

sb_kaggle_df = (
    sb_kaggle_df
    .assign(is_successful=outcome_labels)
    .rename(columns={'is_successful': 'outcome'})
)

In [8]:
# List every column currently available before choosing the ones to keep.
print(list(sb_kaggle_df.columns))

['game_id', 'home_team', 'away_team', 'date_time', 'play', 'is_stolen_base_attempt', 'outcome', 'inning', 'home_half', 'outs', 'pitcher', 'catcher', 'batter', 'on_first', 'on_second', 'on_third', 'runner_on_first', 'runner_on_second', 'runner_on_third', 'pitches', 'num_pitches', 'b_count', 's_count', 'pitch_num_on_event', 'strike_on_event', 'swing_on_event', 'pitchout_on_event', 'blocked_on_event', 'pickoffs_to_first', 'pickoffs_to_second', 'pickoffs_to_third', 'pitchouts', 'pitches_run_on', 'total_outs', 'pitcher_name', 'pitcher_throws', 'pitcher_height', 'pitcher_weight', 'catcher_name', 'catcher_throws', 'catcher_height', 'catcher_weight', 'batter_name', 'batter_bats', 'batter_height', 'batter_weight', 'runner_on_first_name', 'runner_on_first_height', 'runner_on_first_weight', 'runner_on_second_name', 'runner_on_second_height', 'runner_on_second_weight', 'runner_on_third_name', 'runner_on_third_height', 'runner_on_third_weight', 'g_id', 'ab_id', 'pitcher_id', 'batter_id', 'p_score',

In [9]:
# Keep only the columns used downstream: the outcome, the game situation, the
# player names used as merge keys, and the pitch-level measurements.
kept_columns = [
    'outcome', 'inning', 'home_half', 'outs', 'on_third', 'pitches', 'num_pitches',
    'is_hitters_count', 'is_pitchers_count', 'pitch_num_on_event', 'strike_on_event',
    'swing_on_event', 'pitchout_on_event', 'blocked_on_event', 'pickoffs_to_first',
    'pitchouts', 'pitches_run_on',
    'pitcher_name', 'catcher_name', 'runner_on_first_name',
    'p_score', 'pitcher_throws', 'batter_bats',
    'start_speed', 'end_speed', 'spin_rate', 'spin_dir', 'break_angle', 'break_length',
    'zone', 'pitch_type', 'b_score', 'orig_merge',
]
sb_kaggle_df = sb_kaggle_df.loc[:, kept_columns]

In [10]:
def _pitcher_season(pitchers, season):
    """Return one season of pitcher rows with the season appended to each stat column.

    Parameters
    ----------
    pitchers : pandas.DataFrame
        Frame holding 'pitcher_name', 'year' and the per-season stat columns.
    season : int
        Value of the 'year' column to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'year'. Every column except 'pitcher_name' carries a
        '_<season>' suffix, so several seasons can be merged side by side.
    """
    suffix = '_' + str(season)
    return (
        pitchers[pitchers['year'] == season]
        .drop(columns='year')
        .add_suffix(suffix)
        .rename(columns={'pitcher_name' + suffix: 'pitcher_name'})
    )


# Read the pitcher table, discard its last column, and normalise the
# 'first_name' header, which carries a leading space in the file.
statcast_pitchers_df = (
    pd.read_csv(statcast_pitchers_file)
    .iloc[:, :-1]
    .rename(columns={' first_name': 'first_name', 'player_id': 'p_id'})
)

# Build the merge key. `.str.strip()` leaves a missing name as NaN instead of
# raising, which matches the null-name guard applied before the name replacement
# further down the notebook.
statcast_pitchers_df['pitcher_name'] = (
    statcast_pitchers_df['first_name'] + ' ' + statcast_pitchers_df['last_name']
).str.strip()

statcast_pitchers_df = statcast_pitchers_df[[
    'pitcher_name', 'year', 'p_total_pa', 'p_total_stolen_base', 'p_pickoff_attempt_1b',
    'p_pickoff_1b', 'p_stolen_base_2b', 'p_caught_stealing_2b', 'p_pickoff_error_1b',
    'p_pitchout', 'p_total_pitches', 'out_zone', 'pitch_count_offspeed', 'pitch_count_fastball',
    'pitch_count_breaking', 'in_zone',
]]

# One frame per season, built by the same helper so the two cannot drift apart.
statcast_pitchers_2017_df = _pitcher_season(statcast_pitchers_df, 2017)
statcast_pitchers_2018_df = _pitcher_season(statcast_pitchers_df, 2018)

print(statcast_pitchers_2017_df.columns)
print(statcast_pitchers_2018_df.columns)

Index(['pitcher_name', 'p_total_pa_2017', 'p_total_stolen_base_2017',
       'p_pickoff_attempt_1b_2017', 'p_pickoff_1b_2017',
       'p_stolen_base_2b_2017', 'p_caught_stealing_2b_2017',
       'p_pickoff_error_1b_2017', 'p_pitchout_2017', 'p_total_pitches_2017',
       'out_zone_2017', 'pitch_count_offspeed_2017',
       'pitch_count_fastball_2017', 'pitch_count_breaking_2017',
       'in_zone_2017'],
      dtype='object')
Index(['pitcher_name', 'p_total_pa_2018', 'p_total_stolen_base_2018',
       'p_pickoff_attempt_1b_2018', 'p_pickoff_1b_2018',
       'p_stolen_base_2b_2018', 'p_caught_stealing_2b_2018',
       'p_pickoff_error_1b_2018', 'p_pitchout_2018', 'p_total_pitches_2018',
       'out_zone_2018', 'pitch_count_offspeed_2018',
       'pitch_count_fastball_2018', 'pitch_count_breaking_2018',
       'in_zone_2018'],
      dtype='object')


In [11]:
# Read the runner table, discard its last column, and give the columns their
# runner-prefixed names ('first_name' carries a leading space in the file).
statcast_runners_df = (
    pd.read_csv(statcast_runners_file)
    .iloc[:, :-1]
    .rename(columns={' first_name': 'first_name', 'player_id': 'r_id', 'n_bolts': 'r_n_bolts',
                     'hp_to_1b': 'r_hp_to_1b', 'sprint_speed': 'r_sprint_speed'})
)
statcast_runners_df = statcast_runners_df.assign(
    runner_name=statcast_runners_df['first_name'] + ' ' + statcast_runners_df['last_name']
)
statcast_runners_df = statcast_runners_df.loc[:, ['runner_name', 'year', 'r_caught_stealing_2b',
                                                  'r_pickoff_1b', 'r_stolen_base_2b', 'r_sprint_speed']]

# Split by season. Each stat column gets a '_<season>' suffix and the name column
# becomes the merge key 'runner_on_first_name'.
runner_seasons = {}
for season in (2017, 2018):
    season_tag = '_' + str(season)
    runner_seasons[season] = (
        statcast_runners_df.loc[statcast_runners_df['year'] == season]
        .drop('year', axis=1)
        .add_suffix(season_tag)
        .rename(columns={'runner_name' + season_tag: 'runner_on_first_name'})
    )

statcast_runners_2017_df = runner_seasons[2017]
statcast_runners_2018_df = runner_seasons[2018]

print(statcast_runners_2017_df.columns)
print(statcast_runners_2018_df.columns)

Index(['runner_on_first_name', 'r_caught_stealing_2b_2017',
       'r_pickoff_1b_2017', 'r_stolen_base_2b_2017', 'r_sprint_speed_2017'],
      dtype='object')
Index(['runner_on_first_name', 'r_caught_stealing_2b_2018',
       'r_pickoff_1b_2018', 'r_stolen_base_2b_2018', 'r_sprint_speed_2018'],
      dtype='object')


In [12]:
bbref_catchers_2017_df = pd.read_csv(bbref_catchers_2017_file)
statcast_catchers_2018_df = pd.read_csv(statcast_catchers_2018_file)

# --- 2017 catchers ---
# Keep the name and caught-stealing percentage, and replace non-breaking spaces in
# the names with ordinary spaces. The vectorized `.str` methods pass missing
# values through instead of raising.
bbref_catchers_2017_df = bbref_catchers_2017_df[['Name', 'CS%']].copy()
bbref_catchers_2017_df['Name'] = bbref_catchers_2017_df['Name'].str.replace('\xa0', ' ', regex=False)

# Keep the first row for each (cleaned) name and renumber the rows from 0.
bbref_catchers_2017_df = (
    bbref_catchers_2017_df
    .drop_duplicates(subset='Name', keep='first')
    .reset_index(drop=True)
)

# 'CS%' is cast to text, stripped of '%' and scaled to a fraction. Its complement
# is stored as the stolen-base rate allowed by the catcher.
caught_stealing_rate = (
    bbref_catchers_2017_df['CS%']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
    / 100
)
bbref_catchers_2017_df['CS%'] = 1 - caught_stealing_rate
bbref_catchers_2017_df = bbref_catchers_2017_df.rename(
    columns={'Name': 'catcher_name', 'CS%': 'c_sb_rate_2017'}
)

# --- 2018 catchers ---
# Keep the catcher name and the 'pop_2b_sba' column under season-tagged names.
statcast_catchers_2018_df = (
    statcast_catchers_2018_df[['catcher', 'pop_2b_sba']]
    .rename(columns={'catcher': 'catcher_name', 'pop_2b_sba': 'c_pop_2b_sba_2018'})
)

print(bbref_catchers_2017_df.columns)
print(statcast_catchers_2018_df.columns)

Index(['catcher_name', 'c_sb_rate_2017'], dtype='object')
Index(['catcher_name', 'c_pop_2b_sba_2018'], dtype='object')


In [13]:
# fix pitcher names
# Dump the distinct pitcher names from each source (one value per line) so the
# two spellings can be compared side by side.
pitcher_name_list_dir = os.pardir + '/output_data/player_names/merging_statcast/'

sb_kaggle_unique_pitcher_names = (
    sb_kaggle_df[['pitcher_name']].drop_duplicates().reset_index(drop=True)
)
sb_kaggle_unique_pitcher_names.to_csv(
    pitcher_name_list_dir + 'sb_kaggle_pitcher_names.txt', sep='\n', index=False
)

statcast_2018_unique_pitcher_names = (
    statcast_pitchers_2018_df[['pitcher_name']].drop_duplicates().reset_index(drop=True)
)
statcast_2018_unique_pitcher_names.to_csv(
    pitcher_name_list_dir + 'statcast_2018_pitcher_names.txt', sep='\n', index=False
)

In [14]:
# Pitchers that appear in the stolen-base data but whose name has no exact match
# in the 2018 statcast names, in order of first appearance. A set lookup replaces
# the nested scan over both name lists and does not depend on row labels.
known_2018_pitchers = set(statcast_2018_unique_pitcher_names['pitcher_name'])
missed_pitchers_2018 = [
    pitcher
    for pitcher in sb_kaggle_unique_pitcher_names['pitcher_name']
    if pitcher not in known_2018_pitchers
]
print(missed_pitchers_2018)

['J.C. Ramirez', 'Lance McCullers', 'C.D. Pelham', 'Jorge de la Rosa', 'J.T. Chargois', 'Jimmie Sherfy', 'Daniel Winkler', 'Mike Wright', 'Michael Fiers', 'Jake Junis', 'Carl Edwards', 'Matt Boyd', 'Matt Festa', 'Phil Maton', 'Mark Leiter', 'Tom Milone', 'A.J. Ramos', 'Vincent Velasquez', 'Jose Valdez', 'Jacob Faria', 'Daniel Coulombe']


In [15]:
# sb_kaggle - statcast
# J.C. Ramirez - JC Ramirez
# Lance McCullers - Lance McCullers Jr.
# C.D. Pelham - CD Pelham
# Jorge de la Rosa - Jorge De La Rosa
# J.T. Chargois - JT Chargois
# Jimmie Sherfy - James Sherfy
# Daniel Winkler - Dan Winkler
# Mike Wright - Mike Wright Jr.
# Michael Fiers - Mike Fiers
# Jake Junis - Jakob Junis
# Carl Edwards - Carl Edwards Jr.
# Matt Boyd - Matthew Boyd
# Matt Festa - Matthew Festa
# Phil Maton - Phil Maton III
# Mark Leiter - Mark Leiter Jr.
# Tom Milone - Tommy Milone
# A.J. Ramos - AJ Ramos
# Vincent Velasquez - Vince Velasquez
# Jose Valdez - Jose A. Valdez
# Jacob Faria - Jake Faria
# Daniel Coulombe - Danny Coulombe
# Jose Fernandez - Jose Manuel Fernandez


# Seung Hwan Oh - Seunghwan Oh
# Michael Dunn - Mike Dunn
# Nestor Cortes - Nestor Cortes Jr.

In [16]:
# statcast spelling -> sb_kaggle spelling, for pitchers whose names differ between sources
pitcher_name_fixes = {
    'JC Ramirez': 'J.C. Ramirez',
    'Lance McCullers Jr.': 'Lance McCullers',
    'CD Pelham': 'C.D. Pelham',
    'Jorge De La Rosa': 'Jorge de la Rosa',
    'JT Chargois': 'J.T. Chargois',
    'James Sherfy': 'Jimmie Sherfy',
    'Dan Winkler': 'Daniel Winkler',
    'Mike Wright Jr.': 'Mike Wright',
    'Mike Fiers': 'Michael Fiers',
    'Jakob Junis': 'Jake Junis',
    'Carl Edwards Jr.': 'Carl Edwards',
    'Matthew Boyd': 'Matt Boyd',
    'Matthew Festa': 'Matt Festa',
    'Phil Maton III': 'Phil Maton',
    'Mark Leiter Jr.': 'Mark Leiter',
    'Tommy Milone': 'Tom Milone',
    'AJ Ramos': 'A.J. Ramos',
    'Vince Velasquez': 'Vincent Velasquez',
    'Jose A. Valdez': 'Jose Valdez',
    'Jake Faria': 'Jacob Faria',
    'Danny Coulombe': 'Daniel Coulombe',
    'Jose Manuel Fernandez': 'Jose Fernandez',
}

# Drop rows without a name, then rewrite the statcast spellings in both seasons.
statcast_pitchers_2017_df = (
    statcast_pitchers_2017_df
    .dropna(subset=['pitcher_name'])
    .assign(pitcher_name=lambda pitchers: pitchers['pitcher_name'].replace(pitcher_name_fixes))
)

statcast_pitchers_2018_df = (
    statcast_pitchers_2018_df
    .dropna(subset=['pitcher_name'])
    .assign(pitcher_name=lambda pitchers: pitchers['pitcher_name'].replace(pitcher_name_fixes))
)

In [17]:
# fix catcher names
# Replace accented catcher names with their unaccented spelling.
# Keys are the exact accented strings (written with \u escapes so the code points
# are unambiguous); any name not listed is left unchanged. A single whole-value
# `replace` stands in for the row-by-row if/elif chain, passes missing names
# through untouched, and does not depend on the rows being labelled 0..n-1.
accented_catcher_names = {
    'Mart\u00edn Maldonado': 'Martin Maldonado',
    'Ren\u00e9 Rivera': 'Rene Rivera',
    'Jes\u00fas Sucre': 'Jesus Sucre',
    'Jos\u00e9 Brice\u00f1o': 'Jose Briceno',
    'Omar Narv\u00e1ez': 'Omar Narvaez',
    'Elias D\u00edaz': 'Elias Diaz',
    'Carlos P\u00e9rez': 'Carlos Perez',
    'Tom\u00e1s Nido': 'Tomas Nido',
    'Manny Pi\u00f1a': 'Manny Pina',
    'Christian V\u00e1zquez': 'Christian Vazquez',
    'Alfredo Gonz\u00e1lez': 'Alfredo Gonzalez',
    'Francisco Pe\u00f1a': 'Francisco Pena',
    'Tom\u00e1s Telis': 'Tomas Telis',
    'Francisco Mej\u00eda': 'Francisco Mejia',
}

bbref_catchers_2017_df = bbref_catchers_2017_df.assign(
    catcher_name=bbref_catchers_2017_df['catcher_name'].replace(accented_catcher_names)
)

In [18]:
# Replace accented catcher names in the 2018 statcast table with their unaccented
# spelling. Keys are the exact accented strings (written with \u escapes so the
# code points are unambiguous); any name not listed is left unchanged. A single
# whole-value `replace` stands in for the row-by-row if/elif chain, passes missing
# names through untouched, and does not depend on the rows being labelled 0..n-1.
accented_catcher_names_2018 = {
    'Mart\u00edn Maldonado': 'Martin Maldonado',
    'Ren\u00e9 Rivera': 'Rene Rivera',
    'Jes\u00fas Sucre': 'Jesus Sucre',
    'Jos\u00e9 Brice\u00f1o': 'Jose Briceno',
    'Omar Narv\u00e1ez': 'Omar Narvaez',
    'Elias D\u00edaz': 'Elias Diaz',
    'Carlos P\u00e9rez': 'Carlos Perez',
    'Tom\u00e1s Nido': 'Tomas Nido',
    'Manny Pi\u00f1a': 'Manny Pina',
    'Christian V\u00e1zquez': 'Christian Vazquez',
    'Alfredo Gonz\u00e1lez': 'Alfredo Gonzalez',
    'Francisco Pe\u00f1a': 'Francisco Pena',
    'Tom\u00e1s Telis': 'Tomas Telis',
    'Francisco Mej\u00eda': 'Francisco Mejia',
}

statcast_catchers_2018_df = statcast_catchers_2018_df.assign(
    catcher_name=statcast_catchers_2018_df['catcher_name'].replace(accented_catcher_names_2018)
)

In [19]:
# Dump the distinct catcher names from each source (one value per line) so the
# two spellings can be compared side by side.
catcher_name_list_dir = os.pardir + '/output_data/player_names/merging_statcast/'

sb_kaggle_unique_catcher_names = (
    sb_kaggle_df[['catcher_name']].drop_duplicates().reset_index(drop=True)
)
sb_kaggle_unique_catcher_names.to_csv(
    catcher_name_list_dir + 'sb_kaggle_catcher_names.txt', sep='\n', index=False
)

statcast_2018_unique_catcher_names = (
    statcast_catchers_2018_df[['catcher_name']].drop_duplicates().reset_index(drop=True)
)
statcast_2018_unique_catcher_names.to_csv(
    catcher_name_list_dir + 'statcast_2018_catcher_names.txt', sep='\n', index=False
)

In [20]:
# Catchers that appear in the stolen-base data but whose name has no exact match
# in the 2018 statcast names, in order of first appearance. A set lookup replaces
# the nested scan over both name lists and does not depend on row labels.
known_2018_catchers = set(statcast_2018_unique_catcher_names['catcher_name'])
missed_catchers_2018 = [
    catcher
    for catcher in sb_kaggle_unique_catcher_names['catcher_name']
    if catcher not in known_2018_catchers
]
print(missed_catchers_2018)

['Rafael Lopez', 'Mike Marjama', 'Steve Baron']


In [21]:
# sb_kaggle - statcast
# Rafael Lopez - Raffy Lopez
# Mike Marjama - NOT IN STATCAST 2018 DATA (is in 2017 data)
# Steve Baron - Steven Baron

In [22]:
# reference-table spelling -> sb_kaggle spelling, for catchers whose names differ
catcher_name_fixes = {
    'Raffy Lopez': 'Rafael Lopez',
    'Steven Baron': 'Steve Baron',
}

# Drop rows without a name, then rewrite the spellings in both catcher tables.
bbref_catchers_2017_df = (
    bbref_catchers_2017_df
    .dropna(subset=['catcher_name'])
    .assign(catcher_name=lambda catchers: catchers['catcher_name'].replace(catcher_name_fixes))
)

statcast_catchers_2018_df = (
    statcast_catchers_2018_df
    .dropna(subset=['catcher_name'])
    .assign(catcher_name=lambda catchers: catchers['catcher_name'].replace(catcher_name_fixes))
)

In [23]:
# fix runner names
# Drop runners without a name, then trim surrounding whitespace from the names.
statcast_runners_2017_df = (
    statcast_runners_2017_df
    .dropna(subset=['runner_on_first_name'])
    .assign(runner_on_first_name=lambda runners: runners['runner_on_first_name'].map(str.strip))
)

statcast_runners_2018_df = (
    statcast_runners_2018_df
    .dropna(subset=['runner_on_first_name'])
    .assign(runner_on_first_name=lambda runners: runners['runner_on_first_name'].map(str.strip))
)

In [24]:
# Dump the distinct runner names from each source (one value per line) so the
# two spellings can be compared side by side.
runner_name_list_dir = os.pardir + '/output_data/player_names/merging_statcast/'

sb_kaggle_unique_runner_names = (
    sb_kaggle_df[['runner_on_first_name']].drop_duplicates().reset_index(drop=True)
)
sb_kaggle_unique_runner_names.to_csv(
    runner_name_list_dir + 'sb_kaggle_runner_names.txt', sep='\n', index=False
)

statcast_2018_unique_runner_names = (
    statcast_runners_2018_df[['runner_on_first_name']].drop_duplicates().reset_index(drop=True)
)
statcast_2018_unique_runner_names.to_csv(
    runner_name_list_dir + 'statcast_2018_runner_names.txt', sep='\n', index=False
)

In [25]:
# Runners that appear in the stolen-base data but whose name has no exact match
# in the 2018 statcast names, in order of first appearance. A set lookup replaces
# the nested scan over both name lists and does not depend on row labels.
known_2018_runners = set(statcast_2018_unique_runner_names['runner_on_first_name'])
missed_runners_2018 = [
    runner
    for runner in sb_kaggle_unique_runner_names['runner_on_first_name']
    if runner not in known_2018_runners
]
print(missed_runners_2018)

['Dee Gordon', 'Eric Young', 'A.J. Pollock', 'Steven Souza', 'Ronald Acuna', 'Michael Taylor', 'Michael Brantley', 'Steve Wilkerson', 'Jackie Bradley', 'Cedric Mullins', 'Lourdes Gurriel', 'J.T. Riddle', 'Yolmer Sanchez', 'Albert Almora', 'Terrance Gore', 'George Springer', 'Nelson Cruz', 'Jim Adduci', 'Yulieski Gurriel', 'J.B. Shuck', 'Buster Posey', 'Kevin Kaczmarski', 'Howie Kendrick', 'Rafael Lopez']


In [26]:
# sb_kaggle - statcast
# Dee Gordon - Dee Strange-Gordon
# Eric Young - Eric Young Jr.
# A.J. Pollock - AJ Pollock
# Steven Souza - Steven Souza Jr.
# Ronald Acuna - Ronald Acuna Jr.
# Michael Taylor - Michael A. Taylor
# Michael Brantley - Michael Brantley Jr.
# Steve Wilkerson - Stevie Wilkerson
# Jackie Bradley - Jackie Bradley Jr.
# Cedric Mullins - Cedric Mullins II
# Lourdes Gurriel - Lourdes Gurriel Jr.
# J.T. Riddle - JT Riddle
# Yolmer Sanchez - Carlos Sanchez
# Albert Almora - Albert Almora Jr.
# Terrance Gore - NOT IN STATCAST 2018 DATA
# George Springer - George Springer III
# Nelson Cruz - Nelson Cruz Jr.
# Jim Adduci - James Adduci
# Yulieski Gurriel - Yuli Gurriel
# J.B. Shuck - JB Shuck
# Buster Posey - Buster Posey III
# Kevin Kaczmarski - NOT IN STATCAST 2018 DATA
# Howie Kendrick - Howie Kendrick III
# Rafael Lopez - Raffy Lopez

In [27]:
# statcast spelling -> sb_kaggle spelling, for runners whose names differ between sources
runner_name_fixes = {
    'Dee Strange-Gordon': 'Dee Gordon',
    'Eric Young Jr.': 'Eric Young',
    'AJ Pollock': 'A.J. Pollock',
    'Steven Souza Jr.': 'Steven Souza',
    'Ronald Acuna Jr.': 'Ronald Acuna',
    'Michael A. Taylor': 'Michael Taylor',
    'Michael Brantley Jr.': 'Michael Brantley',
    'Stevie Wilkerson': 'Steve Wilkerson',
    'Jackie Bradley Jr.': 'Jackie Bradley',
    'Cedric Mullins II': 'Cedric Mullins',
    'Lourdes Gurriel Jr.': 'Lourdes Gurriel',
    'JT Riddle': 'J.T. Riddle',
    'Carlos Sanchez': 'Yolmer Sanchez',
    'Albert Almora Jr.': 'Albert Almora',
    'George Springer III': 'George Springer',
    'Nelson Cruz Jr.': 'Nelson Cruz',
    'James Adduci': 'Jim Adduci',
    'Yuli Gurriel': 'Yulieski Gurriel',
    'JB Shuck': 'J.B. Shuck',
    'Buster Posey III': 'Buster Posey',
    'Howie Kendrick III': 'Howie Kendrick',
    'Raffy Lopez': 'Rafael Lopez',
}

# Drop rows without a name, then rewrite the statcast spellings for 2017.
statcast_runners_2017_df = (
    statcast_runners_2017_df
    .dropna(subset=['runner_on_first_name'])
    .assign(runner_on_first_name=lambda runners: runners['runner_on_first_name'].replace(runner_name_fixes))
)

# The row labelled 1052 gets a team-tagged name.
# NOTE(review): presumably this separates two players who share a name; the
# hard-coded label depends on the row order of the input file, so confirm it
# against the data.
statcast_runners_2017_df.loc[1052, 'runner_on_first_name'] = 'Daniel Robertson (CLE)'
# Not the last expression of the cell, so this lookup displays nothing.
statcast_runners_2017_df.loc[[1052]]

# Same clean-up for 2018.
statcast_runners_2018_df = (
    statcast_runners_2018_df
    .dropna(subset=['runner_on_first_name'])
    .assign(runner_on_first_name=lambda runners: runners['runner_on_first_name'].replace(runner_name_fixes))
)

In [28]:
# Row count before merging; each left merge below should preserve it.
print(sb_kaggle_df.shape[0])

2867


In [29]:
# add 2017 statcast data to sb_kaggle data

In [30]:
# add 2017 pitcher data
# Left-join the 2017 pitcher stats on the pitcher's name. The merge indicator is
# written straight into 'merge_1'.
merged_2017_df_1 = sb_kaggle_df.merge(
    statcast_pitchers_2017_df, on='pitcher_name', how='left', indicator='merge_1'
)

# total rows, then rows with no 2017 pitcher match
print(merged_2017_df_1.shape[0])
print((merged_2017_df_1['merge_1'] == 'left_only').sum())

2867
414


In [31]:
# add 2017 catcher data
# Left-join the 2017 catcher stats on the catcher's name. The merge indicator is
# written straight into 'merge_2'.
merged_2017_df_2 = merged_2017_df_1.merge(
    bbref_catchers_2017_df, on='catcher_name', how='left', indicator='merge_2'
)

# total rows, then rows with no 2017 catcher match
print(merged_2017_df_2.shape[0])
print((merged_2017_df_2['merge_2'] == 'left_only').sum())

2867
290


In [32]:
# add 2017 runner data
# Left-join the 2017 runner stats on the name of the runner on first. The merge
# indicator is written straight into 'merge_3'.
merged_2017_df_3 = merged_2017_df_2.merge(
    statcast_runners_2017_df, on='runner_on_first_name', how='left', indicator='merge_3'
)

# total rows, then rows with no 2017 runner match
print(merged_2017_df_3.shape[0])
print((merged_2017_df_3['merge_3'] == 'left_only').sum())

2867
326


In [33]:
# add 2018 statcast data to sb_kaggle data

In [34]:
# add 2018 pitcher data
# Left-join the 2018 pitcher stats on the pitcher's name. The merge indicator is
# written straight into 'merge_4'.
merged_2018_df_1 = merged_2017_df_3.merge(
    statcast_pitchers_2018_df, on='pitcher_name', how='left', indicator='merge_4'
)

# total rows, then rows with no 2018 pitcher match
print(merged_2018_df_1.shape[0])
print((merged_2018_df_1['merge_4'] == 'left_only').sum())

# every pitcher in stolen base data is accounted for in statcast data

2867
0


In [35]:
# add 2018 catcher data
# Left-join the 2018 catcher stats on the catcher's name. The merge indicator is
# written straight into 'merge_5'.
merged_2018_df_2 = merged_2018_df_1.merge(
    statcast_catchers_2018_df, on='catcher_name', how='left', indicator='merge_5'
)

# total rows, then rows with no 2018 catcher match
print(merged_2018_df_2.shape[0])
print((merged_2018_df_2['merge_5'] == 'left_only').sum())

2867
2


In [36]:
# add 2018 runner data
# Left-join the 2018 runner stats on the name of the runner on first. The merge
# indicator is written straight into 'merge_6'.
merged_2018_df_3 = merged_2018_df_2.merge(
    statcast_runners_2018_df, on='runner_on_first_name', how='left', indicator='merge_6'
)

# total rows, then rows with no 2018 runner match
print(merged_2018_df_3.shape[0])
print((merged_2018_df_3['merge_6'] == 'left_only').sum())

2867
6


In [37]:
# List every column of the fully merged frame.
print(list(merged_2018_df_3.columns))

['outcome', 'inning', 'home_half', 'outs', 'on_third', 'pitches', 'num_pitches', 'is_hitters_count', 'is_pitchers_count', 'pitch_num_on_event', 'strike_on_event', 'swing_on_event', 'pitchout_on_event', 'blocked_on_event', 'pickoffs_to_first', 'pitchouts', 'pitches_run_on', 'pitcher_name', 'catcher_name', 'runner_on_first_name', 'p_score', 'pitcher_throws', 'batter_bats', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir', 'break_angle', 'break_length', 'zone', 'pitch_type', 'b_score', 'orig_merge', 'p_total_pa_2017', 'p_total_stolen_base_2017', 'p_pickoff_attempt_1b_2017', 'p_pickoff_1b_2017', 'p_stolen_base_2b_2017', 'p_caught_stealing_2b_2017', 'p_pickoff_error_1b_2017', 'p_pitchout_2017', 'p_total_pitches_2017', 'out_zone_2017', 'pitch_count_offspeed_2017', 'pitch_count_fastball_2017', 'pitch_count_breaking_2017', 'in_zone_2017', 'merge_1', 'c_sb_rate_2017', 'merge_2', 'r_caught_stealing_2b_2017', 'r_pickoff_1b_2017', 'r_stolen_base_2b_2017', 'r_sprint_speed_2017', 'merge_3', 'p_to

In [39]:
# Columns removed before modelling: game-context fields, the name merge keys, most
# pitch measurements, some season totals, and all merge indicators.
columns_to_drop = [
    'home_half', 'inning', 'pitches', 'pitch_num_on_event',
    'pitcher_name', 'catcher_name', 'runner_on_first_name',
    'end_speed', 'spin_rate', 'spin_dir', 'break_angle', 'break_length', 'zone', 'pitch_type',
    'orig_merge',
    'p_total_stolen_base_2017', 'p_pickoff_error_1b_2017', 'p_pitchout_2017',
    'out_zone_2017', 'in_zone_2017',
    'merge_1', 'merge_2', 'merge_3',
    'out_zone_2018', 'in_zone_2018',
    'merge_4', 'merge_5', 'merge_6',
]
full_data = merged_2018_df_3.drop(columns=columns_to_drop)

In [40]:
# List the columns that remain after the drop.
print(list(full_data.columns))

['outcome', 'outs', 'on_third', 'num_pitches', 'is_hitters_count', 'is_pitchers_count', 'strike_on_event', 'swing_on_event', 'pitchout_on_event', 'blocked_on_event', 'pickoffs_to_first', 'pitchouts', 'pitches_run_on', 'p_score', 'pitcher_throws', 'batter_bats', 'start_speed', 'b_score', 'p_total_pa_2017', 'p_pickoff_attempt_1b_2017', 'p_pickoff_1b_2017', 'p_stolen_base_2b_2017', 'p_caught_stealing_2b_2017', 'p_total_pitches_2017', 'pitch_count_offspeed_2017', 'pitch_count_fastball_2017', 'pitch_count_breaking_2017', 'c_sb_rate_2017', 'r_caught_stealing_2b_2017', 'r_pickoff_1b_2017', 'r_stolen_base_2b_2017', 'r_sprint_speed_2017', 'p_total_pa_2018', 'p_total_stolen_base_2018', 'p_pickoff_attempt_1b_2018', 'p_pickoff_1b_2018', 'p_stolen_base_2b_2018', 'p_caught_stealing_2b_2018', 'p_pickoff_error_1b_2018', 'p_pitchout_2018', 'p_total_pitches_2018', 'pitch_count_offspeed_2018', 'pitch_count_fastball_2018', 'pitch_count_breaking_2018', 'c_pop_2b_sba_2018', 'r_caught_stealing_2b_2018', 'r_p

In [41]:
# Resolve switch hitters ('B') to a single side: 'L' when the pitcher throws
# right-handed ('R'), otherwise 'R'. Both masks are computed once from the original
# columns and applied in a vectorized way, so the result does not depend on the
# rows being labelled 0..n-1 and no Python-level row loop is needed.
is_switch_hitter = full_data['batter_bats'] == 'B'
pitcher_throws_right = full_data['pitcher_throws'] == 'R'
full_data['batter_bats'] = (
    full_data['batter_bats']
    .mask(is_switch_hitter & pitcher_throws_right, 'L')
    .mask(is_switch_hitter & ~pitcher_throws_right, 'R')
)

# Encode handedness as booleans: 'L' -> True, 'R' -> False. Any other value
# becomes missing.
bat_throw_dict = {'L': True, 'R': False}
full_data['batter_bats_left'] = full_data['batter_bats'].map(bat_throw_dict)
full_data['pitcher_throws_left'] = full_data['pitcher_throws'].map(bat_throw_dict)

# The text columns are no longer needed once the boolean versions exist.
full_data = full_data.drop(['pitcher_throws', 'batter_bats'], axis=1)

In [42]:
# Pitcher: share of 2017 steal attempts of second (stolen + caught) that succeeded.
p_attempts_2b_2017 = full_data['p_stolen_base_2b_2017'] + full_data['p_caught_stealing_2b_2017']
full_data['p_sb_rate_2017'] = full_data['p_stolen_base_2b_2017'] / p_attempts_2b_2017

# Pitcher: 2018 pickoffs at first per pickoff attempt at first.
full_data['p_pickoff_rate_2018'] = (
    full_data['p_pickoff_1b_2018'] / full_data['p_pickoff_attempt_1b_2018']
)

# Pitcher: 2018 fastballs as a share of all pitches.
full_data['p_fastball_rate_2018'] = (
    full_data['pitch_count_fastball_2018'] / full_data['p_total_pitches_2018']
)

# Runner: share of 2017 steal attempts of second (stolen + caught) that succeeded.
r_attempts_2b_2017 = full_data['r_stolen_base_2b_2017'] + full_data['r_caught_stealing_2b_2017']
full_data['r_sb_rate_2017'] = full_data['r_stolen_base_2b_2017'] / r_attempts_2b_2017

# Score margin: 'b_score' minus 'p_score', both coerced to numbers first.
b_score_numeric = pd.to_numeric(full_data['b_score'])
p_score_numeric = pd.to_numeric(full_data['p_score'])
full_data['b_score_difference'] = b_score_numeric - p_score_numeric

In [43]:
# Final column set and order for the full data set.
full_data_columns = [
    'outcome',
    'outs', 'on_third', 'num_pitches', 'is_hitters_count', 'is_pitchers_count',
    'strike_on_event', 'swing_on_event', 'pitchout_on_event', 'blocked_on_event',
    'pickoffs_to_first', 'pitchouts', 'pitches_run_on', 'b_score_difference',
    'pitcher_throws_left', 'batter_bats_left', 'start_speed',
    'p_sb_rate_2017', 'p_pickoff_rate_2018', 'p_fastball_rate_2018',
    'r_sb_rate_2017', 'r_sprint_speed_2018',
    'c_sb_rate_2017', 'c_pop_2b_sba_2018',
]
full_data = full_data.loc[:, full_data_columns]

In [44]:
# Column set for the data set without pitch-level information.
# problem: any observation without pitch data also does not include the score of the game
# when we drop the pitch data we also have to drop score data
pitchless_columns = [
    'outcome',
    'outs', 'on_third', 'num_pitches', 'is_hitters_count', 'is_pitchers_count',
    'pickoffs_to_first', 'pitchouts', 'pitches_run_on',
    'pitcher_throws_left', 'batter_bats_left',
    'p_sb_rate_2017', 'p_pickoff_rate_2018', 'p_fastball_rate_2018',
    'r_sb_rate_2017', 'r_sprint_speed_2018',
    'c_sb_rate_2017', 'c_pop_2b_sba_2018',
]
pitchless_data = full_data.loc[:, pitchless_columns]

In [46]:
# somewhat imbalanced data: number of successful attempts, then number of failed ones
for outcome_label in ('SUCCESS', 'FAILURE'):
    print((full_data['outcome'] == outcome_label).sum())

2049
818


In [47]:
# Write both data sets as CSV without the row index.
for frame, destination in ((full_data, full_output_file), (pitchless_data, pitchless_output_file)):
    frame.to_csv(destination, index=False)