In [1]:
# Clean data for modeling
#
# Reads the raw pitch-level CSV, keeps only the columns listed below, renames
# a handful of them to shorter names, and writes the result to
# ../data/model-pitches.csv for the later cells to reload.

# `pd` is used by every cell; import it here so the notebook runs from a
# fresh kernel (Restart & Run All).
import pandas as pd

# Columns retained from the raw file, in the order they are written out.
keep_cols = ['player_name', 'p_throws', 'pitch_type', 'release_speed', 'release_spin_rate', 'spin_axis',
             'pfx_-x', 'pfx_z', 'bauer_units', 'effective_speed', 'release_pos_x', 'release_pos_z',
             'release_extension', 'release_pos_y', 'plate_-x', 'plate_x', 'plate_z', 'type', 'balls', 'strikes',
             'pitch_count', 'stand', 'description', 'events', 'hit_distance_sc', 'launch_speed', 'launch_angle',
             'launch_speed_angle', 'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
             'woba_value', 'woba_denom', 'babip_value', 'iso_value', 'at_bat_number', 'pitch_number',
             'inning', 'inning_topbot', 'home_score', 'away_score', 'post_home_score', 'post_away_score',
             'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'delta_run_exp']

# Rename some columns
col_dict = {
    'release_speed': 'velo',
    'release_spin_rate': 'spin_rate',
    'launch_speed': 'exit_velo',
    'estimated_ba_using_speedangle': 'xba',
    'estimated_woba_using_speedangle': 'xwobacon'
}

# Select + rename in one expression; both steps return new frames, so no
# in-place mutation (and no explicit .copy()) is needed.
pitch = (
    pd.read_csv('../data/mlb-pitches.csv', index_col = [0])
    [keep_cols]
    .rename(columns = col_dict)
)
pitch.to_csv('../data/model-pitches.csv')

In [2]:
# Run Expectancy Table
#
# Builds a 24-row lookup (8 base states x 3 out counts) with columns
# on_1b, on_2b, on_3b, outs_when_up, re and saves it as a CSV.

# Base-occupancy tuples (on_1b, on_2b, on_3b) in the order the values below are listed.
base_states = [(0, 0, 0), (1, 0, 0), (0, 1, 0), (1, 1, 0),
               (0, 0, 1), (1, 0, 1), (0, 1, 1), (1, 1, 1)]

# 2010-2015 Run Expectancy (kept for reference, not used)
# re_2010_2015 = {
#     0: [0.481, 0.859, 1.100, 1.437, 1.350, 1.784, 1.964, 2.292],
#     1: [0.254, 0.509, 0.664, 0.884, 0.950, 1.130, 1.376, 1.541],
#     2: [0.098, 0.224, 0.319, 0.429, 0.353, 0.478, 0.580, 0.752],
# }

# 2019 Run Expectancy, keyed by outs_when_up; one value per entry of base_states.
re_2019 = {
    0: [0.544, 0.935, 1.147, 1.537, 1.369, 1.759, 1.971, 2.362],
    1: [0.298, 0.564, 0.713, 0.979, 0.953, 1.219, 1.368, 1.634],
    2: [0.115, 0.242, 0.339, 0.467, 0.391, 0.518, 0.615, 0.743],
}

# Flatten to rows of [on_1b, on_2b, on_3b, outs_when_up, re].
matrix = [[*bases, outs, run_exp]
          for outs, run_exps in re_2019.items()
          for bases, run_exp in zip(base_states, run_exps)]

re = pd.DataFrame(matrix, columns = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up', 're'])
re.to_csv('../data/run_expectancy_table.csv')

In [3]:
# wOBA by Count
#
# Lookup table with one row per ball-strike count label ('balls-strikes')
# and its wOBA value, saved as a CSV.

count_woba = {
    '0-0': 0.310, '0-1': 0.262, '0-2': 0.196,
    '1-0': 0.355, '1-1': 0.293, '1-2': 0.223,
    '2-0': 0.436, '2-1': 0.352, '2-2': 0.273,
    '3-0': 0.622, '3-1': 0.470, '3-2': 0.384,
}

# One (count, value) pair per row, in the insertion order above.
woba_value = pd.DataFrame(list(count_woba.items()),
                          columns = ['pitch_count', 'count_woba_value'])
woba_value.to_csv('../data/count_woba_value.csv')

In [4]:
# Add RV to model-pitches.csv
#
# Reloads the cleaned pitch table, normalises the event/description labels,
# encodes game state as integers, joins the run-expectancy and count-wOBA
# lookups, and derives re24 and xRV columns.
#
# Depends on two frames built in earlier cells and still in memory:
#   re          - run-expectancy lookup (on_1b, on_2b, on_3b, outs_when_up, re)
#   woba_value  - count lookup (pitch_count, count_woba_value)

data = pd.read_csv('../data/model-pitches.csv', index_col = [0])
data = data.dropna(subset = ['pitch_type', 'velo', 'spin_rate', 'pfx_-x',
                             'release_extension', 'delta_run_exp'])

# Event labels folded into 'field_out'. Every entry needs its own comma:
# adjacent string literals are silently concatenated into one bogus label.
out_events = ['sac_bunt', 'double_play', 'caught_stealing_2b', 'strikeout_double_play',
              'other_out', 'sac_fly_double_play', 'pickoff_2b', 'pickoff_3b', 'triple_play',
              'caught_stealing_home', 'pickoff_caught_stealing_2b',
              'sac_bunt_double_play', 'pickoff_caught_stealing_3b', 'pickoff_1b',
              'caught_stealing_3b']

# Assign the result back instead of calling replace(..., inplace=True) on
# data[col]: the chained in-place form does not update `data` under pandas
# Copy-on-Write.
data['events'] = data['events'].replace(out_events, 'field_out')
data['events'] = data['events'].replace(['catcher_interf'], 'field_error')

# Description labels folded into 'strike'; hit_by_pitch is folded into 'ball'.
strike_descriptions = ['called_strike', 'swinging_strike', 'swinging_strike_blocked',
                       'missed_bunt', 'foul']
data['description'] = data['description'].replace(strike_descriptions, 'strike')
data['description'] = data['description'].replace(['hit_by_pitch'], 'ball')

data['inning_topbot'] = data['inning_topbot'].map({'Top': 0, 'Bot': 1})

# Base-occupancy flags: 1 for any positive value, 0 otherwise (NaN compares
# False). `> 0` rather than `> 1` so a column that is already a 0/1 flag is
# left intact instead of being zeroed.
# Presumably the raw columns hold runner ids with NaN for an empty base - confirm.
for base_col in ['on_1b', 'on_2b', 'on_3b']:
    data[base_col] = (data[base_col] > 0).astype(int)

# Score change across the pitch for each side ("runs" here are runs scored,
# not home runs), and the total for both sides.
data['home_runs'] = data['post_home_score'] - data['home_score']
data['away_runs'] = data['post_away_score'] - data['away_score']
data['runs'] = data['home_runs'] + data['away_runs']

# Attach the run expectancy of the base/out state before the pitch.
# To reload it from disk instead of using the in-memory frame:
#re = pd.read_csv('../data/run_expectancy_table.csv', index_col = [0])
data = pd.merge(data, re, how = 'left', on = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up'])

data['re_end_state'] = data['delta_run_exp'] + data['re']
# Algebraically this is delta_run_exp + runs.
# NOTE(review): if delta_run_exp already accounts for runs scored on the
# play, adding `runs` double-counts them - confirm against the data source.
data['re24'] = data['re_end_state'] - data['re'] + data['runs']

# Attach the wOBA value of the count the pitch was thrown in.
# To reload it from disk instead of using the in-memory frame:
#count_woba = pd.read_csv('../data/count_woba_value.csv', index_col = [0])
data = pd.merge(data, woba_value, how = 'left', on = ['pitch_count'])

# -diff(1) is (previous row's value - this row's value), so value + diff is
# the previous row's count value. This relies on row order: in the displayed
# head, pitch_number descends within an at-bat, so the previous row is
# presumably the following pitch - confirm the file is sorted that way.
data['count_woba_diff'] = (-data['count_woba_value'].diff(1)).round(3)
data['count_woba_after'] = data['count_woba_value'] + data['count_woba_diff']
data['count_woba_diff'] = data['count_woba_diff'].fillna(0)
data['count_woba_after'] = data['count_woba_after'].fillna(0)
data['events'] = data['events'].fillna('na')

# Rows that carry an event get an after-value of 0; rows without one
# ('na') keep the value computed above.
data['count_woba_after'] = data['count_woba_after'].where(data['events'] == 'na', 0)
data['count_woba_diff'] = data['count_woba_after'] - data['count_woba_value']
# 1.15 is presumably the wOBA scale used to convert to runs - confirm.
data['xRV'] = (data['count_woba_diff'] / 1.15).round(3)

# Use the fully-qualified option name: the bare 'max_columns' pattern matches
# more than one option on newer pandas and raises OptionError.
pd.set_option('display.max_columns', None)
print(data.shape)
data.head()

(705403, 57)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,pitch_count,stand,description,events,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up,delta_run_exp,home_runs,away_runs,runs,re,re_end_state,re24,count_woba_value,count_woba_diff,count_woba_after,xRV
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,1,2,1-2,R,hit_into_play,field_out,13.0,95.2,-13.0,2.0,0.174,0.158,0.0,1.0,0.0,0.0,61,4,9,0,5,0,5,0,0,0,0,2,-0.073,0,0,0,0.115,0.042,-0.073,0.223,-0.223,0.0,-0.194
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,1,1,1-1,R,strike,na,108.0,75.3,75.0,,,,,,,,61,3,9,0,5,0,5,0,0,0,0,2,-0.027,0,0,0,0.115,0.088,-0.027,0.293,-0.07,0.223,-0.061
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,1,0,1-0,R,strike,na,157.0,83.5,65.0,,,,,,,,61,2,9,0,5,0,5,0,0,0,0,2,-0.02,0,0,0,0.115,0.095,-0.02,0.355,-0.062,0.293,-0.054
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0,0,0-0,R,ball,na,,,,,,,,,,,61,1,9,0,5,0,5,0,0,0,0,2,0.016,0,0,0,0.115,0.131,0.016,0.31,0.045,0.355,0.039
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,1,0,1-0,L,hit_into_play,field_out,9.0,93.3,-18.0,2.0,0.1,0.09,0.0,1.0,0.0,0.0,60,2,9,0,5,0,5,0,0,0,0,1,-0.189,0,0,0,0.298,0.109,-0.189,0.355,-0.355,0.0,-0.309


In [5]:
# Optional export of `data` to CSV; left commented out, so nothing is written.
#data.to_csv('../data/model-pitches-rv.csv')