In [1]:
# Build the modeling dataset: load the raw pitch file, keep only the columns
# listed below (in this order), shorten a few column names, and save the result.

pitch = pd.read_csv('../data/mlb-pitches.csv', index_col = [0])

# .loc with an explicit label list raises KeyError if any column is missing,
# so a schema change in the raw file fails loudly here.
pitch = pitch.loc[:, [
    'player_name', 'p_throws', 'pitch_type',
    'release_speed', 'release_spin_rate', 'spin_axis',
    'pfx_-x', 'pfx_z', 'bauer_units', 'effective_speed',
    'release_pos_x', 'release_pos_z', 'release_extension', 'release_pos_y',
    'plate_-x', 'plate_x', 'plate_z',
    'type', 'balls', 'strikes', 'pitch_count', 'stand',
    'description', 'events',
    'hit_distance_sc', 'launch_speed', 'launch_angle', 'launch_speed_angle',
    'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',
    'woba_value', 'woba_denom', 'babip_value', 'iso_value',
    'at_bat_number', 'pitch_number', 'inning', 'inning_topbot',
    'home_score', 'away_score', 'post_home_score', 'post_away_score',
    'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'delta_run_exp',
]].copy()

# Mapping of original column name -> shorter name used from here on.
col_dict = dict(
    release_speed = 'velo',
    release_spin_rate = 'spin_rate',
    launch_speed = 'exit_velo',
    estimated_ba_using_speedangle = 'xba',
    estimated_woba_using_speedangle = 'xwobacon',
)
pitch = pitch.rename(columns = col_dict)

# Persist the trimmed, renamed frame (index included) for later cells to read.
pitch.to_csv('../data/model-pitches.csv')

In [2]:
# Add run-value (RV) columns to model-pitches.csv and save the result as
# model-pitches-rv.csv.

data = pd.read_csv('../data/model-pitches.csv', index_col = [0])

# Drop pitches that are missing any of the fields listed here.
data = data.dropna(subset = ['pitch_type', 'velo', 'spin_rate', 'pfx_-x',
                             'release_extension', 'delta_run_exp'])

# Encode half-inning as 0/1; any value other than 'Top'/'Bot' becomes NaN.
data['inning_topbot'] = data['inning_topbot'].map({'Top': 0, 'Bot': 1})

# Convert the base columns to 1/0 flags. Values greater than 1 become 1 and
# everything else (including NaN, which compares False) becomes 0.
# presumably these columns hold runner IDs and are NaN when the base is
# empty -- TODO confirm against the raw data.
data['on_1b'] = (data['on_1b'] > 1).astype(int)
data['on_2b'] = (data['on_2b'] > 1).astype(int)
data['on_3b'] = (data['on_3b'] > 1).astype(int)

# Runs scored on the pitch, from the pre/post score columns.
data['home_runs'] = data['post_home_score'] - data['home_score']
data['away_runs'] = data['post_away_score'] - data['away_score']
data['runs'] = data['home_runs'] + data['away_runs']

# Attach the run-expectancy value ('re') for each base/out state.
# NOTE(review): the name `re` shadows the standard-library `re` module for the
# rest of the notebook; kept as-is in case later cells reference it.
re = pd.read_csv('../data/run_expectancy_table.csv', index_col = [0])
data = pd.merge(data, re, how = 'left', on = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up'])

# re24 algebraically reduces to delta_run_exp + runs.
# NOTE(review): if delta_run_exp already accounts for runs scored on the play,
# adding `runs` here double-counts them -- confirm the intended definition.
data['re_end_state'] = data['delta_run_exp'] + data['re']
data['re24'] = data['re_end_state'] - data['re'] + data['runs']

# Attach the per-count wOBA value, then take the negated difference between
# each row and the row before it.
# NOTE(review): diff(1) is purely row-adjacent -- it is not grouped by at-bat
# or game, so values at those boundaries mix unrelated pitches. Confirm the
# row order makes this the intended "next count minus current count".
count_woba = pd.read_csv('../data/count_woba_value.csv', index_col = [0])
data = pd.merge(data, count_woba, how = 'left', on = ['pitch_count'])
data['count_woba_diff'] = -data['count_woba_value'].diff(1)
data['count_woba_after'] = data['count_woba_value'] + data['count_woba_diff']

# Assign the filled Series back instead of calling fillna(inplace=True) on a
# column selection: the chained inplace form warns in pandas 2.x and has no
# effect on `data` under Copy-on-Write, which would leave NaNs in the output.
data['count_woba_diff'] = data['count_woba_diff'].fillna(0)
data['count_woba_after'] = data['count_woba_after'].fillna(0)

# 1.15 is presumably the wOBA-to-runs scale factor -- TODO confirm and source.
data['rv_count'] = round((data['count_woba_diff']) / 1.15, 3)

data.to_csv('../data/model-pitches-rv.csv')