# Clean Dataset for Model

# In [1]:
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# Clean data for modeling

# Columns carried into the modeling dataset, in output order.
model_columns = [
    'player_name', 'p_throws', 'pitch_type',
    'release_speed', 'release_spin_rate', 'spin_axis',
    'pfx_-x', 'pfx_z', 'bauer_units', 'effective_speed',
    'release_pos_-x', 'release_pos_z', 'release_extension', 'release_pos_y',
    'plate_-x', 'plate_z',
    'type', 'balls', 'strikes', 'pitch_count', 'stand',
    'bb_type', 'description', 'events',
    'hit_distance_sc', 'launch_speed', 'launch_angle',
    'woba_value', 'woba_denom', 'xwoba',
    'at_bat_number', 'pitch_number', 'inning', 'inning_topbot',
    'home_score', 'away_score', 'post_home_score', 'post_away_score',
    'on_1b', 'on_2b', 'on_3b', 'outs_when_up',
]

# A row is dropped when any of these measurements is missing.
required_columns = [
    'pitch_type', 'release_speed', 'release_spin_rate',
    'pfx_-x', 'pfx_z',
    'release_extension', 'release_pos_-x', 'release_pos_z',
]

# The first CSV column is the saved index.
pitch = pd.read_csv('../data/mlb-pitches2022.csv', index_col=[0])
pitch = pitch[model_columns].dropna(subset=required_columns)

# Rename some columns: shorter names, and no '-' in the x-axis columns.
col_dict = {
    # '-x' columns lose the dash
    'pfx_-x': 'pfx_x',
    'release_pos_-x': 'release_pos_x',
    'plate_-x': 'plate_x',
    # shorthand names
    'release_speed': 'velo',
    'release_spin_rate': 'spin_rate',
    'launch_speed': 'exit_velo',
}

pitch = pitch.rename(columns=col_dict)

# Persist the cleaned pitches; the index is written as the first CSV column.
pitch.to_csv('../data/model-data2022.csv')

# Run Expectancy Table

# 2010-2015 Run Expectancy
# Base states as (on_1b, on_2b, on_3b); each list in `re_by_outs` follows
# this same order.
base_states = [
    (0, 0, 0), (1, 0, 0), (0, 1, 0), (1, 1, 0),
    (0, 0, 1), (1, 0, 1), (0, 1, 1), (1, 1, 1),
]
# outs_when_up -> run expectancy for each base state above.
re_by_outs = {
    0: [0.53, 0.94, 1.17, 1.55, 1.43, 1.80, 2.04, 2.32],
    1: [0.29, 0.56, 0.72, 1.00, 1.00, 1.23, 1.42, 1.63],
    2: [0.11, 0.24, 0.33, 0.46, 0.38, 0.54, 0.60, 0.77],
}

# One row per base/out state: [on_1b, on_2b, on_3b, outs_when_up, re].
matrix = [
    [*bases, outs, value]
    for outs, values in re_by_outs.items()
    for bases, value in zip(base_states, values)
]

# NOTE: `re` is the name later cells merge against, so it is kept even
# though it would shadow the stdlib `re` module if that were imported.
re = pd.DataFrame(matrix, columns=['on_1b', 'on_2b', 'on_3b', 'outs_when_up', 're'])
re.to_csv('../data/run_expectancy_table.csv')

# Table later merged on pitch_count / is_strike / is_ball.
count_re = pd.read_csv('../data/2021-mlb-rv.csv')
print(count_re.shape)
count_re.head()

# Add RV to model-data.csv

data = pd.read_csv('../data/model-data2022.csv', index_col=[0])

# Add arm angle: one value per (pitcher, pitch type), derived from that
# group's mean release position.
data['pitch_type2'] = data['pitch_type']

# Select the two columns with a list: tuple-style selection
# (`[...]['a', 'b']`) is rejected by pandas >= 2.0.
arm_angle = data.groupby(['player_name', 'pitch_type'], as_index=False)[
    ['release_pos_x', 'release_pos_z']
].mean()

# Triangle built from the mean release point:
#   adj = 0.7 * (release_pos_z - release_pos_x), opp = |release_pos_x|,
#   hyp = sqrt(opp^2 + adj^2).
# NOTE(review): the 0.7 factor and the (z - x) term are not explained in
# this file - confirm the intended geometry.
adj = (arm_angle['release_pos_z'] - arm_angle['release_pos_x']) * 0.7
opp = arm_angle['release_pos_x'].abs()
hyp = np.sqrt((opp ** 2) + (adj ** 2))

# Law of cosines for the angle between adj and hyp. np.arccos returns
# radians; the stored value is that angle rounded to 3 decimals, times 100.
# NOTE(review): the result is radians * 100, not degrees - confirm the slot
# thresholds below were chosen for this scale.
cos_angle = ((adj ** 2 + hyp ** 2) - opp ** 2) / (2 * (adj * hyp))
arm_angle['arm_angle'] = np.arccos(cos_angle).round(3) * 100

# slot: 1 (0-40): overhead, 2 (40-60): 3/4 arm slot, 3 (>60): sidearm/sub
# The bins are contiguous so fractional angles such as 40.5 or 60.5 land in
# the neighbouring slot instead of falling through to slot 3. Missing or
# negative angles still get the default slot 3.
angle = arm_angle['arm_angle']
arm_angle['slot'] = np.select(
    [(angle >= 0) & (angle <= 40), (angle > 40) & (angle <= 60)],
    [1, 2],
    default=3,
)

# Only the derived columns are merged back onto the pitch-level rows.
arm_angle = arm_angle.drop(columns=['release_pos_x', 'release_pos_z'])
data = pd.merge(data, arm_angle, how='left', on=['player_name', 'pitch_type'])

# Drop rows whose event is one of these; rows with a missing event are kept.
# `.copy()` makes `data` an independent frame, so the column assignments
# below never write into a slice of the previous frame.
non_batter_events = ['wild_pitch', 'passed_ball', 'stolen_base_2b', 'game_advisory']
data = data[~data['events'].isin(non_batter_events)].copy()

# Events collapsed into 'field_out'. Each label appears once, as its own
# element (a missing comma previously fused 'triple_play' and
# 'caught_stealing_home' into a single bogus label).
out_events = [
    'sac_bunt', 'sac_bunt_double_play', 'sac_fly_double_play',
    'double_play', 'triple_play', 'other_out',
    'strikeout', 'strikeout_double_play',
    'caught_stealing_2b', 'caught_stealing_3b', 'caught_stealing_home',
    'pickoff_1b', 'pickoff_2b', 'pickoff_3b',
    'pickoff_caught_stealing_2b', 'pickoff_caught_stealing_3b',
]
# Assign the result back instead of `data[col].replace(..., inplace=True)`:
# the chained in-place form acts on a temporary under pandas Copy-on-Write
# and leaves `data` unchanged.
data['events'] = data['events'].replace(out_events, 'field_out')
data['events'] = data['events'].replace(['catcher_interf'], 'field_error')

# Swing-and-miss descriptions become 'whiff'; the flag is derived before
# the remaining descriptions are collapsed.
data['description'] = data['description'].replace(
    ['swinging_strike', 'swinging_strike_blocked', 'missed_bunt'], 'whiff')
data['whiff'] = (data['description'] == 'whiff').astype('int64')

data['description'] = data['description'].replace(['called_strike', 'foul'], 'strike')
data['description'] = data['description'].replace(['passed_ball', 'wild_pitch'], 'ball')

# `type` == 'B' marks a ball; every other value (including missing) counts
# as a strike, matching the original comparison.
data['is_ball'] = (data['type'] == 'B').astype('int64')
data['is_strike'] = (data['type'] != 'B').astype('int64')

# Remove rows recorded with the count '4-2'.
data = data[data['pitch_count'] != '4-2']

# Encode the half inning as 0 (Top) / 1 (Bot).
data['inning_topbot'] = data['inning_topbot'].map({'Top': 0, 'Bot': 1})

# Base columns become 0/1 flags: 1 when the stored value is > 1, else 0
# (a missing value compares False and therefore becomes 0).
for base in ('on_1b', 'on_2b', 'on_3b'):
    data[base] = (data[base] > 1).astype('int64')

# 1 when the row has an event recorded, 0 when `events` is missing.
data['final_pitch_ab'] = data['events'].notna().astype('int64')

# 1 when a 'field_out' happens with two outs already recorded.
third_out = (data['events'] == 'field_out') & (data['outs_when_up'] == 2)
data['out_to_end_inning'] = third_out.astype('int64')

# Runs on the pitch: score after minus score before, per team and combined.
data['home_runs'] = data['post_home_score'].sub(data['home_score'])
data['away_runs'] = data['post_away_score'].sub(data['away_score'])
data['runs'] = data['home_runs'].add(data['away_runs'])

# Merge RE Table with data: attach the run expectancy of each row's
# base/out state.
data = pd.merge(data, re, how='left', on=['on_1b', 'on_2b', 'on_3b', 'outs_when_up'])

inning_ending_out = data['out_to_end_inning'] == 1

# Change in run expectancy = previous row's `re` minus this row's `re`.
# NOTE(review): this treats the previous row as the state that follows the
# pitch, i.e. it assumes reverse-chronological row order - confirm.
# The first row has no previous row, so its change is filled with -0.098
# (origin of this constant is not shown here).
# Results are assigned back rather than using `data[col].fillna(...,
# inplace=True)`, which acts on a temporary under pandas Copy-on-Write.
re_change = (-data['re'].diff(1)).round(3).fillna(-0.098)

# End state is start state plus change, except an inning-ending out leaves
# an end state of 0 and a change equal to minus the starting `re`.
data['re_end_state'] = (data['re'] + re_change).mask(inning_ending_out, 0)
re_change = re_change.mask(inning_ending_out, -data['re'])

# Normalise negative zero to 0.0.
data['re_change'] = re_change.replace([-0.000], 0.000)

data['re24'] = data['re_change'] + data['runs']

# Create Linear Weights with average RE by events: mean re24 per event,
# rounded to 3 decimals.
mlb_lw = data.groupby(['events'], as_index=False)['re24'].mean()
# Derive a new frame instead of wrapping `mlb_lw` in pd.DataFrame and
# mutating it in place, so `mlb_lw` keeps the unrounded means.
lw_ball_in_play = (
    mlb_lw
    .assign(re24=mlb_lw['re24'].round(3))
    .rename(columns={'re24': 'lin_weight_above_avg'})
)

# For lin weights based on base out state
data = pd.merge(data, lw_ball_in_play, how='left', on=['events'])

# NOTE(review): 0.271 and 1.209 are hard-coded; presumably the out value
# and the wOBA scale for the season - confirm their source.
data['lin_weight_rel_outs'] = data['lin_weight_above_avg'] + 0.271
data['woba_scale'] = 1.209

# Scaled weights, rounded to 3 decimals; rows without a weight get 0.
# The result is assigned back rather than calling `fillna(inplace=True)`
# on a column selection, which acts on a temporary under Copy-on-Write.
data['lin_weights_above_avg_scale'] = (
    (data['lin_weight_above_avg'] * data['woba_scale']).round(3).fillna(0)
)
data['lin_weights_rel_outs_scale'] = (
    (data['lin_weight_rel_outs'] * data['woba_scale']).round(3).fillna(0)
)

# Merge ball strike count RE with data, keyed on the count and the
# ball/strike flags.
data = data.merge(count_re, how='left', on=['pitch_count', 'is_strike', 'is_ball'])

# Run value: re24, except rows where re24 is exactly 0 take `wraa_change`
# instead (presumably supplied by count_re - confirm). Missing re24 stays
# missing.
data['rv'] = data['re24'].mask(data['re24'] == 0, data['wraa_change'])

# Original RV
#data['rv'] = data['wraa_change'] + data['re24']

#data['wraa_scale'] = data['wraa_change'] * data['woba_scale']
#data['wraa_scale'] = data['wraa_change'] / data['woba_scale']
#data['rv'] = data['wraa_scale'] + data['re24'] / data['woba_scale']

# Test RV
#data['rv2'] = data['wraa_scale'] + data['re24'] * data['woba_scale']
#data['rv4'] = data['wraa_scale'] + data['re24'] * data['lin_weights_above_outs_scale']

# Save the 2022 pitches with run values (written before `year` is added).
data.to_csv('../data/model-pitches-rv2022.csv')

# Use the fully qualified option name: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on pandas >= 1.4 and raises
# OptionError.
pd.set_option('display.max_columns', None)
print(data.shape)
data.head(10)

# Load the earlier file and tag each frame with its season before stacking.
# NOTE(review): assumes model-pitches-rv.csv holds the 2021 data - confirm.
data2 = pd.read_csv('../data/model-pitches-rv.csv', index_col=[0])
data2['year'] = '2021'

data['year'] = '2022'
frames = [data, data2]
# Row indexes are kept as-is, so index labels may repeat across seasons.
df = pd.concat(frames)
df.to_csv('../data/model-whiff.csv')

# Output:
# (24, 5)
# (682572, 65)
