# Notebook Contents

- [Imports](#Imports)
- [Data](#Data)
- [Data Cleaning](#Data-Cleaning)
- [Preprocessing](#Preprocessing)
    - [Multicollinearity - VIF](#Multicolinearity---VIF)
- [Features](#Features)
- [Random Forest Modeling](#Random-Forest-Modeling)
    - [4-Seam](#Linear-Regression---4-Seam)
    - [Cutter](#Linear-Regression---Cutter)
    - [Sinker](#Linear-Regression---Sinker)
    - [Slider](#Linear-Regression---Slider)
    - [Curveball](#Linear-Regression---Curveball)
    - [Changeup](#Linear-Regression---Changeup)

# Imports

In [1]:
import warnings

import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings('ignore')

# Data

In [2]:
# Load the pitch-level data, drop the 'Unnamed: 0' column, and discard rows
# that are missing pitch type, velocity or spin rate.
data = pd.read_csv('../data/model-pitches.csv')
data.drop(columns = ['Unnamed: 0'], inplace = True)
data.dropna(subset = ['pitch_type', 'velo', 'spin_rate'], inplace = True)

# Encode half-inning as 0 (Top) / 1 (Bot).
data['inning_topbot'] = data.inning_topbot.map({'Top': 0, 'Bot': 1})

# Collapse each base column to a 0/1 flag: values greater than 1 become 1,
# everything else (including NaN, which compares False) becomes 0.
for base_col in ['on_1b', 'on_2b', 'on_3b']:
    data[base_col] = (data[base_col] > 1).astype(int)

# Lookup table with one row per (on_1b, on_2b, on_3b, outs_when_up) state;
# the fifth value of each row is the 're24' figure attached to that state.
matrix = [[0, 0, 0, 0, 0.481], [1, 0, 0, 0, 0.859], [0, 1, 0, 0, 1.100], [1, 1, 0, 0, 1.437],
          [0, 0, 1, 0, 1.350], [1, 0, 1, 0, 1.784], [0, 1, 1, 0, 1.964], [1, 1, 1, 0, 2.292],
          [0, 0, 0, 1, 0.254], [1, 0, 0, 1, 0.509], [0, 1, 0, 1, 0.664], [1, 1, 0, 1, 0.884],
          [0, 0, 1, 1, 0.950], [1, 0, 1, 1, 1.130], [0, 1, 1, 1, 1.376], [1, 1, 1, 1, 1.541],
          [0, 0, 0, 2, 0.098], [1, 0, 0, 2, 0.224], [0, 1, 0, 2, 0.319], [1, 1, 0, 2, 0.429],
          [0, 0, 1, 2, 0.353], [1, 0, 1, 2, 0.478], [0, 1, 1, 2, 0.580], [1, 1, 1, 2, 0.752]]

re24 = pd.DataFrame(matrix, columns = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up', 're24'])
# Left merge keeps every pitch; a base/out state absent from the table gets NaN for 're24'.
data = pd.merge(data, re24, how = 'left', on = ['on_1b', 'on_2b', 'on_3b', 'outs_when_up'])
# Placeholders left by the author; these columns are not computed yet.
#data['re_after'] = 
#data['re_diff']

# Per-row score change for each side, and the combined total.
data['home_runs'] = data['post_home_score'] - data['home_score']
data['away_runs'] = data['post_away_score'] - data['away_score']
data['runs'] = data['home_runs'] + data['away_runs']

# Use the fully-qualified option name: the bare 'max_columns' pattern matches
# more than one option on recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', None)
print(data.shape)
data.head()

(705651, 50)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,balls,strikes,delta_run_exp,stand,events,description,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,at_bat_number,pitch_number,inning,inning_topbot,home_score,away_score,post_home_score,post_away_score,on_1b,on_2b,on_3b,outs_when_up,re24,home_runs,away_runs,runs
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,1,2,-0.073,R,field_out,hit_into_play,13.0,95.2,-13.0,2.0,0.174,0.158,0.0,1.0,0.0,0.0,61,4,9,0,5,0,5,0,0,0,0,2,0.098,0,0,0
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,1,1,-0.027,R,,foul,108.0,75.3,75.0,,,,,,,,61,3,9,0,5,0,5,0,0,0,0,2,0.098,0,0,0
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,1,0,-0.02,R,,foul,157.0,83.5,65.0,,,,,,,,61,2,9,0,5,0,5,0,0,0,0,2,0.098,0,0,0
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0,0,0.016,R,,ball,,,,,,,,,,,61,1,9,0,5,0,5,0,0,0,0,2,0.098,0,0,0
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,1,0,-0.189,L,field_out,hit_into_play,9.0,93.3,-18.0,2.0,0.1,0.09,0.0,1.0,0.0,0.0,60,2,9,0,5,0,5,0,0,0,0,1,0.254,0,0,0


In [3]:
# Show the last ten rows of the raw score columns next to the derived
# per-row run columns as a quick sanity check.
score_columns = [
    'home_score', 'away_score',
    'post_home_score', 'post_away_score',
    'home_runs', 'away_runs',
]
runs = data[score_columns]
runs.tail(10)

Unnamed: 0,home_score,away_score,post_home_score,post_away_score,home_runs,away_runs
705641,0,0,0,0,0,0
705642,0,0,0,0,0,0
705643,0,0,0,0,0,0
705644,0,0,0,0,0,0
705645,0,0,0,0,0,0
705646,0,0,0,0,0,0
705647,0,0,0,0,0,0
705648,0,0,0,0,0,0
705649,0,0,0,0,0,0
705650,0,0,0,0,0,0


In [4]:
# Display the lookup table indexed (and therefore sorted) by its 're24' value.
re24.groupby('re24').mean()

Unnamed: 0_level_0,on_1b,on_2b,on_3b,outs_when_up
re24,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.098,0.0,0.0,0.0,2.0
0.224,1.0,0.0,0.0,2.0
0.254,0.0,0.0,0.0,1.0
0.319,0.0,1.0,0.0,2.0
0.353,0.0,0.0,1.0,2.0
...,...,...,...,...
1.437,1.0,1.0,0.0,0.0
1.541,1.0,1.0,1.0,1.0
1.784,1.0,0.0,1.0,0.0
1.964,0.0,1.0,1.0,0.0


# Data Cleaning

HR, 3B, 2B, 1B, reach on error, HBP, BB, IBB, out, K, passed ball, WP, balk, SB, CS, pickoff, pickoff error, OtherAdvance, interference, foulE?, DefensiveIndiff

In [5]:
# Frequency of each value in the 'events' column (missing values are excluded).
data['events'].value_counts()

field_out                     81386
strikeout                     41820
single                        24854
walk                          14986
double                         7816
home_run                       5894
field_error                    1230
sac_bunt                        761
triple                          667
double_play                     381
caught_stealing_2b              146
strikeout_double_play           108
catcher_interf                   61
other_out                        41
sac_fly_double_play              17
wild_pitch                        9
caught_stealing_3b                8
pickoff_2b                        6
pickoff_1b                        6
triple_play                       5
pickoff_caught_stealing_2b        4
caught_stealing_home              4
game_advisory                     3
pickoff_3b                        2
sac_bunt_double_play              2
stolen_base_2b                    1
passed_ball                       1
pickoff_caught_stealing_3b  

In [6]:
# Frequency of each value in the 'description' column (missing values are excluded).
data['description'].value_counts()

ball                       252646
foul                       132062
hit_into_play              120920
called_strike              115952
swinging_strike             76894
swinging_strike_blocked      4689
hit_by_pitch                 2094
missed_bunt                   361
pitchout                       33
Name: description, dtype: int64

# Preprocessing

### Multicolinearity - VIF

Velocity, Spin Rate, HB, VB, Release Extension, Horizontal Release Position, Vertical Release Position, Horizontal Plate Coords, Vertical Plate Coords

In [7]:
#features = data[['velo', 'spin_rate', 'pfx_-x', 'pfx_z', 'release_extension', 
#                 'release_pos_x', 'release_pos_z', 'plate_x', 'plate_z',
#                 'pitch_type', 'p_throws']]
#features_vif = features.select_dtypes([np.number])
#vif_data = pd.DataFrame()
#vif_data["feature"] = features_vif.columns
#
#vif_data["VIF"] = [variance_inflation_factor(features_vif.values, i)
#                   for i in range(len(features_vif.columns))]
#
#vif_data.sort_values(by = 'VIF').head(10)

# Features

**Independent Variables:** Velocity, Spin Rate, Vertical Break (VB), Horizontal Break (HB), Release Extension, Horizontal Release Position, Vertical Release Position, Horizontal Plate Coords, Vertical Plate Coords

**Dependent Variable:** xRE

Pitch Types:

Fastball: 4-Seam, Cutter, Sinker (FF, FC, SI)

Breaking Ball: Slider, Curveball, Knuckle Curve (SL, CU, KC)

Offspeed: Changeup, Splitter (CH, FS)

In [8]:
# Build the subsets used for modeling and print the shape of each one.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# grouped frames are assembled with pd.concat instead. The concatenation order
# matches the order of the original appends, so row order is unchanged.

# --- Fastballs: 4-seam (FF), cutter (FC), sinker (SI) ---
ff = data.loc[data['pitch_type'] == 'FF']
fc = data.loc[data['pitch_type'] == 'FC']
si = data.loc[data['pitch_type'] == 'SI']
fastball = pd.concat([ff, fc, si])
print('Fastball shape:', fastball.shape)

# --- Breaking balls: slider (SL), curveball (CU), knuckle curve (KC) ---
sl = data.loc[data['pitch_type'] == 'SL']
cu = data.loc[data['pitch_type'] == 'CU']
kc = data.loc[data['pitch_type'] == 'KC']
breaking_ball = pd.concat([sl, cu, kc])
print('Breaking Ball:', breaking_ball.shape)

# --- Offspeed: changeup (CH), splitter (FS) ---
ch = data.loc[data['pitch_type'] == 'CH']
fs = data.loc[data['pitch_type'] == 'FS']
offspeed = pd.concat([ch, fs])
print('Off speed shape:', offspeed.shape)

# --- Splits by pitcher handedness ---
rhp = data.loc[data['p_throws'] == 'R']
print('RHP shape:', rhp.shape)
lhp = data.loc[data['p_throws'] == 'L']
print('LHP shape:', lhp.shape)

# --- Splits by pitcher handedness and batter stance ---
rhp_rhh = data.loc[(data['p_throws'] == 'R') & (data['stand'] == 'R')]
print('RHP & RHH shape:', rhp_rhh.shape)
rhp_lhh = data.loc[(data['p_throws'] == 'R') & (data['stand'] == 'L')]
print('RHP & LHH shape:', rhp_lhh.shape)
lhp_rhh = data.loc[(data['p_throws'] == 'L') & (data['stand'] == 'R')]
print('LHP & RHH shape:', lhp_rhh.shape)
lhp_lhh = data.loc[(data['p_throws'] == 'L') & (data['stand'] == 'L')]
print('LHP & LHH shape:', lhp_lhh.shape)

# --- Pitch group by pitcher handedness ---
rhp_fastball = fastball.loc[fastball['p_throws'] == 'R']
print('RHP Fastball shape:', rhp_fastball.shape)
lhp_fastball = fastball.loc[fastball['p_throws'] == 'L']
print('LHP Fastball shape:', lhp_fastball.shape)
rhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'R']
print('RHP Breaking Ball shape:', rhp_breaking_ball.shape)
lhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'L']
print('LHP Breaking Ball shape:', lhp_breaking_ball.shape)
rhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'R']
print('RHP Offspeed shape:', rhp_offspeed.shape)
lhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'L']
print('LHP Offspeed shape:', lhp_offspeed.shape)

# --- Splits by number of outs ---
zero_outs = data.loc[data['outs_when_up'] == 0]
print('0 outs:', zero_outs.shape)
one_out = data.loc[data['outs_when_up'] == 1]
print('1 out:', one_out.shape)
two_outs = data.loc[data['outs_when_up'] == 2]
print('2 outs:', two_outs.shape)

Fastball shape: (406393, 50)
Breaking Ball: (208057, 50)
Off speed shape: (91201, 50)
RHP shape: (496636, 50)
LHP shape: (209015, 50)
RHP & RHH shape: (267618, 50)
RHP & LHH shape: (229018, 50)
LHP & RHH shape: (149895, 50)
LHP & LHH shape: (59120, 50)
RHP Fastball shape: (283300, 50)
LHP Fastball shape: (123093, 50)
RHP Breaking Ball shape: (152423, 50)
LHP Breaking Ball shape: (55634, 50)
RHP Offspeed shape: (60913, 50)
LHP Offspeed shape: (30288, 50)
0 outs: (244527, 50)
1 out: (232655, 50)
2 outs: (228469, 50)


# Random Forest Modeling