# Notebook Contents

- [Imports](#Imports)
- [Data](#Data)
- [Data Cleaning](#Data-Cleaning)
- [Preprocessing](#Preprocessing)
    - [Multicollinearity - VIF](#Multicolinearity---VIF)
- [Features](#Features)
- [Random Forest Modeling](#Random-Forest-Modeling)
    - [4-Seam](#Linear-Regression---4-Seam)
    - [Cutter](#Linear-Regression---Cutter)
    - [Sinker](#Linear-Regression---Sinker)
    - [Slider](#Linear-Regression---Slider)
    - [Curveball](#Linear-Regression---Curveball)
    - [Changeup](#Linear-Regression---Changeup)

# Imports

In [1]:
import warnings

import pandas as pd
import statsmodels.api as sm
from sklearn import metrics
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

warnings.filterwarnings('ignore')

# Data

In [2]:
# Load the pitch-level dataset (path is relative to the notebook's working directory)
# and discard the leftover 'Unnamed: 0' column that the CSV carries.
# Chaining .drop() instead of using inplace=True keeps this cell safely re-runnable.
data = pd.read_csv('../data/model-pitches.csv').drop(columns = ['Unnamed: 0'])
#data.dropna(inplace = True)

# Use the fully-qualified option name: the abbreviated 'max_columns' pattern is
# ambiguous in pandas >= 1.4 (it also matches 'styler.render.max_columns') and
# raises OptionError there.
pd.set_option('display.max_columns', None)
print(data.shape)
data.head()

(708653, 38)


Unnamed: 0,player_name,p_throws,pitch_type,velo,spin_rate,spin_axis,pfx_-x,pfx_z,bauer_units,effective_speed,release_pos_x,release_pos_z,release_extension,release_pos_y,plate_-x,plate_x,plate_z,type,delta_run_exp,stand,events,description,hit_distance_sc,exit_velo,launch_angle,launch_speed_angle,xba,xwobacon,woba_value,woba_denom,babip_value,iso_value,inning,inning_topbot,outs_when_up,on_1b,on_2b,on_3b
0,"Smith, Will",L,FF,92.3,2330.0,148.0,-8.28,16.56,25.24377,92.8,1.4,6.8,6.5,54.03,0.69,-0.69,2.83,X,-0.073,R,out,hit_into_play,13.0,95.2,-13.0,2.0,0.174,0.158,0.0,1.0,0.0,0.0,9,Top,2,,,
1,"Smith, Will",L,SL,80.6,2254.0,315.0,9.24,5.76,27.965261,81.2,1.6,6.64,6.4,54.15,0.71,-0.71,2.62,S,-0.027,R,,foul,108.0,75.3,75.0,,,,,,,,9,Top,2,,,
2,"Smith, Will",L,CU,75.5,1940.0,328.0,7.8,-6.12,25.695364,75.2,1.46,6.88,6.2,54.34,0.04,-0.04,2.46,S,-0.02,R,,foul,157.0,83.5,65.0,,,,,,,,9,Top,2,,,
3,"Smith, Will",L,CU,75.0,2017.0,330.0,8.28,-8.28,26.893333,74.5,1.53,6.83,5.9,54.61,-2.1,2.1,3.89,B,0.016,R,,ball,,,,,,,,,,,9,Top,2,,,
4,"Smith, Will",L,FF,91.2,2281.0,143.0,-7.56,15.36,25.010965,90.9,1.49,6.66,6.3,54.15,0.31,-0.31,2.8,X,-0.189,L,out,hit_into_play,9.0,93.3,-18.0,2.0,0.1,0.09,0.0,1.0,0.0,0.0,9,Top,1,,,


In [3]:
# Count of distinct values in the player_name column.
data['player_name'].nunique()

891

# Data Cleaning

Pitch Types:

Fastball: 4-Seam, Cutter, Sinker (FF, FC, SI)

Breaking Ball: Slider, Curveball, Knuckle Curve (SL, CU, KC)

Offspeed: Changeup, Splitter (CH, FS)

In [4]:
# Split the pitch-level frame into subsets by pitch family, pitcher handedness,
# pitcher/batter matchup, and out count, printing each subset's shape.
# DataFrame.append was removed in pandas 2.0, so subsets are combined with
# pd.concat, which produces the same rows in the same order.

# --- Fastballs: 4-seam (FF), cutter (FC), sinker (SI) ---
ff = data.loc[data['pitch_type'] == 'FF']
fc = data.loc[data['pitch_type'] == 'FC']
# NOTE(review): as in the original cell, `ff` is rebound to FF + FC rows here, so it
# no longer holds 4-seam pitches only. Kept for backward compatibility; confirm that
# later cells do not rely on `ff` being 4-seam only.
ff = pd.concat([ff, fc])
si = data.loc[data['pitch_type'] == 'SI']
fastball = pd.concat([ff, si])
print('Fastball shape:', fastball.shape)

# --- Breaking balls: slider (SL), curveball (CU), knuckle curve (KC) ---
sl = data.loc[data['pitch_type'] == 'SL']
cu = data.loc[data['pitch_type'] == 'CU']
# NOTE(review): likewise, `sl` is rebound to SL + CU rows, matching the original cell.
sl = pd.concat([sl, cu])
kc = data.loc[data['pitch_type'] == 'KC']
breaking_ball = pd.concat([sl, kc])
print('Breaking Ball:', breaking_ball.shape)

# --- Offspeed: changeup (CH), splitter (FS) ---
ch = data.loc[data['pitch_type'] == 'CH']
fs = data.loc[data['pitch_type'] == 'FS']
offspeed = pd.concat([ch, fs])
print('Off speed shape:', offspeed.shape)

# --- Pitcher handedness ---
rhp = data.loc[data['p_throws'] == 'R']
print('RHP shape:', rhp.shape)
lhp = data.loc[data['p_throws'] == 'L']
print('LHP shape:', lhp.shape)

# --- Pitcher handedness x batter stance ---
rhp_rhh = data.loc[(data['p_throws'] == 'R') & (data['stand'] == 'R')]
print('RHP & RHH shape:', rhp_rhh.shape)
rhp_lhh = data.loc[(data['p_throws'] == 'R') & (data['stand'] == 'L')]
print('RHP & LHH shape:', rhp_lhh.shape)
lhp_rhh = data.loc[(data['p_throws'] == 'L') & (data['stand'] == 'R')]
print('LHP & RHH shape:', lhp_rhh.shape)
lhp_lhh = data.loc[(data['p_throws'] == 'L') & (data['stand'] == 'L')]
print('LHP & LHH shape:', lhp_lhh.shape)

# --- Pitch family x pitcher handedness ---
rhp_fastball = fastball.loc[fastball['p_throws'] == 'R']
print('RHP Fastball shape:', rhp_fastball.shape)
lhp_fastball = fastball.loc[fastball['p_throws'] == 'L']
print('LHP Fastball shape:', lhp_fastball.shape)
rhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'R']
print('RHP Breaking Ball shape:', rhp_breaking_ball.shape)
lhp_breaking_ball = breaking_ball.loc[breaking_ball['p_throws'] == 'L']
print('LHP Breaking Ball shape:', lhp_breaking_ball.shape)
rhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'R']
print('RHP Offspeed shape:', rhp_offspeed.shape)
lhp_offspeed = offspeed.loc[offspeed['p_throws'] == 'L']
print('LHP Offspeed shape:', lhp_offspeed.shape)

# --- Out count when the batter is up ---
zero_outs = data.loc[data['outs_when_up'] == 0]
print('0 outs:', zero_outs.shape)
one_out = data.loc[data['outs_when_up'] == 1]
print('1 out:', one_out.shape)
two_outs = data.loc[data['outs_when_up'] == 2]
print('2 outs:', two_outs.shape)

Fastball shape: (407950, 38)
Breaking Ball: (208755, 38)
Off speed shape: (91583, 38)
RHP shape: (498628, 38)
LHP shape: (210025, 38)
RHP & RHH shape: (268668, 38)
RHP & LHH shape: (229960, 38)
LHP & RHH shape: (150662, 38)
LHP & LHH shape: (59363, 38)
RHP Fastball shape: (284311, 38)
LHP Fastball shape: (123639, 38)
RHP Breaking Ball shape: (152952, 38)
LHP Breaking Ball shape: (55803, 38)
RHP Offspeed shape: (61155, 38)
LHP Offspeed shape: (30428, 38)
0 outs: (245543, 38)
1 out: (233625, 38)
2 outs: (229485, 38)


Possible event outcomes: HR, 3B, 2B, 1B, reach on error, HBP, BB, IBB, out, K, passed ball, WP, balk, SB, CS, pickoff, pickoff error, OtherAdvance, interference, foulE?, DefensiveIndiff

In [4]:
# Frequency of each distinct value in the events column.
data['events'].value_counts()

out                           82962
strikeout                     41996
single                        24947
walk                          15061
double                         7843
home_run                       5921
sac_bunt                        764
triple                          670
double_play                     384
caught_stealing_2b              146
strikeout_double_play           109
catcher_interf                   61
other_out                        41
sac_fly_double_play              17
wild_pitch                        9
caught_stealing_3b                8
pickoff_2b                        6
pickoff_1b                        6
triple_play                       5
pickoff_caught_stealing_2b        4
caught_stealing_home              4
game_advisory                     3
pickoff_3b                        2
sac_bunt_double_play              2
stolen_base_2b                    1
passed_ball                       1
pickoff_caught_stealing_3b        1
Name: events, dtype: int64

# Preprocessing

### Multicolinearity - VIF

Velocity, Spin Rate, HB, VB, Release Extension, Horizontal Release Position, Vertical Release Position, Horizontal Plate Coords, Vertical Plate Coords

In [5]:
# Disabled cell: variance inflation factor (VIF) check over the numeric columns of the
# candidate feature set. Everything below is commented out and does not run.
# NOTE(review): if re-enabled, this needs `np` (numpy), which the Imports cell does not
# import. `data.dropna` is also commented out in the Data cell, so missing values
# presumably need handling before computing VIF — confirm.
#features = data[['velo', 'spin_rate', 'pfx_-x', 'pfx_z', 'release_extension', 
#                 'release_pos_x', 'release_pos_z', 'plate_x', 'plate_z',
#                 'pitch_type', 'p_throws']]
#features_vif = features.select_dtypes([np.number])
#vif_data = pd.DataFrame()
#vif_data["feature"] = features_vif.columns
#
#vif_data["VIF"] = [variance_inflation_factor(features_vif.values, i)
#                   for i in range(len(features_vif.columns))]
#
#vif_data.sort_values(by = 'VIF').head(10)

# Features

**Independent Variables:** Velocity, Spin Rate, VB, HB, Release Extension, Horizontal Release Position, Vertical Release Position, Horizontal Plate Coords, Vertical Plate Coords

**Dependent Variable:** xRE

# Random Forest Modeling