# Baseball
Amanda Kuznecov (anr431)

In [262]:
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.tools.eval_measures import rmse
pd.set_option('display.max_columns', None)

In [263]:
# load the pitcher statistics table from the CSV file into df
df = pd.read_csv(filepath_or_buffer='pitchers17.csv')

## Question 2: Pitching Statistics
### Part a

In [264]:
# P(walk): total walks divided by total batters faced, summed over every row of df
df['bb'].sum() / df['bf'].sum()

0.07491447652363538

In [265]:
# P(strikeout): total strikeouts divided by total batters faced
df['k'].sum() / df['bf'].sum()

0.2063503168798128

In [266]:
# P(strikeout | no walk): walks are removed from the denominator so only
# plate appearances that did not end in a walk are counted
df['k'].sum() / (df['bf'].sum() - df['bb'].sum())

0.22306079994028244

In [267]:
# home runs per plate appearance: total home runs divided by total batters faced
df['hr'].sum() / df['bf'].sum()

0.027626467982645436

In [268]:
# P(home run | neither walk nor strikeout): walks and strikeouts are both
# removed from the batters-faced total before dividing
df['hr'].sum() / (df['bf'].sum() - df['bb'].sum() - df['k'].sum())

0.03843761614721209

In [269]:
# P(non-HR hit | no walk, no strikeout, no home run):
# numerator is hits excluding home runs; denominator is batters faced with
# walks, strikeouts and home runs all removed
(df['h'].sum() - df['hr'].sum()) / (df['bf'].sum() - df['bb'].sum() - df['k'].sum() - df['hr'].sum())

0.2914712660905554

### Part b: k-rate

In [270]:
# krate (per row): strikeouts divided by batters faced that did not end in a walk
df['krate'] = df['k'] / (df['bf'] - df['bb'])

In [271]:
# ten highest krate values among 2016 rows with at least 500 batters faced
(
    df[(df['y'] == 2016) & (df['bf'] >= 500)]
    .sort_values(by='krate', ascending=False)
    .head(10)
    .loc[:, ['pitcher', 'krate', 'bf']]
)

Unnamed: 0,pitcher,krate,bf
1190,Fernandez_Jose_605228,0.370968,731
3524,Scherzer_Max_453286,0.335697,900
3721,Strasburg_Stephen_544931,0.330325,597
2021,Kershaw_Clayton_477132,0.322702,543
3775,Syndergaard_Noah_592789,0.310984,742
3183,Ray_Robbie_592662,0.30922,772
3445,Salazar_Danny_517593,0.309021,581
3942,Velasquez_Vincent_592826,0.300395,550
3964,Verlander_Justin_434378,0.300236,902
120,Archer_Chris_502042,0.297573,850


### Part c: h-rate

In [272]:
# hrate (per row): non-HR hits divided by batters faced that did not end in a
# walk, strikeout or home run
df['hrate'] = (df['h'] - df['hr']) / (df['bf'] - df['bb'] - df['k'] - df['hr'])

In [273]:
# ten highest hrate values among 2016 rows with at least 500 batters faced
(
    df[(df['y'] == 2016) & (df['bf'] >= 500)]
    .sort_values(by='hrate', ascending=False)
    .head(10)
    .loc[:, ['pitcher', 'hrate', 'bf']]
)

Unnamed: 0,pitcher,hrate,bf
3183,Ray_Robbie_592662,0.347732,772
2939,Paxton_James_572020,0.34626,508
2958,Pelfrey_Mike_460059,0.341981,541
759,Cole_Gerrit_543037,0.339726,503
2980,Perdomo_Luis_606131,0.336066,655
2511,McHugh_Collin_543521,0.335185,795
412,Bradley_Archie_605151,0.334951,630
1047,Duffey_Tyler_608648,0.334118,593
4009,Wacha_Michael_608379,0.333333,600
3775,Syndergaard_Noah_592789,0.332627,742


## Question 3: Predicting McCracken Components

In [274]:
# order rows by pitcher, then by season, so that a one-row shift inside a
# pitcher group points at that pitcher's preceding recorded season
df = df.sort_values(by=['pitcher', 'y'])

# bf_prev: batters faced in the pitcher's preceding row (0 when there is none)
df['bf_prev'] = (
    df.groupby(['pitcher'])['bf']
    .transform(lambda faced: faced.shift(1, fill_value=0))
)

# played_prev: number of seasons between this row and the pitcher's preceding row
# (1 means the pitcher also appears in the previous season; the pitcher's first
# row gets 0 because the shift is filled with that pitcher's earliest season)
df['played_prev'] = (
    df.groupby(['pitcher'])['y']
    .transform(lambda season: season - season.shift(1, fill_value=season.min()))
)

### Part a: bb-rate

In [275]:
# bbrate (per row): walks divided by batters faced
df['bbrate'] = df['bb'] / df['bf']

# bbrate_prev: bbrate from the pitcher's preceding row (0 when there is none)
df['bbrate_prev'] = (
    df.groupby(['pitcher'])['bbrate']
    .transform(lambda rate: rate.shift(1, fill_value=0))
)

# modelling sample: drop season 2012, require at least 200 batters faced in both
# the current and the preceding row, and require played_prev <= 1 so the
# preceding row is the immediately previous season
data_b = df[
    df['y'].ne(2012) & df['bf'].ge(200) & df['bf_prev'].ge(200) & df['played_prev'].le(1)
]
data_b.head()

Unnamed: 0,pitcher,y,bf,bb,k,hr,h,lo,po,fo,go,krate,hrate,bf_prev,played_prev,bbrate,bbrate_prev
6,Abad_Fernando_472551,2015,202,16,45,11,45,19,10,27,33,0.241935,0.261538,213,1,0.079208,0.056338
28,Adleman_Timothy_534947,2017,530,50,108,29,124,29,33,78,95,0.225,0.276968,286,1,0.09434,0.066434
39,Albers_Matt_458006,2013,259,20,35,2,57,16,3,20,99,0.146444,0.272277,238,1,0.07722,0.079832
43,Albers_Matt_458006,2017,233,17,63,6,35,16,12,19,61,0.291667,0.197279,236,1,0.072961,0.076271
46,Alburquerque_Al_456379,2014,235,20,63,7,46,10,12,30,49,0.293023,0.268966,215,1,0.085106,0.134884


### Part a.i

In [276]:
# baseline model: OLS of bbrate on the previous-season bbrate, fitted on data_b
mod_bb = smf.ols(formula='bbrate~bbrate_prev', data=data_b).fit()
mod_bb.summary()

0,1,2,3
Dep. Variable:,bbrate,R-squared:,0.286
Model:,OLS,Adj. R-squared:,0.285
Method:,Least Squares,F-statistic:,475.1
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,7.219999999999999e-89
Time:,15:18:34,Log-Likelihood:,3034.6
No. Observations:,1189,AIC:,-6065.0
Df Residuals:,1187,BIC:,-6055.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0335,0.002,17.934,0.000,0.030,0.037
bbrate_prev,0.5474,0.025,21.797,0.000,0.498,0.597

0,1,2,3
Omnibus:,39.471,Durbin-Watson:,2.069
Prob(Omnibus):,0.0,Jarque-Bera (JB):,47.781
Skew:,0.379,Prob(JB):,4.21e-11
Kurtosis:,3.624,Cond. No.,46.1


### Part b: k-rate

In [277]:
# krate_prev: krate from the pitcher's preceding row (0 when there is none)
df['krate_prev'] = (
    df.groupby(['pitcher'])['krate']
    .transform(lambda rate: rate.shift(1, fill_value=0))
)

# modelling sample: drop season 2012, require at least 200 batters faced in both
# the current and the preceding row, and require played_prev <= 1 so the
# preceding row is the immediately previous season; keep only the columns needed
data_k = df.loc[
    df['y'].ne(2012) & df['bf'].ge(200) & df['bf_prev'].ge(200) & df['played_prev'].le(1),
    ['pitcher', 'y', 'krate', 'krate_prev', 'bf', 'bf_prev', 'played_prev']
]
data_k.head()

Unnamed: 0,pitcher,y,krate,krate_prev,bf,bf_prev,played_prev
6,Abad_Fernando_472551,2015,0.241935,0.253731,202,213,1
28,Adleman_Timothy_534947,2017,0.225,0.17603,530,286,1
39,Albers_Matt_458006,2013,0.146444,0.200913,259,238,1
43,Albers_Matt_458006,2017,0.291667,0.137615,233,236,1
46,Alburquerque_Al_456379,2014,0.293023,0.376344,235,215,1


### Part b.i

In [278]:
# baseline model: OLS of krate on the previous-season krate, fitted on data_k
mod_k = smf.ols(formula='krate~krate_prev', data=data_k).fit()
mod_k.summary()

0,1,2,3
Dep. Variable:,krate,R-squared:,0.558
Model:,OLS,Adj. R-squared:,0.558
Method:,Least Squares,F-statistic:,1498.0
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,1.19e-212
Time:,15:18:35,Log-Likelihood:,2072.4
No. Observations:,1189,AIC:,-4141.0
Df Residuals:,1187,BIC:,-4131.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0583,0.005,12.505,0.000,0.049,0.067
krate_prev,0.7474,0.019,38.707,0.000,0.710,0.785

0,1,2,3
Omnibus:,87.997,Durbin-Watson:,2.15
Prob(Omnibus):,0.0,Jarque-Bera (JB):,127.11
Skew:,0.591,Prob(JB):,2.5e-28
Kurtosis:,4.081,Cond. No.,16.6


### Part c: hr-rate

In [279]:
# hrrate (per row): home runs divided by batters faced that did not end in a
# walk or a strikeout
df['hrrate'] = df['hr'] / (df['bf'] - df['bb'] - df['k'])

# hrrate_prev: hrrate from the pitcher's preceding row (0 when there is none)
df['hrrate_prev'] = (
    df.groupby(['pitcher'])['hrrate']
    .transform(lambda rate: rate.shift(1, fill_value=0))
)

# modelling sample: drop season 2012, require at least 200 batters faced in both
# the current and the preceding row, and require played_prev <= 1 so the
# preceding row is the immediately previous season; keep only the columns needed
data_hr = df.loc[
    df['y'].ne(2012) & df['bf'].ge(200) & df['bf_prev'].ge(200) & df['played_prev'].le(1),
    ['pitcher', 'y', 'hrrate', 'hrrate_prev', 'bf', 'bf_prev', 'played_prev']
]
data_hr.head()

Unnamed: 0,pitcher,y,hrrate,hrrate_prev,bf,bf_prev,played_prev
6,Abad_Fernando_472551,2015,0.078014,0.026667,202,213,1
28,Adleman_Timothy_534947,2017,0.077957,0.059091,530,286,1
39,Albers_Matt_458006,2013,0.009804,0.051429,259,238,1
43,Albers_Matt_458006,2017,0.039216,0.053191,233,236,1
46,Alburquerque_Al_456379,2014,0.046053,0.043103,235,215,1


### Part c.i

In [280]:
# baseline model: OLS of hrrate on the previous-season hrrate, fitted on data_hr
mod_hr = smf.ols(formula='hrrate~hrrate_prev', data=data_hr).fit()
mod_hr.summary()

0,1,2,3
Dep. Variable:,hrrate,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,87.85
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,3.44e-20
Time:,15:18:36,Log-Likelihood:,3367.6
No. Observations:,1189,AIC:,-6731.0
Df Residuals:,1187,BIC:,-6721.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0273,0.001,23.285,0.000,0.025,0.030
hrrate_prev,0.2954,0.032,9.373,0.000,0.234,0.357

0,1,2,3
Omnibus:,34.41,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.262
Skew:,0.381,Prob(JB):,4.91e-09
Kurtosis:,3.439,Cond. No.,76.3


### Part d: h-rate

In [281]:
# hrate_prev: hrate from the pitcher's preceding row (0 when there is none)
df['hrate_prev'] = (
    df.groupby(['pitcher'])['hrate']
    .transform(lambda rate: rate.shift(1, fill_value=0))
)

# modelling sample: drop season 2012, require at least 200 batters faced in both
# the current and the preceding row, and require played_prev <= 1 so the
# preceding row is the immediately previous season; keep only the columns needed
data_h = df.loc[
    df['y'].ne(2012) & df['bf'].ge(200) & df['bf_prev'].ge(200) & df['played_prev'].le(1),
    ['pitcher', 'y', 'hrate', 'hrate_prev', 'bf', 'bf_prev', 'played_prev']
]
data_h.head()

Unnamed: 0,pitcher,y,hrate,hrate_prev,bf,bf_prev,played_prev
6,Abad_Fernando_472551,2015,0.261538,0.205479,202,213,1
28,Adleman_Timothy_534947,2017,0.276968,0.246377,530,286,1
39,Albers_Matt_458006,2013,0.272277,0.222892,259,238,1
43,Albers_Matt_458006,2017,0.197279,0.320225,233,236,1
46,Alburquerque_Al_456379,2014,0.268966,0.306306,235,215,1


### Part d.i

In [282]:
# baseline model: OLS of hrate on the previous-season hrate, fitted on data_h
mod_h = smf.ols(formula='hrate~hrate_prev', data=data_h).fit()
mod_h.summary()

0,1,2,3
Dep. Variable:,hrate,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,35.55
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,3.27e-09
Time:,15:18:36,Log-Likelihood:,2449.4
No. Observations:,1189,AIC:,-4895.0
Df Residuals:,1187,BIC:,-4885.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2404,0.008,28.946,0.000,0.224,0.257
hrate_prev,0.1720,0.029,5.963,0.000,0.115,0.229

0,1,2,3
Omnibus:,12.209,Durbin-Watson:,1.967
Prob(Omnibus):,0.002,Jarque-Bera (JB):,18.016
Skew:,-0.061,Prob(JB):,0.000122
Kurtosis:,3.591,Cond. No.,34.9


### Improvements

In [283]:
# new features built from the four out-type counts: lineouts (lo), popouts (po),
# flyouts (fo) and groundouts (go)
# for each one:
#   <type>_rate      = count / batters faced in that row
#   <type>_rate_prev = expanding mean of the pitcher's <type>_rate values, shifted
#                      by one row so it only uses earlier rows (0 when there are none)
for count_col, rate_col, lagged_col in (
    ('lo', 'lo_rate', 'lo_rate_prev'),
    ('po', 'po_rate', 'po_rate_prev'),
    ('fo', 'fo_rate', 'fo_rate_prev'),
    ('go', 'go_rate', 'go_rate_prev'),
):
    df[rate_col] = df[count_col] / df['bf']
    df[lagged_col] = (
        df.groupby(['pitcher'])[rate_col]
        .transform(lambda rates: rates.expanding().mean().shift(1, fill_value=0))
    )

In [285]:
# modelling sample with every column kept: drop season 2012, require at least 200
# batters faced in both the current and the preceding row, and played_prev <= 1;
# .copy() makes data_all an independent frame rather than a slice of df
data_all = df[
    df['y'].ne(2012) & df['bf'].ge(200) & df['bf_prev'].ge(200) & df['played_prev'].le(1)
].copy()

In [288]:
# extended bbrate model: previous-season bbrate plus the lagged flyout,
# groundout and popout rates, fitted on data_all
mod_bb1 = smf.ols(
    formula='bbrate~bbrate_prev+fo_rate_prev+go_rate_prev+po_rate_prev',
    data=data_all,
).fit()
mod_bb1.summary()

0,1,2,3
Dep. Variable:,bbrate,R-squared:,0.307
Model:,OLS,Adj. R-squared:,0.304
Method:,Least Squares,F-statistic:,130.9
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,1.25e-92
Time:,15:20:45,Log-Likelihood:,3052.2
No. Observations:,1189,AIC:,-6094.0
Df Residuals:,1184,BIC:,-6069.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0737,0.007,10.411,0.000,0.060,0.088
bbrate_prev,0.4817,0.027,17.745,0.000,0.428,0.535
fo_rate_prev,-0.0634,0.024,-2.680,0.007,-0.110,-0.017
go_rate_prev,-0.0847,0.015,-5.570,0.000,-0.114,-0.055
po_rate_prev,-0.1549,0.043,-3.607,0.000,-0.239,-0.071

0,1,2,3
Omnibus:,25.74,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.949
Skew:,0.294,Prob(JB):,3.14e-07
Kurtosis:,3.509,Cond. No.,86.7


In [291]:
# extended krate model: previous-season krate plus the lagged flyout and
# groundout rates, fitted on data_all
mod_k1 = smf.ols(
    formula='krate~krate_prev+fo_rate_prev+go_rate_prev',
    data=data_all,
).fit()
mod_k1.summary()

0,1,2,3
Dep. Variable:,krate,R-squared:,0.572
Model:,OLS,Adj. R-squared:,0.571
Method:,Least Squares,F-statistic:,527.9
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,8.82e-218
Time:,15:46:11,Log-Likelihood:,2091.6
No. Observations:,1189,AIC:,-4175.0
Df Residuals:,1185,BIC:,-4155.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1885,0.021,8.816,0.000,0.147,0.230
krate_prev,0.6202,0.028,22.144,0.000,0.565,0.675
fo_rate_prev,-0.3296,0.060,-5.508,0.000,-0.447,-0.212
go_rate_prev,-0.2449,0.041,-6.007,0.000,-0.325,-0.165

0,1,2,3
Omnibus:,68.474,Durbin-Watson:,2.085
Prob(Omnibus):,0.0,Jarque-Bera (JB):,92.079
Skew:,0.514,Prob(JB):,1.01e-20
Kurtosis:,3.895,Cond. No.,64.8


In [296]:
# extended hrrate model: previous-season hrrate plus the lagged lineout,
# groundout, popout and flyout rates, fitted on data_all
mod_hr1 = smf.ols(
    formula='hrrate~hrrate_prev+lo_rate_prev+go_rate_prev+po_rate_prev+fo_rate_prev',
    data=data_all,
).fit()
mod_hr1.summary()

0,1,2,3
Dep. Variable:,hrrate,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,45.15
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,9.700000000000001e-43
Time:,16:01:48,Log-Likelihood:,3429.0
No. Observations:,1189,AIC:,-6846.0
Df Residuals:,1183,BIC:,-6816.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0195,0.004,4.399,0.000,0.011,0.028
hrrate_prev,0.1716,0.032,5.301,0.000,0.108,0.235
lo_rate_prev,0.2136,0.030,7.064,0.000,0.154,0.273
go_rate_prev,-0.0367,0.010,-3.544,0.000,-0.057,-0.016
po_rate_prev,0.0693,0.031,2.257,0.024,0.009,0.129
fo_rate_prev,0.0537,0.017,3.152,0.002,0.020,0.087

0,1,2,3
Omnibus:,36.952,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.436
Skew:,0.395,Prob(JB):,1.01e-09
Kurtosis:,3.46,Cond. No.,88.0


In [147]:
# extended hrate model: previous-season hrate plus the lagged lineout,
# groundout, flyout and popout rates, fitted on data_all
# (each regressor is listed exactly once in the formula)
mod_h1 = smf.ols('hrate~hrate_prev+lo_rate_prev+go_rate_prev+fo_rate_prev+po_rate_prev',data = data_all).fit()
mod_h1.summary()

0,1,2,3
Dep. Variable:,hrate,R-squared:,0.073
Model:,OLS,Adj. R-squared:,0.069
Method:,Least Squares,F-statistic:,18.62
Date:,"Wed, 31 Mar 2021",Prob (F-statistic):,7.59e-18
Time:,14:31:34,Log-Likelihood:,2476.9
No. Observations:,1189,AIC:,-4942.0
Df Residuals:,1183,BIC:,-4911.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2190,0.014,15.876,0.000,0.192,0.246
hrate_prev,0.1497,0.029,5.144,0.000,0.093,0.207
lo_rate_prev,0.2740,0.067,4.080,0.000,0.142,0.406
go_rate_prev,0.0483,0.023,2.107,0.035,0.003,0.093
fo_rate_prev,0.1341,0.038,3.552,0.000,0.060,0.208
po_rate_prev,-0.2988,0.069,-4.318,0.000,-0.434,-0.163

0,1,2,3
Omnibus:,14.096,Durbin-Watson:,1.98
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.843
Skew:,0.061,Prob(JB):,1.81e-05
Kurtosis:,3.653,Cond. No.,88.7
