In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read pitches.csv (expected in the working directory) into the raw frame `df`.
df = pd.read_csv('pitches.csv')

In [3]:
# Work on a copy so later reassignments of `dfC` never touch the raw frame `df`.
dfC = df.copy()

In [4]:
# Transpose the first five rows so every column is shown as a row.
dfC.head().T

Unnamed: 0,0,1,2,3,4
px,0.416,-0.191,-0.518,-0.641,-1.821
pz,2.963,2.347,3.284,1.221,2.083
start_speed,92.9,92.8,94.1,91,75.4
end_speed,84.1,84.1,85.2,84,69.6
spin_rate,2305.05,2689.93,2647.97,1289.59,1374.57
spin_dir,159.235,151.402,145.125,169.751,280.671
break_angle,-25,-40.7,-43.7,-1.3,18.4
break_length,3.2,3.4,3.7,5,12
break_y,23.7,23.7,23.7,23.8,23.8
ax,7.665,12.043,14.368,2.104,-10.28


In [5]:
# Distinct pitch_type codes present in the data (missing values included).
dfC['pitch_type'].unique()

array(['FF', 'CU', 'FC', 'SI', 'CH', 'FT', 'IN', 'SL', nan, 'KC', 'EP',
       'FS', 'FO', 'PO', 'KN', 'UN', 'SC', 'FA', 'AB'], dtype=object)

In [6]:
# Compare break and spin of the FF and CU pitch types on pitches whose result
# code is S, X, D or E.
result_codes = ['S', 'X', 'D', 'E']
x = dfC.loc[dfC.code.isin(result_codes) & dfC.pitch_type.isin(['FF', 'CU'])]
# Select the numeric columns *before* aggregating: calling .mean() on the whole
# grouped frame raises TypeError on the string columns in pandas >= 2.0, and
# averages every other column only to throw the result away.
x.groupby('pitch_type')[['break_length', 'spin_rate', 'spin_dir']].mean()

Unnamed: 0_level_0,break_length,spin_rate,spin_dir
pitch_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CU,12.194557,1253.890184,122.466435
FF,4.022105,2146.662011,191.732831


In [7]:
# Five largest break_length values, biggest first, to spot implausible readings.
dfC['break_length'].sort_values(ascending=False).iloc[:5]

2063015    224889.3
868120         55.9
771971         54.6
326703         42.0
84629          38.7
Name: break_length, dtype: float64

In [8]:
# At-bat id of the pitch with the largest break_length.
# idxmax returns an index *label*, so .loc is the matching accessor; the old
# .iloc lookup with a hard-coded number was only right while the index happened
# to be the default RangeIndex.
outlier_label = dfC['break_length'].idxmax()
dfC.loc[outlier_label, 'ab_id']

2017165333.0

In [9]:
# At-bat level table; it shares the `ab_id` key with the pitch data.
ab = pd.read_csv('../../eda_vis/mlb_vis/data/pitch/atbats.csv')

In [10]:
# At-bat record for the ab_id of the outlier break_length pitch.
ab[ab['ab_id'] == 2017165333]

Unnamed: 0,ab_id,batter_id,event,g_id,inning,o,p_score,p_throws,pitcher_id,stand,top
534455,2017165333,405395,Groundout,201702160,4,3,1,R,434378,R,False


In [11]:
# Player lookup table (id, first_name, last_name).
player = pd.read_csv('../../eda_vis/mlb_vis/data/pitch/player_names.csv')

In [12]:
# 405395 is the batter_id of the outlier at-bat.
player.loc[player.id == 405395]

Unnamed: 0,id,first_name,last_name
1351,405395,Albert,Pujols


In [13]:
# 434378 is the pitcher_id of the outlier at-bat.
player.loc[player.id == 434378]

Unnamed: 0,id,first_name,last_name
563,434378,Justin,Verlander


# START OF REGRESSION

In [14]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [15]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [16]:
# Attach the at-bat table's `stand` column to every pitch via `ab_id`.
atbats = ab[['ab_id', 'stand']]
dfC = pd.merge(
    # Dropping a pre-existing `stand` keeps the cell re-runnable: a second run
    # would otherwise produce `stand_x` / `stand_y` and break `dfC.stand` below.
    dfC.drop(columns='stand', errors='ignore'),
    atbats,
    on='ab_id',
    how='left',
    # Fail loudly if an ab_id appears twice in the at-bat table, which would
    # silently duplicate pitch rows.
    validate='many_to_one',
)

In [17]:
# Drop the corrupt Verlander -> Pujols pitch: its break_length is ~224889 while
# every other pitch is below 56. Filtering on the value, rather than on a
# hard-coded row position, keeps the cell idempotent (re-running it no longer
# deletes a valid row) and independent of row order after the merge.
MAX_PLAUSIBLE_BREAK_LENGTH = 100
dfC = dfC.loc[~(dfC.break_length > MAX_PLAUSIBLE_BREAK_LENGTH)]  # rows with NaN break_length are kept

In [18]:
# Restrict to stand == 'R', pitch types FF/FT and result codes S/X/D/E, then
# derive the modelling columns.
in_play_codes = ['X', 'D', 'E']
keep = (
    dfC.code.isin(in_play_codes + ['S'])
    & dfC.pitch_type.isin(['FT', 'FF'])
    & (dfC.stand == 'R')
)
# .copy() makes dfC an independent frame, so the column assignments below are
# not writes into a slice of the previous frame (no SettingWithCopyWarning).
dfC = dfC.loc[keep].copy()
# Every remaining row has code in {S, X, D, E} and pitch_type in {FF, FT}, so
# each indicator follows from a single comparison: 1 for X/D/E vs 0 for S, and
# 1 for FF vs 0 for FT.
dfC['in_play_boolean'] = dfC.code.isin(in_play_codes).astype(int)
dfC['four_seam_boolean'] = (dfC.pitch_type == 'FF').astype(int)
dfC['constant'] = 1
model_df = dfC[['constant', 'px', 'pz', 'start_speed', 'spin_rate', 'spin_dir',
                'break_angle', 'break_length', 'break_y', 'four_seam_boolean',
                'in_play_boolean']]  # end_speed is left out

In [19]:
# Patsy formula: in_play_boolean regressed on location, speed, spin, break and
# the four-seam indicator (the intercept is added by the formula interface).
formula = (
    'in_play_boolean ~ px + pz + start_speed + spin_rate + spin_dir'
    ' + break_angle + break_length + break_y + four_seam_boolean'
)

In [20]:
# Binomial GLM (logit link) fitted on every row of model_df.
binomial_family = sm.families.Binomial()
model = smf.glm(formula=formula, data=model_df, family=binomial_family)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        in_play_boolean   No. Observations:               201076
Model:                            GLM   Df Residuals:                   201066
Model Family:                Binomial   Df Model:                            9
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.1174e+05
Date:                Sun, 06 Oct 2019   Deviance:                   2.2348e+05
Time:                        18:04:06   Pearson chi2:                 2.25e+05
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.7694      2.33

In [21]:
# 80/20 split: sample the training rows with a fixed seed, and keep every row
# whose index label was not sampled as the hold-out set.
train = model_df.sample(frac=0.8, random_state=1)
test = model_df.drop(index=train.index)

In [22]:
# Refit the same binomial GLM on the training rows only; `model` and `result`
# from the full-data fit are overwritten here.
binomial_family = sm.families.Binomial()
model = smf.glm(formula=formula, data=train, family=binomial_family)
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        in_play_boolean   No. Observations:               160861
Model:                            GLM   Df Residuals:                   160851
Model Family:                Binomial   Df Model:                            9
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -89447.
Date:                Sun, 06 Oct 2019   Deviance:                   1.7889e+05
Time:                        18:04:07   Pearson chi2:                 1.80e+05
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.4185      2.60

In [23]:
# Model predictions for the held-out rows; thresholded at 0.5 in the next cell.
predictions = result.predict(test)

In [24]:
# Hold-out accuracy: threshold at 0.5, then take one minus the share of rows
# where the predicted and observed labels differ.
actual = test['in_play_boolean']
pred = np.where(predictions >= 0.5, 1, 0)
misclassified = np.abs(pred - actual)  # 1 where prediction and truth disagree
1 - np.mean(misclassified)

0.7414397612831033

In [25]:
# Split model_df into the predictor matrix and the response vector.
y_data = model_df['in_play_boolean']
x_data = model_df.drop(columns='in_play_boolean')

In [26]:
# Recursive feature elimination: keep the 6 predictors a logistic regression
# ranks highest.
logreg = LogisticRegression()

# n_features_to_select has to be passed by keyword; current scikit-learn
# releases reject the old positional form `RFE(logreg, 6)`.
rfe = RFE(logreg, n_features_to_select=6)
rfe = rfe.fit(x_data, y_data)
print(rfe.support_)   # boolean mask over x_data's columns
print(rfe.ranking_)   # 1 = selected; larger numbers were eliminated earlier



[False  True  True  True False False False  True  True  True]
[2 1 1 1 5 4 3 1 1 1]


In [27]:
# Columns flagged True in rfe.support_, listed in x_data's column order.
cols = [
    'px',
    'pz',
    'start_speed',
    'break_length',
    'break_y',
    'four_seam_boolean',
]
X = x_data.loc[:, cols]
Y = y_data

In [28]:
# Logit fit via the array interface; x_data already carries the `constant`
# column, so no intercept needs to be added.
# NOTE(review): this fits on the full `x_data`, not on the RFE subset `X`
# built in the previous cell — confirm which design matrix was intended.
logit_model = sm.Logit(endog=Y, exog=x_data)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.555711
         Iterations 6
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.080      
Dependent Variable: in_play_boolean  AIC:              223500.4364
Date:               2019-10-06 18:04 BIC:              223602.5507
No. Observations:   201076           Log-Likelihood:   -1.1174e+05
Df Model:           9                LL-Null:          -1.2146e+05
Df Residuals:       201066           LLR p-value:      0.0000     
Converged:          1.0000           Scale:            1.0000     
No. Iterations:     6.0000                                        
------------------------------------------------------------------
                   Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------------------------------------------
constant          -0.7694   2.3363  -0.3293 0.7419 -5.3484  3.8097
px                -0.3953   0.0092 -43.1619 0.0000 -0

In [29]:
# In-sample class labels: threshold the fitted probabilities at 0.5.
fitted_prob = result.predict()
predictions = np.where(fitted_prob >= 0.5, 1, 0)

In [30]:
# Share of rows the model labels as 1 (in play).
predictions.mean()

0.9086614016590742

In [31]:
# Observed labels for every row of model_df (rebinds `actual` from the hold-out cell).
actual = model_df.in_play_boolean

In [32]:
# In-sample accuracy: one minus the share of rows where label and prediction differ.
mismatch = np.abs(predictions - actual)
correct = 1 - np.mean(mismatch)
correct

0.7410879468459687

In [33]:
# Base rate of in_play_boolean == 1, i.e. the accuracy of always predicting 1;
# the model accuracy above should be read against this number.
np.mean(actual)

0.707886570252044