In [1]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import display
from mpl_toolkits.mplot3d import Axes3D
from sklearn.feature_selection import RFE
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [None]:
# Read the two raw input tables from the current working directory:
# one CSV of at-bats and one CSV of pitches.
at_bat = pd.read_csv('atbats.csv')
pitches_orig = pd.read_csv('pitches.csv')

Prepare data for the model
---

In [None]:
# Build `model_df`, the modelling table with `in_play_boolean` as the response.
#
# NOTE: this cell reassigns `pitches_orig`, so re-run the data-loading cell
# before re-executing it; otherwise the merge is applied to an already-merged
# frame.

# Right-handed at-bats only.
at_bat_right_hand = at_bat.loc[at_bat.stand == 'R']

# Inner join keeps only pitches thrown in at-bats where the batter is right-handed.
pitches_orig = pd.merge(pitches_orig, at_bat_right_hand, how='inner', on='ab_id')

# Only look at pitches that were swung at (codes X, D, E, S) with pitch type
# FT or FF. `.copy()` detaches the result from the merged frame so the column
# assignment below writes to a real frame rather than a view (avoids
# SettingWithCopyWarning and silently lost writes).
swung_at = pitches_orig.code.isin(['X', 'D', 'E', 'S'])
ft_or_ff = pitches_orig.pitch_type.isin(['FT', 'FF'])
pitches_orig = pitches_orig.loc[swung_at & ft_or_ff].copy()

# Binary coding for in play (X, D, E -> 1) or not (S -> 0) after a swing.
# Every remaining row has one of those four codes, so a single vectorised
# membership test covers both cases.
pitches_orig['in_play_boolean'] = pitches_orig.code.isin(['X', 'D', 'E']).astype(int)

# Create the model frame with in_play_boolean as the response.
model_df = pitches_orig[['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
                         'break_angle', 'break_length', 'break_y', 'pitch_type',
                         'in_play_boolean']]

# Dummy-code pitch_type. dtype=int keeps the indicator numeric: recent pandas
# returns bool dummies by default, which turns the design matrix into an
# object array that statsmodels refuses to fit.
pitch_typed = pd.get_dummies(model_df['pitch_type'], dtype=int)
model_df = pd.concat([model_df, pitch_typed], axis=1)
# Drop one level ('FT') so 'FF' is the single indicator and 'FT' the baseline.
model_df = model_df.drop(columns=['FT'])

In [None]:
# Recursive-feature-elimination sweep.
#
# For every feature count i from 1 to the number of candidate predictors:
#   1. let RFE (wrapping a logistic regression) pick i features,
#   2. fit those features plus a constant with statsmodels Logit and with
#      scikit-learn LogisticRegression on a 70/30 train/test split,
#   3. record (summary table, AIC, BIC, hold-out accuracy) in `model_list`.
#
# NOTE(review): RFE is fitted on all rows before the train/test split, so the
# feature selection has seen the hold-out rows; confirm this is acceptable.

# The candidate predictors are loop-invariant and must exist before `range`
# reads their count (previously `features` was first assigned inside the loop,
# which raises NameError on a fresh kernel).
features = model_df[['px', 'pz', 'start_speed', 'end_speed', 'spin_rate', 'spin_dir',
                     'break_angle', 'break_length', 'break_y', 'FF']]
response = model_df['in_play_boolean']

model_list = []

for i in range(1, features.shape[1] + 1):
    # Select the i features RFE ranks highest. `n_features_to_select` is
    # passed by keyword because it is keyword-only in current scikit-learn.
    model = LogisticRegression(solver='lbfgs')
    rfe = RFE(model, n_features_to_select=i)
    fit = rfe.fit(features, response)

    # Design matrix: the selected columns plus an explicit intercept column.
    X = sm.add_constant(features[features.columns[fit.support_]])
    X_train, X_test, y_train, y_test = train_test_split(
        X, response, test_size=0.3, random_state=0
    )

    # scikit-learn fit supplies the hold-out accuracy.
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    # statsmodels fit supplies the summary table and the information criteria.
    logit_model = sm.Logit(y_train, X_train)
    result = logit_model.fit()

    model_list.append((result.summary2(), result.aic, result.bic, logreg.score(X_test, y_test)))

In [None]:
# Show (AIC, BIC, hold-out accuracy) for each fitted model, leaving out the
# bulky summary table stored in position 0 of every entry.
[(aic, bic, accuracy) for _summary, aic, bic, accuracy in model_list]

In [None]:
# Inspect one entry of the sweep: index 5 is the sixth model appended, i.e.
# the fit with six RFE-selected features (the sweep starts at one feature).
# The entry is the tuple (summary table, AIC, BIC, hold-out accuracy).
model_list[5]

In [None]:
# Check for multicollinearity among a subset of the predictors.
# The alias must be `sns`: the original imported seaborn as `sn` while calling
# `sns.heatmap`, which raises NameError.
import seaborn as sns

# Read the columns straight from model_df so this cell does not depend on a
# `features` variable left behind by another cell.
collinearity_cols = ['px', 'pz', 'start_speed', 'end_speed', 'break_length', 'break_y']
correlation_matrix = model_df[collinearity_cols].corr().round(2)

fig, ax = plt.subplots(figsize=(16, 5))
# annot=True prints each correlation value inside its square.
sns.heatmap(data=correlation_matrix, annot=True, ax=ax)
ax.set_title('Pairwise correlation of candidate predictors');

In [None]:
# Final model: refit on a fixed set of five predictors (the heatmap columns
# without end_speed) using the same 70/30 split settings as the sweep.
final_cols = ['px', 'pz', 'start_speed', 'break_length', 'break_y']
features = model_df[final_cols]
X = sm.add_constant(features)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    model_df['in_play_boolean'],
    test_size=0.3,
    random_state=0,
)

# scikit-learn estimator, used only for the hold-out accuracy.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# statsmodels estimator, used for the coefficient table and AIC/BIC.
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()

# Report, in order: summary table, AIC, BIC, hold-out accuracy.
for reported in (result.summary2(), result.aic, result.bic, logreg.score(X_test, y_test)):
    print(reported)