In [14]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from random import seed
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import cohen_kappa_score
from scipy.special import expit
from scipy.special import logit
from sklearn.cross_validation import train_test_split
%matplotlib inline

In [15]:
# Make Predictions Reproducible
# Seed BOTH generators: `random.seed` only affects Python's `random` module.
# The row shuffle below uses `np.random.choice`, and scikit-learn estimators
# created without `random_state` draw from NumPy's global generator, so NumPy
# has to be seeded as well for a Restart & Run All to give the same numbers.
seed(314)
np.random.seed(314)

In [16]:
# Silence pandas' SettingWithCopyWarning for chained assignments
pd.set_option('mode.chained_assignment', None)

In [17]:
# Print the notebook's working directory (the data path used below is relative to it)
!pwd

/Users/Kyle1/Dropbox/GS/Research/Sports/Baseball/Relief-Fatigue/Code


In [18]:
# Read the pitch/swing table, using the first CSV column as the index
df = pd.read_csv('../Data/pitch_swing.csv', index_col=0)

# Put the rows in a random order (a draw of every row position without replacement)
n_rows = df.shape[0]
shuffled_positions = np.random.choice(n_rows, n_rows, replace=False)
df = df.iloc[shuffled_positions].copy()

In [19]:
# Distinct pitch types, in order of first appearance
df['pitch_type'].unique().tolist()

['SI', 'FC', 'SL', 'FF', 'FT', 'CH', 'CU', 'FS']

In [20]:
# Show the column names available in the loaded table
df.columns

Index(['gameday_link', 'batter', 'pitcher', 'num', 'event', 'inning_side',
       'inning', 'batter_name', 'pitcher_name', 'date', 'o', 'p_throws', 'des',
       'type', 'x', 'y', 'start_speed', 'end_speed', 'sz_top', 'sz_bot',
       'pfx_x', 'pfx_z', 'px', 'pz', 'x0', 'y0', 'z0', 'vx0', 'vy0', 'vz0',
       'ax', 'ay', 'az', 'break_y', 'break_angle', 'break_length',
       'pitch_type', 'spin_dir', 'spin_rate', 'count', 'zone', 'sv_id',
       'reg_season', 'post_season', 'year', 'b', 's', 'avg_fast_speed',
       'swinging', 'whiff', 'diff_speed'],
      dtype='object')

# Get best leaf for each pitch_type

**Also return table of results**

In [21]:
#k-fold cross validation
def get_best_leaf(df,pitch_list,leaf_list,covariates,folds = 10):
    """Choose a random-forest ``min_samples_leaf`` per pitch type by k-fold CV.

    For each pitch type in ``pitch_list`` the rows of ``df`` with that
    ``pitch_type`` are selected, and for each candidate in ``leaf_list`` a
    ``RandomForestClassifier(min_samples_leaf=leaf)`` is cross-validated to
    predict the ``whiff`` column from ``covariates``. The fold-averaged
    accuracy and Cohen's kappa (each rounded to 3 decimals) are compared
    across candidates and a short summary is printed per pitch type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pitch_type``, ``whiff`` and every column in ``covariates``.
    pitch_list : iterable
        Pitch types to tune.
    leaf_list : sequence
        Candidate ``min_samples_leaf`` values (must be non-empty).
    covariates : list
        Feature column names.
    folds : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    dict
        ``{pitch: {...}}`` with, for each pitch type:

        * ``'leaf'``         - candidate with the highest mean accuracy
        * ``'score'``        - that mean accuracy
        * ``'kappa'``        - mean kappa at that same candidate
        * ``'num_pitches'``  - number of rows for the pitch type
        * ``'perc_whiff'``   - mean of ``whiff`` (share of whiffs)
        * ``'perc_contact'`` - ``1 - mean(whiff)`` (the value printed as
          "Percent Contact")
    """
    fits = {}

    for pitch in pitch_list:
        df_p = df[df.pitch_type==pitch]
        Y = df_p.whiff
        X = df_p[covariates]

        avg_scores = []
        avg_kappas = []
        for leaf in leaf_list:
            scores = []
            kappas = []
            clf = RandomForestClassifier(min_samples_leaf=leaf)

            # Uses the KFold(n, n_folds=...) signature of the
            # sklearn.cross_validation module this notebook imports;
            # it yields positional train/test index arrays.
            for train, test in KFold(Y.shape[0], n_folds=folds):
                #Fit model (fit() retrains from scratch on each fold)
                clf.fit(X.iloc[train], Y.iloc[train])

                #Judge the model
                scores.append(clf.score(X.iloc[test], Y.iloc[test]))
                preds = clf.predict(X.iloc[test])
                kappas.append(cohen_kappa_score(preds, Y.iloc[test]))

            avg_scores.append(round(np.mean(scores),3))
            avg_kappas.append(round(np.mean(kappas),3))

        # np.argmax returns the first maximum, so ties go to the earlier candidate
        best_k = np.argmax(avg_kappas)
        best_s = np.argmax(avg_scores)

        whiff_rate = Y.mean()
        contact_rate = round(1-whiff_rate,3)

        print('\nPitch Type: {}'.format(pitch))
        print("Percent Contact: ", contact_rate)
        print('Leaves/Score/Kappa for best:')
        print('Score: ',leaf_list[best_s],"/",avg_scores[best_s],"/",avg_kappas[best_s])
        print('Kappa: ',leaf_list[best_k],"/",avg_scores[best_k],"/",avg_kappas[best_k])

        fits[pitch] = {
            # Previously this key held 1 - mean(whiff), i.e. the contact rate
            'perc_whiff': round(whiff_rate,3),
            'perc_contact': contact_rate,
            'score': avg_scores[best_s],
            'num_pitches': int(len(Y)),
            # The leaf is picked by accuracy; kappa is reported at that same leaf
            'leaf': int(leaf_list[best_s]),
            'kappa': avg_kappas[best_s],
        }

    return(fits)



In [37]:
# Candidate min_samples_leaf values to cross-validate
leafs = [25, 50, 100, 200, 500]

# Every pitch type present in the data
pitch_types = df['pitch_type'].unique().tolist()

# Covariates fed to the classifier
covars = [
    'start_speed', 'end_speed', 'pfx_x', 'vx0', 'vy0', 'vz0', 'pfx_z',
    'ax', 'ay', 'az', 'break_y', 'break_angle', 'spin_dir', 'spin_rate',
]

best = get_best_leaf(df, pitch_types, leafs, covars)


Pitch Type: SI
Percent Whiffs:  0.798
Leaves/Score/Kappa for best:
Score:  25 / 0.801 / 0.054
Kappa:  25 / 0.801 / 0.054

Pitch Type: FC
Percent Whiffs:  0.695
Leaves/Score/Kappa for best:
Score:  25 / 0.719 / 0.159
Kappa:  25 / 0.719 / 0.159

Pitch Type: SL
Percent Whiffs:  0.545
Leaves/Score/Kappa for best:
Score:  50 / 0.683 / 0.35
Kappa:  50 / 0.683 / 0.35

Pitch Type: FF
Percent Whiffs:  0.759
Leaves/Score/Kappa for best:
Score:  100 / 0.76 / 0.023
Kappa:  25 / 0.759 / 0.053

Pitch Type: FT
Percent Whiffs:  0.799
Leaves/Score/Kappa for best:
Score:  25 / 0.802 / 0.049
Kappa:  25 / 0.802 / 0.049

Pitch Type: CH
Percent Whiffs:  0.536
Leaves/Score/Kappa for best:
Score:  50 / 0.631 / 0.249
Kappa:  50 / 0.631 / 0.249

Pitch Type: CU
Percent Whiffs:  0.533
Leaves/Score/Kappa for best:
Score:  25 / 0.706 / 0.405
Kappa:  25 / 0.706 / 0.405

Pitch Type: FS
Percent Whiffs:  0.553
Leaves/Score/Kappa for best:
Score:  100 / 0.66 / 0.301
Kappa:  100 / 0.66 / 0.301


# Get Predicted Means Table

In [38]:
def get_pred_means_table(df,covariates,leaf, subset = None):
    """Rank pitchers by their mean standardized predicted-whiff logit.

    ``df`` is split 50/50 (``random_state=42``) into a training and a test
    half. A ``RandomForestClassifier(min_samples_leaf=leaf)`` is fit on the
    training half to predict ``whiff`` from ``covariates``; the test-half
    predicted probabilities are put on the logit scale, standardized over the
    whole test half, and averaged per ``pitcher_name``.

    The test-half contact rate (``1 - mean(whiff)``) and the classifier's
    test accuracy are printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``whiff``, ``pitcher_name`` and every column in ``covariates``.
    covariates : list
        Feature column names. Object/categorical columns are one-hot encoded.
    leaf : int
        ``min_samples_leaf`` for the random forest.
    subset : number, optional
        If given, keep only pitchers with more than ``subset`` usable predictions.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pitcher_name`` with columns ``('pred_z', 'count')`` and
        ``('pred_z', 'mean')``, sorted by the mean in descending order.
    """
    def _encode(frame, drop_last):
        """One-hot encode object/categorical columns; pass the others through."""
        pieces = []
        for col in frame.columns:
            column = frame[col]
            if column.dtype == object or hasattr(column, 'cat'):
                # prefix keeps dummy names unique across different covariates
                dummies = pd.get_dummies(column, prefix=col)
                # .iloc is strictly positional; the old .ix could treat ':-1'
                # as a label slice and is gone from current pandas.
                pieces.append(dummies.iloc[:, :-1] if drop_last else dummies)
            else:
                pieces.append(column)
        return pd.concat(pieces, axis=1)

    df_p_train, df_p_test = train_test_split(df, test_size=0.5, random_state=42)

    #Test and train Y/X datasets
    Y_train = df_p_train.whiff
    Y_test = df_p_test.whiff

    # The training half drops the last dummy of each categorical column as the
    # reference level. The test half is encoded with every level and then
    # aligned to the training columns, so both halves share one layout even
    # when they do not contain the same set of levels (levels that are the
    # training reference, or unseen in training, become all-zero rows).
    X_train = _encode(df_p_train[covariates], drop_last=True)
    X_test = _encode(df_p_test[covariates], drop_last=False)
    if not X_test.columns.equals(X_train.columns):
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    #Fit the Random Forest with the best leaf
    clf = RandomForestClassifier(min_samples_leaf=leaf)
    clf.fit(X_train,Y_train)

    print(1-Y_test.mean())
    print(clf.score(X_test,Y_test))

    #Predict probabilities: column 1 is the second class in clf.classes_
    #(the whiff class, assuming whiff is coded 0/1)
    pred_probs = clf.predict_proba(X_test)[:,1]

    #Take the logit
    pred_stuff = logit(pred_probs)

    pred_mat = df_p_test[["pitcher_name"]].copy()
    pred_mat['pred_stuff'] = pred_stuff

    #Predicted probabilities of exactly 0 or 1 give -inf/+inf logits; blank
    #both out so they cannot distort the mean/sd used for standardization
    pred_mat['pred_stuff'] = pred_mat['pred_stuff'].where(np.isfinite(pred_mat['pred_stuff']))

    #mean/sd for standardization (NaNs are skipped)
    overall_mean = pred_mat.pred_stuff.mean()
    overall_sd = pred_mat.pred_stuff.std()

    #Standardize
    pred_mat['pred_z'] = (pred_mat['pred_stuff'] - overall_mean)/overall_sd

    #group by pitcher name and average the stuff;
    #a list (not a set) keeps the count/mean column order fixed
    group_pred = pred_mat.groupby('pitcher_name')
    mapped_funs = {'pred_z': ['count','mean']}
    pred_means = group_pred.agg(mapped_funs)

    #Only show pitchers with more than `subset` usable predictions
    if(subset is not None):
        pred_means = pred_means[pred_means[('pred_z','count')]>subset]

    pred_means = pred_means.sort_values(('pred_z','mean'),ascending = False)

    return(pred_means)

# Train a model per pitch type with its best leaf, then attach predictions to another dataset

**Need to train the model on 2012/2016 swings, but attach its predictions to all 2013-2015 pitches**

In [39]:
def add_stuff(train_dset, pred_dset, best_fits, covariates):
    """Score ``pred_dset`` with per-pitch-type models trained on ``train_dset``.

    For every pitch type in ``train_dset`` a
    ``RandomForestClassifier(min_samples_leaf=best_fits[pitch]['leaf'])`` is
    fit to predict ``whiff`` from ``covariates``. The matching rows of
    ``pred_dset`` are scored, the predicted probabilities are put on the
    logit scale, and the logits are standardized within the pitch type and
    stored in a new ``z_stuff`` column.

    Parameters
    ----------
    train_dset : pandas.DataFrame
        Training rows; needs ``pitch_type``, ``whiff`` and ``covariates``.
    pred_dset : pandas.DataFrame
        Rows to score; needs ``pitch_type`` and ``covariates``.
    best_fits : dict
        ``{pitch: {'leaf': int, ...}}``; must have exactly one entry per
        pitch type in ``train_dset``.
    covariates : list
        Feature column names. Object/categorical columns are one-hot encoded.

    Returns
    -------
    pandas.DataFrame or None
        The scored rows of ``pred_dset`` (grouped by pitch type) with an added
        ``z_stuff`` column. ``None`` is returned, after printing a message,
        when ``best_fits`` does not match the pitch types in ``train_dset``.
        Rows of ``pred_dset`` whose pitch type is absent from ``train_dset``
        are not scored; a message is printed when that makes the row counts differ.
    """
    def _encode(frame, drop_last):
        """One-hot encode object/categorical columns; pass the others through."""
        pieces = []
        for col in frame.columns:
            column = frame[col]
            if column.dtype == object or hasattr(column, 'cat'):
                # prefix keeps dummy names unique across different covariates
                dummies = pd.get_dummies(column, prefix=col)
                # .iloc is strictly positional; the old .ix could treat ':-1'
                # as a label slice and is gone from current pandas.
                pieces.append(dummies.iloc[:, :-1] if drop_last else dummies)
            else:
                pieces.append(column)
        return pd.concat(pieces, axis=1)

    #Train all the pitches
    all_pitches = train_dset.pitch_type.unique()
    if(len(all_pitches)!=len(best_fits)):
        print('Error: Num pitch types in fits dictionary not same as num pitch types in dset')
        return

    scored = []
    for pitch in all_pitches:
        if(pitch not in best_fits.keys()):
            print(pitch+' not in fits dictionary')
            return

        #Nothing to score for this pitch type: predict_proba rejects empty input
        pred_df = pred_dset[pred_dset.pitch_type==pitch].copy()
        if pred_df.empty:
            continue

        train_df = train_dset[train_dset.pitch_type==pitch]
        X_train = _encode(train_df[covariates], drop_last=True)
        Y_train = train_df.whiff

        leaf = best_fits[pitch]['leaf']
        clf =  RandomForestClassifier(min_samples_leaf=leaf)
        clf.fit(X_train,Y_train)

        ##Now, predict on new data
        # Encode every level, then align to the training layout so both frames
        # share the same columns even if their category levels differ.
        X_pred = _encode(pred_df[covariates], drop_last=False)
        if not X_pred.columns.equals(X_train.columns):
            X_pred = X_pred.reindex(columns=X_train.columns, fill_value=0)

        #Predict probabilities: column 1 is the second class in clf.classes_
        #(the whiff class, assuming whiff is coded 0/1)
        pred_probs = clf.predict_proba(X_pred)[:,1]

        #Take the logit; probabilities of exactly 0 or 1 give -inf/+inf, which
        #would turn every z-score of this pitch type into NaN, so blank them out
        pred_stuff = logit(pred_probs)
        pred_stuff[~np.isfinite(pred_stuff)] = np.nan

        #Standardize within the pitch type (population sd, NaNs skipped)
        overall_mean = np.nanmean(pred_stuff)
        overall_sd = np.nanstd(pred_stuff)

        pred_df['z_stuff'] = (pred_stuff - overall_mean)/overall_sd
        scored.append(pred_df)

    #Concatenate once instead of growing a frame inside the loop
    if scored:
        out_mat = pd.concat(scored)
    else:
        out_mat = pred_dset.iloc[0:0].copy()
        out_mat['z_stuff'] = np.nan

    if(out_mat.shape[0]!= pred_dset.shape[0]):
        print("You got more than zero problems big fella")

    return(out_mat)

    

In [40]:
# Train on the 2012 and 2016 seasons; predict on the 2013-2015 seasons
final_train = df[df.year.isin([2012, 2016])]
final_pred = df[df.year.between(2013, 2015)]

In [41]:
# Fit per-pitch-type models on the training seasons and score the prediction seasons
jags_dset = add_stuff(
    train_dset=final_train,
    pred_dset=final_pred,
    best_fits=best,
    covariates=covars,
)

In [42]:
# Per-pitcher number of scored pitches and mean z_stuff, best first
jags_group = jags_dset.groupby('pitcher_name')

# A list (not a set) keeps the count/mean column order the same on every run
mapped_funs = {'z_stuff': ['count','mean']}
z_stuff = jags_group.agg(mapped_funs)
z_stuff.sort_values(('z_stuff','mean'),ascending=False)

Unnamed: 0_level_0,z_stuff,z_stuff
Unnamed: 0_level_1,count,mean
pitcher_name,Unnamed: 1_level_2,Unnamed: 2_level_2
Zachary Britton,646,1.550312
Jeff Beliveau,61,1.066032
Craig Kimbrel,1198,0.990807
Aroldis Chapman,1267,0.948193
Kyuji Fujikawa,93,0.926513
Matt Daley,52,0.910518
Jacob McGee,1166,0.902106
Brayan Villarreal,37,0.817641
Austin Adams,144,0.791438
Michael Kohn,442,0.765918


In [64]:
# (rows, columns) of the frame that was scored
final_pred.shape

(193706, 51)

time: 3.18 ms


**This is where the money is made**

pandas.core.groupby.DataFrameGroupBy