In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [285]:
import pandas as pd
import numpy as np

import game_file_preparation as gfp

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as dates
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings(action='once')

# The Focus

For the reasons stated in the batter model exploration, tradtional time series and panel data approaches are not possible. There are too many different players involved within the meaningfully unbalanced data set. Therefore, the random forest/gradient boosting machine approach is the best way to handle the data. For pitching data, the question is how to incorporate the much more present off-days in the more variant pitching schedule. Starters traditionally have 4 days between starts (sometimes 5 or more if there is a team off-day), while bullpen arms can either have no off-day, or multiple days off.

The approach will be the same as batters (lag and rolling average), but the width of these rolling average bins will be the main focus of determining the best model.

In [271]:
train_set = pd.read_csv('data/train/pitchers_condensed_train.csv')
test_set = pd.read_csv('data/test/pitchers_condensed_test.csv')

In [272]:
drop_columns = ['Unnamed: 0', 'pitch_type', 'release_speed', 'release_pos_x', 'release_pos_z', 'batter', 'events', 'zone', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'bb_type', 'balls', 
                'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning', 'fielder_2', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed', 
                'release_spin_rate', 'release_extension', 'game_pk', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y', 'babip_value', 'at_bat_number',
                'pitch_number', 'pitch_name', 'home_score', 'away_score', 'bat_score', 'fld_score', 'spin_axis', 'batter_name', 'pitcher_name', 'game_month', 'game_day', 'bat_event', 'spin_x', 'spin_z', 
                'attribute_0', 'attribute_1', 'attribute_2', 'attribute_3', 'attribute_4', 'attribute_5', 'attribute_6', 'attribute_7', 'attribute_8']

train_set.drop(drop_columns,axis=1,inplace=True)
test_set.drop(drop_columns,axis=1,inplace=True)

In [273]:
train_set.fillna(0,inplace=True)
test_set.fillna(0,inplace=True)

In [274]:
def fill_dates(data):
    pd_out = pd.DataFrame()
    
    for pitcher in data.pitcher.unique():
        temp_df = data[data.pitcher == pitcher]
        pd_out = pd_out.append(temp_df.asfreq('D'))
        pd_out['pitcher'].fillna(pitcher, inplace=True)
        
    data = pd_out
    data.fillna(0, inplace=True)
    
    return data
    
def shift_target(data):
    data[['next_estimated_ba_using_speedangle','next_play','day_of_week','month','year']] = data.groupby('pitcher')[['estimated_ba_using_speedangle','play','day_of_week','month','year']].shift(-1)
    
    return data

def type_set(data):
    int_cols = ['k','bb','pa','pitch_count','pitcher','play']
    
    for col in data.columns:
        if ('cluster' in col) and ('attribute' not in col):
            int_cols.append(col)
    
    data.loc[:,data.columns.isin(int_cols)] = data.loc[:,data.columns.isin(int_cols)].astype('int')
    data.loc[:,~data.columns.isin(int_cols)] = data.loc[:,~data.columns.isin(int_cols)].astype('float')
    
    return data
    
def initial_clean(data):
    temp_class = gfp.BatterPrep()
    
    temp_class.play(data)
    data = temp_class.game_date_to_index(data)
    data = fill_dates(data)
    temp_class.date_info(data)
    data = type_set(data)
    data = shift_target(data)    
    temp_class.per_pa(data)
    
    return data

In [275]:
# rolling mean by date
def rolling_data(data,roll_amount,target):
    mean_name = target + '_mean_' + str(roll_amount)
    
    temp_df = data.groupby('pitcher')[[target,'play']].rolling(roll_amount).sum().reset_index(level=0,drop=False)
    temp_df.columns = ['pitcher', target, 'play']
    
    mask = (temp_df.play != 0)
    temp_df.loc[mask,mean_name] = temp_df.loc[mask,target] / temp_df.loc[mask,'play']
    
    temp_df.drop([target,'play'],axis = 1,inplace=True)
    
    return data.merge(temp_df, on=['game_date','pitcher'])
        
# rolling mean weighted by plate appearances
def rolling_weighted_data(data,roll_amount,target):
    data['weighted_pa'] = data.pa - data.bb
    data['weighted_target'] = data[target] * data.weighted_pa
    
    name = target + '_mean_weighted_' + str(roll_amount)
    
    temp_df = data.groupby('pitcher')[['weighted_target','weighted_pa']].rolling(roll_amount).sum().reset_index(level=0,drop=False)
    
    mask = (temp_df.weighted_pa != 0)
    temp_df.loc[mask,name] = temp_df.loc[mask,'weighted_target'] / temp_df.loc[mask,'weighted_pa']
    
    temp_df.drop(['weighted_target','weighted_pa'],axis = 1,inplace=True)
    data.drop(['weighted_target','weighted_pa'],axis=1,inplace=True)
    
    return data.merge(temp_df, on=['game_date','pitcher'], how='inner')
    
# introduces log of previous xBA
def lag_features(data,n):
    cols = ['bb_per_pa','k_per_pa','play','estimated_ba_using_speedangle']
    
    for col in cols:
        for i in range(n):
            name = col + '_' + str(i+1)
            
            if i > 0:
                prev_name = col + '_' + str(i)
                data[name] = data.groupby('pitcher')[prev_name].shift(1)
            else:
                data[name] = data.groupby('pitcher')[col].shift(1)
                  
    return data
    
# combines rolling and lag methods
def depth_features(data,depth,bin_size,roll_vars=['estimated_ba_using_speedangle','k_per_pa','bb_per_pa']):
    temp_df = data.copy()
    for item in roll_vars:
        for i in range(1,depth+1):
            if i % bin_size == 0:
                temp_df = rolling_data(temp_df,i,item)
                temp_df = rolling_weighted_data(temp_df,i,item)
    temp_df = lag_features(temp_df,depth)
    
    return temp_df

def depth_finish(data,depth,bin_size,roll_vars=['estimated_ba_using_speedangle','k_per_pa','bb_per_pa']):
    temp_class = gfp.BatterPrep()
    data = depth_features(data,depth,bin_size,roll_vars)
    
    return temp_class.data_clean(data)

In [276]:
main_cols = ['pitcher','next_estimated_ba_using_speedangle','pa', 'pitch_count','play','next_play']

metric_cols = ['k','bb','estimated_ba_using_speedangle','k_per_pa','bb_per_pa','day_of_week','month','year']
cluster_cols = ['cluster_attribute_0', 'cluster_attribute_0_max', 'cluster_attribute_0_min', 'cluster_attribute_1', 'cluster_attribute_1_max', 'cluster_attribute_1_min', 'cluster_attribute_2', 
                'cluster_attribute_2_max', 'cluster_attribute_2_min', 'cluster_attribute_3', 'cluster_attribute_3_max', 'cluster_attribute_3_min', 'cluster_attribute_4', 'cluster_attribute_4_max', 
                'cluster_attribute_4_min', 'cluster_attribute_5', 'cluster_attribute_5_max', 'cluster_attribute_5_min', 'cluster_attribute_6', 'cluster_attribute_6_max', 'cluster_attribute_6_min', 
                'cluster_attribute_7', 'cluster_attribute_7_max', 'cluster_attribute_7_min', 'cluster_attribute_8', 'cluster_attribute_8_max', 'cluster_attribute_8_min', 'cluster_attribute_9', 
                'cluster_attribute_9_max', 'cluster_attribute_9_min', 'cluster_attribute_10', 'cluster_attribute_10_max', 'cluster_attribute_10_min', 'cluster_attribute_11', 'cluster_attribute_11_max',
                'cluster_attribute_11_min', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 
                'cluster_10', 'cluster_11']

metric_cols.extend(main_cols)
cluster_cols.extend(main_cols)

In [277]:
train_set = initial_clean(train_set)
test_set = initial_clean(test_set)

In [278]:
train_metrics = train_set[metric_cols].copy()
test_metrics = test_set[metric_cols].copy()

train_cluster = train_set[cluster_cols].copy()
test_cluster = test_set[cluster_cols].copy()

In [279]:
train_set.columns

Index(['pitcher', 'bb', 'k', 'estimated_ba_using_speedangle',
       'cluster_attribute_0', 'cluster_attribute_0_max',
       'cluster_attribute_0_min', 'cluster_attribute_1',
       'cluster_attribute_1_max', 'cluster_attribute_1_min',
       'cluster_attribute_2', 'cluster_attribute_2_max',
       'cluster_attribute_2_min', 'cluster_attribute_3',
       'cluster_attribute_3_max', 'cluster_attribute_3_min',
       'cluster_attribute_4', 'cluster_attribute_4_max',
       'cluster_attribute_4_min', 'cluster_attribute_5',
       'cluster_attribute_5_max', 'cluster_attribute_5_min',
       'cluster_attribute_6', 'cluster_attribute_6_max',
       'cluster_attribute_6_min', 'cluster_attribute_7',
       'cluster_attribute_7_max', 'cluster_attribute_7_min',
       'cluster_attribute_8', 'cluster_attribute_8_max',
       'cluster_attribute_8_min', 'cluster_attribute_9',
       'cluster_attribute_9_max', 'cluster_attribute_9_min',
       'cluster_attribute_10', 'cluster_attribute_10_max',
    

In [291]:
train_metrics_4 = depth_finish(train_metrics,18,4)
train_metrics_5 = depth_finish(train_metrics,18,5)
train_metrics_6 = depth_finish(train_metrics,18,6)

train_metric_list = [train_metrics_4, train_metrics_5, train_metrics_6]

In [292]:
test_metrics_4 = depth_finish(test_metrics,18,4)
test_metrics_5 = depth_finish(test_metrics,18,5)
test_metrics_6 = depth_finish(test_metrics,18,6)

test_metric_list = [test_metrics_4, test_metrics_5, test_metrics_6]

In [293]:
train_pred = []
test_pred = []

for i in range(len(train_metric_list)):
    train_temp = train_metric_list[i]
    test_temp = test_metric_list[i]
    
    model = RandomForestRegressor()
    
    x_train = train_temp.loc[:,~train_temp.columns.isin(['next_estimated_ba_using_speedangle'])]
    y_train = train_temp.loc[:,train_temp.columns.isin(['next_estimated_ba_using_speedangle'])]
    
    x_test = test_temp.loc[:,~test_temp.columns.isin(['next_estimated_ba_using_speedangle'])]
    #y_test = test_temp.loc[:,test_temp.columns.isin('next_estimated_ba_using_speedangle')]
    
    rf_temp = model.fit(x_train,y_train)
    
    train_pred.append(rf_temp.predict(x_train))
    test_pred.append(rf_temp.predict(x_test))

  
  
  


In [294]:
for i in range(len(train_metric_list)):
    train_temp = train_metric_list[i]
    test_temp = test_metric_list[i]
    
    y_train = train_temp.loc[:,train_temp.columns.isin(['next_estimated_ba_using_speedangle'])]
    y_test = test_temp.loc[:,test_temp.columns.isin(['next_estimated_ba_using_speedangle'])]
    
    print('Train - Bins: ' + str(i+4))
    print('    mse: ' + str(mean_squared_error(y_train,train_pred[i])))
    print('    mae: ' + str(mean_absolute_error(y_train,train_pred[i])))
    print('Test - Bins: ' + str(i+4))
    print('    mse: ' + str(mean_squared_error(y_test,test_pred[i])))
    print('    mae: ' + str(mean_absolute_error(y_test,test_pred[i])))

Train - Bins: 4
    mse: 0.003941648596042845
    mae: 0.04942596661674824
Test - Bins: 4
    mse: 0.02413572302357967
    mae: 0.12472334301124192
Train - Bins: 5
    mse: 0.003498266592668973
    mae: 0.04569974651003261
Test - Bins: 5
    mse: 0.022251200985367762
    mae: 0.11766945115612257
Train - Bins: 6
    mse: 0.0032550658683814944
    mae: 0.04357566720787102
Test - Bins: 6
    mse: 0.020570166381995502
    mae: 0.11167152093185564


# Results of Bin

6 length bins are the ideal bin size. This has the best results and will be the depth going forward for pitchers. This model, despite not being properly tuned, is already better performing than using rolling averages or lagged variables, which can be seen below.

In [302]:
for i in range(len(train_metric_list)):
    df = train_metric_list[i]
    print(i + 4)
    y_train = df.next_estimated_ba_using_speedangle
    for col in df.columns:
        if "estimated_ba_using_speedangle" in col and 'next' not in col:
            print('   ' + col)
            print('      mse: ' + str(mean_squared_error(y_train,df[col])))
            print('      mae: ' + str(mean_absolute_error(y_train,df[col])))

4
   estimated_ba_using_speedangle
      mse: 0.07027561738188998
      mae: 0.21044960888534353
   estimated_ba_using_speedangle_mean_4
      mse: 0.04757804890774303
      mae: 0.16945943500524357
   estimated_ba_using_speedangle_mean_weighted_4
      mse: 0.04721365069429413
      mae: 0.16912058012314687
   estimated_ba_using_speedangle_mean_8
      mse: 0.037915215526246375
      mae: 0.15204877181675208
   estimated_ba_using_speedangle_mean_weighted_8
      mse: 0.03709990720335876
      mae: 0.1513427598351456
   estimated_ba_using_speedangle_mean_12
      mse: 0.03453950701423546
      mae: 0.14552103480631232
   estimated_ba_using_speedangle_mean_weighted_12
      mse: 0.03385757038919658
      mae: 0.14491649664357348
   estimated_ba_using_speedangle_mean_16
      mse: 0.032970116792404434
      mae: 0.14229721288300434
   estimated_ba_using_speedangle_mean_weighted_16
      mse: 0.032351174928139236
      mae: 0.14183420187482512
   estimated_ba_using_speedangle_1
      mse:

# Pitch Cluster Analysis

I want to see if the progression of pitch cluster can achieve a similar effect as performance.

In [304]:
info_cols = [item for item in cluster_cols if item not in main_cols]

In [305]:
len(info_cols)

48

In [329]:
train_cluster = train_cluster.dropna()
test_cluster = test_cluster.dropna()

train_cluster = train_cluster[train_cluster.next_play != 0]
test_cluster = test_cluster[test_cluster.next_play != 0]

x_train_c = train_cluster.loc[:,train_cluster.columns.isin(info_cols)]
y_train_c = train_cluster.loc[:,train_cluster.columns.isin(['next_estimated_ba_using_speedangle'])]

x_test_c = test_cluster.loc[:,test_cluster.columns.isin(info_cols)]
y_test_c = test_cluster.loc[:,test_cluster.columns.isin(['next_estimated_ba_using_speedangle'])]

In [330]:
train_cluster

Unnamed: 0_level_0,cluster_attribute_0,cluster_attribute_0_max,cluster_attribute_0_min,cluster_attribute_1,cluster_attribute_1_max,cluster_attribute_1_min,cluster_attribute_2,cluster_attribute_2_max,cluster_attribute_2_min,cluster_attribute_3,...,cluster_8,cluster_9,cluster_10,cluster_11,pitcher,next_estimated_ba_using_speedangle,pa,pitch_count,play,next_play
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-05,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,605218,0.000000,0,0,0,1.0
2017-04-08,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,605218,0.071000,0,0,0,1.0
2017-04-09,0.066936,0.081136,0.052454,0.040344,0.053147,0.028515,0.098662,0.160943,0.013687,0.128661,...,0,0,0,0,605218,0.000000,3,12,1,1.0
2017-04-12,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,605218,0.388667,0,0,0,1.0
2017-04-17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,605218,0.233333,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,676979,0.135000,0,0,0,1.0
2020-09-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,656240,0.422600,0,0,0,1.0
2020-09-22,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,571788,0.344333,0,0,0,1.0
2020-09-24,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,0,0,0,656382,0.205000,0,0,0,1.0


In [323]:
test_model = RandomForestRegressor()

test_model.fit(x_train_c,y_train_c)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor()

In [324]:
print('Train')
print('    mse: ' + str(mean_squared_error(y_train_c,test_model.predict(x_train_c))))
print('    mae: ' + str(mean_absolute_error(y_train_c,test_model.predict(x_train_c))))
print('Test')
print('    mse: ' + str(mean_squared_error(y_test_c,test_model.predict(x_test_c))))
print('    mae: ' + str(mean_absolute_error(y_test_c,test_model.predict(x_test_c))))

Train
    mse: 0.01757699233861749
    mae: 0.09908194104094341
Test
    mse: 0.020737125173994838
    mae: 0.11109178458084429


Wow... This is the best performing model so far. Pitcher repertoire is profoundly important.

In [325]:
feature_importance = {}
feature_var = train_cluster.loc[:,~train_cluster.columns.isin(['next_estimated_ba_using_speedangle'])].columns

for i in range(len(train_cluster[feature_var].columns)):
    feature_importance[train_cluster[feature_var].columns[i]] = test_model.feature_importances_[i]

In [326]:
{k: v for k, v in sorted(feature_importance.items(), key=lambda item: item[1],reverse=True)}

{'pitcher': 0.2600147636604278,
 'cluster_attribute_10_max': 0.036253933741905824,
 'cluster_attribute_7_min': 0.03506030416992959,
 'cluster_attribute_7_max': 0.03245996139172861,
 'cluster_attribute_1_max': 0.03226758369850405,
 'cluster_attribute_10': 0.03132684734137398,
 'cluster_attribute_7': 0.0313051952400313,
 'cluster_attribute_1_min': 0.0312400211460112,
 'pitch_count': 0.02966353575693973,
 'cluster_attribute_1': 0.028793255835045647,
 'cluster_attribute_10_min': 0.02856832637740546,
 'cluster_attribute_8_max': 0.027015096185260268,
 'cluster_attribute_8': 0.022156402440575634,
 'cluster_attribute_8_min': 0.020258885690017065,
 'cluster_attribute_2_max': 0.01654866156871301,
 'cluster_attribute_6_max': 0.01640477525260891,
 'cluster_attribute_9_max': 0.016235493446627097,
 'cluster_attribute_5_min': 0.014536163480650264,
 'cluster_attribute_3_max': 0.014184996077834473,
 'cluster_attribute_2': 0.014121365586475932,
 'cluster_attribute_3_min': 0.013753665469672077,
 'cluster

In [None]:
# rolling mean by date
def rolling_data(data,roll_amount,target):
    mean_name = target + '_mean_' + str(roll_amount)
    
    temp_df = data.groupby('pitcher')[[target,'play']].rolling(roll_amount).sum().reset_index(level=0,drop=False)
    temp_df.columns = ['pitcher', target, 'play']
    
    mask = (temp_df.play != 0)
    temp_df.loc[mask,mean_name] = temp_df.loc[mask,target] / temp_df.loc[mask,'play']
    
    temp_df.drop([target,'play'],axis = 1,inplace=True)
    
    return data.merge(temp_df, on=['game_date','pitcher'])
        
# rolling mean weighted by plate appearances
def rolling_weighted_data(data,roll_amount,target):
    data['weighted_pa'] = data.pa - data.bb
    data['weighted_target'] = data[target] * data.weighted_pa
    
    name = target + '_mean_weighted_' + str(roll_amount)
    
    temp_df = data.groupby('pitcher')[['weighted_target','weighted_pa']].rolling(roll_amount).sum().reset_index(level=0,drop=False)
    
    mask = (temp_df.weighted_pa != 0)
    temp_df.loc[mask,name] = temp_df.loc[mask,'weighted_target'] / temp_df.loc[mask,'weighted_pa']
    
    temp_df.drop(['weighted_target','weighted_pa'],axis = 1,inplace=True)
    data.drop(['weighted_target','weighted_pa'],axis=1,inplace=True)
    
    return data.merge(temp_df, on=['game_date','pitcher'], how='inner')
    
# introduces log of previous xBA
def lag_features(data,n):
    cols = ['bb_per_pa','k_per_pa','play','estimated_ba_using_speedangle']
    
    for col in cols:
        for i in range(n):
            name = col + '_' + str(i+1)
            
            if i > 0:
                prev_name = col + '_' + str(i)
                data[name] = data.groupby('pitcher')[prev_name].shift(1)
            else:
                data[name] = data.groupby('pitcher')[col].shift(1)
                  
    return data
    
# combines rolling and lag methods
def depth_features(data,depth,bin_size,roll_vars=['estimated_ba_using_speedangle','k_per_pa','bb_per_pa']):
    temp_df = data.copy()
    for item in roll_vars:
        for i in range(1,depth+1):
            if i % bin_size == 0:
                temp_df = rolling_data(temp_df,i,item)
                temp_df = rolling_weighted_data(temp_df,i,item)
    temp_df = lag_features(temp_df,depth)
    
    return temp_df

def depth_finish(data,depth,bin_size,roll_vars=['estimated_ba_using_speedangle','k_per_pa','bb_per_pa']):
    temp_class = gfp.BatterPrep()
    data = depth_features(data,depth,bin_size,roll_vars)
    
    return temp_class.data_clean(data)