In [1]:
# Widen the notebook container so wide DataFrame outputs are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [1]:
import pandas as pd
import numpy as np

import libs.split_data as sd

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [3]:
import os

In [4]:
# Load the pre-split matchup tables; both paths are relative to the working directory.
train_set = pd.read_csv('data/train/matchups_train.csv')
test_set = pd.read_csv('data/test/matchups_test.csv')

In [5]:
# MatchupPrep comes from the project-local libs.split_data module (not visible here).
# NOTE(review): game_date_to_index presumably moves `game_date` into the index, since
# later cells group and roll on the index — confirm against libs/split_data.py.
mp = sd.MatchupPrep()

train_set = mp.game_date_to_index(train_set)
test_set = mp.game_date_to_index(test_set)

In [6]:
# Keep every column whose name contains 'cluster', plus the id, count and target columns
# that the feature-building functions below read.
columns_used = [col for col in train_set.columns if ('cluster' in col)]
columns_used.extend(['batter', 'pitcher', 'bb', 'events', 'pa', 'estimated_ba_using_speedangle', 'estimated_ba_using_speedangle_list'])

In [7]:
# Restrict both tables to the same column subset (train_set / test_set are overwritten).
train_set = train_set[columns_used]
test_set = test_set[columns_used]

# Pitchers

In [8]:
def rolling_cluster_hard(data,depth_num,depth_min=3,depth_type='D'):
    """Rolling per-pitcher share of each hard cluster count over a trailing window.

    Parameters
    ----------
    data : DataFrame whose index can be parsed as dates and that has a ``pitcher``
        column, ``pa`` and ``bb`` columns, and count columns whose names contain
        ``cluster`` but neither ``attribute`` nor ``list``.
    depth_num, depth_type : joined into the rolling window offset, e.g. ``20`` and
        ``'D'`` give ``'20D'``.
    depth_min : minimum number of daily rows required inside the window.

    Returns
    -------
    DataFrame indexed by ``pitcher`` and the date index, holding only the
    ``*_per_pa`` columns produced by ``hard_per_pa``; rows containing NaN are dropped.
    The input frame is not modified.
    """
    data = data.copy()

    cluster_cols = [col for col in data.columns if 'cluster' in col and 'attribute' not in col and 'list' not in col]
    cluster_cols.extend(['pa','bb'])
    depth = str(depth_num) + depth_type

    data.index = pd.to_datetime(data.index)

    # Collapse to one row per pitcher and day before rolling.
    data = data.groupby(['pitcher',data.index])[cluster_cols].sum().reset_index(level=0)
    # closed='left' leaves the current day out of its own window.
    # `pitcher` has to remain a column here because set_index below re-attaches it.
    temp_data = data.groupby(['pitcher'], as_index=False)[cluster_cols].rolling(depth,closed='left',min_periods=depth_min).sum()
    # The original called temp_data.reset_index(level=0) here and discarded the result;
    # that dead statement has been removed.

    temp_data.columns = [col + '_roll' if col in cluster_cols else col for col in temp_data.columns]

    temp_data = hard_per_pa(temp_data)

    hard_cols = [col for col in temp_data.columns if 'per_pa' in col]
    temp_data = temp_data.set_index(['pitcher',temp_data.index])

    return temp_data[hard_cols].dropna()
    

def hard_per_pa(data):
    """Add a ``<column>_per_pa`` column for every rolled hard-cluster column.

    A column is selected when its name contains ``cluster`` and ``roll`` but neither
    ``attribute`` nor ``list``. Each one is divided by ``pa_roll + bb_roll``.
    The frame is modified in place and also returned.
    """
    def _is_rolled_hard_cluster(name):
        if 'attribute' in name or 'list' in name:
            return False
        return 'cluster' in name and 'roll' in name

    # Snapshot the names first so the columns added below are not picked up again.
    rolled_cols = [name for name in data.columns if _is_rolled_hard_cluster(name)]

    for source in rolled_cols:
        data[source + '_per_pa'] = data[source] / (data['pa_roll'] + data['bb_roll'])

    return data

In [268]:
temp_df = train_set.copy()
temp_df = rolling_cluster_hard(temp_df,20)

In [269]:
len(temp_df.index)

73702

In [270]:
temp_df

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_0_roll_per_pa,cluster_1_roll_per_pa,cluster_2_roll_per_pa,cluster_3_roll_per_pa,cluster_4_roll_per_pa,cluster_5_roll_per_pa,cluster_6_roll_per_pa,cluster_7_roll_per_pa,cluster_8_roll_per_pa,cluster_9_roll_per_pa,cluster_10_roll_per_pa
pitcher,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
112526.0,2017-04-21,0.0,0.000000,0.0,0.015873,0.000000,0.015873,0.587302,0.0,0.380952,0.000000,0.000000
112526.0,2017-04-28,0.0,0.000000,0.0,0.000000,0.000000,0.013514,0.689189,0.0,0.297297,0.000000,0.000000
112526.0,2017-05-03,0.0,0.000000,0.0,0.012658,0.000000,0.025316,0.696203,0.0,0.265823,0.000000,0.000000
112526.0,2017-05-09,0.0,0.000000,0.0,0.012987,0.000000,0.012987,0.597403,0.0,0.376623,0.000000,0.000000
112526.0,2017-05-15,0.0,0.000000,0.0,0.042254,0.000000,0.042254,0.605634,0.0,0.295775,0.014085,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
685493.0,2020-09-27,0.0,0.028571,0.0,0.057143,0.057143,0.142857,0.400000,0.0,0.257143,0.028571,0.028571
685503.0,2021-04-19,0.0,0.578947,0.0,0.140351,0.000000,0.000000,0.122807,0.0,0.087719,0.052632,0.017544
685503.0,2021-04-25,0.0,0.607143,0.0,0.142857,0.000000,0.035714,0.125000,0.0,0.071429,0.017857,0.000000
685503.0,2021-04-30,0.0,0.425926,0.0,0.129630,0.000000,0.055556,0.111111,0.0,0.259259,0.000000,0.018519


In [12]:
# Build a modelling frame from the rolled pitcher features to rank them by importance.
# NOTE(review): rolling_cluster_hard, as defined in this notebook, already calls hard_per_pa
# and returns only the '*_per_pa' columns, so the second hard_per_pa call and the
# 'batter' / 'pitcher' / 'bb' / 'pa' removals below look like leftovers from an earlier
# version of that function — confirm this cell still runs on a fresh kernel.
train = train_set.copy()
temp_train = rolling_cluster_hard(train,20)
temp_train = hard_per_pa(temp_train)

# Start from the columns that are not attribute/list/events/cluster columns, then add
# every rolled cluster column.
cols_for_use = [col for col in temp_train.columns if ('attribute' not in col) & ('list' not in col) & ('events' not in col) & ('cluster' not in col)]
cols_to_add = [col for col in temp_train.columns if ('cluster' in col) & ('roll' in col)]
cols_for_use.extend(cols_to_add)
# list.remove raises ValueError if the name is absent.
cols_for_use.remove('batter')
cols_for_use.remove('pitcher')
cols_for_use.remove('bb')
cols_for_use.remove('pa')

temp_train = temp_train[cols_for_use]
temp_train.dropna(inplace=True)

In [13]:
test = test_set.copy()
temp_test = rolling_cluster_hard(test,20)
temp_test = hard_per_pa(temp_test)

In [14]:
# Fit a default random forest on every column except the target.
# NOTE(review): the target is selected with a boolean column mask, so y is passed as a
# one-column DataFrame rather than a 1-D Series; no random_state is set.
rf = RandomForestRegressor()
rf.fit(temp_train.loc[:,temp_train.columns!='estimated_ba_using_speedangle'],temp_train.loc[:,temp_train.columns=='estimated_ba_using_speedangle'])

  


RandomForestRegressor()

In [15]:
# Drop the target from the feature list. The membership check makes the cell safe to
# re-run: an unguarded list.remove raises ValueError once the name is already gone.
if 'estimated_ba_using_speedangle' in cols_for_use:
    cols_for_use.remove('estimated_ba_using_speedangle')

# Pair each feature column with its importance. feature_importances_ follows the column
# order of the frame passed to rf.fit, which matches cols_for_use without the target.
feature_importance = dict(zip(temp_train[cols_for_use].columns, rf.feature_importances_))

In [16]:
{k: v for k, v in sorted(feature_importance.items(), key=lambda item: item[1],reverse=True)}

{'cluster_6_roll_per_pa': 0.08741730676399706,
 'cluster_9_roll_per_pa': 0.08444258235952519,
 'cluster_1_roll_per_pa': 0.0832640915383932,
 'cluster_8_roll_per_pa': 0.07803327793232127,
 'bb_roll': 0.07331936147754725,
 'cluster_3_roll_per_pa': 0.06788665588782009,
 'pa_roll': 0.06628113272438979,
 'cluster_5_roll_per_pa': 0.056913476431363755,
 'cluster_4_roll_per_pa': 0.0520326051426244,
 'cluster_10_roll_per_pa': 0.05131559096457014,
 'cluster_0_roll_per_pa': 0.04164726893095656,
 'cluster_1_roll': 0.030856359211362146,
 'pitcher_pa': 0.030848246081221865,
 'cluster_2_roll_per_pa': 0.027525793089109674,
 'cluster_9_roll': 0.027341043977169067,
 'cluster_6_roll': 0.026891587665978177,
 'cluster_8_roll': 0.02601983025545167,
 'cluster_3_roll': 0.019101142705419134,
 'cluster_0_roll': 0.016364647071097593,
 'cluster_5_roll': 0.015408751854627376,
 'cluster_10_roll': 0.01373003242296082,
 'cluster_4_roll': 0.013648356522536698,
 'cluster_2_roll': 0.007535278456588408,
 'cluster_7_roll_

In [9]:
def rolling_cluster_soft(data,depth_num,depth_min=3,depth_type="D"):
    """Rolling per-pitcher mean of the daily soft-cluster (``cluster_attribute``) totals.

    The input is copied, its index parsed as dates, and the attribute columns (plus
    ``pa`` and ``bb``) are summed per pitcher and day. A time-based rolling mean of
    width ``str(depth_num) + depth_type`` is then taken per pitcher, excluding the
    current day (``closed='left'``) and requiring ``depth_min`` rows in the window.

    Returns the ``*_roll`` columns, without the ``pa``/``bb``/``k`` ones, keyed by
    pitcher, with rows containing NaN dropped.
    """
    frame = data.copy()
    frame.index = pd.to_datetime(frame.index)

    attribute_cols = [name for name in frame.columns if 'cluster_attribute' in name and 'list' not in name]
    attribute_cols += ['pa', 'bb']
    window = str(depth_num) + depth_type

    daily = frame.groupby(['pitcher', frame.index])[attribute_cols].sum().reset_index(level=0)
    rolled = (
        daily.groupby(['pitcher'], as_index=False)
        .rolling(window, min_periods=depth_min, closed='left')[attribute_cols]
        .mean()
    )

    rolled.columns = [name + '_roll' if name in attribute_cols else name for name in rolled.columns]
    rolled = rolled.reset_index(level=0)

    # Substrings that mark the count columns which must not be returned as features.
    excluded = ('bb_', 'k_', 'pa_')
    soft_cols = [
        name for name in rolled.columns
        if '_roll' in name and not any(tag in name for tag in excluded)
    ]

    rolled = rolled.set_index(['pitcher', rolled.index])
    return rolled[soft_cols].dropna()

In [272]:
temp_df = train_set.copy()
temp_df = rolling_cluster_soft(temp_df,20)

In [273]:
temp_df

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_attribute_0_roll,cluster_attribute_0_max_roll,cluster_attribute_0_min_roll,cluster_attribute_1_roll,cluster_attribute_1_max_roll,cluster_attribute_1_min_roll,cluster_attribute_2_roll,cluster_attribute_2_max_roll,cluster_attribute_2_min_roll,cluster_attribute_3_roll,...,cluster_attribute_7_min_roll,cluster_attribute_8_roll,cluster_attribute_8_max_roll,cluster_attribute_8_min_roll,cluster_attribute_9_roll,cluster_attribute_9_max_roll,cluster_attribute_9_min_roll,cluster_attribute_10_roll,cluster_attribute_10_max_roll,cluster_attribute_10_min_roll
pitcher,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
112526.0,2017-04-21,0.563126,0.619568,0.504016,0.881727,1.010284,0.747668,0.877063,0.944050,0.810211,0.788992,...,0.810576,1.043124,1.190668,0.888397,0.536800,0.627628,0.460199,0.900462,1.031403,0.763696
112526.0,2017-04-28,0.575616,0.674314,0.497459,0.879946,1.016656,0.732153,0.878230,0.958225,0.801896,0.780985,...,0.802304,1.052598,1.219798,0.885301,0.512989,0.605938,0.434465,0.899673,1.038601,0.749231
112526.0,2017-05-03,0.552654,0.648189,0.480043,0.859352,0.999101,0.707824,0.887132,0.964181,0.811011,0.794422,...,0.811422,1.038478,1.211745,0.861760,0.530799,0.643477,0.442591,0.879521,1.022110,0.724516
112526.0,2017-05-09,0.548348,0.647774,0.475213,0.925452,1.088010,0.752927,0.852422,0.944050,0.763084,0.757342,...,0.763496,1.108542,1.304549,0.910271,0.503427,0.604438,0.416868,0.946387,1.112126,0.770113
112526.0,2017-05-15,0.524121,0.583241,0.468897,0.860347,1.032354,0.683428,0.881023,0.961312,0.797720,0.804023,...,0.798103,1.028361,1.232716,0.812520,0.572874,0.726961,0.449376,0.879489,1.055905,0.697659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685493.0,2020-09-27,0.264619,0.268753,0.260486,0.504937,0.548919,0.460954,0.491827,0.508952,0.474703,0.485563,...,0.474767,0.562215,0.606384,0.518045,0.400809,0.430487,0.371131,0.512956,0.557273,0.468639
685503.0,2021-04-19,0.492247,0.554705,0.428098,1.405634,1.814465,1.001457,0.654783,0.802649,0.503394,0.626549,...,0.503632,1.285448,1.575670,0.995007,0.496413,0.701415,0.297209,1.392130,1.786539,1.002370
685503.0,2021-04-25,0.482184,0.541314,0.424423,1.423975,1.821231,1.001084,0.658620,0.822176,0.505181,0.628954,...,0.505392,1.290558,1.553250,1.006424,0.473653,0.635865,0.325763,1.408749,1.790063,1.002766
685503.0,2021-04-30,0.454540,0.501968,0.408479,1.452037,1.823910,1.054530,0.636005,0.787830,0.494297,0.603833,...,0.494532,1.373808,1.624750,1.101424,0.452273,0.612472,0.306397,1.444072,1.800870,1.062606


In [30]:
# Build a modelling frame from the rolled soft-cluster features to rank them by importance.
# NOTE(review): rolling_cluster_soft, as defined in this notebook, returns only '*_roll'
# attribute columns, so the 'batter' / 'pitcher' / 'bb' / 'pa' removals and the
# `pitcher_pa` filter below look like leftovers from an earlier version — confirm this
# cell still runs on a fresh kernel.
train = train_set.copy()
temp_train = rolling_cluster_soft(train,20)

# Start from the columns that are not attribute/list/events/cluster columns, then add
# every rolled cluster column.
cols_for_use = [col for col in temp_train.columns if ('attribute' not in col) & ('list' not in col) & ('events' not in col) & ('cluster' not in col)]
cols_to_add = [col for col in temp_train.columns if ('cluster' in col) & ('roll' in col)]
cols_for_use.extend(cols_to_add)
# list.remove raises ValueError if the name is absent.
cols_for_use.remove('batter')
cols_for_use.remove('pitcher')
cols_for_use.remove('bb')
cols_for_use.remove('pa')

temp_train = temp_train[cols_for_use]
# Keep only rows with a positive pitcher_pa value.
temp_train = temp_train.loc[temp_train.pitcher_pa>0,:]
temp_train.dropna(inplace=True)

In [31]:
# Fit a default random forest on every column except the target.
# NOTE(review): the target is selected with a boolean column mask, so y is passed as a
# one-column DataFrame rather than a 1-D Series; no random_state is set.
rf = RandomForestRegressor()
rf.fit(temp_train.loc[:,temp_train.columns!='estimated_ba_using_speedangle'],temp_train.loc[:,temp_train.columns=='estimated_ba_using_speedangle'])

  


RandomForestRegressor()

In [32]:
# Drop the target from the feature list. The membership check makes the cell safe to
# re-run: an unguarded list.remove raises ValueError once the name is already gone.
if 'estimated_ba_using_speedangle' in cols_for_use:
    cols_for_use.remove('estimated_ba_using_speedangle')

# Pair each feature column with its importance. feature_importances_ follows the column
# order of the frame passed to rf.fit, which matches cols_for_use without the target.
feature_importance = dict(zip(temp_train[cols_for_use].columns, rf.feature_importances_))

In [33]:
{k: v for k, v in sorted(feature_importance.items(), key=lambda item: item[1],reverse=True)}

{'cluster_attribute_0_min_roll': 0.05286498880064649,
 'bb_roll': 0.05253206074643624,
 'cluster_attribute_0_max_roll': 0.050804863883387716,
 'cluster_attribute_6_max_roll': 0.04967576690817727,
 'cluster_attribute_6_min_roll': 0.047690166198362305,
 'cluster_attribute_8_max_roll': 0.04002232781943985,
 'pa_roll': 0.03885395666533495,
 'cluster_attribute_0_roll': 0.038668390444023745,
 'pitcher_pa': 0.03758598506032348,
 'cluster_attribute_8_min_roll': 0.03620981379606035,
 'cluster_attribute_3_max_roll': 0.03593866151605521,
 'cluster_attribute_6_roll': 0.03549607589341965,
 'cluster_attribute_3_min_roll': 0.032399677676376026,
 'cluster_attribute_8_roll': 0.02970179355425633,
 'cluster_attribute_1_max_roll': 0.027095975230608194,
 'cluster_attribute_5_min_roll': 0.02644836611271386,
 'cluster_attribute_9_min_roll': 0.026315438160312177,
 'cluster_attribute_3_roll': 0.025809030284571308,
 'cluster_attribute_1_min_roll': 0.025383652907427126,
 'cluster_attribute_5_max_roll': 0.0252074

# Batters

In [10]:
def explode(data):
    """Expand the stringified list columns into one row per list element.

    Every column whose name contains ``list`` is expected to hold strings such as
    ``'[0.1, 0.2]'``. Spaces and square brackets are stripped, the text is split on
    commas, and all list columns are exploded together, so the lists in one row must
    have equal lengths. Rows where any element is the text ``nan`` are dropped and
    the remaining values are cast to float.

    Returns a new frame with a ``batter`` column followed by the list columns, keeping
    the original index values (repeated once per element). The input is not modified.
    """
    list_cols = [col for col in data.columns if 'list' in col]

    # Copy the selection so the column assignments below write to an independent frame.
    # Assigning into a bare column slice triggers SettingWithCopyWarning.
    data = data[list_cols + ['batter']].copy()

    # Built once instead of once per cell: deletes spaces and square brackets.
    strip_table = {ord(ch): None for ch in ' []'}
    for col in list_cols:
        data[col] = data[col].apply(lambda text: text.translate(strip_table).split(','))

    data = data.set_index(['batter',data.index]).apply(pd.Series.explode).reset_index(level=0)
    # Elements are still strings here, so missing values appear as the text 'nan'.
    data = data.replace('nan',np.nan).dropna()
    data[list_cols] = data[list_cols].astype('float')
    return data

In [11]:
def cluster_day(data):
    """Aggregate exploded per-pitch rows to one row per batter and game date.

    After taking absolute values of every column, each (batter, game_date) group gets:
    * ``<col>_agg`` - the average of ``estimated_ba_using_speedangle_list`` weighted by
      ``<col>``, or 0 when numpy raises ZeroDivisionError for that weight column;
    * ``<col>`` - the plain sum of the column over the group.
    ``<col>`` ranges over every column containing ``cluster`` plus
    ``estimated_ba_using_speedangle_list`` itself.

    Returns the joined frame with ``batter`` moved back from the index into a column.
    """
    data = data.apply(lambda column: column.abs())

    weight_cols = [name for name in data.columns if 'cluster' in name]
    weight_cols.append('estimated_ba_using_speedangle_list')
    agg_names = [name + '_agg' for name in weight_cols]

    def _average_or_zero(group, weight_col):
        # np.average raises ZeroDivisionError when it cannot normalise the weights.
        try:
            return np.average(group['estimated_ba_using_speedangle_list'], weights=group[weight_col])
        except ZeroDivisionError:
            return 0

    def weighted_mean(group):
        return [_average_or_zero(group, weight_col) for weight_col in weight_cols]

    # The per-group lists are placed in an empty frame so its index is taken from the
    # groupby result, then unpacked into one column per weight column.
    holder = pd.DataFrame()
    holder[0] = data.groupby(['batter', 'game_date'])[weight_cols].apply(lambda group: weighted_mean(group))
    daily_sums = data.groupby(['batter', 'game_date'])[weight_cols].sum()

    daily_agg = pd.DataFrame(holder[0].values.tolist(), index=holder.index, columns=agg_names)

    return daily_agg.join(daily_sums).reset_index(level=0)

In [12]:
def rolling_xba(data,depth_num,depth_min=10,depth_type='D'):
    """Rolling weighted average of the daily ``*_agg`` values per batter and cluster.

    Parameters
    ----------
    data : frame with one row per batter and day, a date-like index, a ``batter``
        column, ``cluster*_list_agg`` columns (daily weighted values) and the matching
        ``cluster*_list`` columns (daily weights). The two groups are paired by
        position, so they must appear in the same order.
    depth_num, depth_type : joined into the rolling window offset, e.g. ``30`` and
        ``'D'`` give ``'30D'``.
    depth_min : minimum number of daily rows required inside the window.

    Returns
    -------
    Frame indexed by (batter, date) with one ``*_estimated`` column per cluster:
    rolling sum of (value * weight) divided by rolling sum of weights, excluding the
    current day (``closed='left'``). Infinite ratios become 0, rows that are entirely
    NaN are dropped, and remaining NaN are filled with 0.
    """
    # Work on a copy: the weighting loop below overwrites the *_agg columns, and the
    # original code applied it to the caller's frame, so calling twice changed results.
    data = data.copy()
    # Time-offset windows need a datetime index; matches the pitcher rolling functions.
    data.index = pd.to_datetime(data.index)

    cluster_cols = [col for col in data.columns if 'cluster' in col and 'agg' in col]
    weight_cols = [col for col in data.columns if 'cluster' in col and 'list' in col and 'agg' not in col]
    col_names = [col.replace('_list','_estimated') for col in weight_cols]

    depth = str(depth_num) + depth_type

    # Turn each daily weighted mean back into a weighted total so totals can be summed.
    for agg_col, weight_col in zip(cluster_cols, weight_cols):
        data[agg_col] = data[agg_col] * data[weight_col]

    data = data.apply(lambda x: x.abs())

    grouped = data.groupby(['batter'])
    rolled = grouped[cluster_cols].rolling(depth,min_periods=depth_min,closed='left').sum()
    rolled_weights = grouped[weight_cols].rolling(depth,min_periods=depth_min,closed='left').sum()
    rolled = rolled.join(rolled_weights)

    for out_col, agg_col, weight_col in zip(col_names, cluster_cols, weight_cols):
        rolled[out_col] = rolled[agg_col] / rolled[weight_col]

    # The groupby-rolling result is already indexed by (batter, date), so no index
    # round-trip is needed before returning.
    result = rolled[col_names].replace([np.inf,-np.inf],0)
    result = result.dropna(how='all')
    return result.fillna(0)

In [335]:
temp_df = explode(train_set)
temp_df = cluster_day(temp_df)
temp_df = rolling_xba(temp_df,30)

In [336]:
temp_df

Unnamed: 0_level_0,Unnamed: 1_level_0,cluster_attribute_0_estimated,cluster_attribute_1_estimated,cluster_attribute_2_estimated,cluster_attribute_3_estimated,cluster_attribute_4_estimated,cluster_attribute_5_estimated,cluster_attribute_6_estimated,cluster_attribute_7_estimated,cluster_attribute_8_estimated,cluster_attribute_9_estimated,...,cluster_1_estimated,cluster_2_estimated,cluster_3_estimated,cluster_4_estimated,cluster_5_estimated,cluster_6_estimated,cluster_7_estimated,cluster_8_estimated,cluster_9_estimated,cluster_10_estimated
batter,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
134181.0,2017-06-13,0.252629,0.297940,0.246329,0.238125,0.246424,0.227222,0.277076,0.246358,0.294018,0.227222,...,0.315846,0.000000e+00,0.215000,0.11300,0.110,0.325000,0.0,0.239143,0.187000,0.395667
134181.0,2017-06-14,0.257227,0.300206,0.234428,0.223824,0.234559,0.202783,0.268908,0.234469,0.294119,0.202781,...,0.343786,0.000000e+00,0.172000,0.11300,0.110,0.325000,0.0,0.212625,0.100000,0.395667
134181.0,2017-06-16,0.243932,0.278184,0.219392,0.207272,0.219533,0.189582,0.254960,0.219436,0.273656,0.189581,...,0.320867,0.000000e+00,0.143333,0.11300,0.110,0.325000,0.0,0.212625,0.100000,0.298500
134181.0,2017-06-17,0.271879,0.286026,0.226307,0.214144,0.226450,0.196566,0.261631,0.226352,0.280906,0.196566,...,0.320867,0.000000e+00,0.143333,0.11300,0.110,0.325000,0.0,0.212625,0.100000,0.298500
134181.0,2017-06-18,0.277054,0.291416,0.231699,0.219817,0.231840,0.202155,0.265984,0.231743,0.286619,0.202155,...,0.324706,0.000000e+00,0.158143,0.11300,0.110,0.325000,0.0,0.227556,0.100000,0.298500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683734.0,2021-09-24,0.152407,0.202993,0.196285,0.203233,0.196206,0.216386,0.191789,0.196261,0.200680,0.216386,...,0.228556,0.000000e+00,0.808500,0.02075,0.071,0.205286,0.0,0.107750,0.393333,0.009000
683734.0,2021-09-25,0.145627,0.186050,0.184125,0.190763,0.184050,0.205133,0.180293,0.184102,0.185835,0.205133,...,0.207000,0.000000e+00,0.808500,0.02520,0.071,0.205286,0.0,0.107750,0.393333,0.009000
683734.0,2021-09-26,0.162825,0.190641,0.193121,0.200519,0.193046,0.225141,0.188562,0.193097,0.190699,0.225144,...,0.207000,4.440892e-16,0.808500,0.02520,0.071,0.215571,0.0,0.107750,0.587000,0.009000
683734.0,2021-09-27,0.172919,0.204837,0.178158,0.184125,0.178093,0.210174,0.175338,0.178137,0.196691,0.210176,...,0.218300,4.440892e-16,0.957000,0.02520,0.071,0.184222,0.0,0.079000,0.587000,0.009000


In [336]:
temp_df = explode(train_set)
print(temp_df[temp_df.batter == 641313][[col for col in temp_df.columns if '6' in col]])

            cluster_attribute_6_list  cluster_6_list
game_date                                           
2017-04-04                  0.062309             0.0
2017-04-04                  0.057830             0.0
2017-04-04                  0.086620             0.0
2017-04-04                  0.126984             1.0
2017-04-06                  0.106611             0.0
...                              ...             ...
2021-10-01                  0.087939             0.0
2021-10-02                  0.143147             0.0
2021-10-02                  0.104761             0.0
2021-10-02                  0.066912             0.0
2021-10-02                  0.079097             0.0

[2390 rows x 2 columns]


In [357]:
temp_df = cluster_day(temp_df)
print(temp_df[temp_df.batter == 641313][[col for col in temp_df.columns if '6' in col]])

                                                                     0
batter   game_date                                                    
112526.0 2017-04-05  [0.0493724146783362, 0.07233099205574599, 0.03...
         2017-04-11  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
         2017-04-16  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
         2017-04-21  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
         2017-04-28  [0.07580967196624573, 0.056090051863785316, 0....
...                                                                ...
683734.0 2021-09-25  [0.06478460672716352, 0.06004631671102997, 0.0...
         2021-09-26  [0.4003221965037783, 0.41533159275628806, 0.25...
         2021-09-27  [0.33291026675597296, 0.22346249120791895, 0.4...
         2021-09-28  [0.169, 0.169, 0.169, 0.169, 0.169, 0.169, 0.1...
685503.0 2021-09-08  [0.05123102655388503, 0.057176504813603826, 0....

[217550 rows x 1 columns]
            cluster_attribute_6_list_agg  cluster_

In [358]:
temp_df = rolling_xba(temp_df,30)
print(temp_df[temp_df.batter == 641313][[col for col in temp_df.columns if '6' in col]])

            cluster_attribute_6_list_agg  cluster_6_list_agg  \
game_date                                                      
2017-04-04                           NaN                 NaN   
2017-04-06                           NaN                 NaN   
2017-04-07                           NaN                 NaN   
2017-04-08                           NaN                 NaN   
2017-04-09                           NaN                 NaN   
...                                  ...                 ...   
2021-09-26                      1.438459               1.817   
2021-09-28                      1.284039               1.968   
2021-09-29                      1.360568               1.968   
2021-10-01                      1.438577               1.968   
2021-10-02                      1.721087               1.968   

            cluster_attribute_6_list  cluster_6_list  \
game_date                                              
2017-04-04                       NaN             NaN   

# Combine

In [128]:
import contextlib
import os

In [129]:
def merging(df1,df2,key,train=True):
    """Merge ``df1`` with ``df2`` in chunks and stream the result to a CSV file.

    ``df2`` is first written to ``data/temp_file.csv`` and read back 1000 rows at a
    time. Each chunk is merged with ``df1`` (``df1``'s index against the chunk's
    ``key`` column(s)) and appended to the output file.

    Parameters
    ----------
    df1 : frame whose index is matched against ``key``.
    df2 : frame to merge in; it is written to disk with its index.
    key : column name(s) of ``df2`` passed to ``pd.merge`` as ``right_on``.
    train : truthy writes to ``data/train/matchup_train_def.csv``, falsy writes to
        ``data/test/matchup_test_def.csv``.

    Returns None. The temporary file is left on disk.

    NOTE(review): each chunk must contain ``game_date`` and ``Unnamed: 0`` columns
    (``drop`` raises KeyError otherwise), and the header row is built from the input
    column names rather than from the merged chunk — confirm both against real data.
    """
    df2.to_csv('data/temp_file.csv')

    # The original `if train == True / elif train == False` left file_name unbound
    # for any other value; a plain truthiness test always picks a file.
    file_name = 'data/train/matchup_train_def.csv' if train else 'data/test/matchup_test_def.csv'

    # Write the header row first; data rows are appended without a header below.
    df_result = pd.DataFrame(columns=(df1.columns.append(df2.columns)).unique())
    df_result.to_csv(file_name,index_label=False)

    # Drop the local reference to the full frame before the chunked pass.
    del df2

    def preprocess(x):
        x.game_date = pd.to_datetime(x.game_date)
        merged = pd.merge(df1,x,left_index=True,right_on=key)
        merged.drop('Unnamed: 0', axis=1, inplace=True)
        merged.to_csv(file_name,mode="a",header=False,index=False)

    # Plain loop: the calls are made for their side effect only.
    for chunk in pd.read_csv('data/temp_file.csv', chunksize=1000):
        preprocess(chunk)

In [13]:
pitcher_hard_train = rolling_cluster_hard(train_set,20)

pitcher_hard_test = rolling_cluster_hard(test_set,20)

In [14]:
pitcher_soft_train = rolling_cluster_soft(train_set,20)

pitcher_soft_test = rolling_cluster_soft(test_set,20)

In [15]:
# Batter features: expand the list columns, aggregate per batter and day, then take a
# 30-day rolling weighted average. The same three steps run for train and test.
batter_train = explode(train_set)
batter_train = cluster_day(batter_train)
batter_train = rolling_xba(batter_train,30)

batter_test = explode(test_set)
batter_test = cluster_day(batter_test)
batter_test = rolling_xba(batter_test,30)

In [16]:
# Keep only the join keys and the target, then drop rows with a missing value.
# NOTE(review): this overwrites train_set / test_set, so the feature-building cells above
# cannot be re-run after this one without reloading the CSVs.
train_set = train_set.loc[:,['pitcher','batter','estimated_ba_using_speedangle']]
test_set = test_set.loc[:,['pitcher','batter','estimated_ba_using_speedangle']]

train_set.dropna(inplace=True)
test_set.dropna(inplace=True)

In [17]:
train_set.head()

Unnamed: 0_level_0,pitcher,batter,estimated_ba_using_speedangle
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-02,407822.0,491676.0,0.0
2017-04-02,543331.0,452254.0,0.0
2017-04-02,543331.0,543063.0,0.447
2017-04-02,543331.0,456488.0,0.48
2017-04-02,543557.0,545341.0,0.833


In [18]:
for col in train_set.columns:
    print(col + ": " + str(train_set[col].isna().sum()))
print("full length: " + str(len(train_set.index)))

pitcher: 0
batter: 0
estimated_ba_using_speedangle: 0
full length: 474379


In [19]:
train = train_set.copy()
test = test_set.copy()

In [20]:
# Key the rows by pitcher and the existing date index so the pitcher features can be joined.
train = train.set_index(['pitcher',train.index])
test = test.set_index(['pitcher',test.index])

In [21]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,batter,estimated_ba_using_speedangle
pitcher,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1
407822.0,2017-04-02,491676.0,0.000
543331.0,2017-04-02,452254.0,0.000
543331.0,2017-04-02,543063.0,0.447
543331.0,2017-04-02,456488.0,0.480
543557.0,2017-04-02,545341.0,0.833
...,...,...,...
593144.0,2021-10-02,543510.0,0.914
593144.0,2021-10-02,596019.0,0.651
593144.0,2021-10-02,607043.0,0.614
593144.0,2021-10-02,641645.0,0.540


In [22]:
# Join the hard-cluster pitcher features (DataFrame.join defaults to a left join), so
# matchups without a matching feature row get NaN.
train = train.join(pitcher_hard_train,on=['pitcher','game_date'])
test = test.join(pitcher_hard_test,on=['pitcher','game_date'])

In [23]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,batter,estimated_ba_using_speedangle,cluster_0_roll_per_pa,cluster_1_roll_per_pa,cluster_2_roll_per_pa,cluster_3_roll_per_pa,cluster_4_roll_per_pa,cluster_5_roll_per_pa,cluster_6_roll_per_pa,cluster_7_roll_per_pa,cluster_8_roll_per_pa,cluster_9_roll_per_pa,cluster_10_roll_per_pa
pitcher,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
407822.0,2017-04-02,491676.0,0.000,,,,,,,,,,,
543331.0,2017-04-02,452254.0,0.000,,,,,,,,,,,
543331.0,2017-04-02,543063.0,0.447,,,,,,,,,,,
543331.0,2017-04-02,456488.0,0.480,,,,,,,,,,,
543557.0,2017-04-02,545341.0,0.833,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593144.0,2021-10-02,543510.0,0.914,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.090909,0.0
593144.0,2021-10-02,596019.0,0.651,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.090909,0.0
593144.0,2021-10-02,607043.0,0.614,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.090909,0.0
593144.0,2021-10-02,641645.0,0.540,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.090909,0.0


In [24]:
for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))
print("full lenght: " + str(len(train.index)))

batter: 0
estimated_ba_using_speedangle: 0
cluster_0_roll_per_pa: 113806
cluster_1_roll_per_pa: 113806
cluster_2_roll_per_pa: 113806
cluster_3_roll_per_pa: 113806
cluster_4_roll_per_pa: 113806
cluster_5_roll_per_pa: 113806
cluster_6_roll_per_pa: 113806
cluster_7_roll_per_pa: 113806
cluster_8_roll_per_pa: 113806
cluster_9_roll_per_pa: 113806
cluster_10_roll_per_pa: 113806
full lenght: 474379


In [25]:
# Join the soft-cluster pitcher features on the same keys; unmatched rows get NaN.
train = train.join(pitcher_soft_train,on=['pitcher','game_date'])
test = test.join(pitcher_soft_test,on=['pitcher','game_date'])

In [26]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,batter,estimated_ba_using_speedangle,cluster_0_roll_per_pa,cluster_1_roll_per_pa,cluster_2_roll_per_pa,cluster_3_roll_per_pa,cluster_4_roll_per_pa,cluster_5_roll_per_pa,cluster_6_roll_per_pa,cluster_7_roll_per_pa,...,cluster_attribute_7_min_roll,cluster_attribute_8_roll,cluster_attribute_8_max_roll,cluster_attribute_8_min_roll,cluster_attribute_9_roll,cluster_attribute_9_max_roll,cluster_attribute_9_min_roll,cluster_attribute_10_roll,cluster_attribute_10_max_roll,cluster_attribute_10_min_roll
pitcher,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
407822.0,2017-04-02,491676.0,0.000,,,,,,,,,...,,,,,,,,,,
543331.0,2017-04-02,452254.0,0.000,,,,,,,,,...,,,,,,,,,,
543331.0,2017-04-02,543063.0,0.447,,,,,,,,,...,,,,,,,,,,
543331.0,2017-04-02,456488.0,0.480,,,,,,,,,...,,,,,,,,,,
543557.0,2017-04-02,545341.0,0.833,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593144.0,2021-10-02,543510.0,0.914,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.217674,0.563881,0.563881,0.563881,0.195536,0.195536,0.195536,0.691542,0.691542,0.691542
593144.0,2021-10-02,596019.0,0.651,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.217674,0.563881,0.563881,0.563881,0.195536,0.195536,0.195536,0.691542,0.691542,0.691542
593144.0,2021-10-02,607043.0,0.614,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.217674,0.563881,0.563881,0.563881,0.195536,0.195536,0.195536,0.691542,0.691542,0.691542
593144.0,2021-10-02,641645.0,0.540,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.217674,0.563881,0.563881,0.563881,0.195536,0.195536,0.195536,0.691542,0.691542,0.691542


In [27]:
for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))
print("full lenght: " + str(len(train.index)))

batter: 0
estimated_ba_using_speedangle: 0
cluster_0_roll_per_pa: 113806
cluster_1_roll_per_pa: 113806
cluster_2_roll_per_pa: 113806
cluster_3_roll_per_pa: 113806
cluster_4_roll_per_pa: 113806
cluster_5_roll_per_pa: 113806
cluster_6_roll_per_pa: 113806
cluster_7_roll_per_pa: 113806
cluster_8_roll_per_pa: 113806
cluster_9_roll_per_pa: 113806
cluster_10_roll_per_pa: 113806
cluster_attribute_0_roll: 113806
cluster_attribute_0_max_roll: 113806
cluster_attribute_0_min_roll: 113806
cluster_attribute_1_roll: 113806
cluster_attribute_1_max_roll: 113806
cluster_attribute_1_min_roll: 113806
cluster_attribute_2_roll: 113806
cluster_attribute_2_max_roll: 113806
cluster_attribute_2_min_roll: 113806
cluster_attribute_3_roll: 113806
cluster_attribute_3_max_roll: 113806
cluster_attribute_3_min_roll: 113806
cluster_attribute_4_roll: 113806
cluster_attribute_4_max_roll: 113806
cluster_attribute_4_min_roll: 113806
cluster_attribute_5_roll: 113806
cluster_attribute_5_max_roll: 113806
cluster_attribute_5_m

In [28]:
# Re-key the rows by batter and game date so the batter features can be joined next.
train = train.reset_index().set_index(['batter','game_date'])
test = test.reset_index().set_index(['batter','game_date'])

In [29]:
# Join the rolling batter features (left join); unmatched rows get NaN.
train = train.join(batter_train,on=['batter','game_date'])
test = test.join(batter_test,on=['batter','game_date'])

In [30]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,pitcher,estimated_ba_using_speedangle,cluster_0_roll_per_pa,cluster_1_roll_per_pa,cluster_2_roll_per_pa,cluster_3_roll_per_pa,cluster_4_roll_per_pa,cluster_5_roll_per_pa,cluster_6_roll_per_pa,cluster_7_roll_per_pa,...,cluster_1_estimated,cluster_2_estimated,cluster_3_estimated,cluster_4_estimated,cluster_5_estimated,cluster_6_estimated,cluster_7_estimated,cluster_8_estimated,cluster_9_estimated,cluster_10_estimated
batter,game_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
491676.0,2017-04-02,407822.0,0.000,,,,,,,,,...,,,,,,,,,,
452254.0,2017-04-02,543331.0,0.000,,,,,,,,,...,,,,,,,,,,
543063.0,2017-04-02,543331.0,0.447,,,,,,,,,...,,,,,,,,,,
456488.0,2017-04-02,543331.0,0.480,,,,,,,,,...,,,,,,,,,,
545341.0,2017-04-02,543557.0,0.833,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543510.0,2021-10-02,593144.0,0.914,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.168889,1.110223e-16,0.441333,0.22400,0.202750,0.363000,0.0,0.114667,0.044500,0.00000
596019.0,2021-10-02,593144.0,0.651,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.288478,4.850000e-01,0.284625,0.43125,0.224455,0.164375,0.0,0.282429,0.186091,0.39550
607043.0,2021-10-02,593144.0,0.614,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,0.200130,1.730000e-01,0.420000,0.07450,0.323667,0.513500,0.0,0.145500,0.475500,0.41725
641645.0,2021-10-02,593144.0,0.540,0.0,0.772727,0.0,0.0,0.0,0.136364,0.0,0.0,...,,,,,,,,,,


In [31]:
for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))
print("full lenght: " + str(len(train.index)))

pitcher: 0
estimated_ba_using_speedangle: 0
cluster_0_roll_per_pa: 113806
cluster_1_roll_per_pa: 113806
cluster_2_roll_per_pa: 113806
cluster_3_roll_per_pa: 113806
cluster_4_roll_per_pa: 113806
cluster_5_roll_per_pa: 113806
cluster_6_roll_per_pa: 113806
cluster_7_roll_per_pa: 113806
cluster_8_roll_per_pa: 113806
cluster_9_roll_per_pa: 113806
cluster_10_roll_per_pa: 113806
cluster_attribute_0_roll: 113806
cluster_attribute_0_max_roll: 113806
cluster_attribute_0_min_roll: 113806
cluster_attribute_1_roll: 113806
cluster_attribute_1_max_roll: 113806
cluster_attribute_1_min_roll: 113806
cluster_attribute_2_roll: 113806
cluster_attribute_2_max_roll: 113806
cluster_attribute_2_min_roll: 113806
cluster_attribute_3_roll: 113806
cluster_attribute_3_max_roll: 113806
cluster_attribute_3_min_roll: 113806
cluster_attribute_4_roll: 113806
cluster_attribute_4_max_roll: 113806
cluster_attribute_4_min_roll: 113806
cluster_attribute_5_roll: 113806
cluster_attribute_5_max_roll: 113806
cluster_attribute_5_

In [32]:
# Move pitcher back into the index, in front of the existing (batter, game_date) levels,
# so only the target and the feature columns remain as columns.
train = train.set_index(['pitcher',train.index])
test = test.set_index(['pitcher',test.index])

In [33]:
# Drop every matchup that is missing at least one pitcher or batter feature.
train.dropna(inplace=True)
test.dropna(inplace=True)

# Model

In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [35]:
def mape(y_true,y_pred):
    """Mean absolute percentage error that tolerates zero targets.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of the element-wise absolute percentage errors, where
        * a zero target predicted exactly as zero contributes 0, and
        * any other non-finite term (zero target with non-zero prediction,
          NaN input) contributes 1, i.e. a 100% error.
    """
    actual = np.atleast_1d(np.asarray(y_true, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))

    # Zero targets legitimately produce inf / NaN here; they are handled
    # explicitly below, so the numpy warnings would only be noise.
    with np.errstate(divide='ignore', invalid='ignore'):
        ape = np.abs((actual - predicted) / actual)

    # 0 / 0 is NaN, but an exact hit on a zero target is a perfect prediction
    # and must not be scored as a 100% miss.
    ape[(actual == 0) & (predicted == 0)] = 0
    # Everything that is still non-finite is capped at a 100% error.
    ape[~np.isfinite(ape)] = 1
    return np.mean(ape)

def wape(y_true,y_pred):
    """Weighted absolute percentage error.

    Returns the summed absolute error divided by the summed absolute value of
    the observed targets.
    """
    total_abs_error = np.sum(np.abs(y_true - y_pred))
    total_abs_actual = np.sum(np.abs(y_true))
    return total_abs_error / total_abs_actual

In [36]:
# Column names contributed by each pitcher feature frame.
pitcher_soft_columns = pitcher_soft_train.columns.tolist()
pitcher_hard_columns = pitcher_hard_train.columns.tolist()

# Batter features live in a single frame: a column whose name contains
# 'attribute' goes to the "soft" list, every other column to the "hard" list.
batter_hard_columns = []
batter_soft_columns = []
for column_name in batter_train.columns:
    if 'attribute' in column_name:
        batter_soft_columns.append(column_name)
    else:
        batter_hard_columns.append(column_name)

In [37]:
# Name of the column the models are trained to predict.
target = 'estimated_ba_using_speedangle'

# Build NEW lists by concatenation. Assigning `hard_cols = pitcher_hard_columns`
# and appending to it would alias the pitcher list, silently adding the batter
# columns to `pitcher_hard_columns` and growing it again on every re-run of
# this cell. Concatenation leaves the four source lists untouched and makes
# the cell idempotent.
hard_cols = pitcher_hard_columns + batter_hard_columns
soft_cols = pitcher_soft_columns + batter_soft_columns

In [38]:
# --- training data -----------------------------------------------------------
# Features are every column except the target; the target is kept as a Series.
train_feature_mask = ~train.columns.isin([target])
x_train = train.loc[:, train_feature_mask]
y_train = train.loc[:, target]

# Column subsets restricted to the "hard" and "soft" feature names.
train_hard_mask = x_train.columns.isin(hard_cols)
train_soft_mask = x_train.columns.isin(soft_cols)
x_hard_train = x_train.loc[:, train_hard_mask]
x_soft_train = x_train.loc[:, train_soft_mask]

# --- test data ---------------------------------------------------------------
test_feature_mask = ~test.columns.isin([target])
x_test = test.loc[:, test_feature_mask]
y_test = test.loc[:, target]

test_hard_mask = x_test.columns.isin(hard_cols)
test_soft_mask = x_test.columns.isin(soft_cols)
x_hard_test = x_test.loc[:, test_hard_mask]
x_soft_test = x_test.loc[:, test_soft_mask]

In [39]:
# Base random forests, one per feature set (all / hard-only / soft-only).
# A fixed random_state makes bootstrap sampling and feature sub-sampling
# repeatable, so a Restart & Run All reproduces the same fitted forests.
model_all = RandomForestRegressor(random_state=42)
model_hard = RandomForestRegressor(random_state=42)
model_soft = RandomForestRegressor(random_state=42)

# Fitting is not done here; each model is fitted in its own cell further down.
#model_all.fit(x_train,y_train)
#model_hard.fit(x_hard_train,y_train)
#model_soft.fit(x_soft_train,y_train)

In [40]:
from sklearn.model_selection import RandomizedSearchCV

In [41]:
import datetime

In [42]:
# Candidate values sampled by the randomized hyper-parameter search.

# Number of trees: 200, 400, ..., 2000.
n_estimators = list(range(200, 2001, 200))
# Features considered at each split. The former 'auto' option (all features
# for a regressor) was deprecated in scikit-learn 1.1 and removed in 1.3;
# the float 1.0 selects the same behaviour and is accepted by old and new
# releases alike.
max_features = [1.0, 'sqrt']
# Maximum tree depth: 10, 20, ..., 110, plus None for unrestricted depth.
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum samples required to split an internal node / to form a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
# Whether each tree is grown on a bootstrap sample or on the full data set.
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           'bootstrap': bootstrap}

In [43]:
# Settings shared by every randomized search: 100 sampled candidates, 5-fold
# cross-validation, verbose progress output, a fixed seed for the sampling of
# candidates, and all available cores.
search_options = dict(
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Wrap the all-features forest in the search. Note that `model_all` is
# rebound from the plain estimator to the search object here.
model_all = RandomizedSearchCV(estimator=model_all, **search_options)
#model_hard = RandomizedSearchCV(estimator=model_hard, **search_options)
#model_soft = RandomizedSearchCV(estimator=model_soft, **search_options)

In [44]:
# Wall-clock timestamp taken right before the all-features search is fitted.
now = datetime.datetime.now()
print(str(now))

2022-08-09 02:34:06.556310


In [None]:
# Fit the all-features model on the complete training feature set.
model_all.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Wall-clock timestamp taken right after the all-features fit has finished.
now = datetime.datetime.now()
print(str(now))

In [None]:
# Predictions of the all-features model on the training and the test features.
pred_all_train = model_all.predict(X=x_train)
pred_all_test = model_all.predict(X=x_test)

In [None]:
# Print the same four error metrics for the all-features model, first on the
# training split and then on the test split.
all_model_reports = (
    ('All categories, Train', y_train, pred_all_train),
    ('All categories, Test', y_test, pred_all_test),
)
for heading, observed, predicted in all_model_reports:
    print(heading)
    print('   mape: ' + str(mape(observed, predicted)))
    print('   wape: ' + str(wape(observed, predicted)))
    print('   mse:  ' + str(mean_squared_error(observed, predicted)))
    print('   mae:  ' + str(mean_absolute_error(observed, predicted)))

In [None]:
# `model_all` is a RandomizedSearchCV at this point, and the search object
# itself has no `feature_importances_` attribute; the importances belong to
# the refitted best forest in `best_estimator_`. Fall back to `model_all`
# itself so the cell also works when it holds a plain fitted forest.
fitted_forest = getattr(model_all, 'best_estimator_', model_all)

# Map every training feature name to its importance score.
feature_importance = dict(zip(x_train.columns, fitted_forest.feature_importances_))

In [None]:
# Display the feature importances ordered from most to least important.
dict(sorted(feature_importance.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Fit the hard-cluster model on the hard-cluster training features.
model_hard.fit(X=x_hard_train, y=y_train)

In [None]:
# Wall-clock timestamp taken after the hard-cluster model has been fitted.
now = datetime.datetime.now()
print(str(now))

In [None]:
# Fit the soft-cluster model on the soft-cluster training features.
model_soft.fit(X=x_soft_train, y=y_train)

In [None]:
# Wall-clock timestamp taken after the soft-cluster model has been fitted.
now = datetime.datetime.now()
print(str(now))

In [None]:
# Training-set predictions of the three models, each on its own feature subset.
pred_all = model_all.predict(X=x_train)
pred_hard = model_hard.predict(X=x_hard_train)
pred_soft = model_soft.predict(X=x_soft_train)

In [None]:
# Print the four error metrics on the training split for each of the three
# models, in the order all / hard / soft.
train_reports = (
    ('All categories, Train', pred_all),
    ('Hard cluster categories, Train', pred_hard),
    ('Soft cluster categories, Train', pred_soft),
)
for heading, predicted in train_reports:
    print(heading)
    print('   mape: ' + str(mape(y_train, predicted)))
    print('   wape: ' + str(wape(y_train, predicted)))
    print('   mse:  ' + str(mean_squared_error(y_train, predicted)))
    print('   mae:  ' + str(mean_absolute_error(y_train, predicted)))

In [None]:
# Test-set predictions of the three models. The `pred_*` names are reused, so
# after this cell they no longer hold the training-set predictions.
pred_all = model_all.predict(X=x_test)
pred_hard = model_hard.predict(X=x_hard_test)
pred_soft = model_soft.predict(X=x_soft_test)

In [None]:
# Print the four error metrics on the test split for each of the three
# models, in the order all / hard / soft.
test_reports = (
    ('All categories, Test', pred_all),
    ('Hard cluster categories, Test', pred_hard),
    ('Soft cluster categories, Test', pred_soft),
)
for heading, predicted in test_reports:
    print(heading)
    print('   mape: ' + str(mape(y_test, predicted)))
    print('   wape: ' + str(wape(y_test, predicted)))
    print('   mse:  ' + str(mean_squared_error(y_test, predicted)))
    print('   mae:  ' + str(mean_absolute_error(y_test, predicted)))