In [2]:
# Auto-reload imported modules before each cell runs, so edits to the project
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [16]:
import pandas as pd
import pybaseball
from data_loader import PitchDataLoader
import processing as prc
import stuff_model as stuff
import joblib

import pandas as pd
import numpy as np
import skfuzzy as fuzz
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
from joblib import Parallel, delayed
from xgboost import XGBRegressor
from tune_xgboost import tune_xgboost, distill_params
from typing import List, Dict, Tuple, Optional

In [4]:
# Directory prefix for data files used in later cells; the trailing slash matters
# because file names are appended to it directly inside f-strings.
path = 'data/'
# Turn on pybaseball's cache.
# NOTE(review): presumably avoids re-downloading repeated queries — confirm in the pybaseball docs.
pybaseball.cache.enable()

In [5]:
# Build the pitch data loader for the configured data directory and date window.
pdl = PitchDataLoader(path=path, start_date='2016-03-01', end_date='2024-11-15')
# NOTE(review): refresh step left disabled; presumably pulls pitches that are not yet
# in the local store — confirm in data_loader before enabling.
#pdl.load_new_data()

In [5]:
# Start from a copy of the loader's frame and run it through the processing
# stages in order; each stage's result is copied before it is handed to the
# next stage. The final frame is then checkpointed to parquet.
df = (
    pdl.df.copy()
    .pipe(prc.apply_all_filters, prc.default_filters()).copy()
    .pipe(prc.save_memory, cols_to_drop=['des']).copy()
    .pipe(prc.calculate_new_features).copy()
    .pipe(prc.save_memory).copy()
)
df.to_parquet(f'{path}mem_eff_pitch_data.parquet')

In [6]:
# Reload the processed pitch data from the parquet file written by the processing
# cell, so later cells can start from here without rerunning that cell.
df = pd.read_parquet(f'{path}mem_eff_pitch_data.parquet')

In [23]:
# Column handed to the clustering step as its target.
cluster_target = 'csw'
# Columns handed to the clustering step as its features (order preserved).
stuff_features = [
    'speed',
    'speed_diff',
    'lift',
    'lift_diff',
    'transverse_pit',
    'transverse_pit_diff',
    'release_pos_x_pit',
    'release_pos_y',
    'release_pos_z',
    'vert_approach_angle_adj',
]

# Run the fuzzy clustering step on df, pass the result through save_memory,
# and checkpoint the clustered frame to parquet.
df = stuff.assign_fuzzy_clusters(df, stuff_features, cluster_target)
df = prc.save_memory(df)
df.to_parquet(f'{path}clustered_pitch_data.parquet')

100%|██████████| 30/30 [01:57<00:00,  3.91s/trial, best loss: 0.59476561477669]  
{'cluster1': 'gyro slider', 'cluster2': 'cutter', 'cluster3': 'offspeed', 'cluster4': 'high-slot fastball', 'cluster5': 'sweeper', 'cluster6': 'low-slot fastball', 'cluster7': 'curveball', 'cluster8': 'sinker'}


In [24]:
# Build the location distributions and the count frequencies from df, combine
# them into a single structure, and persist it (a later cell loads it back as `dist`).
cluster_dist = stuff.create_location_distributions(df)
count_frequencies = stuff.calculate_count_frequencies(df)
platoon_cluster_dist = stuff.combine_flatten_distributions(cluster_dist, count_frequencies)
# Write under the shared `path` prefix, as the parquet checkpoints do, instead of a
# hard-coded 'data/' so that relocating the data directory is a one-line change.
joblib.dump(platoon_cluster_dist, f'{path}platoon_cluster_dist.dat')

  grouped = df[cols].groupby(group_cols)


['data/platoon_cluster_dist.dat']

In [25]:
# Feature columns passed to model training.
# NOTE(review): the simulation-setup cell further down defines an identical list
# as `features`; keep the two in sync when editing either one.
feat = ['speed', 'vert_approach_angle_adj', 'transverse', 'transverse_pit', 'transverse_bat', 
        'lift', 'release_pos_x', 'release_pos_x_pit','release_pos_x_bat', 'release_pos_y', 
        'release_pos_z', 'plate_x', 'plate_x_pit','plate_x_bat', 'plate_x_abs', 'plate_z', 
        'plate_z_top', 'plate_z_bot', 'plate_dist', 'balls', 'strikes', 'speed_diff', 
        'lift_diff', 'transverse_pit_diff', 'vert_approach_angle', 'game_year']

# Derive the model filters from df, train the models, and persist them.
model_filters = stuff.get_model_filters(df)
xgb_models = stuff.train_models(df, model_filters, feat)
# Write under the shared `path` prefix, as the parquet checkpoints do, instead of a
# hard-coded 'data/' so that relocating the data directory is a one-line change.
joblib.dump(xgb_models, f'{path}xgb_models.dat')

100%|██████████| 30/30 [04:11<00:00,  8.39s/trial, best loss: 0.42217230804973305]
100%|██████████| 30/30 [03:25<00:00,  6.85s/trial, best loss: 0.4614861654671837]
100%|██████████| 30/30 [03:12<00:00,  6.42s/trial, best loss: 0.15958146159484238]
100%|██████████| 30/30 [02:35<00:00,  5.17s/trial, best loss: 0.01758540548682458] 
100%|██████████| 30/30 [03:37<00:00,  7.26s/trial, best loss: 0.6628150280839916]
100%|██████████| 30/30 [03:06<00:00,  6.23s/trial, best loss: 0.632173671048221] 
100%|██████████| 30/30 [01:54<00:00,  3.81s/trial, best loss: 0.35046240257370875]
100%|██████████| 30/30 [01:54<00:00,  3.83s/trial, best loss: 0.4371952970598194] 
100%|██████████| 30/30 [01:54<00:00,  3.82s/trial, best loss: 0.3018922261302999]
100%|██████████| 30/30 [01:20<00:00,  2.69s/trial, best loss: 0.3571244861636534] 
100%|██████████| 30/30 [00:52<00:00,  1.76s/trial, best loss: 0.6690882630844428]
100%|██████████| 30/30 [5:39:52<00:00, 679.76s/trial, best loss: 0.6614014087799577]    
10

['data/xgb_models.dat']

In [14]:
# Reload the clustered pitch data from the parquet checkpoint written by the
# clustering cell, so the cells below can run without repeating that step.
df = pd.read_parquet(f'{path}clustered_pitch_data.parquet')

In [10]:
# Data directory prefix. Defined first so every artifact below is loaded through
# it (previously it was assigned only after the loads, which hard-coded 'data/').
path = 'data/'

# Simulation settings; n_batches is also the number of batch files the next cell reads.
batch_size = 5000
n_batches = 3

# Inputs for the simulation: the run-value table from the working directory and
# the two artifacts dumped by earlier cells.
rv = pd.read_csv('runvalue.csv')
dist = joblib.load(f'{path}platoon_cluster_dist.dat')
xgb_models = joblib.load(f'{path}xgb_models.dat')

# NOTE(review): identical to the `feat` list in the model-training cell; keep
# the two in sync when editing either one.
features = ['speed', 'vert_approach_angle_adj', 'transverse', 'transverse_pit', 'transverse_bat', 
        'lift', 'release_pos_x', 'release_pos_x_pit','release_pos_x_bat', 'release_pos_y', 
        'release_pos_z', 'plate_x', 'plate_x_pit','plate_x_bat', 'plate_x_abs', 'plate_z', 
        'plate_z_top', 'plate_z_bot', 'plate_dist', 'balls', 'strikes', 'speed_diff', 
        'lift_diff', 'transverse_pit_diff', 'vert_approach_angle', 'game_year']

# NOTE(review): simulation call left disabled; presumably it writes the
# sim_vs*_batch*.parquet files that the next cell reads — confirm in stuff_model.
#sim = stuff.simulate_pitches(df, batch_size, n_batches, dist, features, rv, xgb_models, path)

In [11]:
# Stitch the per-batch simulation files (numbered 1..n_batches) into one frame per
# file family (vsR, vsL) and write each combined frame back out.
# Reads and writes both go through `path`; previously the reads hard-coded 'data/'
# while the writes used `path`, so changing `path` would have split the two.
# simR / simL are bound once, directly to DataFrames, rather than first to lists.
batch_numbers = range(1, n_batches + 1)

simR = pd.concat(
    [pd.read_parquet(f'{path}sim_vsR_batch{batch}.parquet') for batch in batch_numbers]
).reset_index(drop=True)
simL = pd.concat(
    [pd.read_parquet(f'{path}sim_vsL_batch{batch}.parquet') for batch in batch_numbers]
).reset_index(drop=True)

simR.to_parquet(f'{path}sim_vsR.parquet')
simL.to_parquet(f'{path}sim_vsL.parquet')

In [12]:
# Input columns for the distilled models (order preserved).
distill_features = [
    'speed', 'speed_diff',
    'lift', 'lift_diff',
    'transverse', 'transverse_pit', 'transverse_pit_diff',
    'release_pos_x', 'release_pos_x_pit', 'release_pos_y', 'release_pos_z',
    'vert_approach_angle_adj',
]
# Targets: every column of simR whose name starts with 'x_'.
events = [col for col in simR.columns if col.startswith('x_')]

# Train one set of distilled models on each simulation frame.
vsR_models = stuff.train_distilled_models(simR, distill_features, events)
vsL_models = stuff.train_distilled_models(simL, distill_features, events)

# NOTE(review): this exact call is repeated at the top of a later cell, so df
# goes through make_distilled_predictions twice in a top-to-bottom run —
# confirm the function is idempotent or drop one of the two calls.
df = stuff.make_distilled_predictions(df, distill_features, events, vsR_models, vsL_models)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





100%|██████████| 10/10 [00:55<00:00,  5.57s/trial, best loss: 0.00014462331971498878]
100%|██████████| 10/10 [00:50<00:00,  5.07s/trial, best loss: 0.00010337266531034472]
100%|██████████| 10/10 [00:16<00:00,  1.64s/trial, best loss: 1.688997020757612e-07]
100%|██████████| 10/10 [00:42<00:00,  4.29s/trial, best loss: 7.803001704313726e-05]
100%|██████████| 10/10 [00:46<00:00,  4.69s/trial, best loss: 7.025269047459416e-05]
100%|██████████| 10/10 [00:32<00:00,  3.27s/trial, best loss: 1.1379767031648009e-06]
100%|██████████| 10/10 [00:29<00:00,  2.98s/trial, best loss: 7.745661903656032e-07]
100%|██████████| 10/10 [00:23<00:00,  2.30s/trial, best loss: 2.9295585484469407e-07]
100%|██████████| 10/10 [00:50<00:00,  5.03s/trial, best loss: 1.192626942279711e-05]
100%|██████████| 10/10 [00:44<00:00,  4.50s/trial, best loss: 2.6125621541963193e-06]
100%|██████████| 10/10 [00:43<00:00,  4.36s/trial, best loss: 1.2756726533254224e-05]
100%|██████████| 10/10 [00:39<00:00,  3.97s/trial, best los

In [17]:
# Apply the distilled vsR / vsL models to df.
# NOTE(review): the model-training cell already ends with this same call; confirm
# whether running it again here is intentional.
df = stuff.make_distilled_predictions(df, distill_features, events, vsR_models, vsL_models)
# NOTE(review): presumably writes the results CSV as a side effect (nothing is
# assigned from the call) — confirm the output location in stuff_model.
stuff.generate_results_csv(df, distill_features)

x_callstr
x_ball
x_hbp
x_swstr
x_foul
x_pop
x_hr
x_if1b
x_gbout
x_gidp
x_air_out
x_1b
x_2b
x_3b
x_run_value


  grp_filt = df[cols].groupby(by=group_cols).filter(lambda x: len(x) >= 10)
  results = grp_filt[cols].groupby(by=group_cols).mean().dropna().reset_index()
  res_count = grp_filt[cols].groupby(by=group_cols).speed.count().reset_index().rename(columns={'speed':'pitches'})
