In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
import xgboost

import pickle

### Read in Fall 2024 Pitch Data

In [2]:
# Load the pitch-level Fall 2024 export; the cells below read from this frame
# and keep adding columns to it.
fall_data = pd.read_csv(
    "/Users/aidanbeilke/Desktop/Purdue_Base/newman_proj/csvs/fall_pitch_type_copy.csv"
)

# (rows, columns) of the raw export.
fall_data.shape

(3905, 169)

In [3]:
# Number of distinct values in the raw 'Pitcher' column (before any renaming or filtering).
fall_data['Pitcher'].nunique()

43

In [4]:
# Pitcher heights in inches, keyed by "First Last" name.
# Insertion order is kept as-is because it determines the row order of heights_df.
player_heights = {
    'Lance Lauve': 71,
    'Joe Trennery': 76,
    'Easton Storey': 71,
    'Brayden Olson': 74,
    'Cole Van Assen': 75,
    'Nick Kolze': 75,
    'Enas Hayden': 75,
    'Gavin Beuter': 76,
    'Maclane Finley': 72,
    'Michael Vallone': 72,
    'Austin Klug': 76,
    'Matthew Tarr': 74,
    'Carter Doorn': 75,
    'Isaac Milburn': 72,
    'Evan Schweizer': 77,
    'Barron Sawyer': 78,
    'Gabriel Watson': 70,
    'Justin Guiliano': 76,
    'Avery Cook': 78,
    'Matthew Totten': 75,
    'Kale Wemer': 73,
    'Chris Gallagher': 72,
    'Luke Reasor': 75,
}

# Two-column lookup table (player_name, height_in_inches), one row per dict entry.
heights_df = pd.DataFrame({
    'player_name': list(player_heights.keys()),
    'height_in_inches': list(player_heights.values()),
})

### Apply Arm Angle Model

In [5]:
# Location of the pickled arm-angle model; it is opened further down in this cell.
catboost = "/Users/aidanbeilke/Desktop/Purdue_Base/newman_proj/models/best_catboost_model.pkl"

# Rename the release columns and the pitcher column to the names used by the
# feature lists below, then drop one pitcher by name.
# NOTE(review): this comparison runs before format_name() reorders
# "Last, First" names; if the raw export uses that layout, "O'Brien Cameron"
# never matches and the filter is a no-op — confirm against the CSV.
fall_data = (
    fall_data
    .rename(columns={
        'RelSide': 'release_pos_x',
        'RelHeight': 'release_pos_z',
        'Extension': 'release_extension',
        'Pitcher': 'player_name',
    })
    .loc[lambda frame: frame['player_name'] != "O'Brien Cameron"]
)

def format_name(name):
    """Convert a "Last, First" name into "First Last".

    Parameters
    ----------
    name : object
        A single raw name value. Anything that is not a ``str`` (for example
        the NaN/None pandas uses for a missing name) is passed through.

    Returns
    -------
    object
        "First Last" when ``name`` consists of exactly two non-empty parts
        separated by one comma; otherwise ``name`` unchanged.
    """
    # Missing values have no .split(); return them untouched instead of
    # raising AttributeError in the middle of a Series.apply.
    if not isinstance(name, str):
        return name

    parts = name.split(",")
    if len(parts) != 2:
        # No comma, or more than one: not the expected format, leave as-is.
        return name

    # Strip so irregular spacing around the comma cannot leave stray
    # whitespace that would break an exact-match join on the name.
    last, first = (part.strip() for part in parts)
    if not last or not first:
        return name

    return f"{first} {last}"

# Put names in "First Last" order so they line up with the keys of heights_df.
fall_data = fall_data.assign(player_name=fall_data['player_name'].apply(format_name))

# Left merge: pitchers missing from heights_df keep their rows but get a NaN height.
fall_data = pd.merge(fall_data, heights_df, how='left', on='player_name')

# Inputs to the arm-angle model, in the order passed to predict().
feats = ['release_pos_x', 'release_pos_z', 'release_extension', 'height_in_inches', 'interaction']

fall_data['interaction'] = (
    fall_data['release_pos_z']
    * fall_data['release_extension']
    * fall_data['height_in_inches']
)

# Drops every pitch missing any model input, which includes all pitches
# thrown by pitchers without a height entry (NaN height -> NaN interaction).
fall_data = fall_data.dropna(subset=feats)

# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open(catboost, 'rb') as cb_file:
    catboost_model = pickle.load(cb_file)

model_inputs = fall_data[feats]
pred_arm_angle = catboost_model.predict(model_inputs)

fall_data['arm_angle'] = pred_arm_angle

### Apply iVB Model

In [6]:
ivb_model_path = "/Users/aidanbeilke/Desktop/Purdue_Base/newman_proj/models/ivb_model.pkl"

# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open(ivb_model_path, 'rb') as file:
    ivb_model = pickle.load(file)

# Inputs to the iVB model, in the order passed to predict(); 'arm_angle' is the
# prediction written by the previous cell.
ivb_features = [
    'arm_angle',
    'release_pos_x',
    'release_pos_z',
    'release_extension',
    'RelSpeed',
    'SpinRate',
]

# xiVB is the model's predicted induced vertical break;
# iVB_oe is the observed InducedVertBreak minus that prediction.
fall_data['xiVB'] = ivb_model.predict(fall_data[ivb_features])
fall_data['iVB_oe'] = fall_data['InducedVertBreak'].sub(fall_data['xiVB'])

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



### Apply Location Model

In [7]:
# Location of the pickled xRV model; it is opened at the end of this cell.
xrv_model = "/Users/aidanbeilke/Desktop/Purdue_Base/newman_proj/models/xrv_model.pkl"

# Pitch types treated as fastballs when picking each pitcher's primary fastball.
fastballs = ['Fastball', 'Sinker', 'Cutter']
college_fb = fall_data.loc[fall_data['pitch_type'].isin(fastballs)]

# Most frequent fastball type per pitcher and batter side. mode() can return
# several values on a tie; .iloc[0] takes the first one it returns.
most_common_fb = (
    college_fb
    .groupby(['PitcherId', 'BatterSide'])['pitch_type']
    .agg(lambda pitch_types: pitch_types.mode().iloc[0])
    .reset_index(name='most_common_fb')
)

# Keep only the pitches that are the pitcher's primary fastball versus that side.
college_fb = college_fb.merge(most_common_fb, on=['PitcherId', 'BatterSide'], how='left')
college_fb_filtered = college_fb.loc[
    college_fb['pitch_type'].eq(college_fb['most_common_fb'])
]

# Mean velocity, release point and movement of that primary fastball.
average_metrics = (
    college_fb_filtered
    .groupby(['PitcherId', 'BatterSide', 'most_common_fb'])
    .agg(
        avg_RelSpeed=('RelSpeed', 'mean'),
        avg_release_pos_x=('release_pos_x', 'mean'),
        avg_release_pos_z=('release_pos_z', 'mean'),
        avg_HorzBreak=('HorzBreak', 'mean'),
        avg_InducedVertBreak=('InducedVertBreak', 'mean'),
    )
    .reset_index()
)

# Attach the baselines (and the most_common_fb label) to every pitch. Left merge:
# pitcher/side pairs with no fastball in the data get NaN in the avg_* columns.
fall_data = pd.merge(fall_data, average_metrics, how='left', on=['PitcherId', 'BatterSide'])

# Handedness is assigned from a hard-coded name list: the listed pitchers are
# marked 'L', everyone else 'R'. This overwrites any existing PitcherThrows column.
left_handed_pitchers = [
    'Easton Storey',
    'Michael Vallone',
    'Isaac Milburn',
    'Justin Guiliano',
    'Luke Reasor',
]
fall_data['PitcherThrows'] = np.where(
    fall_data['player_name'].isin(left_handed_pitchers), 'L', 'R'
)

# One boolean mask per (batter side, pitcher hand) pair, in the order
# Left/L, Left/R, Right/L, Right/R, encoded as 0..3 respectively.
conditions = [
    (fall_data['BatterSide'] == side) & (fall_data['PitcherThrows'] == hand)
    for side in ('Left', 'Right')
    for hand in ('L', 'R')
]

values = list(range(4))

# NOTE(review): np.select falls back to 0 when no mask matches, so a pitch whose
# BatterSide is neither 'Left' nor 'Right' is encoded the same as Left/L — confirm
# no such rows exist, or that this is intended.
fall_data['platoon_state'] = np.select(conditions, values)

# Encode each (balls, strikes) count as balls * 3 + strikes, giving 0..11 for
# 0-3 balls and 0-2 strikes, in the same key order as a hand-written table.
count_mapping = {
    (balls, strikes): balls * 3 + strikes
    for balls in range(4)
    for strikes in range(3)
}

# Build one (Balls, Strikes) tuple per pitch and look it up; any pair that is
# not a key of count_mapping comes out as NaN.
fall_data['count'] = (
    fall_data[['Balls', 'Strikes']]
    .apply(tuple, axis=1)
    .map(count_mapping)
)

# Inputs to the xRV model, in the order passed to predict(). The order is kept
# exactly as-is because the model receives the columns in this sequence.
loc_features = [
    'RelSpeed',
    'release_pos_x',
    'release_pos_z',
    'platoon_state',
    'count',
    'HorzBreak',
    'InducedVertBreak',
    'release_extension',
    'SpinRate',
    'PlateLocHeight',
    'PlateLocSide',
    'SpinAxis',
    'avg_RelSpeed',
    'avg_release_pos_x',
    'avg_release_pos_z',
    'avg_HorzBreak',
    'avg_InducedVertBreak',
    'arm_angle',
    'VertApprAngle',
    'iVB_oe',
]

# xrv_model holds the file path at this point and is rebound to the loaded model.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open(xrv_model, 'rb') as file:
    xrv_model = pickle.load(file)

xrv_inputs = fall_data[loc_features]
fall_data['xrv'] = xrv_model.predict(xrv_inputs)

In [76]:
# Summarise every pitcher / pitch-type combination: column means plus pitch count.
pitch_keys = ['player_name', 'pitch_type']
summary_cols = [
    'RelSpeed', 'release_pos_x', 'release_pos_z',
    'HorzBreak', 'InducedVertBreak', 'iVB_oe',
    'PlateLocHeight', 'PlateLocSide', 'xrv',
]

pitches_by_type = fall_data.groupby(pitch_keys)

grouped_df = pitches_by_type[summary_cols].mean()

# n = number of pitches behind each row of means.
count_df = pitches_by_type.size().reset_index(name='n')

grouped_df = pd.merge(grouped_df.reset_index(), count_df, on=pitch_keys)

In [77]:
# Keep pitcher / pitch-type rows backed by at least 5 pitches, lowest mean xrv first.
grouped_df = grouped_df.loc[grouped_df['n'].ge(5)].sort_values('xrv')


In [78]:
# Mean and standard deviation of the per-pitcher xrv within each pitch type.
grouped_stats = (
    grouped_df
    .groupby('pitch_type')['xrv']
    .agg(mean_xrv='mean', std_xrv='std')
    .reset_index()
)

df = pd.merge(grouped_df, grouped_stats, on='pitch_type')

# Scaled score: 100 is the pitch-type mean and every standard deviation is worth
# 10 points. The sign is flipped (mean - xrv), so a lower xrv gives a higher score.
# A pitch type whose std is NaN or 0 will not produce a finite score.
df['xrv_plus'] = (
    df['mean_xrv']
    .sub(df['xrv'])
    .div(df['std_xrv'])
    .mul(10)
    .add(100)
    .round()
)

In [80]:
# One xrv_plus value per pitcher and pitch type, with the keys kept as columns.
all_pitchers = df.groupby(['player_name', 'pitch_type'], as_index=False)['xrv_plus'].mean()

# Written relative to the notebook's working directory.
# NOTE(review): the default index is written too, so the file gets an unnamed
# first column of row numbers — confirm downstream readers expect that.
all_pitchers.to_csv("all_xrvs.csv")

In [29]:
# Display the scored table, lowest mean xrv first (sort_values is ascending by default).
# NOTE(review): the saved output under this cell lists rows with n < 5 and
# unrounded xrv_plus values, so it appears to predate the current cells — re-run.
df.sort_values('xrv')

Unnamed: 0,player_name,pitch_type,RelSpeed,release_pos_x,release_pos_z,HorzBreak,InducedVertBreak,iVB_oe,PlateLocHeight,PlateLocSide,xrv,n,mean_xrv,std_xrv,xrv_plus
73,Gabriel Watson,Curveball,80.207220,2.657180,5.312290,-17.534460,-9.018150,-4.041643,1.824780,0.623730,-0.074125,1,0.020108,0.050932,81.498318
71,Chris Gallagher,Curveball,72.767990,3.227630,3.350300,-15.425010,-5.725990,-9.047101,2.096850,0.585710,-0.027471,1,0.020108,0.050932,90.658394
8,Gabriel Watson,ChangeUp,82.469420,2.104470,5.283250,3.579155,3.277345,0.203349,0.759630,0.460165,-0.024695,2,0.027054,0.019970,74.086605
74,Isaac Milburn,Curveball,77.532388,-1.733160,5.650427,15.044271,-15.400662,-7.953001,1.726158,0.180211,-0.016769,42,0.020108,0.050932,92.759636
58,Gavin Beuter,Slider,83.587608,2.031169,5.452733,-11.234356,5.175407,3.965819,2.019484,-0.901412,-0.007312,28,0.023056,0.028581,89.374751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,Barron Sawyer,Sinker,92.933750,1.988250,5.625460,16.670360,11.699340,-3.517386,2.134240,1.893680,0.073262,1,0.038171,0.034853,110.068345
81,Enas Hayden,Cutter,82.366263,2.249568,5.994480,2.669320,8.082080,7.452289,-0.126727,-0.931422,0.080318,6,0.025718,0.029064,118.785926
79,Michael Vallone,Curveball,77.570810,-3.196150,4.321380,12.865440,1.822130,1.420259,2.582360,1.424220,0.107919,1,0.020108,0.050932,117.240724
44,Justin Guiliano,Sinker,86.434900,-1.400750,4.447020,-15.865400,7.299370,1.190633,0.626070,1.575160,0.120588,1,0.038171,0.034853,123.646809


In [10]:
# Write the pitch-level frame, including the columns added by the cells above,
# to the notebook's working directory (the DataFrame index is written as well).
fall_data.to_csv("fall_xrv.csv")