In [378]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from pathlib import Path
import statsmodels.api as sm
import glob
import os
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.patches as patches

Stuff+ model trained on the 2023 NCAA baseball season, using all available games on Trackman.

In [1]:
data = pd.read_csv(r'2023NCAATrackman.csv', low_memory=False)

In [381]:
df = data[['Pitcher', 'PitcherTeam', 'TaggedPitchType', 'PitchCall', 'TaggedHitType',
           'PlayResult', 'RelSpeed', 'VertRelAngle', 'HorzRelAngle', 'SpinRate',
           'SpinAxis', 'RelHeight', 'RelSide', 'Extension', 'InducedVertBreak', 
           'HorzBreak', 'ZoneSpeed', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo', 'PlateLocSide', 'PlateLocHeight']]

In [382]:
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
df = df.reset_index()

Adjust some of the variables so that they treat positive and negative the same - where it make sense to.

In [383]:
df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

Here are the run values used for this model.

In [384]:
home_run = 1.374328827219,
triple = 1.05755624961515,
double = 0.766083122898271,
single = 0.467292970729251
ball = 0.0636883289483747,
hit_by_pitch = 0.0636883289483747,
blocked_ball = 0.0636883289483747,
foul = -0.0380502742575014,
foul_tip = -0.0380502742575014,
bunt_foul = -0.0380502742575014,
bunt_foul_tip = -0.0380502742575014,
called_strike = -0.065092516089806,
swinging_strike = -0.118124935770601,
swinging_strike_blocked = -0.118124935770601,
force_out = -0.1955687665555,
grounded_into_double_play = -0.1955687665555,
fielders_choice_out = -0.1955687665555,
fielders_choice = -0.1955687665555,
field_out = -0.1955687665555,
double_play = -0.1955687665555,
Sac_fly = -0.236889645519856,
field_error = -0.236889645519856,
catcher_interf = -0.789788814378052
sac_fly_double_play = -0.789788814378052
triple_play = -0.789788814378052

Next, the technique for assigning each pitch in the dataframe a run value.

In [385]:
df["in play"] = df['PlayResult'] + ' ' + df['TaggedHitType']

In [386]:
inplay = df['PitchCall'] == 'InPlay'
df.loc[inplay, 'PitchCall'] = df.loc[inplay, 'in play']

In [387]:
for index, row in df.iterrows():
    if row['PitchCall'] == 'InPlay':
        repval = row['in play']
        df.at[index, 'PitchCall'] = repval

In [388]:
repl = {'StrikeSwinging': 'swinging_strike', 'BallCalled': 'ball',
       'FoulBall': 'foul', 'Out FlyBall': 'field_out', 'Single GroundBall': 'single',
       'StrikeCalled': 'called_strike', 'Out Popup': 'field_out',
       'Out GroundBall': 'field_out', 'Double FlyBall': 'double', 
       'BallinDirt': 'ball', 'HomeRune LineDrive': 'home_run', 
       'Out LineDrive': 'field_out', 'HitByPitch': 'hit_by_pitch', 
       'Single LineDrive': 'single', 'Double LineDrive': 'double',
       'Error GroundBall': 'field_error', 'Sacrifice FlyBall': 'Sac_fly',
       'HomeRun FlyBall': 'home_run', 'Double GroundBall': 'double', 
       'FieldersChoice GroundBall': 'fielders_choice', 'Single Bunt': 'single', 
       'Sacrifice Bunt': 'fielders_choice_out', 'Error Bunt': 'field_error', 
       'Single FlyBall': 'single', 'Sacrifice LineDrive': 'sac_fly', 'Out Bunt': 'field_out',
       'Triple LineDrive': 'triple', 'Single Popup': 'single', 'Error Popup': 'field_error',
       'Triple FlyBall': 'triple', 'Sacrifice Popup': 'sac_fly'}

In [389]:
df['PitchCall'] = df['PitchCall'].replace(repl)

In [390]:
df["Run Values"] = df["PitchCall"].map({
'home_run': 1.374328827219,
'triple': 1.05755624961515,
'double': 0.766083122898271,
'single': 0.467292970729251,
'ball': 0.0636883289483747,
'hit_by_pitch': 0.0636883289483747,
'blocked_ball': 0.0636883289483747,
'foul': -0.0380502742575014,
'foul_tip': -0.0380502742575014,
'bunt_foul': -0.0380502742575014,
'bunt_foul_tip': -0.0380502742575014,
'called_strike': -0.065092516089806,
'swinging_strike': -0.118124935770601,
'swinging_strike_blocked': -0.118124935770601,
'force_out': -0.1955687665555,
'grounded_into_double_play': -0.1955687665555,
'fielders_choice_out': -0.1955687665555,
'fielders_choice': -0.1955687665555,
'field_out': -0.1955687665555,
'double_play': -0.1955687665555,
'Sac_fly': -0.236889645519856,
'field_error': -0.236889645519856,
'catcher_interf': -0.789788814378052,
'sac_fly_double_play': -0.789788814378052,
'triple_play': -0.789788814378052
                                       })

In [391]:
df = df.drop(['in play', 'PlayResult', 'PitchCall', 'TaggedHitType'], axis=1)

In [392]:
df = df.drop(['index'], axis=1)

In [394]:
df.rename(columns = {'Run Values':'RV'}, inplace=True)

In [395]:
#fix the pitchers' names:
df['Pitcher'] = df['Pitcher'].str.split(' ', expand=True)[1] + ' ' + df['Pitcher'].str.split(' ', expand=True)[0]

In [396]:
df = df.replace(',','', regex=True)

In [397]:
df = df.dropna()

To train, I've subset the table by pitch type, splitting up each "fastball" variety as well as offspeed pitches and breaking balls.

Tags on the Trackman games aren't always accurate, especially tags for the away team, but this will make do.

In [398]:
dff = df[(df['TaggedPitchType'] == 'Fastball') | 
          (df['TaggedPitchType'] == 'FourSeamFastball')]
dfc = df[df['TaggedPitchType'] == 'Cutter']
dsk = df[(df['TaggedPitchType'] == 'TwoSeamFastball') |
          (df['TaggedPitchType'] == 'Sinker')]
dsl = df[df['TaggedPitchType'] == 'Slider']
dch = df[df['TaggedPitchType'] == 'ChangeUp']
dcb = df[df['TaggedPitchType'] == 'Curveball']
dsp = df[df['TaggedPitchType'] == 'Splitter']

In [399]:
X1 = dff[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y1 = dff['RV']

In [400]:
X2 = dfc[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y2 = dfc['RV']

In [401]:
X3 = dsk[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y3 = dsk['RV']

In [402]:
X4 = dsl[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y4 = dsl['RV']

In [403]:
X5 = dch[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y5 = dch['RV']

In [404]:
X6 = dcb[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y6 = dcb['RV']

In [405]:
X7 = dsp[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
y7 = dsp['RV']

In [406]:
#Split data into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.25, random_state=0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.25, random_state=0)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.25, random_state=0)
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.25, random_state=0)
X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y5, test_size=0.25, random_state=0)
X6_train, X6_test, y6_train, y6_test = train_test_split(X6, y6, test_size=0.25, random_state=0)
X7_train, X7_test, y7_train, y7_test = train_test_split(X7, y7, test_size=0.25, random_state=0)

The method for each pitch type subset:
1. fit the model with scikit-learn's random forest and XGBoost
2. obtain the list of feature importances, to help contextualize the model's predictions
3. the predicted run value used is the average of each prediction method

In [407]:
#fastball randomforest
rfr1 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr1.fit(X1_train, y1_train)

In [408]:
#fastball xgboost
xgb_model1 = XGBRegressor()
xgb_model1.fit(X1_train, y1_train)

In [409]:
#fastball feature importances
feature_importances1r = pd.DataFrame(rfr1.feature_importances_, index = X1_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances1x = pd.DataFrame(xgb_model1.feature_importances_, index = X1_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances1 = feature_importances1x.join(feature_importances1r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances1['mean_importance_FF'] = (feature_importances1['importance_xgb'] + feature_importances1['importance_rf']) / 2
feature_importances1 = feature_importances1.sort_values(by='mean_importance_FF', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [410]:
dff['rfPredictRV'] = rfr1.predict(X1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['rfPredictRV'] = rfr1.predict(X1)


In [411]:
dff['xgPredictRV'] = xgb_model1.predict(X1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['xgPredictRV'] = xgb_model1.predict(X1)


In [412]:
dff['xRV'] = (dff['rfPredictRV'] + dff['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['xRV'] = (dff['rfPredictRV'] + dff['xgPredictRV']) / 2


In [413]:
#cutter randomforest
rfr2 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr2.fit(X2_train, y2_train)

In [414]:
#cutter xgboost
xgb_model2 = XGBRegressor()
xgb_model2.fit(X2_train, y2_train)

In [415]:
#cutter feature importances
feature_importances2r = pd.DataFrame(rfr2.feature_importances_, index = X2_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances2x = pd.DataFrame(xgb_model2.feature_importances_, index = X2_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances2 = feature_importances2x.join(feature_importances2r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances2['mean_importance_CT'] = (feature_importances2['importance_xgb'] + feature_importances2['importance_rf']) / 2
feature_importances2 = feature_importances2.sort_values(by='mean_importance_CT', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [416]:
dfc['rfPredictRV'] = rfr2.predict(X2)
dfc['xgPredictRV'] = xgb_model2.predict(X2)
dfc['xRV'] = (dfc['rfPredictRV'] + dfc['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['rfPredictRV'] = rfr2.predict(X2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['xgPredictRV'] = xgb_model2.predict(X2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['xRV'] = (dfc['rfPredictRV'] + dfc['xgPredictRV']) / 2


In [417]:
#sinker randomforest
rfr3 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr3.fit(X3_train, y3_train)

In [418]:
#sinker xgboost
xgb_model3 = XGBRegressor()
xgb_model3.fit(X3_train, y3_train)

In [419]:
#sinker feature importances
feature_importances3r = pd.DataFrame(rfr3.feature_importances_, index = X3_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances3x = pd.DataFrame(xgb_model3.feature_importances_, index = X3_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances3 = feature_importances3x.join(feature_importances3r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances3['mean_importance_SK'] = (feature_importances3['importance_xgb'] + feature_importances3['importance_rf']) / 2
feature_importances3 = feature_importances3.sort_values(by='mean_importance_SK', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [420]:
dsk['rfPredictRV'] = rfr3.predict(X3)
dsk['xgPredictRV'] = xgb_model3.predict(X3)
dsk['xRV'] = (dsk['rfPredictRV'] + dsk['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk['rfPredictRV'] = rfr3.predict(X3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk['xgPredictRV'] = xgb_model3.predict(X3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk['xRV'] = (dsk['rfPredictRV'] + dsk['xgPredictRV']) / 2


In [421]:
#slider randomforest
rfr4 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr4.fit(X4_train, y4_train)

In [422]:
#slider xgboost
xgb_model4 = XGBRegressor()
xgb_model4.fit(X4_train, y4_train)

In [423]:
#slider feature importances
feature_importances4r = pd.DataFrame(rfr4.feature_importances_, index = X4_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances4x = pd.DataFrame(xgb_model4.feature_importances_, index = X4_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances4 = feature_importances4x.join(feature_importances4r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances4['mean_importance_SL'] = (feature_importances4['importance_xgb'] + feature_importances4['importance_rf']) / 2
feature_importances4 = feature_importances4.sort_values(by='mean_importance_SL', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [424]:
dsl['rfPredictRV'] = rfr4.predict(X4)
dsl['xgPredictRV'] = xgb_model4.predict(X4)
dsl['xRV'] = (dsl['rfPredictRV'] + dsl['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsl['rfPredictRV'] = rfr4.predict(X4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsl['xgPredictRV'] = xgb_model4.predict(X4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsl['xRV'] = (dsl['rfPredictRV'] + dsl['xgPredictRV']) / 2


In [425]:
#changeup randomforest
rfr5 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr5.fit(X5_train, y5_train)

In [426]:
#changeup xgboost
xgb_model5 = XGBRegressor()
xgb_model5.fit(X5_train, y5_train)

In [427]:
#changeup feature importances
feature_importances5r = pd.DataFrame(rfr5.feature_importances_, index = X5_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances5x = pd.DataFrame(xgb_model5.feature_importances_, index = X5_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances5 = feature_importances5x.join(feature_importances5r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances5['mean_importance_CH'] = (feature_importances5['importance_xgb'] + feature_importances5['importance_rf']) / 2
feature_importances5 = feature_importances5.sort_values(by='mean_importance_CH', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [428]:
dch['rfPredictRV'] = rfr5.predict(X5)
dch['xgPredictRV'] = xgb_model5.predict(X5)
dch['xRV'] = (dch['rfPredictRV'] + dch['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dch['rfPredictRV'] = rfr5.predict(X5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dch['xgPredictRV'] = xgb_model5.predict(X5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dch['xRV'] = (dch['rfPredictRV'] + dch['xgPredictRV']) / 2


In [429]:
#curveball randomforest
rfr6 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr6.fit(X6_train, y6_train)

In [430]:
#curveball xgboost
xgb_model6 = XGBRegressor()
xgb_model6.fit(X6_train, y6_train)

In [431]:
#curveball feature importances
feature_importances6r = pd.DataFrame(rfr6.feature_importances_, index = X6_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances6x = pd.DataFrame(xgb_model6.feature_importances_, index = X6_train.columns, columns=['importance']).sort_values('importance', ascending=False)
feature_importances6 = feature_importances6x.join(feature_importances6r,  how='outer', lsuffix='_xgb', rsuffix='_rf')
feature_importances6['mean_importance_CB'] = (feature_importances6['importance_xgb'] + feature_importances6['importance_rf']) / 2
feature_importances6 = feature_importances6.sort_values(by='mean_importance_CB', ascending=False).drop(['importance_xgb', 'importance_rf'], axis=1).reset_index()

In [433]:
mean_features0 = feature_importances1.merge(feature_importances2, on='index', how='left')
mean_features1 = mean_features0.merge(feature_importances3, on='index', how='left')
mean_features2 = mean_features1.merge(feature_importances4, on='index', how='left')
mean_features3 = mean_features2.merge(feature_importances5, on='index', how='left')
mean_features = mean_features3.merge(feature_importances6, on='index', how='left')

#mean_features.to_csv(r'2023stuffplus_feature_importances2.csv', index=False)

In [434]:
dcb['rfPredictRV'] = rfr6.predict(X6)
dcb['xgPredictRV'] = xgb_model6.predict(X6)
dcb['xRV'] = (dcb['rfPredictRV'] + dcb['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dcb['rfPredictRV'] = rfr6.predict(X6)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dcb['xgPredictRV'] = xgb_model6.predict(X6)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dcb['xRV'] = (dcb['rfPredictRV'] + dcb['xgPredictRV']) / 2


In [435]:
#splitter randomforest
rfr7 = RandomForestRegressor(n_estimators=10, max_depth=10)
rfr7.fit(X7_train, y7_train)

In [436]:
#splitter xgboost
xgb_model7 = XGBRegressor()
xgb_model7.fit(X7_train, y7_train)

In [437]:
dsp['rfPredictRV'] = rfr7.predict(X7)
dsp['xgPredictRV'] = xgb_model7.predict(X7)
dsp['xRV'] = (dsp['rfPredictRV'] + dsp['xgPredictRV']) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsp['rfPredictRV'] = rfr7.predict(X7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsp['xgPredictRV'] = xgb_model7.predict(X7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsp['xRV'] = (dsp['rfPredictRV'] + dsp['xgPredictRV']) / 2


Now, calculate Stuff+

In [438]:
dff['xRV/100'] = dff['xRV'] * 100
dfc['xRV/100'] = dfc['xRV'] * 100
dsk['xRV/100'] = dsk['xRV'] * 100
dch['xRV/100'] = dch['xRV'] * 100
dcb['xRV/100'] = dcb['xRV'] * 100
dsl['xRV/100'] = dsl['xRV'] * 100
dsp['xRV/100'] = dsp['xRV'] * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['xRV/100'] = dff['xRV'] * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfc['xRV/100'] = dfc['xRV'] * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsk['xRV/100'] = dsk['xRV'] * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [439]:
#Create a new column called xRV/100 scaled negative which is the xRV/100 - 3.503456
dff = dff.assign(xRV100_scaled_negative=dff['xRV/100'] - 3.50345)
dfc = dfc.assign(xRV100_scaled_negative=dfc['xRV/100'] - 3.50345)
dsk = dsk.assign(xRV100_scaled_negative=dsk['xRV/100'] - 3.50345)
dch = dch.assign(xRV100_scaled_negative=dch['xRV/100'] - 3.50345)
dsl = dsl.assign(xRV100_scaled_negative=dsl['xRV/100'] - 3.50345)
dcb = dcb.assign(xRV100_scaled_negative=dcb['xRV/100'] - 3.50345)
dsp = dsp.assign(xRV100_scaled_negative=dsp['xRV/100'] - 3.50345)

In [440]:
#Create a new column which is the aboslute value of xRV/100 scaled negative
dff = dff.assign(xRV100_scaled_negative_abs=dff['xRV100_scaled_negative'].abs())
dfc = dfc.assign(xRV100_scaled_negative_abs=dfc['xRV100_scaled_negative'].abs())
dsk = dsk.assign(xRV100_scaled_negative_abs=dsk['xRV100_scaled_negative'].abs())
dch = dch.assign(xRV100_scaled_negative_abs=dch['xRV100_scaled_negative'].abs())
dsl = dsl.assign(xRV100_scaled_negative_abs=dsl['xRV100_scaled_negative'].abs())
dcb = dcb.assign(xRV100_scaled_negative_abs=dcb['xRV100_scaled_negative'].abs())
dsp = dsp.assign(xRV100_scaled_negative_abs=dsp['xRV100_scaled_negative'].abs())

In [441]:
#create a new column called Stuff+ which is ((xRV100_scaled_negative_abs)/mean of xRV100_scaled_negative_abs) * 100
dff = dff.assign(Stuff_plus=(dff['xRV100_scaled_negative_abs']/dff['xRV100_scaled_negative_abs'].mean())*100)
dfc = dfc.assign(Stuff_plus=(dfc['xRV100_scaled_negative_abs']/dfc['xRV100_scaled_negative_abs'].mean())*100)
dsk = dsk.assign(Stuff_plus=(dsk['xRV100_scaled_negative_abs']/dsk['xRV100_scaled_negative_abs'].mean())*100)
dch = dch.assign(Stuff_plus=(dch['xRV100_scaled_negative_abs']/dch['xRV100_scaled_negative_abs'].mean())*100)
dsl = dsl.assign(Stuff_plus=(dsl['xRV100_scaled_negative_abs']/dsl['xRV100_scaled_negative_abs'].mean())*100)
dcb = dcb.assign(Stuff_plus=(dcb['xRV100_scaled_negative_abs']/dcb['xRV100_scaled_negative_abs'].mean())*100)
dsp = dsp.assign(Stuff_plus=(dsp['xRV100_scaled_negative_abs']/dsp['xRV100_scaled_negative_abs'].mean())*100)

For each pitch type subset, get each pitcher's average Stuff+.

In [443]:
#fastball averages
dffavg = dff.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [444]:
pff = dff.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dffavg['PitchCount'] = pff

In [445]:
dffavg = dffavg.round(2)

In [446]:
dffavg.loc[dffavg['Pitcher'] == 'Paul Skenes']

Unnamed: 0,Pitcher,PitcherTeam,TaggedPitchType,RelSpeed,SpinRate,InducedVertBreak,HorzBreak,VertApprAngle,Stuff_plus,PitchCount
4302,Paul Skenes,LSU_TIG,Fastball,98.27,2514.83,15.79,17.05,-5.1,158.45,1705


In [447]:
#cutter average
dfcavg = dfc.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [448]:
pcc = dfc.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dfcavg['PitchCount'] = pcc

In [449]:
dfcavg = dfcavg.round(2)

In [450]:
#sinkers
dskavg = dsk.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [451]:
psk = dsk.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dskavg['PitchCount'] = psk

In [452]:
dskavg = dskavg.round(2)

In [453]:
#changeups
dchavg = dch.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [454]:
pch = dch.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dchavg['PitchCount'] = pch

In [455]:
dchavg = dchavg.round(2)

In [456]:
#sliders
dslavg = dsl.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [457]:
psl = dsl.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dslavg['PitchCount'] = psl

In [458]:
dslavg = dslavg.round(2)

In [459]:
#curveball
dcbavg = dcb.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [460]:
pcb = dcb.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dcbavg['PitchCount'] = pcb

In [461]:
dcbavg = dcbavg.round(2)

In [462]:
#splitter
dspavg = dsp.groupby(['Pitcher', 'PitcherTeam', 'TaggedPitchType']).agg({'RelSpeed': 'mean', 'SpinRate': 'mean', 'InducedVertBreak': 'mean', 'HorzBreak': 'mean', 'VertApprAngle': 'mean', 'Stuff_plus': 'mean'}).reset_index()

In [463]:
psp = dsp.groupby(['Pitcher', 'PitcherTeam']).size().to_numpy()
dspavg['PitchCount'] = psp

In [464]:
dstuffplus = pd.concat([dffavg, dfcavg, dskavg, dchavg, dcbavg, dslavg, dspavg])

The final table with Stuff+ for each pitcher by pitch type:

In [None]:
dstuffplus.head(10)

The arsenal Stuff+ value is calculated with a weighted average by pitch count:

In [469]:
def w_avg(df, values, weights):
    d = df[values]
    w = df[weights]
    return (d * w).sum() / w.sum()

In [470]:
dstuffplus_arsenal = dstuffplus.groupby(['Pitcher', 'PitcherTeam']).apply(w_avg, 'Stuff_plus', 'PitchCount').reset_index()

In [471]:
dstuffplus_arsenal.head(10)

Unnamed: 0,Pitcher,0
0,Austin Anderson,121.363922
1,Dylan McShane,93.545789
2,Grayson Grinsell,137.55707
3,Gus Rogers,69.0528
4,Ian Umlandt,85.324667
5,Jace Stoffal,105.015712
6,Jackson Jaha,91.999762
7,Jackson Pace,79.962592
8,Jacob Hughes,107.337013
9,Josh Mollerus,131.131969


Below are functions that can be used to get the Stuff+ for each pitch type, utilizing the unique trained models for each pitch type:

In [472]:
def getFBstuff(df):
    
    '''Function to get fastball stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for fastballs, function accesses trained model for fastballs.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr1.predict(X)
    df['xgPredictRV'] = xgb_model1.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus

In [473]:
def getCTstuff(df):
    
    '''Function to get cutter stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for specific pitch type, function accesses trained model for pitch type.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr2.predict(X)
    df['xgPredictRV'] = xgb_model2.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus

In [474]:
def getSKstuff(df):
    
    '''Function to get sinker stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for specific pitch type, function accesses trained model for pitch type.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr3.predict(X)
    df['xgPredictRV'] = xgb_model3.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus

In [475]:
def getCHstuff(df):
    
    '''Function to get changeup stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for specific pitch type, function accesses trained model for pitch type.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr5.predict(X)
    df['xgPredictRV'] = xgb_model5.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus

In [476]:
def getSLstuff(df):
    
    '''Function to get slider stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for specific pitch type, function accesses trained model for pitch type.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr4.predict(X)
    df['xgPredictRV'] = xgb_model4.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus

In [477]:
def getCBstuff(df):
    
    '''Function to get curveball stuff plus.
    DataFrame must Run Values already input.
    Gets xRV, then does calculations to get Stuff+
    returns DataFrame with Stuff+ value for each pitch.
    Only use for specific pitch type, function accesses trained model for pitch type.
    '''
    #first, we need to add some columns that we'll use for prediction.
    df.insert(6, "DifferentialBreak", df["InducedVertBreak"] - df["HorzBreak"].abs(), True)
    df.insert(10, "ABSSideRelease", df["RelSide"].abs(), True)
    df.insert(11, 'ABSHorzBreak', df['HorzBreak'].abs(), True)

    #next, set your predictor and response variables. 
    X = df[['DifferentialBreak', 'RelSpeed', 'VertRelAngle', 'ABSSideRelease', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 
        'RelHeight', 'Extension', 'InducedVertBreak', 'ABSHorzBreak', 'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo']]
    y = df['RV']

    #now, we can use the random forest and xgboost objects from this doc to predict.
    #X1 and y1 refer to the objects trained on fastballs.
    df['rfPredictRV'] = rfr6.predict(X)
    df['xgPredictRV'] = xgb_model6.predict(X)
    df['xRV'] = (df['rfPredictRV'] + df['xgPredictRV']) / 2
    #the xRV column is the mean of the two different predictors' results.

    #calculating Stuff+:

    #add a column for xRV * 100
    df['xRV100'] = df['xRV'] * 100

    #add a column to scale the xRV100, so that we can take its abs
    df = df.assign(xRV100_scaled_negative=df['xRV100'] - 3.50345)

    df = df.assign(xRV100_scaled_negative_abs=df['xRV100_scaled_negative'].abs())
    
    #and get the stuff plus:
    df_stuffplus = df.assign(Stuff_plus=(df['xRV100_scaled_negative_abs']/df['xRV100_scaled_negative_abs'].mean())*100)

    return df_stuffplus