In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pitch_data import load_data # This is the custom function I built to load the data from cwd

In [2]:
pitches, metadata = load_data() # Function at work

Data Loaded!


In [3]:
# Replacing the column names from the sandbox notebook 
pitches_to_replace = pitches['pitch_type'].dropna().value_counts(normalize=True) # Getting pitch % for each pitch_type
pitches_to_replace = pitches_to_replace.where(pitches_to_replace <= .05).dropna().index.to_list() # creating a list of pitches that were thrown <5% in our data
pitches['pitch_type'] = pitches['pitch_type'].replace(pitches_to_replace, "other").dropna() # Replacing those pitch types with "other"

In [4]:
# Typical Features
pitches['RISP'] = pitches[['on_2b','on_3b']].any(axis = 1) # Bool for is there a runner in scoring position
pitches['previous_pitch'] = pitches.groupby(['game_pk','inning','pitcher_id'])['pitch_type'].shift(1) # what was the previous pitch
pitches['previous_outcome'] = pitches.groupby(['game_pk','inning','pitcher_id'])['type'].shift(1) # was the previous pitch a ball, strike, out, etc
pitches['handedness'] = pitches['p_throws'] + pitches['stand'] # string to OH encode for pitcher batter matchup
pitches['run_diff'] = np.where(pitches['top'] == 1, pitches['away_team_runs'] - pitches['home_team_runs'], pitches['home_team_runs'] - pitches['away_team_runs']) # run differential
pitches['leading'] = np.where(pitches['run_diff'] > 0, 1,0) # bool for team leading
pitches['trailing'] = np.where(pitches['run_diff'] < 0, 1,0) # bool for team trailing

# Pitcher Specific 
pitches['mean_career_velocity'] = pitches.groupby('pitcher_id')['start_speed'].rolling(999999, min_periods=100, closed='left').mean().reset_index().set_index('level_1').sort_index()['start_speed'] # What is the pitchers mean velo in his career? min 100 pitches.
pitches['l5_velocity'] = pitches.groupby('pitcher_id')['start_speed'].rolling(5, min_periods=5, closed='left').mean().reset_index().set_index('level_1').sort_index()['start_speed'] # What is the pitchers mean velo over his last 5 pitches?
pitches['%_diff_career'] = pitches['l5_velocity']  / pitches['mean_career_velocity'] - 1 # % difference for the pitchers last 5 velo vs career.
pitches

Unnamed: 0,uid,game_pk,year,date,team_id_b,team_id_p,inning,top,at_bat_num,pcount_at_bat,...,RISP,previous_pitch,previous_outcome,handedness,run_diff,leading,trailing,mean_career_velocity,l5_velocity,%_diff_career
0,14143226,286874,2011,2011-03-31,108,118,1,1,1,1,...,False,,,RL,0,0,0,,,
1,14143227,286874,2011,2011-03-31,108,118,1,1,1,2,...,False,,B,RL,0,0,0,,,
2,14143228,286874,2011,2011-03-31,108,118,1,1,1,3,...,False,,B,RL,0,0,0,,,
3,14143229,286874,2011,2011-03-31,108,118,1,1,1,4,...,False,,S,RL,0,0,0,,,
4,14143230,286874,2011,2011-03-31,108,118,1,1,2,1,...,False,,X,RR,0,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718956,19838192,317073,2011,2011-10-28,140,138,9,1,72,3,...,False,FF,B,RR,-4,0,1,94.525118,97.06,0.026817
718957,19838193,317073,2011,2011-10-28,140,138,9,1,72,4,...,False,FF,S,RR,-4,0,1,94.527782,97.10,0.027211
718958,19838194,317073,2011,2011-10-28,140,138,9,1,72,5,...,False,FC,B,RR,-4,0,1,94.525315,96.04,0.016024
718959,19838195,317073,2011,2011-10-28,140,138,9,1,73,1,...,False,FF,X,RL,-4,0,1,94.527502,96.16,0.017270


In [5]:
dummies = pd.get_dummies(pitches[['handedness','previous_outcome','previous_pitch']]) # Encoding categorical variables.
pitches[dummies.columns] = dummies # Setting the column names
pitches

Unnamed: 0,uid,game_pk,year,date,team_id_b,team_id_p,inning,top,at_bat_num,pcount_at_bat,...,previous_outcome_S,previous_outcome_X,previous_pitch_CH,previous_pitch_CU,previous_pitch_FC,previous_pitch_FF,previous_pitch_FT,previous_pitch_SI,previous_pitch_SL,previous_pitch_other
0,14143226,286874,2011,2011-03-31,108,118,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,14143227,286874,2011,2011-03-31,108,118,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
2,14143228,286874,2011,2011-03-31,108,118,1,1,1,3,...,0,0,0,0,0,0,0,0,0,0
3,14143229,286874,2011,2011-03-31,108,118,1,1,1,4,...,1,0,0,0,0,0,0,0,0,0
4,14143230,286874,2011,2011-03-31,108,118,1,1,2,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718956,19838192,317073,2011,2011-10-28,140,138,9,1,72,3,...,0,0,0,0,0,1,0,0,0,0
718957,19838193,317073,2011,2011-10-28,140,138,9,1,72,4,...,1,0,0,0,0,1,0,0,0,0
718958,19838194,317073,2011,2011-10-28,140,138,9,1,72,5,...,0,0,0,0,1,0,0,0,0,0
718959,19838195,317073,2011,2011-10-28,140,138,9,1,73,1,...,0,1,0,0,0,1,0,0,0,0


In [6]:
available_features = metadata[metadata['available_prior_to_pitch'] == 'Yes']['column_name'].to_list() # These are all of the provided columns that are available before a pitch
available_features += ['previous_pitch','previous_outcome','handedness','mean_career_velocity','l5_velocity','%_diff_career', 'RISP','run_diff','leading','trailing'] # These are the columns I've created that I want to use
available_features += dummies.columns.to_list() # These are the columns that are encoded
unwanted_features = ['uid', 'game_pk', 'year', 'date', 'top', 'at_bat_num','start_tfs','start_tfs_zulu','team_id_b', 'team_id_p', 'inning', 'pcount_at_bat', 'pcount_pitcher', 'balls',
       'strikes', 'fouls', 'outs', 'pitch_id', 'on_1b', 'on_2b', 'on_3b','batter_id','stand','b_height','pitcher_id','p_throws','away_team_runs','home_team_runs','previous_pitch', 'previous_outcome', 'handedness',] ## These are the features that are provided but are either encoded or not selected during feature testing
for feat in unwanted_features:
    available_features.remove(feat) # Removing unwated features. I think there's a better way to do this but this was pretty fast.
available_features 

['mean_career_velocity',
 'l5_velocity',
 '%_diff_career',
 'RISP',
 'run_diff',
 'leading',
 'trailing',
 'handedness_LL',
 'handedness_LR',
 'handedness_RL',
 'handedness_RR',
 'previous_outcome_B',
 'previous_outcome_S',
 'previous_outcome_X',
 'previous_pitch_CH',
 'previous_pitch_CU',
 'previous_pitch_FC',
 'previous_pitch_FF',
 'previous_pitch_FT',
 'previous_pitch_SI',
 'previous_pitch_SL',
 'previous_pitch_other']

In [7]:
response_var = 'pitch_type' # y var
df = pitches[available_features + [response_var]].dropna(subset = [response_var]) # creating DF of wanted features

# Train Test Split
train_size = .8 # setting split size to 80% 
train_idx = round(len(df)*train_size) # Train idx to stop on
train_df = df[:train_idx] # train data
test_df = df[train_idx:] # test data

# X Y 
train_X = train_df.drop(response_var, axis = 1) # training X
train_y = train_df[response_var] # training y

test_X = test_df.drop(response_var, axis = 1) # test X
test_y = test_df[response_var] # Test y

In [9]:
# Imputing missing data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer()
train_X = imputer.fit_transform(train_X) # technically there's a bit of data leakage but it's confined to my training set, which is fine given the sample size and test set (which won't have leakage).
test_X = imputer.transform(test_X) # imputing test set based on train data

scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X) # scaling test set based on train data

In [10]:
## My goal is to build a regressor with an accuracy > .34 with weighted f1 > .13. 
# To be honest, this shouldn't be too hard, so I'll do a simple classification task in the best interest of time and simplicity.
# Basically, I'm trying to do better than randomly guessing

## This is my goal from the dummy_results notebook.

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [12]:
## Simple logistic regression to gauge performance
clf = LogisticRegression() 
clf.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.metrics import classification_report
print(classification_report(test_y, clf.predict(test_X))) ## I like using these metrics for multiclass classification problems. There are multi-label ROC AUC metrics but this is fine.

              precision    recall  f1-score   support

          CH       0.27      0.07      0.11     14024
          CU       0.23      0.04      0.07     11452
          FC       0.34      0.24      0.28      8649
          FF       0.42      0.76      0.54     49124
          FT       0.38      0.35      0.36     16598
          SI       0.52      0.48      0.50     16005
          SL       0.36      0.15      0.22     21661
       other       0.48      0.28      0.35      5823

    accuracy                           0.41    143336
   macro avg       0.38      0.30      0.31    143336
weighted avg       0.39      0.41      0.36    143336



In [14]:
## super easy to beat my goal with a simple logit model... but I'm disappointed in the changeup and cutter recall. 
# I'm going to see if I can improve those numbers with a few more features and then hypertune a classifer.

In [15]:
# I'm going to add pitcher tendencies. Basically, what has each pitcher historically thrown? Pitchers who throw more sliders than league average are likely to continue to do so.

pitches[['total_CH','total_CU','total_FC','total_FF','total_pitch_FT','total_SI','total_SL','total_other']] = pitches.groupby('pitcher_id')[['previous_pitch_CH','previous_pitch_CU','previous_pitch_FC','previous_pitch_FF','previous_pitch_FT','previous_pitch_SI','previous_pitch_SL','previous_pitch_other']].apply('cumsum') # Total count of each pitch_type thrown
pitches['total_pitches'] = pitches.groupby('pitcher_id')['pitch_id'].rolling(999999, min_periods=100, closed='left').count().reset_index().set_index('level_1').sort_index()['pitch_id'] #total pitches thrown
pitches[['percent_CH','percent_CU','percent_FC','percent_FF','percent_FT','percent_SI','percent_SL','percent_other']] = pitches[['total_CH','total_CU','total_FC','total_FF','total_pitch_FT','total_SI','total_SL','total_other']].div(pitches['total_pitches'],axis = 0) #percent of each pitch thrown

In [16]:
response_var = 'pitch_type'
new_features = ['percent_CH','percent_CU','percent_FC','percent_FF','percent_FT','percent_SI','percent_SL','percent_other']
df = pitches[available_features + new_features + [response_var]].dropna(subset = [response_var])
df.select_dtypes(include=["number","bool_"])

# Train Test Split
train_size = .8
train_idx = round(len(df)*train_size)
train_df = df[:train_idx]
test_df = df[train_idx:]

# X Y 
train_X = train_df.drop(response_var, axis = 1)
train_y = train_df[response_var]

test_X = test_df.drop(response_var, axis = 1)
test_y = test_df[response_var]

In [17]:
# Imputing missing data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer()
train_X = imputer.fit_transform(train_X)
test_X = imputer.transform(test_X) # imputing test set based on train data

scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X) # scaling test set based on train data

In [18]:
## Simple logistic regression to gauge new feauture performance

clf = LogisticRegression()
clf.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
print(classification_report(test_y, clf.predict(test_X)))

              precision    recall  f1-score   support

          CH       0.31      0.13      0.18     14024
          CU       0.33      0.14      0.19     11452
          FC       0.48      0.33      0.39      8649
          FF       0.48      0.73      0.58     49124
          FT       0.43      0.35      0.39     16598
          SI       0.53      0.71      0.61     16005
          SL       0.40      0.25      0.31     21661
       other       0.51      0.27      0.35      5823

    accuracy                           0.46    143336
   macro avg       0.43      0.36      0.38    143336
weighted avg       0.44      0.46      0.43    143336



In [20]:
## Way better results. Going to see if a base random forest performs better and then play around with model types/hyper tuning

In [21]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(train_X, train_y)
print(classification_report(test_y, clf.predict(test_X)))

              precision    recall  f1-score   support

          CH       0.26      0.12      0.16     14024
          CU       0.28      0.12      0.17     11452
          FC       0.40      0.33      0.36      8649
          FF       0.47      0.67      0.56     49124
          FT       0.38      0.36      0.37     16598
          SI       0.51      0.72      0.60     16005
          SL       0.36      0.23      0.28     21661
       other       0.53      0.33      0.40      5823

    accuracy                           0.44    143336
   macro avg       0.40      0.36      0.36    143336
weighted avg       0.41      0.44      0.41    143336



In [22]:
## I'm actually a bit surprised this performed worse. 
# I'm going to try xgb next as thats typically the best performing model out there

In [56]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# I need to label encode pitch type 
le = LabelEncoder()
train_yle = le.fit_transform(train_y) # _yle is "Y Label Encoded"
test_yle = le.transform(test_y)

clf = xgb.XGBClassifier(learning_rate = .1)
clf.fit(train_X, train_yle)
print(classification_report(test_yle, clf.predict(test_X)))

              precision    recall  f1-score   support

           0       0.34      0.07      0.11     14024
           1       0.38      0.09      0.15     11452
           2       0.49      0.34      0.40      8649
           3       0.48      0.77      0.59     49124
           4       0.42      0.40      0.41     16598
           5       0.53      0.78      0.63     16005
           6       0.41      0.21      0.28     21661
           7       0.62      0.32      0.43      5823

    accuracy                           0.48    143336
   macro avg       0.46      0.37      0.38    143336
weighted avg       0.45      0.48      0.43    143336



In [52]:
# Best performing model yet, no suprise. I'll do some typer parameter tuning to finish.

In [53]:
## I don't want to overfit -- and since I don't have a holdout set, I'll stick to basic tuning parameters. 

from sklearn.model_selection import GridSearchCV
parameters = {

    'learning_rate': [0.1,.01],
    'max_depth': range(5,10,2)
}

clf = xgb.XGBClassifier()

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring = 'f1_weighted',
    cv = 5,
    verbose=True
)

grid_search.fit(train_X, train_yle)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [54]:
print(classification_report(test_yle, grid_search.predict(test_X)))

              precision    recall  f1-score   support

           0       0.33      0.08      0.12     14024
           1       0.38      0.10      0.15     11452
           2       0.47      0.35      0.40      8649
           3       0.48      0.76      0.59     49124
           4       0.42      0.41      0.41     16598
           5       0.53      0.77      0.63     16005
           6       0.41      0.21      0.28     21661
           7       0.61      0.32      0.42      5823

    accuracy                           0.48    143336
   macro avg       0.45      0.38      0.38    143336
weighted avg       0.45      0.48      0.43    143336



In [55]:
grid_search.best_estimator_

## Next Steps

I learned here at the end that pitcher specific tendencies improve the model a good bit. Some things I'd like to do if I continued playing with this dataset:

1. Pitcher clustering. Understanding which pitchers have similar tendencies to see if one pitchers decision might be similar to another pitchers given a game situation.
2. More historical data. Using 2010 and further back data to see how trends might change over time and use league average numbers/decisions to bolster decision making.
3. Current batter data. There's no way a pitcher is throwing the same pitch sequence to lefty Bryce Harper and righty Aaron Judge. Similarly, pitchers throw to the same batter dozens of times per game. We should be using that data to help us predict as well (and potentially using the cluster data I was talking about earlier!)

I also think a simple classification task was great here for my goal, but to improve model performance beyond 50% accuracy I'd like to try the following:

1. Bootstrapping/Monte Carlo sims. I've found incredible success simulating outcomes in the NFL and NBA given historical trends. I'd imagine it works the same with MLB, but those models typically take longer to build, test and run.
2. Sequence encoding. Building a neural network would have taken more time than alotted, but I've messed around with LTSM (long-term short-term memory) neural networks. I think this would do even better at predicting what a pitcher might do based on historical and recent trends for different types of pitchers.
3. Pitcher pipelining. Using pitcher specific data (like % of previous throws that were fast-balls, sliders etc) boosted performance, so I'd wager that building a pipeline that is fit on specific to individual pitchers would perform even better. That said, you'd run the risk of overfitting with small samples.