In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from pathlib import Path
import statsmodels.api as sm
import glob
import os
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.patches as patches
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

This model predicts the most likely next pitch group (hard, breaking, or off-speed) from three inputs: the current count, the previous pitch group, and the previous pitch's outcome (ball or strike). It is useful during games for identifying the other team's pitch-calling tendencies in specific pitcher-hand/batter-side matchups; such tendencies are common in collegiate pitch calling.

In [2]:
# Read the 2024 Trackman export. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, which avoids mixed-dtype column inference.
data2024 = pd.read_csv(
    '2024NCAATrackman.csv',
    low_memory=False,
)

In [3]:
# Keep only the pitches whose PitcherTeam code is 'ORE_DUC'.
sb2024 = data2024[data2024['PitcherTeam'].eq('ORE_DUC')]

In [4]:
# Keep the first 54 columns. The explicit .copy() makes `sb` an independent
# frame, so the column assignments in the following cells do not write into a
# slice of `sb2024` (which triggers SettingWithCopyWarning).
# NOTE(review): 54 is a positional cut-off; confirm it matches the export's column layout.
sb = sb2024.iloc[:, :54].copy()

Prep the raw data for the predictor function:

In [5]:
# Cast both count components to int in a single pass.
sb = sb.astype({'Balls': int, 'Strikes': int})

In [6]:
# Two-character count key: balls digit followed by strikes digit (e.g. '32').
sb['count'] = sb['Balls'].astype(str) + sb['Strikes'].astype(str)

In [7]:
# Sanity check: list the distinct count keys present in the data.
pd.unique(sb['count'])

array(['00', '10', '11', '21', '31', '32', '01', '02', '20', '12', '22',
       '30'], dtype=object)

In [8]:
# Display labels for the twelve ball-strike counts.
# NOTE(review): these labels contain a space ('0 0'), while the values in
# sb['count'] do not ('00'); nothing in the visible cells reads this list.
counts = [
    '0 0', '0 1', '1 0',
    '2 0', '1 1', '0 2',
    '3 0', '3 1', '1 2',
    '2 1', '2 2', '3 2',
]

In [9]:
# One frame per pitcher-hand / batter-side matchup. Each subset is an explicit
# copy because prep_df() later adds columns to it; assigning into a
# boolean-indexed slice raises SettingWithCopyWarning.
sb_rhh_rhp = sb[(sb['PitcherThrows'] == 'Right') & (sb['BatterSide'] == 'Right')].copy()
sb_rhh_lhp = sb[(sb['PitcherThrows'] == 'Left') & (sb['BatterSide'] == 'Right')].copy()
sb_lhh_lhp = sb[(sb['PitcherThrows'] == 'Left') & (sb['BatterSide'] == 'Left')].copy()
sb_lhh_rhp = sb[(sb['PitcherThrows'] == 'Right') & (sb['BatterSide'] == 'Left')].copy()

In [10]:
# Tagged pitch type -> coarse pitch group ('hard', 'os' = off-speed, 'break').
pitch_group_mapping = {
    **dict.fromkeys(
        ['FourSeamFastBall', 'TwoSeamFastBall', 'OneSeamFastBall',
         'Fastball', 'Sinker', 'Cutter'],
        'hard',
    ),
    **dict.fromkeys(['ChangeUp', 'Changeup', 'Splitter', 'Knuckleball'], 'os'),
    **dict.fromkeys(['Slider', 'Curveball'], 'break'),
}

In [11]:
# Tag every pitch with its coarse group; unmapped pitch types become NaN.
sb = sb.assign(PitchGroup=sb['TaggedPitchType'].map(pitch_group_mapping))

In [12]:
# Discard pitches whose type did not map to a pitch group.
sb = sb.loc[sb['PitchGroup'].notna()]

In [13]:
def encodegroup(row):
    """Classify one pitch row into a coarse pitch group by keyword.

    The match is a case-insensitive substring test on ``row['TaggedPitchType']``,
    so spelling variants such as ``'FourSeamFastBall'`` and ``'Changeup'`` are
    recognised alongside ``'Fastball'`` and ``'ChangeUp'``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'TaggedPitchType'`` (e.g. a DataFrame row).

    Returns
    -------
    str or pd.NA
        ``'hard'``, ``'break'`` or ``'os'``; ``pd.NA`` when the tag is missing
        (not a string) or matches no keyword.
    """
    pitch_type = row['TaggedPitchType']
    # A missing tag arrives as NaN (a float); a substring test on it would raise TypeError.
    if not isinstance(pitch_type, str):
        return pd.NA

    tag = pitch_type.lower()
    # Checked in order, mirroring the original hard -> break -> os precedence.
    keyword_groups = (
        ('hard', ('fastball', 'sinker', 'cutter')),
        ('break', ('slider', 'curveball')),
        ('os', ('changeup', 'splitter')),
    )
    for group, keywords in keyword_groups:
        if any(keyword in tag for keyword in keywords):
            return group
    return pd.NA

# Re-derive PitchGroup row by row with the keyword classifier.
# NOTE(review): this overwrites the values produced from pitch_group_mapping above.
sb['PitchGroup'] = sb.apply(encodegroup, axis='columns')

In [14]:
def count_at_bats(df):
    """Number each pitch with a running plate-appearance counter.

    Rows are taken in their current order; a new plate appearance starts
    whenever ``PAofInning`` differs from the value on the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Pitch rows with a ``PAofInning`` column.

    Returns
    -------
    list of int
        One 1-based counter value per row of ``df``.

    Notes
    -----
    NOTE(review): only ``PAofInning`` is compared, so two consecutive rows from
    different innings or games that carry the same number are counted as one
    plate appearance. This can occur once rows have been filtered out.
    """
    pa_of_inning = df['PAofInning']
    # shift() puts NaN in front of the first row, so the first pitch always
    # compares as "different" and opens plate appearance 1.
    starts_new_pa = pa_of_inning.ne(pa_of_inning.shift())
    return starts_new_pa.cumsum().tolist()

# Attach the running plate-appearance counter to every pitch.
sb = sb.assign(ABcount=count_at_bats(sb))

In [15]:
def determine_outcome(row):
    """Reduce a Trackman pitch call to a strike ('S') or ball ('B') label.

    Called strikes, swinging strikes, fouls and balls in play all count as
    'S'; called balls, balls in the dirt, intentional balls and hit-by-pitch
    count as 'B'. Matching is a substring test on ``row['PitchCall']``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'PitchCall'`` (e.g. a DataFrame row).

    Returns
    -------
    str or pd.NA
        ``'S'``, ``'B'``, or ``pd.NA`` when the call is missing (not a string)
        or matches neither keyword list.
    """
    strike_keywords = ['StrikeCalled', 'StrikeSwinging', 'InPlay',
                       'FoulBallNotFieldable', 'FoulBallFieldable', 'FoulBall']
    ball_keywords = ['BallCalled', 'BallinDirt', 'BallIntentional', 'HitByPitch']

    pitch_call = row['PitchCall']
    # A missing call arrives as NaN (a float); a substring test on it would raise TypeError.
    if not isinstance(pitch_call, str):
        return pd.NA

    # Strike keywords are checked first, as in the original ordering.
    if any(keyword in pitch_call for keyword in strike_keywords):
        return 'S'
    if any(keyword in pitch_call for keyword in ball_keywords):
        return 'B'
    return pd.NA

# Label every pitch as a strike ('S') or ball ('B'); unrecognised calls become <NA>.
sb['outcome'] = sb.apply(determine_outcome, axis='columns')

In [16]:
# Drop pitches with an 'Undefined' call. The explicit .copy() keeps the
# column assignments in the next cell from writing into a filtered slice
# (SettingWithCopyWarning).
sb = sb.loc[sb['PitchCall'] != 'Undefined'].copy()

In [17]:
# Integer code for each two-character count key.
count_map = {
    '00': 0, '01': 1, '10': 2, '20': 3,
    '11': 4, '02': 5, '30': 6, '31': 7,
    '12': 8, '21': 9, '22': 10, '32': 11,
}
# Integer codes for the pitch group and for the strike/ball outcome.
pitch_map = {'hard': 0, 'break': 1, 'os': 2}
outcome_map = {'S': 1, 'B': 0}

# Add the three encoded columns, then drop rows where the pitch group or the
# outcome could not be encoded.
sb = sb.assign(
    count_num=sb['count'].map(count_map),
    PitchGroup_encoded=sb['PitchGroup'].map(pitch_map),
    Outcome_encoded=sb['outcome'].map(outcome_map),
).dropna(subset=['PitchGroup_encoded', 'Outcome_encoded'])

In [18]:
def prep_df(df):
    """Build the modelling columns for one pitcher-hand/batter-side subset.

    Works on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Pitch rows containing ``TaggedPitchType``, ``PitchCall``,
        ``PAofInning`` and the two-character ``count`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``PitchGroup``, ``ABcount``, ``outcome``,
        ``count_num``, ``PitchGroup_encoded`` and ``Outcome_encoded`` added,
        minus the rows whose pitch group or outcome could not be encoded.
    """
    # Same pitch types as the notebook-level mapping. Both capitalisations of
    # the seam fastballs are listed: a key that differs only in letter case
    # would otherwise map to NaN and the row would be dropped below.
    pitch_group_mapping = {
        'FourSeamFastBall': 'hard',
        'FourSeamFastball': 'hard',
        'TwoSeamFastBall': 'hard',
        'TwoSeamFastball': 'hard',
        'OneSeamFastBall': 'hard',
        'Fastball': 'hard',
        'Sinker': 'hard',
        'Cutter': 'hard',
        'ChangeUp': 'os',
        'Changeup': 'os',
        'Splitter': 'os',
        'Knuckleball': 'os',
        'Slider': 'break',
        'Curveball': 'break',
    }
    count_map = {'00': 0, '01': 1, '10': 2, '20': 3, '11': 4, '02': 5,
                 '30': 6, '31': 7, '12': 8, '21': 9, '22': 10, '32': 11}
    pitch_map = {'hard': 0, 'break': 1, 'os': 2}
    outcome_map = {'S': 1, 'B': 0}

    # Copy first: callers pass filtered slices, and adding columns to a slice
    # both mutates the caller's data and raises SettingWithCopyWarning.
    df = df.copy()

    df['PitchGroup'] = df['TaggedPitchType'].map(pitch_group_mapping)
    df['ABcount'] = count_at_bats(df)
    df['outcome'] = df.apply(determine_outcome, axis=1)

    df['count_num'] = df['count'].map(count_map)
    df['PitchGroup_encoded'] = df['PitchGroup'].map(pitch_map)
    df['Outcome_encoded'] = df['outcome'].map(outcome_map)

    # Rows with an unmapped pitch type or an unrecognised pitch call cannot be
    # used as features or targets.
    df = df.dropna(subset=['PitchGroup_encoded', 'Outcome_encoded'])

    return df

In [19]:
def predict_at_bat(df, random_state=None):
    '''
    Function to add predictions to a Trackman DataFrame
    given it has been processed with the above code

    Fits a random forest that predicts ``PitchGroup`` from the count and from
    the previous row's pitch group and outcome, then attaches the predicted
    group and the per-group probabilities to every row that was used.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``prep_df``; must contain ``PitchGroup``, ``outcome``,
        ``PitchGroup_encoded``, ``Outcome_encoded`` and ``count_num``.
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. The default ``None`` leaves
        the forest unseeded; pass an integer for repeatable results.

    Returns
    -------
    (pandas.DataFrame, RandomForestClassifier)
        The rows used for fitting, with the lag columns, ``Predicted_PitchGroup``,
        ``breakprob``, ``hardprob`` and ``osprob`` added, and the fitted model.
    '''
    group_df = df.copy()

    # Lag features come from the immediately preceding row of the frame.
    # NOTE(review): the shift is not grouped by plate appearance; it relies on
    # the first-pitch filter below to discard rows whose "previous" pitch
    # belongs to another plate appearance.
    group_df['previous_pitchgroup'] = group_df['PitchGroup_encoded'].shift(1)
    group_df['previous_outcome'] = group_df['Outcome_encoded'].shift(1)
    group_df['prev_pitchgroup'] = group_df['PitchGroup'].shift(1)
    group_df['prev_outcome'] = group_df['outcome'].shift(1)

    # Drop rows without a lag value (the first row of the frame) ...
    group_df = group_df.dropna(subset=['previous_pitchgroup', 'previous_outcome', 'prev_pitchgroup', 'prev_outcome'])
    # ... and every 0-0 pitch (count_num 0). The copy keeps the column
    # assignments below from writing into a filtered slice.
    group_df = group_df[group_df['count_num'] != 0].copy()

    X = group_df[['count_num', 'previous_pitchgroup', 'previous_outcome']]
    y = group_df['PitchGroup']

    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf.fit(X, y)

    group_df['Predicted_PitchGroup'] = clf.predict(X)

    # predict_proba columns follow clf.classes_, which only holds the groups
    # present in y. Look each group up by label instead of assuming that
    # positions 0/1/2 are break/hard/os; a group that never occurs gets 0.0.
    prob_array = clf.predict_proba(X)
    class_position = {label: position for position, label in enumerate(clf.classes_)}
    for label, column in (('break', 'breakprob'), ('hard', 'hardprob'), ('os', 'osprob')):
        if label in class_position:
            group_df[column] = prob_array[:, class_position[label]]
        else:
            group_df[column] = 0.0

    # Accuracy on the rows the model was fitted on (in-sample, not a held-out estimate).
    score = np.round(clf.score(X, y), 3)
    print(f"accuracy: {score}")

    return group_df, clf

Fit a separate model for each pitcher-hand/batter-side matchup for better results; in practice, each individual pitcher would get their own model:

In [20]:
# Prepare each matchup subset (lp/rp = left/right-handed pitcher,
# lb/rb = left/right-handed batter), in the same order as before.
df_lp_lb, df_lp_rb, df_rp_lb, df_rp_rb = (
    prep_df(matchup)
    for matchup in (sb_lhh_lhp, sb_rhh_lhp, sb_lhh_rhp, sb_rhh_rhp)
)

In [21]:
# Fit one model per matchup; each call prints its own accuracy line, in this order.
# NOTE(review): this cell rebinds its own inputs, so re-running it without
# re-running the prep cell feeds already-predicted frames back in.
(
    (df_lp_lb, m_lp_lb),
    (df_lp_rb, m_lp_rb),
    (df_rp_lb, m_rp_lb),
    (df_rp_rb, m_rp_rb),
) = [
    predict_at_bat(prepared)
    for prepared in (df_lp_lb, df_lp_rb, df_rp_lb, df_rp_rb)
]

accuracy: 0.658
accuracy: 0.567
accuracy: 0.58
accuracy: 0.624


Collapse the predictions to one row per unique (count, previous pitch group, previous outcome) combination:

In [22]:
# Sample size behind each (count, previous pitch group, previous outcome) combination.
df_lp_lb = df_lp_lb.assign(
    num=df_lp_lb.groupby(['count', 'prev_pitchgroup', 'prev_outcome']).transform('size')
)
# One row per combination. Rows in a group share the same model inputs, so the
# first row carries the group's prediction and probabilities.
df_lp_lb = (
    df_lp_lb
    .groupby(['count', 'prev_pitchgroup', 'prev_outcome'], as_index=False)
    .first()
    .loc[:, ['count', 'prev_pitchgroup', 'prev_outcome', 'Predicted_PitchGroup',
             'breakprob', 'hardprob', 'osprob', 'num']]
)

In [23]:
# Sample size behind each (count, previous pitch group, previous outcome) combination.
df_lp_rb = df_lp_rb.assign(
    num=df_lp_rb.groupby(['count', 'prev_pitchgroup', 'prev_outcome']).transform('size')
)
# One row per combination. Rows in a group share the same model inputs, so the
# first row carries the group's prediction and probabilities.
df_lp_rb = (
    df_lp_rb
    .groupby(['count', 'prev_pitchgroup', 'prev_outcome'], as_index=False)
    .first()
    .loc[:, ['count', 'prev_pitchgroup', 'prev_outcome', 'Predicted_PitchGroup',
             'breakprob', 'hardprob', 'osprob', 'num']]
)

In [24]:
# Sample size behind each (count, previous pitch group, previous outcome) combination.
df_rp_lb = df_rp_lb.assign(
    num=df_rp_lb.groupby(['count', 'prev_pitchgroup', 'prev_outcome']).transform('size')
)
# One row per combination. Rows in a group share the same model inputs, so the
# first row carries the group's prediction and probabilities.
df_rp_lb = (
    df_rp_lb
    .groupby(['count', 'prev_pitchgroup', 'prev_outcome'], as_index=False)
    .first()
    .loc[:, ['count', 'prev_pitchgroup', 'prev_outcome', 'Predicted_PitchGroup',
             'breakprob', 'hardprob', 'osprob', 'num']]
)

In [25]:
# Sample size behind each (count, previous pitch group, previous outcome) combination.
df_rp_rb = df_rp_rb.assign(
    num=df_rp_rb.groupby(['count', 'prev_pitchgroup', 'prev_outcome']).transform('size')
)
# One row per combination. Rows in a group share the same model inputs, so the
# first row carries the group's prediction and probabilities.
df_rp_rb = (
    df_rp_rb
    .groupby(['count', 'prev_pitchgroup', 'prev_outcome'], as_index=False)
    .first()
    .loc[:, ['count', 'prev_pitchgroup', 'prev_outcome', 'Predicted_PitchGroup',
             'breakprob', 'hardprob', 'osprob', 'num']]
)

Final product to be used during a game:

In [26]:
# Preview the first ten rows of the left-handed pitcher vs. left-handed batter table.
df_lp_lb.iloc[:10]

Unnamed: 0,count,prev_pitchgroup,prev_outcome,Predicted_PitchGroup,breakprob,hardprob,osprob,num
0,1,break,S,break,0.541577,0.372778,0.085645,27
1,1,hard,S,break,0.555343,0.369153,0.075504,54
2,1,os,S,os,0.006129,0.489275,0.504596,4
3,2,break,S,hard,0.472413,0.527587,0.0,17
4,2,hard,S,hard,0.227591,0.682543,0.089866,21
5,2,os,S,os,0.192709,0.386127,0.421164,5
6,10,break,B,hard,0.26288,0.681217,0.055903,37
7,10,hard,B,hard,0.145234,0.720031,0.134735,44
8,10,os,B,hard,0.0,0.99,0.01,5
9,11,break,B,hard,0.488081,0.511919,0.0,20
