# Pitch type prediction

<p>Useful links</p>
<ul>
  <li>https://www.kaggle.com/datasets/pschale/mlb-pitch-data-20152018/versions/17/data?select=games.csv</li>
  <li>https://github.com/chrisjackson4256/MLBPitchPredictor/blob/master/pitchPredict2.ipynb</li>
  <li>https://pypi.org/project/pybaseball/2.0.0/</li>
  <li>https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html</li>
</ul>

In [None]:
!pip install pybaseball
!pip install duckdb
!pip install lightgbm
!pip install matplotlib

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import requests
import duckdb
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import lightgbm as lgbm
from pybaseball import statcast, pitching_stats
import datetime as dt

### Loading data

In [None]:
# Colab-only: mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download Statcast data for 2020-05-24 through 2023-01-01 via pybaseball.
# This is slow (the recorded run below took about 10 minutes); later cells reload a CSV export instead.
raw_df = statcast(start_dt='2020-05-24', end_dt='2023-01-01', verbose = 0)

100%|██████████| 589/589 [10:19<00:00,  1.05s/it]


In [None]:
# Column groups pulled out of the raw Statcast frame.
target = ['pitch_type']

# Game situation at the time of the pitch.
situation_features = ['stand', 'p_throws', 'inning', 'balls', 'strikes',
                      'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'pitch_number',
                      'fld_score', 'bat_score']

# Identifiers for the game and the two players involved.
id_columns = ['game_pk', 'pitcher', 'batter']

# NOTE(review): named "prev_pitch" but read from the same row as the target —
# confirm whether a shift to the previous pitch was intended.
prev_pitch_features = ['type', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'release_speed', 'release_spin_rate']

data = raw_df[target + id_columns + situation_features + prev_pitch_features]

# index=False: no later cell uses the frame's index, and writing it makes every
# subsequent read_csv pick up a spurious "Unnamed: 0" column.
data.to_csv("/content/drive/MyDrive/Capstone/data2020-2023.csv", index=False)

In [None]:
#Load data for ball prediction
# NOTE(review): no cell visible in this notebook reads pitch_df, atbat_df, games_df or players_df;
# the ball-stats model below uses an undefined `df` — presumably it was built from these frames. Confirm.
pitch_df = pd.read_csv('/content/drive/MyDrive/Capstone/archive/pitches.csv')
atbat_df = pd.read_csv('/content/drive/MyDrive/Capstone/archive/atbats.csv')
games_df = pd.read_csv('/content/drive/MyDrive/Capstone/archive/games.csv')
players_df = pd.read_csv('/content/drive/MyDrive/Capstone/archive/player_names.csv')

In [None]:
#load data for game prediction
# Reloads the CSV export written by the column-selection cell, replacing the in-memory `data`.
data = pd.read_csv('/content/drive/MyDrive/Capstone/data2020-2023.csv')

### Data Cleaning

In [None]:
# Keep only the rows that have a recorded pitch type.
has_pitch_type = data['pitch_type'].notna()
data = data.loc[has_pitch_type]

In [None]:
# Pitch-type codes that are collapsed into the "fastball" class.
fastball_pitches = ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF']
def map_fastballs(x):
    """Return 1 when ``x`` is one of ``fastball_pitches``, otherwise 0."""
    return int(x in fastball_pitches)
# Binarise the target: 1 = code listed in fastball_pitches, 0 = anything else.
# Not idempotent: re-running this on the already-encoded 0/1 column maps every row to 0.
data['pitch_type'] = data['pitch_type'].apply(map_fastballs)

In [None]:
# Cast the identifier columns plus inning / count / outs / pitch-number columns to int.
for int_col in [*id_columns, 'inning', 'balls', 'strikes', 'outs_when_up', 'pitch_number']:
    data[int_col] = data[int_col].astype(int)

# Extra innings are folded into the 9th.
def cap_extra_innings(x):
    """Return 9 for any inning above 9; return ``x`` unchanged otherwise."""
    return 9 if x > 9 else x
# Fold extra innings into the 9th.
data['inning'] = data['inning'].map(cap_extra_innings)

# Composite "<game>_<pitcher>" key intended for group-bys.
game_ids = data['game_pk'].astype(str)
pitcher_ids = data['pitcher'].astype(str)
data['game_pitcher_id'] = game_ids + '_' + pitcher_ids

# A missing value in on_1b/on_2b/on_3b means the base is empty, so "not missing" == runner on base.
for base_col in ('on_1b', 'on_2b', 'on_3b'):
    data[base_col] = data[base_col].notna()

# Handedness match-up: True when the batter stands on the side the pitcher throws from.
same_side = data['p_throws'] == data['stand']
data['pitch_bat_same_side'] = same_side
data.drop(columns=['p_throws', 'stand'], inplace=True)

# Score differential from the fielding team's point of view.
fielding_lead = data['fld_score'] - data['bat_score']
data['score_diff'] = fielding_lead
data.drop(columns=['fld_score', 'bat_score'], inplace=True)

In [None]:
# Remove every row that still contains a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)
# These identifiers are not used as model inputs.
data.drop(columns=['game_pk', 'batter', 'game_pitcher_id'], inplace=True)

In [None]:
# Show column dtypes and non-null counts of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 638190 entries, 2649 to 424
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   pitch_type           638190 non-null  int64  
 1   pitcher              638190 non-null  int64  
 2   inning               638190 non-null  int64  
 3   balls                638190 non-null  int64  
 4   strikes              638190 non-null  int64  
 5   on_1b                638190 non-null  bool   
 6   on_2b                638190 non-null  bool   
 7   on_3b                638190 non-null  bool   
 8   outs_when_up         638190 non-null  int64  
 9   pitch_number         638190 non-null  int64  
 10  type                 638190 non-null  object 
 11  pfx_x                638190 non-null  Float64
 12  pfx_z                638190 non-null  Float64
 13  plate_x              638190 non-null  Float64
 14  plate_z              638190 non-null  Float64
 15  release_speed        6

### Random Forest Model Ball Stats

In [None]:
# Input columns for the ball-stats model, grouped by kind.
features = [
    # position in the at-bat and velocity
    'pitch_num', 'end_speed', 'start_speed',
    # break / movement / spin
    'break_angle', 'break_length', 'break_y',
    'pfx_x', 'pfx_z', 'spin_rate', 'spin_dir',
    # count, outs and base occupancy
    'b_count', 's_count', 'outs',
    'on_1b', 'on_2b', 'on_3b',
    # location and strike zone
    'px', 'pz', 'zone', 'sz_bot', 'sz_top',
]

# Column the model predicts.
target = 'pitch_type'

In [None]:
# NOTE(review): `df` is not defined in any cell visible in this notebook, so this cell raises
# NameError on a fresh Restart & Run All — presumably it was derived from pitch_df; confirm and restore.
X = df[features]
y = df[target]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test)

# Overall accuracy, then the per-class precision/recall breakdown.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7207

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.81      0.77      2184
           2       0.74      0.75      0.75      1334
           3       1.00      0.33      0.50        12
           5       0.59      0.48      0.53      1228
           6       0.82      0.90      0.86      6860
           7       0.00      0.00      0.00         4
           8       0.28      0.01      0.03       335
           9       0.55      0.55      0.55      2421
          10       0.99      0.96      0.98       106
          11       0.58      0.32      0.41       444
          12       0.67      0.51      0.58        90
          13       0.00      0.00      0.00         4
          15       0.56      0.43      0.49      2066
          16       0.71      0.81      0.76      2876
          18       1.00      1.00      1.00        36

    accuracy                           0.72     20000
   macro avg       0.61      0.52     

In [None]:
#store model
# Persist the fitted random forest to Drive with joblib; the call returns the list of written paths.
joblib.dump(model, '/content/drive/MyDrive/Capstone/random_forest_pitchtype_model.joblib')

['/content/drive/MyDrive/Capstone/random_forest_pitchtype_model.joblib']

In [None]:
#load model
# Read the persisted random forest back from Drive into `loaded_model`.
loaded_model = joblib.load('/content/drive/MyDrive/Capstone/random_forest_pitchtype_model.joblib')

#### Testing

In [None]:
# predict() requires a feature matrix; score the held-out split of the ball-stats model
# with the reloaded estimator.
sample = loaded_model.predict(X_test)


In [None]:
# Lookup from integer class id to pitch-type code
# (presumably the label-encoder order used when building `df` — confirm).
# The missing-label entry uses np.nan; a bare `nan` is not defined in this notebook.
pitch_dict = {0: 'AB', 1: 'CH', 2: 'CU', 3: 'EP', 4: 'FA', 5: 'FC', 6: 'FF', 7: 'FO', 8: 'FS', 9: 'FT', 10: 'IN', 11: 'KC', 12: 'KN', 13: 'PO', 14: 'SC', 15: 'SI', 16: 'SL', 17: 'UN', 18: np.nan}
pitch_dict

{0: 'AB', 1: 'CH', 2: 'CU', 3: 'EP', 4: 'FA', 5: 'FC', 6: 'FF', 7: 'FO', 8: 'FS', 9: 'FT', 10: 'IN', 11: 'KC', 12: 'KN', 13: 'PO', 14: 'SC', 15: 'SI', 16: 'SL', 17: 'UN', 18: nan}


## Random Forest Model Game Stats


<p>features used to predict:</p>
<ul>
<li>pitcher</li>
<li>inning</li>
<li>balls</li>
<li>strikes</li>
<li>on_1b</li>
<li>on_2b</li>
<li>on_3b</li>
<li>outs_when_up</li>
<li>pitch_number</li>
<li>score_diff</li>
<li>pitch_bat_same_side</li>
</ul>

<p>features available in MLB stats:</p>
<ul>
<li>innings</li>
<li>p_throws as pitchesThrown</li>
<li>strikes</li>
<li>balls</li>
<li>outs_when_up as outs(check for modifications)</li>
</ul>

<p>features NOT available in MLB stats:</p>
<ul>
<li>pitch_number</li>
<li>on_1b</li>
<li>on_2b</li>
<li>on_3b</li>
<li>score_diff</li>
<li>pitch_bat_same_side</li>
</ul>

In [None]:
# Situational columns fed to the game-stats model.
game_features = [
    'pitcher', 'inning', 'balls', 'strikes',
    'on_1b', 'on_2b', 'on_3b',
    'outs_when_up', 'pitch_number',
    'score_diff', 'pitch_bat_same_side',
]

X, y = data[game_features], data['pitch_type']
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Random forest on the game-situation features: 100 trees, fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Report accuracy and the per-class metrics.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5657756768858434

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.44      0.45    145552
           1       0.62      0.65      0.64    207314

    accuracy                           0.57    352866
   macro avg       0.55      0.55      0.55    352866
weighted avg       0.56      0.57      0.56    352866



In [None]:
# Persist the game-stats random forest to Drive.
joblib.dump(model, '/content/drive/MyDrive/Capstone/pitchType_game_model.joblib')

['/content/drive/MyDrive/Capstone/pitchType_game_model.joblib']

#### Testing

In [None]:
# Loads the persisted model; the result is only displayed, not bound to a name,
# so the prediction cell further down still uses the in-memory `model`.
joblib.load('/content/drive/MyDrive/Capstone/pitchType_game_model.joblib')

In [None]:
# List the distinct values of the encoded target column.
data['pitch_type'].unique()

array([0, 1])

In [None]:
# Column order of the row below:
#'pitcher', 'inning', 'balls', 'strikes', 'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'pitch_number', 'score_diff', 'pitch_bat_same_side'
sample_row = [453343, 9, 1, 2, False, True, False, 2, 4, 1, False]
test_x = [sample_row]
# Index 0 = non-fastball, index 1 = fastball (matches the 0/1 target encoding).
types = ['Break Ball', 'Fastball']
predicted_class = model.predict(test_x)[0]
print(types[predicted_class])

1


## Gradient Boost Model Game Stats

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

In [None]:
# Same feature list and 80/20 seeded split as the random-forest game-stats section;
# repeated here so this section can be run on its own.
game_features = ['pitcher', 'inning', 'balls', 'strikes',
                      'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'pitch_number', 'score_diff', 'pitch_bat_same_side']

X = data[game_features]
y = data['pitch_type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline gradient-boosted classifier; constructor arguments are unchanged.
model = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

# Accuracy (two decimals) followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.62
              precision    recall  f1-score   support

           0       0.57      0.35      0.43     72681
           1       0.64      0.81      0.72    103752

    accuracy                           0.62    176433
   macro avg       0.61      0.58      0.58    176433
weighted avg       0.61      0.62      0.60    176433



### Fine tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 2 tree counts x 3 depths x 3 learning rates = 18 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

# Exhaustive search, 3-fold cross-validation, ranked by accuracy.
base_estimator = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validated accuracy: {grid_search.best_score_:.2f}')

Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
Best cross-validated accuracy: 0.62


In [None]:
# Keep the estimator refit with the best parameter combination found by the grid search.
best_model = grid_search.best_estimator_

In [None]:
# Persist the tuned gradient-boosted model to Drive.
joblib.dump(best_model, '/content/drive/MyDrive/Capstone/pitchType_game_GBM.joblib')

['/content/drive/MyDrive/Capstone/pitchType_game_GBM.joblib']

### Testing

In [None]:
# Show one row; its values are reused as the hand-written test input in the next cell.
data.head(1)

Unnamed: 0.1,Unnamed: 0,pitch_type,pitcher,inning,balls,strikes,on_1b,on_2b,on_3b,outs_when_up,pitch_number,type,pfx_x,pfx_z,plate_x,plate_z,release_speed,release_spin_rate,pitch_bat_same_side,score_diff
0,191,0,519151,9,0,0,True,False,False,2,1,X,0.41,0.37,1.05,2.39,89.2,2679.0,True,3


In [None]:
# Column order of the row below:
#'pitcher', 'inning', 'balls', 'strikes', 'on_1b', 'on_2b', 'on_3b', 'outs_when_up', 'pitch_number', 'score_diff', 'pitch_bat_same_side'
test_x = [[519151, 9, 0, 0, True, False, False, 2, 1, 3, True]]
# Index 0 = non-fastball, index 1 = fastball (matches the 0/1 target encoding).
types = ['Break Ball', 'Fastball']
# Predict with the tuned estimator chosen by the grid search (the one that was saved),
# not the untuned baseline still bound to `model`.
print(types[best_model.predict(test_x)[0]])

Break Ball


## Random Forest API model

features available in MLB stats:
<ul>
<li>innings</li>
<li>p_throws as pitchesThrown</li>
<li>strikes</li>
<li>balls</li>
<li>outs_when_up as outs(check for modifications)</li>
</ul>

In [None]:
# Start this section from the untouched CSV export; the earlier sections mutated `data` in place.
data = pd.read_csv('/content/drive/MyDrive/Capstone/data2020-2023.csv')

In [None]:
# Keep only rows with a recorded pitch type.
data = data[pd.notnull(data['pitch_type'])]

# Pitch-type codes collapsed into the "fastball" class.
fastball_pitches = ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF']
def map_fastballs(x):
    """Return 1 when ``x`` is one of ``fastball_pitches``, otherwise 0."""
    return 1 if x in fastball_pitches else 0
data['pitch_type'] = data['pitch_type'].apply(map_fastballs)

def cap_extra_innings(x):
    """Return 9 for any inning above 9; return ``x`` unchanged otherwise."""
    return 9 if x > 9 else x
data['inning'] = data['inning'].apply(cap_extra_innings)

# Drop a row only when a column this section actually uses is missing.
# A blanket dropna() also removes every row where on_1b/on_2b/on_3b is NaN — i.e. every
# pitch thrown with at least one empty base — which is why only ~41k rows used to survive.
required_columns = ['inning', 'p_throws', 'strikes', 'balls', 'outs_when_up',
                    'fld_score', 'bat_score']
data.dropna(subset=required_columns, inplace=True)

# Score differential from the fielding team's point of view.
data['score_diff'] = data['fld_score'] - data['bat_score']

In [None]:
# Show column dtypes and non-null counts after the API-model cleaning step.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41680 entries, 499 to 1816331
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         41680 non-null  int64  
 1   pitch_type         41680 non-null  int64  
 2   game_pk            41680 non-null  int64  
 3   pitcher            41680 non-null  int64  
 4   batter             41680 non-null  int64  
 5   stand              41680 non-null  object 
 6   p_throws           41680 non-null  float64
 7   inning             41680 non-null  int64  
 8   balls              41680 non-null  int64  
 9   strikes            41680 non-null  int64  
 10  on_1b              41680 non-null  float64
 11  on_2b              41680 non-null  float64
 12  on_3b              41680 non-null  float64
 13  outs_when_up       41680 non-null  int64  
 14  pitch_number       41680 non-null  int64  
 15  fld_score          41680 non-null  int64  
 16  bat_score          4168

In [None]:
# The five inputs that the "features available in MLB stats" list above says can be obtained.
features = [
    'inning',
    'p_throws',
    'strikes',
    'balls',
    'outs_when_up',
]
# Binary fastball / non-fastball label.
target = 'pitch_type'

In [None]:
# OneHotEncoder returns a 2-D array with one column per category, which cannot be stored
# faithfully in a single DataFrame column. Keep the first indicator column explicitly
# instead of relying on how pandas handles a 2-D assignment.
# NOTE(review): column 0 is the first category in sorted order — presumably 'L', so
# 1.0 = left-handed pitcher; confirm against the encoder's categories_.
# Not idempotent: run once per fresh load of `data`.
p_throws_onehot = OneHotEncoder(sparse_output=False).fit_transform(data[['p_throws']])
data['p_throws'] = p_throws_onehot[:, 0]

In [None]:
# Feature matrix / label vector, then an 80/20 split with a fixed seed.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Random forest restricted to the API-available features: 100 trees, fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Accuracy first, then the per-class report.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5886516314779271

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.15      0.22      3368
           1       0.61      0.89      0.72      4968

    accuracy                           0.59      8336
   macro avg       0.54      0.52      0.47      8336
weighted avg       0.55      0.59      0.52      8336



In [None]:
# Persist the API-feature random forest to Drive.
joblib.dump(model, '/content/drive/MyDrive/Capstone/pitchType_API_RandomForrest.joblib')

['/content/drive/MyDrive/Capstone/pitchType_API_RandomForrest.joblib']

### Testing
<p>TODO:</p>
<ol>
<li>Pip install libraries (loading data)</li>
<li>Import Libraries</li>
<li>Mount Drive</li>
<li>Load Model</li>
<li>Test!</li>
</ol>


In [None]:
# Reload the persisted API-feature random forest into `test_model`.
test_model = joblib.load('/content/drive/MyDrive/Capstone/pitchType_API_RandomForrest.joblib')

In [None]:
# Hand-picked game situations used to spot-check the persisted model.
test_cases = [
    {"inning": 2, "p_throws": 1, "strikes": 1, "balls": 2, "outs_when_up": 1},
    {"inning": 8, "p_throws": 1, "strikes": 2, "balls": 1, "outs_when_up": 1},
    {"inning": 7, "p_throws": 0, "strikes": 2, "balls": 3, "outs_when_up": 2},
    {"inning": 1, "p_throws": 1, "strikes": 0, "balls": 1, "outs_when_up": 0},
    {"inning": 4, "p_throws": 0, "strikes": 2, "balls": 1, "outs_when_up": 2},
    {"inning": 6, "p_throws": 0, "strikes": 1, "balls": 3, "outs_when_up": 1},
]


def _rows_matching(case):
    """Return the rows of ``data`` whose columns equal every key/value pair in ``case``."""
    mask = np.ones(len(data), dtype=bool)
    for column, value in case.items():
        mask &= (data[column] == value).to_numpy()
    return data.loc[mask]


# Historical pitches recorded in exactly the same situation as each test case.
# Derived from test_cases so the two lists cannot drift apart.
real_cases = [_rows_matching(case) for case in test_cases]

# NOTE(review): several labels mention things (bases loaded, close game) that are not
# among the model's inputs — confirm they describe the scenarios as intended.
labels = ['Early Inning, Few Strikes and Balls',
'Late Inning, Bases Loaded',
'Close Game, High Pressure',
'Early Game, Favorable Count',
'Middle Inning, 2 Outs, Ahead in the Count',
'Middle Inning, High Ball Count',
]

predictions = []
# Index 0 = non-fastball, index 1 = fastball (matches the 0/1 target encoding).
types = ['Break Ball', 'Fastball']

for case in test_cases:
    # Select the columns in training order so the frame lines up with what the model was fitted on.
    X_testing = pd.DataFrame([case])[features]
    # Use the model reloaded from disk so this cell validates the saved artifact.
    pred = test_model.predict(X_testing)
    predictions.append(types[pred[0]])

# Compare each prediction with the observed fastball / non-fastball share for that situation.
for label, prediction, matches in zip(labels, predictions, real_cases):
    fastball_sum = int((matches['pitch_type'] == 1).sum())
    breakball_sum = int((matches['pitch_type'] == 0).sum())
    pitch_sum = fastball_sum + breakball_sum
    if pitch_sum == 0:
        # No historical pitch matches this situation, so there is no share to report.
        print("**Scenario doesn't exist**")
        continue
    print(f"Scenario: {label}\nPrediction: {prediction}; Fastball%:{fastball_sum/pitch_sum:.3f}; Breakball%:{breakball_sum/pitch_sum:.3f}")

Scenario: Early Inning, Few Strikes and Balls
Prediction: Fastball; Fastball%:0.612; Breakball%:0.388
Scenario: Late Inning, Bases Loaded
Prediction: Fastball; Fastball%:0.529; Breakball%:0.471
Scenario: Close Game, High Pressure
Prediction: Fastball; Fastball%:0.765; Breakball%:0.235
Scenario: Early Game, Favorable Count
Prediction: Fastball; Fastball%:0.594; Breakball%:0.406
Scenario: Middle Inning, 2 Outs, Ahead in the Count
Prediction: Break Ball; Fastball%:0.476; Breakball%:0.524
Scenario: Middle Inning, High Ball Count
Prediction: Fastball; Fastball%:0.917; Breakball%:0.083


## Gradient Boost API Model

In [None]:
from xgboost import XGBClassifier

In [None]:
# Start this section from the untouched CSV export; the previous section mutated `data` in place.
data = pd.read_csv('/content/drive/MyDrive/Capstone/data2020-2023.csv')

In [None]:
# Keep only rows with a recorded pitch type.
data = data[pd.notnull(data['pitch_type'])]

# Pitch-type codes collapsed into the "fastball" class.
fastball_pitches = ['FA', 'FF', 'FT', 'FC', 'FS', 'SI', 'SF']
def map_fastballs(x):
    """Return 1 when ``x`` is one of ``fastball_pitches``, otherwise 0."""
    return 1 if x in fastball_pitches else 0
data['pitch_type'] = data['pitch_type'].apply(map_fastballs)

def cap_extra_innings(x):
    """Return 9 for any inning above 9; return ``x`` unchanged otherwise."""
    return 9 if x > 9 else x
data['inning'] = data['inning'].apply(cap_extra_innings)

# Drop a row only when a column this section actually uses is missing.
# A blanket dropna() also removes every row where on_1b/on_2b/on_3b is NaN — i.e. every
# pitch thrown with at least one empty base — leaving only a small slice of the data.
required_columns = ['inning', 'p_throws', 'strikes', 'balls', 'outs_when_up',
                    'fld_score', 'bat_score']
data.dropna(subset=required_columns, inplace=True)

# Score differential from the fielding team's point of view.
data['score_diff'] = data['fld_score'] - data['bat_score']

# OneHotEncoder returns a 2-D array with one column per category; keep the first indicator
# column explicitly instead of relying on how pandas handles a 2-D assignment.
# NOTE(review): column 0 is the first category in sorted order — presumably 'L', so
# 1.0 = left-handed pitcher; confirm against the encoder's categories_.
p_throws_onehot = OneHotEncoder(sparse_output=False).fit_transform(data[['p_throws']])
data['p_throws'] = p_throws_onehot[:, 0]

In [None]:
# Inputs for the gradient-boosted API model.
features = [
    'inning',
    'p_throws',
    'strikes',
    'balls',
    'outs_when_up',
]
# Binary fastball / non-fastball label.
target = 'pitch_type'

In [None]:
# Feature matrix and label vector for the API feature set.
X = data[features]
y = data[target]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline XGBoost classifier on the API feature set.
# NOTE(review): the target is binary (0/1) while eval_metric is the multi-class 'mlogloss';
# no eval_set is passed here, so the metric is presumably never computed — confirm whether
# 'logloss' was intended.
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.59
              precision    recall  f1-score   support

           0       0.49      0.12      0.20      3368
           1       0.61      0.91      0.73      4968

    accuracy                           0.59      8336
   macro avg       0.55      0.52      0.46      8336
weighted avg       0.56      0.59      0.51      8336



In [None]:
# Same 18-combination grid search as in the game-stats section, run on the API feature set.
# NOTE(review): GridSearchCV is imported a second time here, and in the visible part of the
# notebook the best estimator from this search is neither kept nor saved — confirm.
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],           # Testing different numbers of trees
    'max_depth': [3, 5, 7],               # Testing different tree depths
    'learning_rate': [0.01, 0.1, 0.2]     # Testing different learning rates
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),  # Your model
    param_grid,  # The parameter grid
    cv=3,        # Number of cross-validation folds
    scoring='accuracy'  # Scoring metric
)

# Fit the grid search model
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validated accuracy: {grid_search.best_score_:.2f}')

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best cross-validated accuracy: 0.59
