In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import requests
import duckdb
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import lightgbm as lgbm
from pybaseball import statcast, pitching_stats
import datetime as dt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [None]:
raw_data = statcast(start_dt='2023-04-01', end_dt='2023-10-01', verbose = 0)
raw_data.to_csv('data2023-2023.csv')

100%|██████████| 246/246 [01:55<00:00,  2.13it/s]


In [None]:
raw_data = pd.read_csv('data2023-2023.csv', index_col=0)

In [4]:
prediction_columns = [
    #'pitch_type',
    'batter', 
    'pitcher',
    'events',
    'zone', 
    'stand', 
    'p_throws', 
    #'hit_location', 
    #'bb_type', 
    'balls',
    'on_3b', 
    'on_2b', 
    'on_1b', 
    'outs_when_up'
]

hit_analysis_data = raw_data[prediction_columns]

#Mapping events into 4 categories: hit, strikeout, hit into an out, walk
"""print(hit_analysis_data['events'].unique())
print("="*30)"""

def event_mapping(event):
    if event in ['single', 'double', 'triple', 'home_run']:
        return 'hit'
    elif event in ['strikeout', 'strikeout_double_play']:
        return 'strikeout'
    elif event in ['field_out', 'grounded_into_double_play']:
        return 'hit_into_out'
    elif event in ['walk', 'intent_walk']:
        return 'walk'
    else:
        return 'other'

hit_analysis_data['events'] = hit_analysis_data['events'].apply(event_mapping)

"""print(hit_analysis_data['events'].unique())
print("="*30)"""



hit_analysis_data['on_1b'] = hit_analysis_data['on_1b'].fillna(0)
hit_analysis_data['on_2b'] = hit_analysis_data['on_2b'].fillna(0)
hit_analysis_data['on_3b'] = hit_analysis_data['on_3b'].fillna(0)
hit_analysis_data['zone'] = hit_analysis_data['zone'].fillna(0)
#hit_analysis_data['hit_location'] = hit_analysis_data['hit_location'].fillna(0)

hit_analysis_data['on_1b'] = hit_analysis_data['on_1b'].astype(int)
hit_analysis_data['on_2b'] = hit_analysis_data['on_2b'].astype(int)
hit_analysis_data['on_3b'] = hit_analysis_data['on_3b'].astype(int)
hit_analysis_data['zone'] = hit_analysis_data['zone'].astype(int)
#hit_analysis_data['hit_location'] = hit_analysis_data['hit_location'].astype(int)

hit_analysis_data.dropna(inplace=False)
print(hit_analysis_data['events'].unique())

label_encoder = LabelEncoder()
#hit_analysis_data['pitch_type'] = LabelEncoder().fit_transform(hit_analysis_data['pitch_type'])
hit_analysis_data['events'] = label_encoder.fit_transform(hit_analysis_data['events'])
#hit_analysis_data['bb_type'] = LabelEncoder().fit_transform(hit_analysis_data['bb_type'])
hit_analysis_data['p_throws'] = LabelEncoder().fit_transform(hit_analysis_data['p_throws'])
hit_analysis_data['stand'] = LabelEncoder().fit_transform(hit_analysis_data['stand'])

['hit_into_out' 'hit' 'other' 'strikeout' 'walk']


In [5]:
hit_analysis_data.info()
hit_analysis_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 773618 entries, 190 to 578
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   batter        773618 non-null  int64
 1   pitcher       773618 non-null  int64
 2   events        773618 non-null  int32
 3   zone          773618 non-null  int32
 4   stand         773618 non-null  int32
 5   p_throws      773618 non-null  int32
 6   balls         773618 non-null  int64
 7   on_3b         773618 non-null  int32
 8   on_2b         773618 non-null  int32
 9   on_1b         773618 non-null  int32
 10  outs_when_up  773618 non-null  int64
dtypes: int32(7), int64(4)
memory usage: 50.2 MB


batter          0
pitcher         0
events          0
zone            0
stand           0
p_throws        0
balls           0
on_3b           0
on_2b           0
on_1b           0
outs_when_up    0
dtype: int64

In [12]:
"""
Feature Importances:
batter: 0.312
pitcher: 0.322
zone: 0.103
stand: 0.011
p_throws: 0.014
balls: 0.067
on_3b: 0.022
on_2b: 0.052
on_1b: 0.069
outs_when_up: 0.029
"""
features = ['batter', 'pitcher', 'zone',  'balls', 'on_3b', 'on_2b', 'on_1b'] # 'p_throws', 'outs_when_up', 'stand', 
target = 'events'

X = hit_analysis_data[features]
y = hit_analysis_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [13]:
from sklearn.metrics import f1_score
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

macro_precision = precision_score(y_test, y_pred, average='macro')
micro_precision = precision_score(y_test, y_pred, average='micro')
print(f"Macro Precision:{macro_precision}, micro Precision: {micro_precision}")

macro_recall = recall_score(y_test, y_pred, average='macro')
micro_recall = recall_score(y_test, y_pred, average='micro')
print(f"Macro Recall:{macro_precision}, micro Recall:{micro_precision}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

feature_importances = model.feature_importances_
print("Feature Importances:")
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.3f}")

f1 = f1_score(y_test, y_pred, average='micro')
print("F1 Score:", f1)

Accuracy: 0.7167795558542954
Macro Precision:0.35076970549871017, micro Precision: 0.7167795558542954
Macro Recall:0.35076970549871017, micro Recall:0.7167795558542954
Confusion Matrix:
 [[   241    757   7667     93    130]
 [   508   1405  14884    272    257]
 [  1779   4557 106465   2408    763]
 [   131    376   8166    239    433]
 [    13     60    455    112   2553]]
Feature Importances:
batter: 0.342
pitcher: 0.354
zone: 0.088
balls: 0.068
on_3b: 0.023
on_2b: 0.055
on_1b: 0.072
F1 Score: 0.7167795558542954


In [None]:
joblib.dump(model, 'hit_analysis_model.joblib', compress='lzma')

['hit_analysis_model.joblib']

In [17]:
import pickle
with open('hitAnalysismodel.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=4)