# EDA

- Goal: Design a model to predict each matchup outcome for MLB games
- Data Source: https://baseballsavant.mlb.com/statcast_search
- Things to look into
    - What predicts matchout outcome before the initial pitch?
        - pitcher, batter (career batting average, batting average of batters X facing pitcher Y), 
        - left-handed vs right-handed?
        - batters' speed and position (probably correlated with probabilities at the plate)
        - weather (temperature, humidity, wind direction)
        - Ballpark characteristics (away vs home, not all stadiums are alike)
        - Specific game situation (players on-base, bottom 9th and 2 outs so walk is unlikely), 
        - modern statistics (batted-ball exit velocity?
        - Starting with the predictors here: https://www.baseballprospectus.com/news/article/59993/singlearity-using-a-neural-network-to-predict-the-outcome-of-plate-appearances/#_ftn12
    - Predictors: events
        - 7 events?
        - out, single, double, triple, homerun, walk, hit-by-pitch
    - Data validation (downloaded data vs mlb.com)
    - understands each field/column
    - eda
        - at bat vs plate appearance
        - power hitter vs contact hitter
        - 
    - preprocessing pipeline
    - simple model
    - metrics
    - fine-tune: feature selection / model selection / HP search
    - Application
        - Situational analysis: what to do for a certain situation for the coaches
        - Daily fantasy: Simulate games for all team matchup, then aggregate the average stats for each player
    
    
    
    

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

sys.path.append('/Users/allenchen/projects/baseball-analytics/src')
from modeling.pipeline import data_pipeline

pd.set_option('display.max_columns', None)

In [3]:
# config

# gives game state data
#ANGELS_PITCHER_DATA_PATH = "../data/20230418_angels_pitcher_stats.csv"
ANGELS_BATTER_DATA_PATH = "../data/20230418_angels_batter_stats.csv"
start_of_season = datetime(2023, 3, 30)

In [4]:
#angels_pitcher_data = pd.read_csv(ANGELS_PITCHER_DATA_PATH)
angels_batter_data = pd.read_csv(ANGELS_BATTER_DATA_PATH)

In [5]:
#print(angels_pitcher_data.shape)
print(angels_batter_data.shape)

(2511, 92)


In [6]:
angels_batter_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,SL,2023-04-17,83.6,-2.11,5.48,"Neto, Zach",687263,678394,,foul,,,,,9,Zach Neto flies out to right fielder Raimel Ta...,R,R,R,BOS,LAA,S,,,2,1,2023,0.25,0.13,0.42,2.02,,571875.0,592273.0,2,3,Top,,,,,624512,,,5.359175,-121.637612,-1.80556,1.425656,23.284948,-30.747685,3.37,1.53,2.0,65.4,-46.0,83.7,2487.0,6.3,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.22,,,,,,,,26,4,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,56.0,0.0,-0.147
1,FF,2023-04-17,95.0,-1.14,5.96,"Ward, Taylor",621493,676710,strikeout,swinging_strike,,,,,11,Taylor Ward strikes out swinging.,R,R,R,BOS,LAA,S,2.0,,0,2,2023,-0.9,1.48,-0.45,4.04,,,,2,9,Top,,,,,657136,,,3.801963,-138.289353,-2.679609,-12.345344,31.49452,-12.839856,3.5,1.74,,,,94.9,2557.0,6.3,718540,676710,657136,671213,624414,646240,571771,608701,680776,606132,54.24,,,0.0,1.0,0.0,0.0,,73,3,4-Seam Fastball,3,5,5,3,5,3,5,3,Standard,Standard,204.0,0.003,-0.055
2,FC,2023-04-17,88.5,-1.28,5.94,"Ward, Taylor",621493,676710,,foul,,,,,5,Taylor Ward strikes out swinging.,R,R,R,BOS,LAA,S,,,0,1,2023,0.12,0.49,0.19,2.74,,,,2,9,Top,,,,,657136,,,3.341144,-128.939199,-2.808038,0.675659,25.944013,-26.35921,3.5,1.74,200.0,71.4,49.0,88.8,2310.0,6.4,718540,676710,657136,671213,624414,646240,571771,608701,680776,606132,54.14,,,,,,,,73,2,Cutter,3,5,5,3,5,3,5,3,Standard,Standard,158.0,0.0,-0.023
3,SL,2023-04-17,83.9,-1.99,5.47,"Neto, Zach",687263,678394,,ball,,,,,14,Zach Neto flies out to right fielder Raimel Ta...,R,R,R,BOS,LAA,B,,,1,1,2023,0.23,-0.05,1.5,1.15,,571875.0,592273.0,2,3,Top,,,,,624512,,,7.596845,-121.921568,-3.459481,0.792663,24.037741,-32.155512,3.67,1.63,,,,83.8,2414.0,6.3,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.19,,,,,,,,26,3,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,50.0,0.0,0.056
4,SL,2023-04-17,84.5,-1.84,5.56,"Neto, Zach",687263,678394,,ball,,,,,14,Zach Neto flies out to right fielder Raimel Ta...,R,R,R,BOS,LAA,B,,,0,1,2023,0.49,0.07,1.12,2.32,,571875.0,592273.0,2,3,Top,,,,,624512,,,5.890274,-122.98629,-1.299135,3.734734,25.583836,-31.392505,3.7,1.66,,,,84.3,2484.0,6.2,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.26,,,,,,,,26,2,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,50.0,0.0,0.03


In [7]:
angels_batter_data["events"].value_counts()

events
field_out                    225
strikeout                    140
single                        90
walk                          65
double                        20
home_run                      19
grounded_into_double_play     16
force_out                     16
hit_by_pitch                  10
field_error                    6
sac_fly                        5
caught_stealing_2b             2
double_play                    2
catcher_interf                 1
Name: count, dtype: int64

In [8]:
angels_batter_data.columns

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

In [9]:

# Need the following features
# Outs, [done]
# inning, [done]
# net score, [done]
# 1B occupied, [done]
# 2B occupied, [done]
# 3B occupied, [done]
# pitcher pitch number, [not available]
# top/bot of inning, [done]
# days since start of season [done]
# temperature at game start time [need weather data]

# all given columns
# data dictionary: https://baseballsavant.mlb.com/csv-docs
game_state_cols_names = [
    #'pitch_type', 
    'game_date', 
    #'release_speed',
    #'release_pos_x',
    #'release_pos_z', 
    #'player_name', # field tied to the search query
    'batter',
    'pitcher',
    'events',
    'description', 
    #'spin_dir', 
    #'spin_rate_deprecated',
    #'break_angle_deprecated', 
    #'break_length_deprecated', 
    #'zone', 
    #'des',
    'game_type', # ensure regular season only
    'stand', # side of the plate batter is standing
    'p_throws', # Hand pitcher throws with
    #'home_team', 
    #'away_team', 
    #'type', # short hand of pitch result, B=ball, S=strike, X=in play
    #'hit_location', # position of first fielder to touch the ball
    #'bb_type', # Batted ball type, ground_ball, line_drive, fly_ball, popup.
    #'balls', # pre-pitch number of balls in count
    #'strikes', # pre-pitch number of strikes in count
    #'game_year',
    #'pfx_x', 
    #'pfx_z', 
    #'plate_x', 
    #'plate_z', 
    'on_3b', # Pre-pitch MLB Player Id of Runner on 3B.
    'on_2b', # Pre-pitch MLB Player Id of Runner on 2B.
    'on_1b', # Pre-pitch MLB Player Id of Runner on 1B.
    'outs_when_up', # Pre-pitch number of outs.
    'inning', # Pre-pitch inning number.
    'inning_topbot', # Pre-pitch top or bottom of inning.
    #'hc_x', 
    #'hc_y', 
    #'tfs_deprecated', 
    #'tfs_zulu_deprecated', 
    #'fielder_2', 
    #'umpire', 
    #'sv_id',
    #'vx0', # velocity
    #'vy0', 
    #'vz0', 
    #'ax', # acceleration
    #'ay', 
    #'az',
    #'sz_top', 
    #'sz_bot',
    #'hit_distance_sc', # Projected hit distance of the batted ball.
    #'launch_speed', 
    #'launch_angle', 
    #'effective_speed', # Derived speed based on the the extension of the pitcher's release.
    #'release_spin_rate', # Spin rate of pitch tracked by Statcast.
    #'release_extension', # Release extension of pitch in feet as tracked by Statcast.
    'game_pk', # unique id for the game
    #'pitcher.1',
    #'fielder_2.1', # Player Id for catcher
    #'fielder_3', # Player id for 1B
    #'fielder_4', # 2B
    #'fielder_5', # 3B
    #'fielder_6', # SS
    #'fielder_7', # LF
    #'fielder_8', # CF 
    #'fielder_9', # RF
    #'release_pos_y', 
    #'estimated_ba_using_speedangle', 
    #'estimated_woba_using_speedangle',
    #'woba_value', # weighted on base average wOBA value based on result of play.
    #'woba_denom', 
    #'babip_value', # Batting average on Balls in Play value based on result of play. BABIP=(H - HR)/(AB - K - HR + SF)
    #'iso_value', # ISO value based on result of play. ISO (isolated power) = (1x2B + 2x3B + 3xHR) / At-bats OR Slugging percentage - Batting average
    #'launch_speed_angle', # Launch speed/angle zone based on launch angle and exit velocity.
    #'at_bat_number', # Plate appearance number of the game.
    #'pitch_number', # Total pitch number of the plate appearance.
    #'pitch_name', 
    #'home_score', 
    #'away_score', 
    'bat_score', 
    'fld_score', 
    #'post_away_score',
    #'post_home_score', 
    #'post_bat_score', 
    #'post_fld_score',
    #'if_fielding_alignment', # Infield fielding alignment at the time of the pitch.
    #'of_fielding_alignment', # Outfield fielding alignment at the time of the pitch.
    #'spin_axis',
    #'delta_home_win_exp', # The change in Win Expectancy before the Plate Appearance and after the Plate Appearance
    #'delta_run_exp' # The change in Run Expectancy before the Pitch and after the Pitch
]

In [63]:
# ETL (Transform)
# get necesssary data
data = angels_batter_data[game_state_cols_names].copy()
data = data[data["events"].notnull()]
data.reset_index(drop=True, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   game_date      617 non-null    object 
 1   batter         617 non-null    int64  
 2   pitcher        617 non-null    int64  
 3   events         617 non-null    object 
 4   description    617 non-null    object 
 5   game_type      617 non-null    object 
 6   stand          617 non-null    object 
 7   p_throws       617 non-null    object 
 8   on_3b          79 non-null     float64
 9   on_2b          122 non-null    float64
 10  on_1b          229 non-null    float64
 11  outs_when_up   617 non-null    int64  
 12  inning         617 non-null    int64  
 13  inning_topbot  617 non-null    object 
 14  game_pk        617 non-null    int64  
 15  bat_score      617 non-null    int64  
 16  fld_score      617 non-null    int64  
dtypes: float64(3), int64(7), object(7)
memory usage: 82.1+

In [64]:
sys.path.append('/Users/allenchen/projects/baseball-analytics/src')
from modeling.pipeline import data_pipeline
cols = [col for col in data.columns if col != "events"]
X, y = data[cols], data[["events"]]

In [65]:
data_pipeline

In [66]:
X.columns

Index(['game_date', 'batter', 'pitcher', 'description', 'game_type', 'stand',
       'p_throws', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning',
       'inning_topbot', 'game_pk', 'bat_score', 'fld_score'],
      dtype='object')

In [67]:
Xt = data_pipeline.fit_transform(X, y)

IDENTITY
['batter', 'pitcher', 'description', 'game_type', 'stand', 'p_throws', 'outs_when_up', 'inning', 'inning_topbot', 'game_pk']
EncodeOnBaseOccupancy
ComputeNetScore
ComputeDaysSinceStart


In [68]:
len(Xt[0])

15

In [69]:
transformed_feature_col_names = data_pipeline["feature_transformers"].get_feature_names_out().tolist()

In [70]:
pd.DataFrame(Xt, columns=transformed_feature_col_names).head(20)

Unnamed: 0,identity__batter,identity__pitcher,identity__description,identity__game_type,identity__stand,identity__p_throws,identity__outs_when_up,identity__inning,identity__inning_topbot,identity__game_pk,encode_on_base_occupancy__on_1b,encode_on_base_occupancy__on_2b,encode_on_base_occupancy__on_3b,compute_net_score__net_score,compute_days_since_start__days_since_start
0,621493,676710,swinging_strike,R,R,R,2,9,Top,718540,0,0,0,2,18
1,660271,676710,hit_into_play,R,L,R,1,9,Top,718540,0,0,0,2,18
2,621433,678394,swinging_strike,R,L,R,1,3,Top,718540,1,1,0,4,18
3,687263,676710,hit_into_play,R,R,R,0,9,Top,718540,0,0,0,2,18
4,621433,676710,swinging_strike,R,L,R,2,8,Top,718540,0,0,0,2,18
5,681351,678394,swinging_strike,R,R,R,0,3,Top,718540,1,1,0,4,18
6,681351,676710,hit_into_play,R,R,R,1,8,Top,718540,0,0,0,2,18
7,592273,678394,hit_into_play,R,R,R,0,3,Top,718540,1,0,0,4,18
8,592273,676710,hit_into_play,R,R,R,0,8,Top,718540,0,0,0,2,18
9,571875,678394,hit_into_play,R,L,R,0,3,Top,718540,0,0,0,4,18


In [None]:
from sklearn import set_config

set_config(display='diagram')
display(clf_pipeline)

In [60]:
# preprocessing
data["game_date"] = pd.to_datetime(data["game_date"])
data["net_score"] = data["bat_score"] - data["fld_score"]
data["on_1b"] = np.where(data["on_1b"].notnull(), 1, 0)
data["on_2b"] = np.where(data["on_2b"].notnull(), 1, 0)
data["on_3b"] = np.where(data["on_3b"].notnull(), 1, 0)
data

Unnamed: 0,game_date,batter,pitcher,events,description,game_type,stand,p_throws,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,game_pk,bat_score,fld_score,net_score
0,2023-04-17,621493,676710,strikeout,swinging_strike,R,R,R,0,0,0,2,9,Top,718540,5,3,2
1,2023-04-17,660271,676710,field_out,hit_into_play,R,L,R,0,0,0,1,9,Top,718540,5,3,2
2,2023-04-17,621433,678394,strikeout,swinging_strike,R,L,R,0,1,1,1,3,Top,718540,5,1,4
3,2023-04-17,687263,676710,field_out,hit_into_play,R,R,R,0,0,0,0,9,Top,718540,5,3,2
4,2023-04-17,621433,676710,strikeout,swinging_strike,R,L,R,0,0,0,2,8,Top,718540,5,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,2023-03-30,650859,642758,field_out,hit_into_play,R,L,R,0,0,0,0,7,Top,718769,1,0,1
613,2023-03-30,681351,543507,field_out,hit_into_play,R,R,R,0,0,0,0,8,Top,718769,1,0,1
614,2023-03-30,545361,667427,field_out,hit_into_play,R,R,R,0,0,0,0,6,Top,718769,1,0,1
615,2023-03-30,621493,666205,strikeout,swinging_strike,R,R,L,0,0,0,0,1,Top,718769,0,0,0


In [62]:
# get features
data["days_since_start"] = data["game_date"].apply(lambda x: (x - start_of_season).days)
data

Unnamed: 0,game_date,batter,pitcher,events,description,game_type,stand,p_throws,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,game_pk,bat_score,fld_score,net_score,days_since_start
0,2023-04-17,621493,676710,strikeout,swinging_strike,R,R,R,0,0,0,2,9,Top,718540,5,3,2,18
1,2023-04-17,660271,676710,field_out,hit_into_play,R,L,R,0,0,0,1,9,Top,718540,5,3,2,18
2,2023-04-17,621433,678394,strikeout,swinging_strike,R,L,R,0,1,1,1,3,Top,718540,5,1,4,18
3,2023-04-17,687263,676710,field_out,hit_into_play,R,R,R,0,0,0,0,9,Top,718540,5,3,2,18
4,2023-04-17,621433,676710,strikeout,swinging_strike,R,L,R,0,0,0,2,8,Top,718540,5,3,2,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,2023-03-30,650859,642758,field_out,hit_into_play,R,L,R,0,0,0,0,7,Top,718769,1,0,1,0
613,2023-03-30,681351,543507,field_out,hit_into_play,R,R,R,0,0,0,0,8,Top,718769,1,0,1,0
614,2023-03-30,545361,667427,field_out,hit_into_play,R,R,R,0,0,0,0,6,Top,718769,1,0,1,0
615,2023-03-30,621493,666205,strikeout,swinging_strike,R,R,L,0,0,0,0,1,Top,718769,0,0,0,0


In [None]:
# preprocessing pipeline

# feature pipeline


In [None]:
# simple model

In [None]:
pipeline vs columtrasnformer()  

In [68]:
data.columns.tolist()

['game_date',
 'batter',
 'pitcher',
 'events',
 'description',
 'game_type',
 'stand',
 'p_throws',
 'on_3b',
 'on_2b',
 'on_1b',
 'outs_when_up',
 'inning',
 'inning_topbot',
 'game_pk',
 'bat_score',
 'fld_score']

In [None]:
start_date = "2023-"