# EDA

- Goal: Design a model to predict each matchup outcome for MLB games
- Data Source: https://baseballsavant.mlb.com/statcast_search
- Things to look into
    - What predicts matchout outcome before the initial pitch?
        - pitcher, batter (career batting average, batting average of batters X facing pitcher Y), 
        - left-handed vs right-handed?
        - batters' speed and position (probably correlated with probabilities at the plate)
        - weather (temperature, humidity, wind direction)
        - Ballpark characteristics (away vs home, not all stadiums are alike)
        - Specific game situation (players on-base, bottom 9th and 2 outs so walk is unlikely), 
        - modern statistics (batted-ball exit velocity?
        - Starting with the predictors here: https://www.baseballprospectus.com/news/article/59993/singlearity-using-a-neural-network-to-predict-the-outcome-of-plate-appearances/#_ftn12
    - Predictors: events
        - 7 events?
        - out, single, double, triple, homerun, walk, hit-by-pitch
    - Data validation (downloaded data vs mlb.com)
    - understands each field/column
    - eda
        - at bat vs plate appearance
        - power hitter vs contact hitter
        - 
    - preprocessing pipeline
    - simple model
    - metrics
    - fine-tune: feature selection / model selection / HP search
    - Application
        - Situational analysis: what to do for a certain situation for the coaches
        - Daily fantasy: Simulate games for all team matchup, then aggregate the average stats for each player
    
    
    
    

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from datetime import date, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append('/Users/allenchen/projects/baseball-analytics/src')
from modeling.pipeline import data_pipeline

pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 400)

In [3]:
# config

# gives game state data
#ANGELS_PITCHER_DATA_PATH = "../data/20230418_angels_pitcher_2023.csv"
ANGELS_BATTER_DATA_PATH = "../data/20230418_angels_batter_2023.csv"
ANGELS_BATTER_2022_PATH = "../data/angels_batter_2022_season.csv"
start_of_season = date(2023, 3, 30)

In [4]:
#angels_pitcher_data = pd.read_csv(ANGELS_PITCHER_DATA_PATH)
angels_batter_data = pd.read_csv(ANGELS_BATTER_DATA_PATH)
angels_batter_2022_data = pd.read_csv(ANGELS_BATTER_2022_PATH)

In [5]:
angels_batter_data_2022_2023 = pd.concat([angels_batter_data, angels_batter_2022_data])

In [6]:
#print(angels_pitcher_data.shape)
print(angels_batter_data_2022_2023.shape)

(25610, 92)


In [7]:
angels_batter_data_2022_2023.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,SL,2023-04-17,83.6,-2.11,5.48,"Neto, Zach",687263,678394,,foul,,,,,9,Zach Neto flies out to right fielder Raimel Tapia.,R,R,R,BOS,LAA,S,,,2,1,2023,0.25,0.13,0.42,2.02,,571875.0,592273.0,2,3,Top,,,,,624512,,,5.359175,-121.637612,-1.80556,1.425656,23.284948,-30.747685,3.37,1.53,2.0,65.4,-46.0,83.7,2487.0,6.3,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.22,,,,,,,,26,4,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,56.0,0.0,-0.147
1,FF,2023-04-17,95.0,-1.14,5.96,"Ward, Taylor",621493,676710,strikeout,swinging_strike,,,,,11,Taylor Ward strikes out swinging.,R,R,R,BOS,LAA,S,2.0,,0,2,2023,-0.9,1.48,-0.45,4.04,,,,2,9,Top,,,,,657136,,,3.801963,-138.289353,-2.679609,-12.345344,31.49452,-12.839856,3.5,1.74,,,,94.9,2557.0,6.3,718540,676710,657136,671213,624414,646240,571771,608701,680776,606132,54.24,,,0.0,1.0,0.0,0.0,,73,3,4-Seam Fastball,3,5,5,3,5,3,5,3,Standard,Standard,204.0,0.003,-0.055
2,FC,2023-04-17,88.5,-1.28,5.94,"Ward, Taylor",621493,676710,,foul,,,,,5,Taylor Ward strikes out swinging.,R,R,R,BOS,LAA,S,,,0,1,2023,0.12,0.49,0.19,2.74,,,,2,9,Top,,,,,657136,,,3.341144,-128.939199,-2.808038,0.675659,25.944013,-26.35921,3.5,1.74,200.0,71.4,49.0,88.8,2310.0,6.4,718540,676710,657136,671213,624414,646240,571771,608701,680776,606132,54.14,,,,,,,,73,2,Cutter,3,5,5,3,5,3,5,3,Standard,Standard,158.0,0.0,-0.023
3,SL,2023-04-17,83.9,-1.99,5.47,"Neto, Zach",687263,678394,,ball,,,,,14,Zach Neto flies out to right fielder Raimel Tapia.,R,R,R,BOS,LAA,B,,,1,1,2023,0.23,-0.05,1.5,1.15,,571875.0,592273.0,2,3,Top,,,,,624512,,,7.596845,-121.921568,-3.459481,0.792663,24.037741,-32.155512,3.67,1.63,,,,83.8,2414.0,6.3,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.19,,,,,,,,26,3,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,50.0,0.0,0.056
4,SL,2023-04-17,84.5,-1.84,5.56,"Neto, Zach",687263,678394,,ball,,,,,14,Zach Neto flies out to right fielder Raimel Tapia.,R,R,R,BOS,LAA,B,,,0,1,2023,0.49,0.07,1.12,2.32,,571875.0,592273.0,2,3,Top,,,,,624512,,,5.890274,-122.98629,-1.299135,3.734734,25.583836,-31.392505,3.7,1.66,,,,84.3,2484.0,6.2,718540,678394,624512,671213,624414,646240,571771,608701,680776,606132,54.26,,,,,,,,26,2,Slider,1,5,5,1,5,1,5,1,Strategic,Standard,50.0,0.0,0.03


In [8]:
angels_batter_data_2022_2023.events.value_counts()

events
field_out                    2579
strikeout                    1675
single                        915
walk                          486
double                        239
home_run                      209
force_out                     118
grounded_into_double_play     111
hit_by_pitch                   64
field_error                    36
triple                         31
sac_fly                        30
sac_bunt                       25
double_play                    16
fielders_choice                16
caught_stealing_2b             10
fielders_choice_out             7
strikeout_double_play           4
catcher_interf                  2
triple_play                     1
pickoff_1b                      1
Name: count, dtype: int64

In [75]:
data[data["launch_speed"].isnull()]

Unnamed: 0,game_date,batter,pitcher,events,description,game_type,stand,p_throws,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,launch_speed,game_pk,bat_score,fld_score
0,2023-04-17,621493,676710,so,swinging_strike,R,R,R,,,,2,9,Top,,718540,5,3
2,2023-04-17,621433,678394,so,swinging_strike,R,L,R,,571875.0,592273.0,1,3,Top,,718540,5,1
4,2023-04-17,621433,676710,so,swinging_strike,R,L,R,,,,2,8,Top,,718540,5,3
5,2023-04-17,681351,678394,so,swinging_strike,R,R,R,,571875.0,592273.0,0,3,Top,,718540,5,1
13,2023-04-17,592669,676710,so,called_strike,R,R,R,,,,0,7,Top,,718540,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6556,2022-04-07,660271,664285,so,called_strike,R,L,L,,,,2,3,Bot,,661042,0,1
6561,2022-04-07,666176,664285,so,swinging_strike,R,R,L,,,622110.0,0,2,Bot,,661042,0,0
6564,2022-04-07,669016,664208,hbp,hit_by_pitch,R,L,R,,,,2,8,Bot,,661042,0,3
6570,2022-04-07,545361,664285,w,ball,R,R,L,,,,1,1,Bot,,661042,0,0


## Planned Features

### Game State
- Outs, [done]
- inning, [done]
- net score, [done]
- 1B occupied, [done]
- 2B occupied, [done]
- 3B occupied, [done]
- pitcher pitch number, [not available]
- top/bot of inning, [done]
- days since start of season [done]
- temperature at game start time [need weather data]

### Batter 365 day moving average
- Rates per PA for: 1B, 2B, 3B, HR, BB, SO, DP, FO, HBP, SF, SH [done]
- Avg WOBA [done]
    - numerator events needed: w, hbp, s, d, t, hr
    - denominator: # of PA (technically PA - IBB, but IBB mixed with BB currently)
- Max exit velocity, avg exit velocity [done]
    - # exit velocity events: fo, s, d, t, hr, dp, e, t, sf, sh, fc, tp
- # of plate appearances [done]
- Platoon wOBA, # of platoon plate appearances [done]
- Relative park factor hits



In [257]:
# all given columns
# data dictionary: https://baseballsavant.mlb.com/csv-docs
game_state_cols_names = [
    #'pitch_type', 
    'game_date', 
    #'release_speed',
    #'release_pos_x',
    #'release_pos_z', 
    #'player_name', # field tied to the search query
    'batter',
    'pitcher',
    'events',
    'description', 
    #'spin_dir', 
    #'spin_rate_deprecated',
    #'break_angle_deprecated', 
    #'break_length_deprecated', 
    #'zone', 
    #'des',
    'game_type', # ensure regular season only
    'stand', # side of the plate batter is standing
    'p_throws', # Hand pitcher throws with
    'home_team', 
    'away_team', 
    #'type', # short hand of pitch result, B=ball, S=strike, X=in play
    #'hit_location', # position of first fielder to touch the ball
    #'bb_type', # Batted ball type, ground_ball, line_drive, fly_ball, popup.
    #'balls', # pre-pitch number of balls in count
    #'strikes', # pre-pitch number of strikes in count
    'game_year',
    #'pfx_x', 
    #'pfx_z', 
    #'plate_x', 
    #'plate_z', 
    'on_3b', # Pre-pitch MLB Player Id of Runner on 3B.
    'on_2b', # Pre-pitch MLB Player Id of Runner on 2B.
    'on_1b', # Pre-pitch MLB Player Id of Runner on 1B.
    'outs_when_up', # Pre-pitch number of outs.
    'inning', # Pre-pitch inning number.
    'inning_topbot', # Pre-pitch top or bottom of inning.
    #'hc_x', 
    #'hc_y', 
    #'tfs_deprecated', 
    #'tfs_zulu_deprecated', 
    #'fielder_2', 
    #'umpire', 
    #'sv_id',
    #'vx0', # velocity
    #'vy0', 
    #'vz0', 
    #'ax', # acceleration
    #'ay', 
    #'az',
    #'sz_top', 
    #'sz_bot',
    #'hit_distance_sc', # Projected hit distance of the batted ball.
    'launch_speed', # Exit velocity of the batted ball as tracked by Statcast.
    #'launch_angle', 
    #'effective_speed', # Derived speed based on the the extension of the pitcher's release.
    #'release_spin_rate', # Spin rate of pitch tracked by Statcast.
    #'release_extension', # Release extension of pitch in feet as tracked by Statcast.
    'game_pk', # unique id for the game
    #'pitcher.1',
    #'fielder_2.1', # Player Id for catcher
    #'fielder_3', # Player id for 1B
    #'fielder_4', # 2B
    #'fielder_5', # 3B
    #'fielder_6', # SS
    #'fielder_7', # LF
    #'fielder_8', # CF 
    #'fielder_9', # RF
    #'release_pos_y', 
    #'estimated_ba_using_speedangle', 
    #'estimated_woba_using_speedangle',
    #'woba_value', # weighted on base average wOBA value based on result of play.
    #'woba_denom', 
    #'babip_value', # Batting average on Balls in Play value based on result of play. BABIP=(H - HR)/(AB - K - HR + SF)
    #'iso_value', # ISO value based on result of play. ISO (isolated power) = (1x2B + 2x3B + 3xHR) / At-bats OR Slugging percentage - Batting average
    #'launch_speed_angle', # Launch speed/angle zone based on launch angle and exit velocity.
    #'at_bat_number', # Plate appearance number of the game.
    #'pitch_number', # Total pitch number of the plate appearance.
    #'pitch_name', 
    #'home_score', 
    #'away_score', 
    'bat_score', 
    'fld_score', 
    #'post_away_score',
    #'post_home_score', 
    #'post_bat_score', 
    #'post_fld_score',
    #'if_fielding_alignment', # Infield fielding alignment at the time of the pitch.
    #'of_fielding_alignment', # Outfield fielding alignment at the time of the pitch.
    #'spin_axis',
    #'delta_home_win_exp', # The change in Win Expectancy before the Plate Appearance and after the Plate Appearance
    #'delta_run_exp' # The change in Run Expectancy before the Pitch and after the Pitch
]

In [258]:
# ETL (Transform)
# get necesssary data
data = angels_batter_data_2022_2023[game_state_cols_names].copy()
data = data[data["events"].notnull()]
data.reset_index(drop=True, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6575 entries, 0 to 6574
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   game_date      6575 non-null   object 
 1   batter         6575 non-null   int64  
 2   pitcher        6575 non-null   int64  
 3   events         6575 non-null   object 
 4   description    6575 non-null   object 
 5   game_type      6575 non-null   object 
 6   stand          6575 non-null   object 
 7   p_throws       6575 non-null   object 
 8   home_team      6575 non-null   object 
 9   away_team      6575 non-null   object 
 10  game_year      6575 non-null   int64  
 11  on_3b          595 non-null    float64
 12  on_2b          1139 non-null   float64
 13  on_1b          1972 non-null   float64
 14  outs_when_up   6575 non-null   int64  
 15  inning         6575 non-null   int64  
 16  inning_topbot  6575 non-null   object 
 17  launch_speed   4323 non-null   float64
 18  game_pk 

In [259]:
data.shape

(6575, 21)

In [260]:
data.columns

Index(['game_date', 'batter', 'pitcher', 'events', 'description', 'game_type',
       'stand', 'p_throws', 'home_team', 'away_team', 'game_year', 'on_3b',
       'on_2b', 'on_1b', 'outs_when_up', 'inning', 'inning_topbot',
       'launch_speed', 'game_pk', 'bat_score', 'fld_score'],
      dtype='object')

In [261]:
# clean target column

from collections import defaultdict
events_mapping_dict = {
    "field_out": "fo", 
    "strikeout": "so", 
    "single": "s", 
    "double": "d", 
    "triple": "t", 
    "walk": "w", 
    "home_run": "hr", 
    "force_out": "fo", 
    "grounded_into_double_play": "dp", 
    "double_play": "dp", 
    "hit_by_pitch": "hbp", 
    "field_error": "e", 
    "sac_fly": "sf", 
    "sac_bunt": "sh", 
    "fielders_choice": "fc", 
    "caught_stealing_2b": "delete", 
    "fielders_choice_out": "fo", 
    "strikeout_double_play": "so", 
    "catcher_interf": "e", 
    "triple_play": "tp", 
    "pickoff_1b": "delete", 
}
events_mapping = defaultdict(lambda: "delete", events_mapping_dict)
data["events"] = data["events"].map(events_mapping)
data.drop(data[data["events"] == "delete"].index, inplace=True)

In [262]:
data.events.value_counts()

events
fo     2704
so     1679
s       915
w       486
d       239
hr      209
dp      127
hbp      64
e        38
t        31
sf       30
sh       25
fc       16
tp        1
Name: count, dtype: int64

In [263]:
data.p_throws.value_counts()

p_throws
R    4599
L    1965
Name: count, dtype: int64

### Prepare Moving average

In [86]:
player_subset = data.loc[(data["batter"] == 660271), ["game_date", "events"]]
player_subset.head()

Unnamed: 0,game_date,events
1,2023-04-17,fo
16,2023-04-17,s
20,2023-04-17,e
34,2023-04-17,s
37,2023-04-17,fo


In [88]:
from datetime import datetime
datetime(2022, 4, 7) + timedelta(365)

datetime.datetime(2023, 4, 7, 0, 0)

In [390]:

# start is the first day of the training dataset, end is the last day of the training dataset
t_index = pd.DatetimeIndex(pd.date_range(start=datetime(2022, 4, 7), end=datetime(2023, 4, 17), freq="1d"))

player_subset["game_date"] = pd.to_datetime(player_subset["game_date"])
player_subset["at_bat_count"] = 1.0
at_bat_ts = (
    player_subset[["game_date", "at_bat_count"]]
    .groupby(["game_date"])
    .sum()
    .sort_index(ascending=True)
    .resample("1D")
    .mean()
    .reindex(t_index) # most players dont have stats all the way back to the start date of the training dataset
    .fillna(0)
)
display(at_bat_ts.tail())

player_event_subset = player_subset[player_subset["events"] == 's']
player_event_subset.drop(["at_bat_count"], axis=1, inplace=True)
player_event_subset["stats_count"] = 1
stats_ts = (
    player_event_subset[["game_date", "stats_count"]]
    .groupby(["game_date"])
    .sum()
    .sort_index(ascending=True)
    .resample("1D")
    .mean()
    .reindex(t_index)
    .fillna(0)
)
display(stats_ts.tail())

combined_ts = stats_ts.merge(at_bat_ts, left_index=True, right_index=True)

def compute(window):
    num = combined_ts.loc[window.index, "stats_count"].sum()
    den = combined_ts.loc[window.index, "at_bat_count"].sum()
    # rookie debut safety consideration (in case of extremely low or high stat)
    if den < 10:
        return 0.1
    return num / (den + 1)

windows = combined_ts.rolling(365)
stat_ma = windows.apply(compute, raw=False)["stats_count"].shift()

# output will always be say ts starting from 2023-04-07, because data starts at 2022-04-07 (see above)
# this will be set in stone
computed_ma_start_date = stat_ma.index[365]
assert(date(computed_ma_start_date.year, computed_ma_start_date.month, computed_ma_start_date.day) == date(2023, 4, 7))
stat_ma_lst = stat_ma.tolist()[365:]
stat_ma_lst#.tail(30)


Unnamed: 0,at_bat_count
2023-04-13,0.0
2023-04-14,5.0
2023-04-15,5.0
2023-04-16,4.0
2023-04-17,5.0


Unnamed: 0,stats_count
2023-04-13,0.0
2023-04-14,1.0
2023-04-15,2.0
2023-04-16,0.0
2023-04-17,2.0


[0.13884785819793205,
 0.14032496307237813,
 0.1390532544378698,
 0.140117994100295,
 0.140117994100295,
 0.140117994100295,
 0.1394658753709199,
 0.1394658753709199,
 0.14094955489614244,
 0.14391691394658754,
 0.1426448736998514]

In [385]:
date(2023, 4, 7) - timedelta(365)

datetime.date(2022, 4, 7)

In [329]:
stat_ma_ind = (datetime(2023, 4, 17) - datetime(2023, 4, 8)).days
print(stat_ma_ind)
stat_ma_lst[stat_ma_ind]

9

In [373]:
ex_df = pd.DataFrame({"A": [[1, 10], [2, 20], [3, 30]]})
ex_df.loc[ex_df.index == 2, "A"].values[0]#[1]

[3, 30]

In [191]:
# prototype for head-to-head transformer

data_subset = data[["batter", "pitcher", "events"]].copy()
matchup_count = data_subset.groupby(["batter", "pitcher"])["events"].count().reset_index().rename(columns={"events": "PA"})
matchup_event_count = (
    data_subset
    .groupby(["batter", "pitcher", "events"])["events"]
    .count()
    .to_frame()
    .rename(columns={"events": "count"})
    .reset_index()
)
matchup_event_count = matchup_event_count[matchup_event_count["events"].isin(["s", "d", "hr", "w", "so"])]

matchup_df = matchup_event_count.merge(matchup_count, on=["batter", "pitcher"])
matchup_df["rate"] = matchup_df["count"] / matchup_df["PA"]
matchup_df.drop(["count", "PA"], axis=1, inplace=True)
matchup_df = (
    pd.pivot_table(matchup_df, values="rate", index=["batter", "pitcher"], columns=["events"])
    .reset_index()
    .rename(columns={"s": "1B", "d": "2B", "hr": "HR", "w": "BB", "so": "SO"})
)


# add pa
matchup_df = matchup_df.merge(matchup_count, on=["batter", "pitcher"])

# add wOBA
WOBA_FACTORS = {
    "w": 0.702,
    "hbp": 0.733,
    "s": 0.892,
    "d": 1.261,
    "t": 1.593,
    "hr": 2.039,
}
woba_events = ["w", "hbp", "s", "d", "t", "hr"]
data_subset = data_subset[data_subset["events"].isin(woba_events)].copy()
data_subset["weight"] = data_subset["events"].map(WOBA_FACTORS)
matchup_woba_df = data_subset.groupby(["batter", "pitcher"])["weight"].sum().reset_index().rename(columns={"weight": "count"})
matchup_woba_df = matchup_woba_df.merge(matchup_count, on=["batter", "pitcher"])
matchup_woba_df["wOBA"] = matchup_woba_df["count"] / matchup_woba_df["PA"]
matchup_woba_df.drop(["count", "PA"], axis=1, inplace=True)

matchup_df = matchup_df.merge(matchup_woba_df, on=["batter", "pitcher"])

def make_bp_index(row):
    return str(int(row["batter"])) + str(int(row["pitcher"]))

matchup_df["bp_index"] = matchup_df.apply(make_bp_index, axis=1)
matchup_df.fillna(0.0, inplace=True)
matchup_df.drop(["batter", "pitcher"], axis=1, inplace=True)
matchup_df.set_index("bp_index", inplace=True)
matchup_stat_mapping = defaultdict(lambda: "no such matchup", **matchup_df.to_dict(orient="index"))
matchup_stat_mapping["000"]#["435559456501"]["2B"]

'no such matchup'

### Preprocessing Pipeline Planning

In [22]:
# preprocessing
data["game_date"] = pd.to_datetime(data["game_date"])
data

Unnamed: 0,game_date,batter,pitcher,events,description,game_type,stand,p_throws,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,launch_speed,game_pk,bat_score,fld_score
0,2023-04-17,621493,676710,so,swinging_strike,R,R,R,,,,2,9,Top,,718540,5,3
1,2023-04-17,660271,676710,fo,hit_into_play,R,L,R,,,,1,9,Top,97.1,718540,5,3
2,2023-04-17,621433,678394,so,swinging_strike,R,L,R,,571875.0,592273.0,1,3,Top,,718540,5,1
3,2023-04-17,687263,676710,fo,hit_into_play,R,R,R,,,,0,9,Top,91.5,718540,5,3
4,2023-04-17,621433,676710,so,swinging_strike,R,L,R,,,,2,8,Top,,718540,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6570,2022-04-07,545361,664285,w,ball,R,R,L,,,,1,1,Bot,,661042,0,0
6571,2022-04-07,543685,519151,fo,hit_into_play,R,R,R,,,545361.0,0,9,Bot,96.8,661042,1,3
6572,2022-04-07,666176,664208,so,swinging_strike,R,R,R,,,642180.0,2,7,Bot,,661042,0,1
6573,2022-04-07,545361,519151,s,hit_into_play,R,R,R,,,,0,9,Bot,70.9,661042,1,3


In [None]:
# get features
data["net_score"] = data["bat_score"] - data["fld_score"]
data["on_1b"] = np.where(data["on_1b"].notnull(), 1, 0)
data["on_2b"] = np.where(data["on_2b"].notnull(), 1, 0)
data["on_3b"] = np.where(data["on_3b"].notnull(), 1, 0)
data["days_since_start"] = data["game_date"].apply(lambda x: (x - start_of_season).days)
data

In [84]:
(date.today() - date(2023, 4, 21)).days

1

In [89]:
arr = [1, 2, 3, 7, 9]
window_size = 3
numbers_series = pd.Series(arr)
windows = numbers_series.rolling(window_size)
windows.mean().tolist()[window_size-1:]

[2.0, 4.0, 6.333333333333333]

In [240]:
data.events.value_counts()

events
fo     2704
so     1679
s       915
w       486
d       239
hr      209
dp      127
hbp      64
e        38
t        31
sf       30
sh       25
fc       16
tp        1
Name: count, dtype: int64

In [82]:
date(2022, 4, 7) + timedelta(30)

datetime.date(2022, 5, 7)

### Prod model pipeline

In [252]:
# Testing code for the prod pipeline

sys.path.append('/Users/allenchen/projects/baseball-analytics/src')
from modeling.pipeline import data_pipeline
cols = [col for col in data.columns if col != "events"]
X, y = data[cols], data[["events"]]

In [253]:
%%time

Xt = data_pipeline.fit_transform(X, y)
transformed_feature_col_names = data_pipeline["feature_transformers"].get_feature_names_out().tolist()
features = pd.DataFrame(Xt, columns=transformed_feature_col_names)

CPU times: user 11.8 s, sys: 107 ms, total: 12 s
Wall time: 12.1 s


In [254]:
features_and_events = pd.concat([features, y], axis=1)
features_and_events.head(5)

Unnamed: 0,identity__batter,identity__pitcher,identity__game_date,identity__stand,identity__p_throws,identity__outs_when_up,identity__inning,identity__inning_topbot,identity__home_team,encode_on_base_occupancy__on_1b,encode_on_base_occupancy__on_2b,encode_on_base_occupancy__on_3b,compute_net_score__net_score,compute_days_since_start__days_since_start,batter_365_days_ma__PA,batter_365_days_ma__1B,batter_365_days_ma__2B,batter_365_days_ma__3B,batter_365_days_ma__HR,batter_365_days_ma__BB,batter_365_days_ma__SO,batter_365_days_ma__DP,batter_365_days_ma__FO,batter_365_days_ma__HBP,batter_365_days_ma__SF,batter_365_days_ma__SH,batter_365_days_ma__wOBA,batter_365_days_ma__mEV,batter_365_days_ma__aEV,batter_365_days_ma__pwOBA,batter_365_days_ma__pPA,head_to_head__1B,head_to_head__2B,head_to_head__HR,head_to_head__BB,head_to_head__SO,head_to_head__PA,head_to_head__wOBA,ball_park__1B,ball_park__2B,ball_park__3B,ball_park__HR,ball_park__BB,events
0,621493,676710,2023-04-17,R,R,2,9,Top,BOS,0,0,0,2,18,626.0,0.164537,0.036741,0.003195,0.038339,0.103834,0.210863,0.01278,0.402556,0.007987,0.007987,0.0,0.355105,112.6,89.856028,0.366904,449.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107,122,138,107,99,so
1,660271,676710,2023-04-17,L,R,1,9,Top,BOS,0,0,0,2,18,672.0,0.142857,0.044643,0.008929,0.050595,0.095238,0.244048,0.013393,0.38244,0.00744,0.004464,0.0,0.373421,118.0,92.558447,0.381971,446.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107,122,138,107,99,fo
2,621433,678394,2023-04-17,L,R,1,3,Top,BOS,1,1,0,4,18,4.0,0.0,0.0,0.0,0.0,0.75,0.25,0.0,0.0,0.0,0.0,0.0,0.5265,0.0,0.0,0.351,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107,122,138,107,99,so
3,687263,676710,2023-04-17,R,R,0,9,Top,BOS,0,0,0,2,18,8.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.5,0.0,0.0,0.0,0.0,105.4,85.633332,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107,122,138,107,99,fo
4,621433,676710,2023-04-17,L,R,2,8,Top,BOS,0,0,0,2,18,4.0,0.0,0.0,0.0,0.0,0.75,0.25,0.0,0.0,0.0,0.0,0.0,0.5265,0.0,0.0,0.351,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107,122,138,107,99,so


In [255]:
features_and_events[
    (features_and_events["identity__batter"] == 660271) &
    #(features_and_events["identity__game_date"] == "2023-04-09")
    (features_and_events["head_to_head__1B"] > 0)
]

Unnamed: 0,identity__batter,identity__pitcher,identity__game_date,identity__stand,identity__p_throws,identity__outs_when_up,identity__inning,identity__inning_topbot,identity__home_team,encode_on_base_occupancy__on_1b,encode_on_base_occupancy__on_2b,encode_on_base_occupancy__on_3b,compute_net_score__net_score,compute_days_since_start__days_since_start,batter_365_days_ma__PA,batter_365_days_ma__1B,batter_365_days_ma__2B,batter_365_days_ma__3B,batter_365_days_ma__HR,batter_365_days_ma__BB,batter_365_days_ma__SO,batter_365_days_ma__DP,batter_365_days_ma__FO,batter_365_days_ma__HBP,batter_365_days_ma__SF,batter_365_days_ma__SH,batter_365_days_ma__wOBA,batter_365_days_ma__mEV,batter_365_days_ma__aEV,batter_365_days_ma__pwOBA,batter_365_days_ma__pPA,head_to_head__1B,head_to_head__2B,head_to_head__HR,head_to_head__BB,head_to_head__SO,head_to_head__PA,head_to_head__wOBA,ball_park__1B,ball_park__2B,ball_park__3B,ball_park__HR,ball_park__BB,events
16,660271,678394,2023-04-17,L,R,0,2,Top,BOS,1,0,0,3,18,672.0,0.142857,0.044643,0.008929,0.050595,0.095238,0.244048,0.013393,0.38244,0.00744,0.004464,0.0,0.373421,118.0,92.558447,0.381971,446.0,1.0,0.0,0.0,0.0,0.0,2.0,0.892,107,122,138,107,99,s
34,660271,678394,2023-04-17,L,R,1,1,Top,BOS,0,0,0,0,18,672.0,0.142857,0.044643,0.008929,0.050595,0.095238,0.244048,0.013393,0.38244,0.00744,0.004464,0.0,0.373421,118.0,92.558447,0.381971,446.0,1.0,0.0,0.0,0.0,0.0,2.0,0.892,107,122,138,107,99,s
70,660271,601713,2023-04-15,L,R,0,5,Top,BOS,0,1,0,-2,16,673.0,0.141159,0.044577,0.008915,0.054978,0.095097,0.248143,0.013373,0.375929,0.007429,0.004458,0.0,0.38063,118.0,92.7,0.391749,443.0,0.166667,0.0,0.166667,0.0,0.333333,6.0,0.4885,107,122,138,107,99,s
79,660271,601713,2023-04-15,L,R,2,2,Top,BOS,1,1,0,2,16,673.0,0.141159,0.044577,0.008915,0.054978,0.095097,0.248143,0.013373,0.375929,0.007429,0.004458,0.0,0.38063,118.0,92.7,0.391749,443.0,0.166667,0.0,0.166667,0.0,0.333333,6.0,0.4885,107,122,138,107,99,fo
90,660271,542947,2023-04-15,L,L,2,6,Top,BOS,0,1,0,0,16,673.0,0.141159,0.044577,0.008915,0.054978,0.095097,0.248143,0.013373,0.375929,0.007429,0.004458,0.0,0.38063,118.0,92.7,0.359213,230.0,0.5,0.0,0.0,0.0,0.0,2.0,0.446,107,122,138,107,99,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6516,660271,650556,2022-04-08,L,R,1,7,Bot,LAA,0,0,0,-10,-356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,3.0,0.297333,100,93,104,123,105,fo
6525,660271,543606,2022-04-08,L,R,0,1,Bot,LAA,0,0,0,-1,-356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,3.0,0.531333,100,93,104,123,105,s
6536,660271,664285,2022-04-07,L,L,2,6,Bot,LAA,0,0,0,-1,-357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.153846,0.153846,13.0,0.245231,100,93,104,123,105,s
6545,660271,664285,2022-04-07,L,L,2,3,Bot,LAA,0,0,0,-1,-357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.153846,0.153846,13.0,0.245231,100,93,104,123,105,dp


In [256]:
data[
    (data["batter"] == 660271)&
    #(data["game_date"] < "2023-04-09")&
    #(data["game_date"] >= "2022-04-09")&
    #(data["p_throws"] == "L")
    (data["pitcher"] == 650556)
]

Unnamed: 0,game_date,batter,pitcher,events,description,game_type,stand,p_throws,home_team,away_team,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,launch_speed,game_pk,bat_score,fld_score
3324,2022-07-13,660271,650556,s,hit_into_play,R,L,R,LAA,HOU,,,623205.0,1,6,Bot,103.6,663401,3,1
6138,2022-04-19,660271,650556,so,called_strike,R,L,R,HOU,LAA,,,623205.0,0,6,Top,,662927,7,1
6527,2022-04-08,660271,650556,fo,hit_into_play,R,L,R,LAA,HOU,,,,1,7,Bot,78.1,661041,2,12


In [242]:
# prepare park factors features
park_factors_df = pd.read_csv("../src/modeling/intermediate/park_factors.csv")
park_factors_df

Unnamed: 0,year_range,venue_name,name_display_club,n_pa,index_1b,index_2b,index_3b,index_hr,index_bb,team_abbre
0,2021-2023,Angel Stadium,Angels,14644,100,93,104,123,105,LAA
1,2021-2023,Oriole Park at Camden Yards,Orioles,14115,108,94,77,109,97,BAL
2,2021-2023,Fenway Park,Red Sox,15588,107,122,138,107,99,BOS
3,2021-2023,Guaranteed Rate Field,White Sox,14626,97,93,79,126,106,CWS
4,2021-2023,Progressive Field,Guardians,14028,96,100,70,103,104,CLE
5,2021-2023,Kauffman Stadium,Royals,14440,110,112,134,77,94,KC
6,2021-2023,Oakland Coliseum,Athletics,15090,99,100,96,77,98,OAK
7,2021-2023,Tropicana Field,Rays,14876,92,99,76,95,99,TB
8,2021-2023,Rogers Centre,Blue Jays,12434,97,111,59,104,90,TOR
9,2021-2023,Chase Field,D-backs,14360,107,115,125,81,96,ARI


### After generating features, remember to remove rows that are before the mv_start_date aka null values for mv

In [None]:
"SO": "so",
    "DP": "dp",
    "FO": "fo",
    "SF": "sf",
    "SH": "sh",