In [1]:
# Import dependencies

import pandas as pd
import numpy as np
import itertools
import os

In [2]:
# Load the FanGraphs split exports (downloaded directly from the site daily)
#
# Every split is exported for two time periods:
#     - "year":   beginning of the 2023 season - present
#     - "season": beginning of the 2024 season - present
#
# The splits are:
#     - Pitching stats vs. left- and right-handed batters the first time through the order
#     - Batting stats vs. left- and right-handed pitchers

split_csv_files = (
    'pitching_vs_left_year.csv',
    'pitching_vs_right_year.csv',
    'pitching_vs_left_season.csv',
    'pitching_vs_right_season.csv',
    'batting_vs_left_year.csv',
    'batting_vs_right_year.csv',
    'batting_vs_left_season.csv',
    'batting_vs_right_season.csv',
)

# Files are read in the order listed above; each name below receives the file in the same position
(
    pitching_vs_left_year_df,
    pitching_vs_right_year_df,
    pitching_vs_left_season_df,
    pitching_vs_right_season_df,
    batting_vs_left_year_df,
    batting_vs_right_year_df,
    batting_vs_left_season_df,
    batting_vs_right_season_df,
) = map(pd.read_csv, split_csv_files)

# League-wide averages from the 2024 season, used to fill in any missing data

mlb_season_df = pd.read_csv('MLB_season.csv')

In [3]:
# Create new rate columns on every split DataFrame:
#     - K rate and walk rate
#     - Singles rate (Opp_1B for pitchers, 1B_Rate for batters and the league-wide frame)
#     - Doubles, triples and home run rates
# Pitching and league-wide rates are divided by TBF; batting rates are divided by PA.


def singles_from_hits(stats_df):
    """Return singles as H minus the sum of 2B, 3B and HR."""
    return stats_df['H'] - (stats_df['2B'] + stats_df['3B'] + stats_df['HR'])


def add_rates(stats_df, denominator_column, numerators):
    """Add one rate column per entry of ``numerators`` to ``stats_df`` in place.

    stats_df: DataFrame of counting stats; it is modified in place.
    denominator_column: name of the column every numerator is divided by.
    numerators: dict of new column name -> Series of counts; columns are added in dict order.
    """
    for rate_column, counts in numerators.items():
        stats_df[rate_column] = counts / stats_df[denominator_column]


for pitching_df in (
    pitching_vs_left_year_df,
    pitching_vs_right_year_df,
    pitching_vs_left_season_df,
    pitching_vs_right_season_df,
):
    add_rates(pitching_df, 'TBF', {
        'K_Rate': pitching_df['SO'],
        'Opp_1B': singles_from_hits(pitching_df),
        'BB_Rate': pitching_df['BB'],
        '2B_Rate': pitching_df['2B'],
        '3B_Rate': pitching_df['3B'],
        'HR_Rate': pitching_df['HR'],
    })

for batting_df in (
    batting_vs_left_year_df,
    batting_vs_right_year_df,
    batting_vs_left_season_df,
    batting_vs_right_season_df,
):
    # The batting exports already carry a '1B' column, so singles are not derived from H here
    add_rates(batting_df, 'PA', {
        'K_Rate': batting_df['SO'],
        '1B_Rate': batting_df['1B'],
        'BB_Rate': batting_df['BB'],
        '2B_Rate': batting_df['2B'],
        '3B_Rate': batting_df['3B'],
        'HR_Rate': batting_df['HR'],
    })

add_rates(mlb_season_df, 'TBF', {
    'K_Rate': mlb_season_df['SO'],
    'BB_Rate': mlb_season_df['BB'],
    '1B_Rate': singles_from_hits(mlb_season_df),
    '2B_Rate': mlb_season_df['2B'],
    '3B_Rate': mlb_season_df['3B'],
    'HR_Rate': mlb_season_df['HR'],
})

# Display example pitching DataFrame (pitching stats the first time through the order vs. left-handed batters during the 2024 season)

pitching_vs_left_season_df.head()

Unnamed: 0,Season,Name,Tm,G,TBF,ERA,H,2B,3B,R,...,OBP,SLG,wOBA,playerId,K_Rate,Opp_1B,BB_Rate,2B_Rate,3B_Rate,HR_Rate
0,Total,Adam Ottavino,NYM,36,65,6.585366,15,4,1,11,...,0.369231,0.545455,0.388469,1247,0.292308,0.107692,0.092308,0.061538,0.015385,0.046154
1,Total,Matt Moore,LAA,35,63,6.6,12,1,0,12,...,0.301587,0.5,0.343422,1890,0.190476,0.095238,0.111111,0.015873,0.0,0.079365
2,Total,Clayton Kershaw,LAD,1,2,0.0,1,0,0,0,...,0.5,0.5,0.441468,2036,0.5,0.5,0.0,0.0,0.0,0.0
3,Total,Lance Lynn,STL,20,244,5.56875,56,11,1,41,...,0.352459,0.454976,0.350527,2520,0.184426,0.143443,0.114754,0.045082,0.004098,0.036885
4,Total,Kenley Jansen,BOS,31,70,3.306122,18,4,0,6,...,0.342857,0.396825,0.313414,3096,0.271429,0.185714,0.085714,0.057143,0.0,0.014286


In [4]:
# Display example batting DataFrame (batting stats vs. left-handed pitchers since the beginning of the 2023 season)
# The derived rate columns (K_Rate, 1B_Rate, BB_Rate, 2B_Rate, 3B_Rate, HR_Rate) are the last columns of the frame

batting_vs_left_year_df.head()

Unnamed: 0,Season,Name,Tm,G,PA,AB,H,1B,2B,3B,...,SB,CS,AVG,playerId,K_Rate,1B_Rate,BB_Rate,2B_Rate,3B_Rate,HR_Rate
0,Total,Miguel Cabrera,DET,47,96,83,22,17,5,0,...,0,0,0.26506,1744,0.239583,0.177083,0.114583,0.052083,0.0,0.0
1,Total,David Peralta,2 Tms,37,45,42,13,11,2,0,...,0,0,0.309524,2136,0.244444,0.244444,0.022222,0.044444,0.0,0.0
2,Total,Adam Wainwright,STL,1,1,1,0,0,0,0,...,0,0,0.0,2233,0.0,0.0,0.0,0.0,0.0,0.0
3,Total,Carlos Santana,3 Tms,128,267,238,68,42,14,0,...,2,0,0.285714,2396,0.149813,0.157303,0.097378,0.052434,0.0,0.044944
4,Total,Nelson Cruz,SDP,37,86,80,20,14,3,0,...,0,0,0.25,2434,0.325581,0.162791,0.046512,0.034884,0.0,0.034884


In [5]:
# Merge pitching DataFrames together and format columns
#
# Goal: one row per pitcher 'Name' holding the rate columns of all four pitching splits,
# named <Year|Season>_<stat>_<LHH|RHH>. Outer joins keep a pitcher who appears in any split;
# splits a pitcher is missing from are left as NaN.
# NOTE(review): the join key is the display name, not playerId — two different players who share
# a name would be merged together. Confirm names are unique in the exports.

# Step 1: year window, vs. LHH (left frame -> '_x' suffix) joined with vs. RHH (right frame -> '_y' suffix)
pitchers_combined_df = pd.merge(pitching_vs_left_year_df, pitching_vs_right_year_df, on='Name', how='outer')
# NOTE(review): 'Opp_OBP_x' / 'Opp_OBP_y' are not created anywhere in the visible cells; rename() skips
# keys that are not present, so those two entries look like no-ops — confirm.
pitchers_combined_df = pitchers_combined_df.rename(columns={'HR_Rate_x': 'Year_HR_Rate_LHH', 'HR_Rate_y': 'Year_HR_Rate_RHH','3B_Rate_x': 'Year_3B_Rate_LHH', '3B_Rate_y': 'Year_3B_Rate_RHH','2B_Rate_x': 'Year_2B_Rate_LHH', '2B_Rate_y': 'Year_2B_Rate_RHH','BB_Rate_x': 'Year_BB_Rate_LHH', 'BB_Rate_y': 'Year_BB_Rate_RHH', 'Opp_1B_x': 'Year_Opp_1B_LHH', 'Opp_1B_y': 'Year_Opp_1B_RHH', 'Opp_OBP_x': 'Year_Opp_OBP_LHH', 'Opp_OBP_y': 'Year_Opp_OBP_RHH', 'K_Rate_x' : 'Year_K%_LHH', 'K_Rate_y' : 'Year_K%_RHH'})

# Step 2: add the season window vs. LHH; its rate columns are renamed from their plain (unsuffixed) names
pitchers_combined_df = pd.merge(pitchers_combined_df, pitching_vs_left_season_df, on='Name', how='outer')
pitchers_combined_df = pitchers_combined_df.rename(columns={'HR_Rate': 'Season_HR_Rate_LHH','3B_Rate': 'Season_3B_Rate_LHH','2B_Rate': 'Season_2B_Rate_LHH','BB_Rate': 'Season_BB_Rate_LHH', 'Opp_1B': 'Season_Opp_1B_LHH', 'Opp_OBP': 'Season_Opp_OBP_LHH', 'K_Rate' : 'Season_K%_LHH'})
# Drop the suffixed raw columns left over from step 1 (drop() raises if any listed column is missing).
# 'playerId_y' is not removed here; it is listed in the final drop below.
pitchers_combined_df = pitchers_combined_df.drop(columns=['playerId_x','Season_x', 'Season_y', 'Tm_x', 'Tm_y', 'G_x', 'G_y', 'TBF_x', 'TBF_y', 'ERA_x', 'ERA_y', 'H_x', 'H_y', '2B_x', '2B_y', '3B_x', '3B_y', 'R_x', 'R_y', 'ER_x', 'ER_y', 'HR_x', 'HR_y', 'BB_x', 'BB_y', 'IBB_x', 'IBB_y', 'HBP_x', 'HBP_y', 'SO_x', 'SO_y', 'AVG_x', 'AVG_y', 'OBP_x', 'OBP_y', 'SLG_x', 'SLG_y', 'wOBA_x', 'wOBA_y'])

# Step 3: add the season window vs. RHH; its rate columns are again renamed from their plain names.
# ('Opp_1B' appears twice in the mapping below with the same target, so the repeat has no effect.)
pitchers_combined_df = pd.merge(pitchers_combined_df, pitching_vs_right_season_df, on='Name', how='outer')
pitchers_combined_df = pitchers_combined_df.rename(columns={'HR_Rate': 'Season_HR_Rate_RHH','3B_Rate': 'Season_3B_Rate_RHH','2B_Rate': 'Season_2B_Rate_RHH', 'Opp_1B': 'Season_Opp_1B_RHH', 'BB_Rate': 'Season_BB_Rate_RHH', 'Opp_1B': 'Season_Opp_1B_RHH', 'Opp_OBP': 'Season_Opp_OBP_RHH', 'K_Rate' : 'Season_K%_RHH'})
# Drop the suffixed raw columns again; at this point the '_x'/'_y' pairs should come from the two
# season-window frames (the year-window ones were already removed in step 2) — TODO confirm.
pitchers_combined_df = pitchers_combined_df.drop(columns=['playerId_y','playerId_x','Season_x', 'Season_y', 'Tm_x', 'Tm_y', 'G_x', 'G_y', 'TBF_x', 'TBF_y', 'ERA_x', 'ERA_y', 'H_x', 'H_y', '2B_x', '2B_y', '3B_x', '3B_y', 'R_x', 'R_y', 'ER_x', 'ER_y', 'HR_x', 'HR_y', 'BB_x', 'BB_y', 'IBB_x', 'IBB_y', 'HBP_x', 'HBP_y', 'SO_x', 'SO_y', 'AVG_x', 'AVG_y', 'OBP_x', 'OBP_y', 'SLG_x', 'SLG_y', 'wOBA_x', 'wOBA_y'])

In [6]:
# Merge batting DataFrames together and format columns
#
# Goal: one row per batter 'Name' holding the rate columns of all four batting splits,
# named <Year|Season>_<stat>_<LHP|RHP>. Outer joins keep a batter who appears in any split;
# splits a batter is missing from are left as NaN.
# NOTE(review): the join key is the display name, not playerId — two different players who share
# a name would be merged together. Confirm names are unique in the exports.

# Step 1: year window, vs. LHP (left frame -> '_x' suffix) joined with vs. RHP (right frame -> '_y' suffix)
batters_combined_df = pd.merge(batting_vs_left_year_df, batting_vs_right_year_df, on='Name', how='outer')
# NOTE(review): 'OBP_x' / 'OBP_y' are renamed here but never listed in the drops below. If the batting
# exports contain an OBP column, the renamed columns stay in the result; if not, rename() skips them — confirm.
batters_combined_df = batters_combined_df.rename(columns={'HR_Rate_x': 'Year_HR_Rate_LHP','HR_Rate_y': 'Year_HR_Rate_RHP','3B_Rate_x': 'Year_3B_Rate_LHP','3B_Rate_y': 'Year_3B_Rate_RHP','2B_Rate_x': 'Year_2B_Rate_LHP','2B_Rate_y': 'Year_2B_Rate_RHP', 'BB_Rate_x': 'Year_BB_Rate_LHP', 'BB_Rate_y': 'Year_BB_Rate_RHP', '1B_Rate_x': 'Year_1B_Rate_LHP', '1B_Rate_y': 'Year_1B_Rate_RHP', 'OBP_x': 'Year_OBP_LHP', 'OBP_y': 'Year_OBP_RHP', 'K_Rate_x' : 'Year_K%_LHP', 'K_Rate_y' : 'Year_K%_RHP'})

# Step 2: add the season window vs. LHP; its rate columns are renamed from their plain (unsuffixed) names
batters_combined_df = pd.merge(batters_combined_df, batting_vs_left_season_df, on='Name', how='outer')
batters_combined_df = batters_combined_df.rename(columns={'HR_Rate': 'Season_HR_Rate_LHP','3B_Rate': 'Season_3B_Rate_LHP','2B_Rate': 'Season_2B_Rate_LHP','BB_Rate': 'Season_BB_Rate_LHP', '1B_Rate': 'Season_1B_Rate_LHP', 'OBP': 'Season_OBP_LHP', 'K_Rate' : 'Season_K%_LHP'})
# Drop the suffixed raw columns left over from step 1 (drop() raises if any listed column is missing).
# 'playerId_y' is not removed here; it is listed in the final drop below.
batters_combined_df = batters_combined_df.drop(columns=['playerId_x','Season_x', 'Season_y', 'Tm_x', 'Tm_y', 'G_x', 'G_y', 'PA_x', 'PA_y', 'AB_x', 'AB_y', 'H_x', 'H_y', '1B_x', '1B_y', '2B_x', '2B_y', '3B_x', '3B_y', 'R_x', 'R_y', 'RBI_x', 'RBI_y', 'HR_x', 'HR_y', 'BB_x', 'BB_y', 'IBB_x', 'IBB_y', 'HBP_x', 'HBP_y', 'SO_x', 'SO_y', 'AVG_x', 'AVG_y', 'SF_x', 'SF_y', 'SH_x', 'SH_y', 'GDP_x', 'GDP_y', 'SB_x', 'SB_y', 'CS_x', 'CS_y'])

# Step 3: add the season window vs. RHP; its rate columns are again renamed from their plain names
batters_combined_df = pd.merge(batters_combined_df, batting_vs_right_season_df, on='Name', how='outer')
batters_combined_df = batters_combined_df.rename(columns={'HR_Rate': 'Season_HR_Rate_RHP','3B_Rate': 'Season_3B_Rate_RHP','2B_Rate': 'Season_2B_Rate_RHP','BB_Rate': 'Season_BB_Rate_RHP', '1B_Rate': 'Season_1B_Rate_RHP', 'OBP': 'Season_OBP_RHP', 'K_Rate' : 'Season_K%_RHP'})
# Drop the suffixed raw columns again; at this point the '_x'/'_y' pairs should come from the two
# season-window frames (the year-window ones were already removed in step 2) — TODO confirm.
batters_combined_df = batters_combined_df.drop(columns=['playerId_y', 'playerId_x','Season_x', 'Season_y', 'Tm_x', 'Tm_y', 'G_x', 'G_y', 'PA_x', 'PA_y', 'AB_x', 'AB_y', 'H_x', 'H_y', '1B_x', '1B_y', '2B_x', '2B_y', '3B_x', '3B_y', 'R_x', 'R_y', 'RBI_x', 'RBI_y', 'HR_x', 'HR_y', 'BB_x', 'BB_y', 'IBB_x', 'IBB_y', 'HBP_x', 'HBP_y', 'SO_x', 'SO_y', 'AVG_x', 'AVG_y', 'SF_x', 'SF_y', 'SH_x', 'SH_y', 'GDP_x', 'GDP_y', 'SB_x', 'SB_y', 'CS_x', 'CS_y'])

In [7]:
# Load the additional FanGraphs exports that carry the handedness information for each pitcher/batter

handedness_csv_files = (
    'RHH_data.csv',
    'LHH_data.csv',
    'switch_data.csv',
    'RHP_data.csv',
    'LHP_data.csv',
)

# Files are read in the order listed above; each name receives the file in the same position
RHH_df, LHH_df, switch_df, RHP_df, LHP_df = map(pd.read_csv, handedness_csv_files)

In [8]:
# Initialize Handedness columns in pitching and batting DataFrames
# The columns start out entirely missing but are created with object dtype, because they are
# filled with string codes afterwards; writing strings into a float64 NaN column relies on an
# implicit dtype change that recent pandas releases deprecate.

pitchers_combined_df['Handedness'] = pd.Series(np.nan, index=pitchers_combined_df.index, dtype=object)
batters_combined_df['Handedness'] = pd.Series(np.nan, index=batters_combined_df.index, dtype=object)

# Define a helper function to update handedness

def update_handedness(source_df, target_df, handedness):
    """Label every player of ``target_df`` who is listed in ``source_df`` with ``handedness``.

    This single generic definition replaces five copies of the same function that differed
    only in their parameter names (each redefinition replaced the previous one).

    Parameters
    ----------
    source_df : DataFrame with a 'Name' column listing the players that share ``handedness``.
    target_df : DataFrame with a 'Name' column. Its 'Handedness' column is updated in place
        and is created (all missing) if it does not exist yet.
    handedness : value written to the matching rows, e.g. 'R', 'L' or 'S'.

    Returns
    -------
    None
    """
    # Missing names are removed first so they can never match, as with an equality test against NaN
    listed_names = source_df['Name'].dropna()
    is_listed = target_df['Name'].isin(listed_names)

    # Keep the labels in an object column so that writing strings into an all-NaN float column
    # does not depend on pandas changing the column dtype implicitly
    if 'Handedness' not in target_df.columns:
        target_df['Handedness'] = pd.Series(np.nan, index=target_df.index, dtype=object)
    elif target_df['Handedness'].dtype != object:
        target_df['Handedness'] = target_df['Handedness'].astype(object)

    target_df.loc[is_listed, 'Handedness'] = handedness

# Label pitchers first (right-handed, then left-handed) ...

for listed_df, hand_code in ((RHP_df, 'R'), (LHP_df, 'L')):
    update_handedness(listed_df, pitchers_combined_df, hand_code)

# ... then batters (right-handed, left-handed, switch)

for listed_df, hand_code in ((RHH_df, 'R'), (LHH_df, 'L'), (switch_df, 'S')):
    update_handedness(listed_df, batters_combined_df, hand_code)

# Display pitching DataFrame

pitchers_combined_df.head()

Unnamed: 0,Name,Year_K%_LHH,Year_Opp_1B_LHH,Year_BB_Rate_LHH,Year_2B_Rate_LHH,Year_3B_Rate_LHH,Year_HR_Rate_LHH,Year_K%_RHH,Year_Opp_1B_RHH,Year_BB_Rate_RHH,...,Season_2B_Rate_LHH,Season_3B_Rate_LHH,Season_HR_Rate_LHH,Season_K%_RHH,Season_Opp_1B_RHH,Season_BB_Rate_RHH,Season_2B_Rate_RHH,Season_3B_Rate_RHH,Season_HR_Rate_RHH,Handedness
0,Tommy Hunter,0.236842,0.078947,0.078947,0.0,0.026316,0.131579,0.195652,0.195652,0.021739,...,,,,,,,,,,R
1,Matt Bush,0.24,0.08,0.16,0.04,0.0,0.12,0.173913,0.043478,0.086957,...,,,,,,,,,,R
2,Adam Ottavino,0.267857,0.10119,0.125,0.047619,0.005952,0.047619,0.260536,0.141762,0.08046,...,0.061538,0.015385,0.046154,0.309278,0.134021,0.072165,0.020619,0.0,0.010309,R
3,Matt Moore,0.192,0.12,0.096,0.016,0.008,0.072,0.265683,0.125461,0.088561,...,0.015873,0.0,0.079365,0.208696,0.130435,0.121739,0.034783,0.0,0.043478,L
4,Zack Greinke,0.136752,0.222222,0.059829,0.042735,0.0,0.025641,0.229167,0.159722,0.0625,...,,,,,,,,,,R


In [9]:
# Display batting DataFrame

batters_combined_df.head()

Unnamed: 0,Name,Year_K%_LHP,Year_1B_Rate_LHP,Year_BB_Rate_LHP,Year_2B_Rate_LHP,Year_3B_Rate_LHP,Year_HR_Rate_LHP,Year_K%_RHP,Year_1B_Rate_RHP,Year_BB_Rate_RHP,...,Season_2B_Rate_LHP,Season_3B_Rate_LHP,Season_HR_Rate_LHP,Season_K%_RHP,Season_1B_Rate_RHP,Season_BB_Rate_RHP,Season_2B_Rate_RHP,Season_3B_Rate_RHP,Season_HR_Rate_RHP,Handedness
0,Miguel Cabrera,0.239583,0.177083,0.114583,0.052083,0.0,0.0,0.188192,0.166052,0.073801,...,,,,0.188889,0.166667,0.074074,0.051852,0.0,0.014815,R
1,David Peralta,0.244444,0.244444,0.022222,0.044444,0.0,0.0,0.173038,0.156942,0.054326,...,0.1,0.0,0.0,0.178138,0.153846,0.052632,0.054656,0.002024,0.018219,L
2,Adam Wainwright,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,1.0,0.0,0.0,0.0,0.0,0.0,
3,Carlos Santana,0.149813,0.157303,0.097378,0.052434,0.0,0.044944,0.17456,0.112314,0.104195,...,0.059406,0.0,0.059406,0.173913,0.112772,0.103261,0.052989,0.001359,0.033967,S
4,Nelson Cruz,0.325581,0.162791,0.046512,0.034884,0.0,0.034884,0.269841,0.15873,0.031746,...,,,,0.274194,0.16129,0.032258,0.032258,0.016129,0.032258,R


In [10]:
# Pull the league-average value of each rate from the 2024 MLB season DataFrame
# (the value in the first row of each rate column)

(
    average_k_rate,
    average_BB_rate,
    average_1B_rate,
    average_2B_rate,
    average_3B_rate,
    average_HR_rate,
) = (
    mlb_season_df[rate_column].iloc[0]
    for rate_column in ('K_Rate', 'BB_Rate', '1B_Rate', '2B_Rate', '3B_Rate', 'HR_Rate')
)

In [11]:
# Game IDs for the Streamlit application: a 'Top 1' entry followed by a 'Bot 1' entry per matchup

matchups = [
    'TOR/BAL', 'NYY/PHI', 'MIA/TB', 'SEA/BOS', 'CHC/CIN', 'MIN/NYM', 'TEX/STL',
    'KC/CWS', 'PIT/HOU', 'ATL/MIL', 'COL/LAA', 'WAS/ARI', 'LAD/SD', 'OAK/SF',
]

game_ids = [
    f'{matchup} - {half} 1'
    for matchup in matchups
    for half in ('Top', 'Bot')
]

In [12]:
# Input game ID, team batting, team pitching, the ballpark in which the game is being played, starting pitcher and opposing team's lineup
# * Note * For the ballpark, input the name of the home team

def prompt_text(message):
    """Prompt with ``message`` and return the reply without leading/trailing whitespace.

    Player names are matched exactly against the 'Name' column of the combined DataFrames,
    so a stray space around a reply would make a known player look missing and send them
    to the league-average fallback.
    """
    return input(message).strip()


game_id = prompt_text("Enter the game ID: ")
team_batting = prompt_text("Enter the name of the team batting: ")
team_pitching = prompt_text("Enter the name of the team pitching: ")
ballpark = prompt_text("Enter the name of the ballpark: ")
pitcher_name = prompt_text("Enter the name of the starting pitcher: ")

# The nine lineup prompts are asked in batting order, first through ninth
(
    batter_1,
    batter_2,
    batter_3,
    batter_4,
    batter_5,
    batter_6,
    batter_7,
    batter_8,
    batter_9,
) = (
    prompt_text(f"Enter the name of the {ordinal} batter: ")
    for ordinal in ('first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth')
)

Enter the game ID: KC/CWS - Bot 1
Enter the name of the team batting: White Sox
Enter the name of the team pitching: Royals
Enter the name of the ballpark: White Sox
Enter the name of the starting pitcher: Brady Singer
Enter the name of the first batter: Nicky Lopez
Enter the name of the second batter: Miguel Vargas
Enter the name of the third batter: Andrew Vaughn
Enter the name of the fourth batter: Gavin Sheets
Enter the name of the fifth batter: Lenyn Sosa
Enter the name of the sixth batter: Andrew Benintendi
Enter the name of the seventh batter: Nick Senzel
Enter the name of the eighth batter: Dominic Fletcher
Enter the name of the ninth batter: Chuckie Robinson


In [13]:
# Input implied probabilities for expected value calculation
# input() returns text, so all three values are stored as strings here (e.g. '.3846');
# no conversion to a number happens in this cell.

implied_no_strikeouts = input("Enter the implied probability of zero strikeouts: ")
implied_no_hits = input("Enter the implied probability of zero hits: ")
implied_under_four_batters_to_plate = input("Enter the implied probability of less than four batters: ")

Enter the implied probability of zero strikeouts: .3846
Enter the implied probability of zero hits: .4762
Enter the implied probability of less than four batters: .4


In [14]:
# Collect the nine inputted batter names, in batting order, as a list of strings

batter_names = [
    str(lineup_entry)
    for lineup_entry in (
        batter_1, batter_2, batter_3,
        batter_4, batter_5, batter_6,
        batter_7, batter_8, batter_9,
    )
]

In [15]:
# Pull relevant names for combined pitching/batting DataFrame
# A player who is not found by name gets a single stand-in row of league-average rates instead


def league_average_row(name, singles_column, split_suffixes):
    """Build a stand-in stats row for ``name`` filled with the league-average rates.

    name: player name stored under the 'Name' key.
    singles_column: label of the singles-rate stat ('Opp_1B' for pitchers, '1B_Rate' for batters).
    split_suffixes: handedness suffixes to generate keys for, e.g. ('LHH', 'RHH').

    Returns a dict with 'Name' plus one '<Year|Season>_<stat>_<suffix>' key per combination,
    Year keys first. The averages are read from the module-level ``average_*`` values.
    """
    stat_averages = (
        ('K%', average_k_rate),
        (singles_column, average_1B_rate),
        ('BB_Rate', average_BB_rate),
        ('2B_Rate', average_2B_rate),
        ('3B_Rate', average_3B_rate),
        ('HR_Rate', average_HR_rate),
    )
    row = {'Name': name}
    for period in ('Year', 'Season'):
        for suffix in split_suffixes:
            for stat, league_value in stat_averages:
                row[f'{period}_{stat}_{suffix}'] = league_value
    return row


def lookup_player_rows(combined_df, name, singles_column, split_suffixes):
    """Return the rows of ``combined_df`` whose 'Name' equals ``name``.

    When no row matches, a one-row DataFrame of league averages is returned instead
    (see ``league_average_row`` for ``singles_column`` and ``split_suffixes``).
    """
    matches = combined_df[combined_df['Name'] == name]
    if not matches.empty:
        return matches
    return pd.DataFrame([league_average_row(name, singles_column, split_suffixes)])


pitcher_data = lookup_player_rows(pitchers_combined_df, pitcher_name, 'Opp_1B', ('LHH', 'RHH'))

# Collect one frame per batter and concatenate once, rather than growing a DataFrame inside the loop

batter_frames = []
for batter_name in batter_names:
    batter_data = lookup_player_rows(batters_combined_df, batter_name, '1B_Rate', ('LHP', 'RHP'))
    batter_frames.append(batter_data)

batters_data = pd.concat(batter_frames, ignore_index=True)

# Order batter data by lineup

batters_data['batting_order'] = pd.Categorical(batters_data['Name'], categories=batter_names, ordered=True)
batters_data = batters_data.sort_values('batting_order').drop(columns='batting_order')

# Concatenate data vertically to stack rows and create combined DataFrame of starting pitcher and opposing team's lineup
# (row 0 is the pitcher, the following rows are the batters)

model_data = pd.concat([pitcher_data, batters_data], axis=0, ignore_index=True)
model_data

Unnamed: 0,Name,Year_K%_LHH,Year_Opp_1B_LHH,Year_BB_Rate_LHH,Year_2B_Rate_LHH,Year_3B_Rate_LHH,Year_HR_Rate_LHH,Year_K%_RHH,Year_Opp_1B_RHH,Year_BB_Rate_RHH,...,Season_BB_Rate_LHP,Season_2B_Rate_LHP,Season_3B_Rate_LHP,Season_HR_Rate_LHP,Season_K%_RHP,Season_1B_Rate_RHP,Season_BB_Rate_RHP,Season_2B_Rate_RHP,Season_3B_Rate_RHP,Season_HR_Rate_RHP
0,Brady Singer,0.188462,0.111538,0.134615,0.065385,0.007692,0.034615,0.226316,0.173684,0.063158,...,,,,,,,,,,
1,Nicky Lopez,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.152439,0.158537,0.097561,0.034553,0.010163,0.002033
2,Miguel Vargas,,,,,,,,,,...,0.075,0.05,0.0,0.05,0.234818,0.097166,0.097166,0.040486,0.012146,0.020243
3,Andrew Vaughn,,,,,,,,,,...,0.025641,0.076923,0.0,0.038462,0.235075,0.144279,0.064677,0.044776,0.002488,0.033582
4,Gavin Sheets,,,,,,,,,,...,0.076923,0.025641,0.025641,0.025641,0.183413,0.125997,0.095694,0.044657,0.0,0.025518
5,Lenyn Sosa,,,,,,,,,,...,0.058824,0.039216,0.0,0.039216,0.25,0.134615,0.023077,0.038462,0.0,0.023077
6,Andrew Benintendi,,,,,,,,,,...,0.039216,0.039216,0.0,0.019608,0.146667,0.165333,0.082667,0.046667,0.001333,0.012
7,Nick Senzel,,,,,,,,,,...,0.117647,0.044118,0.0,0.044118,0.252577,0.110825,0.095361,0.030928,0.0,0.020619
8,Dominic Fletcher,,,,,,,,,,...,0.2,0.0,0.0,0.0,0.22973,0.182432,0.067568,0.054054,0.0,0.013514
9,Chuckie Robinson,,,,,,,,,,...,,,,,0.090909,0.181818,0.0,0.0,0.0,0.0


In [16]:
# Convert all columns to numeric, forcing errors to NaN
# NOTE(review): errors='coerce' replaces every value that cannot be parsed as a number with NaN,
# so text columns in model_data (the player names in 'Name', any 'R'/'L'/'S' codes in 'Handedness')
# are blanked by this step — confirm nothing later reads those columns from model_data.
model_data = model_data.apply(pd.to_numeric, errors='coerce')
mlb_season_df = mlb_season_df.apply(pd.to_numeric, errors='coerce')

In [17]:
# Initialize inning simulation model, running 10,000 simulations of the relevant half-inning
# Establish variables for betting lines to test against, such as the probability of zero strikeouts, zero hits, or a home run occurring in the half-inning, or over three batters coming to the plate

def simulate_inning(model_data, n_simulations=10000):
    
    no_strikeout_counts = 0
    strikeout_counts = 0
    no_hit_counts = 0
    one_hit_counts = 0
    two_hit_counts = 0
    three_hit_counts = 0
    four_plus_hit_counts = 0
    home_run_counts = 0
    no_home_run_counts = 0
    one_strikeout_counts = 0
    two_strikeout_counts = 0
    three_strikeout_counts = 0
    over_three_batters_to_plate_counts = 0
    
    # Begin to loop through each simulation, starting each half-inning with zeros across all stats and no runners on base
    
    for _ in range(n_simulations):
        
        outs = 0
        strikeouts = 0
        hits = 0
        home_runs = 0
        batters_to_plate = 0
        
        runner_on_first = False
        runner_on_second = False
        runner_on_third = False
        
        # Pull pitcher data from first row of DataFrame and start to pull batter data from second row
        
        pitcher_row_index = 0
        row_index = 1       
         
        # Define helper functions that map pitcher/batter handedness to the column suffix used for each player's stats (LHP/RHP for batters, LHH/RHH for pitchers)
        
        def get_batter_suffix(batter_handedness, pitcher_handedness):
            """Return the column suffix for the batter's stats against this pitcher.

            The result is 'LHP' when ``pitcher_handedness`` is 'L' and 'RHP' for any
            other value. ``batter_handedness`` does not influence the result; it is
            accepted so the signature mirrors ``get_pitcher_suffix``.
            """
            # Batter splits are keyed only by the side the pitcher throws from.
            if pitcher_handedness == 'L':
                return 'LHP'
            return 'RHP'

        def get_pitcher_suffix(pitcher_handedness, batter_handedness):
            """Return the column suffix for the pitcher's stats against this batter.

            'LHH' is returned for a batter marked 'L' and 'RHH' for a batter marked 'R'.
            Any other batter value (e.g. 'S' for a switch hitter) is assigned the side
            opposite the pitcher: 'LHH' when ``pitcher_handedness`` is 'R', otherwise 'RHH'.
            """
            return (
                'LHH' if batter_handedness == 'L'
                else 'RHH' if batter_handedness == 'R'
                # Neither 'L' nor 'R': pick the side opposite the pitcher's throwing hand.
                else ('LHH' if pitcher_handedness == 'R' else 'RHH')
            )

        # Extract all stats using the correct suffixes and return values as floats
        
        def extract_and_convert_batter_stats(model_data, row_index, batter_suffix):
            """Collect one batter's rate stats for the given split as a dict.

            Parameters:
                model_data: DataFrame holding the ``Year_*``/``Season_*`` rate columns
                    and a ``Handedness`` column.
                row_index: index label of the batter's row (looked up with ``.at``).
                batter_suffix: split suffix appended to each column name ('LHP' or 'RHP').

            Returns:
                dict with keys ``Year_K_rate``, ``Season_K_rate``, ``Year_BB_rate``, ...,
                ``Season_HR_rate`` (each converted to ``float``) followed by ``Handedness``.

            ``Handedness`` is returned unchanged when it is a string such as 'L', 'R' or 'S'.
            Previously it was also passed through ``float()``, which raises ``ValueError`` for
            those strings. Non-string values (e.g. NaN) are still converted to ``float``, so
            existing callers see the same result.
            """
            # (output key stem, column-name stem), in the order the keys are emitted.
            column_stems = (
                ('K_rate', 'K%'),
                ('BB_rate', 'BB_Rate'),
                ('1B_rate', '1B_Rate'),
                ('2B_rate', '2B_Rate'),
                ('3B_rate', '3B_Rate'),
                ('HR_rate', 'HR_Rate'),
            )

            stats = {}
            for key_stem, column_stem in column_stems:
                for period in ('Year', 'Season'):
                    column = f'{period}_{column_stem}_{batter_suffix}'
                    stats[f'{period}_{key_stem}'] = float(model_data.at[row_index, column])

            # Handedness is a label, not a rate: only coerce it when it is not already text.
            handedness = model_data.at[row_index, 'Handedness']
            stats['Handedness'] = handedness if isinstance(handedness, str) else float(handedness)
            return stats

        def extract_and_convert_pitcher_stats(model_data, pitcher_row_index, pitcher_suffix):
            """Collect the pitcher's rate stats for the given split as a dict.

            Parameters:
                model_data: DataFrame holding the ``Year_*``/``Season_*`` rate columns
                    and a ``Handedness`` column.
                pitcher_row_index: index label of the pitcher's row (looked up with ``.at``).
                pitcher_suffix: split suffix appended to each column name ('LHH' or 'RHH').

            Returns:
                dict with keys ``Year_K_rate``, ``Season_K_rate``, ``Year_BB_rate``, ...,
                ``Season_HR_rate`` (each converted to ``float``) followed by ``Handedness``.
                The ``*_1B_rate`` entries are read from the ``*_Opp_1B_*`` columns.

            ``Handedness`` is returned unchanged when it is a string such as 'L' or 'R'.
            Previously it was also passed through ``float()``, which raises ``ValueError`` for
            those strings. Non-string values (e.g. NaN) are still converted to ``float``, so
            existing callers see the same result.
            """
            # (output key stem, column-name stem), in the order the keys are emitted.
            # Singles allowed use the 'Opp_1B' column naming rather than '1B_Rate'.
            column_stems = (
                ('K_rate', 'K%'),
                ('BB_rate', 'BB_Rate'),
                ('1B_rate', 'Opp_1B'),
                ('2B_rate', '2B_Rate'),
                ('3B_rate', '3B_Rate'),
                ('HR_rate', 'HR_Rate'),
            )

            stats = {}
            for key_stem, column_stem in column_stems:
                for period in ('Year', 'Season'):
                    column = f'{period}_{column_stem}_{pitcher_suffix}'
                    stats[f'{period}_{key_stem}'] = float(model_data.at[pitcher_row_index, column])

            # Handedness is a label, not a rate: only coerce it when it is not already text.
            handedness = model_data.at[pitcher_row_index, 'Handedness']
            stats['Handedness'] = handedness if isinstance(handedness, str) else float(handedness)
            return stats

        # Pull pitcher handedness from first row
        
        pitcher_handedness = model_data.iloc[0]['Handedness']

        # Append relevant pitcher and batter stats to lists
        
        pitcher_stats = []
        batter_handedness = model_data.iloc[row_index]['Handedness']
        pitcher_suffix = get_pitcher_suffix(pitcher_handedness, batter_handedness)
        pitcher_stats.append(extract_and_convert_pitcher_stats(model_data, pitcher_row_index, pitcher_suffix))

        batter_stats = []
        batter_suffix = get_batter_suffix(batter_handedness, pitcher_handedness)
        batter_stats.append(extract_and_convert_batter_stats(model_data, row_index, batter_suffix))

        # Create dictionary of batting rate adjustments for left and right-handed batters based on the ballpark
        # These rates are pulled from Swish Analytics' park factors data (https://swishanalytics.com/mlb/mlb-park-factors), which tracks from 2014-onwards
        
        team_rates = {
        'Angels': {
            '1B_LH_rate': 0.95, '1B_RH_rate': 0.96, 
            '2B_LH_rate': 0.91, '2B_RH_rate': 1.02,
            '3B_LH_rate': 0.55, '3B_RH_rate': 0.95,
            'HR_LH_rate': 1.29, 'HR_RH_rate': 1.02
        }, 
        'Cardinals': {
            '1B_LH_rate': 1.01, '1B_RH_rate': 1.05, 
            '2B_LH_rate': 0.89, '2B_RH_rate': 0.89,
            '3B_LH_rate': 0.75, '3B_RH_rate': 1.10,
            'HR_LH_rate': 0.92, 'HR_RH_rate': 0.84
        },
        'Diamondbacks': {
            '1B_LH_rate': 1.05, '1B_RH_rate': 0.99, 
            '2B_LH_rate': 1.01, '2B_RH_rate': 0.95,
            '3B_LH_rate': 2.39, '3B_RH_rate': 1.52,
            'HR_LH_rate': 0.97, 'HR_RH_rate': 0.87
        },
        'Mets': {
            '1B_LH_rate': 1.01, '1B_RH_rate': 0.86, 
            '2B_LH_rate': 0.74, '2B_RH_rate': 0.88,
            '3B_LH_rate': 0.62, '3B_RH_rate': 0.70,
            'HR_LH_rate': 0.98, 'HR_RH_rate': 1.07
        },
        'Phillies': {
            '1B_LH_rate': 0.97, '1B_RH_rate': 0.98, 
            '2B_LH_rate': 0.98, '2B_RH_rate': 0.88,
            '3B_LH_rate': 1.10, '3B_RH_rate': 0.99,
            'HR_LH_rate': 1.17, 'HR_RH_rate': 1.22
        },
        'Tigers': {
            '1B_LH_rate': 0.98, '1B_RH_rate': 1.06, 
            '2B_LH_rate': 0.83, '2B_RH_rate': 1.09,
            '3B_LH_rate': 1.69, '3B_RH_rate': 1.85,
            'HR_LH_rate': 0.88, 'HR_RH_rate': 0.97
        },
        'Rockies': {
            '1B_LH_rate': 1.15, '1B_RH_rate': 1.19, 
            '2B_LH_rate': 1.12, '2B_RH_rate': 1.43,
            '3B_LH_rate': 1.91, '3B_RH_rate': 2.17,
            'HR_LH_rate': 1.22, 'HR_RH_rate': 1.21
        },
        'Dodgers': {
            '1B_LH_rate': 0.96, '1B_RH_rate': 0.99, 
            '2B_LH_rate': 1.06, '2B_RH_rate': 0.92,
            '3B_LH_rate': 0.24, '3B_RH_rate': 0.50,
            'HR_LH_rate': 1.04, 'HR_RH_rate': 1.21
        },
        'Red Sox': {
            '1B_LH_rate': 0.97, '1B_RH_rate': 0.99, 
            '2B_LH_rate': 1.59, '2B_RH_rate': 1.25,
            '3B_LH_rate': 1.19, '3B_RH_rate': 1.21,
            'HR_LH_rate': 0.82, 'HR_RH_rate': 0.97
        },
        'Rangers': {
            '1B_LH_rate': 1.04, '1B_RH_rate': 1.00, 
            '2B_LH_rate': 1.01, '2B_RH_rate': 0.96,
            '3B_LH_rate': 1.01, '3B_RH_rate': 0.98,
            'HR_LH_rate': 0.95, 'HR_RH_rate': 0.96
        },
        'Reds': {
            '1B_LH_rate': 0.99, '1B_RH_rate': 0.93, 
            '2B_LH_rate': 0.92, '2B_RH_rate': 1.08,
            '3B_LH_rate': 0.79, '3B_RH_rate': 0.63,
            'HR_LH_rate': 1.35, 'HR_RH_rate': 1.30
        },
        'White Sox': {
            '1B_LH_rate': 0.95, '1B_RH_rate': 1.03, 
            '2B_LH_rate': 0.72, '2B_RH_rate': 0.91,
            '3B_LH_rate': 0.84, '3B_RH_rate': 0.31,
            'HR_LH_rate': 1.15, 'HR_RH_rate': 1.12
        },
        'Royals': {
            '1B_LH_rate': 1.15, '1B_RH_rate': 1.03, 
            '2B_LH_rate': 1.22, '2B_RH_rate': 1.07,
            '3B_LH_rate': 1.17, '3B_RH_rate': 1.28,
            'HR_LH_rate': 0.76, 'HR_RH_rate': 0.84
        },
        'Marlins': {
            '1B_LH_rate': 0.91, '1B_RH_rate': 1.09, 
            '2B_LH_rate': 0.90, '2B_RH_rate': 1.04,
            '3B_LH_rate': 1.25, '3B_RH_rate': 0.99,
            'HR_LH_rate': 0.77, 'HR_RH_rate': 0.72
        },
        'Brewers': {
            '1B_LH_rate': 0.96, '1B_RH_rate': 0.96, 
            '2B_LH_rate': 0.91, '2B_RH_rate': 0.92,
            '3B_LH_rate': 0.82, '3B_RH_rate': 0.92,
            'HR_LH_rate': 1.08, 'HR_RH_rate': 1.14
        },
        'Astros': {
            '1B_LH_rate': 0.98, '1B_RH_rate': 1.01, 
            '2B_LH_rate': 0.91, '2B_RH_rate': 0.87,
            '3B_LH_rate': 1.27, '3B_RH_rate': 0.61,
            'HR_LH_rate': 1.05, 'HR_RH_rate': 1.10
        },
        'Nationals': {
            '1B_LH_rate': 1.01, '1B_RH_rate': 1.00, 
            '2B_LH_rate': 1.30, '2B_RH_rate': 1.04,
            '3B_LH_rate': 0.85, '3B_RH_rate': 0.83,
            'HR_LH_rate': 1.14, 'HR_RH_rate': 1.09
        },
        'Athletics': {
            '1B_LH_rate': 0.94, '1B_RH_rate': 0.96, 
            '2B_LH_rate': 0.98, '2B_RH_rate': 1.13,
            '3B_LH_rate': 0.78, '3B_RH_rate': 0.75,
            'HR_LH_rate': 0.74, 'HR_RH_rate': 0.80
        },
        'Giants': {
            '1B_LH_rate': 0.97, '1B_RH_rate': 1.05, 
            '2B_LH_rate': 1.05, '2B_RH_rate': 0.94,
            '3B_LH_rate': 1.66, '3B_RH_rate': 1.19,
            'HR_LH_rate': 0.73, 'HR_RH_rate': 0.79
        },
        'Orioles': {
            '1B_LH_rate': 0.99, '1B_RH_rate': 1.00, 
            '2B_LH_rate': 1.01, '2B_RH_rate': 0.87,
            '3B_LH_rate': 0.90, '3B_RH_rate': 0.65,
            'HR_LH_rate': 1.11, 'HR_RH_rate': 1.20
        },
        'Padres': {
            '1B_LH_rate': 0.95, '1B_RH_rate': 0.93, 
            '2B_LH_rate': 1.07, '2B_RH_rate': 0.96,
            '3B_LH_rate': 0.76, '3B_RH_rate': 0.71,
            'HR_LH_rate': 0.92, 'HR_RH_rate': 0.98
        },
        'Pirates': {
            '1B_LH_rate': 0.97, '1B_RH_rate': 0.95, 
            '2B_LH_rate': 1.27, '2B_RH_rate': 1.10,
            '3B_LH_rate': 0.75, '3B_RH_rate': 0.83,
            'HR_LH_rate': 0.93, 'HR_RH_rate': 0.79
        },
        'Guardians': {
            '1B_LH_rate': 0.99, '1B_RH_rate': 1.00, 
            '2B_LH_rate': 1.13, '2B_RH_rate': 1.02,
            '3B_LH_rate': 0.85, '3B_RH_rate': 0.88,
            'HR_LH_rate': 1.08, 'HR_RH_rate': 0.98
        },
        'Blue Jays': {
            '1B_LH_rate': 0.95, '1B_RH_rate': 0.92, 
            '2B_LH_rate': 0.99, '2B_RH_rate': 1.02,
            '3B_LH_rate': 0.86, '3B_RH_rate': 1.03,
            'HR_LH_rate': 1.21, 'HR_RH_rate': 1.12
        },
        'Mariners': {
            '1B_LH_rate': 1.01, '1B_RH_rate': 0.95, 
            '2B_LH_rate': 0.86, '2B_RH_rate': 0.83,
            '3B_LH_rate': 0.50, '3B_RH_rate': 0.75,
            'HR_LH_rate': 0.89, 'HR_RH_rate': 1.04
        },
        'Twins': {
            '1B_LH_rate': 1.03, '1B_RH_rate': 0.94, 
            '2B_LH_rate': 1.03, '2B_RH_rate': 1.22,
            '3B_LH_rate': 1.40, '3B_RH_rate': 0.73,
            'HR_LH_rate': 0.89, 'HR_RH_rate': 0.86
        },
        'Rays': {
            '1B_LH_rate': 0.97, '1B_RH_rate': 0.96, 
            '2B_LH_rate': 0.85, '2B_RH_rate': 1.01,
            '3B_LH_rate': 1.32, '3B_RH_rate': 1.22,
            'HR_LH_rate': 0.94, 'HR_RH_rate': 0.86
        },
        'Braves': {
            '1B_LH_rate': 0.99, '1B_RH_rate': 1.09, 
            '2B_LH_rate': 1.04, '2B_RH_rate': 1.03,
            '3B_LH_rate': 0.69, '3B_RH_rate': 0.91,
            'HR_LH_rate': 0.90, 'HR_RH_rate': 0.93
        },
        'Cubs': {
            '1B_LH_rate': 1.03, '1B_RH_rate': 0.99, 
            '2B_LH_rate': 0.98, '2B_RH_rate': 1.01,
            '3B_LH_rate': 1.18, '3B_RH_rate': 1.56,
            'HR_LH_rate': 0.83, 'HR_RH_rate': 0.98
        },
        'Yankees': {
            '1B_LH_rate': 1.06, '1B_RH_rate': 1.05, 
            '2B_LH_rate': 0.89, '2B_RH_rate': 0.85,
            '3B_LH_rate': 0.53, '3B_RH_rate': 1.36,
            'HR_LH_rate': 1.09, 'HR_RH_rate': 1.02
        }}

        # Use itertools to iterate through both batter and pitcher stat lists
        # Iterate through each at-bat to determine the handedness of the batter and relevant rates for both the pitcher and batter
        # Adjust for the fact that teams play half of their games in their home ballpark (i.e. the home team's rates receive less of the ballpark adjustment because it is already baked in)
        
        for batter_stat, pitcher_stat in itertools.product(batter_stats, pitcher_stats):
  
            if ballpark == team_batting:
                if batter_stat['Handedness'] == 'R' or (batter_stat['Handedness'] == 'S' and pitcher_stats['Handedness'] == 'L'):
                    batter_stat['Year_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_RH_rate'] + 0.5)
                    batter_stat['Season_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_RH_rate'] + 0.5)
                    batter_stat['Year_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_RH_rate'] + 0.5)
                    batter_stat['Season_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_RH_rate'] + 0.5)
                    batter_stat['Year_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_RH_rate'] + 0.5)
                    batter_stat['Season_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_RH_rate'] + 0.5)
                    batter_stat['Year_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_RH_rate'] + 0.5)
                    batter_stat['Season_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_RH_rate'] + 0.5)

                    pitcher_stat['Year_1B_rate'] *= (1 + (team_rates[team_pitching]['1B_RH_rate'] - team_rates[ballpark]['1B_RH_rate']) * 0.5)
                    pitcher_stat['Season_1B_rate'] *= (1 + (team_rates[team_pitching]['1B_RH_rate'] - team_rates[ballpark]['1B_RH_rate']) * 0.5)
                    pitcher_stat['Year_2B_rate'] *= (1 + (team_rates[team_pitching]['2B_RH_rate'] - team_rates[ballpark]['2B_RH_rate']) * 0.5)
                    pitcher_stat['Season_2B_rate'] *= (1 + (team_rates[team_pitching]['2B_RH_rate'] - team_rates[ballpark]['2B_RH_rate']) * 0.5)
                    pitcher_stat['Year_3B_rate'] *= (1 + (team_rates[team_pitching]['3B_RH_rate'] - team_rates[ballpark]['3B_RH_rate']) * 0.5)
                    pitcher_stat['Season_3B_rate'] *= (1 + (team_rates[team_pitching]['3B_RH_rate'] - team_rates[ballpark]['3B_RH_rate']) * 0.5)
                    pitcher_stat['Year_HR_rate'] *= (1 + (team_rates[team_pitching]['HR_RH_rate'] - team_rates[ballpark]['HR_RH_rate']) * 0.5)
                    pitcher_stat['Season_HR_rate'] *= (1 + (team_rates[team_pitching]['HR_RH_rate'] - team_rates[ballpark]['HR_RH_rate']) * 0.5)

                else:
                    batter_stat['Year_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_LH_rate'] + 0.5)
                    batter_stat['Season_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_LH_rate'] + 0.5)
                    batter_stat['Year_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_LH_rate'] + 0.5)
                    batter_stat['Season_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_LH_rate'] + 0.5)
                    batter_stat['Year_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_LH_rate'] + 0.5)
                    batter_stat['Season_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_LH_rate'] + 0.5)
                    batter_stat['Year_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_LH_rate'] + 0.5)
                    batter_stat['Season_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_LH_rate'] + 0.5)

                    pitcher_stat['Year_1B_rate'] *= (1 + (team_rates[team_pitching]['1B_LH_rate'] - team_rates[ballpark]['1B_LH_rate']) * 0.5)
                    pitcher_stat['Season_1B_rate'] *= (1 + (team_rates[team_pitching]['1B_LH_rate'] - team_rates[ballpark]['1B_LH_rate']) * 0.5)
                    pitcher_stat['Year_2B_rate'] *= (1 + (team_rates[team_pitching]['2B_LH_rate'] - team_rates[ballpark]['2B_LH_rate']) * 0.5)
                    pitcher_stat['Season_2B_rate'] *= (1 + (team_rates[team_pitching]['2B_LH_rate'] - team_rates[ballpark]['2B_LH_rate']) * 0.5)
                    pitcher_stat['Year_3B_rate'] *= (1 + (team_rates[team_pitching]['3B_LH_rate'] - team_rates[ballpark]['3B_LH_rate']) * 0.5)
                    pitcher_stat['Season_3B_rate'] *= (1 + (team_rates[team_pitching]['3B_LH_rate'] - team_rates[ballpark]['3B_LH_rate']) * 0.5)
                    pitcher_stat['Year_HR_rate'] *= (1 + (team_rates[team_pitching]['HR_LH_rate'] - team_rates[ballpark]['HR_LH_rate']) * 0.5)
                    pitcher_stat['Season_HR_rate'] *= (1 + (team_rates[team_pitching]['HR_LH_rate'] - team_rates[ballpark]['HR_LH_rate']) * 0.5)

            else:
                if batter_stat['Handedness'] == 'R' or (batter_stat['Handedness'] == 'S' and pitcher_stats['Handedness'] == 'L'):
                    batter_stat['Year_1B_rate'] *= (1 + (team_rates[team_batting]['1B_RH_rate'] - team_rates[ballpark]['1B_RH_rate']) * 0.5)
                    batter_stat['Season_1B_rate'] *= (1 + (team_rates[team_batting]['1B_RH_rate'] - team_rates[ballpark]['1B_RH_rate']) * 0.5)
                    batter_stat['Year_2B_rate'] *= (1 + (team_rates[team_batting]['2B_RH_rate'] - team_rates[ballpark]['2B_RH_rate']) * 0.5)
                    batter_stat['Season_2B_rate'] *= (1 + (team_rates[team_batting]['2B_RH_rate'] - team_rates[ballpark]['2B_RH_rate']) * 0.5)
                    batter_stat['Year_3B_rate'] *= (1 + (team_rates[team_batting]['3B_RH_rate'] - team_rates[ballpark]['3B_RH_rate']) * 0.5)
                    batter_stat['Season_3B_rate'] *= (1 + (team_rates[team_batting]['3B_RH_rate'] - team_rates[ballpark]['3B_RH_rate']) * 0.5)
                    batter_stat['Year_HR_rate'] *= (1 + (team_rates[team_batting]['HR_RH_rate'] - team_rates[ballpark]['HR_RH_rate']) * 0.5)
                    batter_stat['Season_HR_rate'] *= (1 + (team_rates[team_batting]['HR_RH_rate'] - team_rates[ballpark]['HR_RH_rate']) * 0.5)

                    pitcher_stat['Year_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_RH_rate'] + 0.5)
                    pitcher_stat['Season_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_RH_rate'] + 0.5)
                    pitcher_stat['Year_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_RH_rate'] + 0.5)
                    pitcher_stat['Season_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_RH_rate'] + 0.5)
                    pitcher_stat['Year_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_RH_rate'] + 0.5)
                    pitcher_stat['Season_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_RH_rate'] + 0.5)
                    pitcher_stat['Year_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_RH_rate'] + 0.5)
                    pitcher_stat['Season_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_RH_rate'] + 0.5)

                else:
                    batter_stat['Year_1B_rate'] *= (1 + (team_rates[team_batting]['1B_LH_rate'] - team_rates[ballpark]['1B_LH_rate']) * 0.5)
                    batter_stat['Season_1B_rate'] *= (1 + (team_rates[team_batting]['1B_LH_rate'] - team_rates[ballpark]['1B_LH_rate']) * 0.5)
                    batter_stat['Year_2B_rate'] *= (1 + (team_rates[team_batting]['2B_LH_rate'] - team_rates[ballpark]['2B_LH_rate']) * 0.5)
                    batter_stat['Season_2B_rate'] *= (1 + (team_rates[team_batting]['2B_LH_rate'] - team_rates[ballpark]['2B_LH_rate']) * 0.5)
                    batter_stat['Year_3B_rate'] *= (1 + (team_rates[team_batting]['3B_LH_rate'] - team_rates[ballpark]['3B_LH_rate']) * 0.5)
                    batter_stat['Season_3B_rate'] *= (1 + (team_rates[team_batting]['3B_LH_rate'] - team_rates[ballpark]['3B_LH_rate']) * 0.5)
                    batter_stat['Year_HR_rate'] *= (1 + (team_rates[team_batting]['HR_LH_rate'] - team_rates[ballpark]['HR_LH_rate']) * 0.5)
                    batter_stat['Season_HR_rate'] *= (1 + (team_rates[team_batting]['HR_LH_rate'] - team_rates[ballpark]['HR_LH_rate']) * 0.5)

                    pitcher_stat['Year_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_LH_rate'] + 0.5)
                    pitcher_stat['Season_1B_rate'] *= (0.5 * team_rates[ballpark]['1B_LH_rate'] + 0.5)
                    pitcher_stat['Year_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_LH_rate'] + 0.5)
                    pitcher_stat['Season_2B_rate'] *= (0.5 * team_rates[ballpark]['2B_LH_rate'] + 0.5)
                    pitcher_stat['Year_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_LH_rate'] + 0.5)
                    pitcher_stat['Season_3B_rate'] *= (0.5 * team_rates[ballpark]['3B_LH_rate'] + 0.5)
                    pitcher_stat['Year_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_LH_rate'] + 0.5)
                    pitcher_stat['Season_HR_rate'] *= (0.5 * team_rates[ballpark]['HR_LH_rate'] + 0.5)

            # Define a function that weights the current season's stats at 60% and the stats since the beginning of 2023 at 40%
            # This places slightly more emphasis on how players have performed recently
            
            def get_weighted_stat(year_stat, season_stat, year_weight=0.4, season_weight=0.6):
                """Blend the longer-window ("year") stat with the current-season stat.

                Each stat is multiplied by its weight (0.4 and 0.6 by default) and the two
                products are added. A NaN in either input makes the result NaN.
                """
                year_component = year_stat * year_weight
                season_component = season_stat * season_weight
                return year_component + season_component

            # Define a function to weigh batter and pitcher stats equally, filling in any NaN values with league averages
            
            def get_combined_stat(batter_stat, pitcher_stat, average_stat, batter_weight=0.5, pitcher_weight=0.5):
                """Blend a batter rate with a pitcher rate, substituting a fallback for NaN.

                Either input that is NaN (per ``np.isnan``) is replaced by ``average_stat``
                before the weighted sum is taken. Weights default to 0.5 each.
                """
                batter_value = average_stat if np.isnan(batter_stat) else batter_stat
                pitcher_value = average_stat if np.isnan(pitcher_stat) else pitcher_stat
                return (batter_value * batter_weight) + (pitcher_value * pitcher_weight)

            # Use the `get_weighted_stat` function to calculate combined stats for batters
            
            combined_batter_k_rate = get_weighted_stat(batter_stat['Year_K_rate'], batter_stat['Season_K_rate'])
            combined_batter_BB_rate = get_weighted_stat(batter_stat['Year_BB_rate'], batter_stat['Season_BB_rate'])
            combined_batter_1B_rate = get_weighted_stat(batter_stat['Year_1B_rate'], batter_stat['Season_1B_rate'])
            combined_batter_2B_rate = get_weighted_stat(batter_stat['Year_2B_rate'], batter_stat['Season_2B_rate'])
            combined_batter_3B_rate = get_weighted_stat(batter_stat['Year_3B_rate'], batter_stat['Season_3B_rate'])
            combined_batter_HR_rate = get_weighted_stat(batter_stat['Year_HR_rate'], batter_stat['Season_HR_rate'])

            # Use the `get_weighted_stat` function to calculate combined stats for pitchers
            
            combined_pitcher_k_rate = get_weighted_stat(pitcher_stat['Year_K_rate'], pitcher_stat['Season_K_rate'])
            combined_pitcher_BB_rate = get_weighted_stat(pitcher_stat['Year_BB_rate'], pitcher_stat['Season_BB_rate'])
            combined_pitcher_1B_rate = get_weighted_stat(pitcher_stat['Year_1B_rate'], pitcher_stat['Season_1B_rate'])
            combined_pitcher_2B_rate = get_weighted_stat(pitcher_stat['Year_2B_rate'], pitcher_stat['Season_2B_rate'])
            combined_pitcher_3B_rate = get_weighted_stat(pitcher_stat['Year_3B_rate'], pitcher_stat['Season_3B_rate'])
            combined_pitcher_HR_rate = get_weighted_stat(pitcher_stat['Year_HR_rate'], pitcher_stat['Season_HR_rate'])

            # Combine batter and pitcher stats with equal weights
            
            combined_k_rate = get_combined_stat(combined_batter_k_rate, combined_pitcher_k_rate, average_k_rate)
            combined_BB_rate = get_combined_stat(combined_batter_BB_rate, combined_pitcher_BB_rate, average_BB_rate)
            combined_1B_rate = get_combined_stat(combined_batter_1B_rate, combined_pitcher_1B_rate, average_1B_rate)
            combined_2B_rate = get_combined_stat(combined_batter_2B_rate, combined_pitcher_2B_rate, average_2B_rate)
            combined_3B_rate = get_combined_stat(combined_batter_3B_rate, combined_pitcher_3B_rate, average_3B_rate)
            combined_HR_rate = get_combined_stat(combined_batter_HR_rate, combined_pitcher_HR_rate, average_HR_rate)
            
            # Create a combined rate for the amount of balls put in play (i.e. everything besides strikeouts and walks)
            
            combined_in_play_rate = 1 - (combined_k_rate + combined_BB_rate)
            
            # Calculate total rate

            total_rate = combined_k_rate + combined_BB_rate + combined_in_play_rate
            
            # Handle possible division by zero
            
            if total_rate == 0:
                combined_k_rate = average_k_rate
                combined_BB_rate = average_BB_rate
                combined_1B_rate = average_1B_rate
                combined_2B_rate = average_2B_rate
                combined_3B_rate = average_3B_rate
                combined_HR_rate = average_HR_rate
                
            # Otherwise, normalize each rate by dividing it by the total rate
            
            else:
                combined_k_rate /= total_rate
                combined_BB_rate /= total_rate
                combined_1B_rate /= total_rate
                combined_2B_rate /= total_rate
                combined_3B_rate /= total_rate
                combined_HR_rate /= total_rate

        # As long as three outs have not been made, continue the loop
        
        while outs < 3:
            
            # Additional batter comes to the plate
            
            batters_to_plate += 1
            
            # Generate a random number to determine the outcome of the at-bat
            
            outcome = np.random.rand()
            
            # Determine if a strikeout occurs

            if outcome < combined_k_rate:
                strikeouts += 1
                outs += 1

            # Determine if a walk occurs and adjust baserunners accordingly
            
            elif outcome < combined_k_rate + combined_BB_rate:
                if runner_on_first and runner_on_second and runner_on_third:
                    runner_on_first = True
                    runner_on_second = True
                    runner_on_third = True
                elif runner_on_first and runner_on_second:
                    runner_on_first = True
                    runner_on_second = True
                    runner_on_third = True
                elif runner_on_first:
                    runner_on_first = True
                    runner_on_second = True
                    runner_on_third = False
                else:
                    runner_on_first = True
                    runner_on_second = False
                    runner_on_third = False

            else:
                
                # Calculate the range of probabilities for hits using the combined in-play rate
                
                in_play_outcome = (outcome - combined_k_rate - combined_BB_rate) / combined_in_play_rate

                # Determine if a single occurs and adjust hits and baserunners accordingly
                
                if in_play_outcome < combined_1B_rate / combined_in_play_rate:
                    hits += 1
                    if runner_on_first and runner_on_second and runner_on_third:
                        runner_on_first = True
                        runner_on_second = True
                        runner_on_third = True
                    elif runner_on_first and runner_on_second:
                        runner_on_first = True
                        runner_on_second = True
                        runner_on_third = True
                    elif runner_on_first:
                        runner_on_first = True
                        runner_on_second = True
                        runner_on_third = False
                    else:
                        runner_on_first = True
                        runner_on_second = False
                        runner_on_third = False
                        
                # Determine if a double occurs and adjust hits and baserunners accordingly
                
                elif in_play_outcome < (combined_1B_rate + combined_2B_rate) / combined_in_play_rate:
                    hits += 1
                    if runner_on_first and runner_on_second and runner_on_third:
                        runner_on_first = False
                        runner_on_second = True
                        runner_on_third = True
                    elif runner_on_first and runner_on_second:
                        runner_on_first = False
                        runner_on_second = True
                        runner_on_third = True
                    elif runner_on_first:
                        runner_on_first = False
                        runner_on_second = True
                        runner_on_third = True
                    else:
                        runner_on_first = False
                        runner_on_second = True
                        runner_on_third = False
                        
                # Determine if a triple occurs and adjust hits and baserunners accordingly
                
                elif in_play_outcome < (combined_1B_rate + combined_2B_rate + combined_3B_rate) / combined_in_play_rate:
                    hits += 1
                    runner_on_first = False
                    runner_on_second = False
                    runner_on_third = True
                    
                # Determine if a home run occurs and adjust hits, home runs and baserunners accordingly
                
                elif in_play_outcome < (combined_1B_rate + combined_2B_rate + combined_3B_rate + combined_HR_rate) / combined_in_play_rate:
                    hits += 1
                    home_runs +=1
                    runner_on_first = False
                    runner_on_second = False
                    runner_on_third = False
                    
                # Otherwise the ball in play is an out: with a runner on first and fewer than two outs, apply an average double-play rate of 6.4%
                
                else:
                    if runner_on_first and outs < 2:
                        if np.random.rand() < 0.064:
                            outs += 2
                            runner_on_first = False
                            runner_on_second = False
                            runner_on_third = False
                            
                        # If not a double play, outcome must be one out 
                            
                        else:
                            outs += 1
                            if runner_on_first and runner_on_second and runner_on_third:
                                runner_on_first = True
                                runner_on_second = True
                                runner_on_third = True
                            elif runner_on_first and runner_on_second:
                                runner_on_first = True
                                runner_on_second = True
                                runner_on_third = False
                            elif runner_on_first:
                                runner_on_first = True
                                runner_on_second = False
                                runner_on_third = False
                            else:
                                runner_on_first = False
                                runner_on_second = False
                                runner_on_third = False
                    else:
                        outs += 1
                        
            # Add one to batter row index before looping back through, starting over after nine batters have appeared
            
            row_index = (row_index + 1) % 9             
            
        # Update stats based on outcome of half-inning
        
        if batters_to_plate > 3:
            over_three_batters_to_plate_counts += 1
        if strikeouts == 0:
            no_strikeout_counts += 1
        elif strikeouts == 1:
            one_strikeout_counts += 1
        elif strikeouts == 2:
            two_strikeout_counts += 1
        elif strikeouts == 3:
            three_strikeout_counts += 1
        else:
            strikeout_counts += 1
        if home_runs >= 1:
            home_run_counts += 1
        else:
            no_home_run_counts += 1
        if hits == 0:
            no_hit_counts += 1
        elif hits == 1:
            one_hit_counts += 1
        elif hits == 2:
            two_hit_counts += 1
        elif hits == 3:
            three_hit_counts += 1
        else:
            four_plus_hit_counts +=1

        # Return rate of occurrences in 10,000 simulations
        
        probability_no_strikeouts = no_strikeout_counts / n_simulations
        probability_no_hits = no_hit_counts / n_simulations
        probability_one_hit = one_hit_counts / n_simulations
        probability_two_hits = two_hit_counts / n_simulations
        probability_three_hits = three_hit_counts / n_simulations
        probability_home_run = home_run_counts / n_simulations
        probability_one_strikeout = one_strikeout_counts / n_simulations
        probability_two_strikeouts = two_strikeout_counts / n_simulations
        probability_three_strikeouts = three_strikeout_counts / n_simulations
        probability_under_four_batters_to_plate = 1 - (over_three_batters_to_plate_counts / n_simulations)

    return probability_no_strikeouts, probability_one_strikeout, probability_two_strikeouts, probability_three_strikeouts, probability_no_hits, probability_one_hit, probability_two_hits, probability_three_hits, probability_home_run, probability_under_four_batters_to_plate

# Run the simulation and store the estimated probability of each outcome in decimal form
# (names are unpacked in the same order that simulate_inning returns them)

(
    probability_no_strikeouts,
    probability_one_strikeout,
    probability_two_strikeouts,
    probability_three_strikeouts,
    probability_no_hits,
    probability_one_hit,
    probability_two_hits,
    probability_three_hits,
    probability_home_run,
    probability_under_four_batters_to_plate,
) = simulate_inning(model_data)

In [18]:
# Calculate "expected value" for each market based on betting odds
# Note: each value is the simulated probability minus the implied probability taken
# from the odds (i.e. the model's edge), not a payout-weighted expected value

def _edge_over_implied(model_probability, implied_probability):
    """Return the model probability minus the implied probability as a plain float."""
    return float(model_probability - float(implied_probability))

ev_no_strikeouts = _edge_over_implied(probability_no_strikeouts, implied_no_strikeouts)
ev_no_hits = _edge_over_implied(probability_no_hits, implied_no_hits)
ev_under_four_batters_to_plate = _edge_over_implied(
    probability_under_four_batters_to_plate,
    implied_under_four_batters_to_plate,
)

In [21]:
# Save model output into a dictionary (one record per game, keyed by CSV column name)

model_output = {
    'game_id': game_id,
    'probability_no_strikeouts': probability_no_strikeouts,
    'implied_probability_no_strikeouts': implied_no_strikeouts,
    'ev_no_strikeouts': ev_no_strikeouts,
    'probability_no_hits': probability_no_hits,
    'implied_probability_no_hits': implied_no_hits,
    'ev_no_hits': ev_no_hits,
    'probability_under_four_batters_to_plate': probability_under_four_batters_to_plate,
    'implied_probability_under_four_batters_to_plate': implied_under_four_batters_to_plate,
    'ev_under_four_batters_to_plate': ev_under_four_batters_to_plate,
}

# Convert new data to a single-row DataFrame

new_row_df = pd.DataFrame([model_output])

# Define the CSV path

csv_path = 'model_outputs.csv'

# Read existing CSV file and append the new row, or start from the new row alone if the file doesn't exist
# (the empty placeholder frame is deliberately kept out of pd.concat: concatenating zero-row frames
# is deprecated in recent pandas and raises a FutureWarning)

if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
    updated_df = pd.concat([existing_df, new_row_df], ignore_index=True)
else:
    existing_df = pd.DataFrame(columns=new_row_df.columns)
    updated_df = new_row_df.copy()

# Ensure no duplicates: re-running the model for a game replaces its earlier row
# NOTE(review): game_id values read back from the CSV may have a different dtype than the in-memory
# game_id (e.g. int vs. str), in which case they would not match here - confirm against how game_id is built

updated_df = updated_df.drop_duplicates(subset=['game_id'], keep='last')

# Save updated DataFrame to CSV
# (no read-back is needed afterwards: given a path, to_csv opens, writes and closes the file itself)

updated_df.to_csv(csv_path, index=False)

# Refresh the file's access/modified timestamps; this is intended to prompt Google Drive to sync the updated file

os.utime(csv_path, None)

In [20]:
# Report the simulated outcome probabilities, then the edge versus the implied odds,
# one line per figure (a single print call with a newline separator)
print(
    f"Estimated Probability of Zero Strikeouts: {probability_no_strikeouts:.4f}",
    f"Estimated Probability of One Strikeout: {probability_one_strikeout:.4f}",
    f"Estimated Probability of Two Strikeouts: {probability_two_strikeouts:.4f}",
    f"Estimated Probability of Three Strikeouts: {probability_three_strikeouts:.4f}",
    f"Estimated Probability of Zero Hits: {probability_no_hits:.4f}",
    f"Estimated Probability of One Hit: {probability_one_hit:.4f}",
    f"Estimated Probability of Two Hits: {probability_two_hits:.4f}",
    f"Estimated Probability of Three Hits: {probability_three_hits:.4f}",
    f"Estimated Probability of a Home Run: {probability_home_run:.4f}",
    f"Estimated Probability of less than Four Batters: {probability_under_four_batters_to_plate:.4f}",
    "-------------------------------------------------",
    f"Expected Value of Zero Strikeouts: {ev_no_strikeouts:.4f}",
    f"Expected Value of Zero Hits: {ev_no_hits:.4f}",
    f"Expected Value of less than Four Batters: {ev_under_four_batters_to_plate:.4f}",
    sep="\n",
)

Estimated Probability of Zero Strikeouts: 0.3849
Estimated Probability of One Strikeout: 0.4313
Estimated Probability of Two Strikeouts: 0.1635
Estimated Probability of Three Strikeouts: 0.0203
Estimated Probability of Zero Hits: 0.4615
Estimated Probability of One Hit: 0.3101
Estimated Probability of Two Hits: 0.1443
Estimated Probability of Three Hits: 0.0559
Estimated Probability of a Home Run: 0.0238
Estimated Probability of less than Four Batters: 0.3784
-------------------------------------------------
Expected Value of Zero Strikeouts: 0.0003
Expected Value of Zero Hits: -0.0147
Expected Value of less than Four Batters: -0.0216
