In [4]:
import pandas as pd
import numpy as np
import itertools
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [5]:
# Read the safe-probability table from disk and preview its first five rows.
final_probs = pd.read_csv(filepath_or_buffer='final_safe_probabilities.csv')
final_probs.head(n=5)

Unnamed: 0,game_str,play_id,timestamp,player_position,position,play_type,home_team,away_team,top_bottom_inning,pitcher,catcher,first_base,second_base,third_base,shortstop,left_field,center_field,right_field,batter,first_baserunner,second_baserunner,third_baserunner,field_x_7,field_y_7,field_x_8,field_y_8,field_x_9,field_y_9,field_x_10,field_y_10,field_x_11,field_y_11,field_x_12,field_y_12,ball_position_x,ball_position_y,ball_position_z,runner_id,from_base,to_base,attempted,was_safe,runner_x,runner_y,runner_dist_to_base,fielder_x,fielder_y,fielder_dist_to_base,safe_probability
0,y1_d001_CGA_QEA,108,4098455,8,center field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1824,CGA-1929,CGA-1592,CGA-2010,-110.8791,230.5518,94.4808,338.481,65.4084,291.4254,35.7615,110.5899,-44.5356,100.8936,-62.0019,65.0769,95.3448,339.444,0.0,10,Home,2B,True,True,35.7615,110.5899,39.346871,94.4808,338.481,231.626499,0.995977
1,y1_d001_CGA_QEA,114,4231155,9,right field,ball acquired,QEA,CGA,top,QEA-0404,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1353,CGA-1824,CGA-1929,CGA-1592,-121.0008,236.3949,28.4658,327.723,139.5966,299.0814,61.1802,61.2882,,,,,138.1734,295.9152,5.19453,10,Home,2B,True,True,61.1802,61.2882,89.783392,139.5966,299.0814,221.583436,0.928451
2,y1_d001_CGA_QEA,216,8333280,9,right field,ball acquired,QEA,CGA,top,QEA-0152,QEA-0071,QEA-0263,QEA-0365,QEA-0180,QEA-0027,QEA-0249,QEA-0235,QEA-0252,CGA-1929,CGA-1592,CGA-1547,,-119.2062,233.3106,-29.9289,274.1229,188.1207,245.4093,56.9868,46.3326,,,-50.5641,38.5989,188.0088,246.3627,0.0,10,Home,2B,True,True,56.9868,46.3326,98.76601,188.1207,245.4093,222.283963,0.884478
3,y1_d002_CGA_QEA,92,3087421,7,left field,ball acquired,QEA,CGA,bottom,CGA-2074,CGA-1127,CGA-1592,CGA-1824,CGA-1198,CGA-2010,CGA-1353,CGA-1458,CGA-1547,QEA-0252,QEA-0235,QEA-0120,,-200.5506,236.1,-61.6011,276.9882,113.8962,207.5649,57.1884,47.2554,-6.3957,122.2593,-57.114,67.5888,-201.4455,237.1629,0.0,10,Home,2B,True,True,57.1884,47.2554,98.131108,-200.5506,236.1,228.305395,0.887637
4,y1_d003_CGA_QEA,92,3994106,9,right field,ball acquired,QEA,CGA,bottom,CGA-1302,CGA-1503,CGA-1592,CGA-1458,CGA-1166,CGA-2010,CGA-1824,CGA-1547,CGA-1198,QEA-0249,QEA-0365,,,-136.9167,241.8789,51.5217,335.9463,69.6903,327.1002,59.493,72.1209,,,,,67.7784,325.818,0.011808,10,Home,2B,True,True,59.493,72.1209,80.939068,69.6903,327.1002,211.888716,0.956541


In [6]:
# RE24 run expectancy with 1 out, keyed by a three-character base-occupancy
# string ('1' = occupied) whose positions follow `base_index` below.
re_1_out = {
    '000': 0.27,
    '100': 0.49,
    '010': 0.63,
    '001': 0.98,
    '110': 0.88,
    '101': 1.23,
    '011': 1.39,
    '111': 1.64
}

# Slot of each base inside the occupancy string: first, second, third.
base_index = {'1B': 0, '2B': 1, '3B': 2}


def compute_ev_from_paths(attempts_dict, re_table=re_1_out):
    """Return the expected value over every success/failure combination of the attempts.

    Parameters
    ----------
    attempts_dict : dict
        Maps an advancement written as "FROM->TO" (for example "1B->3B" or
        "2B->Home") to the probability that the attempt succeeds.
    re_table : dict, optional
        Run-expectancy lookup keyed by occupancy string. Defaults to
        `re_1_out`. An occupancy string missing from the table counts as 0.

    Returns
    -------
    number
        Sum over all 2**n outcomes of
        outcome probability * (run expectancy of the resulting bases + runs scored).

    Raises
    ------
    ValueError
        If a key does not split into exactly two parts on "->".
    KeyError
        If a destination is neither "Home" nor a key of `base_index`.

    Notes
    -----
    Bases start empty for every outcome, so only runners listed in
    `attempts_dict` are represented. A failed attempt leaves no runner on base.
    NOTE(review): the same `re_table` is applied no matter how many attempts
    fail; presumably a failed attempt is an out, so confirm whether a
    higher-out table should be used for those outcomes.
    """
    paths = list(attempts_dict.keys())
    expected_value = 0

    # Each tuple of flags is one scenario: 1 = that attempt succeeds, 0 = it fails.
    for flags in itertools.product([0, 1], repeat=len(paths)):
        probability = 1
        runs = 0
        occupancy = ['0', '0', '0']

        for path, made_it in zip(paths, flags):
            p_safe = attempts_dict[path]
            if made_it:
                probability *= p_safe
            else:
                probability *= (1 - p_safe)

            # Only the destination matters for the resulting state.
            _origin, destination = path.split("->")

            if not made_it:
                continue
            if destination == "Home":
                runs += 1
            else:
                occupancy[base_index[destination]] = '1'

        state_value = re_table.get(''.join(occupancy), 0)
        expected_value += probability * (state_value + runs)

    return expected_value


# Worked example: one runner tries first-to-third, another tries to score from second.
example_attempts = {"1B->3B": 0.65, "2B->Home": 0.85}

ev = compute_ev_from_paths(example_attempts, re_table=re_1_out)
ev

1.5815

In [7]:
def simulate_all_actions(group, re_table=re_1_out):
    """Compute the expected value of each of the three actions for one group of rows.

    Parameters
    ----------
    group : DataFrame
        Rows providing `from_base`, `to_base` and `safe_probability`. Rows
        sharing the same from/to pair collapse to the last one seen.
    re_table : dict, optional
        Run-expectancy lookup passed through to `compute_ev_from_paths`.

    Returns
    -------
    dict
        Expected values under the keys 'let', 'cut_hold' and 'cut_relay',
        inserted in that order.
    """
    # Build a mapping such as {"1B->3B": 0.7} from the group's rows.
    safe_prob_by_path = {}
    for _, record in group.iterrows():
        path = f"{record['from_base']}->{record['to_base']}"
        safe_prob_by_path[path] = record['safe_probability']

    def targets_home(path):
        return path.endswith("->Home")

    # Let the ball through: every non-home attempt is certain, attempts at
    # home keep their modelled probability. Non-home keys are inserted first.
    let_through = {
        path: 1.0 for path in safe_prob_by_path if not targets_home(path)
    }
    let_through.update(
        (path, prob)
        for path, prob in safe_prob_by_path.items()
        if targets_home(path)
    )

    # Cut and hold: no throw is made, so every attempt is treated as certain.
    hold = dict.fromkeys(safe_prob_by_path, 1.0)

    # Cut and relay: attempts at home are certain, the rest keep their
    # modelled probability.
    relay = {
        path: (1.0 if targets_home(path) else prob)
        for path, prob in safe_prob_by_path.items()
    }

    return {
        'let': compute_ev_from_paths(let_through, re_table),
        'cut_hold': compute_ev_from_paths(hold, re_table),
        'cut_relay': compute_ev_from_paths(relay, re_table),
    }


In [8]:
# Score every (game_str, play_id) group and record which action has the
# lowest expected value.
grouped = final_probs.groupby(['game_str', 'play_id'])

results = []

for (game_str, play_id), group in grouped:
    evs = simulate_all_actions(group)
    evs.update(game_str=game_str, play_id=play_id)

    ev_values = [evs[name] for name in ('let', 'cut_hold', 'cut_relay')]
    if max(ev_values) - min(ev_values) < 0.01:
        # All three options are within 0.01 of each other: default to cut and
        # hold. The threshold is a judgement call and may need adjusting.
        evs['optimal_action'] = 'cut_hold'
    else:
        # Lowest expected value wins; exact ties resolve in the order
        # let, cut_hold, cut_relay.
        # NOTE(review): in the displayed output cut_hold is never below the
        # other two values, so it appears to be chosen only through the
        # closeness rule above - confirm that is intended.
        evs['optimal_action'] = min(('let', 'cut_hold', 'cut_relay'), key=evs.get)
    results.append(evs)

ev_df = pd.DataFrame(results)

ev_df

Unnamed: 0,let,cut_hold,cut_relay,game_str,play_id,optimal_action
0,3.255006,3.39,3.388295,y1_d001_CGA_QEA,108,let
1,0.630000,0.63,0.604242,y1_d001_CGA_QEA,114,cut_relay
2,1.629831,1.63,1.588412,y1_d001_CGA_QEA,216,cut_relay
3,2.380460,2.39,2.271366,y1_d002_CGA_QEA,92,cut_relay
4,0.980000,0.98,0.853384,y1_d002_CGA_QEA,99,cut_relay
...,...,...,...,...,...,...
894,0.980000,0.98,0.960704,y2_d097_YJD_RZQ,19,cut_relay
895,2.297304,2.39,2.385384,y2_d097_YJD_RZQ,53,let
896,0.980000,0.98,0.978191,y2_d097_YJD_RZQ,135,cut_hold
897,2.218158,2.39,2.384866,y2_d097_YJD_RZQ,158,let


In [9]:
# How often each action was picked as optimal.
# TODO: sanity-check that this split between the actions looks reasonable.
ev_df.loc[:, 'optimal_action'].value_counts()

optimal_action
cut_relay    380
cut_hold     292
let          227
Name: count, dtype: int64

In [10]:
# Attach each play's chosen action to the original rows (left join keeps every
# row of final_probs), then keep only the columns needed downstream.
cutoff_decisions = final_probs.merge(
    ev_df,
    how='left',
    on=['game_str', 'play_id'],
)[[
    'game_str',
    'play_id',
    'timestamp',
    'position',
    'from_base',
    'to_base',
    'was_safe',
    'safe_probability',
    'optimal_action',
]]
cutoff_decisions

Unnamed: 0,game_str,play_id,timestamp,position,from_base,to_base,was_safe,safe_probability,optimal_action
0,y1_d001_CGA_QEA,108,4098455,center field,Home,2B,True,0.995977,let
1,y1_d001_CGA_QEA,114,4231155,right field,Home,2B,True,0.928451,cut_relay
2,y1_d001_CGA_QEA,216,8333280,right field,Home,2B,True,0.884478,cut_relay
3,y1_d002_CGA_QEA,92,3087421,left field,Home,2B,True,0.887637,cut_relay
4,y1_d003_CGA_QEA,92,3994106,right field,Home,2B,True,0.956541,cut_relay
...,...,...,...,...,...,...,...,...,...
1650,y2_d095_QZE_RZQ,97,2945498,left field,2B,Home,True,0.848645,cut_relay
1651,y2_d095_QZE_RZQ,143,4421598,right field,2B,Home,True,0.933757,cut_relay
1652,y2_d095_QZE_RZQ,241,7666948,left field,2B,Home,True,0.999997,cut_hold
1653,y2_d095_QZE_RZQ,261,8351098,right field,2B,Home,True,0.999993,cut_hold
