<a href="https://colab.research.google.com/github/brian-feldman-3/baseball_lineups/blob/main/OptimalBaseballLineup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The purpose of this tool is to evaluate the optimized lineup for a baseball team based on each players' current statistics.

In [2]:
import numpy as np
from numpy import random
import pandas as pd
pd.options.mode.chained_assignment = None


In [3]:
#Import data from Baseball Reference website for a single team (for testing)

url = 'https://www.baseball-reference.com/teams/BOS/2025-batting.shtml'

# Read the HTML tables from the URL
current_red_sox = pd.read_html(url)
current_red_sox_df = current_red_sox[0]
current_red_sox_df = current_red_sox_df[['Player', 'PA', 'H', '2B', '3B', 'HR', 'BB']]

#Delete rows with player named "Player" and "Team Totals"
current_red_sox_df.drop(current_red_sox_df.loc[current_red_sox_df['Player'] == 'Player'].index, inplace = True)
current_red_sox_df.drop(current_red_sox_df.loc[current_red_sox_df['Player'] == 'Team Totals'].index, inplace = True)

#Make all plate possiblities integers and create a column for singles
current_red_sox_df[['PA', 'H', '2B', '3B', 'HR', 'BB']] = current_red_sox_df[['PA', 'H', '2B', '3B', 'HR', 'BB']].astype(int)
current_red_sox_df['1B'] = current_red_sox_df['H']-current_red_sox_df['2B']-current_red_sox_df['3B']-current_red_sox_df['HR']

#Remove players below a certain plate appearance threshold
plate_appearance_threshold = 1
current_red_sox_df.drop(current_red_sox_df.loc[current_red_sox_df['PA'] < plate_appearance_threshold].index, inplace = True)

#Now, we replace each plate possiblity with its percent chance of happening, but first get rid of weird sac flies and stuff so we add to 100
current_red_sox_df['OUT'] = current_red_sox_df['PA'] - (current_red_sox_df['1B'] + current_red_sox_df['2B'] + current_red_sox_df['3B'] + current_red_sox_df['HR'] + current_red_sox_df['BB'])
for hit_type in ['1B', '2B', '3B', 'HR', 'BB', 'OUT']:
  current_red_sox_df[hit_type] = current_red_sox_df[hit_type]/current_red_sox_df['PA']

#We now round these to 3 decimals, multipyly by 1000 and create length 1000 list for each hitter of outcomes
for hit_type in ['1B', '2B', '3B', 'HR', 'BB', 'OUT']:
  current_red_sox_df[hit_type] = current_red_sox_df[hit_type].round(3)
  current_red_sox_df[hit_type] = current_red_sox_df[hit_type]*1000

#Create player dictionary with a list of possible plate appearance outcomes as the key
player_dict = dict()
for player in current_red_sox_df['Player']:
  player_dict[player] = (['HR']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, 'HR'])
                         + ['3B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '3B'])
                         + ['2B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '2B'])
                         + ['1B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '1B'])
                         + ['OUT']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, 'OUT']))

#We want to shuffle all plater appearance outcomes (although we expect true randomness in random.rand, we do this to be sure)
for value in player_dict.values():
  value = np.random.shuffle(value)

#We need to create a way to ignore traded players because they are included in the team table even after they are traded
#traded_players

  player_dict[player] = (['HR']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, 'HR'])
  + ['3B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '3B'])
  + ['2B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '2B'])
  + ['1B']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, '1B'])
  + ['OUT']*int(current_red_sox_df.loc[current_red_sox_df['Player'] == player, 'OUT']))


In [4]:
def game_simulation(lineup):
  #Initialize 0 outs in the first inning
  runs_scored = 0
  batter_number = 0

  #Create a list of binary base values depending on if there is a runner on-base or not
  bases = [0, 0, 0]

  #We iterate through each inning, each plate appearance (assuming no extra innings)
  for inning_count in range(1,10):
    out_count = 0
    while out_count < 3:
      batter_up = lineup[batter_number]
      batter_outcome = np.random.choice(player_dict[batter_up])
      #print(f"{batter_up} up to bat. Outcome: {batter_outcome}")
      if batter_outcome == 'HR':
        runs_scored = runs_scored + sum(bases) + 1
        bases = [0, 0, 0]
      elif batter_outcome == '3B':
        runs_scored = runs_scored + sum(bases)
        bases = [0, 0, 1]
      elif batter_outcome == '2B':
        #According to a quick search, 45% of runners score from first, 70% score from second, and we assume all score from third
        #If there is only a runner on first, we use the 45% chance to see where they end up
        #If there is a runner on first and second, we see what the runner on second does first, then respond accordingly
        if bases[0] == 1 & bases[1] == 1:
          #Runner on first and second (and maybe third, does not matter for our calculating)
          #One runner must score via push, the second runner is coming from 1st and has a 45% chance of scoring
          runner_on_first_outcome = np.random.choice(['score', 'does not score'], p=[0.45, 0.55])
          if runner_on_first_outcome == 'score':
            runs_scored = runs_scored + sum(bases)
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + sum(bases) - 1
            bases = [0, 1, 1]
        elif bases[0] == 1 & bases[1] == 0:
          #Runner on first only
          runner_on_first_outcome = np.random.choice(['score', 'does not score'], p=[0.45, 0.55])
          if runner_on_first_outcome == 'score':
            runs_scored = runs_scored + sum(bases)
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + bases[2]
            bases = [0, 1, 1]
        else:
          #Nobody on first, only second and ~maybe~ third
          runner_on_second_outcome = np.random.choice(['score', 'does not score'], p=[0.7, 0.3])
          if runner_on_second_outcome == 'score':
            runs_scored = runs_scored + bases[2] + bases[1]
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + bases[2]
            bases = [0, 1, 1]
      elif batter_outcome == '1B':
        #Runner ~never~ scores from first on a single. Runner scores from second 40% of the time, runner scores from 3rd 100% of the time
        if bases[2] == 1:
          runs_scored = runs_scored + 1
        if bases[1] == 1:
          runner_on_second_outcome = np.random.choice(['score', 'does not score'], p=[0.4, 0.6])
          if runner_on_second_outcome == 'score':
            runs_scored = runs_scored + 1
            if bases[0] == 1:
              bases = [1, 0, 1]
            else:
              bases = [1, 0, 0]
          else:
            if bases[0] == 1:
              bases = [1, 1, 1]
            else:
              bases = [1, 0, 1]
      elif batter_outcome == 'BB':
        if bases == [1, 0, 0]:
          bases = [1, 1, 0]
        elif bases == [1, 1, 0]:
          bases = [1, 1, 1]
        elif bases == [1, 1, 1]:
          bases = [1, 1, 1]
          runs_scored = runs_scored + 1
        elif bases == [1, 0, 1]:
          bases = [1, 1, 1]
      else:
        out_count += 1
      #print(f"{inning_count} inning.\n{out_count} outs\n{bases[0]} on first\n{bases[1]} on second\n{bases[2]} on third\n{runs_scored} runs have scored.")

      if batter_number == 8:
        batter_number = 0
      else:
        batter_number += 1
  return runs_scored


In [18]:
def game_simulation_ninth_batter_intentional_out(lineup):
  #Initialize 0 outs in the first inning
  runs_scored = 0
  batter_number = 0

  #Create a list of binary base values depending on if there is a runner on-base or not
  bases = [0, 0, 0]

  #We iterate through each inning, each plate appearance (assuming no extra innings)
  for inning_count in range(1,10):
    out_count = 0
    while out_count < 3:
      batter_up = lineup[batter_number]
      batter_outcome = np.random.choice(player_dict[batter_up])
      ########################################
      ##Below is intentional out from last batter in the lineup##
      if batter_number == 8 & out_count == 2 & inning_count != 9:
        if batter_outcome == 'BB':
          batter_outcome = 'BB'
        else:
          batter_outcome = 'OUT'
      ########################################
      #print(f"{batter_up} up to bat. Outcome: {batter_outcome}")
      if batter_outcome == 'HR':
        runs_scored = runs_scored + sum(bases) + 1
        bases = [0, 0, 0]
      elif batter_outcome == '3B':
        runs_scored = runs_scored + sum(bases)
        bases = [0, 0, 1]
      elif batter_outcome == '2B':
        #According to a quick search, 45% of runners score from first, 70% score from second, and we assume all score from third
        #If there is only a runner on first, we use the 45% chance to see where they end up
        #If there is a runner on first and second, we see what the runner on second does first, then respond accordingly
        if bases[0] == 1 & bases[1] == 1:
          #Runner on first and second (and maybe third, does not matter for our calculating)
          #One runner must score via push, the second runner is coming from 1st and has a 45% chance of scoring
          runner_on_first_outcome = np.random.choice(['score', 'does not score'], p=[0.45, 0.55])
          if runner_on_first_outcome == 'score':
            runs_scored = runs_scored + sum(bases)
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + sum(bases) - 1
            bases = [0, 1, 1]
        elif bases[0] == 1 & bases[1] == 0:
          #Runner on first only
          runner_on_first_outcome = np.random.choice(['score', 'does not score'], p=[0.45, 0.55])
          if runner_on_first_outcome == 'score':
            runs_scored = runs_scored + sum(bases)
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + bases[2]
            bases = [0, 1, 1]
        else:
          #Nobody on first, only second and ~maybe~ third
          runner_on_second_outcome = np.random.choice(['score', 'does not score'], p=[0.7, 0.3])
          if runner_on_second_outcome == 'score':
            runs_scored = runs_scored + bases[2] + bases[1]
            bases = [0, 1, 0]
          else:
            runs_scored = runs_scored + bases[2]
            bases = [0, 1, 1]
      elif batter_outcome == '1B':
        #Runner ~never~ scores from first on a single. Runner scores from second 40% of the time, runner scores from 3rd 100% of the time
        if bases[2] == 1:
          runs_scored = runs_scored + 1
        if bases[1] == 1:
          runner_on_second_outcome = np.random.choice(['score', 'does not score'], p=[0.4, 0.6])
          if runner_on_second_outcome == 'score':
            runs_scored = runs_scored + 1
            if bases[0] == 1:
              bases = [1, 0, 1]
            else:
              bases = [1, 0, 0]
          else:
            if bases[0] == 1:
              bases = [1, 1, 1]
            else:
              bases = [1, 0, 1]
      elif batter_outcome == 'BB':
        if bases == [1, 0, 0]:
          bases = [1, 1, 0]
        elif bases == [1, 1, 0]:
          bases = [1, 1, 1]
        elif bases == [1, 1, 1]:
          bases = [1, 1, 1]
          runs_scored = runs_scored + 1
        elif bases == [1, 0, 1]:
          bases = [1, 1, 1]
      else:
        out_count += 1
      #print(f"{inning_count} inning.\n{out_count} outs\n{bases[0]} on first\n{bases[1]} on second\n{bases[2]} on third\n{runs_scored} runs have scored.")

      if batter_number == 8:
        batter_number = 0
      else:
        batter_number += 1
  return runs_scored

In [19]:
current_BOS_lineup = ['Roman Anthony*', 'Alex Bregman', 'Jarren Duran*', 'Trevor Story', 'Nathaniel Lowe*', 'Masataka Yoshida*', 'Ceddanne Rafaela', 'David Hamilton*', 'Connor Wong']

regular_results = []
for game_count in range(1,10000):
    runs_scored = game_simulation(current_BOS_lineup)
    regular_results.append(runs_scored)

intentional_out_results = []
for game_count_intentional_out in range(1,10000):
    runs_scored = game_simulation_ninth_batter_intentional_out(current_BOS_lineup)
    intentional_out_results.append(runs_scored)

print(f"Regular lineup averages {sum(regular_results)/len(regular_results)} and intentional out averages {sum(intentional_out_results)/len(intentional_out_results)}")

Regular lineup averages 5.22062206220622 and intentional out averages 4.546254625462546


In [10]:
# def randomize_lineup(team_names):
#   final_lineup = np.random.choice(list(team_names), 9, replace=False)
#   return final_lineup

In [9]:
# team_names = ['Carlos NarvÃ¡ez', 'Abraham Toro#', 'Kristian Campbell', 'Trevor Story', 'Alex Bregman', 'Jarren Duran*', 'Ceddanne Rafaela', 'Wilyer Abreu*', 'Rafael Devers*']

In [8]:
##Use this cell to randomize the lineup, then run 100 iterations
# lineup_list = []
# lineup_run_avg = []

# for lineup_count in range(1, 6):
#   print(lineup_count)
#   lineup = randomize_lineup(player_dict.keys())
#   lineup_list.append(lineup)
#   all_results = []
#   for game_count in range(1,1000):
#     runs_scored = game_simulation(lineup)
#     all_results.append(runs_scored)
#   lineup_run_avg.append(sum(all_results)/len(all_results))

# print(lineup_list)
# print(lineup_run_avg)
