In [1]:
import pandas as pd
import numpy as np

# Load CSV and Start Transforming the DF

In [2]:
csv_data = pd.read_csv('API Output-Batters.csv')

# Singles are not reported directly: they are the hits that were not extra-base hits.
csv_data["singles"] = (
    csv_data["hits"] - csv_data["doubles"] - csv_data["triples"] - csv_data["homeRuns"]
)

# Share of each hit type among a player's hits, in percent.
# Zero hits are masked as NaN in a temporary denominator so the division produces NaN
# rather than inf; the "hits" column itself is left unmodified.
hit_columns = ["singles", "doubles", "triples", "homeRuns"]
pct_columns = [f"{hit_col}_pct" for hit_col in hit_columns]
hits_nonzero = csv_data["hits"].replace(0, np.nan)
for hit_col, pct_col in zip(hit_columns, pct_columns):
    csv_data[pct_col] = csv_data[hit_col] / hits_nonzero * 100

# Players without hits get 0% for every hit type. Only the percentage columns are filled:
# a frame-wide fillna(0) would also turn missing names, teams or averages into 0.
# Values are rounded to one decimal place for readability.
csv_data[pct_columns] = csv_data[pct_columns].fillna(0).round(1)

# Columns needed by the simulation further down
useful_columns = ['PlayerName', 'TeamName', 'avg', "singles_pct", "doubles_pct", "triples_pct", "homeRuns_pct"]

df_cleaned = csv_data[useful_columns].copy()

df_cleaned['TeamName'].unique()

array(['Athletics', 'Pittsburgh Pirates', 'San Diego Padres',
       'Seattle Mariners', 'San Francisco Giants', 'St. Louis Cardinals',
       'Tampa Bay Rays', 'Texas Rangers', 'Toronto Blue Jays',
       'Minnesota Twins', 'Philadelphia Phillies', 'Atlanta Braves',
       'Chicago White Sox', 'Miami Marlins', 'New York Yankees',
       'Milwaukee Brewers', 'Los Angeles Angels', 'Arizona Diamondbacks',
       'Baltimore Orioles', 'Boston Red Sox', 'Chicago Cubs',
       'Cincinnati Reds', 'Cleveland Guardians', 'Colorado Rockies',
       'Detroit Tigers', 'Houston Astros', 'Kansas City Royals',
       'Los Angeles Dodgers', 'Washington Nationals', 'New York Mets'],
      dtype=object)

# Split into team df

In [3]:
# The 30 team names, listed in the same order as the unique TeamName values shown above
team_names = [
    'Athletics', 'Pittsburgh Pirates', 'San Diego Padres',
    'Seattle Mariners', 'San Francisco Giants', 'St. Louis Cardinals',
    'Tampa Bay Rays', 'Texas Rangers', 'Toronto Blue Jays',
    'Minnesota Twins', 'Philadelphia Phillies', 'Atlanta Braves',
    'Chicago White Sox', 'Miami Marlins', 'New York Yankees',
    'Milwaukee Brewers', 'Los Angeles Angels', 'Arizona Diamondbacks',
    'Baltimore Orioles', 'Boston Red Sox', 'Chicago Cubs',
    'Cincinnati Reds', 'Cleveland Guardians', 'Colorado Rockies',
    'Detroit Tigers', 'Houston Astros', 'Kansas City Royals',
    'Los Angeles Dodgers', 'Washington Nationals', 'New York Mets',
]

# Map every team name to the rows of df_cleaned that belong to that team
team_dfs = dict(list(df_cleaned.groupby("TeamName")))
team_dfs["Milwaukee Brewers"]

Unnamed: 0,PlayerName,TeamName,avg,singles_pct,doubles_pct,triples_pct,homeRuns_pct
244,Andrew Vaughn,Milwaukee Brewers,0.254,65.0,21.4,0.0,13.6
245,Andruw Monasterio,Milwaukee Brewers,0.27,61.8,26.5,0.0,11.8
246,Anthony Seigler,Milwaukee Brewers,0.194,91.7,8.3,0.0,0.0
247,Blake Perkins,Milwaukee Brewers,0.226,68.6,17.1,5.7,8.6
248,Brandon Lockridge,Milwaukee Brewers,0.231,74.2,22.6,3.2,0.0
249,Brice Turang,Milwaukee Brewers,0.288,71.4,16.7,1.2,10.7
250,Caleb Durbin,Milwaukee Brewers,0.256,68.4,21.9,0.0,9.6
251,Christian Yelich,Milwaukee Brewers,0.264,66.9,13.9,0.0,19.2
252,Isaac Collins,Milwaukee Brewers,0.263,65.3,22.4,3.1,9.2
253,Jackson Chourio,Milwaukee Brewers,0.27,59.5,23.6,2.7,14.2


In [4]:
# Export every player's name and team to team_cleaned.json as a list of records
# (force_ascii=False keeps non-ASCII characters in names unescaped)
json_df = df_cleaned.loc[:, ["PlayerName", "TeamName"]]
json_df.to_json("team_cleaned.json", orient="records", force_ascii=False)



# Run Basic Sim

In [5]:
team2_df = team_dfs["Athletics"]
# Batting order for team 2: players come to the plate in the order of this list.
team2_lineup_names = ["Colby Thomas", "Nick Kurtz", "Brent Rooker", "Carlos Cortes", "Tyler Soderstrom", "Max Schuemann", "Darrell Hernaiz", "Brett Harris", "JJ Bleday", "Max Muncy", "Willie Maclver"]

# isin() silently ignores names that do not occur in the team's data (typos, players on
# another roster), which shortens the batting order without notice. Report them instead.
team2_known_names = set(team2_df["PlayerName"])
team2_missing_names = [name for name in team2_lineup_names if name not in team2_known_names]
if team2_missing_names:
    print(
        f"Warning: {len(team2_missing_names)} lineup name(s) not found in the Athletics data "
        f"and left out of the batting order: {team2_missing_names}"
    )

team2_lineup_df = team2_df[team2_df["PlayerName"].isin(team2_lineup_names)].copy()

# An ordered categorical makes sort_values follow the batting order instead of the alphabet.
team2_lineup_df["PlayerName"] = pd.Categorical(
    team2_lineup_df["PlayerName"],
    categories=team2_lineup_names,
    ordered=True
)

team2_lineup_df = team2_lineup_df.sort_values("PlayerName").reset_index(drop=True)

team2_lineup_df


Unnamed: 0,PlayerName,TeamName,avg,singles_pct,doubles_pct,triples_pct,homeRuns_pct
0,Colby Thomas,Athletics,0.225,59.3,18.5,0.0,22.2
1,Nick Kurtz,Athletics,0.29,47.5,21.3,1.6,29.5
2,Brent Rooker,Athletics,0.262,55.5,24.4,1.8,18.3
3,Carlos Cortes,Athletics,0.309,55.2,27.6,3.4,13.8
4,Tyler Soderstrom,Athletics,0.276,61.3,21.9,0.6,16.1
5,Max Schuemann,Athletics,0.197,77.8,11.1,5.6,5.6
6,Brett Harris,Athletics,0.274,75.0,25.0,0.0,0.0
7,JJ Bleday,Athletics,0.212,52.3,26.2,0.0,21.5
8,Max Muncy,Athletics,0.214,63.6,15.9,0.0,20.5


In [6]:
team1_df = team_dfs["Kansas City Royals"]
# Batting order for team 1: players come to the plate in the order of this list.
team1_lineup_names = ["Maikel Garcia", "Bobby Witt Jr.", "Vinnie Pasquantino", 
                   "Salvador Perez", "Mike Yastrzemski", "Carter Jensen", 
                   "Adam Frazier", "Michael Massey", "John Rave"]

# isin() silently ignores names that do not occur in the team's data (typos, players on
# another roster), which shortens the batting order without notice. Report them instead.
team1_known_names = set(team1_df["PlayerName"])
team1_missing_names = [name for name in team1_lineup_names if name not in team1_known_names]
if team1_missing_names:
    print(
        f"Warning: {len(team1_missing_names)} lineup name(s) not found in the Kansas City Royals data "
        f"and left out of the batting order: {team1_missing_names}"
    )

team1_lineup_df = team1_df[team1_df["PlayerName"].isin(team1_lineup_names)].copy()

# An ordered categorical makes sort_values follow the batting order instead of the alphabet.
team1_lineup_df["PlayerName"] = pd.Categorical(
    team1_lineup_df["PlayerName"],
    categories=team1_lineup_names,
    ordered=True
)

team1_lineup_df = team1_lineup_df.sort_values("PlayerName").reset_index(drop=True)

team1_lineup_df

Unnamed: 0,PlayerName,TeamName,avg,singles_pct,doubles_pct,triples_pct,homeRuns_pct
0,Maikel Garcia,Kansas City Royals,0.286,64.7,22.9,2.9,9.4
1,Bobby Witt Jr.,Kansas City Royals,0.295,58.7,25.5,3.3,12.5
2,Vinnie Pasquantino,Kansas City Royals,0.264,59.8,20.1,0.6,19.5
3,Salvador Perez,Kansas City Royals,0.236,53.9,24.8,0.0,21.3
4,Carter Jensen,Kansas City Royals,0.3,50.0,33.3,0.0,16.7
5,Michael Massey,Kansas City Royals,0.244,81.2,14.1,0.0,4.7
6,John Rave,Kansas City Royals,0.196,70.0,16.7,0.0,13.3


In [7]:
# Per-player tallies stored column-wise: position i of every list belongs to the same batter.
# Team 1's batters come first, followed by team 2's, each in lineup order.
tracked_players = [
    name
    for lineup in (team1_lineup_df, team2_lineup_df)
    for name in lineup['PlayerName']  # Change column if needed
]

player_stats = {'Player': tracked_players}
for counter in ('Singles', 'Doubles', 'Triples', 'HomeRuns', 'PlateAppearances'):
    # A separate zero-filled list per counter so the columns never share storage
    player_stats[counter] = [0] * len(tracked_players)


In [8]:
#functions

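#simulate one half inning
# NOTE: the commented-out version below looks like an earlier draft and is not executed.
# It compares `hit` against 'single_pct', 'double_pct' and 'tripples_pct', none of which
# occur in `hit_types`, so it would score every hit as a home run. Use the live
# play_half_inning defined after it.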
# def play_half_inning(team, team_index, starting_runners=None):
#     outs = 0
#     if starting_runners is None:
#         runners = [0, 0, 0]
#     else:
#         runners = starting_runners.copy()
#     runs = 0

#     while outs < 3:
#         batter = team.iloc[team_index % len(team)]
#         rand = np.random.random()

#         if rand < batter['avg']:
#             # Determine hit type
#             hit_types = ['singles_pct', 'doubles_pct', 'triples_pct', 'homeRuns_pct']
#             probs = [batter[h] for h in hit_types]  # already probabilities
            
#             # Normalize just in case they don't sum exactly to 1
#             probs = np.array(probs) / np.sum(probs)

#             hit = np.random.choice(hit_types, p=probs)

#             # Update bases + runs
#             if hit == 'single_pct':
#                 runs += runners[2]
#                 runners = [1] + runners[:2]
#             elif hit == 'double_pct':
#                 runs += runners[2] + runners[1]
#                 runners = [0, 1, runners[0]]
#             elif hit == 'tripples_pct':
#                 runs += sum(runners)
#                 runners = [0, 0, 1]
#             else:  # homeRuns_pct
#                 runs += 1 + sum(runners)
#                 runners = [0, 0, 0]
#         else:
#             outs += 1


#         team_index += 1

#     return runs, team_index

def _player_stat_index(stats, player_name):
    """Return the row of `player_name` in `stats`, appending a zeroed row if it is missing."""
    try:
        return stats['Player'].index(player_name)
    except ValueError:
        # Batter was not registered when the tally was built (e.g. the lineup changed
        # afterwards); add a fresh row instead of aborting the simulation.
        stats['Player'].append(player_name)
        for counter in ('Singles', 'Doubles', 'Triples', 'HomeRuns', 'PlateAppearances'):
            stats[counter].append(0)
        return len(stats['Player']) - 1


def play_half_inning(team, team_index, starting_runners=None, stats=None, rng=None):
    """Simulate one half inning and return the runs scored and the next batter index.

    Every plate appearance is a hit with probability ``avg`` and an out otherwise; the
    half inning ends after three outs. On a hit, the hit type is drawn in proportion to
    the batter's ``*_pct`` columns. Runners advance exactly as many bases as the batter.

    Parameters
    ----------
    team : pandas.DataFrame
        Lineup in batting order with the columns 'PlayerName', 'avg', 'singles_pct',
        'doubles_pct', 'triples_pct' and 'homeRuns_pct'.
    team_index : int
        Running batter counter; it is wrapped modulo the lineup length.
    starting_runners : sequence of three 0/1 values, optional
        Base occupancy as [first, second, third] at the start of the half inning.
        Defaults to empty bases. The sequence passed in is not modified.
    stats : dict of lists, optional
        Tally with the keys 'Player', 'Singles', 'Doubles', 'Triples', 'HomeRuns' and
        'PlateAppearances', updated in place. Defaults to the notebook-level
        ``player_stats``. Batters missing from it are appended with zero counts.
    rng : optional
        Object providing ``random()`` and ``choice(a, p=...)``, for example
        ``np.random.default_rng(seed)``. Defaults to the ``np.random`` module, i.e.
        NumPy's global random state.

    Returns
    -------
    tuple
        ``(runs, team_index)`` where ``team_index`` has been advanced by the number of
        plate appearances in this half inning.

    Raises
    ------
    ValueError
        If ``team`` has no rows.
    """
    if len(team) == 0:
        raise ValueError("team lineup is empty; cannot simulate a half inning")
    if stats is None:
        stats = player_stats  # notebook-level tally created in an earlier cell
    if rng is None:
        rng = np.random  # same global generator and same draw order as before

    hit_types = ['singles_pct', 'doubles_pct', 'triples_pct', 'homeRuns_pct']
    runners = [0, 0, 0] if starting_runners is None else list(starting_runners)
    runs = 0
    outs = 0

    while outs < 3:
        batter = team.iloc[team_index % len(team)]
        idx = _player_stat_index(stats, batter['PlayerName'])
        stats['PlateAppearances'][idx] += 1

        if rng.random() < batter['avg']:
            weights = np.array([batter[h] for h in hit_types], dtype=float)
            total = weights.sum()
            if total > 0:
                hit = rng.choice(hit_types, p=weights / total)
            else:
                # No usable hit-type distribution (all zero or NaN): normalising would
                # give NaN probabilities and make choice() raise. Score it as a single.
                hit = 'singles_pct'

            if hit == 'singles_pct':
                stats['Singles'][idx] += 1
                runs += runners[2]
                runners = [1] + runners[:2]
            elif hit == 'doubles_pct':
                stats['Doubles'][idx] += 1
                runs += runners[2] + runners[1]
                runners = [0, 1, runners[0]]
            elif hit == 'triples_pct':
                stats['Triples'][idx] += 1
                runs += sum(runners)
                runners = [0, 0, 1]
            else:  # homeRuns_pct
                stats['HomeRuns'][idx] += 1
                runs += 1 + sum(runners)
                runners = [0, 0, 0]
        else:
            outs += 1

        team_index += 1

    return runs, team_index



#simulate one full game
def simulate_game(team1, team2):
    """Play one game between two lineups and return ``(team1_score, team2_score)``.

    team1 bats in the top half of every inning and team2 in the bottom half. Nine
    innings are played with empty bases; while the score is tied afterwards, further
    innings are played in which each half starts with a runner on second base.
    Each lineup's batting order carries over from one inning to the next.

    Both halves of every inning are always played in full, including the bottom half
    when team2 is already ahead.
    NOTE(review): real games skip or cut short that half inning - confirm whether the
    extra at-bats are intended.
    """
    lineups = (team1, team2)
    scores = [0, 0]
    next_batter = [0, 0]

    def play_inning(runner_on_second):
        # Top half (team1) first, then bottom half (team2)
        for side, lineup in enumerate(lineups):
            if runner_on_second:
                runs, next_batter[side] = play_half_inning(
                    lineup, next_batter[side], starting_runners=[0, 1, 0]
                )
            else:
                runs, next_batter[side] = play_half_inning(lineup, next_batter[side])
            scores[side] += runs

    # Regulation: nine innings starting with empty bases
    for _ in range(9):
        play_inning(False)

    # Extra innings until the tie is broken
    while scores[0] == scores[1]:
        play_inning(True)

    return scores[0], scores[1]


#run everything multiple times
def run_simulations(n_runs=1000):
    """Simulate `n_runs` games between team1_lineup_df and team2_lineup_df.

    Returns a DataFrame with one row per game and the columns
    "Team 1 Score" and "Team 2 Score".
    """
    final_scores = [
        simulate_game(team1_lineup_df, team2_lineup_df)
        for _ in range(n_runs)
    ]
    return pd.DataFrame(final_scores, columns=["Team 1 Score", "Team 2 Score"])


In [9]:
# The simulation draws from NumPy's global random state; seed it so the numbers below
# are the same on every run of this cell.
SIM_SEED = 42
np.random.seed(SIM_SEED)

# Start the per-player tallies from zero so that re-running this cell does not add to the
# counts left over from a previous run. The lists are cleared in place because the
# simulation updates this same dict.
for stat_name, counts in player_stats.items():
    if stat_name != 'Player':
        counts[:] = [0] * len(counts)

#run the simulation and get results
sim_results = run_simulations(1000)

#median scores and win split
sim_results["Diff"] = sim_results["Team 1 Score"] - sim_results["Team 2 Score"]
print(f"Simulated {len(sim_results)} games")

print("\nMedian Scores:")
print(sim_results.median())

print("\nWin percentages:")
print(f"Team 1 wins: {(sim_results['Diff'] > 0).mean()*100:.1f}%")
print(f"Team 2 wins: {(sim_results['Diff'] < 0).mean()*100:.1f}%")

player_stats_df = pd.DataFrame(player_stats)

# Hits of each type per plate appearance (fractions between 0 and 1, not multiplied by 100)
for rate_column, count_column in [
    ('Single %', 'Singles'),
    ('Double %', 'Doubles'),
    ('Triple %', 'Triples'),
    ('Home Run %', 'HomeRuns'),
]:
    player_stats_df[rate_column] = player_stats_df[count_column] / player_stats_df['PlateAppearances']

# Batters who never came to the plate produce 0/0 = NaN; show them as 0
player_stats_df = player_stats_df.fillna(0)
player_stats_df


Simulated 1000 games

Median Scores:
Team 1 Score    3.0
Team 2 Score    3.0
Diff            1.0
dtype: float64

Win percentages:
Team 1 wins: 52.2%
Team 2 wins: 47.8%


Unnamed: 0,Player,Singles,Doubles,Triples,HomeRuns,PlateAppearances,Single %,Double %,Triple %,Home Run %
0,Maikel Garcia,1081,342,44,168,5770,0.187348,0.059272,0.007626,0.029116
1,Bobby Witt Jr.,983,400,61,218,5639,0.174322,0.070935,0.010818,0.038659
2,Vinnie Pasquantino,904,300,10,290,5516,0.163887,0.054387,0.001813,0.052574
3,Salvador Perez,686,315,0,288,5381,0.127486,0.058539,0.0,0.053522
4,Carter Jensen,785,529,0,267,5242,0.149752,0.100916,0.0,0.050935
5,Michael Massey,1030,175,0,60,5096,0.202119,0.034341,0.0,0.011774
6,John Rave,690,149,0,131,4934,0.139846,0.030199,0.0,0.02655
7,Colby Thomas,644,169,0,224,4525,0.14232,0.037348,0.0,0.049503
8,Nick Kurtz,626,283,19,372,4417,0.141725,0.064071,0.004302,0.08422
9,Brent Rooker,640,272,20,187,4318,0.148217,0.062992,0.004632,0.043307
