### Import libraries

In [57]:
import os
from sqlalchemy import create_engine
import pybaseball as pyb
import pybaseball.cache # Ensure caching is imported
import pandas as pd
from dotenv import load_dotenv
import time
from datetime import date, timedelta
from sqlalchemy.engine import Engine
from sqlalchemy import text
import datetime
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException
from rapidfuzz import process
import re
import numpy as np
from io import StringIO
import pylahman

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

### Load environment and connect to the DB

In [None]:

# Load environment variables from .env file
load_dotenv()

# Imported here so this cell stays self-contained.
from urllib.parse import quote

# Percent-encode the credentials before embedding them in the URL: a raw
# '@', ':', '/' or '%' in DB_USER / DB_PASS would otherwise be read as URL
# syntax and produce a wrong (or unparsable) connection string.
# Credentials without special characters are left unchanged.
db_user = quote(os.environ['DB_USER'], safe='')
db_pass = quote(os.environ['DB_PASS'], safe='')

# Build the PostgreSQL connection string (the port is fixed at 5432)
DB_URL = f"postgresql://{db_user}:{db_pass}@{os.environ['DB_HOST']}:5432/{os.environ['DB_NAME']}"

# Create the engine object for connecting.
# NOTE: create_engine() only configures the engine; the first real connection
# is opened when a query or load is executed.
engine = create_engine(DB_URL)

print("Database connection established.")

### Create dim_players

In [None]:
def update_players(engine: Engine):
    """Rebuild the 'players' table from the Chadwick register joined with Lahman People.

    Chadwick rows that carry a Retrosheet key are left-joined to Lahman rows on
    that key. Columns are then renamed and reordered, nulls are replaced with
    sentinels (-1 for numeric columns, 'N/A' for text columns, 1700-01-01 for
    the date columns) and the result replaces the 'players' table.

    Args:
        engine: SQLAlchemy engine used for the write.

    Returns:
        None. Any exception is caught and reported with a printed message
        instead of being raised.
    """
    # Source column -> output column. The insertion order of this mapping is
    # also the final column order of the table.
    column_names = {
        # IDs
        "key_mlbam":     "key_mlbam",
        "key_retro":     "key_retro",
        "key_bbref":     "key_bbref",
        "key_fangraphs": "key_fangraphs",
        "ID":            "id_lahman",
        "playerID":      "player_id_lahman",

        # Names
        "name_last":     "last_name_chadwick",
        "name_first":    "first_name_chadwick",
        "nameLast":      "last_name_lahman",
        "nameFirst":     "first_name_lahman",
        "nameGiven":     "first_and_second_name_lahman",

        # Debut/Final game
        "debut":         "debut",
        "finalGame":     "final_game",

        # Info
        "weight":        "weight",
        "height":        "height",
        "bats":          "bats",
        "throws":        "throws",

        # Birth/Death
        "birthYear":     "birth_year",
        "birthMonth":    "birth_month",
        "birthDay":      "birth_day",
        "birthCity":     "birth_city",
        "birthCountry":  "birth_country",
        "birthState":    "birth_state",
        "deathYear":     "death_year",
        "deathMonth":    "death_month",
        "deathDay":      "death_day",
        "deathCountry":  "death_country",
        "deathState":    "death_state",
        "deathCity":     "death_city",
    }

    # Text columns whose nulls become 'N/A'
    string_columns = (
        "key_retro",
        "key_bbref",
        "player_id_lahman",
        "last_name_chadwick",
        "first_name_chadwick",
        "last_name_lahman",
        "first_name_lahman",
        "first_and_second_name_lahman",
        "bats",
        "throws",
        "birth_city",
        "birth_country",
        "birth_state",
        "death_country",
        "death_state",
        "death_city",
    )

    try:
        lahman_people = pylahman.People()
        chadwick_register = pyb.chadwick_register()

        # Only rows that actually have a Retrosheet key can take part in the join
        chadwick_with_retro = chadwick_register.dropna(subset=['key_retro'])
        lahman_with_retro = lahman_people.dropna(subset=['retroID'])

        # Keep every Chadwick row and attach the Lahman columns where the key
        # matches, then drop the redundant columns and apply the new names
        players_df = (
            chadwick_with_retro
            .merge(lahman_with_retro, how='left', left_on='key_retro', right_on='retroID')
            .drop(columns=['retroID', 'bbrefID', 'mlb_played_first', 'mlb_played_last'])
            .rename(columns=column_names)
        )

        # Keep only the mapped columns, in mapping order
        players_df = players_df[list(column_names.values())]

        # Numeric nulls -> -1
        number_columns = players_df.select_dtypes(include=['number']).columns
        players_df[number_columns] = players_df[number_columns].fillna(-1)

        # Text nulls -> 'N/A' (cast to plain object first so the filler is accepted)
        for name in string_columns:
            as_object = players_df[name].astype(object)
            players_df[name] = as_object.fillna('N/A')

        # Date nulls -> January 1st, 1700
        missing_date = pd.Timestamp('1700-01-01')
        for name in ("debut", "final_game"):
            players_df[name] = players_df[name].fillna(missing_date)

        # Sanity check: nothing should be null at this point
        if players_df.isnull().to_numpy().any():
            print("‚ö†Ô∏è WARNING - There are nulls in some columns in the dataframe.")
        else:
            print("‚úÖ No nulls found.")

        # Load: the existing table (if any) is replaced wholesale
        print(f"Loading {len(players_df)} new rows into 'players'...")

        players_df.to_sql(
            'players',
            engine,
            if_exists='replace',
            index=False,
            chunksize=5000,
        )

        print(f"‚úÖ Players successfully added {len(players_df)} new rows of players data.")

    except Exception as e:
        print(f"‚ùå ETL Failed during extraction or loading: {e}")


# Execute the players function
update_players(engine)

### Create dim_franchises

In [None]:
def update_team_franchises(engine: Engine):
    """Rebuild the 'team_franchises' table from pylahman's TeamsFranchises data.

    Every text column has its nulls (and the literal strings 'nan', 'None'
    and '<NA>') replaced by 'N/A' before the table is written with
    if_exists='replace'.

    Args:
        engine: SQLAlchemy engine used for the write.

    Returns:
        None. Any exception is caught and reported with a printed message
        instead of being raised.
    """
    try:
        # Author's note (2025-12-18): the source only had data up to the 2024 season
        franchises = pylahman.TeamsFranchises()

        # Clean every text column in a single pass
        string_columns = franchises.select_dtypes(include=['object', 'string']).columns
        for name in string_columns:
            franchises[name] = (
                franchises[name]
                .astype(object)                            # plain object so 'N/A' is an ordinary value
                .fillna('N/A')                             # real nulls
                .replace(['nan', 'None', '<NA>'], 'N/A')   # nulls that were stored as text
            )

        # Verify that the text columns are now free of nulls
        remaining_nulls = franchises[string_columns].isnull().sum().sum()
        if remaining_nulls != 0:
            print(f"‚ö†Ô∏è Warning: {remaining_nulls} nulls still remain in text columns.")
        else:
            print("‚úÖ All string columns are clean. No nulls found!")

        # Load: the existing table (if any) is replaced wholesale
        print(f"Loading {len(franchises)} new rows into 'team_franchises'...")

        franchises.to_sql(
            'team_franchises',
            engine,
            if_exists='replace',
            index=False,
            chunksize=5000,
        )

        print(f"‚úÖ Team franchises successfully added {len(franchises)} new rows of data.")

    except Exception as e:
        print(f"‚ùå ETL Failed during extraction or loading: {e}")


# Apply the function
update_team_franchises(engine)

### Teams info ***NOT IN USE***

In [None]:
# def update_team_info(engine: Engine):
#     try:
#         team_info = pylahman.Teams()

#         # Identify all text columns
#         text_cols = team_info.select_dtypes(include=['object', 'string']).columns

#         # Convert to object first, then fill with N/A
#         for col in text_cols:
#             # Converting to object allows 'N/A' to be treated as a normal string
#             team_info[col] = team_info[col].astype(object).fillna('N/A')
            
#             # Just in case some were literal 'nan' strings:
#             team_info[col] = team_info[col].replace(['nan', 'None', '<NA>'], 'N/A')

#         # This selects only columns with numbers and fills their nulls with -1
#         numeric_cols = team_info.select_dtypes(include=['number']).columns
#         team_info[numeric_cols] = team_info[numeric_cols].fillna(-1)

#         # Final verification
#         null_count_text    = team_info[text_cols].isnull().sum().sum()
#         null_count_numeric = team_info[numeric_cols].isnull().sum().sum()
#         total_nulls        = null_count_text + null_count_numeric

#         if total_nulls == 0:
#             print("‚úÖ All columns are clean. No nulls found!")
#         else:
#             print(f"‚ö†Ô∏è Warning: {total_nulls} nulls still remain some columns.")

#         # Loading
#         print(f"Loading {len(team_info)} new rows into 'team_info'...")
        
#         team_info.to_sql(
#             'team_info', 
#             engine, 
#             if_exists='replace',
#             index=False, 
#             chunksize=5000
#         )
        
#         print(f"‚úÖ Team information successfully added {len(team_info)} new rows of data.")
    
#     except Exception as e:
#         print(f"‚ùå ETL Failed during extraction or loading: {e}")


# # Apply the function
# update_team_info(engine)

### Create fact_team_tables

In [None]:
def create_fact_team_tables(engine: Engine):
    """Download team batting, pitching and fielding stats and store them.

    The stats are requested from pybaseball for the seasons
    ``current_year - 10`` through ``current_year`` and written to
    'fact_team_batting', 'fact_team_pitching' and 'fact_team_fielding',
    replacing any existing tables.

    Args:
        engine: SQLAlchemy engine used for the writes.

    Returns:
        None. Download errors propagate to the caller; load errors are caught
        per table and reported with a printed message.
    """
    def load_fact_team_tables(engine: Engine, df, category):
        """Write ``df`` to 'fact_team_<category>', replacing the table."""
        try:
            table_name = f"fact_team_{category}"
            print(f"üíæ Creating {table_name}...")

            row_count = len(df)
            print(f"   üîÉ Loading {row_count} rows...")

            df.to_sql(
                table_name,
                engine,
                if_exists='replace',
                index=False,
                chunksize=5000,
            )

            print(f"   ‚úÖ Successfully added {row_count} new rows of data.")

        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")

    # Season window.
    # NOTE(review): if both bounds are inclusive on the pybaseball side this
    # covers 11 seasons, not 10 - confirm which is intended.
    last_season = date.today().year
    first_season = last_season - 10

    fetchers = {
        'batting':  pyb.team_batting,
        'pitching': pyb.team_pitching,
        'fielding': pyb.team_fielding,
    }

    # Download everything before loading anything, so a failed download
    # leaves all three tables untouched
    frames = {
        category: fetch(first_season, last_season, ind=1, qual=0)
        for category, fetch in fetchers.items()
    }

    for category, frame in frames.items():
        load_fact_team_tables(engine, frame, category)



Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.
Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.
Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.


### Create fact_player

In [None]:
def create_fact_player_tables(engine: Engine):
    """Download player batting, pitching, fielding and sprint-speed stats and store them.

    Batting, pitching and fielding stats are requested from pybaseball for the
    seasons ``current_year - 5`` through ``current_year``. Sprint speed is
    only available one season at a time, so it is fetched per year for the
    last 10 seasons (current one included) and tagged with a 'Season' column.
    The results are written to 'fact_player_batting', 'fact_player_pitching',
    'fact_player_fielding' and 'fact_player_running', replacing any existing
    tables.

    If no sprint-speed season could be fetched, 'fact_player_running' is left
    untouched and a message is printed; the other three tables are still
    loaded.

    Args:
        engine: SQLAlchemy engine used for the writes.

    Returns:
        None. Errors from the batting/pitching/fielding downloads propagate to
        the caller; per-year sprint-speed errors and per-table load errors are
        caught and reported with a printed message.
    """
    def load_fact_player_tables(engine: Engine, df, category):
        """Write ``df`` to 'fact_player_<category>', replacing the table."""
        try:
            table_name = 'fact_player_' + category
            print(f"üíæ Creating {table_name}...")

            # Loading
            print(f"   üîÉ Loading {len(df)} rows...")

            df.to_sql(
                table_name,
                engine,
                if_exists='replace',
                index=False,
                chunksize=5000
            )

            print(f"   ‚úÖ Successfully added {len(df)} new rows of data.")

        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")

    # Declare the years
    current_year   = date.today().year
    five_years_ago = current_year - 5

    # Import the player data from five_years_ago through the current year
    print("‚¨áÔ∏è  Importing player stats... please wait")

    fact_player_batting  = pyb.batting_stats(five_years_ago, current_year, ind=1, qual=0)
    fact_player_pitching = pyb.pitching_stats(five_years_ago, current_year, ind=1, qual=0)
    fact_player_fielding = pyb.fielding_stats(five_years_ago, current_year, ind=1, qual=0)

    # Sprint-speed tables are per year (no range support): last 10 seasons,
    # including the current one
    sprint_frames = []
    for year in range(current_year - 9, current_year + 1):
        try:
            # NOTE(review): 50 is presumably the minimum-opportunities
            # threshold of statcast_sprint_speed - confirm.
            df = pyb.statcast_sprint_speed(year, 50)

            # The per-year result carries no season, so tag it here
            df['Season'] = year

            sprint_frames.append(df)
        except Exception as e:
            print(f"Could not fetch data for {year}: {e}")

    # Load the tables that were downloaded successfully
    load_fact_player_tables(engine, fact_player_batting,  'batting')
    load_fact_player_tables(engine, fact_player_pitching, 'pitching')
    load_fact_player_tables(engine, fact_player_fielding, 'fielding')

    # pd.concat raises ValueError on an empty list, so only build and load the
    # running table when at least one season was fetched
    if sprint_frames:
        fact_player_running = pd.concat(sprint_frames, ignore_index=True)
        load_fact_player_tables(engine, fact_player_running, 'running')
    else:
        print("No sprint speed data could be fetched; 'fact_player_running' was not updated.")

create_fact_player_tables(engine)





      ‚¨áÔ∏è  IMPORTING BASEBALL STATS      
             Please wait...             

üíæ Creating fact_player_batting...
   üîÉ Loading 8673 rows...
   ‚úÖ Successfully added 8673 new rows of data.
üíæ Creating fact_player_pitching...
   üîÉ Loading 5106 rows...
   ‚úÖ Successfully added 5106 new rows of data.
üíæ Creating fact_player_fielding...
   üîÉ Loading 13553 rows...
   ‚úÖ Successfully added 13553 new rows of data.
üíæ Creating fact_player_running...
   üîÉ Loading 3830 rows...
   ‚úÖ Successfully added 3830 new rows of data.


In [None]:
# Historical player stats loaded through pylahman and kept in module-level
# variables (batting, pitching, fielding and appearances).
# Author's note: the data starts in 1871 but does not include last year (2025).
player_batting_historical     = pylahman.Batting()
player_pitching_historical    = pylahman.Pitching()
player_fielding_historical    = pylahman.Fielding()
player_appearances_historical = pylahman.Appearances()

### Get scores last n days *NOT IN USE*

In [None]:
# def get_game_results_last_n_days(n_days=90):
#     """
#     Pulls raw pitch-by-pitch data for all games played in the last 'n_days' 
#     and then extracts the final score for each game.
#     """
#     today = date.today()
    
#     # 1. Calculate the start and end dates for the 90-day range
#     end_date = today - timedelta(days=1)  # Search up to yesterday
#     start_date = today - timedelta(days=n_days)
    
#     start_date_str = start_date.strftime('%Y-%m-%d')
#     end_date_str = end_date.strftime('%Y-%m-%d')
    
#     print(f"Searching for all games played from {start_date_str} to {end_date_str}...")

#     try:
#         # 2. Pull all pitch-by-pitch data in that range
#         all_data_in_range = pyb.statcast(start_dt=start_date_str, end_dt=end_date_str)
        
#     except Exception as e:
#         print(f"Error retrieving Statcast data: {e}")
#         return pd.DataFrame()

#     if all_data_in_range.empty:
#         print(f"No games found between {start_date_str} and {end_date_str}.")
#         return pd.DataFrame()

#     print(f"Successfully pulled {len(all_data_in_range)} pitch events.")

#     # 3. Sort the data chronologically by game_pk, inning, etc.
#     data_sorted = all_data_in_range.sort_values(
#         by=['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'pitch_number'],
#         ascending=True
#     )

#     # 4. Group by game_pk and take the last row (which contains the final score)
#     final_events = data_sorted.groupby('game_pk').tail(1).reset_index(drop=True)
    
#     # 5. Extract and rename the relevant columns for the final scoreboard
#     scoreboard = final_events[[
#         'game_date', 
#         'home_team', 
#         'away_team', 
#         'home_score', 
#         'away_score'
#     ]].copy()
    
#     scoreboard.rename(columns={
#         'home_score': 'Home_Final_Score',
#         'away_score': 'Away_Final_Score',
#         'game_date': 'Date'
#     }, inplace=True)
    
#     # 6. Determine the Winner
#     scoreboard['Winner'] = scoreboard.apply(
#         lambda row: row['home_team'] if row['Home_Final_Score'] > row['Away_Final_Score'] else row['away_team'],
#         axis=1
#     )
#     scoreboard['Result'] = (
#         scoreboard['Winner'] + ' wins ' + 
#         scoreboard['Home_Final_Score'].astype(str) + '-' + 
#         scoreboard['Away_Final_Score'].astype(str)
#     )
    
#     return scoreboard[['Date', 'away_team', 'home_team', 'Away_Final_Score', 'Home_Final_Score', 'Winner', 'Result']]

# # --- EXECUTION ---
# results_yesterday_df    = get_game_results_last_n_days(n_days= 1)
# results_last_7_days_df  = get_game_results_last_n_days(n_days= 7)
# results_last_15_days_df = get_game_results_last_n_days(n_days= 15)
# results_last_30_days_df = get_game_results_last_n_days(n_days= 30)
# results_last_60_days_df = get_game_results_last_n_days(n_days= 60)
# results_last_90_days_df = get_game_results_last_n_days(n_days= 90)


# # if not results_last_90_days_df.empty:
# #     print(f"\n--- Game Results from the Last 90 Days ({len(results_last_90_days_df)} Games Found) ---")
# #     print(results_last_90_days_df.tail(10)) # Print the last 10 games found
# # else:
# #     print("\nNo games were found in the last 90 days.")

Searching for all games played from 2025-12-17 to 2025-12-17...
This is a large query, it may take a moment to complete
Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-17 and 2025-12-17.
Searching for all games played from 2025-12-11 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-11 and 2025-12-17.
Searching for all games played from 2025-12-03 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-03 and 2025-12-17.
Searching for all games played from 2025-11-18 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-11-18 and 2025-12-17.
Searching for all games played from 2025-10-19 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:03<00:00,  7.52it/s]

Successfully pulled 2970 pitch events.
Searching for all games played from 2025-09-19 to 2025-12-17...
This is a large query, it may take a moment to complete



  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 58/58 [00:04<00:00, 14.14it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 53951 pitch events.


### Create fact_statcast_pitches

In [None]:
def create_fact_statcast_events_pitch_by_pitch(engine: Engine, n_days= 90):
    """Pull Statcast pitch-by-pitch data for the last ``n_days`` and store it.

    The window runs from ``today - n_days`` through yesterday. The pulled
    rows replace the 'fact_statcast_pitches' table.

    Args:
        engine: SQLAlchemy engine used for the write.
        n_days: How many days back the window starts (default 90).

    Returns:
        The DataFrame that was pulled, or an empty DataFrame when the
        download failed or returned no rows. A load failure is caught and
        printed, so a non-empty return value does not guarantee the table
        was written.
    """
    def load_fact_statcast_events(engine: Engine, df):
        """Write ``df`` to 'fact_statcast_pitches', replacing the table."""
        try:
            table_name = 'fact_statcast_pitches'
            print(f"üíæ Creating {table_name}...")

            # Loading
            print(f"   üîÉ Loading {len(df)} rows...")

            df.to_sql(
                table_name,
                engine,
                if_exists='replace',
                index=False,
                chunksize=5000
            )

            print(f"   ‚úÖ Successfully added {len(df)} new rows of data.")

        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")

    # Get today's date
    today = date.today()

    # Calculate the start and end dates for the n-day range
    end_date = today - timedelta(days=1)  # Search up to yesterday
    start_date = today - timedelta(days=n_days)

    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')

    print(f"Searching for all games played from {start_date_str} to {end_date_str}...")

    try:
        # Pull all pitch-by-pitch data in that range
        fact_statcast_pitches_last_n_days = pyb.statcast(start_dt=start_date_str, end_dt=end_date_str)

    except Exception as e:
        print(f"Error retrieving Statcast data: {e}")
        return pd.DataFrame()

    # Nothing to load: leave the existing table untouched
    if fact_statcast_pitches_last_n_days.empty:
        print(f"No games found between {start_date_str} and {end_date_str}.")
        return pd.DataFrame()

    print(f"Successfully pulled {len(fact_statcast_pitches_last_n_days)} pitch events for the last {n_days} days.")

    # Replace the table with the freshly pulled window
    load_fact_statcast_events(engine, fact_statcast_pitches_last_n_days)

    # Return a DataFrame on every path (the failure paths already do)
    return fact_statcast_pitches_last_n_days

Searching for all games played from 2025-09-19 to 2025-12-17...
This is a large query, it may take a moment to complete
Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 58/58 [00:04<00:00, 13.99it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 53951 pitch events.


In [None]:
# Find the max date in the fact_statcast_pitches
def get_latest_date_from_db():
    """Return MAX(game_date) from 'fact_statcast_pitches'.

    Uses the module-level ``engine``. The value is whatever the query's
    scalar result is.
    NOTE(review): presumably None when the table has no rows - confirm.
    """
    latest_date_query = text("SELECT MAX(game_date) FROM fact_statcast_pitches")

    # The connection is closed as soon as the scalar has been read
    with engine.connect() as connection:
        return connection.execute(latest_date_query).scalar()

# Execute and calculate fetch window
last_date = get_latest_date_from_db()

if last_date is not None:
    # Normalise whatever the driver returned (date, datetime or ISO string)
    # to a plain date so the window arithmetic below always works
    last_day = pd.Timestamp(last_date).date()

    # Start fetching from the day AFTER the last recorded date, up to yesterday
    start_day = last_day + timedelta(days=1)
    end_day = date.today() - timedelta(days=1)

    fetch_start = start_day.strftime('%Y-%m-%d')
    fetch_end = end_day.strftime('%Y-%m-%d')

    if start_day <= end_day:
        print(f"üîÑ Last data was {last_date}. Fetching from {fetch_start} to {fetch_end}...")
        new_data = pyb.statcast(start_dt=fetch_start, end_dt=fetch_end)

        if new_data.empty:
            # Typical during the offseason: there is nothing to append
            print("No new pitch events found in that window; nothing was appended.")
        else:
            # Same chunk size as the other loads in this notebook
            new_data.to_sql(
                'fact_statcast_pitches',
                engine,
                if_exists='append',
                index=False,
                chunksize=5000,
            )
    else:
        print("‚úÖ Database is already up to date.")
else:
    print("Empty table. You need to run an initial seed fetch.")


# TEST

### Splits by team

In [None]:
# Opponent codes that teams_split recognises as a direct-matchup split.
_LEAGUE_SPLIT_OPPONENTS = (
    'ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL',
    'DET', 'HOU', 'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
    'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN',
)

# Split types whose URL fragment is fixed: split_type -> (category, label).
_LEAGUE_SPLIT_FIXED = {
    'first_batter_game': ('leado', '1st%20Batter%20G'),
    'vs_power_pitcher': ('power', 'vs.%20Power'),
    'vs_weak_pitcher': ('power', 'vs.%20Finesse'),
    'vs_less_than_500_WP': ('oppon', 'WP%20%3C%20.500'),
    'vs_greater_or_equal_than_500_WP': ('oppon', 'WP%20%3E%3D%20.500'),
}

# Column names applied to the cleaned table when clean_mode is not 1.
_LEAGUE_SPLIT_COLUMNS = [
    'Team', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
    'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDP', 'HBP', 'SH',
    'SF', 'IBB', 'ROE', 'BAbip', 'tOPS+', 'sOPS+',
]


def _league_split_url(split_type, year):
    """Build the league-wide split URL for ``split_type``; raise ValueError if unknown."""
    if split_type in ('LHP', 'RHP'):  # vs left/right-handed pitchers
        category, label = 'plato', f"vs%20{split_type}"
    elif split_type in ('7', '14', '28'):  # last 7, 14 and 28 days
        category, label = 'total', f"Last%20{split_type}%20days"
    elif split_type in ('RH', 'LH'):  # vs right/left-handed starters
        category, label = 'plato', f"vs%20{split_type}%20Starter"
    elif split_type in ('Home', 'Away'):  # home and away games
        category, label = 'hmvis', split_type
    elif split_type in _LEAGUE_SPLIT_OPPONENTS:  # direct matchup vs one team
        category, label = 'oppon', split_type
    elif split_type in _LEAGUE_SPLIT_FIXED:
        category, label = _LEAGUE_SPLIT_FIXED[split_type]
    else:
        raise ValueError(f"Unsupported split_type: {split_type!r}")

    return (
        "https://www.baseball-reference.com/tools/split_stats_lg.cgi"
        f"?full=1&params={category}%7C{label}%7CML%7C{year}%7Cbat%7CAB%7C"
    )


def _clean_league_split_table(df, clean_mode):
    """Convert the one-column text dump of the table into a labelled DataFrame."""
    # Remove 'Roe' exactly (case-sensitive); the non-1 mode also removes 'GS'.
    lines = df[0].str.replace('Roe', '', regex=False)
    if clean_mode != 1:
        lines = lines.str.replace('GS', '', regex=False)

    # Remove last row
    lines = lines.iloc[:-1]

    if clean_mode != 1:
        # Drop rows containing 'Rk' (repeated headers) but keep the first row.
        repeated_header = (lines.index > 0) & lines.str.contains('Rk', na=False)
        lines = lines[~repeated_header]

    # Split each text row on spaces (at most 30 splits).
    table = lines.str.split(" ", n=30, expand=True)

    # Promote the first row to the header.
    table.columns = table.iloc[0]
    table = table[1:].reset_index(drop=True)

    if clean_mode == 1:
        # Remove the last column, then rename the (new) last three columns.
        table = table.iloc[:, :-1]
        # Build a fresh label list rather than writing into the Index's
        # backing array, which pandas treats as immutable.
        labels = list(table.columns)
        labels[-3:] = ["BAbip", "tOPS+", "sOPS+"]
        table.columns = labels
        # Remove the first column
        table = table.iloc[:, 1:]
    else:
        # Remove the first column and the last 2 columns, then rename all.
        table = table.iloc[:, 1:]
        table = table.iloc[:, :-2]
        table.columns = _LEAGUE_SPLIT_COLUMNS

    return table


def teams_split(split_type, clean_mode):
    """Scrape one league-wide batting split table from Baseball-Reference.

    A headless browser (Brave driven through Selenium's Chrome driver) loads
    the split page for the current calendar year, the text of the ``split1``
    table is read, and the rows are cleaned into a DataFrame.

    Args:
        split_type: Which split to request: 'LHP'/'RHP', '7'/'14'/'28',
            'RH'/'LH', 'Home'/'Away', 'first_batter_game',
            'vs_power_pitcher', 'vs_weak_pitcher', 'vs_less_than_500_WP',
            'vs_greater_or_equal_than_500_WP', or an opponent code such as
            'BAL'.
        clean_mode: 1 keeps the scraped header and renames only the last
            three columns; any other value also strips 'GS' and repeated
            header rows and assigns the full fixed list of column names.

    Returns:
        The cleaned split table as a pandas DataFrame.

    Raises:
        ValueError: If ``split_type`` is not recognised. This is raised
            before any browser is started.
        Exception: Whatever Selenium raises when the page or table cannot be
            loaded; the browser is always closed first.
    """
    # Define year
    year = datetime.now().year

    # Validate the split before paying for a browser launch.
    url = _league_split_url(split_type, year)

    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    # Name of the table
    datatable_id = 'split1'
    datatable_xpath = f"//table[@id='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Re-raise: there is no table to read, and continuing would only
            # fail later on a closed session.
            raise
        print(f"{datatable_id} ({split_type}) table loaded successfully.")

        # Extra pause after the table is present, kept from the original flow.
        time.sleep(10)

        # Locate the table and grab its text
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Close the WebDriver on every path so no browser process leaks.
        driver.quit()

    # Process the table content
    rows = text_content.split("\n")
    table_data = [row.split("\t") for row in rows]

    # Convert to dataframe
    df = pd.DataFrame(table_data)

    return _clean_league_split_table(df, clean_mode)

# Scrape every league-wide split. Each teams_split call starts its own browser
# session, so this cell performs one full page load per line below.
# clean_mode=0 is used for the tables annotated "GS empty"; the others use 1.
team_vs_lhp             = teams_split(split_type= 'LHP',  clean_mode= 0) # GS empty
team_vs_rhp             = teams_split(split_type= 'RHP',  clean_mode= 0) # GS empty
team_vs_lh_starters     = teams_split(split_type= 'LH',   clean_mode= 1)
team_vs_rh_starters     = teams_split(split_type= 'RH',   clean_mode= 1)
team_last_seven_days    = teams_split(split_type= '7',    clean_mode= 1)
team_last_fourteen_days = teams_split(split_type= '14',   clean_mode= 1)
team_last_28_days       = teams_split(split_type= '28',   clean_mode= 1)
team_home_games         = teams_split(split_type= 'Home', clean_mode= 1)
team_away_games         = teams_split(split_type= 'Away', clean_mode= 1)
team_first_batter_game  = teams_split(split_type= 'first_batter_game', clean_mode= 0) # GS empty
team_vs_power_pitcher   = teams_split(split_type= 'vs_power_pitcher',  clean_mode= 0) # GS empty
team_vs_weak_pitcher    = teams_split(split_type= 'vs_weak_pitcher',   clean_mode= 0) # GS empty
team_vs_power_team      = teams_split(split_type= 'vs_greater_or_equal_than_500_WP', clean_mode= 1)
team_vs_weak_team       = teams_split(split_type= 'vs_less_than_500_WP',             clean_mode= 1)

# Direct matchups: one scrape per opponent. The split_type is the code sent in
# the request URL; a few differ from the variable name and from the key used in
# dic_team below (ANA -> laa, FLA -> mia, TBD -> tbr).
team_laa = teams_split(split_type= 'ANA', clean_mode= 1)
team_ari = teams_split(split_type= 'ARI', clean_mode= 1)
team_atl = teams_split(split_type= 'ATL', clean_mode= 1)
team_bal = teams_split(split_type= 'BAL', clean_mode= 1)
team_bos = teams_split(split_type= 'BOS', clean_mode= 1)
team_chc = teams_split(split_type= 'CHC', clean_mode= 1)
team_chw = teams_split(split_type= 'CHW', clean_mode= 1)
team_cin = teams_split(split_type= 'CIN', clean_mode= 1)
team_cle = teams_split(split_type= 'CLE', clean_mode= 1)
team_col = teams_split(split_type= 'COL', clean_mode= 1)
team_det = teams_split(split_type= 'DET', clean_mode= 1)
team_hou = teams_split(split_type= 'HOU', clean_mode= 1)
team_kcr = teams_split(split_type= 'KCR', clean_mode= 1)
team_lad = teams_split(split_type= 'LAD', clean_mode= 1)
team_mia = teams_split(split_type= 'FLA', clean_mode= 1) 
team_mil = teams_split(split_type= 'MIL', clean_mode= 1)
team_min = teams_split(split_type= 'MIN', clean_mode= 1)
team_nym = teams_split(split_type= 'NYM', clean_mode= 1)
team_nyy = teams_split(split_type= 'NYY', clean_mode= 1)
team_oak = teams_split(split_type= 'OAK', clean_mode= 1)
team_phi = teams_split(split_type= 'PHI', clean_mode= 1)
team_pit = teams_split(split_type= 'PIT', clean_mode= 1)
team_sdp = teams_split(split_type= 'SDP', clean_mode= 1)
team_sea = teams_split(split_type= 'SEA', clean_mode= 1)
team_sfg = teams_split(split_type= 'SFG', clean_mode= 1)
team_stl = teams_split(split_type= 'STL', clean_mode= 1)
team_tbr = teams_split(split_type= 'TBD', clean_mode= 1)
team_tex = teams_split(split_type= 'TEX', clean_mode= 1)
team_tor = teams_split(split_type= 'TOR', clean_mode= 1)
team_wsn = teams_split(split_type= 'WSN', clean_mode= 1)

# Opponent frames keyed by the identifier that ends up in the 'ID' column.
# NOTE(review): the keys ('AZ', 'KC', 'ATH', 'SD', 'SF', 'TB', 'WSH', ...) look
# like the abbreviations used by the Statcast tables - confirm against the
# consumer of direct_matches.
dic_team = {
    'LAA': team_laa,
    'AZ':  team_ari,
    'ATL': team_atl,
    'BAL': team_bal,
    'BOS': team_bos,
    'CHC': team_chc,
    'CHW': team_chw,
    'CIN': team_cin,
    'CLE': team_cle,
    'COL': team_col,
    'DET': team_det,
    'HOU': team_hou,
    'KC':  team_kcr,
    'LAD': team_lad,
    'MIA': team_mia,
    'MIL': team_mil,
    'MIN': team_min,
    'NYM': team_nym,
    'NYY': team_nyy,
    'ATH': team_oak,
    'PHI': team_phi,
    'PIT': team_pit,
    'SD':  team_sdp,
    'SEA': team_sea,
    'SF':  team_sfg,
    'STL': team_stl,
    'TB':  team_tbr,
    'TEX': team_tex,
    'TOR': team_tor,
    'WSH': team_wsn   
    }

# Tag every opponent frame with its dictionary key. The frames are modified in
# place, so the team_* variables above gain the 'ID' column as well.
for key, df in dic_team.items():
    df['ID'] = key  # Assign the dictionary key as the ID

# Stack all opponent frames into one table with a fresh 0..n-1 index.
direct_matches = pd.concat(dic_team.values(), ignore_index=True)  # Resets index

# The remaining (non-opponent) splits, addressable by name.
dic_splits = {
    'team_vs_lhp'        :team_vs_lhp,        
    'team_vs_rhp'        :team_vs_rhp,
    'team_vs_lh_starters':team_vs_lh_starters,
    'team_vs_rh_starters':team_vs_rh_starters,
    'team_last_seven_days':team_last_seven_days,
    'team_last_fourteen_days':team_last_fourteen_days,
    'team_last_28_days':team_last_28_days,
    'team_home_games':team_home_games,
    'team_away_games':team_away_games,
    'team_first_batter_game':team_first_batter_game,
    'team_vs_power_pitcher':team_vs_power_pitcher,
    'team_vs_weak_pitcher':team_vs_weak_pitcher,
    'team_vs_power_team':team_vs_power_team,
    'team_vs_weak_team':team_vs_weak_team      
}



### Create batting_splits

In [47]:
# 1. Scrape settings
YEAR = 2025                   # Season requested from Baseball-Reference
DATATABLE_ID = 'team_split1'  # HTML id of the table read from each split page
MAX_RETRIES = 3               # Attempts per (team, split) before it is skipped

# 2. Define the lists for iteration
team_abbreviations = ['BAL']

# One entry per split page: 'type' is the token batter_split turns into a URL,
# 'desc' is the label written to the 'description' column of the result.
split_parameters = [
    {'type': 'LHP',            'desc': 'vs_LHP'},
    {'type': 'RHP',            'desc': 'vs_RHP'},
    {'type': '7',              'desc': 'last_7_days'},
    {'type': '14',             'desc': 'last_14_days'},
    {'type': '28',             'desc': 'last_28_days'},
    {'type': 'Home',           'desc': 'home_games'},
    {'type': 'Away',           'desc': 'away_games'},
    {'type': 'RH',             'desc': 'vs_RH_Starters'},
    {'type': 'LH',             'desc': 'vs_LH_Starters'},
    {'type': '1st',            'desc': '1st_Half'},
    {'type': '2nd',            'desc': '2nd_Half'},
    {'type': 'April%2FMarch',  'desc': 'April_March'},
    # May was missing from the month list although batter_split supports it.
    {'type': 'May',            'desc': 'May_Splits'},
    {'type': 'June',           'desc': 'June_Splits'},
    {'type': 'July',           'desc': 'July_Splits'},
    {'type': 'August',         'desc': 'August_Splits'},
    {'type': 'Sept%2FOct',     'desc': 'Sept_Oct_Splits'},
    {'type': 'C',              'desc': 'C_Position'},
    {'type': '1B',             'desc': '1B_Position'},
    {'type': '2B',             'desc': '2B_Position'},
    {'type': '3B',             'desc': '3B_Position'},
    {'type': 'SS',             'desc': 'SS_Position'},
    {'type': 'LF',             'desc': 'LF_Position'},
    {'type': 'CF',             'desc': 'CF_Position'},
    {'type': 'RF',             'desc': 'RF_Position'},
    {'type': 'DH',             'desc': 'DH_Position'},
    {'type': 'PH',             'desc': 'PH_Position'},
    {'type': '1st%20Batter',   'desc': 'First_Batter_Game'},
    {'type': 'Leadoff%20Inn.', 'desc': 'First_Batter_Inning'},
    {'type': 'Batting%201st',  'desc': 'Batting_1st'},
    {'type': 'Batting%202nd',  'desc': 'Batting_2nd'},
    {'type': 'Batting%203rd',  'desc': 'Batting_3rd'},
    {'type': 'Batting%204th',  'desc': 'Batting_4th'},
    {'type': 'Batting%205th',  'desc': 'Batting_5th'},
    {'type': 'Batting%206th',  'desc': 'Batting_6th'},
    {'type': 'Batting%207th',  'desc': 'Batting_7th'},
    {'type': 'Batting%208th',  'desc': 'Batting_8th'},
    {'type': 'Batting%209th',  'desc': 'Batting_9th'},
    # NOTE(review): batter_split requests the next three as "Innings 1-3/4-6/7-9"
    # (category 'innng'), while the descriptions say "in_the_lineup" - confirm
    # which meaning is intended before relying on these labels.
    {'type': '1-3',            'desc': 'in_the_lineup_1-3rd'},
    {'type': '4-6',            'desc': 'in_the_lineup_4-6th'},
    {'type': '7-9',            'desc': 'in_the_lineup_7-9th'},
    {'type': 'SP',             'desc': 'vs_SP'},
    {'type': 'RP',             'desc': 'vs_RP'},
    {'type': 'Power',          'desc': 'vs_Power_Pitchers'},
    {'type': 'Finesse',        'desc': 'vs_Finesse_Pitchers'},
    {'type': 'ANA',            'desc': 'batting_vs_ANA'},
    {'type': 'ARI',            'desc': 'batting_vs_ARI'},
    {'type': 'ATL',            'desc': 'batting_vs_ATL'},
    {'type': 'BAL',            'desc': 'batting_vs_BAL'},
    {'type': 'BOS',            'desc': 'batting_vs_BOS'},
    {'type': 'CHC',            'desc': 'batting_vs_CHC'},
    {'type': 'CHW',            'desc': 'batting_vs_CHW'},
    {'type': 'CIN',            'desc': 'batting_vs_CIN'},
    {'type': 'CLE',            'desc': 'batting_vs_CLE'},
    {'type': 'COL',            'desc': 'batting_vs_COL'},
    {'type': 'DET',            'desc': 'batting_vs_DET'},
    {'type': 'HOU',            'desc': 'batting_vs_HOU'},
    {'type': 'KCR',            'desc': 'batting_vs_KCR'},
    {'type': 'LAD',            'desc': 'batting_vs_LAD'},
    {'type': 'FLA',            'desc': 'batting_vs_FLA'},
    {'type': 'MIL',            'desc': 'batting_vs_MIL'},
    {'type': 'MIN',            'desc': 'batting_vs_MIN'},
    {'type': 'NYM',            'desc': 'batting_vs_NYM'},
    {'type': 'NYY',            'desc': 'batting_vs_NYY'},
    {'type': 'OAK',            'desc': 'batting_vs_OAK'},
    {'type': 'PHI',            'desc': 'batting_vs_PHI'},
    {'type': 'PIT',            'desc': 'batting_vs_PIT'},
    {'type': 'SDP',            'desc': 'batting_vs_SDP'},
    {'type': 'SEA',            'desc': 'batting_vs_SEA'},
    {'type': 'SFG',            'desc': 'batting_vs_SFG'},
    {'type': 'STL',            'desc': 'batting_vs_STL'},
    {'type': 'TBD',            'desc': 'batting_vs_TBD'},
    {'type': 'TEX',            'desc': 'batting_vs_TEX'},
    {'type': 'TOR',            'desc': 'batting_vs_TOR'},
    {'type': 'WSN',            'desc': 'batting_vs_WSN'},
    {'type': 'Day',            'desc': 'batting_Day_Games'},
    {'type': 'Night',          'desc': 'batting_Night_Games'},
    {'type': 'Grass',          'desc': 'batting_Grass_Field_Games'},
    {'type': 'Artif.%20Turf',  'desc': 'batting_Artificial_Turf_Games'},
]

# Helper function to initialize driver
def initialize_driver():
    """Create a headless Selenium Chrome driver backed by the local Brave binary.

    Returns:
        The started driver with a 60-second page-load timeout, or ``None``
        (after printing the error) when the browser could not be started.
    """
    browser_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--headless=new",
        # Use a recent, common User-Agent
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    )

    chrome_options = Options()
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    # NOTE: this path must point at the local Brave installation
    chrome_options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    # Any start-up failure is reported and signalled to the caller with None.
    try:
        browser = webdriver.Chrome(options=chrome_options)
        browser.set_page_load_timeout(60)
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        return None
    return browser


#  batter_split function
# URL rules for team batting splits: (accepted split types, split category,
# label template). "{}" in the template is replaced by the split type itself.
_TEAM_BATTING_SPLIT_RULES = (
    (('LHP', 'RHP'), 'plato', 'vs%20{}'),                      # vs LHP / RHP pitchers
    (('7', '14', '28'), 'total', 'Last%20{}%20days'),          # last 7, 14 and 28 days
    (('RH', 'LH'), 'plato', 'vs%20{}%20Starter'),              # vs RH / LH starters
    (('Home', 'Away'), 'hmvis', '{}'),                         # home and away games
    (('1st', '2nd'), 'half', '{}%20Half'),                     # 1st / 2nd half of the season
    (('April%2FMarch', 'May', 'June', 'July', 'August', 'Sept%2FOct'), 'month', '{}'),
    (('C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH', 'PH'), 'defp', 'as%20{}'),
    (('1st%20Batter',), 'leado', '{}%20G'),                    # first batter of the game
    (('Leadoff%20Inn.',), 'leado', '{}'),                      # first batter of the inning
    (('Batting%201st', 'Batting%202nd', 'Batting%203rd',
      'Batting%204th', 'Batting%205th', 'Batting%206th',
      'Batting%207th', 'Batting%208th', 'Batting%209th'), 'lineu', '{}'),
    (('1-3', '4-6', '7-9'), 'innng', 'Innings%20{}'),          # innings 1-3 / 4-6 / 7-9
    (('SP', 'RP'), 'times', 'vs.%20{}'),                       # vs SP or RP
    (('Power', 'avg.P%2FF', 'Finesse'), 'power', 'vs.%20{}'),
    (('ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL',
      'DET', 'HOU', 'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
      'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN'),
     'oppon', '{}'),                                           # vs one opponent
    (('Day', 'Night', 'Grass', 'Artif.%20Turf'), 'stad', '{}'),
)


def _team_batting_split_url(split_type, team_abv, year):
    """Return the split page URL for one team, or None if the split is unknown."""
    for accepted, category, template in _TEAM_BATTING_SPLIT_RULES:
        if split_type in accepted:
            label = template.format(split_type)
            return (
                "https://www.baseball-reference.com/tools/split_stats_team.cgi"
                f"?full=1&params={category}%7C{label}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
            )
    return None


def batter_split(driver, split_type, team_abv, year, datatable_id, description):
    """Scrape one batting split table for a team and return it as a DataFrame.

    Args:
        driver: A live Selenium WebDriver; it is reused, never quit here.
        split_type: Split token (already URL-encoded where needed), e.g.
            'LHP', '28', 'April%2FMarch', 'Batting%203rd', 'BAL'.
        team_abv: Team code placed in the request URL.
        year: Season placed in the request URL and in the 'year' column.
        datatable_id: HTML ``id`` of the table to read.
        description: Label written to the 'description' column and used in
            log messages.

    Returns:
        * The cleaned table with 'description', 'team' and 'year' columns
          added, on success.
        * An empty DataFrame when ``split_type`` is not supported.
        * ``None`` when the page load times out, the table does not appear
          within 30 seconds, or its HTML cannot be parsed.

    Raises:
        WebDriverException: Driver failures other than timeouts are not
            swallowed, so the caller can detect a dead session.
    """
    # --- URL CONSTRUCTION ---
    url = _team_batting_split_url(split_type, team_abv, year)
    if url is None:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None  # Let the main loop handle the retry/skip

    datatable_xpath = f"//table[@id='{datatable_id}']"

    # --- SCRAPING LOGIC ---
    try:
        # Wait up to 30 seconds for the table; until() hands back the element.
        table_element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
    except TimeoutException:
        # Only the timeout means "table missing"; other driver errors propagate.
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None
    print(f"[{team_abv} - {description}]: Table loaded successfully.")

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    try:
        tables = pd.read_html(StringIO(table_html), flavor='lxml')
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    # --- CLEANING LOGIC ---
    df = tables[0].copy()

    # Trim the header names and keep only letters, digits and underscores.
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns.str.strip()]

    # Drop header rows repeated inside the table body.
    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']

    # Remove last row (Totals); copy so the column assignments below act on an
    # independent frame rather than a slice.
    df = df.iloc[:-1].copy()

    df['description'] = description
    df['team'] = team_abv
    # Use the season that was actually requested, not the module-level constant.
    df['year'] = year

    return df

# Master loop with driver reuse and retry logic

batting_splits = pd.DataFrame()
scraped_frames = []  # one cleaned frame per successful (team, split) scrape
scraped_rows = 0     # running row count, reported after every success
driver = initialize_driver()

if driver is None:
    # Stop if the driver failed to initialize. SystemExit is raised directly
    # (as in the restart path below) instead of calling the interactive
    # exit() helper.
    raise SystemExit("Driver initialization failed. Terminating.")

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:

            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Probe the session: reading the current URL raises
                    # WebDriverException if the driver is no longer alive.
                    driver.current_url

                    new_df = batter_split(
                        driver=driver,
                        split_type=split['type'],
                        team_abv=team_abv,
                        year=YEAR,
                        datatable_id=DATATABLE_ID,
                        description=split['desc']
                    )

                    if new_df is not None and not new_df.empty:
                        # Collect now, concatenate once at the end: rebuilding
                        # the master frame on every success copies all earlier
                        # rows again each time.
                        scraped_frames.append(new_df)
                        scraped_rows += len(new_df)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {scraped_rows}")
                        break  # Break the retry loop on success

                    # No usable table came back (None or empty): retry
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2)  # Short wait before retry

                except WebDriverException:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")

                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass  # Ignore errors on quitting a dead driver

                    # Restart the driver
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")

                    time.sleep(5)  # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")

                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break  # Break retry loop on unexpected failure

            # Runs only when every attempt was used up without a `break`
            else:
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")

finally:
    try:
        # Assemble whatever was collected, even if the run was interrupted.
        if scraped_frames:
            batting_splits = pd.concat(scraped_frames, ignore_index=True)
    finally:
        # CLEANUP: Quit the driver ONCE after all loops are finished
        print("-" * 30)
        print("All tasks finished. Quitting driver.")
        if driver is not None:
            driver.quit()

print("Scraping Complete.")
print(f"Final DataFrame Shape: {batting_splits.shape}")

Starting Scrape Job with Driver Reuse and Retry Logic...
------------------------------
[BAL - vs_LHP]: Table loaded successfully.
SUCCESS: Appended 30 rows. Master DF size: 30
[BAL - vs_RHP]: Table loaded successfully.
SUCCESS: Appended 34 rows. Master DF size: 64
[BAL - last_7_days]: Table loaded successfully.
SUCCESS: Appended 15 rows. Master DF size: 79
[BAL - last_14_days]: Table loaded successfully.
SUCCESS: Appended 16 rows. Master DF size: 95
[BAL - last_28_days]: Table loaded successfully.
SUCCESS: Appended 18 rows. Master DF size: 113
[BAL - home_games]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 151
[BAL - away_games]: Table loaded successfully.
SUCCESS: Appended 37 rows. Master DF size: 188
[BAL - vs_RH_Starters]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 226
[BAL - vs_LH_Starters]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 264
[BAL - 1st_Half]: Table loaded successfully.
SUCCESS: Appended 34

### Pitching splits

In [None]:
# --- CONFIGURATION ---
YEAR = 2025                   # Season placed in every request
DATATABLE_ID = 'team_split1'  # HTML id of the table read from each page
MAX_RETRIES = 3               # Attempts per (team, split)

# Teams to iterate over
team_abbreviations = ['BAL']


def _pitching_split_parameters():
    """Assemble the ordered list of {'type', 'desc'} pitching split entries."""
    ordinals = ('1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th')
    opponents = (
        'ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL',
        'DET', 'HOU', 'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
        'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN',
    )

    # Handedness, recency, venue, season half, month and leadoff splits.
    # NOTE(review): no 'May' entry exists between April/March and June -
    # confirm whether that omission is intentional.
    pairs = [
        ('LHB', 'vs_LHB'),
        ('RHB', 'vs_RHB'),
        ('7', 'last_7_days'),
        ('14', 'last_14_days'),
        ('28', 'last_28_days'),
        ('Home', 'home_games'),
        ('Away', 'away_games'),
        ('1st', '1st_half'),
        ('2nd', '2nd_half'),
        ('April%2FMarch', 'april_march'),
        ('June', 'june_splits'),
        ('July', 'july_splits'),
        ('August', 'august_splits'),
        ('Sept%2FOct', 'sept_oct_Splits'),
        ('1st%20Batter', 'first_batter_game'),
        ('Leadoff%20Inn.', 'first_batter_inning'),
    ]

    # Opposing batting-order slot, 1st through 9th.
    pairs += [(f'Batting%20{slot}', f'pitching_vs_{slot}') for slot in ordinals]

    # Pitcher role, run support and first-pitch outcome.
    pairs += [
        ('Starter', 'as_starter'),
        ('Reliever', 'as_reliever'),
        ('0-2%20Runs', 'run_support_0_2'),
        ('3-5%20Runs', 'run_support_3_5'),
        ('6%2B%20Runs', 'run_support_6_plus'),
        ('Swung%20at%201st%20Pitch', 'outcome_of_at_bat_when_swung_at_first_pitch'),
        ('Took%201st%20Pitch', 'outcome_of_at_bat_when_took_first_pitch'),
    ]

    # Number of outs in the inning: 0, 1, 2.
    pairs += [(str(outs), f'{outs}_outs_in_the_inning') for outs in range(3)]

    # Inning pitched, 1st through 9th.
    pairs += [(f'innng%7C{inning}', f'pitching_in_{inning}_inning') for inning in ordinals]

    # One entry per opponent code.
    pairs += [(code, f'pitching_vs_{code}') for code in opponents]

    # Time of day and playing surface.
    pairs += [
        ('Day', 'pitching_Day_Games'),
        ('Night', 'pitching_Night_Games'),
        ('Grass', 'pitching_Grass_Field_Games'),
        ('Artif.%20Turf', 'pitching_Artificial_Turf_Games'),
    ]

    return [{'type': split_type, 'desc': label} for split_type, label in pairs]


# Splits to iterate over, in request order
split_parameters = _pitching_split_parameters()

# Helper function to initialize driver
def initialize_driver():
    """Start a headless Selenium Chrome driver backed by the local Brave binary.

    Returns:
        The started WebDriver with a 60-second page-load timeout, or None
        when the browser could not be started (the error is printed).
    """
    # Flags are applied in this order; "--headless=new" is the only headless switch in use.
    launch_flags = (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--headless=new",
        # Use a recent, common User-Agent
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    )

    browser_options = Options()
    for flag in launch_flags:
        browser_options.add_argument(flag)

    # NOTE: machine-specific path to the Brave installation; it must stay correct.
    browser_options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    try:
        browser = webdriver.Chrome(options=browser_options)
        browser.set_page_load_timeout(60)
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        return None
    return browser


#  pitcher_split function
def _pitcher_split_params(split_type, team_abv):
    """Return the URL-encoded "<category>|<split>|<team>" fragment for a split, or None if unsupported."""
    ordinals = ("1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th")
    # Each rule: (accepted split_type values, template for the params fragment).
    # "%7C" is the URL-encoded "|" separator and "%20" an encoded space.
    rules = (
        # matchups vs LHB and RHB hitters
        (("LHB", "RHB"), "plato%7Cvs%20{split}%7C{team}"),
        # last 7, 14 and 28 days
        (("7", "14", "28"), "total%7CLast%20{split}%20days%7C{team}"),
        # home and away games
        (("Home", "Away"), "hmvis%7C{split}%7C{team}"),
        # 1st and 2nd half of the season
        (("1st", "2nd"), "half%7C{split}%20Half%7C{team}"),
        # each month
        (("April%2FMarch", "May", "June", "July", "August", "Sept%2FOct"),
         "month%7C{split}%7C{team}"),
        # first batter of the game
        (("1st%20Batter",), "leado%7C{split}%20G%7C{team}"),
        # first batter of the inning
        (("Leadoff%20Inn.",), "leado%7C{split}%7C{team}"),
        # batting-order position
        (tuple(f"Batting%20{nth}" for nth in ordinals), "lineu%7C{split}%7C{team}"),
        # NOTE(review): the "T" in front of the team code exists only in this
        # template and looks like a typo; it is kept unchanged here -- confirm
        # against a live baseball-reference URL before removing it.
        (("Starter", "Reliever"), "sprel%7Cas%20{split}%7CT{team}"),
        # run support
        (("0-2%20Runs", "3-5%20Runs", "6%2B%20Runs"), "rs%7C{split}%20Scored%7C{team}"),
        # first-pitch behaviour
        (("Swung%20at%201st%20Pitch", "Took%201st%20Pitch"), "tkswg%7C{split}%7C{team}"),
        # number of outs
        (("0", "1", "2"), "outs%7C{split}%20outs%7C{team}"),
        # inning splits: the split value already carries its "innng|" category prefix
        (tuple(f"innng%7C{nth}" for nth in ordinals), "{split}%20inning%7C{team}"),
        # opponent
        (("ANA", "ARI", "ATL", "BAL", "BOS", "CHC", "CHW", "CIN", "CLE", "COL",
          "DET", "HOU", "KCR", "LAD", "FLA", "MIL", "MIN", "NYM", "NYY", "OAK",
          "PHI", "PIT", "SDP", "SEA", "SFG", "STL", "TBD", "TEX", "TOR", "WSN"),
         "oppon%7C{split}%7C{team}"),
        # game conditions
        (("Day", "Night", "Grass", "Artif.%20Turf"), "stad%7C{split}%7C{team}"),
    )
    for accepted, template in rules:
        if split_type in accepted:
            return template.format(split=split_type, team=team_abv)
    return None


#  pitcher_split function
def pitcher_split(driver, split_type, team_abv, year, datatable_id, description):
    """Scrape one baseball-reference team pitching split table into a DataFrame.

    Args:
        driver: Selenium WebDriver used to load the page.
        split_type: URL-encoded split identifier (e.g. 'LHB', '7', 'Home',
            'Batting%201st', 'innng%7C1st', an opponent code, 'Day').
        team_abv: Team abbreviation placed in the URL and in the 'team' column.
        year: Season placed in the URL and in the 'year' column.
        datatable_id: HTML id of the <table> element to extract.
        description: Label written to the 'description' column and used in logs.

    Returns:
        - an empty DataFrame when split_type is not supported,
        - None when the page load times out, the table is not found within
          30 seconds, or pandas cannot parse the table HTML,
        - otherwise the table without repeated header rows and without its
          last row, plus 'description', 'team' and 'year' columns.
    """
    # --- URL CONSTRUCTION ---
    params = _pitcher_split_params(split_type, team_abv)
    if params is None:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    url = (
        "https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params="
        f"{params}%7C{year}%7Cpitch%7CAB%7C"
    )

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None  # Let the main loop handle the retry/skip

    datatable_xpath = f"//table[@id='{datatable_id}']"

    # --- SCRAPING LOGIC ---
    try:
        # Wait up to 30 seconds for the table; until() hands back the located element
        table_element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        print(f"[{team_abv} - {description}]: Table loaded successfully.")
    except Exception:
        # Deliberately broad: any failure while waiting is reported as "not found"
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    html_string = StringIO(table_html)

    try:
        tables = pd.read_html(html_string, flavor='lxml')
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    df = tables[0].copy()

    # --- CLEANING LOGIC ---
    # Normalise the column names: trim, then keep only letters, digits and underscores
    df.columns = df.columns.str.strip()
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    # Drop header rows that are repeated inside the table body
    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']

    # Remove last row (Totals); copy so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    df = df.iloc[:-1].copy()

    df['description'] = description
    df['team'] = team_abv
    # Use the season that was actually requested, not the module-level YEAR
    df['year'] = year

    return df

# Master loop with driver reuse and retry logic

# Scrape every (team, split) combination with one shared browser session and
# accumulate the results in `pitching_splits`.
# Relies on module-level names defined in the configuration above:
# team_abbreviations, split_parameters, MAX_RETRIES, YEAR and DATATABLE_ID.
pitching_splits = pd.DataFrame()
driver = initialize_driver()

if driver is None:
    exit() # Stop if the driver failed to initialize

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:
            
            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Check if the driver is still alive (by checking its current URL);
                    # a dead session raises WebDriverException, handled below
                    driver.current_url 
                    
                    new_df = pitcher_split(
                        driver=driver,
                        split_type=split['type'], 
                        team_abv=team_abv, 
                        year=YEAR, 
                        datatable_id=DATATABLE_ID, 
                        description=split['desc']
                    )
                    
                    if new_df is not None and not new_df.empty:
                        pitching_splits = pd.concat([pitching_splits, new_df], ignore_index=True)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {len(pitching_splits)}")
                        break # Break the retry loop on success
                    
                    # If new_df is None (due to TimeoutException/Table not found), retry.
                    # An empty DataFrame (unsupported split type, or a table with no
                    # data rows left after cleaning) takes this retry path as well.
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2) # Short wait before retry

                except WebDriverException as e:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")
                    
                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass # Ignore errors on quitting a dead driver
                    
                    # Restart the driver; the restart consumes one of the MAX_RETRIES attempts
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")
                        
                    time.sleep(5) # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")
                    
                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break # Break retry loop on unexpected failure (this also skips the for-else below)

            # for-else: runs only when the retry loop finished without a `break`,
            # i.e. every attempt failed and the split was not appended
            else: 
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")
                
finally:
    # 3. CLEANUP: Quit the driver ONCE after all loops are finished
    # (this block also runs when the loops are aborted by an exception or SystemExit)
    print("-" * 30)
    print("All tasks finished. Quitting driver.")
    if 'driver' in locals() and driver:
        driver.quit() 
    
print("Scraping Complete.")
print(f"Final DataFrame Shape: {pitching_splits.shape}")

### Pitching splits - Game Level

In [None]:
# --- CONFIGURATION ---
# Season to scrape, id of the HTML table to extract, and how many attempts
# each (team, split) page gets before it is skipped.
YEAR = 2025
DATATABLE_ID = 'team_split1' 
MAX_RETRIES = 3 

# 2. Define the lists for iteration
team_abbreviations = ['BAL']
# Each entry: 'type' is the URL-encoded split identifier understood by
# pitcher_split_game_level, 'desc' is the label stored with the scraped rows.
split_parameters = [
    {'type': '7',                        'desc': 'last_7_days'},
    {'type': '14',                       'desc': 'last_14_days'},
    {'type': '28',                       'desc': 'last_28_days'},
    {'type': 'Home',                     'desc': 'home_games'},
    {'type': 'Away',                     'desc': 'away_games'},
    {'type': '1st',                      'desc': '1st_half'},
    {'type': '2nd',                      'desc': '2nd_half'},
    {'type': 'April%2FMarch',            'desc': 'april_march'},
    # 'May' was missing from the monthly splits although the URL builder supports it
    {'type': 'May',                      'desc': 'may_splits'},
    {'type': 'June',                     'desc': 'june_splits'},
    {'type': 'July',                     'desc': 'july_splits'},
    {'type': 'August',                   'desc': 'august_splits'},
    {'type': 'Sept%2FOct',               'desc': 'sept_oct_Splits'},
    {'type': 'Starter',                  'desc': 'as_starter'},
    {'type': 'Reliever',                 'desc': 'as_reliever'},
    {'type': '0-2%20Runs',               'desc': 'run_support_0_2'},
    {'type': '3-5%20Runs',               'desc': 'run_support_3_5'},
    {'type': '6%2B%20Runs',              'desc': 'run_support_6_plus'},
    {'type': 'ANA',                      'desc': 'pitching_vs_ANA'},
    {'type': 'ARI',                      'desc': 'pitching_vs_ARI'},
    {'type': 'ATL',                      'desc': 'pitching_vs_ATL'},
    {'type': 'BAL',                      'desc': 'pitching_vs_BAL'},
    {'type': 'BOS',                      'desc': 'pitching_vs_BOS'},
    {'type': 'CHC',                      'desc': 'pitching_vs_CHC'},
    {'type': 'CHW',                      'desc': 'pitching_vs_CHW'},
    {'type': 'CIN',                      'desc': 'pitching_vs_CIN'},
    {'type': 'CLE',                      'desc': 'pitching_vs_CLE'},
    {'type': 'COL',                      'desc': 'pitching_vs_COL'},
    {'type': 'DET',                      'desc': 'pitching_vs_DET'},
    {'type': 'HOU',                      'desc': 'pitching_vs_HOU'},
    {'type': 'KCR',                      'desc': 'pitching_vs_KCR'},
    {'type': 'LAD',                      'desc': 'pitching_vs_LAD'},
    {'type': 'FLA',                      'desc': 'pitching_vs_FLA'},
    {'type': 'MIL',                      'desc': 'pitching_vs_MIL'},
    {'type': 'MIN',                      'desc': 'pitching_vs_MIN'},
    {'type': 'NYM',                      'desc': 'pitching_vs_NYM'},
    {'type': 'NYY',                      'desc': 'pitching_vs_NYY'},
    {'type': 'OAK',                      'desc': 'pitching_vs_OAK'},
    {'type': 'PHI',                      'desc': 'pitching_vs_PHI'},
    {'type': 'PIT',                      'desc': 'pitching_vs_PIT'},
    {'type': 'SDP',                      'desc': 'pitching_vs_SDP'},
    {'type': 'SEA',                      'desc': 'pitching_vs_SEA'},
    {'type': 'SFG',                      'desc': 'pitching_vs_SFG'},
    {'type': 'STL',                      'desc': 'pitching_vs_STL'},
    {'type': 'TBD',                      'desc': 'pitching_vs_TBD'},
    {'type': 'TEX',                      'desc': 'pitching_vs_TEX'},
    {'type': 'TOR',                      'desc': 'pitching_vs_TOR'},
    {'type': 'WSN',                      'desc': 'pitching_vs_WSN'},
    {'type': 'Day',                      'desc': 'pitching_Day_Games'},
    {'type': 'Night',                    'desc': 'pitching_Night_Games'},
    {'type': 'Grass',                    'desc': 'pitching_Grass_Field_Games'},
    {'type': 'Artif.%20Turf',            'desc': 'pitching_Artificial_Turf_Games'}
]

# Helper function to initialize driver
def initialize_driver():
    """Create the Selenium WebDriver used by the scraping loops.

    The browser is the local Brave installation, started headless through
    Selenium's Chrome driver.

    Returns:
        A WebDriver whose page-load timeout is 60 seconds, or None if the
        start-up failed (the failure is printed, not raised).
    """
    brave_path = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
    # Use a recent, common User-Agent
    user_agent_flag = "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

    chrome_opts = Options()
    for switch in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--headless=new", user_agent_flag):
        chrome_opts.add_argument(switch)
    # NOTE: this path is specific to the author's machine and must point at brave.exe
    chrome_opts.binary_location = brave_path

    try:
        session = webdriver.Chrome(options=chrome_opts)
        session.set_page_load_timeout(60)
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        session = None
    return session


# pitcher_split_game_level function
def _pitcher_split_game_level_params(split_type, team_abv):
    """Return the URL-encoded "<category>|<split>|<team>" fragment for a split, or None if unsupported."""
    ordinals = ("1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th")
    # Each rule: (accepted split_type values, template for the params fragment).
    # "%7C" is the URL-encoded "|" separator and "%20" an encoded space.
    rules = (
        # matchups vs LHB and RHB hitters
        (("LHB", "RHB"), "plato%7Cvs%20{split}%7C{team}"),
        # last 7, 14 and 28 days
        (("7", "14", "28"), "total%7CLast%20{split}%20days%7C{team}"),
        # home and away games
        (("Home", "Away"), "hmvis%7C{split}%7C{team}"),
        # 1st and 2nd half of the season
        (("1st", "2nd"), "half%7C{split}%20Half%7C{team}"),
        # each month
        (("April%2FMarch", "May", "June", "July", "August", "Sept%2FOct"),
         "month%7C{split}%7C{team}"),
        # first batter of the game
        (("1st%20Batter",), "leado%7C{split}%20G%7C{team}"),
        # first batter of the inning
        (("Leadoff%20Inn.",), "leado%7C{split}%7C{team}"),
        # batting-order position
        (tuple(f"Batting%20{nth}" for nth in ordinals), "lineu%7C{split}%7C{team}"),
        # NOTE(review): the "T" in front of the team code exists only in this
        # template and looks like a typo; it is kept unchanged here -- confirm
        # against a live baseball-reference URL before removing it.
        (("Starter", "Reliever"), "sprel%7Cas%20{split}%7CT{team}"),
        # run support
        (("0-2%20Runs", "3-5%20Runs", "6%2B%20Runs"), "rs%7C{split}%20Scored%7C{team}"),
        # first-pitch behaviour
        (("Swung%20at%201st%20Pitch", "Took%201st%20Pitch"), "tkswg%7C{split}%7C{team}"),
        # number of outs
        (("0", "1", "2"), "outs%7C{split}%20outs%7C{team}"),
        # inning splits: the split value already carries its "innng|" category prefix
        (tuple(f"innng%7C{nth}" for nth in ordinals), "{split}%20inning%7C{team}"),
        # opponent
        (("ANA", "ARI", "ATL", "BAL", "BOS", "CHC", "CHW", "CIN", "CLE", "COL",
          "DET", "HOU", "KCR", "LAD", "FLA", "MIL", "MIN", "NYM", "NYY", "OAK",
          "PHI", "PIT", "SDP", "SEA", "SFG", "STL", "TBD", "TEX", "TOR", "WSN"),
         "oppon%7C{split}%7C{team}"),
        # game conditions
        (("Day", "Night", "Grass", "Artif.%20Turf"), "stad%7C{split}%7C{team}"),
    )
    for accepted, template in rules:
        if split_type in accepted:
            return template.format(split=split_type, team=team_abv)
    return None


def pitcher_split_game_level(driver, split_type, team_abv, year, datatable_id, description):
    """Scrape one baseball-reference team pitching split table (URL variant ending in 'IP').

    Args:
        driver: Selenium WebDriver used to load the page.
        split_type: URL-encoded split identifier (e.g. '7', 'Home', 'Starter',
            an opponent code, 'Day').
        team_abv: Team abbreviation placed in the URL and in the 'team' column.
        year: Season placed in the URL and in the 'year' column.
        datatable_id: HTML id of the <table> element to extract.
        description: Label written to the 'description' column and used in logs.

    Returns:
        - an empty DataFrame when split_type is not supported,
        - None when the page load times out, the table is not found within
          30 seconds, or pandas cannot parse the table HTML,
        - otherwise the table without repeated header rows and without its
          last row, plus 'description', 'team' and 'year' columns.
    """
    # --- URL CONSTRUCTION ---
    params = _pitcher_split_game_level_params(split_type, team_abv)
    if params is None:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    url = (
        "https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params="
        f"{params}%7C{year}%7Cpitch%7CIP%7C"
    )

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None  # Let the main loop handle the retry/skip

    datatable_xpath = f"//table[@id='{datatable_id}']"

    # --- SCRAPING LOGIC ---
    try:
        # Wait up to 30 seconds for the table; until() hands back the located element
        table_element = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        print(f"[{team_abv} - {description}]: Table loaded successfully.")
    except Exception:
        # Deliberately broad: any failure while waiting is reported as "not found"
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    html_string = StringIO(table_html)

    try:
        tables = pd.read_html(html_string, flavor='lxml')
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    df = tables[0].copy()

    # --- CLEANING LOGIC ---
    # Normalise the column names: trim, then keep only letters, digits and underscores
    df.columns = df.columns.str.strip()
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    # Drop header rows that are repeated inside the table body
    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']

    # Remove last row (Totals); copy so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning)
    df = df.iloc[:-1].copy()

    df['description'] = description
    df['team'] = team_abv
    # Use the season that was actually requested, not the module-level YEAR
    df['year'] = year

    return df

# Master loop with driver reuse and retry logic

# Scrape every (team, split) combination with one shared browser session and
# accumulate the results in `pitching_splits_game_level`.
# Relies on module-level names defined in the configuration above:
# team_abbreviations, split_parameters, MAX_RETRIES, YEAR and DATATABLE_ID.
pitching_splits_game_level = pd.DataFrame()
driver = initialize_driver()

if driver is None:
    exit() # Stop if the driver failed to initialize

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:
            
            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Check if the driver is still alive (by checking its current URL);
                    # a dead session raises WebDriverException, handled below
                    driver.current_url 
                    
                    new_df = pitcher_split_game_level(
                        driver=driver,
                        split_type=split['type'], 
                        team_abv=team_abv, 
                        year=YEAR, 
                        datatable_id=DATATABLE_ID, 
                        description=split['desc']
                    )
                    
                    if new_df is not None and not new_df.empty:
                        pitching_splits_game_level = pd.concat([pitching_splits_game_level, new_df], ignore_index=True)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {len(pitching_splits_game_level)}")
                        break # Break the retry loop on success
                    
                    # If new_df is None (due to TimeoutException/Table not found), retry.
                    # An empty DataFrame (unsupported split type, or a table with no
                    # data rows left after cleaning) takes this retry path as well.
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2) # Short wait before retry

                except WebDriverException as e:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")
                    
                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass # Ignore errors on quitting a dead driver
                    
                    # Restart the driver; the restart consumes one of the MAX_RETRIES attempts
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")
                        
                    time.sleep(5) # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")
                    
                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break # Break retry loop on unexpected failure (this also skips the for-else below)

            # for-else: runs only when the retry loop finished without a `break`,
            # i.e. every attempt failed and the split was not appended
            else: 
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")
                
finally:
    # 3. CLEANUP: Quit the driver ONCE after all loops are finished
    # (this block also runs when the loops are aborted by an exception or SystemExit)
    print("-" * 30)
    print("All tasks finished. Quitting driver.")
    if 'driver' in locals() and driver:
        driver.quit() 
    
print("Scraping Complete.")
print(f"Final DataFrame Shape: {pitching_splits_game_level.shape}")

### Update the table statcast_pitches in PostgreSQL
#### This table shows the events pitch-by-pitch

In [None]:
pybaseball.cache.enable() # Enable caching for reliability

def update_statcast_data(engine: Engine):
    """
    Incrementally load Statcast pitch-by-pitch data into 'statcast_pitches'.

    Pulls Statcast data starting from the day AFTER the last record in the
    database, up to and including yesterday, and appends it to the table so
    only new events are downloaded and previously loaded rows are kept.

    Args:
        engine: SQLAlchemy engine connected to the target database.

    Returns:
        None. Progress and errors are reported with print(); failures during
        extraction or loading are caught and logged, not raised.
    """

    today = date.today()

    # --- STEP 1: FIND LAST DATE IN DB ---
    try:
        # Query the database to find the latest game_date currently stored
        with engine.connect() as connection:
            result = connection.execute(
                text("SELECT MAX(game_date) FROM statcast_pitches;")
            ).scalar()

        # If the table is empty, start from 400 days ago (initial load range)
        if result is None:
            print("Database is empty. Starting full initial load (400 days)...")
            last_date = today - timedelta(days=400)
        else:
            # MAX(game_date) can come back as datetime, date or text depending
            # on the column type; pd.to_datetime normalises all of them.
            last_date = pd.to_datetime(result).date()
            print(f"Latest game_date found in DB: {last_date.strftime('%Y-%m-%d')}")

    except Exception as e:
        # Typically the table does not exist yet. NOTE: if the table does exist
        # and already holds these days, this fallback window can append duplicates.
        print(f"❌ ERROR querying database for last date: {e}. Defaulting to last 5 days.")
        last_date = today - timedelta(days=5)


    # --- STEP 2: DEFINE NEW EXTRACTION RANGE ---
    start_date = last_date + timedelta(days=1)
    end_date = today - timedelta(days=1) # Pull up to yesterday, as today's games aren't finished

    start_dt_str = start_date.strftime('%Y-%m-%d')
    end_dt_str = end_date.strftime('%Y-%m-%d')

    # start_date == end_date means exactly one day (yesterday) is still missing,
    # so only a start strictly after the end counts as "nothing to do".
    if start_date > end_date:
        print(f"Data is up to date as of {end_dt_str}. No new extraction needed.")
        return

    print(f"Starting DAILY Statcast ETL: Pulling data from {start_dt_str} to {end_dt_str}")

    # --- STEP 3: EXTRACTION ---
    try:
        df = pyb.statcast(start_dt=start_dt_str, end_dt=end_dt_str)

        if df is None or df.empty:
            print("No new Statcast data retrieved for this date range. Exiting.")
            return

        #  --- STEP 4: TRANSFORMATION ---
        # Store game_date as a real datetime so MAX(game_date) is a date, not text
        df['game_date'] = pd.to_datetime(df['game_date'])

        # --- STEP 5: LOADING ---
        print(f"Loading {len(df)} new rows into 'statcast_pitches'...")

        df.to_sql(
            'statcast_pitches',
            engine,
            if_exists='append', # CRITICAL: append; 'replace' would drop every previously loaded row
            index=False,
            chunksize=5000
        )

        print(f"✅ Successfully appended {len(df)} new rows of Statcast data.")

    except Exception as e:
        print(f"❌ Statcast ETL Failed during extraction or loading: {e}")


# Execute the daily update
update_statcast_data(engine)

In [None]:


def extract_statcast_data(start_date, end_date):
    """Download pitch-by-pitch Statcast rows for a date range.

    Args:
        start_date: First day of the range, forwarded to pybaseball as start_dt.
        end_date: Last day of the range, forwarded to pybaseball as end_dt.

    Returns:
        The DataFrame returned by pyb.statcast, or a new empty DataFrame
        (after printing a warning) when nothing came back.
    """
    print(f"-> Pulling Statcast data from {start_date} to {end_date}...")

    # The whole extraction is delegated to pybaseball's statcast function
    pitches = pyb.statcast(start_dt=start_date, end_dt=end_date)

    got_rows = pitches is not None and not pitches.empty
    if got_rows:
        return pitches

    print("Warning: No Statcast data returned for this date range.")
    return pd.DataFrame()


# Example: pull a fixed three-day window and report how many rows came back.
# Running this cell performs the actual download through extract_statcast_data.
test_start_date = '2025-10-28'
test_end_date = '2025-10-30' 

daily_data = extract_statcast_data(test_start_date, test_end_date)
print(f"Successfully extracted {len(daily_data)} individual pitches/events.")

In [None]:
# game_pk: Integer. Game id provided by MLB Advanced Media.
# Get the Statcast data for a single game, identified by its game_pk (813024 here).
game_log = pyb.statcast_single_game(813024)

In [None]:
import statsapi

In [None]:
# Draft call kept for reference only; gamePk is not defined in this cell.
# game_boxscore = statsapi.boxscore_data(gamePk, timecode=None)

# Print each result that statsapi.lookup_team returns for the term 'dodgers'.
for team in statsapi.lookup_team('dodgers'):
    print(team)

In [None]:
# Fetch the 'gameTypes' metadata from statsapi.
# NOTE(review): presumably the list of valid game-type codes - confirm the return shape.
a = statsapi.meta('gameTypes')

In [None]:
# Call the statsapi 'team' endpoint directly for teamId 143.
# NOTE(review): the structure of the response is not shown here - inspect `b` before relying on it.
b = statsapi.get('team', {'teamId':143})

In [None]:
# Ask statsapi for the last game of the team with id 119.
# NOTE(review): presumably returns a game id (game_pk) - confirm before use.
last_game = statsapi.last_game(119)

In [None]:
# Load the two player sources: the Lahman People table and the Chadwick register.
players_lahman = pylahman.People()
player_chadwick = pyb.chadwick_register()

# The join key is the Retrosheet ID, so rows without one are removed on both
# sides first (key_retro in Chadwick, retroID in Lahman).
players_chadwick_clean = player_chadwick.loc[player_chadwick['key_retro'].notna()]
players_lahman_clean = players_lahman.loc[players_lahman['retroID'].notna()]

# Left join: every remaining Chadwick row is kept and the Lahman columns are
# attached wherever the Retrosheet IDs match.
players_df = players_chadwick_clean.merge(
    players_lahman_clean,
    how='left',
    left_on=['key_retro'],
    right_on=['retroID'],
)

# Columns that are not carried into the final table.
cols_to_remove = ['retroID', 'bbrefID', 'mlb_played_first', 'mlb_played_last']

# Source column -> final column name. The insertion order of this mapping is
# also the column order of the final table.
rename_map = {
    # Identifiers
    "key_mlbam":     "key_mlbam",
    "key_retro":     "key_retro",
    "key_bbref":     "key_bbref",
    "key_fangraphs": "key_fangraphs",
    "ID":            "id_lahman",
    "playerID":      "player_id_lahman",
    # Names
    "name_last":     "last_name_chadwick",
    "name_first":    "first_name_chadwick",
    "nameLast":      "last_name_lahman",
    "nameFirst":     "first_name_lahman",
    "nameGiven":     "first_and_second_name_lahman",
    # First and last game
    "debut":         "debut",
    "finalGame":     "final_game",
    # Physical / handedness info
    "weight":        "weight",
    "height":        "height",
    "bats":          "bats",
    "throws":        "throws",
    # Birth
    "birthYear":     "birth_year",
    "birthMonth":    "birth_month",
    "birthDay":      "birth_day",
    "birthCity":     "birth_city",
    "birthCountry":  "birth_country",
    "birthState":    "birth_state",
    # Death
    "deathYear":     "death_year",
    "deathMonth":    "death_month",
    "deathDay":      "death_day",
    "deathCountry":  "death_country",
    "deathState":    "death_state",
    "deathCity":     "death_city",
}
ordered_cols = list(rename_map.values())

# Drop, rename and reorder in a single pass.
players_df = (
    players_df
    .drop(columns=cols_to_remove)
    .rename(columns=rename_map)
    [ordered_cols]
)

# Numeric columns: missing values become the sentinel -1.
numeric_cols = players_df.select_dtypes(include=['number']).columns
players_df[numeric_cols] = players_df[numeric_cols].fillna(value=-1)

# Text columns: cast to plain object dtype first, then mark missing values as 'N/A'.
text_cols = [
    "key_retro", "key_bbref", "player_id_lahman",
    "last_name_chadwick", "first_name_chadwick",
    "last_name_lahman", "first_name_lahman", "first_and_second_name_lahman",
    "bats", "throws",
    "birth_city", "birth_country", "birth_state",
    "death_country", "death_state", "death_city",
]
for col in text_cols:
    players_df[col] = players_df[col].astype(object).fillna('N/A')

# Date columns: missing values become the placeholder date 1700-01-01.
date_cols = ["debut", "final_game"]
for col in date_cols:
    players_df[col] = players_df[col].fillna(pd.Timestamp('1700-01-01'))

# Final sanity check: no column should contain a null at this point.
if players_df.isnull().any().any():
    print("‚ö†Ô∏è WARNING - There are nulls in some columns in the dataframe.")
else:
    print("‚úÖ No nulls found.")

In [None]:
# Collect every text-typed column of team_franchises.
text_cols = team_franchises.select_dtypes(include=['object', 'string']).columns

for col in text_cols:
    # Cast to plain object dtype so 'N/A' is accepted as an ordinary string,
    # fill the real nulls, then fold stringified nulls ('nan', 'None', '<NA>')
    # into the same 'N/A' marker.
    team_franchises[col] = (
        team_franchises[col]
        .astype(object)
        .fillna('N/A')
        .replace(['nan', 'None', '<NA>'], 'N/A')
    )

# Report whether any null survived in the text columns.
null_count = team_franchises[text_cols].isnull().sum().sum()
if null_count != 0:
    print(f"‚ö†Ô∏è Warning: {null_count} nulls still remain in text columns.")
else:
    print("‚úÖ All string columns are clean. No nulls found!")

In [None]:
# Load the Lahman Teams table.
team_info = pylahman.Teams()

# Text columns: cast to plain object dtype, fill real nulls with 'N/A', then
# fold stringified nulls ('nan', 'None', '<NA>') into the same marker.
text_cols = team_info.select_dtypes(include=['object', 'string']).columns
for col in text_cols:
    team_info[col] = (
        team_info[col]
        .astype(object)
        .fillna('N/A')
        .replace(['nan', 'None', '<NA>'], 'N/A')
    )

# Numeric columns: missing values become the sentinel -1.
numeric_cols = team_info.select_dtypes(include=['number']).columns
team_info[numeric_cols] = team_info[numeric_cols].fillna(value=-1)

# Count whatever nulls are left in each group and report the combined total.
null_count_text    = team_info[text_cols].isna().sum().sum()
null_count_numeric = team_info[numeric_cols].isna().sum().sum()
total_nulls        = null_count_text + null_count_numeric

if total_nulls != 0:
    print(f"‚ö†Ô∏è Warning: {total_nulls} nulls still remain some columns.")
else:
    print("‚úÖ All columns are clean. No nulls found!")

### Create model

In [None]:
# update_dim_pitcher_archetypes - kept as a flat notebook cell; `engine` is the
# SQLAlchemy engine created earlier in the notebook.
#
# Groups pitchers into archetypes with K-Means on their average pitch
# characteristics and rebuilds the dim_pitcher_archetypes table. Every row is
# stamped with an 'updated_at' value recording when the cell ran.
#
# The table refresh (truncate + insert + primary-key creation) runs in ONE
# transaction, so a failed insert rolls the truncate back instead of leaving
# the dimension table empty.
from sqlalchemy import inspect as sa_inspect  # only this cell needs the inspector

# 1. Pull one row of averaged pitch characteristics per pitcher.
#    Only pitchers with more than 100 fully-populated pitches qualify.
query = """
SELECT 
    pitcher,
    AVG(release_speed) as avg_velo, 
    AVG(release_spin_rate) as avg_spin, -- The spin rate of a pitch measured in revolutions per minute (rpm) at the moment of release
    AVG(pfx_x) as avg_horiz_mvmt, -- Horizontal movement in feet from the catcher's perspective
    AVG(pfx_z) as avg_vert_mvmt -- Vertical movement from the catcher's perpsective.
FROM fact_statcast_pitches
WHERE release_speed IS NOT NULL 
    AND release_spin_rate IS NOT NULL
    AND pfx_x IS NOT NULL 
    AND pfx_z IS NOT NULL
GROUP BY pitcher
HAVING COUNT(*) > 100 
"""
pitcher_stats = pd.read_sql(query, engine)

# 2. Archetype labels. The number of clusters is derived from this mapping so
#    the two can never drift apart (a mismatch would yield NaN archetype names).
# NOTE(review): the labels are tied to raw K-Means cluster numbers; confirm they
# still describe the clusters whenever the underlying pitch data changes.
archetype_map = {
    0: "Power Flamethrower",
    1: "Sinker / Tail Specialist",
    2: "Breaking Ball Specialist",
    3: "Standard Control Righty",
    4: "Position Player / Eephus",
    5: "Deceptive Angle Specialist",
    6: "Low-Spin / Heavy Sinker",
    7: "Power Slider / Sweeper"
}
n_archetypes = len(archetype_map)

# Fail early with a clear message, before anything touches the database, when
# there are not enough qualifying pitchers to form every cluster.
if len(pitcher_stats) < n_archetypes:
    raise ValueError(
        f"Need at least {n_archetypes} qualifying pitchers to build archetypes, "
        f"got {len(pitcher_stats)}."
    )

# 3. Scale the features so each one contributes on a comparable scale.
scaler = StandardScaler()
features = ['avg_velo', 'avg_spin', 'avg_horiz_mvmt', 'avg_vert_mvmt']
scaled_data = scaler.fit_transform(pitcher_stats[features])

# 4. Cluster the pitchers and attach the id, label and run timestamp.
kmeans = KMeans(n_clusters=n_archetypes, random_state=42, n_init=10)
pitcher_stats['archetype_id'] = kmeans.fit_predict(scaled_data)
pitcher_stats['archetype_name'] = pitcher_stats['archetype_id'].map(archetype_map)
pitcher_stats['updated_at'] = datetime.now()

# 5. Refresh the table atomically.
pk_check = """
SELECT count(*) 
FROM information_schema.table_constraints 
WHERE table_name='dim_pitcher_archetypes' AND constraint_type='PRIMARY KEY';
"""
with engine.begin() as conn:
    # Check for the table explicitly instead of treating every TRUNCATE error
    # (lost connection, missing privilege, ...) as "table not found".
    table_exists = sa_inspect(conn).has_table('dim_pitcher_archetypes')
    if table_exists:
        conn.execute(text("TRUNCATE TABLE dim_pitcher_archetypes;"))
        print("Refreshing existing dim_pitcher_archetypes table...")
    else:
        print("Table 'dim_pitcher_archetypes' not found. Creating it for the first time...")

    # Insert through the same connection so the rows share the transaction
    # with the TRUNCATE above.
    pitcher_stats[['pitcher', 'archetype_id', 'archetype_name', 'updated_at']].to_sql(
        'dim_pitcher_archetypes',
        conn,
        if_exists='append',
        index=False
    )

    # 6. Make sure the primary key exists (only missing on first creation).
    has_pk = conn.execute(text(pk_check)).scalar()
    if has_pk == 0:
        conn.execute(text("ALTER TABLE dim_pitcher_archetypes ADD PRIMARY KEY (pitcher);"))
        print("‚úÖ Primary Key (pitcher) established.")

print(f"‚úÖ Successfully categorized {len(pitcher_stats)} pitchers.")

✅ dim_pitcher_archetypes updated! Primary Key was preserved.
