In [143]:
import os
from sqlalchemy import create_engine
import pybaseball as pyb
import pybaseball.cache # Ensure caching is imported
import pandas as pd
from dotenv import load_dotenv
import datetime
import time
from datetime import date, timedelta
from sqlalchemy.engine import Engine
from sqlalchemy import text


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from rapidfuzz import process
import re
import os
import numpy as np
from datetime import datetime
import re


In [2]:

# Load environment variables from the .env file into os.environ.
load_dotenv()

# Imported here so this cell stays runnable on its own; urllib is standard library.
from urllib.parse import quote

# Build the PostgreSQL connection string.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' inside the credentials cannot be read as URL delimiters.
# NOTE(review): this assumes the .env file stores the raw (not already
# percent-encoded) credentials — confirm before deploying.
# DB_PORT is optional; it falls back to the port that used to be hard-coded.
DB_URL = (
    f"postgresql://{quote(os.environ['DB_USER'], safe='')}:{quote(os.environ['DB_PASS'], safe='')}"
    f"@{os.environ['DB_HOST']}:{os.environ.get('DB_PORT', '5432')}/{os.environ['DB_NAME']}"
)

# Create the engine object used by every load step below.
# NOTE(review): per the SQLAlchemy docs create_engine() connects lazily, so the
# message printed below does not prove the database is reachable.
engine = create_engine(DB_URL)

print("Database connection established.")

Database connection established.


In [None]:
# Sample extractor only; the production ETL is expected to be more involved.
def extract_batting_data(year=2025):
    """Fetch one season of batting stats through pybaseball and relabel two columns.

    Args:
        year: Season passed straight to ``pyb.batting_stats``.

    Returns:
        The frame returned by pybaseball with ``ID`` renamed to ``player_id``
        and ``Tm`` renamed to ``team_id``. Columns that are absent are left
        alone (``DataFrame.rename`` ignores unknown keys).
    """
    # NOTE(review): confirm that 'ID' and 'Tm' really are the column names
    # pybaseball returns; if they are not, this rename silently does nothing.
    schema_names = {'ID': 'player_id', 'Tm': 'team_id'}
    season_stats = pyb.batting_stats(year)
    return season_stats.rename(columns=schema_names)

# Run the sample extractor for the 2025 season (performs a live pybaseball request).
current_year_batting = extract_batting_data(2025)

In [None]:
def extract_historical_batting(start_year=2024, end_year=2025):
    """Pull batting stats for a range of seasons via pybaseball and stack them.

    Seasons are requested one at a time so a failure for one year does not
    discard the others; a failed year is reported on stdout and skipped.

    Args:
        start_year: First season to request (inclusive).
        end_year: Last season to request (inclusive).

    Returns:
        One DataFrame holding the rows of every season that was pulled
        successfully, with a ``Season`` column set to the requested year and
        the renames listed in ``column_map`` applied. An empty DataFrame when
        nothing could be pulled (including when ``start_year > end_year``).
    """
    # Source column -> name used in the PostgreSQL schema.
    # NOTE(review): adjust to the exact schema; keys that are not present in
    # the pulled frame are ignored by DataFrame.rename.
    column_map = {
        'Name': 'player_name',
        'PlayerId': 'player_id',
        'G': 'games_played',
        'HR': 'home_runs',
        'SO': 'strikeouts',
    }

    season_frames = []

    for year in range(start_year, end_year + 1):
        print(f"-> Pulling FanGraphs data for {year}...")
        try:
            # 1. Extraction: one request per season.
            season_df = pyb.batting_stats(year)

            # 2. Transformation: tag the season and apply the schema names.
            # assign() returns a new frame, so the object handed back by
            # pybaseball is not modified in place.
            season_df = season_df.assign(Season=year).rename(columns=column_map)

            # 3. Collect the processed season.
            season_frames.append(season_df)

        except Exception as e:
            # Best effort: report the failed season and carry on with the rest.
            print(f"ERROR: Failed to pull data for {year}: {e}")

    # 4. Concatenate everything that was pulled.
    if season_frames:
        historical_df = pd.concat(season_frames, ignore_index=True)
        # Count the seasons that actually succeeded, not the ones requested.
        print(f"✅ Successfully combined data from {len(season_frames)} seasons into {len(historical_df)} total rows.")
        return historical_df

    return pd.DataFrame()  # Nothing was pulled

# Entry-point guard: the pull below runs only when __name__ is '__main__'.
# NOTE(review): inside a notebook kernel that is normally the case, so this
# cell performs the download when executed — confirm that is intended.
if __name__ == '__main__':
    full_batting_df = extract_historical_batting()
    
    # ... (Then proceed to your advanced metrics calculation and load_data function) ...

In [None]:


def extract_statcast_data(start_date, end_date):
    """Fetch pitch-level Statcast rows for a date window through pybaseball.

    Args:
        start_date: First day of the window, forwarded as ``start_dt``.
        end_date: Last day of the window, forwarded as ``end_dt``.

    Returns:
        The frame returned by ``pyb.statcast`` when it contains rows;
        otherwise a new empty DataFrame (a warning is printed first).
    """
    print(f"-> Pulling Statcast data from {start_date} to {end_date}...")

    pitches = pyb.statcast(start_dt=start_date, end_dt=end_date)

    # Hand the download back untouched whenever it holds at least one row.
    has_rows = pitches is not None and not pitches.empty
    if has_rows:
        return pitches

    print("Warning: No Statcast data returned for this date range.")
    return pd.DataFrame()


# Example: pull a three-day window and report how many rows came back.
test_start_date, test_end_date = '2025-10-28', '2025-10-30'

daily_data = extract_statcast_data(test_start_date, test_end_date)
print(f"Successfully extracted {len(daily_data)} individual pitches/events.")

In [None]:
# Enable pybaseball caching to speed up repeated queries.
pybaseball.cache.enable()

# `engine` is expected to exist already: it is created in the database setup
# cell near the top of the notebook, so it is not re-created here.
# engine = create_engine(DB_URL) 

def update_statcast_data(engine: Engine, days_to_keep: int = 400):
    """Pull a window of recent Statcast pitches and append it to ``statcast_pitches``.

    The window runs from ``today - days_to_keep`` through yesterday. Only the
    columns listed in ``schema_columns`` are loaded.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, it replaces this definition.
    NOTE(review): rows are appended without any de-duplication, so running
    this twice over overlapping windows stores the same pitches twice.

    Args:
        engine: SQLAlchemy engine (or any connection ``DataFrame.to_sql``
            accepts) pointing at the target database.
        days_to_keep: Size of the look-back window in days.

    Returns:
        The trimmed DataFrame that was written, or ``None`` when no data was
        retrieved or when extraction/loading failed (the error is printed).
    """
    today = date.today()

    # Window: from `days_to_keep` days ago up to yesterday.
    start_dt_str = (today - timedelta(days=days_to_keep)).strftime('%Y-%m-%d')
    end_dt_str = (today - timedelta(days=1)).strftime('%Y-%m-%d')

    print(f"Starting Statcast ETL: Pulling data from {start_dt_str} to {end_dt_str}")

    # Raw Statcast column -> column name in the SQL table. Statcast publishes
    # spin as 'release_spin_rate'; mapping a non-existent 'spin_rate' source
    # column would silently drop it from the load.
    column_map = {
        'batter': 'batter_id',
        'pitcher': 'pitcher_id',
        'release_spin_rate': 'spin_rate',
    }
    # Columns that exist in the statcast_pitches schema.
    schema_columns = [
        'game_date', 'game_pk', 'inning', 'batter_id', 'pitcher_id', 'stand',
        'p_throws', 'events', 'description', 'launch_speed', 'launch_angle',
        'bb_type', 'pitch_type', 'release_speed', 'spin_rate',
    ]

    try:
        # League-wide pitch data for the whole window.
        df = pyb.statcast(start_dt=start_dt_str, end_dt=end_dt_str)

        if df is None or df.empty:
            print("No new Statcast data retrieved. Exiting.")
            return

        # 1. Cleaning/Selection: rename to the schema names, skipping any
        # rename whose target already exists so no duplicate column appears.
        rename_map = {raw: target for raw, target in column_map.items() if target not in df.columns}
        df_clean = df.rename(columns=rename_map)

        # Keep only schema columns, in the order they arrive from Statcast.
        columns_to_keep = [col for col in df_clean.columns if col in schema_columns]
        final_df = df_clean[columns_to_keep]

        # 2. Loading: append to the existing table (created if missing).
        final_df.to_sql('statcast_pitches', engine, if_exists='append', index=False, chunksize=5000)

        return final_df
    except Exception as e:
        # Best effort: report the failure instead of stopping the notebook.
        print(f"❌ Statcast ETL Failed: {e}")

# Example run: relies on the `engine` created in the database setup cell and
# performs a live download plus a database write.
statcast_data_df = update_statcast_data(engine)

### This will update the table statcast_pitches in PostgreSQL

In [9]:
pybaseball.cache.enable() # Enable caching (also enabled in an earlier cell; repeated so this cell runs on its own)

def update_statcast_data(engine: Engine):
    """Incrementally load Statcast pitches into the ``statcast_pitches`` table.

    Looks up the newest ``game_date`` already stored, pulls everything from the
    following day through yesterday, and appends it.

    NOTE: this definition has the same name as the one in the previous cell
    and replaces it once this cell runs.
    NOTE(review): if the lookup query fails (for example because the table
    does not exist yet) the function falls back to the last 5 days; with an
    existing table that fallback can append rows that are already stored.

    Args:
        engine: SQLAlchemy engine for the target database.

    Returns:
        None. Progress and errors are reported on stdout.
    """

    today = date.today()

    # --- STEP 1: FIND LAST DATE IN DB ---
    try:
        with engine.connect() as connection:
            result = connection.execute(
                text("SELECT MAX(game_date) FROM statcast_pitches;")
            ).scalar()

        # An empty table yields NULL: start the initial 400-day load.
        if result is None:
            print("Database is empty. Starting full initial load (400 days)...")
            last_date = today - timedelta(days=400)
        else:
            # The driver may hand back a datetime, a date or a string depending
            # on the column type; normalise all of them to a plain date.
            last_date = pd.to_datetime(result).date()
            print(f"Latest game_date found in DB: {last_date.strftime('%Y-%m-%d')}")

    except Exception as e:
        print(f"❌ ERROR querying database for last date: {e}. Defaulting to last 5 days.")
        last_date = today - timedelta(days=5)

    # --- STEP 2: DEFINE NEW EXTRACTION RANGE ---
    start_date = last_date + timedelta(days=1)
    end_date = today - timedelta(days=1)  # Up to yesterday; today's games are not finished

    start_dt_str = start_date.strftime('%Y-%m-%d')
    end_dt_str = end_date.strftime('%Y-%m-%d')

    # Strictly greater: when start_date == end_date exactly one day (yesterday)
    # is still missing and must be pulled.
    if start_date > end_date:
        print(f"Data is up to date as of {end_dt_str}. No new extraction needed.")
        return

    print(f"Starting DAILY Statcast ETL: Pulling data from {start_dt_str} to {end_dt_str}")

    # --- STEP 3: EXTRACTION ---
    try:
        df = pyb.statcast(start_dt=start_dt_str, end_dt=end_dt_str)

        if df is None or df.empty:
            print("No new Statcast data retrieved for this date range. Exiting.")
            return

        # --- STEP 4: TRANSFORMATION ---
        # Store game_date as a real timestamp so MAX(game_date) compares dates.
        df['game_date'] = pd.to_datetime(df['game_date'])

        # --- STEP 5: LOADING ---
        print(f"Loading {len(df)} new rows into 'statcast_pitches'...")

        df.to_sql(
            'statcast_pitches',
            engine,
            if_exists='append',  # CRITICAL: keep existing rows; 'replace' would drop the history
            index=False,
            chunksize=5000
        )

        print(f"✅ Successfully appended {len(df)} new rows of Statcast data.")

    except Exception as e:
        print(f"❌ Statcast ETL Failed during extraction or loading: {e}")
        

# Execute the daily update (live download plus database write; uses the
# `engine` created in the database setup cell).
update_statcast_data(engine)

Database is empty. Starting full initial load (400 days)...
Starting DAILY Statcast ETL: Pulling data from 2024-11-08 to 2025-12-11
This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 254/254 [00:14<00:00, 17.16it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Loading 770795 new rows into 'statcast_pitches'...
✅ Successfully appended 770795 new rows of Statcast data.


### Update players table

In [7]:
def update_players(engine: Engine):
    """Rebuild the ``players`` table from pybaseball's Chadwick register.

    The table is replaced wholesale on every run. If the register comes back
    empty, the existing table is left untouched.

    Args:
        engine: SQLAlchemy engine (or any connection ``DataFrame.to_sql``
            accepts) pointing at the target database.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    try:
        # --- EXTRACTION ---
        df = pyb.chadwick_register()

        # Guard: replacing the table with an empty frame would wipe it.
        if df is None or df.empty:
            print("No player data retrieved. The 'players' table was left unchanged.")
            return

        # --- LOADING ---
        print(f"Loading {len(df)} new rows into 'players'...")

        df.to_sql(
            'players',
            engine,
            if_exists='replace',
            index=False,
            chunksize=5000
        )

        print(f"✅ Players successfully added {len(df)} new rows of players data.")

    except Exception as e:
        # Name the right pipeline: the old text said "Statcast".
        print(f"❌ Players ETL Failed during extraction or loading: {e}")
        

# Execute the players load (replaces the whole 'players' table; uses the
# `engine` created in the database setup cell).
update_players(engine)

Loading 25901 new rows into 'players'...
✅ Players successfully added 25901 new rows of players data.


### Get team stats

In [None]:
# Team-level tables for the 2025 season, one pybaseball request per table.
# The season is hard-coded; change all three calls together when it rolls over.
team_batting  = pyb.team_batting(2025)
team_pitching = pyb.team_pitching(2025)
team_fielding = pyb.team_fielding(2025)

### Get players stats

In [None]:
# Player-level tables for the 2025 season. qual=0 is passed to the three
# stats calls.
# NOTE(review): presumably qual=0 removes the qualification threshold so every
# player is returned — confirm against the pybaseball docs.
batting_stats  = pyb.batting_stats(2025,  qual=0)
pitching_stats = pyb.pitching_stats(2025, qual=0)
fielding_stats = pyb.fielding_stats(2025, qual=0)
running_stats  = pyb.statcast_sprint_speed(2025, 50) #players with at least 50 opportunities


### Get scores last n days

In [58]:
import pybaseball as pyb
import pandas as pd


def get_game_results_last_n_days(n_days=90):
    """Build a final-score table for every game played in the last ``n_days`` days.

    Pulls pitch-by-pitch Statcast data from ``today - n_days`` through
    yesterday, keeps the last recorded pitch of each game and reads the final
    score from that row.

    Args:
        n_days: Size of the look-back window in days (the window ends yesterday).

    Returns:
        DataFrame with one row per game and the columns ``Date``,
        ``away_team``, ``home_team``, ``Away_Final_Score``,
        ``Home_Final_Score``, ``Winner`` and ``Result``. An empty DataFrame
        when the download fails or the window contains no games.
    """
    # Use the `date` / `timedelta` names imported at the top of the notebook.
    # The import cell also rebinds `datetime` to the class
    # (`from datetime import datetime`), so `datetime.date.today()` is not
    # usable here.
    today = date.today()

    # 1. Window: from n_days ago through yesterday.
    end_date = today - timedelta(days=1)
    start_date = today - timedelta(days=n_days)

    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')

    print(f"Searching for all games played from {start_date_str} to {end_date_str}...")

    try:
        # 2. Pull all pitch-by-pitch data in that range.
        all_data_in_range = pyb.statcast(start_dt=start_date_str, end_dt=end_date_str)

    except Exception as e:
        print(f"Error retrieving Statcast data: {e}")
        return pd.DataFrame()

    if all_data_in_range is None or all_data_in_range.empty:
        print(f"No games found between {start_date_str} and {end_date_str}.")
        return pd.DataFrame()

    print(f"Successfully pulled {len(all_data_in_range)} pitch events.")

    # 3. Order each game chronologically by plate appearance and pitch.
    # 'inning_topbot' must not be a sort key: as text 'Bot' sorts before
    # 'Top', which would put the top half of the last inning at the end even
    # when the game finished in the bottom half.
    # NOTE(review): relies on at_bat_number increasing through the whole game,
    # as described in the Statcast CSV documentation.
    data_sorted = all_data_in_range.sort_values(
        by=['game_pk', 'at_bat_number', 'pitch_number'],
        ascending=True
    )

    # 4. The last row of each game is its final pitch.
    final_events = data_sorted.groupby('game_pk').tail(1).reset_index(drop=True)

    def _final_score(side):
        """Score after the final pitch; falls back to the pre-pitch score."""
        pre_pitch = final_events[f'{side}_score']
        post_col = f'post_{side}_score'
        if post_col in final_events.columns:
            # home_score/away_score hold the score before the pitch, so a run
            # scored on the last pitch (walk-off) only shows in post_*_score.
            return final_events[post_col].fillna(pre_pitch)
        return pre_pitch

    # 5. Assemble the scoreboard.
    scoreboard = pd.DataFrame({
        'Date': final_events['game_date'],
        'away_team': final_events['away_team'],
        'home_team': final_events['home_team'],
        'Away_Final_Score': _final_score('away'),
        'Home_Final_Score': _final_score('home'),
    })

    # 6. Determine the winner. As before, anything that is not a home win
    # (including a level score) is attributed to the away team.
    home_won = (scoreboard['Home_Final_Score'] > scoreboard['Away_Final_Score']).fillna(False)
    scoreboard['Winner'] = scoreboard['home_team'].where(home_won, scoreboard['away_team'])

    # The score in Result is always written home-away.
    scoreboard['Result'] = (
        scoreboard['Winner'] + ' wins ' +
        scoreboard['Home_Final_Score'].astype(str) + '-' +
        scoreboard['Away_Final_Score'].astype(str)
    )

    return scoreboard[['Date', 'away_team', 'home_team', 'Away_Final_Score', 'Home_Final_Score', 'Winner', 'Result']]

# --- EXECUTION ---
# Each call performs its own Statcast download for its window, so the six
# pulls overlap (the 90-day window contains all the shorter ones).
results_yesterday_df    = get_game_results_last_n_days(n_days= 1)
results_last_7_days_df  = get_game_results_last_n_days(n_days= 7)
results_last_15_days_df = get_game_results_last_n_days(n_days= 15)
results_last_30_days_df = get_game_results_last_n_days(n_days= 30)
results_last_60_days_df = get_game_results_last_n_days(n_days= 60)
results_last_90_days_df = get_game_results_last_n_days(n_days= 90)


# Optional preview of the 90-day result (left disabled):
# if not results_last_90_days_df.empty:
#     print(f"\n--- Game Results from the Last 90 Days ({len(results_last_90_days_df)} Games Found) ---")
#     print(results_last_90_days_df.tail(10)) # Print the last 10 games found
# else:
#     print("\nNo games were found in the last 90 days.")

Searching for all games played from 2025-12-11 to 2025-12-11...
This is a large query, it may take a moment to complete
Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-11 and 2025-12-11.
Searching for all games played from 2025-12-05 to 2025-12-11...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-05 and 2025-12-11.
Searching for all games played from 2025-11-27 to 2025-12-11...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-11-27 and 2025-12-11.
Searching for all games played from 2025-11-12 to 2025-12-11...
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████| 4/4 [00:00<00:00,  5.06it/s]

No games found between 2025-11-12 and 2025-12-11.
Searching for all games played from 2025-10-13 to 2025-12-11...
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████| 34/34 [00:02<00:00, 13.37it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 5290 pitch events.
Searching for all games played from 2025-09-13 to 2025-12-11...
This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 64/64 [00:04<00:00, 15.87it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 77978 pitch events.


# TEST

In [5]:
# Test: ad-hoc Statcast pull for 2025-10-01 through 2025-10-30.
# Note that this binds the generic notebook-level name `df`.
df = pyb.statcast(start_dt='2025-10-01', end_dt='2025-10-30')


This is a large query, it may take a moment to complete


100%|██████████| 30/30 [00:02<00:00, 10.73it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


In [None]:
# Opponent codes accepted for head-to-head ("oppon") league splits.
_BREF_OPPONENT_CODES = frozenset({
    'ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL',
    'DET', 'HOU', 'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
    'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN',
})

# Splits whose URL fragment does not depend on the split_type text itself.
_BREF_FIXED_SPLITS = {
    'first_batter_game': "leado%7C1st%20Batter%20G",
    'vs_power_pitcher': "power%7Cvs.%20Power",
    'vs_weak_pitcher': "power%7Cvs.%20Finesse",
    'vs_less_than_500_WP': "oppon%7CWP%20%3C%20.500",
    'vs_greater_or_equal_than_500_WP': "oppon%7CWP%20%3E%3D%20.500",
}

_BREF_LEAGUE_SPLIT_URL = "https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params="

# Column names assigned when the page header cannot be used as-is.
_SPLIT_FIXED_COLUMNS = ['Team', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
                        'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDP', 'HBP', 'SH',
                        'SF', 'IBB', 'ROE', 'BAbip', 'tOPS+', 'sOPS+']


def _league_split_params(split_type):
    """Return the '<group>%7C<split>' URL fragment for split_type, or None if unknown."""
    if split_type in ('LHP', 'RHP'):  # vs left/right-handed pitchers
        return f"plato%7Cvs%20{split_type}"
    if split_type in ('7', '14', '28'):  # last 7, 14 and 28 days
        return f"total%7CLast%20{split_type}%20days"
    if split_type in ('RH', 'LH'):  # vs right/left-handed starters
        return f"plato%7Cvs%20{split_type}%20Starter"
    if split_type in ('Home', 'Away'):  # home and away games
        return f"hmvis%7C{split_type}"
    if split_type in _BREF_FIXED_SPLITS:
        return _BREF_FIXED_SPLITS[split_type]
    if split_type in _BREF_OPPONENT_CODES:  # head-to-head vs one opponent
        return f"oppon%7C{split_type}"
    return None


def _tidy_split_table(raw, clean_mode):
    """Turn the one-column scraped text frame into a table with named columns."""
    use_page_header = clean_mode == 1

    # Every row is a single space-separated string in column 0.
    lines = raw[0].str.replace('Roe', '', regex=False)
    if not use_page_header:
        lines = lines.str.replace('GS', '', regex=False)

    # Drop the last row of the scraped table.
    lines = lines.iloc[:-1]

    if not use_page_header:
        # Drop rows containing 'Rk' (repeated header rows), but keep the first row.
        repeated_header = (lines.index > 0) & lines.str.contains('Rk', na=False)
        lines = lines[~repeated_header]

    table = lines.str.split(" ", n=30, expand=True)

    # Promote the first row to the header.
    table.columns = table.iloc[0]
    table = table[1:].reset_index(drop=True)

    if use_page_header:
        # Drop the trailing column, rename the last three, then drop the first column.
        table = table.iloc[:, :-1]
        # Assign a fresh Index instead of writing into `columns.values`:
        # Index objects are meant to be immutable.
        table.columns = pd.Index(
            list(table.columns[:-3]) + ["BAbip", "tOPS+", "sOPS+"],
            name=table.columns.name,
        )
        return table.iloc[:, 1:]

    # Drop the first column and the last two, then apply the fixed names.
    table = table.iloc[:, 1:].iloc[:, :-2]
    table.columns = _SPLIT_FIXED_COLUMNS
    return table


def teams_split(split_type, clean_mode, year=None):
    """Scrape one league-wide team batting split table from Baseball-Reference.

    Args:
        split_type: Which split to fetch. Accepted values:
            'LHP' / 'RHP'; '7' / '14' / '28' (last N days); 'RH' / 'LH'
            (starters); 'Home' / 'Away'; 'first_batter_game';
            'vs_power_pitcher'; 'vs_weak_pitcher'; 'vs_less_than_500_WP';
            'vs_greater_or_equal_than_500_WP'; or one of the opponent codes in
            ``_BREF_OPPONENT_CODES``.
        clean_mode: 1 keeps the header scraped from the page (only the last
            three columns are renamed); any other value also strips 'GS',
            removes repeated header rows and assigns the fixed 28-column
            header. NOTE(review): the call sites tag the non-1 mode as
            "GS empty", so it is presumably meant for splits whose GS column
            is blank — confirm.
        year: Season to request; defaults to the current calendar year.

    Returns:
        DataFrame with one row per table row; every value is a string.

    Raises:
        ValueError: if ``split_type`` is not recognised (raised before any
            browser is started).
        Exception: whatever Selenium raises when the table does not load; the
            error is printed first and the browser is always closed.
    """
    params = _league_split_params(split_type)
    if params is None:
        # Fail fast instead of waiting 60 s on a page that was never requested.
        raise ValueError(f"Unknown split_type: {split_type!r}")

    if year is None:
        year = date.today().year
    url = f"{_BREF_LEAGUE_SPLIT_URL}{params}%7CML%7C{year}%7Cbat%7CAB%7C"

    # Browser options. BROWSER_BINARY may override the default Brave location.
    options = Options()
    options.add_argument("--headless")
    options.binary_location = os.environ.get(
        "BROWSER_BINARY",
        "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
    )

    datatable_id = 'split1'
    datatable_xpath = f"//table[@id='{datatable_id}']"

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Explicitly wait for the table element to be present.
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            raise
        print(f"{datatable_id} ({split_type}) table loaded successfully.")

        # Extra fixed pause kept from the original flow ("wait for the load
        # of the page").
        time.sleep(10)

        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Always release the browser, including on errors.
        driver.quit()

    # One list per text line; cells are separated by tabs when present.
    table_data = [row.split("\t") for row in text_content.split("\n")]

    return _tidy_split_table(pd.DataFrame(table_data), clean_mode)

# Fetch the league-wide team split tables. Every call starts its own headless
# browser session and includes the fixed 10-second pause inside teams_split,
# so this cell takes several minutes.
# clean_mode=0 is used for the splits tagged "GS empty"; the others use 1.
team_vs_lhp             = teams_split(split_type= 'LHP',  clean_mode= 0) # GS empty
team_vs_rhp             = teams_split(split_type= 'RHP',  clean_mode= 0) # GS empty
team_vs_lh_starters     = teams_split(split_type= 'LH',   clean_mode= 1)
team_vs_rh_starters     = teams_split(split_type= 'RH',   clean_mode= 1)
team_last_seven_days    = teams_split(split_type= '7',    clean_mode= 1)
team_last_fourteen_days = teams_split(split_type= '14',   clean_mode= 1)
team_last_28_days       = teams_split(split_type= '28',   clean_mode= 1)
team_home_games         = teams_split(split_type= 'Home', clean_mode= 1)
team_away_games         = teams_split(split_type= 'Away', clean_mode= 1)
team_first_batter_game  = teams_split(split_type= 'first_batter_game', clean_mode= 0) # GS empty
team_vs_power_pitcher   = teams_split(split_type= 'vs_power_pitcher',  clean_mode= 0) # GS empty
team_vs_weak_pitcher    = teams_split(split_type= 'vs_weak_pitcher',   clean_mode= 0) # GS empty
team_vs_power_team      = teams_split(split_type= 'vs_greater_or_equal_than_500_WP', clean_mode= 1)
team_vs_weak_team       = teams_split(split_type= 'vs_less_than_500_WP',             clean_mode= 1)

# Direct matchups: one scrape per opponent code (30 browser sessions).
# For four clubs the code passed to teams_split differs from the variable
# name: 'ANA' -> team_laa, 'FLA' -> team_mia, 'OAK' -> team_oak (keyed 'ATH'
# below) and 'TBD' -> team_tbr.
team_laa = teams_split(split_type= 'ANA', clean_mode= 1)
team_ari = teams_split(split_type= 'ARI', clean_mode= 1)
team_atl = teams_split(split_type= 'ATL', clean_mode= 1)
team_bal = teams_split(split_type= 'BAL', clean_mode= 1)
team_bos = teams_split(split_type= 'BOS', clean_mode= 1)
team_chc = teams_split(split_type= 'CHC', clean_mode= 1)
team_chw = teams_split(split_type= 'CHW', clean_mode= 1)
team_cin = teams_split(split_type= 'CIN', clean_mode= 1)
team_cle = teams_split(split_type= 'CLE', clean_mode= 1)
team_col = teams_split(split_type= 'COL', clean_mode= 1)
team_det = teams_split(split_type= 'DET', clean_mode= 1)
team_hou = teams_split(split_type= 'HOU', clean_mode= 1)
team_kcr = teams_split(split_type= 'KCR', clean_mode= 1)
team_lad = teams_split(split_type= 'LAD', clean_mode= 1)
team_mia = teams_split(split_type= 'FLA', clean_mode= 1) 
team_mil = teams_split(split_type= 'MIL', clean_mode= 1)
team_min = teams_split(split_type= 'MIN', clean_mode= 1)
team_nym = teams_split(split_type= 'NYM', clean_mode= 1)
team_nyy = teams_split(split_type= 'NYY', clean_mode= 1)
team_oak = teams_split(split_type= 'OAK', clean_mode= 1)
team_phi = teams_split(split_type= 'PHI', clean_mode= 1)
team_pit = teams_split(split_type= 'PIT', clean_mode= 1)
team_sdp = teams_split(split_type= 'SDP', clean_mode= 1)
team_sea = teams_split(split_type= 'SEA', clean_mode= 1)
team_sfg = teams_split(split_type= 'SFG', clean_mode= 1)
team_stl = teams_split(split_type= 'STL', clean_mode= 1)
team_tbr = teams_split(split_type= 'TBD', clean_mode= 1)
team_tex = teams_split(split_type= 'TEX', clean_mode= 1)
team_tor = teams_split(split_type= 'TOR', clean_mode= 1)
team_wsn = teams_split(split_type= 'WSN', clean_mode= 1)

# Map an opponent identifier to its head-to-head frame. The keys are the IDs
# written into the 'ID' column below; several differ from the codes used for
# scraping (e.g. 'AZ', 'KC', 'ATH', 'SD', 'SF', 'TB', 'WSH').
# NOTE(review): presumably these keys match the team abbreviations used by the
# Statcast data elsewhere in the notebook — confirm.
dic_team = {
    'LAA': team_laa,
    'AZ': team_ari,
    'ATL': team_atl,
    'BAL': team_bal,
    'BOS': team_bos,
    'CHC': team_chc,
    'CHW': team_chw,
    'CIN': team_cin,
    'CLE': team_cle,
    'COL': team_col,
    'DET': team_det,
    'HOU': team_hou,
    'KC': team_kcr,
    'LAD': team_lad,
    'MIA': team_mia,
    'MIL': team_mil,
    'MIN': team_min,
    'NYM': team_nym,
    'NYY': team_nyy,
    'ATH': team_oak,
    'PHI': team_phi,
    'PIT': team_pit,
    'SD': team_sdp,
    'SEA': team_sea,
    'SF': team_sfg,
    'STL': team_stl,
    'TB': team_tbr,
    'TEX': team_tex,
    'TOR': team_tor,
    'WSH': team_wsn   
    }

# Add an 'ID' column holding the dictionary key to every frame. This modifies
# the team_* frames in place and rebinds the notebook-level names `key` and
# `df` (after the loop `df` is the last frame in the dictionary).
for key, df in dic_team.items():
    df['ID'] = key  # Assign the dictionary key as the ID

# Stack all opponent frames into one table with a fresh 0..n-1 index.
direct_matches = pd.concat(dic_team.values(), ignore_index=True)  # Resets index

# Name -> frame lookup for the league-wide split tables scraped above; each key
# repeats the name of the variable it points to.
dic_splits = {
    'team_vs_lhp'        :team_vs_lhp,        
    'team_vs_rhp'        :team_vs_rhp,
    'team_vs_lh_starters':team_vs_lh_starters,
    'team_vs_rh_starters':team_vs_rh_starters,
    'team_last_seven_days':team_last_seven_days,
    'team_last_fourteen_days':team_last_fourteen_days,
    'team_last_28_days':team_last_28_days,
    'team_home_games':team_home_games,
    'team_away_games':team_away_games,
    'team_first_batter_game':team_first_batter_game,
    'team_vs_power_pitcher':team_vs_power_pitcher,
    'team_vs_weak_pitcher':team_vs_weak_pitcher,
    'team_vs_power_team':team_vs_power_team,
    'team_vs_weak_team':team_vs_weak_team      
}



In [None]:
# Column names applied to the parsed table, keyed by clean_mode.
# Mode 2 is the full set; mode 1 omits 'GS'; mode 3 omits 'GS', 'R', 'SB' and 'CS'.
# NOTE(review): 'GDBP' looks like a typo for 'GDP'; kept as-is because code outside
# this block may already key on the existing column name.
_PLAYER_SPLIT_HEADERS = {
    1: [
        'Name', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
        'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDBP',
        'HBP', 'SH', 'SF', 'IBB', 'ROE', 'BABIP', 'tOPS+', 'sOPS+'
    ],
    2: [
        'Name', 'G', 'GS', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI',
        'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDBP',
        'HBP', 'SH', 'SF', 'IBB', 'ROE', 'BABIP', 'tOPS+', 'sOPS+'
    ],
    3: [
        'Name', 'G', 'PA', 'AB', 'H', '2B', '3B', 'HR', 'RBI',
        'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDBP',
        'HBP', 'SH', 'SF', 'IBB', 'ROE', 'BABIP', 'tOPS+', 'sOPS+'
    ],
}

# (accepted split_type values, URL fragment "<category>%7C<label>" with {} = split_type)
_PLAYER_SPLIT_PARAMS = (
    (('LHP', 'RHP'), "plato%7Cvs%20{}"),                      # vs LHP / RHP pitchers
    (('7', '14', '28'), "total%7CLast%20{}%20days"),          # last 7, 14 and 28 days
    (('RH', 'LH'), "plato%7Cvs%20{}%20Starter"),              # vs RH / LH starters
    (('Home', 'Away'), "hmvis%7C{}"),                         # home and away games
    (('1st', '2nd'), "half%7C{}%20Half"),                     # first / second half
    (('April%2FMarch', 'May', 'June', 'July', 'August', 'Sept%2FOct'), "month%7C{}"),
    (('C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH', 'PH'), "defp%7Cas%20{}"),
    (('1st%20Batter',), "leado%7C{}%20G"),                    # first batter of the game
)

# Splits a row into "<rank> <name>" and the stats: the split point is the first
# whitespace character that is immediately followed by a digit.
_NAME_STATS_PATTERN = r"^(.*?)\s(\d+.*$)"


def _player_split_url(split_type, team_abv, year):
    """Return the split_stats_team.cgi URL for one player split, or None if unsupported."""
    for accepted_values, fragment in _PLAYER_SPLIT_PARAMS:
        if split_type in accepted_values:
            return (
                "https://www.baseball-reference.com/tools/split_stats_team.cgi"
                f"?full=1&params={fragment.format(split_type)}"
                f"%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
            )
    return None


def _fetch_table_text(url, datatable_id, split_type, team_abv):
    """Load `url` in a headless browser and return the rendered text of the table, or None on failure."""
    options = Options()
    options.add_argument("--headless")
    # NOTE(review): machine-specific browser location; adjust when running elsewhere.
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
    driver = webdriver.Chrome(options=options)
    datatable_xpath = f"//table[@id='{datatable_id}']"
    try:
        # Navigation happens inside the try so the browser is closed even if it fails.
        driver.get(url)
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        print(f"URL: {url}")
        print(f"Table {datatable_id} ({split_type} vs {team_abv}) loaded successfully.")
        return driver.find_element(By.XPATH, datatable_xpath).text
    except Exception as e:
        print(f"Error: Table {datatable_id} did not load. Details: {e}")
        return None
    finally:
        driver.quit()  # single place where the driver is closed, on every path


def _parse_player_split_text(text_content, headers):
    """Turn the rendered table text into a DataFrame whose columns are `headers`."""
    # Only the text before the first tab of each line is parsed (same as taking
    # column 0 of a tab-split frame).
    lines = [line.split("\t")[0] for line in text_content.split("\n")]

    # The first line is the header row and the last line is discarded as well.
    body = pd.Series(lines[1:-1], dtype="object")
    if body.empty:
        return pd.DataFrame()

    parts = body.str.extract(_NAME_STATS_PATTERN, expand=True)
    parts.columns = ['Rk_Name_Combined', 'Stats_Combined']

    # The row(s) with the most separators define the expected layout; anything
    # shorter (or with no stats at all, which yields NaN here) is discarded.
    space_counts = parts['Stats_Combined'].str.count(' ')
    max_spaces = space_counts.max()
    print(f"Maximum spaces (expected separators): {max_spaces}")

    conforming = space_counts == max_spaces  # NaN compares False, so such rows are dropped
    num_dropped = int((~conforming).sum())
    if num_dropped == 0:
        print("All rows have a consistent number of spaces. Proceeding with split.")
    else:
        parts = parts[conforming]
        print(f"Removed {num_dropped} row(s) because they did not have the expected number of spaces ({max_spaces}).")
    parts = parts.reset_index(drop=True)
    if parts.empty:
        return pd.DataFrame()

    stats_split = parts['Stats_Combined'].str.split(expand=True)
    # Drop the leading rank token and keep the rest as the player name.
    names = parts['Rk_Name_Combined'].str.split(' ', n=1).str[1].rename('Name')

    final_df = pd.concat([names, stats_split], axis=1)
    if final_df.shape[1] != len(headers):
        raise ValueError(
            f"Parsed {final_df.shape[1]} columns but clean_mode expects {len(headers)}; "
            "check that clean_mode matches the layout of this split's table."
        )
    final_df.columns = list(headers)
    return final_df


def players_split(split_type, team_abv, clean_mode):
    """Scrape one player-level batting split for a team from Baseball-Reference.

    Args:
        split_type: Split selector as used in the URL, e.g. 'LHP', '7', 'Home',
            'April%2FMarch', 'SS' or '1st%20Batter'. League-wide selectors
            (opponent codes, 'vs_power_pitcher', ...) are not player splits and
            are reported as unsupported.
        team_abv: Team abbreviation placed in the URL (e.g. 'NYY').
        clean_mode: 1, 2 or 3; selects which column layout is applied to the
            parsed table (see _PLAYER_SPLIT_HEADERS).

    Returns:
        A DataFrame with one row per player and string-valued stat columns, or an
        empty DataFrame when the split is unsupported, the table did not load, or
        no conforming rows were found.

    Raises:
        ValueError: if clean_mode is not 1, 2 or 3, or if the number of parsed
            columns does not match the layout selected by clean_mode.
    """
    # Validate the arguments before a browser is launched.
    if clean_mode not in _PLAYER_SPLIT_HEADERS:
        raise ValueError(f"clean_mode must be 1, 2 or 3, got {clean_mode!r}")

    year = datetime.now().year
    url = _player_split_url(split_type, team_abv, year)
    if url is None:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()  # Return empty DataFrame on failure

    text_content = _fetch_table_text(url, 'team_split1', split_type, team_abv)
    if text_content is None:
        return pd.DataFrame()

    return _parse_player_split_text(text_content, _PLAYER_SPLIT_HEADERS[clean_mode])

# --- EXECUTION ---
# Teams to scrape. Kept to a single club while testing; a wider test run could use
# e.g. ['NYY', 'BOS', 'TBR', 'TOR', 'BAL'].
team_abbreviations = ['NYY']

# One accumulator list per header layout (clean_mode 1, 2 and 3); each collects the
# frames scraped with that clean_mode so they can be concatenated afterwards.
clean_mode_one, clean_mode_two, clean_mode_three = [], [], []

def create_and_append_table(df_source, split_type, team_abv, df_clean_mode):
    """Label a scraped frame with its split and team, then collect it.

    Empty frames are ignored. Otherwise the frame is modified in place
    (columns 'Split_Type' and 'Team' are set on every row) and the same object
    is appended to the accumulator list.

    Args:
        df_source (pd.DataFrame): Frame to label; skipped when it is empty.
        split_type (str): Value written to the 'Split_Type' column.
        team_abv (str): Value written to the 'Team' column.
        df_clean_mode (list): List that receives the labelled frame.
    """
    if df_source.empty:
        return

    for column_name, column_value in (('Split_Type', split_type), ('Team', team_abv)):
        df_source[column_name] = column_value
    df_clean_mode.append(df_source)

# Every player split to scrape, in scrape order:
# (split_type passed to players_split, label stored in 'Split_Type', clean_mode).
PLAYER_SPLIT_JOBS = [
    # PLATOON SPLITS
    ('LHP', 'vs_LHP', 1),
    ('RHP', 'vs_RHP', 1),
    ('LH', 'vs_LH_starters', 2),
    ('RH', 'vs_RH_starters', 2),
    # LAST N DAYS
    ('7', 'last_seven_days', 2),
    ('14', 'last_fourteen_days', 2),
    ('28', 'last_twenty_eight_days', 2),
    # HOME OR AWAY
    ('Home', 'home', 2),
    ('Away', 'away', 2),
    # FIRST AND SECOND HALF
    ('1st', 'first_half', 2),
    ('2nd', 'second_half', 2),
    # MONTH
    ('April%2FMarch', 'march_april', 2),
    ('May', 'may', 2),
    ('June', 'june', 2),
    ('July', 'july', 2),
    ('August', 'august', 2),
    ('Sept%2FOct', 'sept_oct', 2),
    # DEFENSIVE POSITIONS
    ('C', 'catcher', 1),
    ('1B', 'first_base', 1),
    ('2B', 'second_base', 1),
    ('3B', 'third_base', 1),
    ('SS', 'shortstop', 1),
    ('LF', 'left_field', 1),
    ('CF', 'center_field', 1),
    ('RF', 'right_field', 1),
    ('DH', 'designated_hitter', 1),
    ('PH', 'pinch_hitter', 1),
    # LEADING OFF INNING
    ('1st%20Batter', 'first_batter_of_game', 3),
]

# Frames are collected per clean_mode because each mode has its own column layout.
accumulator_by_clean_mode = {
    1: clean_mode_one,
    2: clean_mode_two,
    3: clean_mode_three,
}


def _concat_or_empty(frames):
    """Concatenate the collected frames; pd.concat raises ValueError on an empty list, so return an empty frame instead."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


for team_abv in team_abbreviations:
    for job_split_type, job_label, job_clean_mode in PLAYER_SPLIT_JOBS:
        scraped = players_split(split_type=job_split_type, team_abv=team_abv, clean_mode=job_clean_mode)
        # Empty results (failed or unsupported scrapes) are skipped by the helper.
        create_and_append_table(scraped, job_label, team_abv, accumulator_by_clean_mode[job_clean_mode])

final_clean_mode_one   = _concat_or_empty(clean_mode_one)
final_clean_mode_two   = _concat_or_empty(clean_mode_two)
final_clean_mode_three = _concat_or_empty(clean_mode_three)

URL: https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C1st%20Batter%20G%7CNYY%7C2025%7Cbat%7CAB%7C
Table team_split1 (1st%20Batter vs NYY) loaded successfully.


In [None]:

# --- 1. SETUP ---
# Step-by-step walk-through of the clean_mode 3 path of players_split for a single
# split/team, so intermediate state can be inspected between cells.
options = Options()
options.add_argument("--headless")
# NOTE(review): machine-specific browser location; adjust when running elsewhere.
options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
year = datetime.now().year
datatable_id = 'team_split1'
clean_mode = 3

split_type = '1st%20Batter'
team_abv   = 'NYY'

# This cell only knows the layout used by clean_mode 3; fail early instead of
# leaving all_headers undefined for the later cells.
if clean_mode != 3:
    raise ValueError("This walk-through cell only covers clean_mode 3.")

# --- 2. URL CONSTRUCTION ---
url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%20G%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"

# --- 3. WAIT AND EXTRACT ---
datatable_xpath = f"//table[@id='{datatable_id}']"
driver = webdriver.Chrome(options=options)
try:
    # Navigation is inside the try so the browser is closed even if the page fails to load.
    driver.get(url)
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.XPATH, datatable_xpath))
    )
    print(f"URL: {url}")
    print(f"Table {datatable_id} ({split_type} vs {team_abv}) loaded successfully.")

    table_element = driver.find_element(By.XPATH, datatable_xpath)
    text_content = table_element.text

finally:
    driver.quit() # Close the driver in a finally block to ensure it closes

# --- 4. DATA PROCESSING ---
# One frame row per rendered text line; only column 0 is used below.
rows = text_content.split("\n")
table_data = [row.split("\t") for row in rows]
df = pd.DataFrame(table_data)

# Column names for the clean_mode 3 layout (no 'GS', 'R', 'SB' or 'CS' columns).
# The header row itself is not edited: it is discarded in the next cell, so its
# text never reaches the final table.
all_headers = [
'Name', 'G', 'PA', 'AB', 'H', '2B', '3B', 'HR', 'RBI',
'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDBP',
'HBP', 'SH', 'SF', 'IBB', 'ROE', 'BABIP', 'tOPS+', 'sOPS+'
]

# Remove last row; copy so the column assignment below does not write to a slice.
df = df.iloc[:-1].copy()

# Split each row into exactly two parts: "<rank> <name>" and the stats.
# Regex pattern breakdown:
# ^(.*?)      -> Group 1: everything from the start, matched non-greedily.
# \s          -> The split point: the first whitespace that is followed by a digit.
# (\d+.*$)    -> Group 2: from that first number to the end of the line.
regex_pattern = r"^(.*?)\s(\d+.*$)"

# Extract the two parts into new columns:
df[['Rk_Name_Combined', 'Stats_Combined']] = df[0].str.extract(regex_pattern, expand=True)


URL: https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C1st%20Batter%20G%7CNYY%7C2025%7Cbat%7CAB%7C
Table team_split1 (1st%20Batter vs NYY) loaded successfully.


In [176]:

# Discard the first row of df and keep everything after it.
# Caution: df is rebuilt from itself, so every re-run of this cell removes one more row.
df = df.iloc[1:, :]


In [None]:

# NEW LOGIC: VERIFY AND DROP NON-CONFORMING STATS ROWS ---

# 1. Count the spaces (separators) in each row's stats string.
#    Rows where no stats were extracted have NaN here.
space_counts = df['Stats_Combined'].str.count(' ')

# 2. The maximum number of spaces defines the expected structure.
max_spaces = space_counts.max()

print(f"Maximum spaces (expected separators): {max_spaces}")

# 3. Rows to drop: fewer separators than the maximum, or no stats at all.
#    (NaN < max is False, so missing stats must be tested explicitly.)
indices_to_drop = space_counts[space_counts.isna() | (space_counts < max_spaces)].index

# 4. Check if any rows need to be dropped
if indices_to_drop.empty:
    print("All rows have a consistent number of spaces. Proceeding with split.")
else:
    num_dropped = len(indices_to_drop)

    # 5. Drop the rows that do not match the expected structure
    df = df.drop(index=indices_to_drop)

    # Reset the index after dropping rows
    df = df.reset_index(drop=True)

    print(f"Removed {num_dropped} row(s) because they did not have the expected number of spaces ({max_spaces}).")

# --- END OF NEW LOGIC ---

# Now the 'Stats_Combined' column in 'df' only contains rows that have the
# expected number of data points for the final split.

# Continue with the rest of your splitting process:
# stats_split = df['Stats_Combined'].str.split(expand=True)
# ... and the rest of your cleanup/concat logic

Maximum spaces (expected separators): 23


In [177]:

# 3. Break each stats string into one column per whitespace-separated value.
# The name was already separated out, so only the stat tokens remain in this column.
stats_split = df.loc[:, 'Stats_Combined'].str.split(pat=None, expand=True)


In [None]:

# 4. Combine the Name/Rank column with the split statistics (rows align on df's index).
final_df = pd.concat([
    df['Rk_Name_Combined'], # The Name/Rank column
    stats_split             # All the statistical columns
], axis=1)

# No row is removed here: the header row was already dropped from df before
# stats_split was built, so dropping the first row again would discard the first player.

# --- Final Cleanup ---
# Separate the Rk from the Name in the first column; only the Name is kept.
final_df[['Rk', 'Name']] = final_df['Rk_Name_Combined'].str.split(' ', n=1, expand=True)
final_df = final_df.drop(columns=['Rk_Name_Combined', 'Rk'])

# Move the last column (Name) to be the first column
final_df = final_df[['Name'] + [col for col in final_df.columns if col != 'Name']]

# 5. Apply the headers (raises if the column count does not match all_headers)
final_df.columns = all_headers


