### Import libraries

In [2]:
import os
from sqlalchemy import create_engine
import pybaseball as pyb
import pybaseball.cache # Ensure caching is imported
import pandas as pd
from dotenv import load_dotenv
import time
from datetime import date, timedelta
from sqlalchemy.engine import Engine
from sqlalchemy import text
import datetime
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException
from rapidfuzz import process
import re
import numpy as np
from io import StringIO
import pylahman
import statsapi

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

### Load environment and connect to the DB

In [3]:

# Load environment variables from .env file
load_dotenv()

# Build the PostgreSQL connection string
DB_URL = f"postgresql://{os.environ['DB_USER']}:{os.environ['DB_PASS']}@{os.environ['DB_HOST']}:5432/{os.environ['DB_NAME']}"

# Create the engine object for connecting
engine = create_engine(DB_URL)

print("Database connection established.")

Database connection established.


### Create dim_players

In [None]:
def update_players(engine: Engine):   
    try:
        players_lahman = pylahman.People()
        player_chadwick = pyb.chadwick_register()

        # Join lahman and chadwick on key identifiers and bring all the columns from lahman
        # Ignore if key_bbref is empty in chadwick
        players_chadwick_clean = player_chadwick[player_chadwick['key_retro'].notna()]
        players_lahman_clean   = players_lahman[players_lahman['retroID'].notna()]

        players_df = pd.merge(
            players_chadwick_clean,
            players_lahman_clean,
            left_on=['key_retro'],
            right_on=['retroID'],
            how='left',
        )

        # Remove unnecesary columns and drop them from the dataframe
        cols_to_remove = ['retroID', 'bbrefID', 'mlb_played_first', 'mlb_played_last']
        players_df = players_df.drop(columns= cols_to_remove)

        # Rename the fields
        rename_map = {
            # IDs
            "key_mlbam":     "key_mlbam",
            "key_retro":     "key_retro",
            "key_bbref":     "key_bbref",
            "key_fangraphs": "key_fangraphs",
            "ID":            "id_lahman",
            "playerID":      "player_id_lahman",

            # Names
            "name_last":     "last_name_chadwick",
            "name_first":    "first_name_chadwick",
            "nameLast":      "last_name_lahman",
            "nameFirst":     "first_name_lahman",
            "nameGiven":     "first_and_second_name_lahman",

            # Debut/Final game
            "debut":         "debut",
            "finalGame":     "final_game",

            # Info
            "weight":        "weight",
            "height":        "height",
            "bats":          "bats",
            "throws":        "throws",

            # Birth/Death
            "birthYear":     "birth_year",
            "birthMonth":    "birth_month",
            "birthDay":      "birth_day",
            "birthCity":     "birth_city",
            "birthCountry":  "birth_country",
            "birthState":    "birth_state",
            "deathYear":     "death_year",
            "deathMonth":    "death_month",
            "deathDay":      "death_day",
            "deathCountry":  "death_country",
            "deathState":    "death_state",
            "deathCity":     "death_city",
        }

        # Apply the rename
        players_df = players_df.rename(columns= rename_map)

        # Order the new columns
        ordered_cols = [
            "key_mlbam",
            "key_retro",
            "key_bbref",
            "key_fangraphs",
            "id_lahman",
            "player_id_lahman",
            "last_name_chadwick",
            "first_name_chadwick",
            "last_name_lahman",
            "first_name_lahman",
            "first_and_second_name_lahman",
            "debut",
            "final_game",
            "weight",
            "height",
            "bats",
            "throws",
            "birth_year",
            "birth_month",
            "birth_day",
            "birth_city",
            "birth_country",
            "birth_state",
            "death_year",
            "death_month",
            "death_day",
            "death_country",
            "death_state",
            "death_city"
        ]

        # Apply the order
        players_df = players_df[ordered_cols]

        # This selects only columns with numbers and fills their nulls with -1
        numeric_cols = players_df.select_dtypes(include=['number']).columns
        players_df[numeric_cols] = players_df[numeric_cols].fillna(-1)

        # Replace nulls in the text columns
        text_cols = [
            "key_retro",
            "key_bbref",
            "player_id_lahman",
            "last_name_chadwick",
            "first_name_chadwick",
            "last_name_lahman",
            "first_name_lahman",
            "first_and_second_name_lahman",
            "bats",
            "throws",
            "birth_city",
            "birth_country",
            "birth_state",
            "death_country",
            "death_state",
            "death_city"
        ]

        # Convert to a standard object type first and then fill the nulls with N/A
        for col in text_cols:
            players_df[col] = players_df[col].astype(object).fillna('N/A')
            

        # List the date columns
        date_cols = [
            "debut",
            "final_game"
        ]
        # Fill null dates with January 1st, 1700
        for col in date_cols:
            players_df[col] = players_df[col].fillna(pd.Timestamp('1700-01-01'))

        # Check for nulls in my table - there shouldn't be any
        if (players_df.isnull().sum() == 0).all():
            print("‚úÖ No nulls found.")
        else:
            print("‚ö†Ô∏è WARNING - There are nulls in some columns in the dataframe.")

        # # --- STEP 5: LOADING ---
        print(f"Loading {len(players_df)} new rows into 'players'...")
        
        players_df.to_sql(
            'players', 
            engine, 
            if_exists='replace',
            index=False, 
            chunksize=5000
        )
        
        print(f"‚úÖ Players successfully added {len(players_df)} new rows of players data.")

    except Exception as e:
        print(f"‚ùå ETL Failed during extraction or loading: {e}")
        

# Execute the players function
update_players(engine)

### Create dim_franchises

In [None]:
def update_team_franchises(engine: Engine):
    try:
        # Import the franchises
        #? Note: As of 2025-12-18 there is only data up to the 2024 season
        team_franchises = pylahman.TeamsFranchises()
        
        # Data cleaning
        # Identify all text columns
        text_cols = team_franchises.select_dtypes(include=['object', 'string']).columns

        # Convert to object first, then fill (since the columns are literal strings)
        for col in text_cols:
            # Converting to object allows 'N/A' to be treated as a normal string
            team_franchises[col] = team_franchises[col].astype(object).fillna('N/A')
            
            # Just in case some were literal 'nan' strings:
            team_franchises[col] = team_franchises[col].replace(['nan', 'None', '<NA>'], 'N/A')

        # Final verification
        null_count = team_franchises[text_cols].isnull().sum().sum()
        if null_count == 0:
            print("‚úÖ All string columns are clean. No nulls found!")
        else:
            print(f"‚ö†Ô∏è Warning: {null_count} nulls still remain in text columns.")
            
        
        # Loading
        print(f"Loading {len(team_franchises)} new rows into 'team_franchises'...")
        
        team_franchises.to_sql(
            'team_franchises', 
            engine, 
            if_exists='replace',
            index=False, 
            chunksize=5000
        )
        
        print(f"‚úÖ Team franchises successfully added {len(team_franchises)} new rows of data.")
    
    except Exception as e:
        print(f"‚ùå ETL Failed during extraction or loading: {e}")

        
# Apply the function
update_team_franchises(engine)

### Teams info ***NOT IN USE***

In [None]:
# def update_team_info(engine: Engine):
#     try:
#         team_info = pylahman.Teams()

#         # Identify all text columns
#         text_cols = team_info.select_dtypes(include=['object', 'string']).columns

#         # Convert to object first, then fill with N/A
#         for col in text_cols:
#             # Converting to object allows 'N/A' to be treated as a normal string
#             team_info[col] = team_info[col].astype(object).fillna('N/A')
            
#             # Just in case some were literal 'nan' strings:
#             team_info[col] = team_info[col].replace(['nan', 'None', '<NA>'], 'N/A')

#         # This selects only columns with numbers and fills their nulls with -1
#         numeric_cols = team_info.select_dtypes(include=['number']).columns
#         team_info[numeric_cols] = team_info[numeric_cols].fillna(-1)

#         # Final verification
#         null_count_text    = team_info[text_cols].isnull().sum().sum()
#         null_count_numeric = team_info[numeric_cols].isnull().sum().sum()
#         total_nulls        = null_count_text + null_count_numeric

#         if total_nulls == 0:
#             print("‚úÖ All columns are clean. No nulls found!")
#         else:
#             print(f"‚ö†Ô∏è Warning: {total_nulls} nulls still remain some columns.")

#         # Loading
#         print(f"Loading {len(team_info)} new rows into 'team_info'...")
        
#         team_info.to_sql(
#             'team_info', 
#             engine, 
#             if_exists='replace',
#             index=False, 
#             chunksize=5000
#         )
        
#         print(f"‚úÖ Team information successfully added {len(team_info)} new rows of data.")
    
#     except Exception as e:
#         print(f"‚ùå ETL Failed during extraction or loading: {e}")


# # Apply the function
# update_team_info(engine)

### Create fact_team_tables

In [None]:
def create_fact_team_tables(engine: Engine):    
    def load_fact_team_tables(engine: Engine, df, category):
        try:
            table_name = 'fact_team_' + category
            print(f"üíæ Creating {table_name}...")
            
            # Loading
            print(f"   üîÉ Loading {len(df)} rows...")
            
            df.to_sql(
                table_name, 
                engine, 
                if_exists='replace',
                index=False, 
                chunksize=5000
            )
            
            print(f"   ‚úÖ Successfully added {len(df)} new rows of data.")
        
        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")

    # Declare the years
    current_year  = date.today().year
    ten_years_ago = current_year - 10

    # Import the team data for the last 10 years
    fact_team_batting  = pyb.team_batting(ten_years_ago, current_year,  ind= 1, qual= 0)
    fact_team_pitching = pyb.team_pitching(ten_years_ago, current_year,  ind= 1, qual= 0)
    fact_team_fielding = pyb.team_fielding(ten_years_ago, current_year,  ind= 1, qual= 0)

    # Apply the function
    load_fact_team_tables(engine, fact_team_batting,  'batting')
    load_fact_team_tables(engine, fact_team_pitching, 'pitching')
    load_fact_team_tables(engine, fact_team_fielding, 'fielding')



Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.
Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.
Loading 330 new rows into 'team_info'...
‚úÖ Team information successfully added 330 new rows of data.


### Create fact_player

In [None]:
def create_fact_player_tables(engine: Engine):    
    def load_fact_player_tables(engine: Engine, df, category):
        try:
            table_name = 'fact_player_' + category
            print(f"üíæ Creating {table_name}...")
            
            # Loading
            print(f"   üîÉ Loading {len(df)} rows...")
            
            df.to_sql(
                table_name, 
                engine, 
                if_exists='replace',
                index=False, 
                chunksize=5000
            )
            
            print(f"   ‚úÖ Successfully added {len(df)} new rows of data.")
        
        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")

    # Declare the years
    current_year   = date.today().year
    five_years_ago = current_year - 5

    # Import the team data for the last 10 years
    # print("\n" + "="*40)
    # print(f"{'‚¨áÔ∏è  Importing player stats':^40}")
    # print(f"{'Please wait...':^40}")
    # print("="*40 + "\n")
    print("‚¨áÔ∏è  Importing player stats... please wait")
    
    fact_player_batting  = pyb.batting_stats(five_years_ago, current_year,  ind= 1, qual= 0)
    fact_player_pitching = pyb.pitching_stats(five_years_ago, current_year,  ind= 1, qual= 0)
    fact_player_fielding = pyb.fielding_stats(five_years_ago, current_year,  ind= 1, qual= 0)
    
    # Speed tables are by year - they do not include range
    # Setup year range
    #current_year = datetime.now().year
    years = range(current_year - 9, current_year + 1) # Last 10 years including current

    all_dfs = []

    for year in years:
        #print(f"Fetching sprint speed for {year}...")
        try:
            # Fetch data
            df = pyb.statcast_sprint_speed(year, 50)
            
            # Adding the years
            df['Season'] = year
            
            all_dfs.append(df)
        except Exception as e:
            print(f"Could not fetch data for {year}: {e}")

    # Combine everything into one fact table
    fact_player_running = pd.concat(all_dfs, ignore_index=True)

    # Apply the function
    load_fact_player_tables(engine, fact_player_batting,  'batting')
    load_fact_player_tables(engine, fact_player_pitching, 'pitching')
    load_fact_player_tables(engine, fact_player_fielding, 'fielding')
    load_fact_player_tables(engine, fact_player_running, 'running')
    
create_fact_player_tables(engine)





      ‚¨áÔ∏è  IMPORTING BASEBALL STATS      
             Please wait...             

üíæ Creating fact_player_batting...
   üîÉ Loading 8673 rows...
   ‚úÖ Successfully added 8673 new rows of data.
üíæ Creating fact_player_pitching...
   üîÉ Loading 5106 rows...
   ‚úÖ Successfully added 5106 new rows of data.
üíæ Creating fact_player_fielding...
   üîÉ Loading 13553 rows...
   ‚úÖ Successfully added 13553 new rows of data.
üíæ Creating fact_player_running...
   üîÉ Loading 3830 rows...
   ‚úÖ Successfully added 3830 new rows of data.


In [None]:
# Historical player stats - It has data from 1871 but it doesn't have last year (2025)
player_batting_historical     = pylahman.Batting()
player_pitching_historical    = pylahman.Pitching()
player_fielding_historical    = pylahman.Fielding()
player_appearances_historical = pylahman.Appearances()

### Get scores last n days *NOT IN USE*

In [None]:
# def get_game_results_last_n_days(n_days=90):
#     """
#     Pulls raw pitch-by-pitch data for all games played in the last 'n_days' 
#     and then extracts the final score for each game.
#     """
#     today = date.today()
    
#     # 1. Calculate the start and end dates for the 90-day range
#     end_date = today - timedelta(days=1)  # Search up to yesterday
#     start_date = today - timedelta(days=n_days)
    
#     start_date_str = start_date.strftime('%Y-%m-%d')
#     end_date_str = end_date.strftime('%Y-%m-%d')
    
#     print(f"Searching for all games played from {start_date_str} to {end_date_str}...")

#     try:
#         # 2. Pull all pitch-by-pitch data in that range
#         all_data_in_range = pyb.statcast(start_dt=start_date_str, end_dt=end_date_str)
        
#     except Exception as e:
#         print(f"Error retrieving Statcast data: {e}")
#         return pd.DataFrame()

#     if all_data_in_range.empty:
#         print(f"No games found between {start_date_str} and {end_date_str}.")
#         return pd.DataFrame()

#     print(f"Successfully pulled {len(all_data_in_range)} pitch events.")

#     # 3. Sort the data chronologically by game_pk, inning, etc.
#     data_sorted = all_data_in_range.sort_values(
#         by=['game_pk', 'inning', 'inning_topbot', 'at_bat_number', 'pitch_number'],
#         ascending=True
#     )

#     # 4. Group by game_pk and take the last row (which contains the final score)
#     final_events = data_sorted.groupby('game_pk').tail(1).reset_index(drop=True)
    
#     # 5. Extract and rename the relevant columns for the final scoreboard
#     scoreboard = final_events[[
#         'game_date', 
#         'home_team', 
#         'away_team', 
#         'home_score', 
#         'away_score'
#     ]].copy()
    
#     scoreboard.rename(columns={
#         'home_score': 'Home_Final_Score',
#         'away_score': 'Away_Final_Score',
#         'game_date': 'Date'
#     }, inplace=True)
    
#     # 6. Determine the Winner
#     scoreboard['Winner'] = scoreboard.apply(
#         lambda row: row['home_team'] if row['Home_Final_Score'] > row['Away_Final_Score'] else row['away_team'],
#         axis=1
#     )
#     scoreboard['Result'] = (
#         scoreboard['Winner'] + ' wins ' + 
#         scoreboard['Home_Final_Score'].astype(str) + '-' + 
#         scoreboard['Away_Final_Score'].astype(str)
#     )
    
#     return scoreboard[['Date', 'away_team', 'home_team', 'Away_Final_Score', 'Home_Final_Score', 'Winner', 'Result']]

# # --- EXECUTION ---
# results_yesterday_df    = get_game_results_last_n_days(n_days= 1)
# results_last_7_days_df  = get_game_results_last_n_days(n_days= 7)
# results_last_15_days_df = get_game_results_last_n_days(n_days= 15)
# results_last_30_days_df = get_game_results_last_n_days(n_days= 30)
# results_last_60_days_df = get_game_results_last_n_days(n_days= 60)
# results_last_90_days_df = get_game_results_last_n_days(n_days= 90)


# # if not results_last_90_days_df.empty:
# #     print(f"\n--- Game Results from the Last 90 Days ({len(results_last_90_days_df)} Games Found) ---")
# #     print(results_last_90_days_df.tail(10)) # Print the last 10 games found
# # else:
# #     print("\nNo games were found in the last 90 days.")

Searching for all games played from 2025-12-17 to 2025-12-17...
This is a large query, it may take a moment to complete
Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-17 and 2025-12-17.
Searching for all games played from 2025-12-11 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-11 and 2025-12-17.
Searching for all games played from 2025-12-03 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-12-03 and 2025-12-17.
Searching for all games played from 2025-11-18 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

No games found between 2025-11-18 and 2025-12-17.
Searching for all games played from 2025-10-19 to 2025-12-17...
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 28/28 [00:03<00:00,  7.52it/s]

Successfully pulled 2970 pitch events.
Searching for all games played from 2025-09-19 to 2025-12-17...
This is a large query, it may take a moment to complete



  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 58/58 [00:04<00:00, 14.14it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 53951 pitch events.


### Create fact_statcast_pitches

In [None]:
def create_fact_statcast_events_pitch_by_pitch(engine: Engine, n_days= 90):
    """
    Pulls raw pitch-by-pitch data for all games played in the last 'n_days' 
    and then extracts the final score for each game.
    """
    def load_fact_statcast_events(engine: Engine, df):
        try:
            table_name = 'fact_statcast_pitches'
            print(f"üíæ Creating {table_name}...")
            
            # Loading
            print(f"   üîÉ Loading {len(df)} rows...")
            
            df.to_sql(
                table_name, 
                engine, 
                if_exists='replace',
                index=False, 
                chunksize=5000
            )
            
            print(f"   ‚úÖ Successfully added {len(df)} new rows of data.")
        
        except Exception as e:
            print(f"   ‚ùå ETL Failed during extraction or loading: {e}")
    
    # Get today's date
    today = date.today()

    # Calculate the start and end dates for the n-day range
    end_date = today - timedelta(days= 1)  # Search up to yesterday
    start_date = today - timedelta(days= n_days)

    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')

    print(f"Searching for all games played from {start_date_str} to {end_date_str}...")

    try:
        # Pull all pitch-by-pitch data in that range
        fact_statcast_pitches_last_n_days = pyb.statcast(start_dt= start_date_str, end_dt= end_date_str)
        
    except Exception as e:
        print(f"Error retrieving Statcast data: {e}")
        return pd.DataFrame()

    if fact_statcast_pitches_last_n_days.empty:
        print(f"No games found between {start_date_str} and {end_date_str}.")
        return pd.DataFrame()

    print(f"Successfully pulled {len(fact_statcast_pitches_last_n_days)} pitch events for the last {n_days} days.")

    # def filter_days(df, days):
    #     cutoff = today - timedelta(days=days)
    #     # Convert 'Date' column to datetime objects if they aren't already
    #     df['game_date'] = pd.to_datetime(df['game_date']).dt.date
    #     return df[df['game_date'] >= cutoff]

    # # Sub-df from the main one
    # results_1_day_df   = filter_days(fact_statcast_pitches_last_90_days, 1)
    # results_7_days_df  = filter_days(fact_statcast_pitches_last_90_days, 7)
    # results_30_days_df = filter_days(fact_statcast_pitches_last_90_days, 30)

    # Apply the function
    load_fact_statcast_events(engine, fact_statcast_pitches_last_n_days)

Searching for all games played from 2025-09-19 to 2025-12-17...
This is a large query, it may take a moment to complete
Skipping offseason dates


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 58/58 [00:04<00:00, 13.99it/s]
  final_data = pd.concat(dataframe_list, axis=0).convert_dtypes(convert_string=False)


Successfully pulled 53951 pitch events.


In [None]:
# Find the max date in the fact_statcast_pitches
def get_latest_date_from_db():
    query = text("SELECT MAX(game_date) FROM fact_statcast_pitches")
    
    with engine.connect() as conn:
        result = conn.execute(query).scalar()
        
    return result

# Execute and calculate fetch window
last_date = get_latest_date_from_db()

if last_date:
    # I want to start fetching from the day AFTER the last recorded date
    fetch_start = (last_date + timedelta(days=1)).strftime('%Y-%m-%d')
    # Fetch up to yesterday
    fetch_end = (date.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    
    if fetch_start <= fetch_end:
        print(f"üîÑ Last data was {last_date}. Fetching from {fetch_start} to {fetch_end}...")
        new_data = pyb.statcast(start_dt=fetch_start, end_dt=fetch_end)
        new_data.to_sql('fact_statcast_pitches', engine, if_exists='append', index=False)
    else:
        print("‚úÖ Database is already up to date.")
else:
    print("Empty table. You need to run an initial seed fetch.")


# TEST

### Splits by team

In [None]:
def teams_split(split_type, clean_mode):
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    # Define year
    year = datetime.now().year
    
    # Set up the WebDriver
    driver = webdriver.Chrome(options= options)  

    if split_type == 'LHP' or split_type == 'RHP': # for LHP and RHP pitchers
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=plato%7Cvs%20{split_type}%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == '7' or split_type == '14' or split_type == '28': # for the last 7, 14 and 28 days
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=total%7CLast%20{split_type}%20days%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'RH' or split_type == 'LH': # for RH and LH Starters
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=plato%7Cvs%20{split_type}%20Starter%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'Home' or split_type == 'Away': # for home and away games
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=hmvis%7C{split_type}%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'first_batter_game':
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=leado%7C1st%20Batter%20G%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'vs_power_pitcher':
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=power%7Cvs.%20Power%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'vs_weak_pitcher':
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=power%7Cvs.%20Finesse%7CML%7C{year}%7Cbat%7CAB%7C")
    # For each team:
    elif split_type == 'ANA' or split_type == 'ARI' or split_type == 'ATL' or split_type == 'BAL' or split_type == 'BOS' \
        or split_type == 'CHC' or split_type == 'CHW' or split_type == 'CIN' or split_type == 'CLE' or split_type == 'COL' \
        or split_type == 'DET' or split_type == 'HOU' or split_type == 'KCR' or split_type == 'LAD' or split_type == 'FLA' \
        or split_type == 'MIL' or split_type == 'MIN' or split_type == 'NYM' or split_type == 'NYY' or split_type == 'OAK' \
        or split_type == 'PHI' or split_type == 'PIT' or split_type == 'SDP' or split_type == 'SEA' or split_type == 'SFG' \
        or split_type == 'STL' or split_type == 'TBD' or split_type == 'TEX' or split_type == 'TOR' or split_type == 'WSN':
            driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=oppon%7C{split_type}%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'vs_less_than_500_WP':
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=oppon%7CWP%20%3C%20.500%7CML%7C{year}%7Cbat%7CAB%7C")
    elif split_type == 'vs_greater_or_equal_than_500_WP':
        driver.get(f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params=oppon%7CWP%20%3E%3D%20.500%7CML%7C{year}%7Cbat%7CAB%7C")
    
    
    # Name of the table
    datatable_id = 'split1'

    # Explicitly wait for the table element to load
    datatable_xpath = f"//table[@id='{datatable_id}']"  # Update XPATH as needed
    try:
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        print(f"{datatable_id} ({split_type}) table loaded successfully.")
    except Exception as e:
        print(f"Error: Table {datatable_id} did not load. Details: {e}")
        driver.quit()

    # Wait for the load of the page
    time.sleep(10)

    # Locate the table
    table_element = driver.find_element(By.XPATH, datatable_xpath)
    text_content = table_element.text

    # Process the table content
    rows = text_content.split("\n")
    table_data = [row.split("\t") for row in rows]

    # Convert to dataframe
    df = pd.DataFrame(table_data)
    
    # Close the WebDriver
    driver.quit()    
    
    if clean_mode == 1:
        # Remove 'Roe' exactly (case-sensitive)
        df[0] = df[0].str.replace('Roe', '', regex=False)

        # Remove last row
        df = df.iloc[:-1]

        # Split column from right using spaces
        df = df[0].str.split(" ", n= 30, expand=True)

        # Set first row as header
        df.columns = df.iloc[0]  # Assign first row as column names
        df = df[1:].reset_index(drop=True)  # Remove first row and reset index

        # Remove the last column
        df = df.iloc[:, :-1]

        # Rename last 3 columns
        new_column_names = ["BAbip", "tOPS+", "sOPS+"]  # New names for last 3 columns
        df.columns.values[-3:] = new_column_names  # Assign new names

        # Remove the first column
        df = df.iloc[:, 1:]
    else:
        # Remove 'Roe' and GS exactly (case-sensitive)
        df[0] = df[0].str.replace('Roe', '', regex=False)
        df[0] = df[0].str.replace('GS', '', regex=False)

        # Remove last row
        df = df.iloc[:-1]

        # Remove rows where column 'A' contains 'Rk', but keep the first row
        df = df[~((df.index > 0) & (df[0].str.contains('Rk', na=False)))]

        # Split column from right using spaces
        df = df[0].str.split(" ", n= 30, expand=True)

        # Set first row as header
        df.columns = df.iloc[0]  # Assign first row as column names
        df = df[1:].reset_index(drop=True)  # Remove first row and reset index

        # Remove the first column
        df = df.iloc[:, 1:]

        # Remove the last 2 columns
        df = df.iloc[:, :-2]

        # New column names
        new_column_names = ['Team', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
                            'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDP', 'HBP', 'SH',
                            'SF', 'IBB', 'ROE', 'BAbip', 'tOPS+', 'sOPS+']

        # Rename all columns
        df.columns = new_column_names

    return df

# Call the function to get the teams split data
team_vs_lhp             = teams_split(split_type= 'LHP',  clean_mode= 0) # GS empty
team_vs_rhp             = teams_split(split_type= 'RHP',  clean_mode= 0) # GS empty
team_vs_lh_starters     = teams_split(split_type= 'LH',   clean_mode= 1)
team_vs_rh_starters     = teams_split(split_type= 'RH',   clean_mode= 1)
team_last_seven_days    = teams_split(split_type= '7',    clean_mode= 1)
team_last_fourteen_days = teams_split(split_type= '14',   clean_mode= 1)
team_last_28_days       = teams_split(split_type= '28',   clean_mode= 1)
team_home_games         = teams_split(split_type= 'Home', clean_mode= 1)
team_away_games         = teams_split(split_type= 'Away', clean_mode= 1)
team_first_batter_game  = teams_split(split_type= 'first_batter_game', clean_mode= 0) # GS empty
team_vs_power_pitcher   = teams_split(split_type= 'vs_power_pitcher',  clean_mode= 0) # GS empty
team_vs_weak_pitcher    = teams_split(split_type= 'vs_weak_pitcher',   clean_mode= 0) # GS empty
team_vs_power_team      = teams_split(split_type= 'vs_greater_or_equal_than_500_WP', clean_mode= 1)
team_vs_weak_team       = teams_split(split_type= 'vs_less_than_500_WP',             clean_mode= 1)

# # Direct matchups
team_laa = teams_split(split_type= 'ANA', clean_mode= 1)
team_ari = teams_split(split_type= 'ARI', clean_mode= 1)
team_atl = teams_split(split_type= 'ATL', clean_mode= 1)
team_bal = teams_split(split_type= 'BAL', clean_mode= 1)
team_bos = teams_split(split_type= 'BOS', clean_mode= 1)
team_chc = teams_split(split_type= 'CHC', clean_mode= 1)
team_chw = teams_split(split_type= 'CHW', clean_mode= 1)
team_cin = teams_split(split_type= 'CIN', clean_mode= 1)
team_cle = teams_split(split_type= 'CLE', clean_mode= 1)
team_col = teams_split(split_type= 'COL', clean_mode= 1)
team_det = teams_split(split_type= 'DET', clean_mode= 1)
team_hou = teams_split(split_type= 'HOU', clean_mode= 1)
team_kcr = teams_split(split_type= 'KCR', clean_mode= 1)
team_lad = teams_split(split_type= 'LAD', clean_mode= 1)
team_mia = teams_split(split_type= 'FLA', clean_mode= 1) 
team_mil = teams_split(split_type= 'MIL', clean_mode= 1)
team_min = teams_split(split_type= 'MIN', clean_mode= 1)
team_nym = teams_split(split_type= 'NYM', clean_mode= 1)
team_nyy = teams_split(split_type= 'NYY', clean_mode= 1)
team_oak = teams_split(split_type= 'OAK', clean_mode= 1)
team_phi = teams_split(split_type= 'PHI', clean_mode= 1)
team_pit = teams_split(split_type= 'PIT', clean_mode= 1)
team_sdp = teams_split(split_type= 'SDP', clean_mode= 1)
team_sea = teams_split(split_type= 'SEA', clean_mode= 1)
team_sfg = teams_split(split_type= 'SFG', clean_mode= 1)
team_stl = teams_split(split_type= 'STL', clean_mode= 1)
team_tbr = teams_split(split_type= 'TBD', clean_mode= 1)
team_tex = teams_split(split_type= 'TEX', clean_mode= 1)
team_tor = teams_split(split_type= 'TOR', clean_mode= 1)
team_wsn = teams_split(split_type= 'WSN', clean_mode= 1)

# Dictionary of dataframes for the teams
dic_team = {
    'LAA': team_laa,
    'AZ':  team_ari,
    'ATL': team_atl,
    'BAL': team_bal,
    'BOS': team_bos,
    'CHC': team_chc,
    'CHW': team_chw,
    'CIN': team_cin,
    'CLE': team_cle,
    'COL': team_col,
    'DET': team_det,
    'HOU': team_hou,
    'KC':  team_kcr,
    'LAD': team_lad,
    'MIA': team_mia,
    'MIL': team_mil,
    'MIN': team_min,
    'NYM': team_nym,
    'NYY': team_nyy,
    'ATH': team_oak,
    'PHI': team_phi,
    'PIT': team_pit,
    'SD':  team_sdp,
    'SEA': team_sea,
    'SF':  team_sfg,
    'STL': team_stl,
    'TB':  team_tbr,
    'TEX': team_tex,
    'TOR': team_tor,
    'WSH': team_wsn   
    }

# Add an ID column with the dictionary key as the identifier
for key, df in dic_team.items():
    df['ID'] = key  # Assign the dictionary key as the ID

# Concatenate all dataFrames in the dictionary
direct_matches = pd.concat(dic_team.values(), ignore_index=True)  # Resets index

dic_splits = {
    'team_vs_lhp'        :team_vs_lhp,        
    'team_vs_rhp'        :team_vs_rhp,
    'team_vs_lh_starters':team_vs_lh_starters,
    'team_vs_rh_starters':team_vs_rh_starters,
    'team_last_seven_days':team_last_seven_days,
    'team_last_fourteen_days':team_last_fourteen_days,
    'team_last_28_days':team_last_28_days,
    'team_home_games':team_home_games,
    'team_away_games':team_away_games,
    'team_first_batter_game':team_first_batter_game,
    'team_vs_power_pitcher':team_vs_power_pitcher,
    'team_vs_weak_pitcher':team_vs_weak_pitcher,
    'team_vs_power_team':team_vs_power_team,
    'team_vs_weak_team':team_vs_weak_team      
}



### Create batting_splits

In [47]:
YEAR = 2025
DATATABLE_ID = 'team_split1' 
MAX_RETRIES = 3 

# 2. Define the lists for iteration
team_abbreviations = ['BAL']
split_parameters = [
    {'type': 'LHP',            'desc': 'vs_LHP'},
    {'type': 'RHP',            'desc': 'vs_RHP'},
    {'type': '7',              'desc': 'last_7_days'},
    {'type': '14',             'desc': 'last_14_days'},
    {'type': '28',             'desc': 'last_28_days'},
    {'type': 'Home',           'desc': 'home_games'},
    {'type': 'Away',           'desc': 'away_games'},
    {'type': 'RH',             'desc': 'vs_RH_Starters'},
    {'type': 'LH',             'desc': 'vs_LH_Starters'},
    {'type': '1st',            'desc': '1st_Half'},
    {'type': '2nd',            'desc': '2nd_Half'},
    {'type': 'April%2FMarch',  'desc': 'April_March'},
    {'type': 'June',           'desc': 'June_Splits'},
    {'type': 'July',           'desc': 'July_Splits'},
    {'type': 'August',         'desc': 'August_Splits'},
    {'type': 'Sept%2FOct',     'desc': 'Sept_Oct_Splits'},
    {'type': 'C',              'desc':'C_Position'},
    {'type': '1B',             'desc': '1B_Position'},
    {'type': '2B',             'desc': '2B_Position'},
    {'type': '3B',             'desc': '3B_Position'},
    {'type': 'SS',             'desc': 'SS_Position'},
    {'type': 'LF',             'desc': 'LF_Position'},
    {'type': 'CF',             'desc': 'CF_Position'},
    {'type': 'RF',             'desc': 'RF_Position'},
    {'type': 'DH',             'desc': 'DH_Position'},
    {'type': 'PH',             'desc': 'PH_Position'},
    {'type': '1st%20Batter',   'desc': 'First_Batter_Game'},
    {'type': 'Leadoff%20Inn.', 'desc': 'First_Batter_Inning'},
    {'type': 'Batting%201st',  'desc': 'Batting_1st'},
    {'type': 'Batting%202nd',  'desc': 'Batting_2nd'},
    {'type': 'Batting%203rd',  'desc': 'Batting_3rd'},
    {'type': 'Batting%204th',  'desc': 'Batting_4th'},
    {'type': 'Batting%205th',  'desc': 'Batting_5th'},
    {'type': 'Batting%206th',  'desc': 'Batting_6th'},
    {'type': 'Batting%207th',  'desc': 'Batting_7th'},
    {'type': 'Batting%208th',  'desc': 'Batting_8th'},
    {'type': 'Batting%209th',  'desc': 'Batting_9th'},
    {'type': '1-3',            'desc': 'in_the_lineup_1-3rd'},
    {'type': '4-6',            'desc': 'in_the_lineup_4-6th'},
    {'type': '7-9',            'desc': 'in_the_lineup_7-9th'},
    {'type': 'SP',             'desc': 'vs_SP'},
    {'type': 'RP',             'desc': 'vs_RP'},
    {'type': 'Power',          'desc': 'vs_Power_Pitchers'},
    {'type': 'Finesse',        'desc': 'vs_Finesse_Pitchers'},
    {'type': 'ANA',            'desc': 'batting_vs_ANA'},
    {'type': 'ARI',            'desc': 'batting_vs_ARI'},
    {'type': 'ATL',            'desc': 'batting_vs_ATL'},
    {'type': 'BAL',            'desc': 'batting_vs_BAL'},
    {'type': 'BOS',            'desc': 'batting_vs_BOS'},
    {'type': 'CHC',            'desc': 'batting_vs_CHC'},
    {'type': 'CHW',            'desc': 'batting_vs_CHW'},
    {'type': 'CIN',            'desc': 'batting_vs_CIN'},
    {'type': 'CLE',            'desc': 'batting_vs_CLE'},
    {'type': 'COL',            'desc': 'batting_vs_COL'},
    {'type': 'DET',            'desc': 'batting_vs_DET'},
    {'type': 'HOU',            'desc': 'batting_vs_HOU'},
    {'type': 'KCR',            'desc': 'batting_vs_KCR'},
    {'type': 'LAD',            'desc': 'batting_vs_LAD'},
    {'type': 'FLA',            'desc': 'batting_vs_FLA'},
    {'type': 'MIL',            'desc': 'batting_vs_MIL'},
    {'type': 'MIN',            'desc': 'batting_vs_MIN'},
    {'type': 'NYM',            'desc': 'batting_vs_NYM'},
    {'type': 'NYY',            'desc': 'batting_vs_NYY'},
    {'type': 'OAK',            'desc': 'batting_vs_OAK'},
    {'type': 'PHI',            'desc': 'batting_vs_PHI'},
    {'type': 'PIT',            'desc': 'batting_vs_PIT'},
    {'type': 'SDP',            'desc': 'batting_vs_SDP'},
    {'type': 'SEA',            'desc': 'batting_vs_SEA'},
    {'type': 'SFG',            'desc': 'batting_vs_SFG'},
    {'type': 'STL',            'desc': 'batting_vs_STL'},
    {'type': 'TBD',            'desc': 'batting_vs_TBD'},
    {'type': 'TEX',            'desc': 'batting_vs_TEX'},
    {'type': 'TOR',            'desc': 'batting_vs_TOR'},
    {'type': 'WSN',            'desc': 'batting_vs_WSN'},
    {'type': 'Day',            'desc': 'batting_Day_Games'},
    {'type': 'Night',          'desc': 'batting_Night_Games'},
    {'type': 'Grass',          'desc': 'batting_Grass_Field_Games'},
    {'type': 'Artif.%20Turf',  'desc': 'batting_Artificial_Turf_Games'}
]

# Helper function to initialize driver
def initialize_driver():
    """Initializes and returns a new Selenium WebDriver instance."""
    options = Options()
    #options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage") 
    options.add_argument("--headless=new")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36") # Use a recent, common User-Agent
    # NOTE: Keep the path correct my Brave installation
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe" 
    
    # Attempt to start the driver with a timeout
    try:
        driver = webdriver.Chrome(options=options) 
        driver.set_page_load_timeout(60)
        return driver
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        return None


#  batter_split function
def batter_split(driver, split_type, team_abv, year, datatable_id, description):
    
    # --- URL CONSTRUCTION --- 
    if split_type == 'LHP' or split_type == 'RHP': # for LHP and RHP pitchers
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=plato%7Cvs%20{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == '7' or split_type == '14' or split_type == '28': # for the last 7, 14 and 28 days
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=total%7CLast%20{split_type}%20days%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'RH' or split_type == 'LH': # for RH and LH Starters
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=plato%7Cvs%20{split_type}%20Starter%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'Home' or split_type == 'Away': # for home and away games
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=hmvis%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == '1st' or split_type == '2nd': # for 1st and 2nd half of the season
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=half%7C{split_type}%20Half%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'April%2FMarch' or split_type == 'May' or split_type == 'June' \
        or split_type == 'July' or split_type == 'August' or split_type == 'Sept%2FOct': # for each month
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=month%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'C' or split_type == '1B' or split_type == '2B' or split_type == '3B' \
        or split_type == 'SS' or split_type == 'LF' or split_type == 'CF' or split_type == 'RF' \
        or split_type == 'DH' or split_type == 'PH': # for each position
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=defp%7Cas%20{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == '1st%20Batter': # first batter of the game
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%20G%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'Leadoff%20Inn.': # first batter of the inning
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'Batting%201st' or split_type == 'Batting%202nd' or split_type == 'Batting%203rd' \
        or split_type == 'Batting%204th' or split_type == 'Batting%205th' or split_type == 'Batting%206th' \
        or split_type == 'Batting%207th' or split_type == 'Batting%208th' or split_type == 'Batting%209th': # for each spot in the lineup
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=lineu%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == '1-3' or split_type == '4-6' or split_type == '7-9': # for each third of the lineup
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=innng%7CInnings%20{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'SP' or split_type == 'RP': # vs SP or RP
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=times%7Cvs.%20{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'Power' or split_type == 'avg.P%2FF' or split_type == 'Finesse':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=power%7Cvs.%20{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C" 
    elif split_type == 'ANA' or split_type == 'ARI' or split_type == 'ATL' or split_type == 'BAL' or split_type == 'BOS' \
        or split_type == 'CHC' or split_type == 'CHW' or split_type == 'CIN' or split_type == 'CLE' or split_type == 'COL' \
        or split_type == 'DET' or split_type == 'HOU' or split_type == 'KCR' or split_type == 'LAD' or split_type == 'FLA' \
        or split_type == 'MIL' or split_type == 'MIN' or split_type == 'NYM' or split_type == 'NYY' or split_type == 'OAK' \
        or split_type == 'PHI' or split_type == 'PIT' or split_type == 'SDP' or split_type == 'SEA' or split_type == 'SFG' \
        or split_type == 'STL' or split_type == 'TBD' or split_type == 'TEX' or split_type == 'TOR' or split_type == 'WSN':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=oppon%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"
    elif split_type == 'Day' or split_type == 'Night' or split_type == 'Grass' or split_type == 'Artif.%20Turf':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=stad%7C{split_type}%7C{team_abv}%7C{year}%7Cbat%7CAB%7C"

    else:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None # Let the main loop handle the retry/skip
        
    datatable_xpath = f"//table[@id='{datatable_id}']"
    
    # --- SCRAPING LOGIC --- 
    try:
        # Wait up to 30 seconds for the table
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        table_element = driver.find_element(By.XPATH, datatable_xpath)
        print(f"[{team_abv} - {description}]: Table loaded successfully.")
    except Exception:
        # This catches both TimeoutException and NoSuchElementException
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None 

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    html_string = StringIO(table_html)
    
    try:
        tables = pd.read_html(html_string, flavor='lxml') 
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    # Create an explicit copy
    df = tables[0].copy() 
    
    # --- CLEANING LOGIC --- 
    df.columns = df.columns.str.strip()
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']
        
    df = df.iloc[:-1] # Remove last row (Totals)
    
    df['description'] = description
    df['team'] = team_abv
    df['year'] = YEAR
    
    return df 

# Master loop with driver reuse and retry logic

batting_splits = pd.DataFrame()
driver = initialize_driver()

if driver is None:
    exit() # Stop if the driver failed to initialize

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:
            
            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Check if the driver is still alive (by checking its current URL)
                    driver.current_url 
                    
                    new_df = batter_split(
                        driver=driver,
                        split_type=split['type'], 
                        team_abv=team_abv, 
                        year=YEAR, 
                        datatable_id=DATATABLE_ID, 
                        description=split['desc']
                    )
                    
                    if new_df is not None and not new_df.empty:
                        batting_splits = pd.concat([batting_splits, new_df], ignore_index=True)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {len(batting_splits)}")
                        break # Break the retry loop on success
                    
                    # If new_df is None (due to TimeoutException/Table not found), retry
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2) # Short wait before retry

                except WebDriverException as e:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")
                    
                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass # Ignore errors on quitting a dead driver
                    
                    # Restart the driver
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")
                        
                    time.sleep(5) # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")
                    
                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break # Break retry loop on unexpected failure

            # Check if retry failed all attempts and the split was not appended
            else: 
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")
                
finally:
    # 3. CLEANUP: Quit the driver ONCE after all loops are finished
    print("-" * 30)
    print("All tasks finished. Quitting driver.")
    if 'driver' in locals() and driver:
        driver.quit() 
    
print("Scraping Complete.")
print(f"Final DataFrame Shape: {batting_splits.shape}")

Starting Scrape Job with Driver Reuse and Retry Logic...
------------------------------
[BAL - vs_LHP]: Table loaded successfully.
SUCCESS: Appended 30 rows. Master DF size: 30
[BAL - vs_RHP]: Table loaded successfully.
SUCCESS: Appended 34 rows. Master DF size: 64
[BAL - last_7_days]: Table loaded successfully.
SUCCESS: Appended 15 rows. Master DF size: 79
[BAL - last_14_days]: Table loaded successfully.
SUCCESS: Appended 16 rows. Master DF size: 95
[BAL - last_28_days]: Table loaded successfully.
SUCCESS: Appended 18 rows. Master DF size: 113
[BAL - home_games]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 151
[BAL - away_games]: Table loaded successfully.
SUCCESS: Appended 37 rows. Master DF size: 188
[BAL - vs_RH_Starters]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 226
[BAL - vs_LH_Starters]: Table loaded successfully.
SUCCESS: Appended 38 rows. Master DF size: 264
[BAL - 1st_Half]: Table loaded successfully.
SUCCESS: Appended 34

### Pitching splits

In [None]:
# --- CONFIGURATION ---
YEAR = 2025
DATATABLE_ID = 'team_split1' 
MAX_RETRIES = 3 

# 2. Define the lists for iteration
team_abbreviations = ['BAL']
split_parameters = [
    {'type': 'LHB',                      'desc': 'vs_LHB'},
    {'type': 'RHB',                      'desc': 'vs_RHB'},
    {'type': '7',                        'desc': 'last_7_days'},
    {'type': '14',                       'desc': 'last_14_days'},
    {'type': '28',                       'desc': 'last_28_days'},
    {'type': 'Home',                     'desc': 'home_games'},
    {'type': 'Away',                     'desc': 'away_games'},
    {'type': '1st',                      'desc': '1st_half'},
    {'type': '2nd',                      'desc': '2nd_half'},
    {'type': 'April%2FMarch',            'desc': 'april_march'},
    {'type': 'June',                     'desc': 'june_splits'},
    {'type': 'July',                     'desc': 'july_splits'},
    {'type': 'August',                   'desc': 'august_splits'},
    {'type': 'Sept%2FOct',               'desc': 'sept_oct_Splits'},
    {'type': '1st%20Batter',             'desc': 'first_batter_game'},
    {'type': 'Leadoff%20Inn.',           'desc': 'first_batter_inning'},
    {'type': 'Batting%201st',            'desc': 'pitching_vs_1st'},
    {'type': 'Batting%202nd',            'desc': 'pitching_vs_2nd'},
    {'type': 'Batting%203rd',            'desc': 'pitching_vs_3rd'},
    {'type': 'Batting%204th',            'desc': 'pitching_vs_4th'},
    {'type': 'Batting%205th',            'desc': 'pitching_vs_5th'},
    {'type': 'Batting%206th',            'desc': 'pitching_vs_6th'},
    {'type': 'Batting%207th',            'desc': 'pitching_vs_7th'},
    {'type': 'Batting%208th',            'desc': 'pitching_vs_8th'},
    {'type': 'Batting%209th',            'desc': 'pitching_vs_9th'},
    {'type': 'Starter',                  'desc': 'as_starter'},
    {'type': 'Reliever',                 'desc': 'as_reliever'},
    {'type': '0-2%20Runs',               'desc': 'run_support_0_2'},
    {'type': '3-5%20Runs',               'desc': 'run_support_3_5'},
    {'type': '6%2B%20Runs',              'desc': 'run_support_6_plus'},
    {'type': 'Swung%20at%201st%20Pitch', 'desc': 'outcome_of_at_bat_when_swung_at_first_pitch'},
    {'type': 'Took%201st%20Pitch',       'desc': 'outcome_of_at_bat_when_took_first_pitch'},
    {'type': '0',                        'desc': '0_outs_in_the_inning'},
    {'type': '1',                        'desc': '1_outs_in_the_inning'},
    {'type': '2',                        'desc': '2_outs_in_the_inning'},
    {'type': 'innng%7C1st',             'desc': 'pitching_in_1st_inning'},
    {'type': 'innng%7C2nd',             'desc': 'pitching_in_2nd_inning'},
    {'type': 'innng%7C3rd',             'desc': 'pitching_in_3rd_inning'},
    {'type': 'innng%7C4th',             'desc': 'pitching_in_4th_inning'},
    {'type': 'innng%7C5th',             'desc': 'pitching_in_5th_inning'},
    {'type': 'innng%7C6th',             'desc': 'pitching_in_6th_inning'},
    {'type': 'innng%7C7th',             'desc': 'pitching_in_7th_inning'},
    {'type': 'innng%7C8th',             'desc': 'pitching_in_8th_inning'},
    {'type': 'innng%7C9th',             'desc': 'pitching_in_9th_inning'},
    {'type': 'ANA',                      'desc': 'pitching_vs_ANA'},
    {'type': 'ARI',                      'desc': 'pitching_vs_ARI'},
    {'type': 'ATL',                      'desc': 'pitching_vs_ATL'},
    {'type': 'BAL',                      'desc': 'pitching_vs_BAL'},
    {'type': 'BOS',                      'desc': 'pitching_vs_BOS'},
    {'type': 'CHC',                      'desc': 'pitching_vs_CHC'},
    {'type': 'CHW',                      'desc': 'pitching_vs_CHW'},
    {'type': 'CIN',                      'desc': 'pitching_vs_CIN'},
    {'type': 'CLE',                      'desc': 'pitching_vs_CLE'},
    {'type': 'COL',                      'desc': 'pitching_vs_COL'},
    {'type': 'DET',                      'desc': 'pitching_vs_DET'},
    {'type': 'HOU',                      'desc': 'pitching_vs_HOU'},
    {'type': 'KCR',                      'desc': 'pitching_vs_KCR'},
    {'type': 'LAD',                      'desc': 'pitching_vs_LAD'},
    {'type': 'FLA',                      'desc': 'pitching_vs_FLA'},
    {'type': 'MIL',                      'desc': 'pitching_vs_MIL'},
    {'type': 'MIN',                      'desc': 'pitching_vs_MIN'},
    {'type': 'NYM',                      'desc': 'pitching_vs_NYM'},
    {'type': 'NYY',                      'desc': 'pitching_vs_NYY'},
    {'type': 'OAK',                      'desc': 'pitching_vs_OAK'},
    {'type': 'PHI',                      'desc': 'pitching_vs_PHI'},
    {'type': 'PIT',                      'desc': 'pitching_vs_PIT'},
    {'type': 'SDP',                      'desc': 'pitching_vs_SDP'},
    {'type': 'SEA',                      'desc': 'pitching_vs_SEA'},
    {'type': 'SFG',                      'desc': 'pitching_vs_SFG'},
    {'type': 'STL',                      'desc': 'pitching_vs_STL'},
    {'type': 'TBD',                      'desc': 'pitching_vs_TBD'},
    {'type': 'TEX',                      'desc': 'pitching_vs_TEX'},
    {'type': 'TOR',                      'desc': 'pitching_vs_TOR'},
    {'type': 'WSN',                      'desc': 'pitching_vs_WSN'},
    {'type': 'Day',                      'desc': 'pitching_Day_Games'},
    {'type': 'Night',                    'desc': 'pitching_Night_Games'},
    {'type': 'Grass',                    'desc': 'pitching_Grass_Field_Games'},
    {'type': 'Artif.%20Turf',            'desc': 'pitching_Artificial_Turf_Games'}
]

# Helper function to initialize driver
def initialize_driver():
    """Initializes and returns a new Selenium WebDriver instance."""
    options = Options()
    #options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage") 
    options.add_argument("--headless=new")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36") # Use a recent, common User-Agent
    # NOTE: Keep the path correct my Brave installation
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe" 
    
    # Attempt to start the driver with a timeout
    try:
        driver = webdriver.Chrome(options=options) 
        driver.set_page_load_timeout(60)
        return driver
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        return None


#  pitcher_split function
def pitcher_split(driver, split_type, team_abv, year, datatable_id, description):

    # --- URL CONSTRUCTION --- 
    if split_type == 'LHB' or split_type == 'RHB': # matchups vs LHB and RHB hitters
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=plato%7Cvs%20{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == '7' or split_type == '14' or split_type == '28': # for the last 7, 14 and 28 days
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=total%7CLast%20{split_type}%20days%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Home' or split_type == 'Away': # for home and away games
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=hmvis%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == '1st' or split_type == '2nd': # for 1st and 2nd half of the season
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=half%7C{split_type}%20Half%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'April%2FMarch' or split_type == 'May' or split_type == 'June' \
        or split_type == 'July' or split_type == 'August' or split_type == 'Sept%2FOct': # for each month
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=month%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == '1st%20Batter': # first batter of the game
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%20G%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Leadoff%20Inn.': # first batter of the inning
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Batting%201st' or split_type == 'Batting%202nd' or split_type == 'Batting%203rd' \
        or split_type == 'Batting%204th' or split_type == 'Batting%205th' or split_type == 'Batting%206th' \
        or split_type == 'Batting%207th' or split_type == 'Batting%208th' or split_type == 'Batting%209th': 
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=lineu%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Starter' or split_type == 'Reliever':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=sprel%7Cas%20{split_type}%7CT{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == '0-2%20Runs' or split_type == '3-5%20Runs' or split_type == '6%2B%20Runs':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=rs%7C{split_type}%20Scored%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Swung%20at%201st%20Pitch' or split_type == 'Took%201st%20Pitch':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=tkswg%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == '0' or split_type == '1' or split_type == '2':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=outs%7C{split_type}%20outs%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'innng%7C1st' or split_type == 'innng%7C2nd' or split_type == 'innng%7C3rd' \
        or split_type == 'innng%7C4th' or split_type == 'innng%7C5th' or split_type == 'innng%7C6th' \
        or split_type == 'innng%7C7th' or split_type == 'innng%7C8th' or split_type == 'innng%7C9th':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params={split_type}%20inning%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'ANA' or split_type == 'ARI' or split_type == 'ATL' or split_type == 'BAL' or split_type == 'BOS' \
        or split_type == 'CHC' or split_type == 'CHW' or split_type == 'CIN' or split_type == 'CLE' or split_type == 'COL' \
        or split_type == 'DET' or split_type == 'HOU' or split_type == 'KCR' or split_type == 'LAD' or split_type == 'FLA' \
        or split_type == 'MIL' or split_type == 'MIN' or split_type == 'NYM' or split_type == 'NYY' or split_type == 'OAK' \
        or split_type == 'PHI' or split_type == 'PIT' or split_type == 'SDP' or split_type == 'SEA' or split_type == 'SFG' \
        or split_type == 'STL' or split_type == 'TBD' or split_type == 'TEX' or split_type == 'TOR' or split_type == 'WSN':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=oppon%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    elif split_type == 'Day' or split_type == 'Night' or split_type == 'Grass' or split_type == 'Artif.%20Turf':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=stad%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CAB%7C"
    else:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None # Let the main loop handle the retry/skip
        
    datatable_xpath = f"//table[@id='{datatable_id}']"
    
    # --- SCRAPING LOGIC --- 
    try:
        # Wait up to 30 seconds for the table
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        table_element = driver.find_element(By.XPATH, datatable_xpath)
        print(f"[{team_abv} - {description}]: Table loaded successfully.")
    except Exception:
        # This catches both TimeoutException and NoSuchElementException
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None 

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    html_string = StringIO(table_html)
    
    try:
        tables = pd.read_html(html_string, flavor='lxml') 
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    # Create an explicit copy
    df = tables[0].copy() 
    
    # --- CLEANING LOGIC --- 
    df.columns = df.columns.str.strip()
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']
        
    df = df.iloc[:-1] # Remove last row (Totals)
    
    df['description'] = description
    df['team'] = team_abv
    df['year'] = YEAR
    
    return df 

# Master loop with driver reuse and retry logic

pitching_splits = pd.DataFrame()
driver = initialize_driver()

if driver is None:
    exit() # Stop if the driver failed to initialize

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:
            
            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Check if the driver is still alive (by checking its current URL)
                    driver.current_url 
                    
                    new_df = pitcher_split(
                        driver=driver,
                        split_type=split['type'], 
                        team_abv=team_abv, 
                        year=YEAR, 
                        datatable_id=DATATABLE_ID, 
                        description=split['desc']
                    )
                    
                    if new_df is not None and not new_df.empty:
                        pitching_splits = pd.concat([pitching_splits, new_df], ignore_index=True)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {len(pitching_splits)}")
                        break # Break the retry loop on success
                    
                    # If new_df is None (due to TimeoutException/Table not found), retry
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2) # Short wait before retry

                except WebDriverException as e:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")
                    
                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass # Ignore errors on quitting a dead driver
                    
                    # Restart the driver
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")
                        
                    time.sleep(5) # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")
                    
                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break # Break retry loop on unexpected failure

            # Check if retry failed all attempts and the split was not appended
            else: 
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")
                
finally:
    # 3. CLEANUP: Quit the driver ONCE after all loops are finished
    print("-" * 30)
    print("All tasks finished. Quitting driver.")
    if 'driver' in locals() and driver:
        driver.quit() 
    
print("Scraping Complete.")
print(f"Final DataFrame Shape: {pitching_splits.shape}")

### Pitching splits - Game Level

In [None]:
# --- CONFIGURATION ---
YEAR = 2025
DATATABLE_ID = 'team_split1' 
MAX_RETRIES = 3 

# 2. Define the lists for iteration
team_abbreviations = ['BAL']
split_parameters = [
    {'type': '7',                        'desc': 'last_7_days'},
    {'type': '14',                       'desc': 'last_14_days'},
    {'type': '28',                       'desc': 'last_28_days'},
    {'type': 'Home',                     'desc': 'home_games'},
    {'type': 'Away',                     'desc': 'away_games'},
    {'type': '1st',                      'desc': '1st_half'},
    {'type': '2nd',                      'desc': '2nd_half'},
    {'type': 'April%2FMarch',            'desc': 'april_march'},
    {'type': 'June',                     'desc': 'june_splits'},
    {'type': 'July',                     'desc': 'july_splits'},
    {'type': 'August',                   'desc': 'august_splits'},
    {'type': 'Sept%2FOct',               'desc': 'sept_oct_Splits'},
    {'type': 'Starter',                  'desc': 'as_starter'},
    {'type': 'Reliever',                 'desc': 'as_reliever'},
    {'type': '0-2%20Runs',               'desc': 'run_support_0_2'},
    {'type': '3-5%20Runs',               'desc': 'run_support_3_5'},
    {'type': '6%2B%20Runs',              'desc': 'run_support_6_plus'},
    {'type': 'ANA',                      'desc': 'pitching_vs_ANA'},
    {'type': 'ARI',                      'desc': 'pitching_vs_ARI'},
    {'type': 'ATL',                      'desc': 'pitching_vs_ATL'},
    {'type': 'BAL',                      'desc': 'pitching_vs_BAL'},
    {'type': 'BOS',                      'desc': 'pitching_vs_BOS'},
    {'type': 'CHC',                      'desc': 'pitching_vs_CHC'},
    {'type': 'CHW',                      'desc': 'pitching_vs_CHW'},
    {'type': 'CIN',                      'desc': 'pitching_vs_CIN'},
    {'type': 'CLE',                      'desc': 'pitching_vs_CLE'},
    {'type': 'COL',                      'desc': 'pitching_vs_COL'},
    {'type': 'DET',                      'desc': 'pitching_vs_DET'},
    {'type': 'HOU',                      'desc': 'pitching_vs_HOU'},
    {'type': 'KCR',                      'desc': 'pitching_vs_KCR'},
    {'type': 'LAD',                      'desc': 'pitching_vs_LAD'},
    {'type': 'FLA',                      'desc': 'pitching_vs_FLA'},
    {'type': 'MIL',                      'desc': 'pitching_vs_MIL'},
    {'type': 'MIN',                      'desc': 'pitching_vs_MIN'},
    {'type': 'NYM',                      'desc': 'pitching_vs_NYM'},
    {'type': 'NYY',                      'desc': 'pitching_vs_NYY'},
    {'type': 'OAK',                      'desc': 'pitching_vs_OAK'},
    {'type': 'PHI',                      'desc': 'pitching_vs_PHI'},
    {'type': 'PIT',                      'desc': 'pitching_vs_PIT'},
    {'type': 'SDP',                      'desc': 'pitching_vs_SDP'},
    {'type': 'SEA',                      'desc': 'pitching_vs_SEA'},
    {'type': 'SFG',                      'desc': 'pitching_vs_SFG'},
    {'type': 'STL',                      'desc': 'pitching_vs_STL'},
    {'type': 'TBD',                      'desc': 'pitching_vs_TBD'},
    {'type': 'TEX',                      'desc': 'pitching_vs_TEX'},
    {'type': 'TOR',                      'desc': 'pitching_vs_TOR'},
    {'type': 'WSN',                      'desc': 'pitching_vs_WSN'},
    {'type': 'Day',                      'desc': 'pitching_Day_Games'},
    {'type': 'Night',                    'desc': 'pitching_Night_Games'},
    {'type': 'Grass',                    'desc': 'pitching_Grass_Field_Games'},
    {'type': 'Artif.%20Turf',            'desc': 'pitching_Artificial_Turf_Games'}
]

# Helper function to initialize driver
def initialize_driver():
    """Initializes and returns a new Selenium WebDriver instance."""
    options = Options()
    #options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage") 
    options.add_argument("--headless=new")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36") # Use a recent, common User-Agent
    # NOTE: Keep the path correct my Brave installation
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe" 
    
    # Attempt to start the driver with a timeout
    try:
        driver = webdriver.Chrome(options=options) 
        driver.set_page_load_timeout(60)
        return driver
    except Exception as e:
        print(f"FATAL: Could not initialize Chrome driver. Check Brave path and driver version. Error: {e}")
        return None


#  pitcher_split function
def pitcher_split_game_level(driver, split_type, team_abv, year, datatable_id, description):

    # --- URL CONSTRUCTION --- 
    if split_type == 'LHB' or split_type == 'RHB': # matchups vs LHB and RHB hitters
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=plato%7Cvs%20{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == '7' or split_type == '14' or split_type == '28': # for the last 7, 14 and 28 days
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=total%7CLast%20{split_type}%20days%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Home' or split_type == 'Away': # for home and away games
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=hmvis%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == '1st' or split_type == '2nd': # for 1st and 2nd half of the season
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=half%7C{split_type}%20Half%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'April%2FMarch' or split_type == 'May' or split_type == 'June' \
        or split_type == 'July' or split_type == 'August' or split_type == 'Sept%2FOct': # for each month
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=month%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == '1st%20Batter': # first batter of the game
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%20G%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Leadoff%20Inn.': # first batter of the inning
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=leado%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Batting%201st' or split_type == 'Batting%202nd' or split_type == 'Batting%203rd' \
        or split_type == 'Batting%204th' or split_type == 'Batting%205th' or split_type == 'Batting%206th' \
        or split_type == 'Batting%207th' or split_type == 'Batting%208th' or split_type == 'Batting%209th': 
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=lineu%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Starter' or split_type == 'Reliever':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=sprel%7Cas%20{split_type}%7CT{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == '0-2%20Runs' or split_type == '3-5%20Runs' or split_type == '6%2B%20Runs':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=rs%7C{split_type}%20Scored%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Swung%20at%201st%20Pitch' or split_type == 'Took%201st%20Pitch':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=tkswg%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == '0' or split_type == '1' or split_type == '2':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=outs%7C{split_type}%20outs%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'innng%7C1st' or split_type == 'innng%7C2nd' or split_type == 'innng%7C3rd' \
        or split_type == 'innng%7C4th' or split_type == 'innng%7C5th' or split_type == 'innng%7C6th' \
        or split_type == 'innng%7C7th' or split_type == 'innng%7C8th' or split_type == 'innng%7C9th':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params={split_type}%20inning%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'ANA' or split_type == 'ARI' or split_type == 'ATL' or split_type == 'BAL' or split_type == 'BOS' \
        or split_type == 'CHC' or split_type == 'CHW' or split_type == 'CIN' or split_type == 'CLE' or split_type == 'COL' \
        or split_type == 'DET' or split_type == 'HOU' or split_type == 'KCR' or split_type == 'LAD' or split_type == 'FLA' \
        or split_type == 'MIL' or split_type == 'MIN' or split_type == 'NYM' or split_type == 'NYY' or split_type == 'OAK' \
        or split_type == 'PHI' or split_type == 'PIT' or split_type == 'SDP' or split_type == 'SEA' or split_type == 'SFG' \
        or split_type == 'STL' or split_type == 'TBD' or split_type == 'TEX' or split_type == 'TOR' or split_type == 'WSN':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=oppon%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    elif split_type == 'Day' or split_type == 'Night' or split_type == 'Grass' or split_type == 'Artif.%20Turf':
        url = f"https://www.baseball-reference.com/tools/split_stats_team.cgi?full=1&params=stad%7C{split_type}%7C{team_abv}%7C{year}%7Cpitch%7CIP%7C"
    else:
        print(f"Error: Split type '{split_type}' not supported yet.")
        return pd.DataFrame()

    try:
        driver.get(url)
    except TimeoutException:
        print(f"[{team_abv} - {description}]: Page load timed out (60s). Skipping or retrying...")
        return None # Let the main loop handle the retry/skip
        
    datatable_xpath = f"//table[@id='{datatable_id}']"
    
    # --- SCRAPING LOGIC --- 
    try:
        # Wait up to 30 seconds for the table
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, datatable_xpath))
        )
        table_element = driver.find_element(By.XPATH, datatable_xpath)
        print(f"[{team_abv} - {description}]: Table loaded successfully.")
    except Exception:
        # This catches both TimeoutException and NoSuchElementException
        print(f"[{team_abv} - {description}]: Table element not found after 30s. Check site content.")
        return None 

    # Extract the full HTML, wrap in StringIO, read with pandas
    table_html = table_element.get_attribute('outerHTML')
    html_string = StringIO(table_html)
    
    try:
        tables = pd.read_html(html_string, flavor='lxml') 
    except Exception as e:
        print(f"[{team_abv} - {description}]: Error parsing HTML with pandas: {e}")
        return None

    if not tables:
        print(f"[{team_abv} - {description}]: No tables found.")
        return None

    # Create an explicit copy
    df = tables[0].copy() 
    
    # --- CLEANING LOGIC --- 
    df.columns = df.columns.str.strip()
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in df.columns]

    if 'Rk' in df.columns:
        df = df[df['Rk'] != 'Rk']
        
    df = df.iloc[:-1] # Remove last row (Totals)
    
    df['description'] = description
    df['team'] = team_abv
    df['year'] = YEAR
    
    return df 

# Master loop with driver reuse and retry logic

pitching_splits_game_level = pd.DataFrame()
driver = initialize_driver()

if driver is None:
    exit() # Stop if the driver failed to initialize

print("Starting Scrape Job with Driver Reuse and Retry Logic...")
print("-" * 30)

try:
    # Outer loop for teams
    for team_abv in team_abbreviations:
        # Inner loop for splits
        for split in split_parameters:
            
            # Retry loop for failed connection/table load
            for attempt in range(MAX_RETRIES):
                try:
                    # Check if the driver is still alive (by checking its current URL)
                    driver.current_url 
                    
                    new_df = pitcher_split_game_level(
                        driver=driver,
                        split_type=split['type'], 
                        team_abv=team_abv, 
                        year=YEAR, 
                        datatable_id=DATATABLE_ID, 
                        description=split['desc']
                    )
                    
                    if new_df is not None and not new_df.empty:
                        pitching_splits_game_level = pd.concat([pitching_splits_game_level, new_df], ignore_index=True)
                        print(f"SUCCESS: Appended {len(new_df)} rows. Master DF size: {len(pitching_splits_game_level)}")
                        break # Break the retry loop on success
                    
                    # If new_df is None (due to TimeoutException/Table not found), retry
                    print(f"RETRYING: Attempt {attempt + 1}/{MAX_RETRIES} for {team_abv} - {split['desc']}...")
                    time.sleep(2) # Short wait before retry

                except WebDriverException as e:
                    # CRITICAL: Driver died (Connection refused/lost)
                    print(f"\n[FATAL ERROR] Driver connection lost for {team_abv} - {split['desc']}. Restarting driver...")
                    
                    # Clean up the old session
                    try:
                        driver.quit()
                    except Exception:
                        pass # Ignore errors on quitting a dead driver
                    
                    # Restart the driver
                    driver = initialize_driver()
                    if driver is None:
                        # If restart fails, stop the whole script
                        raise SystemExit("Driver restart failed. Terminating.")
                        
                    time.sleep(5) # Longer wait after a fatal crash
                    print("Driver successfully restarted. Retrying scrape.")
                    
                except Exception as e:
                    print(f"[{team_abv} - {split['desc']}]: Unhandled error: {e}")
                    break # Break retry loop on unexpected failure

            # Check if retry failed all attempts and the split was not appended
            else: 
                print(f"Skipping {team_abv} - {split['desc']} after {MAX_RETRIES} failed attempts.")
                
finally:
    # 3. CLEANUP: Quit the driver ONCE after all loops are finished
    print("-" * 30)
    print("All tasks finished. Quitting driver.")
    if 'driver' in locals() and driver:
        driver.quit() 
    
print("Scraping Complete.")
print(f"Final DataFrame Shape: {pitching_splits_game_level.shape}")

### Update the table statcast_pitches in PostgreSQL
#### This table shows the events pitch-by-pitch

In [None]:
pybaseball.cache.enable() # Enable caching for reliability

def update_statcast_data(engine: Engine):
    """
    Pulls Statcast data starting from the day AFTER the last record in the database
    to ensure only new events are downloaded and appended.
    """
    
    today = date.today()
    
    # --- STEP 1: FIND LAST DATE IN DB ---
    try:
        # Query the database to find the latest game_date currently stored
        with engine.connect() as connection:
            result = connection.execute(
                text("SELECT MAX(game_date) FROM statcast_pitches;")
            ).scalar()
        
        # If the table is empty, start from 400 days ago (initial load range)
        if result is None:
            print("Database is empty. Starting full initial load (400 days)...")
            last_date = today - timedelta(days=400)
        else:
            # Start the new pull from the day AFTER the last record
            last_date = result.date()
            print(f"Latest game_date found in DB: {last_date.strftime('%Y-%m-%d')}")
            
    except Exception as e:
        print(f"‚ùå ERROR querying database for last date: {e}. Defaulting to last 5 days.")
        last_date = today - timedelta(days=5)

    
    # --- STEP 2: DEFINE NEW EXTRACTION RANGE ---
    start_date = last_date + timedelta(days=1)
    end_date = today - timedelta(days=1) # Pull up to yesterday, as today's games aren't finished

    start_dt_str = start_date.strftime('%Y-%m-%d')
    end_dt_str = end_date.strftime('%Y-%m-%d')

    if start_date >= end_date:
        print(f"Data is up to date as of {end_dt_str}. No new extraction needed.")
        return

    print(f"Starting DAILY Statcast ETL: Pulling data from {start_dt_str} to {end_dt_str}")
    
    # --- STEP 3: EXTRACTION ---
    try:
        df = pyb.statcast(start_dt=start_dt_str, end_dt=end_dt_str)
        
        if df is None or df.empty:
            print("No new Statcast data retrieved for this date range. Exiting.")
            return

        #  --- STEP 4: TRANSFORMATION ---        
        # Handle data types before loading (optional, but good practice)
        df['game_date'] = pd.to_datetime(df['game_date'])
        
        # # --- STEP 5: LOADING ---
        print(f"Loading {len(df)} new rows into 'statcast_pitches'...")

        df.to_sql(
            'statcast_pitches', 
            engine, 
            if_exists='replace', # CRITICAL: Append new data to the existing table
            index=False, 
            chunksize=5000
        )
        
        print(f"‚úÖ Successfully appended {len(df)} new rows of Statcast data.")

    except Exception as e:
        print(f"‚ùå Statcast ETL Failed during extraction or loading: {e}")
        

# Execute the daily update
update_statcast_data(engine)

In [None]:


def extract_statcast_data(start_date, end_date):
    """Pulls granular, pitch-by-pitch data for a specified date range."""
    print(f"-> Pulling Statcast data from {start_date} to {end_date}...")
    
    # pybaseball statcast function is designed to handle this extraction
    raw_statcast_df = pyb.statcast(start_dt=start_date, end_dt=end_date)
    
    if raw_statcast_df is None or raw_statcast_df.empty:
        print("Warning: No Statcast data returned for this date range.")
        return pd.DataFrame()
        
    return raw_statcast_df


# Example
test_start_date = '2025-10-28'
test_end_date = '2025-10-30' 

daily_data = extract_statcast_data(test_start_date, test_end_date)
print(f"Successfully extracted {len(daily_data)} individual pitches/events.")

In [None]:
# game_pk: Integer. Game id provided by MLB Advanced Media.
# get statcast data for game_pk 
game_log = pyb.statcast_single_game(813024)

In [None]:
players_lahman = pylahman.People()
player_chadwick = pyb.chadwick_register()

# Join lahman and chadwick on key identifiers and bring all the columns from lahman
# Ignore if key_bbref is empty in chadwick
players_chadwick_clean = player_chadwick[player_chadwick['key_retro'].notna()]
players_lahman_clean   = players_lahman[players_lahman['retroID'].notna()]

players_df = pd.merge(
    players_chadwick_clean,
    players_lahman_clean,
    left_on=['key_retro'],
    right_on=['retroID'],
    how='left',
)

# Remove unnecesary columns and drop them from the dataframe
cols_to_remove = ['retroID', 'bbrefID', 'mlb_played_first', 'mlb_played_last']
players_df = players_df.drop(columns= cols_to_remove)

# Rename the fields
rename_map = {
    # IDs
    "key_mlbam":     "key_mlbam",
    "key_retro":     "key_retro",
    "key_bbref":     "key_bbref",
    "key_fangraphs": "key_fangraphs",
    "ID":            "id_lahman",
    "playerID":      "player_id_lahman",

    # Names
    "name_last":     "last_name_chadwick",
    "name_first":    "first_name_chadwick",
    "nameLast":      "last_name_lahman",
    "nameFirst":     "first_name_lahman",
    "nameGiven":     "first_and_second_name_lahman",

    # Debut/Final game
    "debut":         "debut",
    "finalGame":     "final_game",

    # Info
    "weight":        "weight",
    "height":        "height",
    "bats":          "bats",
    "throws":        "throws",

    # Birth/Death
    "birthYear":     "birth_year",
    "birthMonth":    "birth_month",
    "birthDay":      "birth_day",
    "birthCity":     "birth_city",
    "birthCountry":  "birth_country",
    "birthState":    "birth_state",
    "deathYear":     "death_year",
    "deathMonth":    "death_month",
    "deathDay":      "death_day",
    "deathCountry":  "death_country",
    "deathState":    "death_state",
    "deathCity":     "death_city",
}

# Apply the rename
players_df = players_df.rename(columns= rename_map)

# Order the new columns
ordered_cols = [
    "key_mlbam",
    "key_retro",
    "key_bbref",
    "key_fangraphs",
    "id_lahman",
    "player_id_lahman",
    "last_name_chadwick",
    "first_name_chadwick",
    "last_name_lahman",
    "first_name_lahman",
    "first_and_second_name_lahman",
    "debut",
    "final_game",
    "weight",
    "height",
    "bats",
    "throws",
    "birth_year",
    "birth_month",
    "birth_day",
    "birth_city",
    "birth_country",
    "birth_state",
    "death_year",
    "death_month",
    "death_day",
    "death_country",
    "death_state",
    "death_city"
]

# Apply the order
players_df = players_df[ordered_cols]

# This selects only columns with numbers and fills their nulls with -1
numeric_cols = players_df.select_dtypes(include=['number']).columns
players_df[numeric_cols] = players_df[numeric_cols].fillna(-1)

# Replace nulls in the text columns
text_cols = [
    "key_retro",
    "key_bbref",
    "player_id_lahman",
    "last_name_chadwick",
    "first_name_chadwick",
    "last_name_lahman",
    "first_name_lahman",
    "first_and_second_name_lahman",
    "bats",
    "throws",
    "birth_city",
    "birth_country",
    "birth_state",
    "death_country",
    "death_state",
    "death_city"
]

# Convert to a standard object type first and then fill the nulls with N/A
for col in text_cols:
    players_df[col] = players_df[col].astype(object).fillna('N/A')
    

# List the date columns
date_cols = [
    "debut",
    "final_game"
]
# Fill null dates with January 1st, 1700
for col in date_cols:
    players_df[col] = players_df[col].fillna(pd.Timestamp('1700-01-01'))

# Check for nulls in my table - there shouldn't be any
if (players_df.isnull().sum() == 0).all():
    print("‚úÖ No nulls found.")
else:
    print("‚ö†Ô∏è WARNING - There are nulls in some columns in the dataframe.")

In [None]:
# 1. Identify all text columns
text_cols = team_franchises.select_dtypes(include=['object', 'string']).columns

# 2. Convert to object FIRST, then fill
for col in text_cols:
    # Converting to object allows 'N/A' to be treated as a normal string
    team_franchises[col] = team_franchises[col].astype(object).fillna('N/A')
    
    # Just in case some were literal 'nan' strings:
    team_franchises[col] = team_franchises[col].replace(['nan', 'None', '<NA>'], 'N/A')

# 3. Final Verification with Emojis
null_count = team_franchises[text_cols].isnull().sum().sum()
if null_count == 0:
    print("‚úÖ All string columns are clean. No nulls found!")
else:
    print(f"‚ö†Ô∏è Warning: {null_count} nulls still remain in text columns.")

In [None]:
team_info = pylahman.Teams()

# Identify all text columns
text_cols = team_info.select_dtypes(include=['object', 'string']).columns

# Convert to object first, then fill with N/A
for col in text_cols:
    # Converting to object allows 'N/A' to be treated as a normal string
    team_info[col] = team_info[col].astype(object).fillna('N/A')
    
    # Just in case some were literal 'nan' strings:
    team_info[col] = team_info[col].replace(['nan', 'None', '<NA>'], 'N/A')

# This selects only columns with numbers and fills their nulls with -1
numeric_cols = team_info.select_dtypes(include=['number']).columns
team_info[numeric_cols] = team_info[numeric_cols].fillna(-1)

# Final verification
null_count_text    = team_info[text_cols].isnull().sum().sum()
null_count_numeric = team_info[numeric_cols].isnull().sum().sum()
total_nulls        = null_count_text + null_count_numeric

if total_nulls == 0:
    print("‚úÖ All columns are clean. No nulls found!")
else:
    print(f"‚ö†Ô∏è Warning: {total_nulls} nulls still remain some columns.")

### Create model
## dim_pitcher_archetypes

In [None]:
#def update_dim_pitcher_archetypes(engine: Engine):
"""
Groups pitchers into 8 archetypes and updates the database.
Now includes an 'updated_at' column to track the last run date.
"""

# 1. Pull unique pitcher stats
query = """
SELECT 
    pitcher,
    AVG(release_speed) as avg_velo, 
    AVG(release_spin_rate) as avg_spin, -- The spin rate of a pitch measured in revolutions per minute (rpm) at the moment of release
    AVG(pfx_x) as avg_horiz_mvmt, -- Horizontal movement in feet from the catcher's perspective
    AVG(pfx_z) as avg_vert_mvmt -- Vertical movement from the catcher's perpsective.
FROM fact_statcast_pitches
WHERE release_speed IS NOT NULL 
    AND release_spin_rate IS NOT NULL
    AND pfx_x IS NOT NULL 
    AND pfx_z IS NOT NULL
GROUP BY pitcher
HAVING COUNT(*) > 100 
"""
pitcher_stats = pd.read_sql(query, engine)

# 2. Scale the data
scaler = StandardScaler()
features = ['avg_velo', 'avg_spin', 'avg_horiz_mvmt', 'avg_vert_mvmt']
scaled_data = scaler.fit_transform(pitcher_stats[features])

# 3. Create 8 Archetypes
kmeans = KMeans(n_clusters=8, random_state=42, n_init=10)
pitcher_stats['archetype_id'] = kmeans.fit_predict(scaled_data)

# 4. Map IDs and Add Timestamp
archetype_map = {
    0: "Power Flamethrower",
    1: "Sinker / Tail Specialist",
    2: "Breaking Ball Specialist",
    3: "Standard Control Righty",
    4: "Position Player / Eephus",
    5: "Deceptive Angle Specialist",
    6: "Low-Spin / Heavy Sinker",
    7: "Power Slider / Sweeper"
}
pitcher_stats['archetype_name'] = pitcher_stats['archetype_id'].map(archetype_map)

# Add the current timestamp to every row
pitcher_stats['updated_at'] = datetime.now()

# 5. Database Update (Truncate and Append)
with engine.connect() as conn:
    try:
        conn.execute(text("TRUNCATE TABLE dim_pitcher_archetypes;"))
        conn.commit()
        print("Refreshing existing dim_pitcher_archetypes table...")
    except Exception:
        print("Table 'dim_pitcher_archetypes' not found. Creating it for the first time...")
        conn.rollback()

# Upload data including the new column
pitcher_stats[['pitcher', 'archetype_id', 'archetype_name', 'updated_at']].to_sql(
    'dim_pitcher_archetypes', 
    engine, 
    if_exists='append', 
    index=False
)

# 6. Ensure the Primary Key is set
pk_check = """
SELECT count(*) 
FROM information_schema.table_constraints 
WHERE table_name='dim_pitcher_archetypes' AND constraint_type='PRIMARY KEY';
"""
with engine.connect() as conn:
    has_pk = conn.execute(text(pk_check)).scalar()
    if has_pk == 0:
        conn.execute(text("ALTER TABLE dim_pitcher_archetypes ADD PRIMARY KEY (pitcher);"))
        conn.commit()
        print("‚úÖ Primary Key (pitcher) established.")

print(f"‚úÖ Successfully categorized {len(pitcher_stats)} pitchers.")

‚úÖ dim_pitcher_archetypes updated! Primary Key was preserved.


### "Luck Score" ranking

In [9]:
def calculate_luck_scores(engine, min_ab= 25):
    query = "SELECT * FROM vw_batter_vs_pitcher_archetype"
    df = pd.read_sql(query, engine)
    
    #? Filter by At-Bats first to ensure statistical significance
    # Even though the SQL view has a filter, we can tighten it here if needed
    df = df[df['at_bats'] >= min_ab].copy()
    
    #? Calculate Ranks (Percentiles)
    df['ev_rank'] = df['avg_exit_velo'].rank(pct=True)
    df['speed_rank'] = df['avg_sprint_speed'].rank(pct=True)
    
    # Lower time is BETTER for home_to_first, so we rank descending
    df['h1_rank'] = df['avg_home_to_first'].rank(pct=True, ascending=False)
    df['ba_rank'] = df['batting_avg'].rank(pct=True)
    
    # 3. Weighting the "Potential" (Expected Performance)
    # 60% Exit Velo, 20% Sprint, 20% Home-to-1st
    df['potential_score'] = (df['ev_rank'] * 0.6) + (df['speed_rank'] * 0.2) + (df['h1_rank'] * 0.2)
    
    #? Luck Score Calculation
    # A positive score means their physical tools exceed their actual results
    # If High Positive (Unlucky) e.g. 0.40
    # The player‚Äôs tools are elite (e.g., 90th percentile), but their results are 
    # poor (e.g., 50th percentile). They are likely hitting into the "loudest outs" in the league.

    # If Near Zero (Fair) e.g. 0.05
    # The player is getting exactly what they deserve. 
    # Their speed and power perfectly explain their batting average.

    # If High Negative (Lucky) e.g. -0.4:
    # The player has weak tools (e.g., 30th percentile) but a high batting average (e.g., 70th percentile). 
    # They are likely benefiting from "bloop" hits, defensive errors.
    df['luck_score'] = df['potential_score'] - df['ba_rank']
    
    #? Sample Size Adjustment
    # This 'confidence' metric tells you how much you should trust the luck score
    
    # 0.90 to 1.0 -> High Confidence. This player has faced this archetype many times. 
    # The luck score is likely a true reflection of their performance.
    
    # 0.5 -> Moderate. There is enough data to see a trend, 
    # but it could still be swayed by a single lucky or unlucky game.
    
    # 0.1 or lower -> Low Certainty. The player has very few At-Bats against this archetype. 
    # The high luck score might just be "small-sample noise".
    df['luck_confidence'] = df['at_bats'] / df['at_bats'].max()
    
    # Add the current timestamp to every row
    df['calculation_date'] = datetime.now()
    
    # Sort by the luckiest (most underperforming) players first
    return df.sort_values('luck_score', ascending=False)

luck_df = calculate_luck_scores(engine, min_ab= 25)
luck_df = luck_df[['batter', 'archetype_name', 'luck_score', 'luck_confidence', 'at_bats', 'calculation_date']]
# Example usage:
# luck_df = calculate_luck_scores(engine, min_ab=30)
# print(luck_df[['batter', 'archetype_name', 'at_bats', 'luck_score']].head(10))

# Get probable starters for the day

In [None]:
import statsapi
import pandas as pd

# 1. Get today's date automatically
today_str = datetime.now().strftime('%m/%d/%Y')

def get_daily_starters(date_str):
    # 1. Fetch all games for the specified date
    # No team ID means it pulls all 30 teams
    schedule = statsapi.schedule(date=date_str)
    
    games_list = []
    
    for game in schedule:
        # Extract the core info for your Matchup Predictor
        game_data = {
            "game_id": game.get("game_id"),
            "home_team": game.get("home_name"),
            "away_team": game.get("away_name"),
            "home_pitcher": game.get("home_probable_pitcher", "TBD"),
            "away_pitcher": game.get("away_probable_pitcher", "TBD"),
            "status": game.get("status"),
            "venue": game.get("venue_name")
        }
        games_list.append(game_data)
    
    # 2. Convert to DataFrame
    df = pd.DataFrame(games_list)
    return df

# Test with June 15, 2025
df_starters = get_daily_starters('6/15/2025')

# Display the first few rows
print(df_starters.head())

# Integrating with your "Matchup Predictor"
# Now that you have this DataFrame, you can loop through the home_pitcher and away_pitcher columns.

# For each name:

# Look up their Archetype in your database.

# Highlight the "Best Matchups" for that day 
# (e.g., "Today, 3 teams are facing 'Soft Tossers'‚Äîcheck your O's hitters' 
# luck scores against that archetype").

   game_id              home_team           away_team    home_pitcher  \
0   777505         Detroit Tigers     Cincinnati Reds    Tyler Holton   
1   777501      Baltimore Orioles  Los Angeles Angels   Scott Blewett   
2   777499  Philadelphia Phillies   Toronto Blue Jays    Zack Wheeler   
3   777504         Atlanta Braves    Colorado Rockies    Grant Holmes   
4   777503   Washington Nationals       Miami Marlins  MacKenzie Gore   

    away_pitcher status                        venue  
0     Wade Miley  Final                Comerica Park  
1  Yusei Kikuchi  Final  Oriole Park at Camden Yards  
2   Jos√© Berr√≠os  Final           Citizens Bank Park  
3  Austin Gomber  Final                  Truist Park  
4     Eury P√©rez  Final               Nationals Park  


### Get today's lineup

In [None]:
import statsapi
import pandas as pd

# 1. Get today's date automatically
today_str = datetime.now().strftime('%m/%d/%Y')

def get_lineups_to_df(date_str):
    # 1. Get all games for the day to find game_ids
    schedule = statsapi.schedule(date=date_str)
    
    lineup_records = []

    for game in schedule:
        game_id = game['game_id']
        team_names = {
            'home': game['home_name'],
            'away': game['away_name']
        }
        
        # 2. Pull boxscore data which contains the lineup
        try:
            box = statsapi.boxscore_data(game_id)
            
            for side in ['home', 'away']:
                # The 'battingOrder' list contains player IDs in order (1-9)
                order_ids = box[side].get('battingOrder', [])
                
                for slot, p_id in enumerate(order_ids, start=1):
                    player_info = box[side]['players'][f"ID{p_id}"]
                    
                    lineup_records.append({
                        "game_id": game_id,
                        "team": team_names[side],
                        "batting_order": slot,
                        "player_name": player_info['person']['fullName'],
                        "player_id": p_id,
                        "position": player_info['position']['abbreviation']
                    })
        except Exception as e:
            print(f"Lineup not yet available for {team_names['away']} @ {team_names['home']}")

    return pd.DataFrame(lineup_records)

# Test for a game day (using your June 15 example)
df_lineups = get_lineups_to_df('06/15/2025')
print(df_lineups.head(9)) # Show the lead-off through 9th hitter for the first game

   game_id            team  batting_order        player_name  player_id  \
0   777505  Detroit Tigers              1    Kerry Carpenter     681481   
1   777505  Detroit Tigers              2     Gleyber Torres     650402   
2   777505  Detroit Tigers              3       Riley Greene     682985   
3   777505  Detroit Tigers              4     Dillon Dingler     693307   
4   777505  Detroit Tigers              5  Spencer Torkelson     679529   
5   777505  Detroit Tigers              6     Zach McKinstry     656716   
6   777505  Detroit Tigers              7      Wenceel P√©rez     672761   
7   777505  Detroit Tigers              8        Javier B√°ez     595879   
8   777505  Detroit Tigers              9         Colt Keith     690993   

  position  
0       DH  
1       2B  
2       LF  
3        C  
4       1B  
5       3B  
6       RF  
7       SS  
8       PH  


In [19]:
# Player's in the lineup that are due for a hit based on their luck score

# Assuming your Luck Score table is loaded as df_luck
df_today_luck = pd.merge(
    df_lineups, 
    luck_df[['player_id', 'luck_score', 'avg_exit_velo']], 
    on='player_id', 
    how='inner'
)

# Display the unluckiest players in today's lineup (due for a hit!)
due_for_hit = df_today_luck.sort_values('luck_score').head(5)

KeyError: "['player_id', 'avg_exit_velo'] not in index"

In [None]:
import pandas as pd

# 1. Load your local archetype data
# (In a real app, this comes from your SQL database)
df_archetypes = pd.read_csv('pitcher_archetypes.csv') 

# 2. Get the daily starters from the previous step
df_starters = get_daily_starters('06/15/2025')

# 3. Merge for Away Pitchers
# We rename columns so we know which archetype belongs to which team
df_final = pd.merge(
    df_starters, 
    df_archetypes[['player_name', 'archetype']], 
    left_on='away_pitcher', 
    right_on='player_name', 
    how='left'
).rename(columns={'archetype': 'away_pitcher_type'}).drop(columns=['player_name'])

# 4. Merge for Home Pitchers
df_final = pd.merge(
    df_final, 
    df_archetypes[['player_name', 'archetype']], 
    left_on='home_pitcher', 
    right_on='player_name', 
    how='left'
).rename(columns={'archetype': 'home_pitcher_type'}).drop(columns=['player_name'])

# Display the result
print(df_final[['away_team', 'away_pitcher', 'away_pitcher_type', 'home_team', 'home_pitcher', 'home_pitcher_type']])

# What this enables in your Dashboard
# Now that you have the pitcher_type in your DataFrame, you can add "Smart Alerts" to your UI:

# Filter Logic: if row['home_pitcher_type'] == 'Breaking Ball Specialist':

# UI Trigger: Display a message: "Gunnar Henderson struggles against this pitcher type. Watch out for low-away sliders today."

# Heatmap Update: Automatically filter your Matchup Heatmap to only show the column for that specific archetype.

# Test 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

def run_advanced_archetypes(df):
    # 1. Select features for the models
    # We use percentiles and rates to normalize the 'scale' of data
    model_features = [
        'fb_velo', 'velo_pct', 'avg_spin', 'hb_adj', 'ivb', 
        'total_whiff_rate', 'whiff_pct', 'chase_pct', 
        'fb_putaway_pct', 'os_putaway_pct', 'zone_rate'
    ]
    
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[model_features])

    # 2. Model A: Isolation Forest (The "Elite/Unicorn" Detector)
    # contamination=0.05 targets the top 5% of 'unique' profiles
    iso = IsolationForest(contamination=0.05, random_state=42)
    df['is_unicorn'] = iso.fit_predict(scaled_data) # -1 is a Unicorn

    # 3. Model B: Gaussian Mixture Model (Archetype Probabilities)
    n_archetypes = 7
    gmm = GaussianMixture(n_components=n_archetypes, random_state=42, n_init=10)
    gmm.fit(scaled_data)
    probs = gmm.predict_proba(scaled_data)

    # 4. Map Clusters to Human Names
    # Note: You should check gmm.means_ to align these perfectly.
    cluster_names = [
        "Power Flamethrower",       # High velo_pct, High fb_putaway
        "Command Specialist",       # High zone_rate, Low hb_adj
        "Vertical Mover (Rise)",    # High ivb
        "Horizontal Specialist",    # High hb_adj (Sweeper/Sinker types)
        "Offspeed Artist",          # High os_putaway_pct
        "Chase Consultant",         # High chase_pct, Lower zone_rate
        "Kitchen Sink / Junkball"   # Average across the board
    ]

    # 5. The Logic: Multi-Tagging
    def assign_archetypes(idx, row):
        tags = []
        
        # Add GMM tags if probability > 20%
        for i, p in enumerate(probs[idx]):
            if p > 0.20:
                tags.append(cluster_names[i])
        
        # Add Performance-Based "Elite" Badges
        if row['is_unicorn'] == -1: tags.append("ü¶Ñ UNICORN")
        if row['whiff_pct'] >= 95: tags.append("Elite Whiff")
        if row['is_dead_zone'] == 1: tags.append("Dead Zone FB")
        
        return " | ".join(tags)

    df['archetype_labels'] = [assign_archetypes(i, row) for i, row in df.iterrows()]
    return df

# Assuming 'pitcher_df' is the result of your SQL query
final_archetypes = run_advanced_archetypes(pitcher_df)

# Old pitcher archetype model

In [3]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

def run_scouting_model(df):
    """
    Processes pitcher data through GMM clustering, anomaly detection, 
    and heuristic tagging to assign archetypes.
    """
    # 1. Advanced Feature Engineering
    # Stuff Score: Weights the physical 'lethality' of the pitches
    # Precision Score: Weights the control and ability to neutralize hitters
    df['stuff_score'] = (
        (df['velo_pct'] * 0.35) + 
        (df['extension_pct'] * 0.25) + 
        (df['iz_whiff_pct'] * 0.40)
    )
    
    df['precision_score'] = (
        (df['command_pct'] * 0.40) + 
        (df['neutrality_pct'] * 0.30) + 
        (df['paint_pct'] * 0.30)
    )

    # 2. Prep for Machine Learning Models
    model_features = [
        'fb_velo', 'avg_extension', 'vaa_proxy', 
        'raw_iz_whiff', 'k_bb_rate', 'stuff_score', 'precision_score'
    ]
    
    # Scale data for GMM and Isolation Forest
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[model_features])

    # 3. Isolation Forest (Detecting Unicorns/Outliers)
    iso = IsolationForest(contamination=0.05, random_state=42)
    df['is_unicorn'] = iso.fit_predict(scaled_data)

    # 4. GMM (Style Clustering)
    gmm = GaussianMixture(n_components=7, random_state=42)
    df['style_cluster'] = gmm.fit_predict(scaled_data)

    # 5. Heuristic Report Generation
    def generate_report(row):
        tags = []
        summary_parts = []
        
        # Style mapping remains the same
        style_map = {0:"Power", 1:"Command", 2:"Vertical", 3:"Horizontal", 
                    4:"Offspeed", 5:"Chase", 6:"Finesse"}
        style = style_map[row['style_cluster']]
        tags.append(style)

        # A. THE ELITE ACE (Tier 1)
        if row['stuff_score'] >= 85 and row['precision_score'] >= 70:
            tags.append("üëë ELITE ACE")
            summary_parts.append("A rare front-line starter combining dominant stuff with elite zone mastery.")
        
        # B. STUFF MONSTER (Tier 1 physical)
        elif row['stuff_score'] >= 92:
            tags.append("üî• STUFF MONSTER")
            summary_parts.append("Possesses an overpowering arsenal that generates elite swing-and-miss.")

        # C. UNICORN STATUS (Geometry/Physics Outlier)
        if row['is_unicorn'] == -1:
            tags.append("ü¶Ñ UNICORN")
            summary_parts.append("Mathematically unique physical profile; outlier release or movement traits.")

        # D. FLAT-ANGLE SOUTHPAW (Archetype specific)
        if row['p_throws'] == 'L' and row['vaa_proxy'] < -4.0 and row['stuff_score'] > 75:
            tags.append("üìâ FLAT-ANGLE LHP")
            summary_parts.append("Deceptive low-slot lefty with a 'rising' fastball profile.")

        # E. NEUTRALIZER
        if row['neutrality_pct'] >= 85:
            tags.append("‚öñÔ∏è NEUTRALIZER")
            summary_parts.append("Negates platoon advantages; equally effective against LHB and RHB.")

        # FIX: Final Summary Construction
        if summary_parts:
            # Join the high-value descriptions into one cohesive paragraph
            summary = " ".join(summary_parts)
        else:
            # Default fallback only if NO special traits were found
            summary = f"Reliable {style} pitcher with league-average traits."

        return " | ".join(list(set(tags))), summary

    # Apply tagging
    results = df.apply(generate_report, axis=1)
    df['archetype_tags'], df['scouting_summary'] = zip(*results)
    
    return df

def update_dim_pitcher_archetypes(engine):
    """
    SQL to extract the necessary metrics for the Python model.
    """
    query = text("""
    WITH aggregated_stats AS (
        SELECT 
            p.pitcher,
            p.p_throws,
            -- Velocity & Movement
            ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'FS', 'FO', 'SI') THEN p.release_speed END)::numeric, 1) as fb_velo,
            ROUND(AVG(p.release_extension)::numeric, 2) as avg_extension,
            
            -- VAA Calculation (Statcast Physics)
            -- Formula: -arctan(vz_f / vy_f) * (180/pi). vy_f is final y-velo, vz_f is final z-velo.
            ROUND(AVG(-ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / 
            (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159))::numeric, 2) as vaa_proxy,

            -- Skill Metrics
            COALESCE(SUM(CASE WHEN p.zone <= 9 AND p.description IN ('swinging_strike', 'swinging_strike_blocked') THEN 1 ELSE 0 END)::numeric / 
            NULLIF(SUM(CASE WHEN p.zone <= 9 AND p.description IN ('swinging_strike', 'foul', 'foul_tip', 'hit_into_play', 'swinging_strike_blocked') THEN 1 ELSE 0 END), 0), 0) as raw_iz_whiff,
            
            COALESCE(SUM(CASE WHEN p.zone IN (11, 12, 13, 14) THEN 1 ELSE 0 END)::numeric / COUNT(*), 0) as raw_paint,
            
            -- Platoon Split (wOBA Difference)
            ABS(
            COALESCE(SUM(CASE WHEN p.stand = 'L' THEN p.woba_value END) / NULLIF(SUM(CASE WHEN p.stand = 'L' THEN 1 ELSE 0 END), 0), 0) -
            COALESCE(SUM(CASE WHEN p.stand = 'R' THEN p.woba_value END) / NULLIF(SUM(CASE WHEN p.stand = 'R' THEN 1 ELSE 0 END), 0), 0)
            ) as platoon_split_abs,

            -- Command: K-BB%
            (SUM(CASE WHEN p.events = 'strikeout' THEN 1 ELSE 0 END)::numeric - 
                SUM(CASE WHEN p.events IN ('walk', 'hit_by_pitch') THEN 1 ELSE 0 END)::numeric) / 
                NULLIF(COUNT(DISTINCT p.at_bat_number), 0) as k_bb_rate,

            COALESCE(SUM(CASE WHEN p.description IN ('swinging_strike', 'swinging_strike_blocked') THEN 1 ELSE 0 END)::numeric / 
                NULLIF(SUM(CASE WHEN p.description IN ('swinging_strike', 'foul', 'foul_tip', 'hit_into_play', 'swinging_strike_blocked') THEN 1 ELSE 0 END), 0), 0) as raw_whiff,
            
            COALESCE(SUM(CASE WHEN p.launch_speed >= 95 THEN 1 ELSE 0 END)::numeric / 
                NULLIF(SUM(CASE WHEN p.launch_speed IS NOT NULL THEN 1 ELSE 0 END), 0), 0) as raw_hard_hit

        FROM fact_statcast_pitches p
        GROUP BY p.pitcher, p.p_throws
        HAVING COUNT(*) > 100 AND AVG(p.release_speed) > 86
    ),
    ranked_stats AS (
        SELECT 
            ast.*,
            ROUND((PERCENT_RANK() OVER (ORDER BY fb_velo))::numeric, 2) * 100 as velo_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY raw_whiff))::numeric, 2) * 100 as whiff_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY avg_extension))::numeric, 2) * 100 as extension_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY k_bb_rate))::numeric, 2) * 100 as command_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY platoon_split_abs DESC))::numeric, 2) * 100 as neutrality_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY raw_iz_whiff))::numeric, 2) * 100 as iz_whiff_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY raw_paint))::numeric, 2) * 100 as paint_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY raw_hard_hit DESC))::numeric, 2) * 100 as suppression_pct
        FROM aggregated_stats ast
    )
    SELECT 
        CONCAT(pn.first_name_chadwick, ' ', pn.last_name_chadwick) as full_name,
        rs.*
    FROM ranked_stats rs
    JOIN dim_player pn ON rs.pitcher = pn.key_mlbam;
    """)
    df = pd.read_sql(query, engine)
    return run_scouting_model(df)

# Execute
pitcher_archetypes = update_dim_pitcher_archetypes(engine)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

def explain_clusters_stable(df, model_features):
    # 1. Clean Data
    X = df[model_features].values
    y = df['style_cluster'].values.astype(int)

    # 2. Use Random Forest (More stable with various NumPy/SciPy versions)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # 3. Calculate Permutation Importance
    # This identifies which features define the clusters
    result = permutation_importance(rf, X, y, n_repeats=10, random_state=42)
    
    # Map importance back to feature names
    feature_importance = pd.Series(result.importances_mean, index=model_features)
    
    return rf, feature_importance

# Execute
model_features = ['fb_velo', 'avg_extension', 'vaa_proxy', 'raw_iz_whiff', 'k_bb_rate']
rf_model, trait_importance = explain_clusters_stable(pitcher_archetypes, model_features)

print("Top Drivers for your Clusters:")
print(trait_importance.sort_values(ascending=False))

Top Drivers for your Clusters:
k_bb_rate        0.436710
raw_iz_whiff     0.355542
fb_velo          0.305364
avg_extension    0.149583
vaa_proxy        0.050298
dtype: float64


# Pitcher archetype model

In [24]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

def run_scouting_model(df):
    """
    Synthesizes pitching identity (GMM) with performance outcomes (Whiff/Barrel)
    to generate advanced scouting archetypes.
    """
    
    # 1. Identity Features (Physical & Tactical only)
    # These define HOW he pitches, not how well he's doing.
    identity_features = [
        'ffour_usage', 'sinker_usage', 'bb_usage', 'offspeed_usage',
        'ffour_vaa_pct', 'velo_gap_pct', 'command_pct', 'paint_pct'
    ]
    
    # 2. Effectiveness Scores (Combining Inputs + Outcomes)
    # This helps us rank pitchers within their style groups
    df['lethality_score'] = (
        (df['whiff_pct'] * 0.70) + 
        (df['suppression_pct'] * 0.20) + 
        (df['velo_pct'] * 0.10)
    ).round(1)

    # 3. Clustering (Archetype Definition)
    scaler = StandardScaler()
    scaled_identity = scaler.fit_transform(df[identity_features].fillna(0))
    
    gmm = GaussianMixture(n_components=7, random_state=42)
    df['style_cluster'] = gmm.fit_predict(scaled_identity)

    # 4. Outlier Detection (Unicorns)
    iso = IsolationForest(contamination=0.04, random_state=42)
    df['is_unicorn'] = iso.fit_predict(scaled_identity)
    
    # def calculate_pitcher_grade(row):
    #     # Pillar 1: Lethality (Results)
    #     lethality = (row['whiff_pct'] * 0.7 + row['suppression_pct'] * 0.3)
        
    #     # Pillar 2: Physicality (Tools)
    #     physicality = (row['velo_pct'] * 0.55 + row['ffour_vaa_pct'] * 0.45)
        
    #     # Pillar 3: Execution (Stability)
    #     execution = (row['command_pct'] * 0.8 + row['neutrality_pct'] * 0.2)
        
    #     # Weighted GPA Score (0-100)
    #     gpa_score = (lethality * 0.40) + (physicality * 0.30) + (execution * 0.20)
        
    #     # Letter Grade Mapping
    #     if gpa_score   >= 90: return 'A+'
    #     elif gpa_score >= 80: return 'A'
    #     elif gpa_score >= 70: return 'B'
    #     elif gpa_score >= 55: return 'C'
    #     elif gpa_score >= 40: return 'D'
    #     else: return 'F'

    # # Apply the grade
    # df['overall_grade'] = df.apply(calculate_pitcher_grade, axis=1)

    # 5. Advanced Tagging Logic
    def generate_scouting_report(row):
        tags = []
        # Initialize Summary with the Grade
        #grade = row['overall_grade']
        # summary = f"[{grade} GRADE] " 
        summary = ""

        
        # if grade == 'A+':
        #     summary += "A franchise cornerstone with virtually no statistical weaknesses. "
        # elif grade == 'A':
        #     summary += "Top-of-the-rotation talent with elite specialized tools. "
        # elif grade == 'B':
        #     summary += "Above-average contributor with a reliable floor. "
        # elif grade == 'F':
        #     summary += "High-risk profile requiring significant mechanical or tactical overhaul. "
        
        # Style Definitions
        styles = {
            0: "Power Arsenal", 1: "Precision Specialist", 2: "High-Rise Vertical",
            3: "Groundball/Sinker", 4: "Deceptive Offspeed", 5: "Sweeper/Chase", 
            6: "Hybrid/Workhorse"
        }
        main_style = styles[row['style_cluster']]
        tags.append(main_style)

        # A. THE PERFORMANCE ELITE
        if row['lethality_score'] >= 85:
            tags.append("üëë ELITE DOMINATOR")
            summary += "Demonstrates league-leading ability to miss bats and suppress contact. "
        elif row['whiff_pct'] >= 85 and row['suppression_pct'] >= 85:
            tags.append("üíé EFFICIENCY KING")
            summary += "Rare dual-threat: elite whiff generation paired with barrel avoidance. "

        # B. THE CONTACT SUPPRESSOR (The 'Maddux' Profile)
        if row['suppression_pct'] >= 90 and row['whiff_pct'] < 50:
            tags.append("üõ°Ô∏è CONTACT MANAGER")
            summary += "Survives by inducing weak contact; masterful at staying off the barrel. "

        # C. THE GLASS CANNON (High Whiff, High Barrels)
        if row['whiff_pct'] >= 80 and row['suppression_pct'] < 30:
            tags.append("‚ö†Ô∏è GLASS CANNON")
            summary += "High-risk, high-reward. Dominates with whiffs but prone to loud contact when hit. "

        # D. MECHANICAL UNICORN
        if row['is_unicorn'] == -1:
            tags.append("ü¶Ñ UNICORN")
            summary += "Possesses a highly unique mechanical/usage profile that defies standard categorization. "

        # E. THE NEUTRALIZER
        if row['neutrality_pct'] >= 95:
            tags.append("‚öñÔ∏è NEUTRALIZER")
            summary += "Shows zero performance drop-off regardless of batter handedness. "

        # F. STRATEGIC BREAKOUT (From SQL)
        if row['breakout_potential'] != 'OPTIMIZED':
            tags.append("üöÄ BREAKOUT CANDIDATE")
            summary += f"Data suggests significant upside if usage is shifted toward: {row['breakout_potential']}. "

        # G. NEW: ULTRA-POWER OVERRIDE
        # This handles the "97 mph in Deceptive Offspeed" scenario
        if row['fb_velo'] >= 97:
            tags.append("‚ö° ULTRA-POWER")
            if main_style == "Power Arsenal":
                tags.append("üî® THOR")
                summary += "A 'Thor' profile: Elite 97+ mph velocity combined with an aggressive power philosophy. "
            else:
                summary += f"Note: Features elite 97+ mph heat, though currently utilizes a {main_style} tactical approach. "

        # Final Polish
        if not summary:
            summary = f"A reliable {main_style} with average league effectiveness."
            
        return " | ".join(list(set(tags))), summary.strip()

    # Apply Logic
    results = df.apply(generate_scouting_report, axis=1)
    df['archetype_tags'], df['scouting_summary'] = zip(*results)
    
    return df

def update_dim_pitcher_archetypes(engine):
    """
    SQL to extract the necessary metrics for the Python model.
    """
    query = text("""
    WITH attack_zone_stats AS (
        SELECT 
            p.*,
            -- Define Command/Paint Zones
            CASE 
                WHEN ABS(p.plate_x) <= 0.67 AND p.plate_z BETWEEN (p.sz_bot + 0.33) AND (p.sz_top - 0.33) THEN 'heart'
                WHEN ABS(p.plate_x) <= 1.1 AND p.plate_z BETWEEN (p.sz_bot - 0.33) AND (p.sz_top + 0.33) THEN 'shadow'
                WHEN ABS(p.plate_x) <= 1.5 AND p.plate_z BETWEEN (p.sz_bot - 0.75) AND (p.sz_top + 0.75) THEN 'chase'
                ELSE 'waste'
            END as attack_zone,
            -- Whiff Logic: 1 if the result was a swinging strike, 0 otherwise
            CASE WHEN p.description IN ('swinging_strike', 'swinging_strike_blocked', 'missed_bunt') THEN 1 ELSE 0 END as is_whiff,
            -- Swing Logic: 1 if the batter attempted a swing
            CASE WHEN p.description IN ('swinging_strike', 'swinging_strike_blocked', 'missed_bunt', 'foul', 'foul_tip', 'hit_into_play') THEN 1 ELSE 0 END as is_swing
        FROM fact_statcast_pitches p
    ),
    aggregated_stats AS (
        SELECT 
            p.pitcher,
            p.p_throws,
            COUNT(*) as total_pitches,        
            -- NEW RAW METRICS: Whiff & Barrel
            -- Whiff Rate: Whiffs / Swings
            ROUND(100.0 * SUM(p.is_whiff) / NULLIF(SUM(p.is_swing), 0), 2) as whiff_rate_raw,
            -- Barrel Rate: Barrels (launch_speed_angle = 6) / Batted Ball Events (type = 'X')
            ROUND(100.0 * SUM(CASE WHEN p.launch_speed_angle = 6 THEN 1 ELSE 0 END) / 
                NULLIF(SUM(CASE WHEN p.type = 'X' THEN 1 ELSE 0 END), 0), 2) as barrel_rate_raw,        
            -- Existing Metrics: Command & Paint
            ROUND(100.0 * SUM(CASE WHEN p.attack_zone = 'shadow' THEN 1 ELSE 0 END) / COUNT(*), 1) as paint_raw,
            ROUND(100.0 * SUM(CASE WHEN p.attack_zone IN ('shadow', 'chase') THEN 1 ELSE 0 END) / COUNT(*), 1) as command_raw,        
            -- Existing Metrics: Neutrality (Platoon)
            AVG(CASE WHEN p.stand = 'L' THEN p.estimated_woba_using_speedangle END) as xwoba_vs_lhb,
            AVG(CASE WHEN p.stand = 'R' THEN p.estimated_woba_using_speedangle END) as xwoba_vs_rhb,
            -- Velocity Grouping
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'SI') THEN p.release_speed END)::numeric, 1), 0) as fb_velo,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN p.release_speed END)::numeric, 1), 0) as offspeed_velo,                  
            -- Usage Percentages
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FC') THEN 1 ELSE 0 END) / COUNT(*), 1) as ffour_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('SI', 'FT') THEN 1 ELSE 0 END) / COUNT(*), 1) as sinker_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('CU', 'SL', 'KC', 'ST', 'SV', 'CS', 'KN') THEN 1 ELSE 0 END) / COUNT(*), 1) as bb_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN 1 ELSE 0 END) / COUNT(*), 1) as offspeed_usage,       
            -- Velocity Gap
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'SI') THEN p.release_speed END)::numeric - 
                        AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN p.release_speed END)::numeric, 1), 0) as velo_gap,       
            -- VAA Categories
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FC') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as ffour_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('SI', 'FT') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as sinker_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CU', 'SL', 'KC', 'ST', 'SV', 'CS', 'KN') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as bb_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'EP') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as offspeed_vaa

        FROM attack_zone_stats p
        GROUP BY p.pitcher, p.p_throws
        HAVING COUNT(*) > 100 AND AVG(p.release_speed) > 84
    ),
    ranked_stats AS (
        SELECT 
            ast.*,
            -- Percentile Rankings
            ROUND((PERCENT_RANK() OVER (ORDER BY fb_velo))::numeric, 2) * 100 as velo_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY offspeed_velo))::numeric, 2) * 100, 0) as offspeed_velo_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY velo_gap))::numeric, 2) * 100, 0) as velo_gap_pct,                  
            -- NEW RANKINGS: Whiff & Suppression
            ROUND((PERCENT_RANK() OVER (ORDER BY whiff_rate_raw))::numeric, 2) * 100 as whiff_pct,
            -- We order barrel_rate DESC because a lower rate is better (higher percentile)
            ROUND((PERCENT_RANK() OVER (ORDER BY barrel_rate_raw DESC))::numeric, 2) * 100 as suppression_pct,
            -- Existing Rankings
            ROUND((PERCENT_RANK() OVER (ORDER BY command_raw))::numeric, 2) * 100 as command_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY paint_raw))::numeric, 2) * 100 as paint_pct,
            ROUND((100 - (ABS(COALESCE(xwoba_vs_lhb, 0.320) - COALESCE(xwoba_vs_rhb, 0.320)) * 100))::numeric, 2) as neutrality_pct,
            -- VAA Percentile Rankings
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (ffour_usage > 0) ORDER BY ffour_vaa))::numeric, 2) * 100, 0) as ffour_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (sinker_usage > 0) ORDER BY sinker_vaa DESC))::numeric, 2) * 100, 0) as sinker_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (bb_usage > 0) ORDER BY bb_vaa DESC))::numeric, 2) * 100, 0) as bb_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY offspeed_vaa DESC))::numeric, 2) * 100, 0) as offspeed_vaa_pct
        FROM aggregated_stats ast
    )
    SELECT 
        CONCAT(pn.first_name_chadwick, ' ', pn.last_name_chadwick) as full_name,
        rs.*,
        -- Final Pitch Quality Scores
        ROUND((velo_pct * 0.25 + ffour_vaa_pct * 0.25 + whiff_pct * 0.5), 0) as ffour_quality_score,
        ROUND((offspeed_velo_pct * 0.25 + offspeed_vaa_pct * 0.25 + whiff_pct * 0.5), 0) as offspeed_quality_score,   
        -- Breakout Logic
        CASE 
            WHEN (ffour_vaa_pct > 80 AND ffour_usage < 20) THEN 'UNDERUSED ELITE FASTBALL'
            WHEN (bb_vaa_pct > 80 AND bb_usage < 15) THEN 'UNDERUSED ELITE BREAKING'
            WHEN (offspeed_vaa_pct > 80 AND offspeed_usage < 15) THEN 'UNDERUSED ELITE OFFSPD'
            ELSE 'OPTIMIZED'
        END as breakout_potential
    FROM ranked_stats rs
    JOIN dim_player pn ON rs.pitcher = pn.key_mlbam
    ORDER BY ffour_quality_score DESC;
    """)
    df = pd.read_sql(query, engine)
    
    return run_scouting_model(df)

# Execute
pitcher_archetypes = update_dim_pitcher_archetypes(engine)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

def visualize_pitcher_clusters(df):
    """
    Creates a 2D scatter plot of the clusters and a distribution breakdown.
    """
    # 1. Prepare Data (Same features used in the GMM)
    model_features = [
        'ffour_usage', 'sinker_usage', 'bb_usage', 'offspeed_usage',
        'ffour_vaa_pct', 'velo_gap_pct', 'command_pct', 'neutrality_pct'
    ]
    
    # Scale and Reduce Dimensions
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[model_features].fillna(0))
    
    pca = PCA(n_components=2)
    pca_results = pca.fit_transform(scaled_data)
    df['pca_1'] = pca_results[:, 0]
    df['pca_2'] = pca_results[:, 1]

    # 2. Plotting
    plt.figure(figsize=(12, 8))
    
    # Create the scatter plot
    scatter = sns.scatterplot(
        x='pca_1', y='pca_2', 
        hue='style_cluster', 
        style='is_unicorn',
        data=df, 
        palette='viridis', 
        s=100, 
        alpha=0.7
    )
    
    # Add labels for specific "Unicorns" or top pitchers
    top_pitchers = df.nlargest(5, 'stuff_score')
    for i, row in top_pitchers.iterrows():
        plt.text(row['pca_1'] + 0.02, row['pca_2'], row['full_name'], fontsize=9)

    plt.title('MLB Pitcher Archetype Clusters (PCA Projection)', fontsize=15)
    plt.xlabel('Principal Component 1 (Style Variance)')
    plt.ylabel('Principal Component 2 (Mechanical Variance)')
    plt.legend(title='Cluster ID', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.show()

    # 3. Print Cluster Summary
    print("### Cluster Archetype Breakdown ###")
    summary = df.groupby('style_cluster')[model_features].mean().round(1)
    print(summary)

# Call the function
visualize_pitcher_clusters(pitcher_archetypes)

### Test new archetype for pitcher

In [41]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

def run_scouting_model(df):
    """
    Synthesizes pitching identity (GMM) with performance outcomes (Whiff/Barrel),
    Perceived Power (Extension), and Vertical Separation.
    """
    
    # 1. Identity Features (Physical & Tactical only)
    identity_features = [
        'ffour_usage', 'sinker_usage', 'bb_usage', 'offspeed_usage',
        'ffour_vaa_pct', 'sinker_vaa_pct', 'bb_vaa_pct', 'offspeed_vaa_pct',
        'velo_gap_pct', 'command_pct', 'paint_pct'
    ]
    
    # 2. Effectiveness Scores
    df['lethality_score'] = (
        (df['whiff_pct'] * 0.75) + 
        (df['suppression_pct'] * 0.20) + 
        (df['velo_pct'] * 0.05)
    ).round(1)

    # 3. Clustering (Archetype Definition)
    scaler = StandardScaler()
    scaled_identity = scaler.fit_transform(df[identity_features].fillna(0))
    
    gmm = GaussianMixture(n_components=7, random_state=42)
    df['style_cluster'] = gmm.fit_predict(scaled_identity)

    # 4. Outlier Detection (Unicorns)
    iso = IsolationForest(contamination=0.04, random_state=42)
    df['is_unicorn'] = iso.fit_predict(scaled_identity)
    
    # def calculate_pitcher_grade(row):
    #     # 1. Lethality (Results)
    #     # We give more weight to Suppression for Starters to reward Skubal's profile
    #     lethality = (row['whiff_pct'] * 0.5 + row['suppression_pct'] * 0.4 + row['movement_gap_pct'] * 0.1)
        
    #     # 2. Physicality (Tools)
    #     physicality = (row['perceived_velo_pct'] * 0.45 + row['ffour_vaa_pct'] * 0.45 + row['extension_pct'] * 0.1)
        
    #     # 3. Execution (Stability)
    #     #execution = (row['command_pct'] * 0.45 + row['paint_pct'] * 0.45 + row['neutrality_pct'] * 0.1)
        
    #     # Execution: Now includes Tunneling (0-100 percentile)
    #     # Give Tunneling 20% of the Execution weight
    #     execution = (row['command_pct'] * 0.35 + 
    #                  row['paint_pct'] * 0.35 + 
    #                  row['tunnel_pct'] * 0.20 + 
    #                  row['neutrality_pct'] * 0.10)
        
    #     # 4. Weighted GPA Score
    #     base_gpa = (lethality * 0.50) + (physicality * 0.30) + (execution * 0.20)
        
    #     # 5. THE STARTER CURVE (The "Skubal Adjustment")
    #     # If they are a starter, we boost their GPA by 5 points to account for the difficulty of volume
    #     if row['is_starter'] == 1:
    #         base_gpa += 5
        
    #     # 2. THE "SAMPLE SIZE" PENALTY (NEW)
    #     # If a pitcher has fewer than 5 appearances, they cannot get an A+ 
    #     # because the data isn't "stable" yet.
    #     if row['total_appearances'] < 5:
    #         base_gpa -= 10
        
    #     # 6. Final Mapping (More generous thresholds for A+)
    #     if base_gpa >= 85: return 'A+'
    #     elif base_gpa >= 75: return 'A'
    #     elif base_gpa >= 63: return 'B'  # Slightly wider B range
    #     elif base_gpa >= 50: return 'C'
    #     elif base_gpa >= 35: return 'D'
    #     else: return 'F'
    
    def calculate_pitcher_grade(row):
        # 1. STUFF+ (Physicality)
        # This is the "Weapon" - 60% of the overall grade
        stuff_plus = row['stuff_plus_pct']
        
        # 2. LOCATION+ (Surgicality)
        # This is the "Aim" - 40% of the overall grade
        location_plus = row['location_plus_pct']
        
        # 3. PITCHING+ (The Master Score)
        # Note: We weigh Stuff higher because it's harder to find/teach
        base_score = (stuff_plus * 0.60) + (location_plus * 0.40)
        
        # 4. VOLUME/STARTER ADJUSTMENTS
        if row['is_starter'] == 1:
            base_score += 5  # The "Skubal Boost"
        
        if row['total_appearances'] < 5:
            base_score -= 10 # The "Sample Size Penalty"

        # 5. FINAL LETTER GRADE
        if base_score >= 85: 
            grade = 'A+'
        elif base_score >= 75: 
            grade = 'A'
        elif base_score >= 60: 
            grade = 'B'
        elif base_score >= 45: 
            grade = 'C'
        else: 
            grade = 'F'
            
        return grade, stuff_plus, location_plus

    #df['overall_grade'] = df.apply(calculate_pitcher_grade, axis=1)
    df[['overall_grade', 'stuff_plus_final', 'location_plus_final']] = df.apply(
        lambda x: pd.Series(calculate_pitcher_grade(x)), axis=1
    )

    # def generate_scouting_report(row):
    #     tags = []
    #     summary = f"[{row['overall_grade']} GRADE] "
        
    #     # 1. Framework Identity
    #     s_plus = row['stuff_plus_pct']
    #     l_plus = row['location_plus_pct']
        
    #     # THE "STUFF" LABELS
    #     if s_plus >= 90:
    #         tags.append("üí£ PURE FILTH")
    #     elif s_plus <= 20:
    #         tags.append("üìâ LACKS BITE")

    #     # THE "LOCATION" LABELS
    #     if l_plus >= 90:
    #         tags.append("üéØ SURGEON")
    #     elif l_plus <= 20:
    #         tags.append("üèπ WILD THING")

    #     # 2. COMBINATION SCOUTING (The "Pitching+" Profiles)
    #     if s_plus >= 85 and l_plus >= 85:
    #         tags.append("üëë DOMINANT FORCE")
    #         summary += "A rare combination of elite physical tools and surgical precision. "
        
    #     elif s_plus >= 85 and l_plus <= 40:
    #         tags.append("üíé RAW DIAMOND")
    #         summary += "Elite stuff that is currently unrefined; a primary candidate for a pitching lab overhaul. "
            
    #     elif l_plus >= 85 and s_plus <= 40:
    #         tags.append("üéì THE PROFESSOR")
    #         summary += "Succeeds through elite sequencing and location despite below-average raw velocity. "

    #     # 3. KEEPING YOUR FAVORITES
    #     if row['tunnel_pct'] >= 90: tags.append("üß¨ TUNNELER")
    #     if row['is_unicorn'] == -1: tags.append("ü¶Ñ UNICORN")
        
    #     return " | ".join(list(set(tags))), summary.strip()

    def generate_scouting_report(row):
        tags = []
        # Start the summary with the Grade and Handedness
        summary_header = f"[{row['overall_grade']} GRADE] ({row['hand']})"
        
        # 1. CORE IDENTITY TAGS (Physicality)
        s_plus = row['stuff_plus_pct']
        l_plus = row['location_plus_pct']
        
        if s_plus >= 90: tags.append("üí£ PURE FILTH")
        elif s_plus <= 20: tags.append("üìâ LACKS BITE")

        if l_plus >= 90: tags.append("üéØ SURGEON")
        elif l_plus <= 20: tags.append("üèπ WILD THING")

        # 2. MATCHUP TACTICS (New Logic)
        # We use the columns we just built in SQL
        profile = row['attack_profile']
        role = row['matchup_role']
        platoon = row['platoon_identity']
        
        # Build the Narrative Summary
        analysis = f"Identified as a {role}. "
        
        if "NORTH-SOUTH" in profile:
            analysis += "Wins vertically with high-carry fastballs; elite matchup against low-ball hitters. "
        elif "EAST-WEST" in profile:
            analysis += "Heavy horizontal movement profile; ideal for inducing double plays. "
        
        if platoon == "MATCHUP PROOF":
            tags.append("üõ°Ô∏è PLATOON NEUTRAL")
            analysis += "Maintains effectiveness regardless of batter handedness. "
        elif platoon == "PLATOON SENSITIVE":
            tags.append("‚ö†Ô∏è SPLIT RISK")
            analysis += "Performance drops significantly against opposite-handed hitters. "

        # 3. SPECIAL TRAITS
        if row['tunnel_pct'] >= 90: tags.append("üß¨ TUNNELER")
        if row['is_unicorn'] == -1: tags.append("ü¶Ñ UNICORN")
        if row['breakout_potential'] != 'OPTIMIZED':
            tags.append("üöÄ BREAKOUT")
            analysis += f"Tactical Alert: {row['breakout_potential']}. "

        # Create the final string
        tag_str = " | ".join(list(set(tags)))
        final_summary = f"{summary_header} {tag_str} ‚Äî {analysis.strip()}"
        
        return tag_str, final_summary

    # Apply to your DataFrame
    results = df.apply(generate_scouting_report, axis=1)
    df['archetype_tags'], df['scouting_summary'] = zip(*results)

    # Apply and split into two columns
    results = df.apply(generate_scouting_report, axis=1)
    df['archetype_tags'], df['scouting_summary'] = zip(*results)
    
    # # Apply Logic
    # results = df.apply(generate_scouting_report, axis=1)
    # df['archetype_tags'], df['scouting_summary'] = zip(*results)
    
    return df


def update_dim_pitcher_archetypes(engine):
    """
    SQL to extract the necessary metrics for the Python model.
    """
    query = text("""
    WITH attack_zone_stats AS (
    SELECT 
        p.*,
        -- Define Command/Paint Zones
        CASE 
            WHEN ABS(p.plate_x) <= 0.67 AND p.plate_z BETWEEN (p.sz_bot + 0.33) AND (p.sz_top - 0.33) THEN 'heart'
            WHEN ABS(p.plate_x) <= 1.1 AND p.plate_z BETWEEN (p.sz_bot - 0.33) AND (p.sz_top + 0.33) THEN 'shadow'
            WHEN ABS(p.plate_x) <= 1.5 AND p.plate_z BETWEEN (p.sz_bot - 0.75) AND (p.sz_top + 0.75) THEN 'chase'
            ELSE 'waste'
        END as attack_zone,
        CASE WHEN p.description IN ('swinging_strike', 'swinging_strike_blocked', 'missed_bunt') THEN 1 ELSE 0 END as is_whiff,
        CASE WHEN p.description IN ('swinging_strike', 'swinging_strike_blocked', 'missed_bunt', 'foul', 'foul_tip', 'hit_into_play') THEN 1 ELSE 0 END as is_swing
    FROM fact_statcast_pitches p
    ),
    vaa_base_calc AS (
        SELECT 
            az.*,
            CASE WHEN az.pitch_type IN ('FA', 'FF', 'FC') THEN 
                -ATAN((az.vz0 + (az.az * ((-az.vy0 - SQRT(az.vy0^2 - (2 * az.ay * (50 - (17/12))))) / az.ay))) / 
                (-SQRT(az.vy0^2 - (2 * az.ay * (50 - (17/12)))))) * (180/3.14159) 
            END as individual_ff_vaa
        FROM attack_zone_stats az
    ),
    aggregated_stats AS (
        SELECT 
            p.pitcher,
            p.p_throws,
            COUNT(*) as total_pitches,        
            AVG(p.release_extension) as avg_extension,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'SI') THEN p.release_speed + ((p.release_extension - 6.2) * 2) END)::numeric, 1), 0) as perceived_fb_velo,
            (AVG(CASE WHEN p.pitch_type IN ('FA', 'FF') THEN p.pfx_z * 12 END) - 
            AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'SI') THEN p.pfx_z * 12 END)) as v_break_gap_raw,
            
            ROUND(100.0 * SUM(p.is_whiff) / NULLIF(SUM(p.is_swing), 0), 2) as whiff_rate_raw,
            ROUND(100.0 * SUM(CASE WHEN p.launch_speed_angle = 6 THEN 1 ELSE 0 END) / 
                NULLIF(SUM(CASE WHEN p.type = 'X' THEN 1 ELSE 0 END), 0), 2) as barrel_rate_raw,        
            
            ROUND(100.0 * SUM(CASE WHEN p.attack_zone = 'shadow' THEN 1 ELSE 0 END) / COUNT(*), 1) as paint_raw,
            ROUND(100.0 * SUM(CASE WHEN p.attack_zone IN ('shadow', 'chase') THEN 1 ELSE 0 END) / COUNT(*), 1) as command_raw,        
            
            AVG(CASE WHEN p.stand = 'L' THEN p.estimated_woba_using_speedangle END) as xwoba_vs_lhb,
            AVG(CASE WHEN p.stand = 'R' THEN p.estimated_woba_using_speedangle END) as xwoba_vs_rhb,           
            
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'SI') THEN p.release_speed END)::numeric, 1), 0) as fb_velo,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN p.release_speed END)::numeric, 1), 0) as offspeed_velo,                                                                                                                                                                    
            
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FC') THEN 1 ELSE 0 END) / COUNT(*), 1) as ffour_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('SI', 'FT') THEN 1 ELSE 0 END) / COUNT(*), 1) as sinker_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('CU', 'SL', 'KC', 'ST', 'SV', 'CS', 'KN') THEN 1 ELSE 0 END) / COUNT(*), 1) as bb_usage,
            ROUND(100.0 * SUM(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN 1 ELSE 0 END) / COUNT(*), 1) as offspeed_usage,        
            
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('FA', 'FF', 'FT', 'FC', 'SI') THEN p.release_speed END)::numeric - 
                        AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'ST', 'SL', 'KC', 'GY', 'SV', 'CS', 'KN', 'EP') THEN p.release_speed END)::numeric, 1), 0) as velo_gap,        
            COALESCE(ROUND(AVG(p.individual_ff_vaa)::numeric, 2), 0) as ffour_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('SI', 'FT') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as sinker_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CU', 'SL', 'KC', 'ST', 'SV', 'CS', 'KN') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as bb_vaa,
            COALESCE(ROUND(AVG(CASE WHEN p.pitch_type IN ('CH', 'FS', 'FO', 'SC', 'EP') THEN -ATAN((p.vz0 + (p.az * ((-p.vy0 - SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12))))) / p.ay))) / (-SQRT(p.vy0^2 - (2 * p.ay * (50 - (17/12)))))) * (180/3.14159) END)::numeric, 2), 0) as offspeed_vaa,

            COUNT(DISTINCT game_pk) as total_appearances,
            (COUNT(*) / COUNT(DISTINCT game_pk)) as avg_pitches_per_app,
            CASE WHEN (COUNT(*) / COUNT(DISTINCT game_pk)) >= 40 AND COUNT(DISTINCT game_pk) >= 3 THEN 1 ELSE 0 END as is_starter,
            
            STDDEV(p.release_pos_x) as release_x_std,
            STDDEV(p.release_pos_z) as release_z_std,
            (STDDEV(p.release_pos_x) + STDDEV(p.release_pos_z)) as tunnel_raw,
            AVG(p.individual_ff_vaa - ((-0.68 * p.plate_z) - 3.8)) as vaa_above_expected_raw,       
            -- RAW STUFF+ (Process)
            ( (AVG(p.release_speed) * 0.4) + (AVG(p.release_extension) * 0.2) + (AVG(ABS(p.pfx_x)) * 12 * 0.2) + (AVG(p.pfx_z) * 12 * 0.2) ) as stuff_raw,
            -- RAW LOCATION+ (Process)
            ( (SUM(CASE WHEN p.attack_zone = 'shadow' THEN 1 ELSE 0 END)::float / COUNT(*)) * 0.6 + (SUM(CASE WHEN p.attack_zone = 'heart' THEN 0 ELSE 1 END)::float / COUNT(*)) * 0.4 ) as location_raw

        FROM vaa_base_calc p
        GROUP BY p.pitcher, p.p_throws
        HAVING COUNT(*) > 100 AND AVG(p.release_speed) > 84
    ),
    ranked_stats AS (
        SELECT 
            ast.*,
            ROUND((PERCENT_RANK() OVER (ORDER BY fb_velo))::numeric, 2) * 100 as velo_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY offspeed_velo))::numeric, 2) * 100, 0) as offspeed_velo_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY velo_gap))::numeric, 2) * 100, 0) as velo_gap_pct,                   
            ROUND((PERCENT_RANK() OVER (ORDER BY whiff_rate_raw))::numeric, 2) * 100 as whiff_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY barrel_rate_raw DESC))::numeric, 2) * 100 as suppression_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY command_raw))::numeric, 2) * 100 as command_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY paint_raw))::numeric, 2) * 100 as paint_pct,            
            ROUND((PERCENT_RANK() OVER (ORDER BY perceived_fb_velo))::numeric, 2) * 100 as perceived_velo_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0 OR sinker_usage > 0) ORDER BY v_break_gap_raw))::numeric, 2) * 100, 0) as movement_gap_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY avg_extension))::numeric, 2) * 100 as extension_pct,           
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (ffour_usage > 0) ORDER BY ffour_vaa))::numeric, 2) * 100, 0) as ffour_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (sinker_usage > 0) ORDER BY sinker_vaa DESC))::numeric, 2) * 100, 0) as sinker_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (bb_usage > 0) ORDER BY bb_vaa DESC))::numeric, 2) * 100, 0) as bb_vaa_pct,
            COALESCE(ROUND((PERCENT_RANK() OVER (PARTITION BY (offspeed_usage > 0) ORDER BY offspeed_vaa DESC))::numeric, 2) * 100, 0) as offspeed_vaa_pct,
            ROUND((100 - (ABS(COALESCE(xwoba_vs_lhb, 0.320) - COALESCE(xwoba_vs_rhb, 0.320)) * 100))::numeric, 2) as neutrality_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY tunnel_raw DESC))::numeric, 2) * 100 as tunnel_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY vaa_above_expected_raw))::numeric, 2) * 100 as vaa_plus_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY stuff_raw))::numeric, 2) * 100 as stuff_plus_pct,
            ROUND((PERCENT_RANK() OVER (ORDER BY location_raw))::numeric, 2) * 100 as location_plus_pct
        FROM aggregated_stats ast
    )
    SELECT 
        CONCAT(pn.first_name_chadwick, ' ', pn.last_name_chadwick) as full_name,
        rs.p_throws as hand,
        rs.*,
        -- MATCHUP COLUMN 1: ATTACK PROFILE (Rise vs Run)
        CASE 
            WHEN vaa_plus_pct > 75 THEN 'NORTH-SOUTH (High Rise)'
            WHEN sinker_usage > 25 THEN 'EAST-WEST (Sinker/Run)'
            WHEN movement_gap_pct > 75 THEN 'DECEPTIVE (High Break)'
            ELSE 'BALANCED'
        END as attack_profile,
        -- MATCHUP COLUMN 2: ROLE IDENTITY
        CASE 
            WHEN rs.whiff_pct > 75 AND rs.location_plus_pct > 75 THEN 'DOMINANT ACE'
            WHEN rs.whiff_pct > 75 AND rs.location_plus_pct < 40 THEN 'POWER ARMS (High Risk)'
            WHEN rs.location_plus_pct > 75 AND rs.whiff_pct < 45 THEN 'PITCH TO CONTACT SURGEON'
            ELSE 'ROTATION STABILIZER'
        END as matchup_role,
        -- MATCHUP COLUMN 3: PLATOON RESISTANCE
        CASE 
            WHEN rs.neutrality_pct > 75 THEN 'MATCHUP PROOF'
            WHEN rs.neutrality_pct < 35 THEN 'PLATOON SENSITIVE'
            ELSE 'STANDARD SPLITS'
        END as platoon_identity,
        ROUND((perceived_velo_pct * 0.25 + ffour_vaa_pct * 0.25 + whiff_pct * 0.5), 0) as ffour_quality_score,
        ROUND((movement_gap_pct * 0.25 + offspeed_vaa_pct * 0.25 + whiff_pct * 0.5), 0) as offspeed_quality_score,   
        CASE 
            WHEN (ffour_vaa_pct > 80 AND ffour_usage < 20) THEN 'UNDERUSED ELITE FASTBALL'
            WHEN (bb_vaa_pct > 80 AND bb_usage < 15) THEN 'UNDERUSED ELITE BREAKING'
            WHEN (offspeed_vaa_pct > 80 AND offspeed_usage < 15) THEN 'UNDERUSED ELITE OFFSPD'
            ELSE 'OPTIMIZED'
        END as breakout_potential
    FROM ranked_stats rs
    JOIN dim_player pn ON rs.pitcher = pn.key_mlbam
    ORDER BY stuff_plus_pct DESC;
    """)
    df = pd.read_sql(query, engine)
    
    return run_scouting_model(df)

# Execute
pitcher_archetypes = update_dim_pitcher_archetypes(engine)