In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from rapidfuzz import process
from datetime import datetime, timedelta
import re
import os

### Rotowire.com scores

In [2]:
def _parse_final_games(text_content, game_date):
    """Convert the scoreboard's visible text into one row per completed game.

    The text is split into lines; every span that starts on a line containing
    "Final" and ends on the next line containing "View Box Score" is collected.
    The collected lines are then regrouped 13 at a time, one group per game.
    The first four values and the last value of each group are discarded
    (presumably the status line, the R/H/E header and the box-score link --
    verify against the live page) and the remaining eight become the score
    columns.

    Args:
        text_content (str): The ``.text`` of the scoreboard container element.
        game_date (str): Value stored in the ``date`` column of every row.

    Returns:
        pandas.DataFrame: Columns Away, Home, R_Away, H_Away, E_Away, R_Home,
        H_Home, E_Home (all kept as scraped strings), date, winner, loser and
        the three integer ``diff_*_away_vs_home_team`` columns. The frame is
        empty (same columns) when no completed game is found.
    """
    rows_per_game = 13
    score_columns = ['Away', 'Home', 'R_Away', 'H_Away', 'E_Away', 'R_Home', 'H_Home', 'E_Home']
    derived_columns = [
        'date', 'winner', 'loser',
        'diff_runs_away_vs_home_team',
        'diff_hits_away_vs_home_team',
        'diff_errors_away_vs_home_team',
    ]

    # One dataframe row per line of visible text
    table_data = [row.split("\t") for row in text_content.split("\n")]
    df = pd.DataFrame(table_data)

    # Find indices for where "Final" and "View Box Score" appear
    start_indices = df[df[0].str.contains("Final", case=False)].index
    end_indices = df[df[0].str.contains("View Box Score", case=False)].index

    # Collect the lines of every completed game
    game_slices = []
    for start in start_indices:
        # The matching end is the first "View Box Score" after the start line
        end = end_indices[end_indices > start].min()
        if pd.notna(end):  # NaN means there is no end marker after this start
            game_slices.append(df.iloc[start:end + 1])

    # Days without completed games used to crash in pd.concat; return an empty frame instead
    if not game_slices:
        print(f"No completed games found for {game_date}.")
        return pd.DataFrame(columns=score_columns + derived_columns)

    # Reshape: each group of 13 rows becomes one row with 13 columns
    data = pd.concat(game_slices, ignore_index=True).values
    flattened = [data[i:i + rows_per_game].flatten() for i in range(0, len(data), rows_per_game)]
    reshaped_df = pd.DataFrame(flattened)

    # Drop the first 4 columns and last column, then add the headers
    reshaped_df = reshaped_df.drop(reshaped_df.columns[[0, 1, 2, 3, -1]], axis=1)
    reshaped_df.columns = score_columns

    # Add the date
    reshaped_df['date'] = game_date

    # Integer views of the scraped strings; the original string columns are left untouched
    runs_away = reshaped_df['R_Away'].astype('int64')
    runs_home = reshaped_df['R_Home'].astype('int64')
    hits_away = reshaped_df['H_Away'].astype('int64')
    hits_home = reshaped_df['H_Home'].astype('int64')
    errors_away = reshaped_df['E_Away'].astype('int64')
    errors_home = reshaped_df['E_Home'].astype('int64')

    # Calculate the winner, loser and the difference in runs, hits and errors.
    # When the run totals are equal, both winner and loser fall back to the home team.
    reshaped_df['winner'] = reshaped_df['Away'].where(runs_away > runs_home, reshaped_df['Home'])
    reshaped_df['loser'] = reshaped_df['Away'].where(runs_away < runs_home, reshaped_df['Home'])
    reshaped_df['diff_runs_away_vs_home_team'] = runs_away - runs_home
    reshaped_df['diff_hits_away_vs_home_team'] = hits_away - hits_home
    reshaped_df['diff_errors_away_vs_home_team'] = errors_away - errors_home

    return reshaped_df


def teams_matchups(game_date):
    """Scrape the RotoWire MLB scoreboard for one date and return its completed games.

    A headless browser loads the scoreboard page for ``game_date``, waits for the
    scoreboard container to appear, reads its visible text and hands it to
    ``_parse_final_games``. The browser is always closed before returning or
    raising.

    Args:
        game_date (str): Date placed in the ``date=`` query parameter of the
            scoreboard URL (callers in this notebook pass ``'%Y-%m-%d'`` strings).

    Returns:
        pandas.DataFrame: One row per completed game; see ``_parse_final_games``.

    Raises:
        Exception: Whatever the explicit wait raised if the scoreboard container
            did not appear within 60 seconds (re-raised after logging).
    """
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    datatable_id = 'grid-noGutter mb-15'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"https://www.rotowire.com/baseball/scoreboard.php?date={game_date}")

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Propagate the real cause instead of failing later on a closed session
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Wait for the load of the page
        time.sleep(10)

        # Locate the table and read its text while the session is still open
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Always release the browser process, on success and on failure
        driver.quit()

    return _parse_final_games(text_content, game_date)

# Build the date strings for the previous seven days (yesterday first)
dates = [
    (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
    for days_back in range(1, 8)
]

# Scrape every date and keep one dataframe per date, keyed by its date string
dataframes = {}
for game_date in dates:
    print(f"Processing data for date: {game_date}")
    result_df = teams_matchups(game_date)
    dataframes[game_date] = result_df

# Stack all dates into a single dataframe with a fresh 0..n-1 index
games = pd.concat(dataframes.values(), ignore_index=True)

# Store the date as datetime64 rather than text
games["date"] = pd.to_datetime(games["date"])

# Number the games from 1 (kept as text) and derive a "<yyyymmdd>_<game_id>" key
games["game_id"] = (games.index + 1).astype(str)
games["key"] = games["date"].dt.strftime("%Y%m%d") + "_" + games["game_id"]

# The scraped score columns are strings; convert each one to integers once and reuse
runs_away = games["R_Away"].apply(int)
runs_home = games["R_Home"].apply(int)
hits_away = games["H_Away"].apply(int)
hits_home = games["H_Home"].apply(int)
errors_away = games["E_Away"].apply(int)
errors_home = games["E_Home"].apply(int)
run_margin = (runs_away - runs_home).abs()
combined_runs = runs_away + runs_home

# Which side won: 1/0 flags plus an 'H'/'V' label ('V' whenever the home team did not outscore the visitor)
games["visitor_won"] = (runs_away > runs_home).astype("int64")
games["home_won"] = (runs_away < runs_home).astype("int64")
games["visit_or_home_victory"] = (runs_away < runs_home).map({True: 'H', False: 'V'})

# Shutout: at least one team scored zero runs
games["shutout"] = ((runs_away == 0) | (runs_home == 0)).astype("int64")

# One-run game: the final margin is exactly one run
games["one_run_game"] = (run_margin == 1).astype("int64")

# High-scoring game: ten or more combined runs
games["high_scoring_game"] = (combined_runs >= 10).astype("int64")

# Low-scoring game: three or fewer combined runs
games["low_scoring_game"] = (combined_runs <= 3).astype("int64")

# Blowout: the final margin is five runs or more
games["blowout"] = (run_margin >= 5).astype("int64")

# Combined totals for runs, hits and errors
games["total_runs"] = combined_runs
games["total_hits"] = hits_away + hits_home
games["total_errors"] = errors_away + errors_home

# Label each game as "<away> vs <home>"
games["teams"] = games["Away"] + " vs " + games["Home"]

# How many times each away/home pairing occurs in the scraped window
team_counts = games['teams'].value_counts()
games['team_matchup_count'] = games['teams'].map(team_counts)

# Give every distinct pairing a 1-based group id
games['series_id'] = games.groupby('teams').ngroup() + 1

# Export the dataframe to a CSV file
games.to_csv('D:\\mlb_analyzer\\output\\teams\\teams_matchup.csv', index=False)


Processing data for date: 2025-05-08
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-07
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-06
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-05
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-04
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-03
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-05-02
grid-noGutter mb-15 table loaded successfully.


## Basic pitcher information

In [3]:
def _parse_scheduled_games(text_content, today):
    """Convert the scoreboard's visible text into one row per scheduled game.

    Every span that starts on a line containing " ET" (case-insensitive) and
    ends on the next line containing "View Box Score" is collected. The
    collected lines are regrouped 8 at a time, one group per game; the last
    value of each group is dropped and the remaining seven become the output
    columns.

    Args:
        text_content (str): The ``.text`` of the scoreboard container element.
        today (str): Value stored in the ``date`` column of every row.

    Returns:
        pandas.DataFrame: Columns game_time, away_team, home_team,
        away_pitcher_name, away_pitcher_record, home_pitcher_name,
        home_pitcher_record and date. Empty (same columns) when no scheduled
        game is found.
    """
    rows_per_game = 8
    game_columns = [
        'game_time', 'away_team', 'home_team',
        'away_pitcher_name', 'away_pitcher_record',
        'home_pitcher_name', 'home_pitcher_record',
    ]

    # One dataframe row per line of visible text
    table_data = [row.split("\t") for row in text_content.split("\n")]
    df = pd.DataFrame(table_data)

    # Find indices for where the game time (" ET") and "View Box Score" appear
    start_indices = df[df[0].str.contains(" ET", case=False)].index
    end_indices = df[df[0].str.contains("View Box Score", case=False)].index

    # Collect the lines of every scheduled game
    game_slices = []
    for start in start_indices:
        # The matching end is the first "View Box Score" after the start line
        end = end_indices[end_indices > start].min()
        if pd.notna(end):  # NaN means there is no end marker after this start
            game_slices.append(df.iloc[start:end + 1])

    # A day without scheduled games used to crash in pd.concat; return an empty frame instead
    if not game_slices:
        print(f"No scheduled games found for {today}.")
        return pd.DataFrame(columns=game_columns + ['date'])

    # Reshape: each group of 8 rows becomes one row with 8 columns
    data = pd.concat(game_slices, ignore_index=True).values
    flattened = [data[i:i + rows_per_game].flatten() for i in range(0, len(data), rows_per_game)]
    reshaped_df = pd.DataFrame(flattened)

    # Drop the last column, then add the headers
    reshaped_df = reshaped_df.drop(reshaped_df.columns[[-1]], axis=1)
    reshaped_df.columns = game_columns

    # Add the date
    reshaped_df['date'] = today

    return reshaped_df


def games_today():
    """Get today's matchups from the RotoWire scoreboard.

    A headless browser loads the RotoWire MLB scoreboard for the current local
    date, waits for the scoreboard container to appear, reads its visible text
    and hands it to ``_parse_scheduled_games``. The browser is always closed
    before returning or raising.

    Returns:
        pandas.DataFrame: One row per scheduled game with the game time, away
        team, home team, away/home pitcher name and record, and the date
        (``'%Y-%m-%d'`` string).

    Raises:
        Exception: Whatever the explicit wait raised if the scoreboard container
            did not appear within 60 seconds (re-raised after logging).
    """
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    today = datetime.now().strftime('%Y-%m-%d')

    datatable_id = 'grid-noGutter mb-15'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"https://www.rotowire.com/baseball/scoreboard.php?date={today}")

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Propagate the real cause instead of failing later on a closed session
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Wait for the load of the page
        time.sleep(10)

        # Locate the table and read its text while the session is still open
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Always release the browser process, on success and on failure
        driver.quit()

    return _parse_scheduled_games(text_content, today)

# Scrape today's scheduled games
games_today_df = games_today()

# Store the date as datetime64 rather than text
games_today_df["date"] = pd.to_datetime(games_today_df["date"])

# Number the games from 1 using the row index, kept as text
games_today_df["game_id"] = (games_today_df.index + 1).astype(str)

# Export the dataframe to a CSV file
games_today_df.to_csv('D:\\mlb_analyzer\\output\\teams\\games_today.csv', index=False)


grid-noGutter mb-15 table loaded successfully.


## Advanced Matchups

In [4]:
def _parse_probable_pitchers(text_content, today):
    """Convert the probable-pitchers page text into three dataframes.

    The text is split into lines; every line containing " @ " starts a new
    matchup, and all lines up to the next such line are flattened into one row.
    Columns are addressed by position, so the result depends on the page
    keeping its current line layout.

    Args:
        text_content (str): The ``.text`` of the page's content container.
        today (str): Date string written into column ``1`` of the main result.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
        ``(df_split, df_never_faced_the_team, df_not_complete)``.
        ``df_split`` holds matchups whose stat lines could be split into named
        columns. ``df_never_faced_the_team`` holds rows whose column 6 equals
        "Never Faced Any Players on this Team.". ``df_not_complete`` holds rows
        whose column 9 is not the expected Statcast header line. All three are
        empty when the page lists no matchups.
    """
    never_faced_text = "Never Faced Any Players on this Team."
    statcast_header = "Exit Velo Launch Angle xBA xSLG xwOBA"

    # One dataframe row per line of visible text
    table_data = [row.split("\t") for row in text_content.split("\n")]
    df = pd.DataFrame(table_data)

    # Identify rows that contain " @ "; each one opens a new matchup
    split_indices = df[df[0].str.contains(" @ ")].index.tolist()

    # A page without matchups used to fail with KeyError; return empty frames instead
    if not split_indices:
        print("No matchups found on the probable pitchers page.")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Flatten the lines between consecutive matchup headers into one row each
    split_indices.append(len(df))
    split_data = [df.iloc[first:last].values.flatten().tolist()
                  for first, last in zip(split_indices, split_indices[1:])]
    new_df = pd.DataFrame(split_data)

    # Split col 2 on "ET" into two parts, then drop cols 2 and 3
    df_expanded = new_df.copy()  # Preserve other columns
    df_expanded[['Column1', 'Column2']] = df_expanded[2].str.split("ET", expand=True)
    df_expanded = df_expanded.drop(columns=[2, 3])

    #! Removing rows where col 4 contains 'to be announced'
    df_filtered = df_expanded[df_expanded[4] != "To be announced."]

    # Set aside rows where col 6 says the pitcher never faced this team
    df_never_faced_the_team = df_filtered[df_filtered[6] == never_faced_text].copy()
    df_filtered = df_filtered[df_filtered[6] != never_faced_text]

    # Set aside rows where col 9 is not the expected Statcast header
    df_not_complete = df_filtered[df_filtered[9] != statcast_header].copy()
    # Copy so the column assignments below modify an independent frame, not a slice
    df_filtered = df_filtered[df_filtered[9] == statcast_header].copy()

    # Convert empty strings to NaN for better handling
    df_filtered[17] = df_filtered[17].replace("", pd.NA)

    # Fill missing col 17 values with the most frequent non-empty value
    value_counts = df_filtered[17].dropna().value_counts()
    if not value_counts.empty:
        most_frequent_value = value_counts.idxmax()
        df_filtered[17] = df_filtered[17].fillna(most_frequent_value)

    # Split the space-separated stat lines into named columns.
    # 'Lunch_Angle_*' is misspelled but kept as-is because it is an exported column name.
    df_split = df_filtered.copy()  # Preserve other columns
    df_split[['PA_away_pitcher', 'K%_away_pitcher', 'BB%_away_pitcher', 'AVG_away_pitcher', 'wOBA_away_pitcher']] = df_split[8].str.split(" ", expand=True)
    df_split[['Exit_Velo_away_pitcher', 'unit_away', 'Lunch_Angle_away_pitcher', 'xBA_away_pitcher', 'xSLG_away_pitcher', 'xwOBA_away_pitcher']] = df_split[10].str.split(" ", expand=True)

    df_split[['PA_home_pitcher', 'K%_home_pitcher', 'BB%_home_pitcher', 'AVG_home_pitcher', 'wOBA_home_pitcher']] = df_split[16].str.split(" ", expand=True)
    df_split[['Exit_Velo_home_pitcher', 'unit_home', 'Lunch_Angle_home_pitcher', 'xBA_home_pitcher', 'xSLG_home_pitcher', 'xwOBA_home_pitcher']] = df_split[18].str.split(" ", expand=True)

    # Drop the raw columns that were just split or are no longer needed
    df_split = df_split.drop(columns=[6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19])

    # Remove city names from the matchup title so only team nicknames remain.
    # The '.' in "St. Louis" is escaped, and Cincinnati, Miami, Washington and
    # Oakland were added because they were missing from the original list.
    city_prefix = re.compile(
        r'\b(?:San Diego|Pittsburgh|Arizona|Philadelphia|Kansas City|Baltimore|Tampa Bay|New York'
        r'|Cleveland|Toronto|Minnesota|Boston|Los Angeles|Atlanta|Houston|Chicago|Seattle|Texas'
        r'|Milwaukee|St\. Louis|Detroit|Colorado|San Francisco|Cincinnati|Miami|Washington|Oakland)\b '
    )
    df_split[0] = df_split[0].apply(lambda text: city_prefix.sub('', text))

    # Replace @ in col 0 with "vs" (literal replacement, no regex needed)
    df_split[0] = df_split[0].str.replace(" @ ", " vs ", regex=False)

    # Split col 0 on " vs " into the away and home team names
    df_split[['away_team', 'home_team']] = df_split[0].str.split(" vs ", expand=True)

    # Update the column with the date
    df_split[1] = today

    return df_split, df_never_faced_the_team, df_not_complete


def advanced_matchups():
    """Scrape Baseball Savant's probable-pitchers page and parse today's matchups.

    A headless browser loads the page, waits for the content container to
    appear, reads its visible text and hands it to ``_parse_probable_pitchers``.
    The browser is always closed before returning or raising.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame, pandas.DataFrame]:
        ``(df_split, df_never_faced_the_team, df_not_complete)``; see
        ``_parse_probable_pitchers``.

    Raises:
        Exception: Whatever the explicit wait raised if the content container
            did not appear within 60 seconds (re-raised after logging).
    """
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    today = datetime.now().strftime('%Y-%m-%d')

    datatable_id = 'template__content template--two-column__content--one'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://baseballsavant.mlb.com/probable-pitchers")

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Propagate the real cause instead of failing later on a closed session
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Wait for the load of the page
        time.sleep(10)

        # Locate the table and read its text while the session is still open
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Always release the browser process, on success and on failure
        driver.quit()

    return _parse_probable_pitchers(text_content, today)


# Scrape today's probable-pitcher matchups; the function returns three dataframes
advanced_matchups_df, never_faced_the_team_df, df_not_complete = advanced_matchups()

# Write each dataframe to its own CSV file, in the same order as before
for frame, csv_path in (
    (advanced_matchups_df, 'D:\\mlb_analyzer\\output\\teams\\advanced_matchups\\advanced_matchups.csv'),
    (never_faced_the_team_df, 'D:\\mlb_analyzer\\output\\teams\\advanced_matchups\\never_faced_the_team.csv'),
    (df_not_complete, 'D:\\mlb_analyzer\\output\\teams\\advanced_matchups\\not_complete.csv'),
):
    frame.to_csv(csv_path, index=False)


template__content template--two-column__content--one table loaded successfully.


# Import advanced data and gamelogs for each team

#### This works for hitting and pitching

In [5]:
# def team_advanced_stats(analysis_type):
#     """Get the advanced stats for the team.
#     The function scrapes the advanced stats from the MLB website using Selenium and returns three dataframes:
#     statcast, plate_discipline and batted_ball_profile.

#     Returns:
#         _type_: _description_
#     """
#     # Load the options
#     options = Options()
#     options.add_argument("--headless")  # Optional: Run in headless mode
#     options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

#     year = datetime.now().year

#     # Set up the WebDriver
#     driver = webdriver.Chrome(options= options)    
#     driver.get(f"https://baseballsavant.mlb.com/team/114?view=statcast&nav={analysis_type}&season={year}")

#     datatable_id = 'div_statcast'
#     datatable_xpath = f"//div[@id='{datatable_id}']"  # Update XPATH as needed

#     # Initialize an empty dataframe
#     team_advanced_stats_df = pd.DataFrame()

#     try:
#         # Explicitly wait for the table element to load
#         WebDriverWait(driver, 20).until(
#             EC.presence_of_element_located((By.XPATH, datatable_xpath))
#         )
#         print(f"{datatable_id} table loaded successfully.")

#         # Locate the table
#         table_element = driver.find_element(By.XPATH, datatable_xpath)
#         text_content = table_element.text

#         # Process the table content
#         rows = text_content.split("\n")
#         table_data = [row.split("\t") for row in rows]

#         # Convert to dataframe
#         team_advanced_stats_df = pd.DataFrame(table_data)

#     except Exception as e:
#         print(f"Error: Table {datatable_id} did not load. Returning an empty dataframe. Details: {e}")

#     finally:
#         driver.quit()
    
#     # Check if the dataframe is empty
#     # If the dataframe is empty, return empty dataframes
#     if team_advanced_stats_df.empty:
#         print("No data found. Skipping table creation.")        
#         return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()  # Return empty dataframes
#     else:
        
#         #! STATCAST TABLE
#         #! Cleaning the data for the statcast table


#         def combine_rows(df, row_number_1, row_number_2):
#             """Combine two rows in a DataFrame into one row.
#             The first row will contain the combined data, and the second row will be dropped.

#             Args:
#                 df (_type_): _description_
#                 row_number_1 (_type_): _description_
#                 row_number_2 (_type_): _description_

#             Returns:
#                 _type_: _description_
#             """
#             # Combine the two rows
#             df.loc[row_number_1, 0] = df.loc[row_number_1, 0] + ' ' + df.loc[row_number_2, 0]

#             # Drop the second row
#             df = df.drop(row_number_2).reset_index(drop=True)
            
#             return df


#         # Join rows
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 19, 20)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 23, 24)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 24, 25)

#         #! Create the headers for the first table (statcast)
#         # Find the first occurrence of 'Player' and 'XWOBACON'
#         # Find indices
#         start_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'Player'].index[0]
#         end_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'XWOBACON'].index[0]

#         # Slice and transpose
#         headers_statcast = team_advanced_stats_df.iloc[start_idx:end_idx + 1].T

#         # Reset column names
#         headers_statcast.columns = headers_statcast.iloc[0]
#         headers_statcast         = headers_statcast[1:].reset_index(drop=True)

#         # Remove those rows from original dataframe
#         team_advanced_stats_df = team_advanced_stats_df.drop(team_advanced_stats_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

#         # Remove the first 3 rows
#         team_advanced_stats_df = team_advanced_stats_df.iloc[3:].reset_index(drop=True)  # Using iloc

#         # Use regex to split the column while preserving negative numbers
#         team_advanced_stats_df[['Name', 'Numbers']] = team_advanced_stats_df[0].str.extract(r'^(.*?)([-\d\s.,]*)$')

#         # Split the 'Numbers' column into separate columns (25 columns)
#         team_advanced_stats_df = team_advanced_stats_df.join(team_advanced_stats_df['Numbers'].str.split(expand=True).rename(lambda x: f'col_{x+1}', axis=1))

#         # Drop the original 'Numbers' column
#         team_advanced_stats_df = team_advanced_stats_df.drop(columns=['Numbers'])

#         # Find the first empty row
#         first_empty_idx = team_advanced_stats_df[team_advanced_stats_df[0] == ''].index.min()

#         # Extract rows from the start until the first empty row
#         statcast = team_advanced_stats_df.iloc[:first_empty_idx]

#         # Drop the first column
#         statcast = statcast.drop(columns=[0])

#         # Swap last name and first name
#         statcast['Name'] = statcast['Name'].str.split(', ').str[::-1].str.join(' ')

#         # Add the headers to the dataframe
#         statcast.columns = headers_statcast.columns

#         #! PLATE DISCIPLINE TABLE
#         #! Cleaning the data for the plate discipline table
#         # Find the first empty row
#         first_empty_idx = team_advanced_stats_df[team_advanced_stats_df[0] == ''].index.min()

#         # Remove rows from the first row until the first empty row
#         team_advanced_stats_df = team_advanced_stats_df.iloc[first_empty_idx + 1:].reset_index(drop=True)

#         # Remove the first row
#         team_advanced_stats_df = team_advanced_stats_df.iloc[1:].reset_index(drop= True)  # Using iloc

#         # Join rows
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 4, 5)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 5, 6)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 7, 8)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 9, 10)
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 13, 14)

#         #! Create the headers for the second table (plate discipline)
#         # Find the first occurrence of 'Player' and 'Meatball Swing %'
#         # Find indices
#         start_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'Player'].index[0]
#         end_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'Meatball Swing %'].index[0]

#         # Slice and transpose
#         headers_plate_discipline = team_advanced_stats_df.iloc[start_idx:end_idx + 1].T

#         # Reset column names
#         headers_plate_discipline.columns = headers_plate_discipline.iloc[0]
#         headers_plate_discipline         = headers_plate_discipline[1:].reset_index(drop=True)

#         # Remove those rows from original dataframe
#         team_advanced_stats_df = team_advanced_stats_df.drop(team_advanced_stats_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

#         # Find the first empty row
#         first_empty_idx = team_advanced_stats_df[team_advanced_stats_df[0] == ''].index.min()

#         # Extract rows from the start until the first empty row
#         plate_discipline = team_advanced_stats_df.iloc[:first_empty_idx]

#         # Drop the first column
#         plate_discipline = plate_discipline.drop(columns=[0])

#         # Swap last name and first name
#         plate_discipline['Name'] = plate_discipline['Name'].str.split(', ').str[::-1].str.join(' ')

#         # Drop columns where all values are NaN (empty)
#         plate_discipline = plate_discipline.dropna(axis= 1, how= 'all')

#         # Add the headers to the dataframe
#         plate_discipline.columns = headers_plate_discipline.columns

#         #! BATTED BALL PROFILE TABLE
#         #! Cleaning the data for the batted ball profile table
#         # Find the first empty row
#         first_empty_idx = team_advanced_stats_df[team_advanced_stats_df[0] == ''].index.min()

#         # Remove rows from the first row until the first empty row
#         team_advanced_stats_df = team_advanced_stats_df.iloc[first_empty_idx + 1:].reset_index(drop=True)

#         # Remove the first row
#         team_advanced_stats_df = team_advanced_stats_df.iloc[1:].reset_index(drop= True)  # Using iloc

#         # Join rows
#         team_advanced_stats_df = combine_rows(team_advanced_stats_df, 15, 16)

#         #! Create the headers for the third table (batted ball profile)
#         # Find the first occurrence of 'Player' and 'Barrel %'
#         # Find indices
#         start_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'Player'].index[0]
#         end_idx = team_advanced_stats_df[team_advanced_stats_df[0] == 'Barrel %'].index[0]

#         # Slice and transpose
#         headers_batted_ball_profile = team_advanced_stats_df.iloc[start_idx:end_idx + 1].T

#         # Reset column names
#         headers_batted_ball_profile.columns = headers_batted_ball_profile.iloc[0]
#         headers_batted_ball_profile         = headers_batted_ball_profile[1:].reset_index(drop=True)

#         # Remove those rows from original dataframe
#         team_advanced_stats_df = team_advanced_stats_df.drop(team_advanced_stats_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

#         # Find the first empty row
#         first_empty_idx = team_advanced_stats_df[team_advanced_stats_df[0] == ''].index.min()

#         if pd.isna(first_empty_idx):
#             batted_ball_profile = team_advanced_stats_df.copy()  # If no empty row, use the entire DataFrame
#         else:
#             # Extract rows from the start until the first empty row
#             batted_ball_profile = team_advanced_stats_df.iloc[:first_empty_idx]

#         # Drop the first column
#         batted_ball_profile = batted_ball_profile.drop(columns=[0])

#         # Swap last name and first name
#         batted_ball_profile['Name'] = batted_ball_profile['Name'].str.split(', ').str[::-1].str.join(' ')

#         # Drop columns where all values are NaN (empty)
#         batted_ball_profile = batted_ball_profile.dropna(axis= 1, how= 'all')

#         # Add the headers to the dataframe
#         batted_ball_profile.columns = headers_batted_ball_profile.columns
        
#         return statcast, plate_discipline, batted_ball_profile


# # Call the function to get today's matchups with advanced stats
# #hitting_statcast_df,  hitting_plate_discipline_df,  hitting_batted_ball_profile_df  = team_advanced_stats(analysis_type= 'hitting')
# pitching_statcast_df, pitching_plate_discipline_df, pitching_batted_ball_profile_df = team_advanced_stats(analysis_type= 'pitching')


In [6]:

# #! STATCAST TABLE
# #! Cleaning the data for the statcast table

# def combine_rows(df, row_number_1, row_number_2):
#     """Combine two rows in a DataFrame into one row.
#     The first row will contain the combined data, and the second row will be dropped.

#     Args:
#         df (_type_): _description_
#         row_number_1 (_type_): _description_
#         row_number_2 (_type_): _description_

#     Returns:
#         _type_: _description_
#     """
#     # Combine the two rows
#     df.loc[row_number_1, 0] = df.loc[row_number_1, 0] + ' ' + df.loc[row_number_2, 0]

#     # Drop the second row
#     df = df.drop(row_number_2).reset_index(drop=True)
    
#     return df


# # Join rows
# test_df = combine_rows(test_df, 19, 20)
# test_df = combine_rows(test_df, 23, 24)
# test_df = combine_rows(test_df, 24, 25)

# #! Create the headers for the first table (statcast)
# # Find the first occurrence of 'Player' and 'XWOBACON'
# # Find indices
# start_idx = test_df[test_df[0] == 'Player'].index[0]
# end_idx = test_df[test_df[0] == 'XWOBACON'].index[0]

# # Slice and transpose
# headers_statcast = test_df.iloc[start_idx:end_idx + 1].T

# # Reset column names
# headers_statcast.columns = headers_statcast.iloc[0]
# headers_statcast         = headers_statcast[1:].reset_index(drop=True)

# # Remove those rows from original dataframe
# test_df = test_df.drop(test_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

# # Remove the first 3 rows
# test_df = test_df.iloc[3:].reset_index(drop=True)  # Using iloc

# # Use regex to split the column while preserving negative numbers
# test_df[['Name', 'Numbers']] = test_df[0].str.extract(r'^(.*?)([-\d\s.,]*)$')

# # Split the 'Numbers' column into separate columns (25 columns)
# test_df = test_df.join(test_df['Numbers'].str.split(expand=True).rename(lambda x: f'col_{x+1}', axis=1))

# # Drop the original 'Numbers' column
# test_df = test_df.drop(columns=['Numbers'])

# # Find the first empty row
# first_empty_idx = test_df[test_df[0] == ''].index.min()

# # Extract rows from the start until the first empty row
# statcast = test_df.iloc[:first_empty_idx]

# # Drop the first column
# statcast = statcast.drop(columns=[0])

# # Swap last name and first name
# statcast['Name'] = statcast['Name'].str.split(', ').str[::-1].str.join(' ')

# # Add the headers to the dataframe
# statcast.columns = headers_statcast.columns


In [7]:

# #! PLATE DISCIPLINE TABLE
# #! Cleaning the data for the plate discipline table
# # Find the first empty row
# first_empty_idx = test_df[test_df[0] == ''].index.min()

# # Remove rows from the first row until the first empty row
# test_df = test_df.iloc[first_empty_idx + 1:].reset_index(drop=True)

# # Remove the first row
# test_df = test_df.iloc[1:].reset_index(drop= True)  # Using iloc

# # Join rows
# test_df = combine_rows(test_df, 4, 5)
# test_df = combine_rows(test_df, 5, 6)
# test_df = combine_rows(test_df, 7, 8)
# test_df = combine_rows(test_df, 9, 10)
# test_df = combine_rows(test_df, 13, 14)

# #! Create the headers for the second table (plate discipline)
# # Find the first occurrence of 'Player' and 'Meatball Swing %'
# # Find indices
# start_idx = test_df[test_df[0] == 'Player'].index[0]
# end_idx = test_df[test_df[0] == 'Meatball Swing %'].index[0]

# # Slice and transpose
# headers_plate_discipline = test_df.iloc[start_idx:end_idx + 1].T

# # Reset column names
# headers_plate_discipline.columns = headers_plate_discipline.iloc[0]
# headers_plate_discipline         = headers_plate_discipline[1:].reset_index(drop=True)

# # Remove those rows from original dataframe
# test_df = test_df.drop(test_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

# # Find the first empty row
# first_empty_idx = test_df[test_df[0] == ''].index.min()

# # Extract rows from the start until the first empty row
# plate_discipline = test_df.iloc[:first_empty_idx]

# # Drop the first column
# plate_discipline = plate_discipline.drop(columns=[0])

# # Swap last name and first name
# plate_discipline['Name'] = plate_discipline['Name'].str.split(', ').str[::-1].str.join(' ')

# # Drop columns where all values are NaN (empty)
# plate_discipline = plate_discipline.dropna(axis= 1, how= 'all')

# # Add the headers to the dataframe
# plate_discipline.columns = headers_plate_discipline.columns


In [8]:

# #! BATTED BALL PROFILE TABLE
# #! Cleaning the data for the batted ball profile table
# # Find the first empty row
# first_empty_idx = test_df[test_df[0] == ''].index.min()

# # Remove rows from the first row until the first empty row
# test_df = test_df.iloc[first_empty_idx + 1:].reset_index(drop=True)

# # Remove the first row
# test_df = test_df.iloc[1:].reset_index(drop= True)  # Using iloc

# # Join rows
# test_df = combine_rows(test_df, 15, 16)

# #! Create the headers for the third table (batted ball profile)
# # Find the first occurrence of 'Player' and 'Barrel %'
# # Find indices
# start_idx = test_df[test_df[0] == 'Player'].index[0]
# end_idx = test_df[test_df[0] == 'Barrel %'].index[0]

# # Slice and transpose
# headers_batted_ball_profile = test_df.iloc[start_idx:end_idx + 1].T

# # Reset column names
# headers_batted_ball_profile.columns = headers_batted_ball_profile.iloc[0]
# headers_batted_ball_profile         = headers_batted_ball_profile[1:].reset_index(drop=True)

# # Remove those rows from original dataframe
# test_df = test_df.drop(test_df.index[start_idx:end_idx + 1]).reset_index(drop=True)

# # Find the first empty row
# first_empty_idx = test_df[test_df[0] == ''].index.min()

# if pd.isna(first_empty_idx):
#     batted_ball_profile = test_df.copy()  # If no empty row, use the entire DataFrame
# else:
#     # Extract rows from the start until the first empty row
#     batted_ball_profile = test_df.iloc[:first_empty_idx]

# # Drop the first column
# batted_ball_profile = batted_ball_profile.drop(columns=[0])

# # Swap last name and first name
# batted_ball_profile['Name'] = batted_ball_profile['Name'].str.split(', ').str[::-1].str.join(' ')

# # Drop columns where all values are NaN (empty)
# batted_ball_profile = batted_ball_profile.dropna(axis= 1, how= 'all')

# # Add the headers to the dataframe
# batted_ball_profile.columns = headers_batted_ball_profile.columns

In [34]:
def import_advanced_stats(analysis_type, team_ids, category):
    """Scrape one Baseball Savant team table per team id.

    For each id the page
    ``https://baseballsavant.mlb.com/team/{team_id}?view={category}&nav={analysis_type}&season={year}``
    is opened in a headless browser (``year`` is the current calendar year).
    The function waits up to 20 seconds for ``<div id="div_{category}">``,
    reads its visible text and splits it on newlines (rows) and tabs (cells).

    Args:
        analysis_type (str): Value of the ``nav`` query parameter. The notebook
            passes "pitching" and "hitting".
        team_ids (iterable): Team ids interpolated into the URL.
        category (str): Either "statcast" or "gamelogs".

    Returns:
        dict: Maps every team id that was scraped successfully to a DataFrame
        holding the raw table text. A team whose page raised an error is
        reported with a printed message and is absent from the dictionary.

    Raises:
        ValueError: If ``category`` is neither "statcast" nor "gamelogs".
    """
    # category -> (id of the table container, noun used in the status message)
    supported = {
        "statcast": ("div_statcast", "data"),
        "gamelogs": ("div_gamelogs", "gamelogs"),
    }
    if category not in supported:
        # Fail before a browser is started: without this check no URL could be
        # built and every team would be "skipped" with a confusing message.
        raise ValueError(
            f"Unsupported category {category!r}; expected one of {sorted(supported)}"
        )
    datatable_id, status_noun = supported[category]
    datatable_xpath = f"//div[@id='{datatable_id}']"

    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    year = datetime.now().year
    driver = webdriver.Chrome(options=options)

    team_data = {}

    try:
        for team_id in team_ids:
            try:
                driver.get(
                    f"https://baseballsavant.mlb.com/team/{team_id}?view={category}&nav={analysis_type}&season={year}"
                )

                # `until` returns the element found by the expected condition
                table_element = WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, datatable_xpath))
                )

                rows = table_element.text.split("\n")
                team_data[team_id] = pd.DataFrame([row.split("\t") for row in rows])

                # Print the status of the data retrieval
                print(f"{analysis_type} {status_noun} successfully retrieved for team {team_id}")

            except Exception as e:
                # Best effort: one failing team must not abort the whole scrape
                print(f"Skipping team {team_id} due to error: {e}")
    finally:
        # Always release the browser, even when the loop is interrupted
        driver.quit()

    return team_data

# Team ids interpolated into the Baseball Savant team URL (30 entries).
# Each id appears exactly once so that no team page is scraped twice.
team_ids = [108, 117, 133, 141, 144, 158, 138, 112, 109, 119, 137, 114,
            136, 146, 121, 120, 110, 135, 143, 134, 140, 139,
            113, 111, 115, 118, 116, 142, 145, 147]

# Import the advanced stats for each team
data_pitching = import_advanced_stats("pitching", team_ids, "statcast")
data_hitting = import_advanced_stats("hitting", team_ids, "statcast")

# Import the gamelogs for each team
gamelog_data_pitching = import_advanced_stats("pitching", team_ids, "gamelogs")
gamelog_data_hitting = import_advanced_stats("hitting", team_ids, "gamelogs")

pitching data successfully retrieved for team 108
pitching data successfully retrieved for team 117
pitching data successfully retrieved for team 133
pitching data successfully retrieved for team 141
pitching data successfully retrieved for team 144
pitching data successfully retrieved for team 158
pitching data successfully retrieved for team 138
pitching data successfully retrieved for team 112
Skipping team 109 due to error: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7A9EFCF25+75717]
	GetHandleVerifier [0x00007FF7A9EFCF80+75808]
	(No symbol) [0x00007FF7A9CC8F9A]
	(No symbol) [0x00007FF7A9D1F4C6]
	(No symbol) [0x00007FF7A9D1F77C]
	(No symbol) [0x00007FF7A9D72577]
	(No symbol) [0x00007FF7A9D473BF]
	(No symbol) [0x00007FF7A9D6F39C]
	(No symbol) [0x00007FF7A9D47153]
	(No symbol) [0x00007FF7A9D10421]
	(No symbol) [0x00007FF7A9D111B3]
	GetHandleVerifier [0x00007FF7AA1FD6FD+3223453]
	GetHandleVerifier [0x00007FF7AA1F7CA2+3200322]
	GetHandleVerifier [0x00007FF7AA215AD3+3322739]
	Get

In [35]:
def team_advanced_stats(dataframe_name):
    """Split the raw text of one scraped team page into its three tables.

    The input is the frame produced by ``import_advanced_stats`` for the
    "statcast" view: one text line per row, stored in column ``0``. The text
    holds three consecutive sections (statcast, plate discipline, batted ball
    profile). Each section has a block of header labels, one label per row,
    followed by one row per player; sections are separated by an empty row.

    The input frame is not modified.

    Args:
        dataframe_name (pd.DataFrame): Raw page text with the lines in column ``0``.

    Returns:
        tuple: ``(statcast, plate_discipline, batted_ball_profile)`` DataFrames.
        Each uses the section's header labels as column names, has the player
        name rewritten from "Last, First" to "First Last", and has duplicate
        rows removed. All values are strings.

    Raises:
        IndexError: If an expected header label ('Player', 'XWOBACON',
            'Meatball Swing %', 'Barrel %') is not found in column ``0``.
    """

    def combine_rows(df, row_number_1, row_number_2):
        """Return a copy of ``df`` in which two rows are merged into one.

        The text in column ``0`` of both rows is joined with a space and kept
        in ``row_number_1``; ``row_number_2`` is dropped and the index is
        renumbered. Working on a copy keeps the caller's frame intact, so the
        function can be re-run on the same scraped data.

        Args:
            df (pd.DataFrame): Frame with the text in column ``0``.
            row_number_1 (int): Index label of the row that receives the joined text.
            row_number_2 (int): Index label of the row that is removed.

        Returns:
            pd.DataFrame: New frame with one row less.
        """
        df = df.copy()
        df.loc[row_number_1, 0] = df.loc[row_number_1, 0] + ' ' + df.loc[row_number_2, 0]
        return df.drop(row_number_2).reset_index(drop=True)

    def pop_header_block(df, first_label, last_label):
        """Return (header labels, frame without the header rows).

        The header block runs from the first row whose text equals
        ``first_label`` to the first row whose text equals ``last_label``.
        """
        start_idx = df[df[0] == first_label].index[0]
        end_idx = df[df[0] == last_label].index[0]
        headers = pd.Index(df[0].iloc[start_idx:end_idx + 1])
        remaining = df.drop(df.index[start_idx:end_idx + 1]).reset_index(drop=True)
        return headers, remaining

    def first_blank_row(df):
        """Return the index of the first empty text row, or NaN when there is none."""
        return df[df[0] == ''].index.min()

    def finish_table(rows, headers, drop_empty_columns):
        """Turn the player rows of one section into the final table."""
        table = rows.drop(columns=[0])

        # Swap last name and first name
        table['Name'] = table['Name'].str.split(', ').str[::-1].str.join(' ')

        if drop_empty_columns:
            # Narrower sections leave the surplus number columns entirely empty
            table = table.dropna(axis=1, how='all')

        table.columns = headers
        return table.drop_duplicates()

    #! STATCAST TABLE
    # Join rows at fixed positions; presumably header labels that the page
    # wraps over two text lines - TODO confirm against the live page layout
    dataframe_name = combine_rows(dataframe_name, 19, 20)
    dataframe_name = combine_rows(dataframe_name, 23, 24)
    dataframe_name = combine_rows(dataframe_name, 24, 25)

    # Header labels run from 'Player' to 'XWOBACON'
    headers_statcast, dataframe_name = pop_header_block(dataframe_name, 'Player', 'XWOBACON')

    # Remove the first 3 rows
    dataframe_name = dataframe_name.iloc[3:].reset_index(drop=True)

    # Split each line into the leading text and the trailing run of numbers
    # (the character class keeps '-' so negative numbers survive)
    dataframe_name[['Name', 'Numbers']] = dataframe_name[0].str.extract(r'^(.*?)([-\d\s.,]*)$')

    # A name ending in '.' leaves that dot at the start of the numbers; remove it
    dataframe_name['Numbers'] = dataframe_name['Numbers'].apply(lambda x: x[1:] if x.startswith('. ') else x)

    # One column per number: col_1, col_2, ...
    dataframe_name = dataframe_name.join(
        dataframe_name['Numbers'].str.split(expand=True).rename(lambda x: f'col_{x+1}', axis=1)
    )
    dataframe_name = dataframe_name.drop(columns=['Numbers'])

    # The statcast players are the rows before the first empty row
    first_empty_idx = first_blank_row(dataframe_name)
    statcast = finish_table(dataframe_name.iloc[:first_empty_idx], headers_statcast, drop_empty_columns=False)

    #! PLATE DISCIPLINE TABLE
    # Skip the statcast rows, the empty separator and the one row that follows it
    dataframe_name = dataframe_name.iloc[first_empty_idx + 1:].reset_index(drop=True)
    dataframe_name = dataframe_name.iloc[1:].reset_index(drop=True)

    # Join rows
    dataframe_name = combine_rows(dataframe_name, 4, 5)
    dataframe_name = combine_rows(dataframe_name, 5, 6)
    dataframe_name = combine_rows(dataframe_name, 7, 8)
    dataframe_name = combine_rows(dataframe_name, 9, 10)
    dataframe_name = combine_rows(dataframe_name, 13, 14)

    # Header labels run from 'Player' to 'Meatball Swing %'
    headers_plate_discipline, dataframe_name = pop_header_block(dataframe_name, 'Player', 'Meatball Swing %')

    first_empty_idx = first_blank_row(dataframe_name)
    plate_discipline = finish_table(dataframe_name.iloc[:first_empty_idx], headers_plate_discipline, drop_empty_columns=True)

    #! BATTED BALL PROFILE TABLE
    # Skip the plate discipline rows, the empty separator and the one row that follows it
    dataframe_name = dataframe_name.iloc[first_empty_idx + 1:].reset_index(drop=True)
    dataframe_name = dataframe_name.iloc[1:].reset_index(drop=True)

    # Join rows
    dataframe_name = combine_rows(dataframe_name, 15, 16)

    # Header labels run from 'Player' to 'Barrel %'
    headers_batted_ball_profile, dataframe_name = pop_header_block(dataframe_name, 'Player', 'Barrel %')

    first_empty_idx = first_blank_row(dataframe_name)
    if pd.isna(first_empty_idx):
        # No empty row after the last section: every remaining row is a player
        batted_ball_rows = dataframe_name.copy()
    else:
        batted_ball_rows = dataframe_name.iloc[:first_empty_idx]
    batted_ball_profile = finish_table(batted_ball_rows, headers_batted_ball_profile, drop_empty_columns=True)

    return statcast, plate_discipline, batted_ball_profile


In [36]:
def process_gamelogs(dataframe_name):
    """Turn the raw text of one team's game-log table into a labelled DataFrame.

    The input holds one text line per row in column ``0``. The rows from
    'Game Date' to 'Hard Hit %' are the header labels (one per row) and become
    the column names of the result. The first two rows left after removing the
    header block are discarded; every other row is split into game date,
    opponent and one column per trailing number.

    Args:
        dataframe_name (pd.DataFrame): Raw table text with the lines in column ``0``.

    Returns:
        pd.DataFrame: One row per game; all values are strings.
    """
    text_lines = dataframe_name[0]

    #! Header block: first 'Game Date' row up to the first 'Hard Hit %' row
    header_first = dataframe_name[text_lines == 'Game Date'].index[0]
    header_last = dataframe_name[text_lines == 'Hard Hit %'].index[0]
    column_labels = pd.Index(text_lines.iloc[header_first:header_last + 1])

    # Everything except the header rows, renumbered from zero
    body = dataframe_name.drop(dataframe_name.index[header_first:header_last + 1]).reset_index(drop=True)

    # Leading text (group 0) and trailing run of numbers (group 1);
    # '-' is part of the numeric class so negative values are preserved
    pieces = body[0].str.extract(r'^(.*?)([-\d\s.,]*)$')
    body['Name'] = pieces[0]

    # One column per number (col_1, col_2, ...); the raw text column is no longer needed
    number_columns = pieces[1].str.split(expand=True).rename(lambda x: f'col_{x+1}', axis=1)
    body = body.join(number_columns).drop(columns=[0])

    # Skip the first 2 rows
    gamelogs = body.iloc[2:].reset_index(drop=True)

    # The leading text is "<date> <opponent>": split on the first space only
    gamelogs[['Game Date', 'Opponent']] = gamelogs['Name'].str.split(" ", n=1, expand=True)
    gamelogs = gamelogs.drop(columns=['Name'])

    # The two new columns were appended at the end; move them to the front
    labels = gamelogs.columns.tolist()
    gamelogs = gamelogs[labels[-2:] + labels[:-2]]

    # Apply the header labels
    gamelogs.columns = column_labels

    return gamelogs

### Apply a function to the dictionary


In [37]:

#! STATCAST TABLES
# One result dictionary per table and per side of the ball, keyed by team id
statcast_data_pitching, plate_discipline_data_pitching, batted_ball_profile_data_pitching = {}, {}, {}
statcast_data_hitting, plate_discipline_data_hitting, batted_ball_profile_data_hitting = {}, {}, {}

# Pitching pages first, then hitting pages; each raw page yields three tables
for raw_pages, statcast_store, plate_discipline_store, batted_ball_store in (
    (data_pitching, statcast_data_pitching, plate_discipline_data_pitching, batted_ball_profile_data_pitching),
    (data_hitting, statcast_data_hitting, plate_discipline_data_hitting, batted_ball_profile_data_hitting),
):
    for team_id, df in raw_pages.items():
        # team_advanced_stats returns the three cleaned tables of one team
        statcast, plate_discipline, batted_ball_profile = team_advanced_stats(df)

        statcast_store[team_id]         = statcast
        plate_discipline_store[team_id] = plate_discipline
        batted_ball_store[team_id]      = batted_ball_profile
    


In [38]:

#! GAMELOGS
# Cleaned game logs per team id, one dictionary per side of the ball
gamelog_pitching, gamelog_hitting = {}, {}

# Pitching logs first, then hitting logs
for raw_gamelogs, gamelog_store in (
    (gamelog_data_pitching, gamelog_pitching),
    (gamelog_data_hitting, gamelog_hitting),
):
    for team_id, df in raw_gamelogs.items():
        # process_gamelogs returns the cleaned game-log table of one team
        gamelogs = process_gamelogs(df)
        gamelog_store[team_id] = gamelogs

In [39]:
def create_parquet_files(data_dict, suffix):
    """Prepare export-ready copies of the DataFrames in ``data_dict``.

    Every frame is copied, gets an "ID" column holding its dictionary key, and
    has all column names converted to strings. The frames in ``data_dict`` are
    left untouched.

    Writing the copies to Parquet files is currently disabled (see the
    commented block below), so the prepared frames are returned instead of
    being discarded.

    Args:
        data_dict (dict): Maps a key (the notebook uses team ids) to a DataFrame.
        suffix (str): Text appended to each key to form the name of its copy.

    Returns:
        dict: Maps ``f"{key}{suffix}"`` to the prepared copy.
    """
    df_store = {}

    for key, df in data_dict.items():
        df_copy = df.copy()  # Work on a copy to avoid modifying the original DataFrame
        df_copy["ID"] = key  # Add the original dictionary key as a column

        # Convert all column names to strings
        df_copy.columns = df_copy.columns.astype(str)

        df_store[f"{key}{suffix}"] = df_copy

    # for name, df in df_store.items():
    #     # Save the DataFrame as a Parquet file
    #     df.to_parquet(f"D:\\mlb_analyzer\\output\\teams\\{suffix}\\{name}.parquet")

    return df_store

def merge_dataframes(data_dict):
    """Stack all DataFrames of a dictionary into one, tagging rows with their key.

    Each frame gets an "ID" column holding its dictionary key before the frames
    are concatenated with a fresh 0..n-1 index. The tagging is done on copies,
    so the frames stored in ``data_dict`` are not modified.

    Args:
        data_dict (dict): Maps a key (the notebook uses team ids) to a DataFrame.

    Returns:
        pd.DataFrame: The concatenated frames, or an empty DataFrame when
        ``data_dict`` is empty.
    """
    if not data_dict:
        # pd.concat raises on an empty sequence; an empty result is more useful
        return pd.DataFrame()

    # assign() returns a new frame, leaving the caller's data untouched
    tagged_frames = [df.assign(ID=name) for name, df in data_dict.items()]

    return pd.concat(tagged_frames, ignore_index=True)


# Prepare the per-team copies (with an ID column) for every table
for team_tables, file_suffix in (
    (statcast_data_hitting, "_hitting_statcast"),
    (plate_discipline_data_hitting, "_hitting_plate_discipline"),
    (batted_ball_profile_data_hitting, "_hitting_batted_ball_profile"),
    (statcast_data_pitching, "_pitching_statcast"),
    (plate_discipline_data_pitching, "_pitching_plate_discipline"),
    (batted_ball_profile_data_pitching, "_pitching_batted_ball_profile"),
    (gamelog_hitting, "_gamelogs_hitting"),
    (gamelog_pitching, "_gamelogs_pitching"),
):
    create_parquet_files(team_tables, file_suffix)

# Stack the per-team tables into one DataFrame per table and side of the ball
(
    statcast_hitting_df,
    plate_discipline_hitting_df,
    batted_ball_profile_hitting_df,
    statcast_pitching_df,
    plate_discipline_pitching_df,
    batted_ball_profile_pitching_df,
    gamelog_hitting_df,
    gamelog_pitching_df,
) = (
    merge_dataframes(per_team)
    for per_team in (
        statcast_data_hitting,
        plate_discipline_data_hitting,
        batted_ball_profile_data_hitting,
        statcast_data_pitching,
        plate_discipline_data_pitching,
        batted_ball_profile_data_pitching,
        gamelog_hitting,
        gamelog_pitching,
    )
)


In [40]:

# Tag every merged frame with the side of the ball it describes
for hitting_frame, pitching_frame in (
    (statcast_hitting_df, statcast_pitching_df),
    (plate_discipline_hitting_df, plate_discipline_pitching_df),
    (batted_ball_profile_hitting_df, batted_ball_profile_pitching_df),
    (gamelog_hitting_df, gamelog_pitching_df),
):
    hitting_frame['type']  = 'hitting'
    pitching_frame['type'] = 'pitching'

# Stack hitting on top of pitching and label each result with its table kind
statcast_df = pd.concat(
    [statcast_hitting_df, statcast_pitching_df], ignore_index=True
).assign(analysis_type='statcast')

plate_discipline_df = pd.concat(
    [plate_discipline_hitting_df, plate_discipline_pitching_df], ignore_index=True
).assign(analysis_type='plate_discipline')

batted_ball_profile_df = pd.concat(
    [batted_ball_profile_hitting_df, batted_ball_profile_pitching_df], ignore_index=True
).assign(analysis_type='batted_ball_profile')

gamelog_df = pd.concat(
    [gamelog_hitting_df, gamelog_pitching_df], ignore_index=True
).assign(analysis_type='gamelog')

# Write one Parquet file per combined table; the folder must already exist
output_path = "D:\\mlb_analyzer\\output\\teams\\"

for file_stem, combined_df in (
    ('statcast', statcast_df),
    ('plate_discipline', plate_discipline_df),
    ('batted_ball_profile', batted_ball_profile_df),
    ('gamelog', gamelog_df),
):
    combined_df.to_parquet(f"{output_path}{file_stem}.parquet")



### Add Splits by team https://www.baseball-reference.com/leagues/split.cgi?t=b&lg=MLB&year=2025

In [None]:
def _split_params(split_type):
    """Return the URL-encoded split selector for ``split_type``; ValueError if unknown."""
    opponent_codes = (
        'ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC', 'CHW', 'CIN', 'CLE', 'COL',
        'DET', 'HOU', 'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM', 'NYY', 'OAK',
        'PHI', 'PIT', 'SDP', 'SEA', 'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN',
    )
    fixed_selectors = {
        'first_batter_game': "leado%7C1st%20Batter%20G",
        'vs_power_pitcher': "power%7Cvs.%20Power",
        'vs_weak_pitcher': "power%7Cvs.%20Finesse",
        'vs_less_than_500_WP': "oppon%7CWP%20%3C%20.500",
        'vs_greater_or_equal_than_500_WP': "oppon%7CWP%20%3E%3D%20.500",
    }

    if split_type in ('LHP', 'RHP'):  # for LHP and RHP pitchers
        return f"plato%7Cvs%20{split_type}"
    if split_type in ('7', '14', '28'):  # for the last 7, 14 and 28 days
        return f"total%7CLast%20{split_type}%20days"
    if split_type in ('RH', 'LH'):  # for RH and LH Starters
        return f"plato%7Cvs%20{split_type}%20Starter"
    if split_type in ('Home', 'Away'):  # for home and away games
        return f"hmvis%7C{split_type}"
    if split_type in opponent_codes:  # for each opposing team
        return f"oppon%7C{split_type}"
    if split_type in fixed_selectors:
        return fixed_selectors[split_type]

    raise ValueError(f"Unsupported split_type: {split_type!r}")


def _clean_split_table(df, clean_mode):
    """Turn the raw text rows of the split table into a table with named columns."""
    # Remove 'Roe' exactly (case-sensitive); mode != 1 also removes 'GS'
    text = df[0].str.replace('Roe', '', regex=False)
    if clean_mode != 1:
        text = text.str.replace('GS', '', regex=False)

    # Remove last row
    text = text.iloc[:-1]

    if clean_mode != 1:
        # Drop every row containing 'Rk' except the very first one
        text = text[~((text.index > 0) & (text.str.contains('Rk', na=False)))]

    # Split each line on spaces (at most 30 splits, counted from the left)
    table = text.str.split(" ", n=30, expand=True)

    # Set first row as header
    table.columns = table.iloc[0]
    table = table[1:].reset_index(drop=True)

    if clean_mode == 1:
        # Remove the last column
        table = table.iloc[:, :-1]

        # Rename the last 3 columns by building a new Index instead of writing
        # into the existing one (an Index must not be modified in place)
        labels = list(table.columns)
        labels[-3:] = ["BAbip", "tOPS+", "sOPS+"]
        table.columns = pd.Index(labels, name=table.columns.name)

        # Remove the first column
        return table.iloc[:, 1:]

    # Remove the first column and the last 2 columns
    table = table.iloc[:, 1:].iloc[:, :-2]

    # Rename all columns
    table.columns = ['Team', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB',
                     'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'TB', 'GDP', 'HBP', 'SH',
                     'SF', 'IBB', 'ROE', 'BAbip', 'tOPS+', 'sOPS+']
    return table


def teams_split(split_type, clean_mode):
    """Scrape one league-wide batting split table from baseball-reference.com.

    The page ``split_stats_lg.cgi`` is opened for the current calendar year in
    a headless browser, the table with id ``split1`` is awaited for up to 60
    seconds, and its text is cleaned into a DataFrame.

    Args:
        split_type (str): Which split to fetch. Accepted values:
            'LHP'/'RHP', 'LH'/'RH' (starters), '7'/'14'/'28' (last N days),
            'Home'/'Away', 'first_batter_game', 'vs_power_pitcher',
            'vs_weak_pitcher', 'vs_less_than_500_WP',
            'vs_greater_or_equal_than_500_WP', or one of the opponent codes
            (e.g. 'ANA', 'NYY', 'WSN').
        clean_mode (int): ``1`` keeps the page's own header labels and only
            renames the last three columns; any other value additionally
            strips 'GS', removes repeated header rows and assigns a fixed list
            of 28 column names. The notebook passes ``0`` for the splits it
            annotates with "GS empty".

    Returns:
        pd.DataFrame: The cleaned split table; all values are strings.

    Raises:
        ValueError: If ``split_type`` is not one of the accepted values.
        Exception: Whatever selenium raised when the table did not load; the
            error is printed first and the browser is always closed.
    """
    # Validate before starting a browser: an unknown split used to leave the
    # browser on a blank page until the 60 second wait expired
    split_params = _split_params(split_type)

    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    # Define year
    year = datetime.now().year

    # Name of the table
    datatable_id = 'split1'
    datatable_xpath = f"//table[@id='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options= options)
    try:
        driver.get(
            f"https://www.baseball-reference.com/tools/split_stats_lg.cgi?full=1&params={split_params}%7CML%7C{year}%7Cbat%7CAB%7C"
        )

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
            print(f"{datatable_id} ({split_type}) table loaded successfully.")
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Nothing can be read from this page; stop here instead of using a closed driver
            raise

        # Fixed pause kept from the original flow; presumably gives the page
        # time to finish rendering - TODO confirm it is still needed
        time.sleep(10)

        # Locate the table
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Close the WebDriver on success and on every failure path
        driver.quit()

    # Process the table content: one row per line, one cell per tab
    table_data = [row.split("\t") for row in text_content.split("\n")]

    return _clean_split_table(pd.DataFrame(table_data), clean_mode)

# League-wide splits, fetched in a fixed order.
# clean_mode=0 is used for the splits whose GS column is empty.
(
    team_vs_lhp,
    team_vs_rhp,
    team_vs_lh_starters,
    team_vs_rh_starters,
    team_last_seven_days,
    team_last_fourteen_days,
    team_last_28_days,
    team_home_games,
    team_away_games,
    team_first_batter_game,
    team_vs_power_pitcher,
    team_vs_weak_pitcher,
    team_vs_power_team,
    team_vs_weak_team,
) = (
    teams_split(split_type= split_name, clean_mode= mode)
    for split_name, mode in (
        ('LHP', 0),
        ('RHP', 0),
        ('LH', 1),
        ('RH', 1),
        ('7', 1),
        ('14', 1),
        ('28', 1),
        ('Home', 1),
        ('Away', 1),
        ('first_batter_game', 0),
        ('vs_power_pitcher', 0),
        ('vs_weak_pitcher', 0),
        ('vs_greater_or_equal_than_500_WP', 1),
        ('vs_less_than_500_WP', 1),
    )
)

# Direct matchups: one split table per opponent code, always with clean_mode=1
(
    team_laa, team_ari, team_atl, team_bal, team_bos, team_chc,
    team_chw, team_cin, team_cle, team_col, team_det, team_hou,
    team_kcr, team_lad, team_mia, team_mil, team_min, team_nym,
    team_nyy, team_oak, team_phi, team_pit, team_sdp, team_sea,
    team_sfg, team_stl, team_tbr, team_tex, team_tor, team_wsn,
) = (
    teams_split(split_type= opponent_code, clean_mode= 1)
    for opponent_code in (
        'ANA', 'ARI', 'ATL', 'BAL', 'BOS', 'CHC',
        'CHW', 'CIN', 'CLE', 'COL', 'DET', 'HOU',
        'KCR', 'LAD', 'FLA', 'MIL', 'MIN', 'NYM',
        'NYY', 'OAK', 'PHI', 'PIT', 'SDP', 'SEA',
        'SFG', 'STL', 'TBD', 'TEX', 'TOR', 'WSN',
    )
)

# Matchup tables keyed by the team abbreviation used as their ID
dic_team = dict(
    LAA=team_laa, AZ=team_ari, ATL=team_atl, BAL=team_bal, BOS=team_bos,
    CHC=team_chc, CHW=team_chw, CIN=team_cin, CLE=team_cle, COL=team_col,
    DET=team_det, HOU=team_hou, KC=team_kcr, LAD=team_lad, MIA=team_mia,
    MIL=team_mil, MIN=team_min, NYM=team_nym, NYY=team_nyy, ATH=team_oak,
    PHI=team_phi, PIT=team_pit, SD=team_sdp, SEA=team_sea, SF=team_sfg,
    STL=team_stl, TB=team_tbr, TEX=team_tex, TOR=team_tor, WSH=team_wsn,
)

# Store the dictionary key in an ID column of every matchup table
for key, df in dic_team.items():
    df['ID'] = key

# All matchup tables stacked into one frame with a fresh index
direct_matches = pd.concat(list(dic_team.values()), ignore_index=True)

# League-wide split tables keyed by the file name they are exported under
dic_splits = dict(
    team_vs_lhp=team_vs_lhp,
    team_vs_rhp=team_vs_rhp,
    team_vs_lh_starters=team_vs_lh_starters,
    team_vs_rh_starters=team_vs_rh_starters,
    team_last_seven_days=team_last_seven_days,
    team_last_fourteen_days=team_last_fourteen_days,
    team_last_28_days=team_last_28_days,
    team_home_games=team_home_games,
    team_away_games=team_away_games,
    team_first_batter_game=team_first_batter_game,
    team_vs_power_pitcher=team_vs_power_pitcher,
    team_vs_weak_pitcher=team_vs_weak_pitcher,
    team_vs_power_team=team_vs_power_team,
    team_vs_weak_team=team_vs_weak_team,
)

#! Deriving the folder from __file__ works only for .py files:
# output_folder = os.path.join(os.path.dirname(__file__), 'output')
# Alternative based on the current working directory:
# output_folder = os.path.join(os.getcwd(), 'output')

# Export folders (must already exist)
output_folder_teams           = 'D:\\mlb_analyzer\\output\\teams\\direct_matches\\'
output_folder_splits          = 'D:\\mlb_analyzer\\output\\teams\\splits\\'
output_folder_individual_team = 'D:\\mlb_analyzer\\output\\teams\\team\\'

# Stacked direct matchups
direct_matches.to_csv(os.path.join(output_folder_teams, 'direct_matches.csv'), index=False)

# One CSV per league-wide split
for name, dataframe in dic_splits.items():
    dataframe.to_csv(os.path.join(output_folder_splits, f'{name}.csv'), index=False)

# One CSV per opponent
for name, dataframe in dic_team.items():
    dataframe.to_csv(os.path.join(output_folder_individual_team, f'{name}.csv'), index=False)

### Import Standings


In [None]:
def _parse_standings_text(text_content):
    """Convert the raw text of the standings table into a tidy DataFrame.

    Parameters
    ----------
    text_content : str
        Text of the table element, one table row per line. Each line is
        expected to end with 27 space-separated stat values.

    Returns
    -------
    pandas.DataFrame
        One row per team: 'Team' first, followed by the 27 stat columns
        listed in ``stat_columns``. All values are kept as scraped text.

    Raises
    ------
    ValueError
        If the lines do not split into the expected number of columns.
    """
    # Names for the 27 right-most values of every data row.
    # NOTE(review): 'StrkWL'/'StrkNb' presumably hold the two tokens of the
    # site's streak column (e.g. "W 3") - confirm against the live page.
    stat_columns = ['W', 'L', 'W-L%', 'StrkWL', 'StrkNb', 'R', 'RA', 'Rdiff', 'SOS', 'SRS', 'pythWL', 'Luck',
                    'vEast', 'vCent', 'vWest', 'Inter', 'Home', 'Road', 'ExInn', '1Run', 'vRHP', 'vLHP', '≥.500',
                    '<.500', 'last10', 'last20', 'last30']

    # One table row per line; only the first tab-separated field is used below
    rows = text_content.split("\n")
    df = pd.DataFrame([row.split("\t") for row in rows])

    # Remove last row (presumably a league summary line - TODO confirm)
    df = df.iloc[:-1]

    # Split from the right so a multi-word team name stays in one piece in
    # column 0 while each of the 27 stats gets its own column (1..27)
    df = df[0].str.rsplit(" ", n=len(stat_columns), expand=True)

    expected_width = len(stat_columns) + 1
    if df.shape[1] != expected_width:
        raise ValueError(
            f"Unexpected standings layout: expected {expected_width} columns after splitting, "
            f"got {df.shape[1]}."
        )

    # Use regex to separate the leading rank number from the team name
    df[['Number', 'Team_Name']] = df[0].str.extract(r'(\d+)\s(.+)')

    # Drop the combined "rank + team" column now that it has been split
    df = df.iloc[:, 1:]

    # Remove header rows repeated inside the table body: any row after the
    # first whose column 1 contains 'Tm'
    repeated_header = (df.index > 0) & df[1].str.contains('Tm', na=False)
    df = df[~repeated_header]

    # The first remaining row is the header line; discard it and renumber
    df = df.iloc[1:].reset_index(drop=True)

    # Columns are now: the 27 stats, then the extracted rank and team name
    df.columns = stat_columns + ['Rk', 'Team']

    # Drop the rank and move the team name to the first position
    return df[['Team'] + stat_columns]


def import_standings():
    """Scrape the MLB expanded standings table from baseball-reference.com.

    Launches a headless Chrome WebDriver pointed at the Brave browser binary,
    waits for the 'expanded_standings_overall' table, reads its text and
    converts it into a DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per team with 'Team' first, followed by 27 stat columns
        ('W', 'L', 'W-L%', ..., 'last30'). All values are strings.

    Raises
    ------
    Exception
        Whatever the explicit wait raised when the table did not appear
        within 60 seconds (re-raised after the error message is printed).
    ValueError
        If the scraped text does not match the expected column layout.
    """
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    datatable_id = 'expanded_standings_overall'
    datatable_xpath = f"//table[@id='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://www.baseball-reference.com/leagues/MLB-standings.shtml")

        # Explicitly wait for the table element to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Stop here: continuing would only fail later on a closed session
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Extra fixed pause after the element is present, to give the rest
        # of the page time to finish loading
        time.sleep(10)

        # Locate the table and grab its rendered text
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        # Always release the browser, whether scraping succeeded or failed
        driver.quit()

    #? Clean the extraction
    return _parse_standings_text(text_content)

# Call the function
standings_df = import_standings()

# Export the table
# NOTE: output_folder_teams is rebound here to the parent 'teams' folder
output_folder_teams  = ('D:\\mlb_analyzer\\output\\teams\\')
# DataFrame.to_csv does not create missing directories, so ensure it exists
os.makedirs(output_folder_teams, exist_ok=True)
standings_df.to_csv(os.path.join(output_folder_teams, 'standings.csv'), index=False)


### Import Odds