In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from rapidfuzz import process
from datetime import datetime, timedelta
import re
import os

### Rotowire.com scores

In [8]:
def teams_matchups(game_date):
    """Scrape the final score lines for one day from the Rotowire MLB scoreboard.

    Parameters
    ----------
    game_date : str
        Value inserted verbatim into the ``date`` query parameter of the
        scoreboard URL (the notebook passes ``YYYY-MM-DD`` strings).

    Returns
    -------
    pandas.DataFrame
        One row per completed game with the columns ``Away``, ``Home``,
        ``R_Away``, ``H_Away``, ``E_Away``, ``R_Home``, ``H_Home``, ``E_Home``
        (scraped text), ``date`` (``game_date``), ``winner``, ``loser`` and the
        integer ``diff_*_away_vs_home_team`` columns. The frame is empty, with
        the same columns, when no well-formed completed game is found.

    Raises
    ------
    Exception
        Whatever the explicit wait raised when the scoreboard container did not
        appear within 60 seconds; it is logged and re-raised.
    ValueError
        If a run/hit/error cell cannot be parsed as an integer.
    """
    # Every completed game is expected to span exactly this many text rows,
    # from its "Final" row through its "View Box Score" row.
    rows_per_game = 13
    score_columns = ['Away', 'Home', 'R_Away', 'H_Away', 'E_Away', 'R_Home', 'H_Home', 'E_Home']
    output_columns = score_columns + [
        'date', 'winner', 'loser',
        'diff_runs_away_vs_home_team',
        'diff_hits_away_vs_home_team',
        'diff_errors_away_vs_home_team',
    ]

    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    datatable_id = 'grid-noGutter mb-15'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver; the finally clause guarantees the browser process
    # is released on both the success and the failure path.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"https://www.rotowire.com/baseball/scoreboard.php?date={game_date}")

        # Explicitly wait for the scoreboard container to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Nothing can be scraped without the container, so stop here
            # instead of continuing with an unusable page.
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Fixed pause after the explicit wait to let the rest of the page load
        time.sleep(10)

        # Grab the rendered text while the session is still alive
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        driver.quit()

    # One DataFrame row per rendered text line (tab-separated cells, if any)
    rows = text_content.split("\n")
    df = pd.DataFrame([row.split("\t") for row in rows])

    # A game block starts at a "Final" row and ends at the next "View Box Score" row
    start_indices = df[df[0].str.contains("Final", case=False)].index
    end_indices = df[df[0].str.contains("View Box Score", case=False)].index

    game_records = []
    for start in start_indices:
        later_ends = end_indices[end_indices > start]
        if len(later_ends) == 0:
            continue  # no closing marker after this start row
        end = later_ends.min()
        block = df.iloc[start:end + 1].values.flatten()
        if len(block) != rows_per_game:
            # Validate each game on its own so one odd block cannot shift the
            # columns of every game that follows it.
            print(f"Warning: skipping malformed game block at rows {start}-{end} "
                  f"({len(block)} cells, expected {rows_per_game}).")
            continue
        # Drop the first 4 cells and the trailing "View Box Score" cell
        game_records.append(list(block[4:-1]))

    if not game_records:
        print(f"No completed games found for {game_date}.")
        return pd.DataFrame(columns=output_columns)

    reshaped_df = pd.DataFrame(game_records, columns=score_columns)

    # Add the date
    reshaped_df['date'] = game_date

    # Parse the scraped text once; the original text columns are left untouched
    runs_away = reshaped_df['R_Away'].astype('int64')
    runs_home = reshaped_df['R_Home'].astype('int64')
    hits_away = reshaped_df['H_Away'].astype('int64')
    hits_home = reshaped_df['H_Home'].astype('int64')
    errors_away = reshaped_df['E_Away'].astype('int64')
    errors_home = reshaped_df['E_Home'].astype('int64')

    # Winner / loser: the home team is the fallback when the comparison is not
    # strictly in the away team's favour (winner) or against it (loser).
    reshaped_df['winner'] = reshaped_df['Away'].where(runs_away > runs_home, reshaped_df['Home'])
    reshaped_df['loser'] = reshaped_df['Away'].where(runs_away < runs_home, reshaped_df['Home'])

    # Away-minus-home differences in runs, hits and errors
    reshaped_df['diff_runs_away_vs_home_team'] = runs_away - runs_home
    reshaped_df['diff_hits_away_vs_home_team'] = hits_away - hits_home
    reshaped_df['diff_errors_away_vs_home_team'] = errors_away - errors_home

    return reshaped_df

# Define dates: the seven days before today, most recent first.
# The clock is read once so every date is derived from the same instant.
reference_time = datetime.now()
dates = [
    (reference_time - timedelta(days=days_back)).strftime('%Y-%m-%d')
    for days_back in range(1, 8)
]

# Initialize a dictionary to store dataframes
dataframes = {}

# Loop through each date and store results in a unique dataframe
for game_date in dates:
    print(f"Processing data for date: {game_date}")

    # Run the teams_matchups function and save the resulting dataframe
    result_df = teams_matchups(game_date)

    # Store the dataframe in the dictionary with the date as the key
    dataframes[game_date] = result_df

# Create the final dataframe
games = pd.concat(dataframes.values(), ignore_index=True)

# Convert to datetime format
games["date"] = pd.to_datetime(games["date"])

# Create a game_id column using the index
games["game_id"] = games.index + 1
games["game_id"] = games["game_id"].astype(str)

# Create a key column for the game
games["key"] = games["date"].dt.strftime("%Y%m%d") + "_" + games["game_id"]

# Parse the scraped run/hit/error text once; the derived columns below are
# computed column-wise from these integer series.
runs_away = games["R_Away"].astype("int64")
runs_home = games["R_Home"].astype("int64")
hits_away = games["H_Away"].astype("int64")
hits_home = games["H_Home"].astype("int64")
errors_away = games["E_Away"].astype("int64")
errors_home = games["E_Home"].astype("int64")
run_margin = (runs_away - runs_home).abs()
combined_runs = runs_away + runs_home

# Know if the winner was the away or home team
games["visitor_won"] = (runs_away > runs_home).astype("int64")
games["home_won"] = (runs_away < runs_home).astype("int64")
games["visit_or_home_victory"] = (runs_away < runs_home).map({True: "H", False: "V"})

# Shutout: at least one side scored zero runs
games["shutout"] = ((runs_away == 0) | (runs_home == 0)).astype("int64")

# One-run game: final margin of exactly one run
games["one_run_game"] = (run_margin == 1).astype("int64")

# High-scoring game: 10 or more combined runs
games["high_scoring_game"] = (combined_runs >= 10).astype("int64")

# Low-scoring game: 3 or fewer combined runs
games["low_scoring_game"] = (combined_runs <= 3).astype("int64")

# Blowout: final margin of 5 or more runs
games["blowout"] = (run_margin >= 5).astype("int64")

# Combined runs, hits and errors for both teams
games["total_runs"] = combined_runs
games["total_hits"] = hits_away + hits_home
games["total_errors"] = errors_away + errors_home

# Create a column that joins the away and home teams
games["teams"] = games["Away"] + " vs " + games["Home"]

# Count occurrences of each team matchup in the 'teams' column
team_counts = games['teams'].value_counts()

# Map the counts back to the original dataFrame
games['team_matchup_count'] = games['teams'].map(team_counts)

# Create a group id for each team matchup
games['series_id'] = games.groupby('teams').ngroup() + 1


Processing data for date: 2025-04-30
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-29
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-28
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-27
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-26
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-25
grid-noGutter mb-15 table loaded successfully.
Processing data for date: 2025-04-24
grid-noGutter mb-15 table loaded successfully.


### Checkpoint: everything up to here runs correctly


In [28]:
def todays_matchup():
    """Scrape today's scheduled games from the Rotowire MLB scoreboard.

    The scoreboard page for the current local date is loaded and every block
    of text that starts at a row containing " ET" and ends at the next
    "View Box Score" row is read as one game.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns ``game_time``, ``away_team``,
        ``home_team``, ``away_pitcher_name``, ``away_pitcher_record``,
        ``home_pitcher_name``, ``home_pitcher_record`` (scraped text) and
        ``date`` (today as ``YYYY-MM-DD``). The frame is empty, with the same
        columns, when no well-formed game block is found.

    Raises
    ------
    Exception
        Whatever the explicit wait raised when the scoreboard container did not
        appear within 60 seconds; it is logged and re-raised.
    """
    # Every game is expected to span exactly this many text rows, from its
    # " ET" row through its "View Box Score" row.
    rows_per_game = 8
    matchup_columns = ['game_time', 'away_team', 'home_team', 'away_pitcher_name',
                       'away_pitcher_record', 'home_pitcher_name', 'home_pitcher_record']

    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    today = datetime.now().strftime('%Y-%m-%d')

    datatable_id = 'grid-noGutter mb-15'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver; the finally clause guarantees the browser process
    # is released on both the success and the failure path.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"https://www.rotowire.com/baseball/scoreboard.php?date={today}")

        # Explicitly wait for the scoreboard container to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Nothing can be scraped without the container, so stop here
            # instead of continuing with an unusable page.
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Fixed pause after the explicit wait to let the rest of the page load
        time.sleep(10)

        # Grab the rendered text while the session is still alive
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        driver.quit()

    # One DataFrame row per rendered text line (tab-separated cells, if any)
    rows = text_content.split("\n")
    df = pd.DataFrame([row.split("\t") for row in rows])

    # A game block starts at a row containing " ET" (the start time) and ends
    # at the next "View Box Score" row
    start_indices = df[df[0].str.contains(" ET", case=False)].index
    end_indices = df[df[0].str.contains("View Box Score", case=False)].index

    game_records = []
    for start in start_indices:
        later_ends = end_indices[end_indices > start]
        if len(later_ends) == 0:
            continue  # no closing marker after this start row
        end = later_ends.min()
        block = df.iloc[start:end + 1].values.flatten()
        if len(block) != rows_per_game:
            # Validate each game on its own so one odd block cannot shift the
            # columns of every game that follows it.
            print(f"Warning: skipping malformed game block at rows {start}-{end} "
                  f"({len(block)} cells, expected {rows_per_game}).")
            continue
        # Drop the trailing "View Box Score" cell
        game_records.append(list(block[:-1]))

    if not game_records:
        print(f"No scheduled games found for {today}.")
        return pd.DataFrame(columns=matchup_columns + ['date'])

    reshaped_df = pd.DataFrame(game_records, columns=matchup_columns)

    # Add the date
    reshaped_df['date'] = today

    return reshaped_df

# Scrape the games scheduled for today
todays_matchups = todays_matchup()

# Parse the scraped date strings into pandas datetimes
todays_matchups["date"] = pd.to_datetime(todays_matchups["date"])

# Number the games 1..N from the row index and store the number as text
todays_matchups["game_id"] = pd.Series(
    todays_matchups.index + 1, index=todays_matchups.index
).astype(str)


grid-noGutter mb-15 table loaded successfully.


## Extract probable pitcher data from https://baseballsavant.mlb.com/probable-pitchers

In [76]:
def advanced_matchups():
    """Scrape the probable-pitcher matchup blocks from Baseball Savant.

    The rendered page text is cut into one record per row containing " @ ";
    each record's lines become positional columns (0, 1, 2, ...), which are
    then filtered and expanded into named stat columns.

    Returns
    -------
    pandas.DataFrame
        One row per matchup that survives the filters (column 4 is not
        "To be announced." and column 6 is not "Never Faced Any Players on
        this Team."). Remaining positional columns are 0 (teams as
        "<away> vs <home>"), 1 (today's date as ``YYYY-MM-DD``), 4, 5, 12, 13
        and any beyond 19; named columns are ``Column1``/``Column2`` (column 2
        split on "ET"), the ``*_away_pitcher`` / ``*_home_pitcher`` stat
        columns with ``unit_away`` / ``unit_home``, and
        ``away_team`` / ``home_team``.

    Raises
    ------
    Exception
        Whatever the explicit wait raised when the page container did not
        appear within 60 seconds; it is logged and re-raised.
    KeyError, ValueError
        If the page text does not produce the positional layout the column
        indices below rely on.
    """
    # Load the options
    options = Options()
    options.add_argument("--headless")  # Optional: Run in headless mode
    options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"

    today = datetime.now().strftime('%Y-%m-%d')

    datatable_id = 'template__content template--two-column__content--one'
    datatable_xpath = f"//div[@class='{datatable_id}']"  # Update XPATH as needed

    # Set up the WebDriver; the finally clause guarantees the browser process
    # is released on both the success and the failure path.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://baseballsavant.mlb.com/probable-pitchers")

        # Explicitly wait for the content container to load
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.XPATH, datatable_xpath))
            )
        except Exception as e:
            print(f"Error: Table {datatable_id} did not load. Details: {e}")
            # Nothing can be scraped without the container, so stop here
            # instead of continuing with an unusable page.
            raise
        print(f"{datatable_id} table loaded successfully.")

        # Fixed pause after the explicit wait to let the rest of the page load
        time.sleep(10)

        # Grab the rendered text while the session is still alive
        text_content = driver.find_element(By.XPATH, datatable_xpath).text
    finally:
        driver.quit()

    # One DataFrame row per rendered text line (tab-separated cells, if any)
    rows = text_content.split("\n")
    df = pd.DataFrame([row.split("\t") for row in rows])

    # Each row containing " @ " opens a new matchup; its record runs up to the
    # next such row (or the end of the text)
    split_indices = df[df[0].str.contains(" @ ")].index.tolist()
    split_indices.append(len(df))
    split_data = [df.iloc[lo:hi].values.flatten().tolist()
                  for lo, hi in zip(split_indices[:-1], split_indices[1:])]

    # One row per matchup, one positional column per text line of the record
    new_df = pd.DataFrame(split_data)

    # Split column 2 at the first "ET" into the part before and the part after;
    # n=1 keeps the result at two columns even if "ET" occurs again later
    df_expanded = new_df.copy()
    df_expanded[['Column1', 'Column2']] = df_expanded[2].str.split("ET", n=1, expand=True)

    # Drop the original column 2 together with column 3
    df_expanded = df_expanded.drop(columns=[2, 3])

    # Keep only matchups with an announced pitcher in column 4 and without the
    # "never faced" notice in column 6.
    # NOTE(review): columns 4-11 look like the away pitcher's section, judging
    # by the *_away_pitcher names assigned below; confirm against the page.
    keep_mask = (
        (df_expanded[4] != "To be announced.")
        & (df_expanded[6] != "Never Faced Any Players on this Team.")
    )
    df_split = df_expanded[keep_mask].copy()  # independent copy, safe to extend

    # Expand the space-separated stat lines into named columns.
    # ('Lunch_Angle' is kept as spelled because it is the published column name.)
    df_split[['PA_away_pitcher', 'K%_away_pitcher', 'BB%_away_pitcher', 'AVG_away_pitcher', 'wOBA_away_pitcher']] = df_split[8].str.split(" ", expand=True)
    df_split[['Exit_Velo_away_pitcher', 'unit_away', 'Lunch_Angle_away_pitcher', 'xBA_away_pitcher', 'xSLG_away_pitcher', 'xwOBA_away_pitcher']] = df_split[10].str.split(" ", expand=True)

    df_split[['PA_home_pitcher', 'K%_home_pitcher', 'BB%_home_pitcher', 'AVG_home_pitcher', 'wOBA_home_pitcher']] = df_split[16].str.split(" ", expand=True)
    df_split[['Exit_Velo_home_pitcher', 'unit_home', 'Lunch_Angle_home_pitcher', 'xBA_home_pitcher', 'xSLG_home_pitcher', 'xwOBA_home_pitcher']] = df_split[18].str.split(" ", expand=True)

    # Drop the raw positional columns that have been expanded or are not needed
    df_split = df_split.drop(columns=[6, 7, 8, 9, 10, 11, 14, 15, 16, 17, 18, 19])

    # City/region prefixes to strip so only the team nickname remains.
    # The '.' in "St. Louis" is escaped so it matches a literal dot only.
    city_prefix = re.compile(
        r'\b(?:San Diego|Pittsburgh|Arizona|Philadelphia|Kansas City|Baltimore|Tampa Bay'
        r'|New York|Cleveland|Toronto|Minnesota|Boston|Los Angeles|Atlanta|Houston|Chicago'
        r'|Seattle|Texas|Milwaukee|St\. Louis|Detroit|Colorado|San Francisco'
        r'|Cincinnati|Miami|Washington|Oakland)\b '
    )

    def remove_city_names(text):
        """Return ``text`` with every known city/region prefix removed."""
        return city_prefix.sub('', text)

    # Apply the function to the teams column
    df_split[0] = df_split[0].apply(remove_city_names)

    # Replace the literal " @ " separator in col 0 with " vs "
    df_split[0] = df_split[0].str.replace(" @ ", " vs ", regex=False)

    # Split col 0 on " vs " into the away and home team names
    df_split[['away_team', 'home_team']] = df_split[0].str.split(" vs ", expand=True)

    # Overwrite column 1 with today's date
    df_split[1] = today

    return df_split


In [77]:
# Scrape the Baseball Savant probable-pitchers page and keep the parsed matchups as a DataFrame
advanced_matchups_df = advanced_matchups()

template__content template--two-column__content--one table loaded successfully.
