In [54]:
#ALL IMPORTS AND FUNCTIONS

from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
import requests
from selenium import webdriver
from selenium.webdriver.safari.service import Service
from selenium.webdriver import SafariOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from datetime import datetime
import pandas as pd
import re


# Function to extract last name and first initial
def extract_name(name):
    """Standardize a player name to ``"<second word> <first initial>"``.

    Used to build a join key between tables that spell names differently
    (e.g. ``"John Smith"`` and ``"J. Smith"`` both become ``"Smith J"``).

    Args:
        name: Player name as a string.

    Returns:
        ``"Lastname F"`` when the name has two or more words (only the first
        two words are used), the single word itself when there is only one,
        or ``''`` when the name is blank.
    """
    parts = name.split()
    if not parts:
        # A blank cell has nothing to standardize; indexing parts[0] would
        # raise IndexError here.
        return ''
    if len(parts) > 1:  # "Firstname Lastname" or "F. Lastname"
        return f"{parts[1]} {parts[0][0]}"
    return parts[0]  # "Lastname"
        
# Function to parse the first string in the string and the number that comes afterwards
def split_string_number(s):
    """Split ``"<text> <number> (<number>)"`` into ``[text, number]``.

    Both elements are returned as strings. When the pattern is not found
    anywhere in ``s`` the result is ``["", ""]``.
    """
    found = re.search(r'(.*) (\d+) \(\d+\)', s)
    if found is None:
        return ["", ""]
    label, count = found.groups()
    return [label, count]

def _name_match_keys(name):
    """Return ``(last_name, last_name_plus_initial)`` for a player name.

    A one-word name yields the same value twice; a blank or non-string name
    yields ``(None, None)`` so it can never match anything.
    """
    if not isinstance(name, str):
        return None, None
    parts = name.split()
    if not parts:
        return None, None
    if len(parts) == 1:
        return parts[0], parts[0]
    return parts[1], f"{parts[1]} {parts[0][0]}"


def merge_dataframes(df1, df2, key1, key2):
    """Left-join ``df2`` onto ``df1`` by player name, tolerating name formats.

    ``df2`` names may be a bare last name (``"Smith"``) or initial/first name
    plus last name (``"J. Smith"``). A ``df1`` row is matched to a bare last
    name first and otherwise to a ``"Lastname F"`` key.

    The previous implementation ran two separate left merges and tried to
    append the second to the first by index. Both merges carried the same
    0..n-1 index, so the "F. Lastname" matches were always filtered out.
    Resolving one join key per row and merging once avoids that.

    Args:
        df1: Box-score table; ``df1[key1]`` holds the player names.
        df2: Additional-stats table; ``df2[key2]`` holds the player names.
        key1: Name column in ``df1``.
        key2: Name column in ``df2`` (dropped from the result).

    Returns:
        A new DataFrame with the columns of ``df1`` followed by the non-name
        columns of ``df2``, one row per ``df1`` row (unmatched rows get NaN).
        Neither input is modified.
    """
    merge_key = '_merge_key'

    # Key every df2 row; rows with blank names get None and are discarded so
    # they cannot pair up with unmatched df1 rows (pandas joins NaN to NaN).
    right = df2.copy()
    right[merge_key] = pd.Series(
        [_name_match_keys(name)[1] for name in right[key2]],
        index=right.index,
        dtype=object,
    )
    right = right[right[merge_key].notna()].drop(columns=[key2])
    known_keys = set(right[merge_key])

    def pick_key(name):
        # Bare-last-name keys are one word and "Lastname F" keys are two, so
        # the two lookups can never collide with each other.
        last_name, standardized = _name_match_keys(name)
        if last_name in known_keys:
            return last_name
        if standardized in known_keys:
            return standardized
        return None

    left = df1.copy()  # never add helper columns to the caller's frame
    left[merge_key] = pd.Series(
        [pick_key(name) for name in left[key1]],
        index=left.index,
        dtype=object,
    )

    result = pd.merge(left, right, on=merge_key, how='left')
    return result.drop(columns=[merge_key])

def to_camel_case(text):
    """Join the whitespace-separated words of ``text`` with each one capitalized.

    The first word goes through ``str.capitalize`` and the remaining words
    through ``str.title``; the pieces are concatenated without separators.
    """
    pieces = text.split()
    leading = pieces[0].capitalize()
    trailing = [piece.title() for piece in pieces[1:]]
    return leading + ''.join(trailing)

def get_main_boxscores(box_soup):
    """Parse the first ``<table>`` under ``box_soup`` into a DataFrame.

    Column names are the stripped text of every ``<th>`` in the table, with
    ``'jersey_number'`` prepended (presumably each data row starts with a
    number cell that has no header of its own -- confirm against the page).
    The first ``<tr>`` is skipped as the header row and the last data row is
    dropped; the original code named that last row ``box_total_stats``.

    Args:
        box_soup: BeautifulSoup element containing the box-score table.

    Returns:
        DataFrame of stripped cell strings, one row per player. An empty
        DataFrame is returned when there is no ``<table>`` or no data rows,
        so callers can rely on ``.empty`` instead of catching exceptions.
    """
    box_table = box_soup.find('table')
    if box_table is None:
        return pd.DataFrame()

    headers = ['jersey_number'] + [th.text.strip() for th in box_table.find_all('th')]
    data_rows = box_table.find_all('tr')[1:]  # skip the header row
    records = [[cell.text.strip() for cell in row.find_all('td')] for row in data_rows]
    box_individual_stats = pd.DataFrame(records, columns=headers)

    # Positional slicing tolerates zero rows, where index[-1] raised
    # IndexError. The copy() lets callers add columns without a
    # SettingWithCopyWarning.
    return box_individual_stats.iloc[:-1].copy()

def get_additional_batting_stats(box_soup, split_string_number):
    """Collect per-player extra batting stats from ``additional-details`` divs.

    Each ``<div class="additional-details">`` is read as one stat type: the
    label is the div's first child with surrounding whitespace and ``':'``
    removed, and every ``<li>`` inside is parsed into ``(name, value)``.

    Args:
        box_soup: BeautifulSoup element for one team's batting box score.
        split_string_number: Callable that turns an ``<li>`` text into
            ``[name, value]`` and returns ``["", ""]`` when it cannot parse.
            (The parameter shadows the module-level function of the same
            name; it is kept for backward compatibility.)

    Returns:
        DataFrame with columns ``Player, 2B, 3B, HR, RBI Check`` followed by
        any other stat labels encountered. Stat columns are nullable
        ``Int64``; a stat not reported for a player is ``<NA>``.
    """
    base_columns = ['Player', '2B', '3B', 'HR', 'RBI']
    extra_columns = []
    stats_by_player = {}  # name -> {stat label: int}, in first-seen order

    for div in box_soup.find_all('div', {'class': 'additional-details'}):
        if not div.contents:
            continue
        # Get the stat type (2B, HR, RBI)
        stat_type = div.contents[0].strip().replace(':', '')
        for li in div.find_all('li'):
            name, stat_value = split_string_number(li.text)
            # Skip entries the parser could not read. Previously the first
            # one created a blank-named row and the second crashed on int("").
            if not name or not str(stat_value).isdigit():
                continue
            if stat_type not in base_columns and stat_type not in extra_columns:
                extra_columns.append(stat_type)
            stats_by_player.setdefault(name, {})[stat_type] = int(stat_value)

    # Build the frame once instead of concatenating inside the loop.
    players = list(stats_by_player)
    columns = base_columns + extra_columns
    data = {'Player': pd.Series(players, dtype=object)}
    for column in columns[1:]:
        data[column] = pd.array(
            [stats_by_player[player].get(column) for player in players],
            dtype='Int64',
        )
    additional_stats = pd.DataFrame(data, columns=columns)

    # The main box score already has an RBI column; keep this one separate.
    return additional_stats.rename(columns={'RBI': 'RBI Check'})

def get_additional_pitching_stats(box_soup):
    """Summarize wins, losses, pitches and strikes per pitcher.

    Every ``<div class="additional-details">`` under ``box_soup`` is scanned.
    If the div's text contains ``"Win:"`` or ``"Loss:"``, its first ``<li>``
    is recorded as the pitcher of record. All remaining ``<li>`` items (and
    every item of other divs) are matched against
    ``"<name> <pitches> (<strikes> strikes)"``.

    Returns:
        DataFrame with columns ``Player, Wins, Losses, Pitches, Strikes``,
        summed per ``Player``.
    """
    records = []
    for detail in box_soup.find_all('div', {'class': 'additional-details'}):
        lost = "Loss:" in detail.text
        won = "Win:" in detail.text
        entries = detail.find_all('li')

        if (won or lost) and entries:
            # Only the first entry of a decision block carries the win/loss;
            # whatever follows is treated like a pitch-count line.
            decision, entries = entries[0], entries[1:]
            records.append([decision.text.strip(), int(won), int(lost), 0, 0])

        for entry in entries:
            parsed = re.match(r'(.*) (\d+) \((\d+) strikes\)', entry.text)
            if parsed is None:
                continue
            pitcher, pitch_count, strike_count = parsed.groups()
            records.append([pitcher, 0, 0, int(pitch_count), int(strike_count)])

    # Collapse the separate decision and pitch-count rows of each pitcher
    per_entry = pd.DataFrame(records, columns=['Player', 'Wins', 'Losses', 'Pitches', 'Strikes'])
    return per_entry.groupby('Player').sum().reset_index()
    
def _parse_record(record):
    """Turn ``"(W-L-T)"`` or ``"(W-L)"`` into a ``(wins, losses, ties)`` tuple.

    Missing or blank fields count as 0, so a record without a ties component
    no longer fails the three-way unpack. Non-numeric fields still raise
    ``ValueError``.
    """
    fields = [field.strip() for field in record.strip().strip('()').split('-')]
    values = [int(field) if field else 0 for field in fields]
    values += [0] * (3 - len(values))
    return tuple(values[:3])


def get_game_stats(soup, team_type):
    """Read one team's banner information from a game page.

    Args:
        soup: BeautifulSoup document of the game page.
        team_type: Suffix of the banner class to look up; the element
            searched for is ``"gamecenter-game-banner-team {team_type}"``.

    Returns:
        One-row DataFrame with columns ``team_name_long, team_name_short,
        wins, losses, ties, team_logo``.

    Raises:
        AttributeError: if the banner or one of its child elements is missing.
        ValueError: if the record contains a non-numeric field.
    """
    team = soup.find(class_=f"gamecenter-game-banner-team {team_type}")

    team_name_long = team.find(class_='team-name-long').text.strip()
    team_name_short = team.find(class_='team-name-short').text.strip()
    record = team.find(class_='record').text.strip()
    team_logo = team.find('img')['src']

    wins, losses, ties = _parse_record(record)
    data = [[team_name_long, team_name_short, wins, losses, ties, team_logo]]

    return pd.DataFrame(
        data,
        columns=['team_name_long', 'team_name_short', 'wins', 'losses', 'ties', 'team_logo'],
    )

def convert_all_stats_to_int(stats):
    """Cast every non-descriptive column of ``stats`` to ``int`` in place.

    Columns listed in ``keep_as_is`` (names, positions, rate stats and
    bookkeeping fields) are left untouched; every other column is converted
    with ``astype(int)``. The same DataFrame object is returned.
    """
    keep_as_is = (
        'Player', 'Batter', 'Pitcher', 'IP', 'ERA', 'AVG', 'POS', 'OBP',
        'team_name_long', 'game_id', 'team_type', 'date', 'ingestion_time',
    )
    counting_columns = [name for name in stats.columns if name not in keep_as_is]
    converted = stats[counting_columns].astype(int)
    stats[counting_columns] = converted
    return stats

def get_game_urls(all_todays_games):
    """Return the ``href`` of the first ``<a>`` inside each game element.

    The result is a list in the same order as ``all_todays_games``.
    """
    return [game_pod.find('a')['href'] for game_pod in all_todays_games]

In [55]:
def get_all_box_scores(all_todays_game_urls, game_date=None):
    """Scrape visitor and home batting/pitching box scores for each game.

    For every game a Safari WebDriver session is opened on the game page.
    The visitor tables are read from the initial page source; the home tables
    are read after clicking the home-team selector. Each table is parsed with
    ``get_main_boxscores``, enriched with the matching additional stats, and
    tagged with team name, game id, team type and date.

    Args:
        all_todays_game_urls: Iterable of site-relative game paths such as
            ``'/game/6290612'``. The path is also stored as ``game_id``.
        game_date: Value for the ``date`` column (anything
            ``pd.to_datetime`` accepts). When omitted, the notebook-level
            variable ``date`` is used, as this function did originally.

    Returns:
        Tuple ``(individual_batting_stats, individual_pitching_stats,
        unscraped_games)``. The two DataFrames have NaN replaced by 0 and
        stat columns cast to int. ``unscraped_games`` lists each game once
        per box-score table that could not be found on the page.
    """
    # Frames are collected in lists and concatenated once at the end.
    batting_frames = []
    pitching_frames = []
    # Games where at least one box-score table was not found
    unscraped_games = []

    for game in all_todays_game_urls:
        print(game)

    team_types = ['visitor', 'home']
    box_types = ['batter', 'pitcher']

    #--------------------------------- LOOP THROUGH EACH GAME
    for game in all_todays_game_urls:
        driver = webdriver.Safari()
        try:
            # Game paths start with '/', so strip it to avoid 'ncaa.com//game/...'
            driver.get('https://www.ncaa.com/' + game.lstrip('/'))

            #------------------------- SCRAPE ONCE FOR VISITOR AND ONCE FOR HOME
            for team_type in team_types:

                if team_type == 'home':
                    team_type_mask = 'home'
                    # Select home team to expose home team box scores
                    try:
                        wait = WebDriverWait(driver, 1)  # wait up to 1 second
                        home_team_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.boxscore-team-selector-team.home')))
                        home_team_button.click()
                        print("Home team button clicked successfully.")
                        time.sleep(5)  # give the page time to swap in the home tables
                    except Exception as e:
                        # Best effort: if the click fails, the home tables will
                        # be missing below and the game is reported as unscraped.
                        print("Error in clicking home team button: ", str(e))
                else:
                    # The banner uses 'away' where the box-score classes use 'visitor'
                    team_type_mask = 'away'

                soup = BeautifulSoup(driver.page_source, 'lxml')
                # Looked up lazily, at most once per team (not once per box type)
                team_name_long = None

                for box_type in box_types:

                    #----------------------------- LOCATE THE BOX SCORE TABLE
                    box_soup = soup.find(class_=f"boxscore-table boxscore-table_{box_type}_{team_type}")

                    if box_soup is None:
                        print(f"Failed to captured {team_type} {box_type} stats from {game}")
                        unscraped_games.append(game)
                        continue

                    #----------------------------- MAIN BATTING/PITCHING STATS
                    box_individual_stats = get_main_boxscores(box_soup)

                    if box_individual_stats.empty:
                        print(f"Main boxscore dataframe is empty and {team_type}_{box_type} stats for {game} were not ingested")
                        continue

                    #----------------------------- ADDITIONAL STATS
                    if box_type == 'batter':
                        additional_stats = get_additional_batting_stats(box_soup, split_string_number)
                    else:
                        additional_stats = get_additional_pitching_stats(box_soup)

                    #----------------------------- JOIN BOX SCORE WITH ADDITIONAL STATS
                    if not additional_stats.empty:
                        # The name column of the main table is 'Batter' or 'Pitcher'
                        stats = merge_dataframes(box_individual_stats, additional_stats, to_camel_case(box_type), 'Player')
                    else:
                        stats = box_individual_stats

                    #----------------------------- TAG ROWS WITH GAME CONTEXT
                    if team_name_long is None:
                        team_name_long = get_game_stats(soup, team_type_mask)['team_name_long'].iloc[0]
                    stats['team_name_long'] = team_name_long
                    stats['game_id'] = game
                    stats['team_type'] = team_type
                    # `date` is the notebook-level loop variable; it is only
                    # read when the caller did not pass game_date.
                    stats['date'] = pd.to_datetime(date if game_date is None else game_date)
                    stats = stats.dropna(how='all')

                    #----------------------------- COLLECT
                    if box_type == 'batter':
                        batting_frames.append(stats)
                    else:
                        pitching_frames.append(stats)
                    print(f"Successfully captured {team_type} {box_type} stats from {game}")

            time.sleep(1)
        finally:
            # Always release the browser session, even when a scrape step
            # raises, so no driver is left running after a failure.
            driver.quit()

    individual_batting_stats = pd.concat(batting_frames) if batting_frames else pd.DataFrame()
    individual_pitching_stats = pd.concat(pitching_frames) if pitching_frames else pd.DataFrame()

    individual_batting_stats = individual_batting_stats.fillna(0)
    individual_pitching_stats = individual_pitching_stats.fillna(0)
    individual_batting_stats = convert_all_stats_to_int(individual_batting_stats)
    individual_pitching_stats = convert_all_stats_to_int(individual_pitching_stats)

    return individual_batting_stats, individual_pitching_stats, unscraped_games

In [57]:

# Start the timer
start_time = time.time()

dates = ['2024/03/04']
# Upper bound on re-scrape rounds so a permanently broken game page cannot
# keep the retry loop running forever.
MAX_RETRY_ROUNDS = 3
final_batting_stats = pd.DataFrame()
final_pitching_stats = pd.DataFrame()

# NOTE: the loop variable must stay named `date`; get_all_box_scores() reads
# it as a global when filling the 'date' column.
for date in dates:
    # The URL of the scoreboard page for this date
    url = f'https://www.ncaa.com/scoreboard/baseball/d1/{date}'
    print(url)
    # Fetch the scoreboard; fail loudly on timeouts and HTTP errors instead
    # of silently parsing an error page into "no games"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Parse the HTML content of the page with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the finished games on the page
    all_todays_games = soup.find_all('div', class_='gamePod gamePod-type-game status-final')
    all_todays_game_urls = get_game_urls(all_todays_games)
    print("Original games being scraped:")
    original_batting_stats, original_pitching_stats, missed_games = get_all_box_scores(all_todays_game_urls)
    missed_games = list(set(missed_games))
    print(f"missed games: {missed_games}")

    retry_round = 0
    while missed_games and retry_round < MAX_RETRY_ROUNDS:
        retry_round += 1
        print("Missed games being scraped:")
        batting_backup_stats, pitching_backup_stats, missed_games = get_all_box_scores(missed_games)
        original_batting_stats = pd.concat([original_batting_stats, batting_backup_stats])
        original_pitching_stats = pd.concat([original_pitching_stats, pitching_backup_stats])
        missed_games = list(set(missed_games))

    if missed_games:
        print(f"Giving up after {MAX_RETRY_ROUNDS} retry rounds; still missing: {missed_games}")

    # A retried game may repeat tables that were already captured the first time
    original_batting_stats = original_batting_stats.drop_duplicates()
    original_pitching_stats = original_pitching_stats.drop_duplicates()

    final_batting_stats = pd.concat([final_batting_stats, original_batting_stats])
    final_pitching_stats = pd.concat([final_pitching_stats, original_pitching_stats])

# One timestamp for both tables, under the same column name (the pitching
# column was previously misspelled 'ingetstion_time')
ingestion_time = datetime.now()
final_batting_stats['ingestion_time'] = ingestion_time
final_pitching_stats['ingestion_time'] = ingestion_time

# End the timer and print the elapsed time
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")


# OPTIMIZE SO RUNS FASTER
# HEADLESS BROWSER - Mozilla

    
#WRITE NCAA GAME SCRAPER
#WRITE NCAA ROSTER SCRAPER
        

https://www.ncaa.com/scoreboard/baseball/d1/2024/03/04
Original games being scraped:
/game/6290612
/game/6234317
Successfully captured visitor batter stats from /game/6290612
Successfully captured visitor pitcher stats from /game/6290612
Home team button clicked successfully.
Successfully captured home batter stats from /game/6290612
Successfully captured home pitcher stats from /game/6290612
Successfully captured visitor batter stats from /game/6234317
Successfully captured visitor pitcher stats from /game/6234317
Home team button clicked successfully.
Successfully captured home batter stats from /game/6234317
Successfully captured home pitcher stats from /game/6234317
missed games: []
Time taken: 26.95056390762329 seconds


In [37]:
# missed_games = list(set(missed_games))
# while missed_games:    
#     print("Missed games being scraped:")
#     batting_backup_stats, pitching_backup_stats, missed_games = get_all_box_scores(missed_games)
#     original_batting_stats = pd.concat([original_batting_stats,batting_backup_stats])
#     original_pitching_stats = pd.concat([original_pitching_stats,pitching_backup_stats])
    

Missed games being scraped:
/game/6233798
/game/6233874
/game/6233804
/game/6233811
/game/6233795
/game/6233858
/game/6233802
/game/6233861
/game/6233809
/game/6233800
/game/6232486
/game/6233803
/game/6233810
/game/6233799
/game/6233296
/game/6240601
/game/6233792
/game/6233790
/game/6288466
/game/6233808
/game/6233794
/game/6233805
/game/6233793
/game/6233801
/game/6232620
/game/6233797
/game/6233860
/game/6233796
/game/6233807
/game/6233855
Successfully captured visitor batter stats from /game/6233798
Successfully captured visitor pitcher stats from /game/6233798
Home team button clicked successfully.
Successfully captured home batter stats from /game/6233798
Successfully captured home pitcher stats from /game/6233798
Successfully captured visitor batter stats from /game/6233874
Successfully captured visitor pitcher stats from /game/6233874
Home team button clicked successfully.
Successfully captured home batter stats from /game/6233874
Successfully captured home pitcher stats from 

  individual_batting_stats = pd.concat([individual_batting_stats,data_frames[f"{team_type}_{box_type}_individual_stats"]])


Successfully captured visitor batter stats from /game/6233811
Successfully captured visitor pitcher stats from /game/6233811
Home team button clicked successfully.
Successfully captured home batter stats from /game/6233811
Successfully captured home pitcher stats from /game/6233811
Successfully captured visitor batter stats from /game/6233795
Successfully captured visitor pitcher stats from /game/6233795
Home team button clicked successfully.
Successfully captured home batter stats from /game/6233795
Successfully captured home pitcher stats from /game/6233795
Successfully captured visitor batter stats from /game/6233858
Successfully captured visitor pitcher stats from /game/6233858
Home team button clicked successfully.
Successfully captured home batter stats from /game/6233858
Successfully captured home pitcher stats from /game/6233858
Successfully captured visitor batter stats from /game/6233802
Successfully captured visitor pitcher stats from /game/6233802
Home team button clicked s

In [50]:
print(final_batting_stats[final_batting_stats['team_name_long'] == 'Niagara'])

    jersey_number        Batter POS  AB  R  H  RBI  BB  SO AVG OBP  2B  3B  \
0               5        Groves  CF   2  3  1    0   2   1   0   0   0   0   
1              14      Hutchins  CF   1  0  0    0   0   0   0   0   0   0   
2              10        Monile  LF   1  2  0    0   2   0   0   0   0   0   
3              21      Rataczak  1B   2  1  0    2   2   0   0   0   0   0   
4              35         Green  1B   1  0  1    0   0   0   0   0   0   0   
..            ...           ...  ..  .. .. ..  ...  ..  ..  ..  ..  ..  ..   
7              18  Gnardellis N  LF   3  0  1    0   0   1   0   0   0   0   
8              12     C. Satcho  3B   2  0  0    0   1   1   0   0   0   0   
9              25   J. Messmore   P   0  0  0    0   0   0   0   0   0   0   
10             30    R Gonzalez   P   0  0  0    0   0   0   0   0   0   0   
11              6       K. Head   P   0  0  0    0   0   0   0   0   0   0   

    HR  RBI Check team_name_long        game_id team_type      

In [58]:

# Export the combined batting stats to a CSV file
final_batting_stats.to_csv("test_data/all_batting_stats_mar_3.csv", index=False)

# Export the combined pitching stats to a CSV file
final_pitching_stats.to_csv("test_data/all_pitching_stats_mar_3.csv", index=False)

# get_all_box_scores() creates and quits its own local driver, so a
# notebook-level `driver` only exists if an exploratory cell created one.
# Quit it when present instead of raising NameError.
if 'driver' in globals():
    driver.quit()

In [8]:
# #GET BATTING BOX SCORES FROM A SINGLE GAME

#----------------------------------- GET BEAUTIFUL SOUP BOX SCORES AND ADD FAILSAFES
            
            # box_soup = None
            # attempts = 0
            # max_attempts = 5
            # while box_soup is None and attempts < max_attempts:
            #     try:
            #         wait = WebDriverWait(driver, 2)  # wait up to 2 seconds
            #         box_soup = soup.find(class_=f"boxscore-table boxscore-table_{box_type}_{team_type}")
            #         if box_soup is not None:
            #             break  # If we find the element, exit the loop
            #     except Exception as e:
            #         print(f"An error occurred: {e}")
            #     time.sleep(2)  # Wait for 2 seconds before trying again
            #     attempts += 1
            #     print(f"Attempt: {attempts}")
                
            # if attempts == max_attempts:
            #     print(f"Failed to captured {team_type} {box_type} stats from {game_url}")
            #     continue  # Skip to the next part of the loop
            
#--------------------------------------FIGURE OUT HOW TO RERUN FAILED SCRAPES

            #CODE TO TRY AGAIN 



# #NEXT STEPS:
# #ADD GAME ID INFO TO EACH TABLE AND WHICH TEAM THE PLAYER BELONGS TO, Home/Away, ALSO ADD TIMESTAMP OF GAME AND DATA INGESTION
# #CONSIDER GRABBING ADDITIONAL BATTING STATISTICS, PITCHING STATISTICS (1B,2B,3B,HR)
# #    - WRITE CODE TO GRAB ADDITIONAL PITCHING STATS (PITCHER	POS	IP	H	R	ER	BB	SO	BF	ERA)
# #INTRODUCE THE NEW METRICS ABOVE INTO THE FOR LOOP AND CODE
# #ALSO NEED TO CREATE A GAME TABLE WITH RELEVANT INFO (AWAY TEAM, HOME TEAM, SCORE, RUNS, HITS, ERRORS, INNINGS)
# #THEN NEED TO CREATE BATTING, PITCHING, AND GAME TABLES
# #LOOP THROUGH ALL GAMES AND ADD 5 DATAFRAMES TO 3 CORE ONES
# #NEED TO JOIN BY JERSEY NUMBER BECAUSE NAMES ARE NOT NECESSARILY CONSISTENT ACROSS GAMES

# from bs4 import BeautifulSoup
# from pyvirtualdisplay import Display
# import requests
# from selenium import webdriver
# from selenium.webdriver.safari.service import Service
# from selenium.webdriver import SafariOptions
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# import time
# import pandas as pd
# import re

# # Function to extract last name and first initial
# def extract_name(name):
#     parts = name.split()
#     if len(parts) > 1:  # Format is "Firstname Lastname" or "F. Lastname"
#         return parts[1] + ' ' + parts[0][0]
#     else:  # Format is "Lastname"
#         return parts[0]

# def split_string_number(s):
#     match = re.search(r'(.*) (\d+) \(\d+\)', s)
#     if match:
#         return list(match.groups())
#     else:
#         return ["", ""]

# import pandas as pd

# def merge_dataframes(df1, df2, key1, key2):
#     # Create a new column in df1 that contains only the last name
#     df1['last_name'] = df1[key1].apply(lambda x: x.split()[1] if len(x.split()) > 1 else x.split()[0])

#     # Split df2 into two dataframes
#     df2_lastname = df2[df2[key2].apply(lambda x: len(x.split()) == 1)]
#     df2_initial_lastname = df2[df2[key2].apply(lambda x: len(x.split()) > 1)]

#     # Perform the join operation on last name
#     result_lastname = pd.merge(df1, df2_lastname, left_on='last_name', right_on=key2, how='left')

#     # Standardize names in both dataframes
#     df1['standardized'] = df1[key1].apply(extract_name)
#     df2_initial_lastname.loc[:, 'standardized'] = df2_initial_lastname[key2].apply(extract_name)

#     # Perform the join operation on first initial and last name
#     result_initial_lastname = pd.merge(df1, df2_initial_lastname, left_on='standardized', right_on='standardized', how='left')

#     # Append the rows from result_initial_lastname that are not already in result_lastname
#     result = pd.concat([result_lastname, result_initial_lastname.loc[~result_initial_lastname.index.isin(result_lastname.index)]])

#     # Drop the temporary columns
#     result = result.drop(columns=['last_name', 'standardized', key2])
#     result = result.fillna(0)
    
#     return result


# # Start the timer
# start_time = time.time()

# game_url = 'game/6288640' ## Use this as game id

# print("https://www.ncaa.com"+game_url)
    
# #Headless Driver - Don't bring up browser
# options = SafariOptions()
# # options.add_argument("--window-size=1920,1080")
# driver = webdriver.Safari()

# driver.get('https://www.ncaa.com/'+game_url)

# soup = BeautifulSoup(driver.page_source, 'lxml')

# box_soup = soup.find(class_="boxscore-table boxscore-table_batter_visitor")


# # ----------------GET CORE BOX SCORE STATS (TO TEST JOIN WITH ADDITIONAL BATTING STATS)

# box_table = box_soup.find('table')
        
# headers = [header.text.strip() for header in box_table.find_all('th')]
# headers.insert(0,'jersey_number')

# rows = box_table.find_all('tr')
# box_df = []

# for row in rows[1:]:  # Skip the header row

#     cols = row.find_all('td')
#     cols = [col.text.strip() for col in cols]
#     box_df.append(cols)

# box_individual_stats = pd.DataFrame(box_df, columns=headers)

# box_total_stats = box_individual_stats.tail(1)

# box_individual_stats = box_individual_stats.drop(box_individual_stats.index[-1])

# # print(box_individual_stats)



# #-------------------------GET ADDITIONAL BATTING STATS 

# additional_df = pd.DataFrame(columns=['Player','2B','3B','HR','RBI']) ## OTHERS???

# for div in box_soup.find_all('div', {'class': 'additional-details'}):
    
#     # Get the stat type (2B, HR, RBI)
#     stat_type = div.contents[0].strip().replace(':', '')
#     # print(stat_type)
#     # Loop over each li in the ul
#     for li in div.find_all('li'):
#         # Get the name and stat value
#         name, stat_value = split_string_number(li.text)
#         # print(f"name:{name}")
#         # print(f"stat_value:{stat_value}")
        
#         # If the name is already in the DataFrame, update the stat value
#         if name in additional_df['Player'].values:
#             additional_df.loc[additional_df['Player'] == name, stat_type] = stat_value
#         # Otherwise, add a new row to the DataFrame
#         else:
#             additional_df = pd.concat([additional_df, pd.DataFrame({'Player': [name], stat_type: [stat_value]})], ignore_index=True)
#     additional_df = additional_df.fillna(0)
#     additional_df = additional_df.astype({'Player': str, '2B': int, '3B': int, 'HR': int, 'RBI': int})

# print(additional_df)

# final_df = merge_dataframes(df1=box_individual_stats, df2=additional_df, key1='Batter', key2='Player')

# # result = merge_dataframes(df1=box_individual_stats, df2=additional_df, key1 = 'Batter', key2= 'Player') #MUST REPLACE NAME AND BATTER/PITCHER BASED ON FOR LOOP

# # JOIN ADDITIONAL STATS TO INDIVIDUAL BOX SCORE STATS
# # --------------------------------------
# # # Create a new column in box_individual_stats that contains only the last name
# # box_individual_stats['Batter_last_name'] = box_individual_stats['Batter'].apply(lambda x: x.split()[1] if len(x.split()) > 1 else x.split()[0])

# # # Split additional_df into two dataframes
# # additional_df_lastname = additional_df[additional_df['Player'].apply(lambda x: len(x.split()) == 1)]
# # additional_df_initial_lastname = additional_df[additional_df['Player'].apply(lambda x: len(x.split()) > 1)]

# # # Perform the join operation on last name
# # result_lastname = pd.merge(box_individual_stats, additional_df_lastname, left_on='Batter_last_name', right_on='Player', how='left')

# # # Standardize names in both dataframes
# # box_individual_stats['Batter_standardized'] = box_individual_stats['Batter'].apply(extract_name)
# # additional_df_initial_lastname['Player_standardized'] = additional_df_initial_lastname['Player'].apply(extract_name)

# # # Perform the join operation on first initial and last name
# # result_initial_lastname = pd.merge(box_individual_stats, additional_df_initial_lastname, left_on='Batter_standardized', right_on='Player_standardized', how='left')

# # # Append the rows from result_initial_lastname that are not already in result_lastname
# # result = pd.concat([result_lastname, result_initial_lastname.loc[~result_initial_lastname.index.isin(result_lastname.index)]])

# # # Drop the temporary columns
# # result = result.drop(columns=['Batter_last_name', 'Batter_standardized', 'Player_standardized','Player'])

# print(final_df)


In [None]:
# #GET PITCHING BOX SCORES FROM A SINGLE GAME

# from bs4 import BeautifulSoup
# from pyvirtualdisplay import Display
# import requests
# from selenium import webdriver
# from selenium.webdriver.safari.service import Service
# from selenium.webdriver import SafariOptions
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# import time
# import pandas as pd
# import re
# # 
# # Function to extract last name and first initial
# def extract_name(name):
#     parts = name.split()
#     if len(parts) > 1:  # Format is "Firstname Lastname" or "F. Lastname"
#         return parts[1] + ' ' + parts[0][0]
#     else:  # Format is "Lastname"
#         return parts[0]

# def split_string_number(s):
#     match = re.search(r'(.*) (\d+) \(\d+\)', s)
#     if match:
#         return list(match.groups())
#     else:
#         return ["", ""]

# def merge_dataframes(df1, df2, key1, key2):
#     # Create a new column in df1 that contains only the last name
#     df1['last_name'] = df1[key1].apply(lambda x: x.split()[1] if len(x.split()) > 1 else x.split()[0])

#     # Split df2 into two dataframes
#     df2_lastname = df2[df2[key2].apply(lambda x: len(x.split()) == 1)]
#     df2_initial_lastname = df2[df2[key2].apply(lambda x: len(x.split()) > 1)]

#     # Perform the join operation on last name
#     result_lastname = pd.merge(df1, df2_lastname, left_on='last_name', right_on=key2, how='left')

#     # Standardize names in both dataframes
#     df1['standardized'] = df1[key1].apply(extract_name)
#     df2_initial_lastname['standardized'] = df2_initial_lastname[key2].apply(extract_name)

#     # Perform the join operation on first initial and last name
#     result_initial_lastname = pd.merge(df1, df2_initial_lastname, left_on='standardized', right_on='standardized', how='left')

#     # Append the rows from result_initial_lastname that are not already in result_lastname
#     result = pd.concat([result_lastname, result_initial_lastname.loc[~result_initial_lastname.index.isin(result_lastname.index)]])

#     # Drop the temporary columns
#     result = result.drop(columns=['last_name', 'standardized', key2])
#     result = result.fillna(0)
    
#     return result

# # Start the timer
# start_time = time.time()

# game_url = 'game/6233800' ## Use this as game id

# print("https://www.ncaa.com/"+game_url)
    
# #Headless Driver - Don't bring up browser
# options = SafariOptions()
# # options.add_argument("--window-size=1920,1080")
# driver = webdriver.Safari()

# driver.get('https://www.ncaa.com/'+game_url)

# #-----------------Click JavaScript to get home team stats
# home_team_button = driver.find_element('css selector', '.boxscore-team-selector-team.home')
# home_team_button.click()

# soup = BeautifulSoup(driver.page_source, 'lxml')

# box_soup = soup.find(class_="boxscore-table boxscore-table_pitcher_home")


# # ----------------GET CORE BOX SCORE STATS (TO TEST JOIN WITH ADDITIONAL BATTING STATS)

# box_table = box_soup.find('table')
        
# headers = [header.text.strip() for header in box_table.find_all('th')]
# headers.insert(0,'jersey_number')

# rows = box_table.find_all('tr')
# box_df = []

# for row in rows[1:]:  # Skip the header row

#     cols = row.find_all('td')
#     cols = [col.text.strip() for col in cols]
#     box_df.append(cols)

# box_individual_stats = pd.DataFrame(box_df, columns=headers)

# box_total_stats = box_individual_stats.tail(1)

# box_individual_stats = box_individual_stats.drop(box_individual_stats.index[-1])

# print(box_individual_stats)



# #-------------------------GET ADDITIONAL PITCHING STATS 


# data = []
# # print(box_soup.find_all('div', {'class': 'additional-details'})
# for div in box_soup.find_all('div', {'class': 'additional-details'}):
    
#     # Check if it's a win or loss
#     is_loss = "Loss:" in div.text
#     is_won = "Win:" in div.text
    
#     # Find all li elements within the div
#     for li in div.find_all('li'):
#         if is_won or is_loss:
#             player_name = li.text.strip()
#             data.append([player_name, 1 if is_won else 0, 1 if is_loss else 0,0,0])
#             is_loss=False
#             is_won=False
#         else:
#             match = re.match(r'(.*) (\d+) \((\d+) strikes\)', li.text)
#             if match:
#                 player = match.group(1)
#                 pitches = int(match.group(2))
#                 strikes = int(match.group(3))
#                 data.append([player, 0, 0, pitches, strikes])
  
# # Create a DataFrame with Additional Pitching stats
# df = pd.DataFrame(data, columns=['Player', 'Wins', 'Losses', 'Pitches', 'Strikes'])
# additional_df = df.groupby('Player').sum().reset_index()

# print(additional_df)
    



# #JOIN ADDITIONAL STATS TO INDIVIDUAL BOX SCORE STATS
# # --------------------------------------
# final_batter_or_pitcher = merge_dataframes(df1=box_individual_stats, df2=additional_df, key1 = 'Pitcher', key2= 'Player') #TODO: once this runs inside the for loop, the result name and the batter/pitcher key (key1) must be set per iteration instead of hard-coded

# print(final_batter_or_pitcher)

# driver.quit()

In [None]:
# #CODE TO ADD DATA FROM OUTSIDE THE BOX SCORE TABLE (GAME BANNER: TEAM NAMES, RECORD, LOGO) TO THE RETURNED BOX SCORE DATAFRAMES
# team_type = 'away'

# from bs4 import BeautifulSoup
# from pyvirtualdisplay import Display
# import requests
# from selenium import webdriver
# from selenium.webdriver.safari.service import Service
# from selenium.webdriver import SafariOptions
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# import time
# import pandas as pd
# import re

# # Function to extract last name and first initial
# def extract_name(name):
#     parts = name.split()
#     if len(parts) > 1:  # Format is "Firstname Lastname" or "F. Lastname"
#         return parts[1] + ' ' + parts[0][0]
#     else:  # Format is "Lastname"
#         return parts[0]

# def split_string_number(s):
#     match = re.search(r'(.*) (\d+) \(\d+\)', s)
#     if match:
#         return list(match.groups())
#     else:
#         return ["", ""]


# game_url = 'game/6233830' ## Use this as game id

# print("https://www.ncaa.com/"+game_url)
    
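# #Browser driver - NOTE: `options` below is created but never passed to webdriver.Safari(), so it has no effect (no headless mode is actually configured)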
# options = SafariOptions()
# # options.add_argument("--window-size=1920,1080")
# driver = webdriver.Safari()

# driver.get('https://www.ncaa.com/'+game_url)

# home_team_button = driver.find_element('css selector', '.boxscore-team-selector-team.home')
# home_team_button.click()

# soup = BeautifulSoup(driver.page_source, 'lxml')


# #--------------------GET GAME STATS (TEAM NAMES, RECORDS, IMAGE LINKS)
# team = soup.find(class_=f"gamecenter-game-banner-team {team_type}")

# team_name_long = team.find(class_='team-name-long').text.strip()  # strip leading/trailing whitespaces and newline characters
# team_name_short = team.find(class_='team-name-short').text.strip()  # strip leading/trailing whitespaces and newline characters
# record = team.find(class_='record').text.strip()  # strip leading/trailing whitespaces and newline characters
# team_logo = team.find('img')['src']  # get the 'src' attribute of the 'img' tag
# # print(team_name_long)
# # print(team_name_short)
# # print(record)
# wins, losses, ties = map(int, record.strip('()').split('-'))
# data_frames ={}
# data = []
# data.append([team_name_long, team_name_short, wins, losses, ties, team_logo])

# data_frames[f"{team_type}_team_stats"] = pd.DataFrame(data, columns=['team_name_long', 'team_name_short', 'wins', 'losses', 'ties','team_logo'])

# print(data_frames[f"{team_type}_team_stats"])





# # ----------------GET CORE BOX SCORE STATS (TO TEST JOIN WITH ADDITIONAL BATTING STATS)


# #Reference: each team's long name, short name, record and logo are scraped from this banner HTML (one "gamecenter-game-banner-team" div per team: away, then home):
# # <div class="gamecenter-game-banner-content">

# # <div class="gamecenter-game-banner-team away">
# # <div class="team-stats">

# # <div class="team-stats-name-record-container">
# # <span class="team-name">
# # <span class="team-name-long">
# # Florida
# # </span>
# # <span class="team-name-short">
# # Florid
# # </span>
# # </span>
# # <span class="record">
# # (4-1-0)
# # </span>
# # </div>
# # </div>

# # <img class="team-logo" src="//i.turner.ncaa.com/sites/default/files/images/logos/schools/bgd/florida.svg" alt="Florida" title="Florida">

# # </div><div class="gamecenter-game-banner-team home">
# # <div class="team-stats">

# # <div class="team-stats-name-record-container">
# # <span class="team-name">
# # <span class="team-name-long">
# # Stetson
# # </span>
# # <span class="team-name-short">
# # Stetso
# # </span>
# # </span>
# # <span class="record">
# # (3-3-0)
# # </span>
# # </div>
# # </div>

# # <img class="team-logo" src="//i.turner.ncaa.com/sites/default/files/images/logos/schools/bgd/stetson.svg" alt="Stetson" title="Stetson">

# # </div><div class="gamecenter-game-banner-scoring">

# # </div></div>
# # from datetime import datetime

# #         # Add additional columns
# #         box_individual_stats['game_id'] = game_id
# #         box_individual_stats['game_date'] = game_date
# #         box_individual_stats['data_ingestion_time'] = data_ingestion_time

