## Import Libraries
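Summary: This cell installs the scraping dependencies (`selenium` and `beautifulsoup4`) and imports the libraries used throughout the notebook: Selenium and BeautifulSoup for web scraping, `csv` for writing the results file, and `os` for building the output file path.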

In [None]:

## Importing Libraries
%pip install selenium beautifulsoup4
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup


## Initialize WebDriver
Summary: This cell configures Chrome to run headless and starts the Selenium WebDriver (Chromedriver). No page is opened here. Because the results pages load dynamically, the later "Extract Runner Information" cell opens each runner's URL, waits (up to 0.5 seconds in the code as written) for an element with the text "5K" to appear so that the page is fully rendered, then retrieves the rendered HTML source and parses it with BeautifulSoup.

In [None]:
# Set up Chrome options for headless browsing
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")

# Location of the "Chrome for Testing" binary. The default is the original
# author's macOS install path; set the CHROME_BINARY environment variable to
# point at a different Chrome build without editing the notebook.
chrome_binary = os.environ.get(
    "CHROME_BINARY",
    "/usr/local/bin/chrome-mac-x64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing",
)

# Only pin the binary when it actually exists on this machine. Otherwise leave
# binary_location unset so Selenium falls back to its own browser discovery
# instead of failing on a path that is not present.
if os.path.isfile(chrome_binary):
    chrome_options.binary_location = chrome_binary

# Initialize the WebDriver (shared by the scraping loop further down)
driver = webdriver.Chrome(options=chrome_options)


## Set Up CSV File Format
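This cell defines the column order of the output CSV (`ordered_fields`), builds the path of the results file in the user's home directory, and writes the header row to that file.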

In [None]:
# Desired column order for the output CSV. These strings are also the keys
# looked up in each scraped runner record.
# NOTE(review): 'Place Age‑Graded' contains a non-ASCII hyphen (appears to be
# U+2011), not an ASCII '-'; confirm it matches the label text on the results page.
ordered_fields = [
    'Bib Number', 'Place Gender', 'Place Age‑Graded', 'Gun Time', '5K', '10K', '15K', '20K', 'HALF',
    '25K', '30K', '35K', '40K', '20M', '25.2M', '26M', 'MAR', 'Official Time'
]


def ensure_csv_header(path, fieldnames):
    """Make sure the CSV at ``path`` exists and starts with exactly one header row.

    The header is written only when the file is missing or empty, so re-running
    this cell (or resuming a scrape) does not insert a second header line in
    the middle of previously collected rows.

    Args:
        path: Filesystem path of the CSV file.
        fieldnames: Column names, in output order.

    Returns:
        True if a header row was written, False if the file already had content.
    """
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    if needs_header:
        # newline='' is required by the csv module; UTF-8 is pinned because a
        # field name contains a non-ASCII character.
        with open(path, mode='a', newline='', encoding='utf-8') as csvfile:
            csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()
    return needs_header


# Define the filename and path to save the CSV file in the user's home directory
home_directory = os.path.expanduser("~")
filename = os.path.join(home_directory, 'ny_marathon_results.csv')

# Set up the CSV file with the ordered fields
ensure_csv_header(filename, ordered_fields)

## Extract Runner Information
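For each bib number, this cell loads the runner's result page, waits for the "5K" split label to appear, and parses the rendered HTML. It initializes an empty dictionary to store the split times, finds the container with the class `content-box`, and iterates through all `form-group-item` sections within it, extracting each label and its corresponding value and storing them in the `split_times` dictionary. The resulting row is appended to the CSV file; bib numbers whose page does not render in time are skipped.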

In [None]:
# Seconds to wait for the "5K" label before treating a bib number as having no
# rendered result page and skipping it.
PAGE_LOAD_TIMEOUT_SECONDS = .5

# Full range of runners in 2024 race. Not all numbers are valid due to
# no-shows or reserved numbers for elite runners.
FIRST_BIB = 1
LAST_BIB = 66899


def extract_split_times(page_source):
    """Parse a rendered result page into a ``{label: value}`` dictionary.

    Looks for the first ``div.content-box`` and, inside it, every
    ``div.form-group-item`` that has both a ``<label>`` and a
    ``<span class="label-value">``; the stripped label text becomes the key and
    the stripped span text the value.

    Args:
        page_source: HTML of the page after JavaScript has rendered.

    Returns:
        Dict mapping label text to value text; empty if no content box is found.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    content_box = soup.find("div", class_="content-box")
    if not content_box:
        return {}

    split_times = {}
    for section in content_box.find_all("div", class_="form-group-item"):
        label = section.find("label")
        value = section.find("span", class_="label-value")
        if label and value:
            split_times[label.get_text(strip=True)] = value.get_text(strip=True)
    return split_times


try:
    # Open the CSV once for the whole run instead of once per bib number.
    with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
        # extrasaction='ignore': the page may expose labels that are not in
        # ordered_fields; with the default ('raise') a single unexpected label
        # would abort the whole scrape with ValueError.
        writer = csv.DictWriter(csvfile, fieldnames=ordered_fields, extrasaction='ignore')

        # Process each runner's bib number
        for bib_number in range(FIRST_BIB, LAST_BIB + 1):
            url = f"https://results.nyrr.org/event/M2024/result/{bib_number}"
            driver.get(url)

            # Initialize data dictionary for the runner's results with None as
            # default for all ordered fields
            runner_data = {field: None for field in ordered_fields}
            runner_data['Bib Number'] = bib_number

            try:
                # Wait for the 5K split to load to ensure the page is fully rendered
                WebDriverWait(driver, PAGE_LOAD_TIMEOUT_SECONDS).until(
                    EC.presence_of_element_located((By.XPATH, "//label[text()='5K']"))
                )
            except TimeoutException:
                print(f"Timed out while processing bib number {bib_number}. Skipping...")
                continue

            # Update runner_data with the splits and other fields found on the
            # rendered page
            runner_data.update(extract_split_times(driver.page_source))

            # Append the current runner's data and flush so rows already
            # scraped survive an interrupted run.
            writer.writerow(runner_data)
            csvfile.flush()

            print(f"Processed bib number {bib_number}")
finally:
    # Close the driver even if the loop is interrupted or raises, so the
    # headless browser process is not left running.
    driver.quit()

print("All bibs processed and saved to CSV.")