In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import csv
import re

In [2]:
# Raw data sources. Each key is the URL of the website page containing the raw data,
# and each value is a dict holding the ID of the table required on that web page.
raw_data_dict = {
    url: {'id': table_id}
    for url, table_id in (
        ('https://fbref.com/en/comps/8/2023-2024/stats/2023-2024-Champions-League-Stats', 'stats_standard'),
        ('https://fbref.com/en/comps/8/2023-2024/defense/2023-2024-Champions-League-Stats', 'stats_defense'),
        ('https://fbref.com/en/comps/8/2023-2024/misc/2023-2024-Champions-League-Stats', 'stats_misc'),
        ('https://fbref.com/en/comps/8/2023-2024/shooting/2023-2024-Champions-League-Stats', 'stats_shooting'),
        ('https://fbref.com/en/comps/8/2023-2024/gca/2023-2024-Champions-League-Stats', 'stats_gca'),
        ('https://fbref.com/en/comps/8/2023-2024/passing/2023-2024-Champions-League-Stats', 'stats_passing'),
        ('https://fbref.com/en/comps/8/2023-2024/passing_types/2023-2024-Champions-League-Stats', 'stats_passing_types'),
        ('https://fbref.com/en/comps/8/2023-2024/keepers/2023-2024-Champions-League-Stats', 'stats_keeper'),
        ('https://fbref.com/en/comps/8/2023-2024/possession/2023-2024-Champions-League-Stats', 'stats_possession'),
    )
}


In [3]:
# Set up Selenium options
options = Options()
# The `headless` property on Options was deprecated and later removed in Selenium 4;
# assigning to it on current releases has no effect and a visible browser opens.
# Passing the Chrome flag directly works regardless of the Selenium version.
options.add_argument("--headless=new")  # Run in headless mode (no GUI)
service = Service(ChromeDriverManager().install())

# On the '/stats/' page the label at this position is overwritten before
# de-duplication (presumably its scraped label collides with another column's
# label and would otherwise be dropped - TODO confirm against the live table).
STANDARD_STATS_LABEL_INDEX = 35
STANDARD_STATS_LABEL = 'npxG + xAG/90'


def create_filename(url):
    """Build the output CSV name from a page URL.

    The URL is expected to contain '/<yyyy-yyyy>/<category>/<yyyy-yyyy-name>'.
    Returns '<yyyy_yyyy_name>_<category>.csv' (hyphens in the last segment are
    replaced by underscores), or None when the URL does not match that shape.
    """
    match = re.search(r'/(\d{4}-\d{4})/([^/]+)/(\d{4}-\d{4}-[^/]+)', url)
    if not match:
        return None
    category = match.group(2)
    stats = match.group(3).replace('-', '_')
    return f"{stats}_{category}.csv"


def _extract_header_labels(table, page):
    """Return the de-duplicated, filtered column labels of `table`.

    Each <th> contributes its 'aria-label' attribute (falling back to its text),
    stripped; empty labels are ignored. Duplicates are removed keeping the first
    occurrence, purely numeric labels are discarded, and the first remaining
    label is dropped.
    """
    header_labels = []
    for header in table.find_all('th'):
        label = header.get('aria-label', header.text).strip()
        if label:
            header_labels.append(label)

    if '/stats/' in page:
        if len(header_labels) > STANDARD_STATS_LABEL_INDEX:
            header_labels[STANDARD_STATS_LABEL_INDEX] = STANDARD_STATS_LABEL
        else:
            # Do not crash the whole run if the page layout changed.
            print(f"Warning: expected more than {STANDARD_STATS_LABEL_INDEX} headers on {page}, "
                  f"found {len(header_labels)}; header fix-up skipped")

    # Remove duplicates (the same headers are repeated several times within the
    # table) while keeping first-occurrence order; dict keys preserve insertion order.
    unique_header_labels = list(dict.fromkeys(header_labels))

    # Discard purely numeric labels, then drop the first remaining label.
    return [header for header in unique_header_labels if not header.isdigit()][1:]


def _extract_rows(table):
    """Return the text of the <td> cells of every <tr> in `table` that has any."""
    data = []
    for row in table.find_all('tr'):
        row_data = [column.text for column in row.find_all('td')]
        # Rows without <td> cells (e.g. header rows made only of <th>) would
        # otherwise be written to the CSV as blank lines.
        if row_data:
            data.append(row_data)
    return data


# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Fetch the webpages
    for page, class_id in raw_data_dict.items():
        print(page, class_id)
        driver.get(page)

        # Get the page source after JavaScript has rendered
        html = driver.page_source

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Find the table whose attributes match the filter dict (here, its id)
        table = soup.find('table', attrs=class_id)

        if not table:
            print("Table not found")
            continue

        # Extract the header labels and the data rows from the table
        filtered_lst = _extract_header_labels(table, page)
        data = _extract_rows(table)

        # Creating output csv name
        csv_name = create_filename(page)
        if csv_name is None:
            # open(None, ...) would raise a TypeError; report and move on instead.
            print(f"Could not derive a CSV filename from {page}; skipping")
            continue

        # Write data to CSV
        with open(csv_name, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            # Write header labels first
            writer.writerow(filtered_lst)
            # Write the rest of the data
            writer.writerows(data)
finally:
    # Close the WebDriver even if a page fails, so no browser process is left behind
    driver.quit()

https://fbref.com/en/comps/8/2023-2024/stats/2023-2024-Champions-League-Stats {'id': 'stats_standard'}
