In [None]:
"""
TO DO
1. Extract each race URL from the season (year) results page
2. Clean the list to keep only the URLs of the circuit race results
3. Extract the race result table, date and circuit for each race URL
4. Store the tables in a dictionary
5. Clean the stored tables by dropping the columns with NaN values
"""

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import chardet
import html

In [None]:
# Season results pages that can be scraped (one per year, 2018-2023)


#"https://www.formula1.com/en/results.html/2018/races.html"
#"https://www.formula1.com/en/results.html/2019/races.html"
#"https://www.formula1.com/en/results.html/2020/races.html"
#"https://www.formula1.com/en/results.html/2021/races.html"
#"https://www.formula1.com/en/results.html/2022/races.html"
#"https://www.formula1.com/en/results.html/2023/races.html"


In [None]:
def extract_menu_links(url, href_prefix=None, timeout=30):
    """Collect the absolute result links listed in a results page's filter menu.

    The page is downloaded, every ``<li class="resultsarchive-filter-item">``
    is scanned, and each ``<a>`` whose ``href`` starts with ``href_prefix`` is
    resolved against ``url`` and returned.

    Args:
        url: Address of the page to scrape, e.g.
            ``https://www.formula1.com/en/results.html/2023/races.html``.
        href_prefix: Keep only anchors whose ``href`` starts with this text.
            When omitted it is derived from the season found in ``url``
            (``/en/results.html/<year>``); if ``url`` has no such segment the
            previous fixed value ``/en/results.html/2023`` is used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Absolute URLs in page order. The list is empty when nothing
        matched or when the request/parsing failed; failures are printed, not
        raised, so a notebook run can continue.
    """
    url_list = []
    try:
        if href_prefix is None:
            # Derive the season from the URL so that changing the year in
            # `url` is enough; the links used to be filtered on 2023 only.
            marker = "/en/results.html/"
            _, found, tail = url.partition(marker)
            season = tail.split("/", 1)[0]
            if found and season.isdigit():
                href_prefix = marker + season
            else:
                href_prefix = marker + "2023"

        # A timeout keeps an unresponsive server from blocking the kernel.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        li_elements = soup.find_all("li", class_="resultsarchive-filter-item")

        if not li_elements:
            print("Elementos <li> não encontrados.")

        for li_element in li_elements:
            a_elements = li_element.find_all(
                "a",
                href=lambda href: bool(href) and href.startswith(href_prefix),
            )
            for a_element in a_elements:
                href = a_element.get("href")
                if href:
                    # Menu hrefs are site-relative; make them absolute.
                    url_list.append(urljoin(url, href))
    except Exception as e:
        # Deliberate best effort: report the problem and return what we have.
        print(f"Ocorreu um erro: {e}")

    return url_list


# Season results page whose filter menu (<li> elements) is scanned for links.
# Edit this URL to scrape a different page.
url = "https://www.formula1.com/en/results.html/2023/races.html"

# Collect the menu links found on that page
urls_list = extract_menu_links(url)

print("Lista de URLs:", urls_list)

In [None]:
# Keep only the links that point to a race-result page
filtered_urls = list(filter(lambda link: '/race-result.html' in link, urls_list))

# Show the surviving links, then how many there are (one per line)
print(filtered_urls, len(filtered_urls), sep="\n")


In [None]:
# Single race-result page used to try out the table extraction on one race;
# replace it with another race-result URL as needed.
url = 'https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html'

In [None]:
def extract_table_from_url(url, timeout=30):
    """Scrape the first HTML table of a page into a DataFrame.

    Besides the table itself, two columns are filled with page-level values:
    ``Date`` (text of ``<p class="date"> <span class="full-date">``) and
    ``Circuit`` (text of ``<span class="circuit-info">`` with every ``,``
    replaced by `` -``). A column that is missing from the table is inserted
    at position 0 (``Date`` first, then ``Circuit``, so ``Circuit`` ends up
    leftmost); a column that already exists is overwritten in place.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame | None: The table, or ``None`` (after printing the
        reason) when the request fails, the status code is not 200, or the
        page has no ``<table>``.
    """
    # Local import: the file's import cell does not provide StringIO.
    from io import StringIO

    try:
        # A timeout keeps an unresponsive server from blocking the kernel.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Same "print and return None" contract as the other failure paths,
        # so one bad page does not abort a loop over many URLs.
        print(f"Failed to retrieve the page. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # First table on the page (adjust if the HTML structure changes)
    table = soup.find("table")
    if table is None:
        print("No table found on the page.")
        return None

    date_element = soup.find("p", class_="date")
    full_date_element = (
        date_element.find("span", class_="full-date") if date_element is not None else None
    )
    circuit_element = soup.find("span", class_="circuit-info")

    # read_html expects a file-like object; passing literal HTML text is
    # deprecated in recent pandas releases.
    df = pd.read_html(StringIO(str(table)))[0]

    date_text = (
        full_date_element.get_text() if full_date_element is not None else "No date element found"
    )
    if circuit_element is not None:
        circuit_text = circuit_element.get_text().replace(",", " -")
    else:
        circuit_text = "No circuit element found"

    # Treat both columns the same way: overwrite when the table already has
    # the column (DataFrame.insert would raise on a duplicate), else prepend.
    for column, value in (("Date", date_text), ("Circuit", circuit_text)):
        if column in df.columns:
            df[column] = value
        else:
            df.insert(0, column, value)

    return df

# Try the extraction on the single page currently held in `url`
table_data = extract_table_from_url(url)

# None means the page could not be fetched or had no table; show nothing then
if table_data is not None:
    display(table_data)


In [None]:
# Scrape every race page; pages that produced no table (None) are left out
tables_dict = {}
for url in filtered_urls:
    table_data = extract_table_from_url(url)
    if table_data is None:
        continue
    tables_dict[url] = table_data  # keyed by the page URL

# Stack all race tables into one frame, using the URLs as concat keys
result_df = pd.concat(list(tables_dict.values()), keys=list(tables_dict))

display(result_df)

In [None]:
# axis=1: drop the columns (not rows) that contain missing values
df_cleaned = result_df.dropna(axis=1)

In [None]:
# Show the frame that remains after the NaN columns were dropped
display(df_cleaned)

In [None]:
# Inspect the column dtypes of the cleaned frame
print(df_cleaned.dtypes)

In [None]:
# Convert the 'Date' column from text in '%d %b %Y' form to datetime.
# `assign` returns a new frame instead of writing a column into the frame
# produced by `dropna`, which avoids pandas' SettingWithCopyWarning and does
# not mutate an object that an earlier cell already displayed.
df_cleaned = df_cleaned.assign(
    Date=pd.to_datetime(df_cleaned['Date'], format='%d %b %Y')
)

print(df_cleaned.dtypes)
display(df_cleaned)


In [None]:
# NOTE(review): absolute, machine-specific location; the export fails if this
# directory does not exist on the machine running the notebook.
output_directory = 'C:/Users/bruno/Projetos Python/F1_Results_Web_Scraping/'

# Write the cleaned results as CSV into that directory, without the index
df_cleaned.to_csv(f'{output_directory}races_results_2023.csv', index=False)