In [1]:
"""
TO DO
1. Extract the each race URLs from the page year
2. Clean the list to get just the URLs with the circuit race
3. Extract the race result table date and circuit for each race URL
4. Store the tables in a dictionary
5. Clean the dictionary dropping the columns with Nan values
"""

'\nTO DO\n1. Extract the each race URLs from the page year\n2. Clean the list to get just the URLs with the circuit race\n3. Extract the race result table date and circuit for each race URL\n4. Store the tables in a dictionary\n5. Clean the dictionary dropping the columns with Nan values\n'

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import chardet
import html

In [3]:
def extract_menu_links(url):
    url_list = []  # Initialize an empty list to store the URLs
    try:
        # Makes a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Checks if the request was successful

        #  Creates a BeautifulSoup object from the page content
        soup = BeautifulSoup(response.text, "html.parser")
        # Finds the elements <li> with the specific class
        li_elements = soup.find_all("li", class_="resultsarchive-filter-item")

        if li_elements:
            # Extracts and adds the URLs to the list inside the <li> elements
            for li_element in li_elements:
                # Finds the elements <a> inside each <li>
                a_elements = li_element.find_all(
                    "a",
                    href=lambda href: href and href.startswith("/en/results.html/2023"),
                )

                # Add URLs to the list
                for a_element in a_elements:
                    href = a_element.get("href")
                    if href:
                        # Uses urljoin to create URLs
                        absolute_url = urljoin(url, href)
                        url_list.append(absolute_url)
        else:
            print("Elementos <li> não encontrados.")
    except Exception as e:
        print(f"Ocorreu um erro: {e}")

    return url_list


# URL page with elements <li>
url = "https://www.formula1.com/en/results.html/2023/races.html"

# Calls function
urls_list = extract_menu_links(url)

print("Lista de URLs:", urls_list)

Lista de URLs: ['https://www.formula1.com/en/results.html/2023/races.html', 'https://www.formula1.com/en/results.html/2023/races.html', 'https://www.formula1.com/en/results.html/2023/drivers.html', 'https://www.formula1.com/en/results.html/2023/team.html', 'https://www.formula1.com/en/results.html/2023/fastest-laps.html', 'https://www.formula1.com/en/results.html/2023/races.html', 'https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1142/saudi-arabia/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1143/australia/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1207/azerbaijan/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1208/miami/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1209/italy/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1210/monaco/race-result.html', 'https://www.for

In [4]:
# Filtering only racing URLs
filtered_urls = [url for url in urls_list if '/race-result.html' in url]

# Print filtered URLs
print(filtered_urls)

# Print the length of filtered URLs
print(len(filtered_urls))


['https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1142/saudi-arabia/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1143/australia/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1207/azerbaijan/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1208/miami/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1209/italy/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1210/monaco/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1211/spain/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1212/canada/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1213/austria/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/1214/great-britain/race-result.html', 'https://www.formula1.com/en/results.html/2023/races/121

In [5]:
url = "https://www.formula1.com/en/results.html/2023/races/1223/mexico/race-result.html"

In [6]:
def extract_table_from_url(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the table on the page (you might need to adjust this based on the HTML structure)
        table = soup.find("table")

        # Find the date element within a <p> tag
        date_element = soup.find("p", class_="date")

        # Find the element <span class="full-date">
        full_date_element = date_element.find("span", class_="full-date") if date_element else None

        # Find the element <span class="circuit-info">
        circuit_element = soup.find("span", class_="circuit-info")

        if table:
            # Use pandas to read the HTML table into a DataFrame
            df = pd.read_html(str(table))[0]

            # Add a new column 'Date' with the value of the element <span class="full-date">
            if 'Date' in df.columns:
                # Update the existing 'Date' column
                df['Date'] = full_date_element.get_text() if full_date_element else "No date element found"
            else:
                # Add a new column 'Date' with the value of the element <span class="full-date">
                df.insert(0, "Date", full_date_element.get_text() if full_date_element else "No date element found")


            # Add a new column 'Circuit' with the value of the element <span class="circuit-info">
            if circuit_element:
                df.insert(0, "Circuit", circuit_element.get_text())
                df["Circuit"] = df["Circuit"].str.replace(',', ' -')  # Replace ',' with '-'
            else:
                df.insert(0, "Circuit", "No circuit element found")

            return df
        else:
            print("No table found on the page.")
            return None
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

table_data = extract_table_from_url(url)

if table_data is not None:
    display(table_data)


Unnamed: 0.1,Circuit,Date,Unnamed: 0,Pos,No,Driver,Car,Laps,Time/Retired,PTS,Unnamed: 8
0,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,1,1,Max Verstappen VER,Red Bull Racing Honda RBPT,71,2:02:30.814,25,
1,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,2,44,Lewis Hamilton HAM,Mercedes,71,+13.875s,19,
2,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,3,16,Charles Leclerc LEC,Ferrari,71,+23.124s,15,
3,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,4,55,Carlos Sainz SAI,Ferrari,71,+27.154s,12,
4,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,5,4,Lando Norris NOR,McLaren Mercedes,71,+33.266s,10,
5,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,6,63,George Russell RUS,Mercedes,71,+41.020s,8,
6,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,7,3,Daniel Ricciardo RIC,AlphaTauri Honda RBPT,71,+41.570s,6,
7,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,8,81,Oscar Piastri PIA,McLaren Mercedes,71,+43.104s,4,
8,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,9,23,Alexander Albon ALB,Williams Mercedes,71,+48.573s,2,
9,Autódromo Hermanos Rodríguez - Mexico City,29 Oct 2023,,10,31,Esteban Ocon OCO,Alpine Renault,71,+62.879s,1,


In [7]:
tables_dict = {}

# Loop through each URL
for url in filtered_urls:
    table_data = extract_table_from_url(url)

    if table_data is not None:
        # Store the table in the dictionary with the URL as the key
        tables_dict[url] = table_data

# Concatenate the tables into a single DataFrame
result_df = pd.concat(tables_dict.values(), keys=tables_dict.keys())

# Display the concatenated DataFrame
print(result_df)

No table found on the page.
                                                                                      Circuit  \
https://www.formula1.com/en/results.html/2023/r... 0   Bahrain International Circuit - Sakhir   
                                                   1   Bahrain International Circuit - Sakhir   
                                                   2   Bahrain International Circuit - Sakhir   
                                                   3   Bahrain International Circuit - Sakhir   
                                                   4   Bahrain International Circuit - Sakhir   
...                                                                                       ...   
https://www.formula1.com/en/results.html/2023/r... 15         Yas Marina Circuit - Yas Island   
                                                   16         Yas Marina Circuit - Yas Island   
                                                   17         Yas Marina Circuit - Yas Island   
  

In [8]:
df_cleaned = result_df.dropna(axis=1)

In [9]:
display(df_cleaned)

Unnamed: 0,Unnamed: 1,Circuit,Date,Pos,No,Driver,Car,Laps,Time/Retired,PTS
https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html,0,Bahrain International Circuit - Sakhir,05 Mar 2023,1,1,Max Verstappen VER,Red Bull Racing Honda RBPT,57,1:33:56.736,25
https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html,1,Bahrain International Circuit - Sakhir,05 Mar 2023,2,11,Sergio Perez PER,Red Bull Racing Honda RBPT,57,+11.987s,18
https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html,2,Bahrain International Circuit - Sakhir,05 Mar 2023,3,14,Fernando Alonso ALO,Aston Martin Aramco Mercedes,57,+38.637s,15
https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html,3,Bahrain International Circuit - Sakhir,05 Mar 2023,4,55,Carlos Sainz SAI,Ferrari,57,+48.052s,12
https://www.formula1.com/en/results.html/2023/races/1141/bahrain/race-result.html,4,Bahrain International Circuit - Sakhir,05 Mar 2023,5,44,Lewis Hamilton HAM,Mercedes,57,+50.977s,10
...,...,...,...,...,...,...,...,...,...,...
https://www.formula1.com/en/results.html/2023/races/1226/abu-dhabi/race-result.html,15,Yas Marina Circuit - Yas Island,26 Nov 2023,16,2,Logan Sargeant SAR,Williams Mercedes,58,+87.791s,0
https://www.formula1.com/en/results.html/2023/races/1226/abu-dhabi/race-result.html,16,Yas Marina Circuit - Yas Island,26 Nov 2023,17,24,Zhou Guanyu ZHO,Alfa Romeo Ferrari,58,+89.422s,0
https://www.formula1.com/en/results.html/2023/races/1226/abu-dhabi/race-result.html,17,Yas Marina Circuit - Yas Island,26 Nov 2023,18,55,Carlos Sainz SAI,Ferrari,57,DNF,0
https://www.formula1.com/en/results.html/2023/races/1226/abu-dhabi/race-result.html,18,Yas Marina Circuit - Yas Island,26 Nov 2023,19,77,Valtteri Bottas BOT,Alfa Romeo Ferrari,57,+1 lap,0


In [10]:
output_directory = 'C:/Users/bruno/Projetos Python/F1_Results_Web_Scraping/'

# Export DataFrame to CSV file in the specified directory
df_cleaned.to_csv(output_directory + 'races_results_2023.csv', index=False)