In [None]:
"""
TO DO
1. Extract the each race URLs from the page year
2. Clean the list to get just the URLs with the circuit race
3. Extract the race result table date and circuit for each race URL
4. Store the tables in a dictionary
5. Clean the dictionary dropping the columns with Nan values
"""

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import chardet
import html

In [None]:
"""
Sprint races
https://www.formula1.com/en/results.html/2021/races/1072/great-britain/sprint-results.html
https://www.formula1.com/en/results.html/2021/races/1076/italy/sprint-results.html
https://www.formula1.com/en/results.html/2021/races/1104/brazil/sprint-results.html

https://www.formula1.com/en/results.html/2022/races/1109/italy/sprint-results.html
https://www.formula1.com/en/results.html/2022/races/1115/austria/sprint-results.html
https://www.formula1.com/en/results.html/2022/races/1137/brazil/sprint-results.html

https://www.formula1.com/en/results.html/2023/races/1207/azerbaijan/sprint-results.html
https://www.formula1.com/en/results.html/2023/races/1213/austria/sprint-results.html
https://www.formula1.com/en/results.html/2023/races/1216/belgium/sprint-results.html
https://www.formula1.com/en/results.html/2023/races/1221/qatar/sprint-results.html
https://www.formula1.com/en/results.html/2023/races/1222/united-states/sprint-results.html
https://www.formula1.com/en/results.html/2023/races/1224/brazil/sprint-results.html



"""


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_table_from_url(url):
    # Send a GET request to the URL
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the table on the page
        table = soup.find("table")

        # Find the date element within a <p> tag
        date_element = soup.find("p", class_="date")

        # Find the element <span class="full-date">
        full_date_element = date_element.find("span", class_="full-date") if date_element else None

        # Find the element <span class="circuit-info">
        circuit_element = soup.find("span", class_="circuit-info")

        if table:
            # Use pandas to read the HTML table into a DataFrame
            df = pd.read_html(str(table))[0]

            # Add a new column 'Date' with the value of the element <span class="full-date">
            if 'Date' in df.columns:
                # Update the existing 'Date' column
                df['Date'] = full_date_element.get_text() if full_date_element else "No date element found"
            else:
                # Add a new column 'Date' with the value of the element <span class="full-date">
                df.insert(0, "Date", full_date_element.get_text() if full_date_element else "No date element found")

            # Add a new column 'Circuit' with the value of the element <span class="circuit-info">
            if circuit_element:
                df.insert(0, "Circuit", circuit_element.get_text())
                df["Circuit"] = df["Circuit"].str.replace(',', ' -')  # Replace ',' with '-'
            else:
                df.insert(0, "Circuit", "No circuit element found")

            return df
        else:
            print("No table found on the page.")
            return None
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

# List of URLs
url_list = [
    "https://www.formula1.com/en/results.html/2021/races/1072/great-britain/sprint-results.html",
    "https://www.formula1.com/en/results.html/2021/races/1076/italy/sprint-results.html",
    "https://www.formula1.com/en/results.html/2021/races/1104/brazil/sprint-results.html",
    "https://www.formula1.com/en/results.html/2022/races/1109/italy/sprint-results.html",
    "https://www.formula1.com/en/results.html/2022/races/1115/austria/sprint-results.html",
    "https://www.formula1.com/en/results.html/2022/races/1137/brazil/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1207/azerbaijan/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1213/austria/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1216/belgium/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1221/qatar/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1222/united-states/sprint-results.html",
    "https://www.formula1.com/en/results.html/2023/races/1224/brazil/sprint-results.html"
]

# List to store DataFrames
dfs = []

# Run the code for each URL in the list
for url in url_list:
    table_data = extract_table_from_url(url)
    if table_data is not None:
        dfs.append(table_data)

# Concatenate DataFrames into one DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
print(combined_df)


                                    Circuit         Date  Unnamed: 0 Pos  No  \
0         Silverstone Circuit - Silverstone  18 Jul 2021         NaN   1  33   
1         Silverstone Circuit - Silverstone  18 Jul 2021         NaN   2  44   
2         Silverstone Circuit - Silverstone  18 Jul 2021         NaN   3  77   
3         Silverstone Circuit - Silverstone  18 Jul 2021         NaN   4  16   
4         Silverstone Circuit - Silverstone  18 Jul 2021         NaN   5   4   
..                                      ...          ...         ...  ..  ..   
234  Autódromo José Carlos Pace - São Paulo  05 Nov 2023         NaN  16  20   
235  Autódromo José Carlos Pace - São Paulo  05 Nov 2023         NaN  17  24   
236  Autódromo José Carlos Pace - São Paulo  05 Nov 2023         NaN  18  27   
237  Autódromo José Carlos Pace - São Paulo  05 Nov 2023         NaN  19  77   
238  Autódromo José Carlos Pace - São Paulo  05 Nov 2023         NaN  20   2   

                  Driver               

In [4]:
df_cleaned = combined_df.dropna(axis=1)

In [5]:
display(df_cleaned)

Unnamed: 0,Circuit,Date,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,Silverstone Circuit - Silverstone,18 Jul 2021,1,33,Max Verstappen VER,Red Bull Racing Honda,17,25:38.426,3
1,Silverstone Circuit - Silverstone,18 Jul 2021,2,44,Lewis Hamilton HAM,Mercedes,17,+1.430s,2
2,Silverstone Circuit - Silverstone,18 Jul 2021,3,77,Valtteri Bottas BOT,Mercedes,17,+7.502s,1
3,Silverstone Circuit - Silverstone,18 Jul 2021,4,16,Charles Leclerc LEC,Ferrari,17,+11.278s,0
4,Silverstone Circuit - Silverstone,18 Jul 2021,5,4,Lando Norris NOR,McLaren Mercedes,17,+24.111s,0
...,...,...,...,...,...,...,...,...,...
234,Autódromo José Carlos Pace - São Paulo,05 Nov 2023,16,20,Kevin Magnussen MAG,Haas Ferrari,24,+56.507s,0
235,Autódromo José Carlos Pace - São Paulo,05 Nov 2023,17,24,Zhou Guanyu ZHO,Alfa Romeo Ferrari,24,+58.723s,0
236,Autódromo José Carlos Pace - São Paulo,05 Nov 2023,18,27,Nico Hulkenberg HUL,Haas Ferrari,24,+60.330s,0
237,Autódromo José Carlos Pace - São Paulo,05 Nov 2023,19,77,Valtteri Bottas BOT,Alfa Romeo Ferrari,24,+60.749s,0


In [6]:
print(df_cleaned.dtypes)

Circuit         object
Date            object
Pos             object
No               int64
Driver          object
Car             object
Laps             int64
Time/Retired    object
PTS              int64
dtype: object


In [7]:
# Converting date from char to datetime 

df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], format='%d %b %Y')

print(df_cleaned.dtypes)
display(df_cleaned)


Circuit                 object
Date            datetime64[ns]
Pos                     object
No                       int64
Driver                  object
Car                     object
Laps                     int64
Time/Retired            object
PTS                      int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], format='%d %b %Y')


Unnamed: 0,Circuit,Date,Pos,No,Driver,Car,Laps,Time/Retired,PTS
0,Silverstone Circuit - Silverstone,2021-07-18,1,33,Max Verstappen VER,Red Bull Racing Honda,17,25:38.426,3
1,Silverstone Circuit - Silverstone,2021-07-18,2,44,Lewis Hamilton HAM,Mercedes,17,+1.430s,2
2,Silverstone Circuit - Silverstone,2021-07-18,3,77,Valtteri Bottas BOT,Mercedes,17,+7.502s,1
3,Silverstone Circuit - Silverstone,2021-07-18,4,16,Charles Leclerc LEC,Ferrari,17,+11.278s,0
4,Silverstone Circuit - Silverstone,2021-07-18,5,4,Lando Norris NOR,McLaren Mercedes,17,+24.111s,0
...,...,...,...,...,...,...,...,...,...
234,Autódromo José Carlos Pace - São Paulo,2023-11-05,16,20,Kevin Magnussen MAG,Haas Ferrari,24,+56.507s,0
235,Autódromo José Carlos Pace - São Paulo,2023-11-05,17,24,Zhou Guanyu ZHO,Alfa Romeo Ferrari,24,+58.723s,0
236,Autódromo José Carlos Pace - São Paulo,2023-11-05,18,27,Nico Hulkenberg HUL,Haas Ferrari,24,+60.330s,0
237,Autódromo José Carlos Pace - São Paulo,2023-11-05,19,77,Valtteri Bottas BOT,Alfa Romeo Ferrari,24,+60.749s,0


In [9]:
output_directory = 'C:/Users/bruno/Projetos Python/f1_prediction/raw_files/'

# Export DataFrame to CSV file in the specified directory
df_cleaned.to_csv(output_directory + 'sprint_races_results_2021_to_2023.csv', index=False)