# Scraping Qualifying Data from https://www.formula1.com/

In [98]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [100]:
def parse_driver_name(row):
    """Return the driver's name without the trailing token.

    The scraped ``Driver`` cell looks like ``"Charles  Leclerc  LEC"``: the
    name followed by an abbreviation. The text is split on whitespace, the
    last token is discarded, and the rest is re-joined with single spaces.
    A string with zero or one token yields ``''``.
    """
    name_tokens = row.split()[:-1]
    return ' '.join(name_tokens)

def parse_car_name(row):
    """Return ``row`` with whitespace normalised.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space.
    """
    return ' '.join(row.split())

In [99]:
# Collect the starting-grid table of every race listed on formula1.com for
# the 2022 and 2023 seasons into a single DataFrame, tagging each row with
# its season and round.
#
# Frames are gathered in a list and concatenated once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all rows on every pass.
season_frames = []

for year in range(2022, 2024):
    races_url = f'https://www.formula1.com/en/results.html/{year}/races.html'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(races_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only the filter links that point at an individual race of this season.
    race_path = f'/en/results.html/{year}/races/'
    filter_links = soup.find_all('a', attrs={'class': 'resultsarchive-filter-item-link FilterTrigger'})
    race_links = [
        href
        for href in (link.get('href') for link in filter_links)
        # An anchor without an href yields None, which cannot be tested with `in`.
        if href and race_path in href
    ]

    # The round counter only advances when a grid table was actually loaded,
    # so a link without a usable table does not consume a round number.
    round_num = 1
    for race_link in race_links:
        grid_link = race_link.replace('race-result.html', 'starting-grid.html')
        grid_url = f'https://www.formula1.com{grid_link}'
        try:
            df = pd.read_html(grid_url)[0]
        except Exception as exc:
            # Best effort: a page that cannot be fetched or has no table is
            # skipped, but the failure is reported instead of being hidden.
            # Unlike a bare `except:`, this lets KeyboardInterrupt through.
            print(f'Skipping {grid_url}: {exc!r}')
            continue
        df['season'] = year
        df['round'] = round_num
        # Drop the unnamed filler columns that read_html picks up from the page.
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]
        season_frames.append(df)
        round_num += 1

# pd.concat raises on an empty list, so fall back to an empty frame.
if season_frames:
    qualifying_results = pd.concat(season_frames, ignore_index=True)
else:
    qualifying_results = pd.DataFrame()

print(qualifying_results.shape)


(634, 7)


In [101]:
print(qualifying_results)

     Pos  No                 Driver                    Car      Time  season  \
0      1  16  Charles  Leclerc  LEC                Ferrari  1:30.558    2022   
1      2   1   Max  Verstappen  VER   Red Bull Racing RBPT  1:30.681    2022   
2      3  55     Carlos  Sainz  SAI                Ferrari  1:30.687    2022   
3      4  11     Sergio  Perez  PER   Red Bull Racing RBPT  1:30.921    2022   
4      5  44   Lewis  Hamilton  HAM               Mercedes  1:31.238    2022   
..   ...  ..                    ...                    ...       ...     ...   
629   16  22     Yuki  Tsunoda  TSU  AlphaTauri Honda RBPT  1:30.025    2023   
630   17  24      Zhou  Guanyu  ZHO     Alfa Romeo Ferrari  1:30.123    2023   
631   18  21    Nyck  De Vries  DEV  AlphaTauri Honda RBPT  1:30.513    2023   
632   19  20  Kevin  Magnussen  MAG           Haas Ferrari  1:32.378    2023   
633   20  77  Valtteri  Bottas  BOT     Alfa Romeo Ferrari       NaN    2023   

     round  
0        1  
1        1  


In [102]:
# Clean the two text columns with their parser, then switch the scraped
# column headers to snake_case names.
for column, parser in (("Driver", parse_driver_name), ("Car", parse_car_name)):
    qualifying_results[column] = qualifying_results[column].apply(parser)

column_renames = {
    'Pos': 'grid_position',
    'No': 'car_number',
    'Driver': 'driver_name',
    'Car': 'constructor',
    'Time': 'qualifying_time',
}
qualifying_results.rename(columns=column_renames, inplace=True)

In [104]:
# Keep only the columns needed downstream, in this order, and show the result.
selected_columns = ['season', 'round', 'grid_position', 'driver_name', 'qualifying_time']
qualifying_results = qualifying_results[selected_columns]
print(qualifying_results)

     season  round  grid_position      driver_name qualifying_time
0      2022      1              1  Charles Leclerc        1:30.558
1      2022      1              2   Max Verstappen        1:30.681
2      2022      1              3     Carlos Sainz        1:30.687
3      2022      1              4     Sergio Perez        1:30.921
4      2022      1              5   Lewis Hamilton        1:31.238
..      ...    ...            ...              ...             ...
629    2023     10             16     Yuki Tsunoda        1:30.025
630    2023     10             17      Zhou Guanyu        1:30.123
631    2023     10             18    Nyck De Vries        1:30.513
632    2023     10             19  Kevin Magnussen        1:32.378
633    2023     10             20  Valtteri Bottas             NaN

[634 rows x 5 columns]
