# Scraping Qualifying Data from https://www.formula1.com/

In [44]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Output directory for the generated CSV. The export step builds the file path
# by plain string concatenation, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
create_data_here = '/Users/anirudhkrishna/GitHub/FormulaData/csv-data/'

In [45]:
def parse_driver_name(row):
    """Return a scraped driver cell without its trailing code token.

    The scraped ``Driver`` cells look like ``"Juan Manuel  Fangio  FAN"``:
    the name, irregular whitespace, then a short driver code. The last
    whitespace-separated token is dropped and the remaining tokens are
    re-joined with single spaces.

    Args:
        row: The raw cell value. Expected to be a string, but missing cells
            (e.g. NaN) are tolerated.

    Returns:
        The cleaned name as a string. A value with zero or one token yields
        an empty string. Non-string input is returned unchanged, matching
        how ``parse_car_name`` treats values it cannot split.
    """
    # Missing cells arrive as float NaN, which has no .split(); pass them
    # through instead of raising AttributeError in the middle of an .apply().
    if not isinstance(row, str):
        return row
    words = row.split()
    return ' '.join(words[:-1])

def parse_car_name(row):
    """Collapse runs of whitespace in a scraped car/constructor cell.

    Args:
        row: The raw cell value. Expected to be a string, but missing cells
            (e.g. NaN) are tolerated.

    Returns:
        The value with leading/trailing whitespace removed and every internal
        whitespace run replaced by a single space. Non-string input is
        returned unchanged.
    """
    # An explicit type check replaces the former bare ``except:``, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if not isinstance(row, str):
        return row
    return ' '.join(row.split())

In [46]:
# Scrape the starting-grid table of every race from 1951 through 2023 and
# stack them into one DataFrame tagged with season and round.
#
# One DataFrame per race is collected in a list and concatenated once at the
# end; calling pd.concat inside the loops re-copies every row gathered so far.
grid_frames = []

for year in range(1951, 2024):
    races_url = f'https://www.formula1.com/en/results.html/{year}/races.html'
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(races_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only the filter links that point at a race of this season.
    # Anchors without an href yield None from .get(), so they are skipped
    # before the substring test (``in None`` would raise TypeError).
    race_prefix = f'/en/results.html/{year}/races/'
    filter_links = soup.find_all('a', attrs={'class': 'resultsarchive-filter-item-link FilterTrigger'})
    race_links = [
        href
        for href in (link.get('href') for link in filter_links)
        if href and race_prefix in href
    ]

    # The round number is the race's position in the season listing. It is
    # taken from enumerate so that a race whose grid cannot be fetched does
    # not shift the round number of every later race in that season.
    for round_num, race_link in enumerate(race_links, start=1):
        grid_link = race_link.replace('race-result.html', 'starting-grid.html')
        grid_url = f'https://www.formula1.com{grid_link}'
        try:
            df = pd.read_html(grid_url)[0]
        except (ValueError, OSError) as err:
            # ValueError: the page contains no table.
            # OSError: covers urllib's HTTPError/URLError for failed fetches.
            # Anything else (e.g. a missing parser library) should surface.
            print(f'Skipping {grid_url}: {err}')
            continue
        df['season'] = year
        df['round'] = round_num
        # Drop the unnamed spacer columns that come along with the HTML table.
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]
        grid_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
if grid_frames:
    qualifying_results = pd.concat(grid_frames, ignore_index=True)
else:
    qualifying_results = pd.DataFrame()

print(qualifying_results.shape)


(24205, 10)


In [47]:
# Preview the table as scraped, before the name columns are cleaned and renamed.
print(qualifying_results)


      Pos  No                      Driver                    Car      Time  \
0       1  24    Juan Manuel  Fangio  FAN             Alfa Romeo  2:35.900   
1       2  22           Nino  Farina  FAR             Alfa Romeo       NaN   
2       3  18        Luigi Villoresi  VIL                Ferrari       NaN   
3       4  28        Consalvo Sanesi  SAN             Alfa Romeo       NaN   
4       5  26  Toulo  de Graffenried  DEG             Alfa Romeo       NaN   
...    ..  ..                         ...                    ...       ...   
24200  16  22          Yuki  Tsunoda  TSU  AlphaTauri Honda RBPT  1:30.025   
24201  17  24           Zhou  Guanyu  ZHO     Alfa Romeo Ferrari  1:30.123   
24202  18  21         Nyck  De Vries  DEV  AlphaTauri Honda RBPT  1:30.513   
24203  19  20       Kevin  Magnussen  MAG           Haas Ferrari  1:32.378   
24204  20  77       Valtteri  Bottas  BOT     Alfa Romeo Ferrari       NaN   

       season  round  Laps Time/Retired  PTS  
0        1951   

In [48]:
# Clean the scraped text columns with their parsers, then rename the scraped
# headers to snake_case names.
for column, cleaner in (('Driver', parse_driver_name), ('Car', parse_car_name)):
    qualifying_results[column] = qualifying_results[column].apply(cleaner)

renamed_columns = {
    'Pos': 'grid_position',
    'No': 'car_number',
    'Driver': 'driver_name',
    'Car': 'constructor',
    'Time': 'qualifying_time',
}
qualifying_results.rename(columns=renamed_columns, inplace=True)

In [49]:
# Narrow the table to the five columns of interest, in this order, and show it.
kept_columns = ['season', 'round', 'grid_position', 'driver_name', 'qualifying_time']
qualifying_results = qualifying_results[kept_columns]
print(qualifying_results)

       season  round grid_position           driver_name qualifying_time
0        1951      1             1    Juan Manuel Fangio        2:35.900
1        1951      1             2           Nino Farina             NaN
2        1951      1             3       Luigi Villoresi             NaN
3        1951      1             4       Consalvo Sanesi             NaN
4        1951      1             5  Toulo de Graffenried             NaN
...       ...    ...           ...                   ...             ...
24200    2023     10            16          Yuki Tsunoda        1:30.025
24201    2023     10            17           Zhou Guanyu        1:30.123
24202    2023     10            18         Nyck De Vries        1:30.513
24203    2023     10            19       Kevin Magnussen        1:32.378
24204    2023     10            20       Valtteri Bottas             NaN

[24205 rows x 5 columns]


In [50]:
# Write the table to the output directory, leaving out the DataFrame index.
output_path = create_data_here + 'qualifying_results.csv'
qualifying_results.to_csv(output_path, index=False)