In [31]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [32]:
def parse_driver_name(row):
    words = row.split()
    parsed_string = ' '.join(words[:-1])
    return parsed_string

In [33]:
fastest_laps = pd.DataFrame()

for year in range(2022, 2024):
    races_url = f'https://www.formula1.com/en/results.html/{year}/races.html'
    response = requests.get(races_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    race_links = []
    filter_links = soup.find_all('a', attrs={'class': 'resultsarchive-filter-item-link FilterTrigger'})
    for link in filter_links:
        href = link.get('href')
        if f'/en/results.html/{year}/races/' in href:
            race_links.append(href)

    year_df = pd.DataFrame()

    round_num = 1
    for race_link in race_links:
        fastest_lap_link = race_link.replace('race-result.html', 'fastest-laps.html')
        try:
            df = pd.read_html(f'https://www.formula1.com{fastest_lap_link}')[0]
            df = df[['Pos', 'Driver', 'Lap', 'Time of day', "Time", "Avg Speed"]]
        except:
            continue
        df['season'] = year
        df['round'] = round_num
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]
        year_df = pd.concat([year_df, df], ignore_index=True)
        round_num += 1

    fastest_laps = pd.concat([fastest_laps, year_df], ignore_index=True)

fastest_laps.rename(columns={'Driver':'driver','Pos': 'fastest_lap_pos', 'Lap':'fastest_lap_number', 'Time of day':'time_of_day', 'Time':'lap_time', 'Avg Speed': 'avg_speed'}, inplace=True)
print(fastest_laps.shape)


(625, 8)


In [34]:
fastest_laps["Driver"] = fastest_laps["Driver"].apply(parse_driver_name)

In [38]:
print(fastest_laps.head())

   fastest_lap_pos           Driver  fastest_lap_number time_of_day  lap_time  \
0                1  Charles Leclerc                  51    19:31:35  1:34.570   
1                2   Max Verstappen                  51    19:31:37  1:35.440   
2                3     Carlos Sainz                  52    19:33:13  1:35.740   
3                4     Sergio Perez                  52    19:33:14  1:36.089   
4                5   Lewis Hamilton                  53    19:34:51  1:36.228   

   avg_speed  season  round  
0    206.018    2022      1  
1    204.140    2022      1  
2    203.501    2022      1  
3    202.762    2022      1  
4    202.469    2022      1  


In [36]:
print(fastest_laps.columns)

Index(['fastest_lap_pos', 'Driver', 'fastest_lap_number', 'time_of_day',
       'lap_time', 'avg_speed', 'season', 'round'],
      dtype='object')
