# Scraping Race Data from https://www.formula1.com/

In [78]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [79]:
def parse_circuit_name(link):
    """Return the display name encoded in a race-result link.

    The name is read from the seventh ``/``-separated piece of ``link``
    (index 6), e.g. ``saudi-arabia`` in
    ``/en/results.html/2022/races/1125/saudi-arabia/race-result.html``.
    Hyphens become spaces and the first letter of every word is upper-cased;
    the remaining letters are left exactly as they were.

    Args:
        link: Site-relative href of a race-result page.

    Returns:
        The formatted name, e.g. ``'Saudi Arabia'``.

    Raises:
        IndexError: If ``link`` has fewer than seven ``/``-separated pieces.
    """
    slug = link.split('/')[6]
    # word[:1] (rather than word[0]) keeps empty pieces -- from an empty slug
    # or doubled hyphens -- from raising IndexError.
    return ' '.join(word[:1].upper() + word[1:] for word in slug.split('-'))

def parse_driver_name(row):
    """Return ``row`` without its final whitespace-separated token.

    The remaining tokens are re-joined with single spaces, so
    ``'Charles  Leclerc  LEC'`` becomes ``'Charles Leclerc'``.
    """
    return ' '.join(row.split()[:-1])

def parse_car_name(row):
    """Return ``row`` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    return ' '.join(row.split())

In [80]:
# Scrape the race-result table of every race listed for the 2022 and 2023
# seasons. `race_frames` collects one DataFrame per race; `race_list` gets one
# (season, round, circuit) row for every race whose table loaded.
race_frames = []
race_list = pd.DataFrame(columns=['season', 'round', 'circuit'])

for year in range(2022, 2024):
    races_url = f'https://www.formula1.com/en/results.html/{year}/races.html'
    # A timeout keeps an unresponsive server from hanging the cell forever.
    response = requests.get(races_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only the filter links that point at a race of this season.
    # Anchors without an href yield None and are skipped instead of raising.
    filter_links = soup.find_all('a', attrs={'class': 'resultsarchive-filter-item-link FilterTrigger'})
    hrefs = (link.get('href') for link in filter_links)
    race_links = [href for href in hrefs if href and f'/en/results.html/{year}/races/' in href]

    # round_num only advances for races whose table loaded, so a skipped link
    # does not consume a round number.
    round_num = 1
    for race_link in race_links:
        circuit = parse_circuit_name(race_link)
        try:
            df = pd.read_html(f'https://www.formula1.com{race_link}')[0]
        except (ValueError, OSError) as exc:
            # ValueError: the page holds no table; OSError: the download failed
            # (urllib's URLError/HTTPError derive from it). Report the skip
            # rather than hiding it, and let Ctrl-C and real bugs propagate.
            print(f'Skipping {race_link}: {exc}')
            continue

        df['season'] = year
        df['round'] = round_num
        df['circuit'] = circuit

        # Drop columns whose header contains 'Unnamed' (presumably the blank
        # spacer columns of the HTML table -- confirm against the page).
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]

        race_frames.append(df)
        race_list.loc[len(race_list)] = [year, round_num, circuit]
        print(len(race_list))

        round_num += 1

# Concatenate once at the end instead of re-copying the growing frame on every
# iteration; fall back to an empty frame when nothing could be scraped.
race_results = pd.concat(race_frames, ignore_index=True) if race_frames else pd.DataFrame()

print(race_results.shape)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
(640, 10)


In [81]:
# Show the (season, round, circuit) rows recorded by the scraping loop.
print(race_list)

    season  round        circuit
0     2022      1        Bahrain
1     2022      2   Saudi Arabia
2     2022      3      Australia
3     2022      4          Italy
4     2022      5          Miami
5     2022      6          Spain
6     2022      7         Monaco
7     2022      8     Azerbaijan
8     2022      9         Canada
9     2022     10  Great Britain
10    2022     11        Austria
11    2022     12         France
12    2022     13        Hungary
13    2022     14        Belgium
14    2022     15    Netherlands
15    2022     16          Italy
16    2022     17      Singapore
17    2022     18          Japan
18    2022     19  United States
19    2022     20         Mexico
20    2022     21         Brazil
21    2022     22      Abu Dhabi
22    2023      1        Bahrain
23    2023      2   Saudi Arabia
24    2023      3      Australia
25    2023      4     Azerbaijan
26    2023      5          Miami
27    2023      6         Monaco
28    2023      7          Spain
29    2023

In [82]:
# Inspect the scraped results as returned by the scraping loop.
print(race_results)

    Pos  No                 Driver                    Car  Laps Time/Retired  \
0     1  16  Charles  Leclerc  LEC                Ferrari    57  1:37:33.584   
1     2  55     Carlos  Sainz  SAI                Ferrari    57      +5.598s   
2     3  44   Lewis  Hamilton  HAM               Mercedes    57      +9.675s   
3     4  63   George  Russell  RUS               Mercedes    57     +11.211s   
4     5  20  Kevin  Magnussen  MAG           Haas Ferrari    57     +14.754s   
..   ..  ..                    ...                    ...   ...          ...   
635  16  22     Yuki  Tsunoda  TSU  AlphaTauri Honda RBPT    52     +31.225s   
636  17  21    Nyck  De Vries  DEV  AlphaTauri Honda RBPT    52     +33.128s   
637  18  10     Pierre  Gasly  GAS         Alpine Renault    46          DNF   
638  NC  20  Kevin  Magnussen  MAG           Haas Ferrari    31          DNF   
639  NC  31     Esteban  Ocon  OCO         Alpine Renault     9          DNF   

     PTS  season  round        circuit 

In [83]:
# Clean the two text columns, then switch every scraped header to its
# snake_case name.
column_cleaners = (("Driver", parse_driver_name), ("Car", parse_car_name))
for column, cleaner in column_cleaners:
    race_results[column] = race_results[column].apply(cleaner)

renamed_columns = {
    'Pos': 'finishing_position',
    'No': 'car_number',
    'Driver': 'driver_name',
    'Car': 'car',
    'Laps': 'laps',
    'Time/Retired': 'race_time',
    'PTS': 'points',
}
race_results.rename(columns=renamed_columns, inplace=True)

In [84]:
# Re-inspect the results to check the cleaned names and renamed columns.
print(race_results)

    finishing_position  car_number      driver_name                    car  \
0                    1          16  Charles Leclerc                Ferrari   
1                    2          55     Carlos Sainz                Ferrari   
2                    3          44   Lewis Hamilton               Mercedes   
3                    4          63   George Russell               Mercedes   
4                    5          20  Kevin Magnussen           Haas Ferrari   
..                 ...         ...              ...                    ...   
635                 16          22     Yuki Tsunoda  AlphaTauri Honda RBPT   
636                 17          21    Nyck De Vries  AlphaTauri Honda RBPT   
637                 18          10     Pierre Gasly         Alpine Renault   
638                 NC          20  Kevin Magnussen           Haas Ferrari   
639                 NC          31     Esteban Ocon         Alpine Renault   

     laps    race_time  points  season  round        circuit  


In [85]:
# Write the race list to CSV without the DataFrame's row index.
# NOTE(review): the absolute path is specific to one machine -- adjust it
# before running this notebook elsewhere.
race_list.to_csv('/Users/anirudhkrishna/GitHub/FormulaData/race_list.csv', index=False)