# Scraping Race Data from https://www.formula1.com/

In [41]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Output directory for the generated CSV files. File names are appended to this
# string with plain `+` concatenation further down, so it must end with a path
# separator.
create_data_here = '/Users/anirudhkrishna/GitHub/FormulaData/csv-data/'

In [42]:
def parse_location_name(link):
    """Build a display name for a race location from its results link.

    The component at index 6 of ``link.split('/')`` is treated as the
    location slug (e.g. ``great-britain``). The slug is split on ``-``, the
    first character of every word is upper-cased (the rest of the word is
    left untouched), and the words are joined with single spaces.

    Empty words, produced by an empty slug or by doubled, leading or
    trailing hyphens, are skipped rather than indexed, so they no longer
    raise ``IndexError``.

    Args:
        link: A '/'-separated path string with at least seven components.

    Returns:
        The capitalised, space-separated location name; ``''`` when the slug
        holds no words.

    Raises:
        IndexError: If ``link`` has fewer than seven '/'-separated components.
    """
    slug = link.split('/')[6]
    # `if word` guards the word[0] access against empty segments.
    words = [word[0].upper() + word[1:] for word in slug.split('-') if word]
    return ' '.join(words)

def parse_driver_name(row):
    """Return a driver cell with its last word removed and spacing normalised.

    The cell is split on whitespace, the final word is dropped (in the
    scraped table this is the three-letter driver code, e.g. ``FAR``), and
    the remaining words are joined with single spaces.

    Values without a ``split`` method (for example ``None`` or a float NaN
    standing in for a missing cell) are returned unchanged, matching how
    ``parse_car_name`` treats such cells, instead of raising
    ``AttributeError``.

    Args:
        row: The raw cell value, normally a string.

    Returns:
        The cleaned name, ``''`` when the cell holds at most one word, or
        ``row`` itself when it is not string-like.
    """
    try:
        words = row.split()
    except AttributeError:
        # Not a string: nothing to parse, hand the value back untouched.
        return row
    return ' '.join(words[:-1])

def parse_car_name(row):
    """Collapse runs of whitespace in a car/constructor cell to single spaces.

    Args:
        row: The raw cell value, normally a string.

    Returns:
        The whitespace-normalised string, or ``row`` unchanged when it has no
        ``split`` method (for example ``None`` or a float NaN).
    """
    try:
        words = row.split()
    except AttributeError:
        # Only the "not a string" case is expected here; a bare `except`
        # would also swallow KeyboardInterrupt and SystemExit.
        return row
    return ' '.join(words)

In [43]:
# Scrape one results table per race, season by season, and build two frames:
#   race_results - every result row, tagged with season / round / location
#   race_list    - one row per scraped race (season, round, location)
# Frames and rows are gathered in plain lists and assembled once at the end;
# growing a DataFrame inside the loop re-copies all earlier rows every time.
result_frames = []  # one DataFrame per successfully scraped race
race_rows = []      # [season, round, location] per successfully scraped race

for year in range(1950, 2024):
    races_url = f'https://www.formula1.com/en/results.html/{year}/races.html'
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(races_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep only the filter links that point at an individual race of this
    # season, remembering each link's text as the candidate location name.
    race_prefix = f'/en/results.html/{year}/races/'
    season_races = []
    filter_links = soup.find_all('a', attrs={'class': 'resultsarchive-filter-item-link FilterTrigger'})
    for link in filter_links:
        href = link.get('href')
        # `href` is None for an anchor without that attribute.
        if href and race_prefix in href:
            season_races.append((href, link.text))

    # The round counter only advances for races that were actually scraped,
    # so a skipped race does not leave a gap in the numbering.
    round_num = 1
    for race_link, link_text in season_races:
        location = link_text.strip()
        if location == '':
            # No visible link text: fall back to the slug in the URL.
            location = parse_location_name(race_link)
        try:
            df = pd.read_html(f'https://www.formula1.com{race_link}')[0]
        except Exception as exc:
            # Best effort: a race page that cannot be fetched or parsed is
            # skipped, but reported instead of being dropped silently.
            print(f'Skipping {year} {location}: {exc!r}')
            continue

        df['season'] = year
        df['round'] = round_num
        df['location'] = location

        # Drop the unnamed spacer columns that come with the HTML table.
        df = df.loc[:, ~df.columns.str.contains('Unnamed')]

        result_frames.append(df)
        race_rows.append([year, round_num, location])

        round_num += 1

# pd.concat raises ValueError on an empty list, so fall back to an empty frame.
race_results = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
race_list = pd.DataFrame(race_rows, columns=['season', 'round', 'location'])

print(race_results.shape)

(24235, 10)


In [44]:
# Display the race index built during scraping (columns: season, round, location).
print(race_list)

      season  round          location
0       1950      1     Great Britain
1       1950      2            Monaco
2       1950      3  Indianapolis 500
3       1950      4       Switzerland
4       1950      5           Belgium
...      ...    ...               ...
1084    2023      6            Monaco
1085    2023      7             Spain
1086    2023      8            Canada
1087    2023      9           Austria
1088    2023     10     Great Britain

[1089 rows x 3 columns]


In [45]:
# Display the scraped results as they came from the site, before the driver/car
# clean-up and the column renaming below.
print(race_results)

      Pos  No                       Driver                    Car  Laps  \
0       1   2            Nino  Farina  FAR             Alfa Romeo  70.0   
1       2   3          Luigi  Fagioli  FAG             Alfa Romeo  70.0   
2       3   4             Reg Parnell  PAR             Alfa Romeo  70.0   
3       4  14  Yves  Giraud-Cabantous  GIR            Talbot-Lago  68.0   
4       5  15            Louis Rosier  ROS            Talbot-Lago  68.0   
...    ..  ..                          ...                    ...   ...   
24230  16  22           Yuki  Tsunoda  TSU  AlphaTauri Honda RBPT  52.0   
24231  17  21          Nyck  De Vries  DEV  AlphaTauri Honda RBPT  52.0   
24232  18  10           Pierre  Gasly  GAS         Alpine Renault  46.0   
24233  NC  20        Kevin  Magnussen  MAG           Haas Ferrari  31.0   
24234  NC  31           Esteban  Ocon  OCO         Alpine Renault   9.0   

      Time/Retired  PTS  season  round       location  
0      2:13:23.600  9.0    1950      1  Gre

In [46]:
# Clean the two free-text columns in place: each one is passed, cell by cell,
# through its dedicated parser.
for column, cleaner in (("Driver", parse_driver_name), ("Car", parse_car_name)):
    race_results[column] = race_results[column].apply(cleaner)

# Map the site's column headers to the snake_case names used in the CSV output.
column_names = {
    'Pos': 'finishing_position',
    'No': 'car_number',
    'Driver': 'driver_name',
    'Car': 'constructor',
    'Laps': 'race_laps',
    'Time/Retired': 'race_time',
    'PTS': 'points',
}
race_results.rename(columns=column_names, inplace=True)

In [47]:
# Display the results again, now with the cleaned driver/car values and the
# renamed columns.
print(race_results)

      finishing_position  car_number            driver_name  \
0                      1           2            Nino Farina   
1                      2           3          Luigi Fagioli   
2                      3           4            Reg Parnell   
3                      4          14  Yves Giraud-Cabantous   
4                      5          15           Louis Rosier   
...                  ...         ...                    ...   
24230                 16          22           Yuki Tsunoda   
24231                 17          21          Nyck De Vries   
24232                 18          10           Pierre Gasly   
24233                 NC          20        Kevin Magnussen   
24234                 NC          31           Esteban Ocon   

                 constructor  race_laps    race_time  points  season  round  \
0                 Alfa Romeo       70.0  2:13:23.600     9.0    1950      1   
1                 Alfa Romeo       70.0      +2.600s     6.0    1950      1   
2     

In [48]:
# Write the race index to race_list.csv in the output directory, leaving out
# the DataFrame's row index.
race_list_path = create_data_here + 'race_list.csv'
race_list.to_csv(race_list_path, index=False)

In [49]:
# Write the cleaned results to race_results.csv in the output directory,
# leaving out the DataFrame's row index.
race_results_path = create_data_here + 'race_results.csv'
race_results.to_csv(race_results_path, index=False)