In [1]:
# importing all necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
import re
import sys
from dateutil.parser import parse
from IPython.display import clear_output

# Getting Teams URL

In [2]:
user_agent = {'User-agent': 'Mozilla/5.0'}


# function used for some codes : return HTML from an URL using BeautifulSoup
def download_soup(url):
    print('Starting request for: ' + url)
    response = requests.get(url, headers=user_agent)
    if response.status_code == 200:
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        print('Successful request!')
        return soup
    else:
        print('Unsuccessful request, status code: '+ response.status_code)

In [17]:
# Getting teams URL for year 2023
def get_team_urls(year):
    newdir = 'data/team_urls' 
    if not os.path.exists(newdir):
        os.makedirs(newdir)
    
    print("Starting get_team_urls, year {}".format(year))
    url_str = "https://www.procyclingstats.com/teams.php?s=worldtour&year=" + str(year)
    startlist = download_soup(url_str).find_all('a', href = re.compile('team/'))
    urls = [x['href'] for x in startlist]
    list_team_urls = ["https://www.procyclingstats.com/" + u for u in urls if u[-4:] == str(year) ]
    
    df_teams = pd.DataFrame()
    df_teams["team_url"] = list_team_urls
    df_teams = df_teams.drop_duplicates("team_url")
    ouputfile_str = newdir + "/team_urls_" + str(year) + ".csv"
    df_teams.to_csv(ouputfile_str)
        
    print("Finished get_team_urls, year {}".format(year))

# Getting Riders URL

In [54]:
# ### Get riders pages URLs from team pages with the previously generated file
# download each team page and scrape through the page to find riders URLs
# creates its own file of URLs
def get_riders_urls(start_year, end_year):
    print("Starting get_riders_urls")
    df_rider_urls = pd.DataFrame(columns=["rider_url"])
    
    # in case needed for more than 1 year, loop
    for year in range(start_year, end_year+1):
        df_team_urls = pd.read_csv("data/team_urls/team_urls_" + str(year) + ".csv")
        list_rider_urls = []
        for idx, row in df_team_urls.iterrows():
            url = row["team_url"]
            timer = 0.5 + 0.5 * random.random()
            time.sleep(timer)
            riders = download_soup(url).find_all('a', class_ = ['rider', 'ttabs', 'tabb'])
            urls = [x['href'] for x in riders]
            for u in urls:
                list_rider_urls.append('https://www.procyclingstats.com/' + u)
                list_rider_urls.append(u[6:])
        tmp_rider_urls = pd.DataFrame()
        tmp_rider_urls["rider_url"] = list_rider_urls
        df_rider_urls = pd.concat([df_rider_urls, tmp_rider_urls])
        df_rider_urls = df_rider_urls.drop_duplicates("rider_url")
    df_rider_urls.to_csv("data/rider_urls.csv")
    print('Finished get_riders_url')

In [81]:
df_rider_urls = pd.DataFrame(columns=["rider_url"])

df_team_urls = pd.read_csv("data/team_urls/team_urls_2023.csv")
list_rider_urls = []
for idx, row in df_team_urls.iterrows():
    url = row["team_url"]
    timer = 0.5 + 0.5 * random.random()
    time.sleep(timer)
    riders = download_soup(url).find('div', class_ = 'hide ttabs taba').find_all('a')
    urls = [x['href'] for x in riders]
    for u in urls:
#        list_rider_urls.append('https://www.procyclingstats.com/' + u)
        list_rider_urls.append(u[6:])
    tmp_rider_urls = pd.DataFrame()
    tmp_rider_urls["rider_url"] = list_rider_urls
    df_rider_urls = pd.concat([df_rider_urls, tmp_rider_urls])
    df_rider_urls = df_rider_urls.drop_duplicates("rider_url")
    df_rider_urls.to_csv("data/rider_urls.csv")
    print('Finished get_riders_url')

Starting request for: https://www.procyclingstats.com/team/bora-hansgrohe-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/cofidis-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/ef-education-easypost-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/groupama-fdj-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/ineos-grenadiers-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/intermarche-circus-wanty-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/team-jumbo-visma-2023
Successful request!
Finished get_riders_url
Starting request for: https://www.procyclingstats.com/team/movistar-team-2023
Successful request!
Finished get_riders_url
Start

# Getting race urls

In [146]:
# generating a list of URLs that shows list of races per year and per circuit
list_url=[]
for year in years:
    for circuit in race_circuits_men:
        list_url.append("https://www.procyclingstats.com/races.php?year="+str(year)+"&circuit="+str(circuit)+"&class=&filter=Filter")
list_url

['https://www.procyclingstats.com/races.php?year=2018&circuit=1&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=2&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=26&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=11&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=12&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=13&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=14&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=15&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2018&circuit=21&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2019&circuit=1&class=&filter=Filter',
 'https://www.procyclingstats.com/races.php?year=2019&circuit=2&class=&filter=Filter',
 'https://www.procyclingstats.com/ra

In [154]:
# function to get all race urls
def get_race_urls(year, circuit):
    newdir = 'data/race_urls' 
    if not os.path.exists(newdir):
        os.makedirs(newdir)
        
    race_urls = []
    df_races = pd.DataFrame()
    url_str = "https://www.procyclingstats.com/races.php?year="+str(year)+"&circuit="+str(circuit)+"&class=&filter=Filter"    
    soup = BeautifulSoup(requests.get(url_str).content, 'html.parser')
    race_links = [a['href'] for a in soup.find_all('a', href=lambda href: href and 'race/' in href)]
    race_urls.extend(race_links)
            
    df_races = pd.DataFrame({'race_url': race_urls})

    # dropping the part of the website scrapped (bottom page) that are "shortcuts" to most important races
    pattern = r'\d{4}'
    mask = df_races['race_url'].str.contains(pattern)
    df_races_filtered = df_races[mask]


    name_csv = "data/race_urls/race_urls_" + str(circuit) +"_" + str(year) + ".csv"
    df_races_filtered.to_csv(name_csv)

 Circuit Number:
 1: World Tour
 2: World Championship
 26: Pro Series (created in 2020 and integrate in code 26 on PCS only after 2021, so no need to integrate manually 
 some back races to 2019-20)
 11: only significant races => Tour of Rwanda, Tropicale Amissa Bongo
 12: only significant races => Tour of Oman, Tour de Saudi Arabia
 13: Europe => remove all " National Championships" (national races)
 14: only significant race => herald sun tour, race torquay
 18: only significant race => vuelta-ciclista-a-la-provincia-de-san-juan
 21: Nations cup (U23) => course de la paix, l'étoile d'or, orlen nations grand prix, tour de l'avenir
 3: Olympics: 2 races (Road, ITT)
 
 Circuit 11, 12, 13, 14, 18, 21 are less important, no need to scrap them

In [156]:
# Scrapping race urls 1 & 2 circuit
years = [2019, 2020, 2021, 2022, 2023]
race_circuits = [1, 2]

for circuit in race_circuits:
    for year in years:
        get_race_urls(year, circuit)


In [157]:
# ProSeries race urls (circuit 26) 
years = [2021, 2022, 2023]
pro_series_circuit = 26

for year in years:
    get_race_urls(year, pro_series_circuit)

In [235]:
# Europe Tour race urls (circuit 13) - In case I need them later
def get_race_europe_urls(year, classification):
    newdir = 'data/race_urls' 
    if not os.path.exists(newdir):
        os.makedirs(newdir)
        
    race_urls = []
    df_races = pd.DataFrame()
    url_str = "https://www.procyclingstats.com/races.php?year="+str(year)+"&circuit=13&class="+str(classification)+"&filter=Filter"    
    soup = BeautifulSoup(requests.get(url_str).content, 'html.parser')
    race_links = [a['href'] for a in soup.find_all('a', href=lambda href: href and 'race/' in href)]
    race_urls.extend(race_links)
            
    df_races = pd.DataFrame({'race_url': race_urls})

    # dropping the part of the website scrapped that is not what we are looking
    pattern = r'\d{4}'
    mask = df_races['race_url'].str.contains(pattern)
    df_races_filtered = df_races[mask]


    name_csv = "data/race_urls/race_urls_13_"+str(classification)+"_"+ str(year)+".csv"
    df_races_filtered.to_csv(name_csv)

# for year 2019
classification = ["1.HC", "2.HC"] # only significant races

for classi in classification:
    get_race_europe_urls(2019, classi)

# for year 2020
classification = ["1.Pro", "2.Pro"] # only significant races

for classi in classification:
    get_race_europe_urls(2020, classi)
    
# for year 2021, 2022, 2023
classification = ["1.1", "2.1"] # only significant races
years=[2021, 2022, 2023]
for classi in classification:
    for year in years:
        get_race_europe_urls(year, classi)

In [159]:
# Other significant race - Might potential scrap the results
list_other_races=["race/vuelta-ciclista-a-la-provincia-de-san-juan/2019",
"race/la-tropicale-amissa-bongo/2019",
"race/tour-of-rwanda/2019",
"race/tour-of-oman/2019",
"race/tour-de-saudi-arabia/2019",
"race/l-etoile-d-or/2019",
"race/herald-sun-tour/2019",
"race/orlen-nations-grand-prix/2019",
"race/tour-de-l-avenir/2019",
"race/la-tropicale-amissa-bongo/2020",
"race/tour-of-rwanda/2020",
"race/vuelta-ciclista-a-la-provincia-de-san-juan/2020",
"race/tour-de-saudi-arabia/2020",
"race/herald-sun-tour/2020",
"race/race-torquay/2020",
"race/l-etoile-d-or/2020",
"race/orlen-nations-grand-prix/2020",
"race/tour-of-rwanda/2021",
"race/course-de-la-paix-u23/2021",
"race/l-etoile-d-or/2021",
"race/orlen-nations-grand-prix/2021",
"race/tour-de-l-avenir/2021",
"race/olympic-games/2021",
"race/olympic-games-itt/2021",
"race/tour-of-rwanda/2022",
"race/tour-de-saudi-arabia/2022",
"race/course-de-la-paix-u23/2022",
"race/orlen-nations-grand-prix/2022",
"race/tour-de-l-avenir/2022",
"race/la-tropicale-amissa-bongo/2023",
"race/tour-de-saudi-arabia/2023"]

In [161]:
df = pd.DataFrame({'race_url': list_other_races})
df.to_csv("race_urls_others.csv")

In [186]:
# identifying cancelled raced to remove them
# NB: I found an other way to identify cancelled races, so this was eventually not used but this function works
def get_cancelled_race_urls(year, circuit):
    newdir = 'data/cancelled_race_urls' 
    if not os.path.exists(newdir):
        os.makedirs(newdir)
        
    race_urls = []
    df_races = pd.DataFrame()
    url_str = "https://www.procyclingstats.com/races.php?year="+str(year)+"&circuit="+str(circuit)+"&class=&filter=Filter"    
    soup = BeautifulSoup(requests.get(url_str).content, 'html.parser')
    rows = soup.find_all('tr', class_=lambda x: x in ['striked', 'no-striked'])
            
    # Extract the href attributes of the 'a' tags in each row
    for row in rows:
        a_tag = row.find('a', href=lambda href: href and 'race/' in href)
        if a_tag:
            df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
    

    # dropping the part of the website scrapped that is not what we are looking
#    pattern = r'\d{4}'
#    mask = df_races['race_url'].str.contains(pattern)
#    df_races_filtered = df_races[mask]


    name_csv = "data/cancelled_race_urls/cancelled_race_urls_" + str(circuit) +"_" + str(year) + ".csv"
#    df_races_filtered.to_csv(name_csv)
    df_races.to_csv(name_csv)

In [187]:
# cancelled race of world tour & world championship
years = [2019, 2020, 2021, 2022, 2023]
race_circuits = [1, 2]

for circuit in race_circuits:
    for year in years:
        get_cancelled_race_urls(year, circuit)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)

In [183]:
# cancelled race of pro series
years = [2021, 2022, 2023]
pro_series_circuit = 26

for year in years:
    get_cancelled_race_urls(year, pro_series_circuit)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)


In [189]:
# cancelled Tour (circuit 13)
years = [2019, 2020, 2021, 2022, 2023]
europe_tour = 13

for year in years:
    get_cancelled_race_urls(year, europe_tour)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)
  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)

  df_races = df_races.append({'race_url': a_tag['href']}, ignore_index=True)


# Cleaning urls 

In [244]:
# in Europe Tour removing all national championship 
race_urls_13_1_1_2021 = pd.read_csv('data/race_urls/race_urls_13_1_1_2021.csv')
race_urls_13_1_1_2022 = pd.read_csv('data/race_urls/race_urls_13_1_1_2022.csv')
race_urls_13_1_1_2023 = pd.read_csv('data/race_urls/race_urls_13_1_1_2023.csv')
race_urls_13_1_HC_2019 = pd.read_csv('data/race_urls/race_urls_13_1_HC_2019.csv')
race_urls_13_1_Pro_2020 = pd.read_csv('data/race_urls/race_urls_13_1_Pro_2020.csv')
race_urls_13_2_1_2021 = pd.read_csv('data/race_urls/race_urls_13_2_1_2021.csv')
race_urls_13_2_1_2022 = pd.read_csv('data/race_urls/race_urls_13_2_1_2022.csv')
race_urls_13_2_1_2023 = pd.read_csv('data/race_urls/race_urls_13_2_1_2023.csv')
race_urls_13_2_HC_2019 = pd.read_csv('data/race_urls/race_urls_13_2_HC_2019.csv')
race_urls_13_2_Pro_2020 = pd.read_csv('data/race_urls/race_urls_13_2_Pro_2020.csv')

df_13 = pd.concat([race_urls_13_1_1_2021, race_urls_13_1_1_2022, race_urls_13_1_1_2023, 
                   race_urls_13_1_HC_2019, race_urls_13_1_Pro_2020, race_urls_13_2_1_2021,
                   race_urls_13_2_1_2022, race_urls_13_2_1_2023, race_urls_13_2_HC_2019, race_urls_13_2_Pro_2020], axis=0)

# Removing the race ending with "preview": it's either cancelled raced, or didn't happened yet (after March 5th 2023)
df_13 = df_13[~df_13['race_url'].str.contains('/startlist/preview')]

# removing all national championships
df_13 = df_13[~df_13['race_url'].str.contains('race/nc')]
df_13 = df_13[~df_13['race_url'].str.contains('race/national-championships')]

df_13.to_csv("data/clean_race_urls/race_urls_13.csv")

In [245]:
# one single file for circuit 1 & circuit 26
df_1_2019 = pd.read_csv('data/race_urls/race_urls_1_2019.csv')
df_1_2020 = pd.read_csv('data/race_urls/race_urls_1_2020.csv')
df_1_2021 = pd.read_csv('data/race_urls/race_urls_1_2021.csv')
df_1_2022 = pd.read_csv('data/race_urls/race_urls_1_2022.csv')
df_1_2023 = pd.read_csv('data/race_urls/race_urls_1_2023.csv')

df_1 = pd.concat([df_1_2019, df_1_2020, df_1_2021, df_1_2022, df_1_2023 ], axis=0)

df_1 = df_1[~df_1['race_url'].str.contains('/startlist/preview')]

df_1.to_csv("data/clean_race_urls/race_urls_1.csv")

In [246]:
df_26_2021 = pd.read_csv('data/race_urls/race_urls_26_2021.csv')
df_26_2022 = pd.read_csv('data/race_urls/race_urls_26_2022.csv')
df_26_2023 = pd.read_csv('data/race_urls/race_urls_26_2023.csv')
df_26 = pd.concat([df_26_2021, df_26_2022, df_26_2023 ], axis=0)

df_26 = df_26[~df_26['race_url'].str.contains('/startlist/preview')]

df_26.to_csv("data/clean_race_urls/race_urls_26.csv")

In [249]:
# Consolidating all race urls in one single
folder_path = "data/clean_race_urls"

csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

dfs = []

for file in csv_files:
    df = pd.read_csv(os.path.join(folder_path, file))
    dfs.append(df)

concatenated_df = concatenated_df.reset_index(drop=True)

concatenated_df.to_csv("data/all_race_urls.csv", index=False)

In [248]:
# Cleaning the races by removing those cancelled or those who didn't happened yet

df_races= pd.read_csv("data/all_race_urls.csv", index_col=False)
print(df_races.shape)
df_races.head(5)

(2192, 2)


Unnamed: 0.1,Unnamed: 0,race_url
0,17,race/trofeo-cala-millor/2019
1,18,race/trofeo-andratx-mirador-d-es-colomer/2019
2,19,race/deia-trophy/2019
3,20,race/gp-d-ouverture/2019
4,21,race/trofeo-palma/2019


In [184]:
# Cancelled races consolidation => NB not needed eventually as other way of identifying cancelled race
#folder_path = "data/cancelled_race_urls"

#csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

#dfs = []

#for file in csv_files:
#    df = pd.read_csv(os.path.join(folder_path, file))
#    dfs.append(df)

#concatenated_df = concatenated_df.reset_index(drop=True)

#concatenated_df.to_csv("data/cancelled_race_urls.csv", index=False)

# Scrapping type of races

In [95]:
df_race=pd.read_csv("data/all_race_urls.csv")
df_race.shape

(502, 3)

In [96]:
df_race.columns

Index(['Unnamed: 0', 'race_code', 'race_url'], dtype='object')

In [97]:
df_race.drop(columns=["Unnamed: 0"], inplace=True)
df_race.head(10)

Unnamed: 0,race_code,race_url
0,tour-down-under/2019,https://www.procyclingstats.com/race/tour-down...
1,great-ocean-race/2019,https://www.procyclingstats.com/race/great-oce...
2,uae-tour/2019,https://www.procyclingstats.com/race/uae-tour/...
3,omloop-het-nieuwsblad/2019,https://www.procyclingstats.com/race/omloop-he...
4,strade-bianche/2019,https://www.procyclingstats.com/race/strade-bi...
5,paris-nice/2019,https://www.procyclingstats.com/race/paris-nic...
6,tirreno-adriatico/2019,https://www.procyclingstats.com/race/tirreno-a...
7,milano-sanremo/2019,https://www.procyclingstats.com/race/milano-sa...
8,volta-a-catalunya/2019,https://www.procyclingstats.com/race/volta-a-c...
9,oxyclean-classic-brugge-de-panne/2019,https://www.procyclingstats.com/race/oxyclean-...


# Getting race info

In [107]:
# Seperating race that are one-day race and stage race
df_stage_race = result_df[result_df["UCI scale:"].str.contains("Stage")]
df_one_day_race = result_df[~result_df["UCI scale:"].str.contains("Stage")]
df_stage_race.to_csv("data/stage_race.csv")
df_one_day_race.to_csv("data/one_day_race.csv")

In [129]:
# generating suffixes to stage-1 to stage-21 as grand tour have 21 stage (exception: vuelta 2020 who had 18 stages)
suffixes=[]

for i in range(1,22):
    suffixes.append("/stage-"+str(i))

suffixes

['/stage-1',
 '/stage-2',
 '/stage-3',
 '/stage-4',
 '/stage-5',
 '/stage-6',
 '/stage-7',
 '/stage-8',
 '/stage-9',
 '/stage-10',
 '/stage-11',
 '/stage-12',
 '/stage-13',
 '/stage-14',
 '/stage-15',
 '/stage-16',
 '/stage-17',
 '/stage-18',
 '/stage-19',
 '/stage-20',
 '/stage-21']

In [126]:
# Filetiring on stage race of grand tour (vuelta, giro, tour)
stage_race_grand_tour = pd.read_csv("data/stage_race_grand_tour.csv")
list_gc = stage_race_grand_tour["race_code"].tolist()

In [139]:
# adding the suffixe to the grand tour 
list_stage_wc =[]
for i in list_gc:
    for suffixe in suffixes:
        list_stage_wc.append("https://www.procyclingstats.com/"+str(i)+suffixe)
list_stage_wc
stage_detail_wc = pd.DataFrame(list_stage_wc)
stage_detail_wc.to_csv("data/stage_wc_urls.csv")

In [148]:
stage_gt_details_df = pd.read_csv("data/stage_wc_urls.csv")
stage_gt_details_df.rename(columns={'Unnamed: 0': 'index_2','0': 'stage_url'}, inplace=True, errors='raise')
stage_gt_details_df

Unnamed: 0,index_2,stage_url
0,0,https://www.procyclingstats.com/race/giro-d-it...
1,1,https://www.procyclingstats.com/race/giro-d-it...
2,2,https://www.procyclingstats.com/race/giro-d-it...
3,3,https://www.procyclingstats.com/race/giro-d-it...
4,4,https://www.procyclingstats.com/race/giro-d-it...
...,...,...
247,247,https://www.procyclingstats.com/race/vuelta-a-...
248,248,https://www.procyclingstats.com/race/vuelta-a-...
249,249,https://www.procyclingstats.com/race/vuelta-a-...
250,250,https://www.procyclingstats.com/race/vuelta-a-...


In [157]:
# Since grand tour have 21 stages, generating the url with stage-1, stage-2, etc.
stage_df["race_code"]= stage_df["race_code"].str[37:]
stage_df["race_code"]

0         iro-d-italia/2019/stage-1
1         iro-d-italia/2019/stage-2
2         iro-d-italia/2019/stage-3
3         iro-d-italia/2019/stage-4
4         iro-d-italia/2019/stage-5
                   ...             
247    uelta-a-espana/2022/stage-17
248    uelta-a-espana/2022/stage-18
249    uelta-a-espana/2022/stage-19
250    uelta-a-espana/2022/stage-20
251    uelta-a-espana/2022/stage-21
Name: race_code, Length: 252, dtype: object

In [159]:
# mistakenly remove one character
stage_df["race_code"] = stage_df["race_code"].str.replace("iro", "giro")
stage_df["race_code"] = stage_df["race_code"].str.replace("uelta", "vuelta")
stage_df["race_code"] = stage_df["race_code"].str.replace("our", "tour")
stage_df["race_code"]

0         giro-d-italia/2019/stage-1
1         giro-d-italia/2019/stage-2
2         giro-d-italia/2019/stage-3
3         giro-d-italia/2019/stage-4
4         giro-d-italia/2019/stage-5
                   ...              
247    vuelta-a-espana/2022/stage-17
248    vuelta-a-espana/2022/stage-18
249    vuelta-a-espana/2022/stage-19
250    vuelta-a-espana/2022/stage-20
251    vuelta-a-espana/2022/stage-21
Name: race_code, Length: 252, dtype: object

In [160]:
stage_df.to_csv("data/stage_race_info.csv")

In [367]:
# For other than grand tour stage (21 stages), find the number of stages

# url without stage number brings to a page where title shows the stage number
def get_title(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.title.string
    return title

In [369]:
df_wt_other_stage=pd.read_csv('data/stage_race_other_wt_urls.csv')
df_wt_other_stage

Unnamed: 0.1,Unnamed: 0,race_code,race_url
0,0,tour-down-under/2019,https://www.procyclingstats.com/race/tour-down...
1,2,uae-tour/2019,https://www.procyclingstats.com/race/uae-tour/...
2,5,paris-nice/2019,https://www.procyclingstats.com/race/paris-nic...
3,6,tirreno-adriatico/2019,https://www.procyclingstats.com/race/tirreno-a...
4,8,volta-a-catalunya/2019,https://www.procyclingstats.com/race/volta-a-c...
5,14,itzulia-basque-country/2019,https://www.procyclingstats.com/race/itzulia-b...
6,16,tour-of-turkey/2019,https://www.procyclingstats.com/race/tour-of-t...
7,20,tour-de-romandie/2019,https://www.procyclingstats.com/race/tour-de-r...
8,23,tour-of-california/2019,https://www.procyclingstats.com/race/tour-of-c...
9,24,dauphine/2019,https://www.procyclingstats.com/race/dauphine/...


In [372]:
# apply the function to each URL
df_wt_other_stage['title'] = df_wt_other_stage['race_url'].apply(get_title)

df_wt_other_stage

Unnamed: 0.1,Unnamed: 0,race_code,race_url,title
0,0,tour-down-under/2019,https://www.procyclingstats.com/race/tour-down...,Santos Tour Down Under 2019 Stage 6 results
1,2,uae-tour/2019,https://www.procyclingstats.com/race/uae-tour/...,UAE Tour 2019 Stage 7 results
2,5,paris-nice/2019,https://www.procyclingstats.com/race/paris-nic...,Paris - Nice 2019 Stage 8 results
3,6,tirreno-adriatico/2019,https://www.procyclingstats.com/race/tirreno-a...,Tirreno-Adriatico 2019 Stage 7 (ITT) results
4,8,volta-a-catalunya/2019,https://www.procyclingstats.com/race/volta-a-c...,Volta Ciclista a Catalunya 2019 Stage 7 results
5,14,itzulia-basque-country/2019,https://www.procyclingstats.com/race/itzulia-b...,Itzulia Basque Country 2019 Stage 6 results
6,16,tour-of-turkey/2019,https://www.procyclingstats.com/race/tour-of-t...,Presidential Cycling Tour of Turkey 2019 Stage...
7,20,tour-de-romandie/2019,https://www.procyclingstats.com/race/tour-de-r...,Tour de Romandie 2019 Stage 5 (ITT) results
8,23,tour-of-california/2019,https://www.procyclingstats.com/race/tour-of-c...,Amgen Tour of California 2019 Stage 7 results
9,24,dauphine/2019,https://www.procyclingstats.com/race/dauphine/...,Critérium du Dauphiné 2019 Stage 8 results


In [373]:
df_wt_other_stage['title'] = df_wt_other_stage['title'].str.extract(r'Stage(.*)')

# remove everything after 3 characters of "Stage"
df_wt_other_stage['title'] = df_wt_other_stage['title'].str.slice(stop=3)

df_wt_other_stage

Unnamed: 0.1,Unnamed: 0,race_code,race_url,title
0,0,tour-down-under/2019,https://www.procyclingstats.com/race/tour-down...,6 r
1,2,uae-tour/2019,https://www.procyclingstats.com/race/uae-tour/...,7 r
2,5,paris-nice/2019,https://www.procyclingstats.com/race/paris-nic...,8 r
3,6,tirreno-adriatico/2019,https://www.procyclingstats.com/race/tirreno-a...,7 (
4,8,volta-a-catalunya/2019,https://www.procyclingstats.com/race/volta-a-c...,7 r
5,14,itzulia-basque-country/2019,https://www.procyclingstats.com/race/itzulia-b...,6 r
6,16,tour-of-turkey/2019,https://www.procyclingstats.com/race/tour-of-t...,6 r
7,20,tour-de-romandie/2019,https://www.procyclingstats.com/race/tour-de-r...,5 (
8,23,tour-of-california/2019,https://www.procyclingstats.com/race/tour-of-c...,7 r
9,24,dauphine/2019,https://www.procyclingstats.com/race/dauphine/...,8 r


In [374]:
df_wt_other_stage['title'] = df_wt_other_stage['title'].str.slice(stop=2)
df_wt_other_stage

Unnamed: 0.1,Unnamed: 0,race_code,race_url,title
0,0,tour-down-under/2019,https://www.procyclingstats.com/race/tour-down...,6
1,2,uae-tour/2019,https://www.procyclingstats.com/race/uae-tour/...,7
2,5,paris-nice/2019,https://www.procyclingstats.com/race/paris-nic...,8
3,6,tirreno-adriatico/2019,https://www.procyclingstats.com/race/tirreno-a...,7
4,8,volta-a-catalunya/2019,https://www.procyclingstats.com/race/volta-a-c...,7
5,14,itzulia-basque-country/2019,https://www.procyclingstats.com/race/itzulia-b...,6
6,16,tour-of-turkey/2019,https://www.procyclingstats.com/race/tour-of-t...,6
7,20,tour-de-romandie/2019,https://www.procyclingstats.com/race/tour-de-r...,5
8,23,tour-of-california/2019,https://www.procyclingstats.com/race/tour-of-c...,7
9,24,dauphine/2019,https://www.procyclingstats.com/race/dauphine/...,8


In [395]:
# For generating the stage url race i will use a function on excel

In [510]:
# function to scrap the info of each race
def extract_race_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # initialize all keys to None
    race_info = {
        'Date': None,
        'Start time': None,
        'Avg. speed winner': None,
        'Race category': None,
        'Distance': None,
        'Points scale': None,
        'UCI scale': None,
        'Parcours type': None,
        'ProfileScore': None,
        'Vert. meters': None,
        'Departure': None,
        'Arrival': None,
        'Race ranking': None,
        'Startlist quality score': None,
        'Won how': None,
        'Avg. temperature': None,
        'race_code': url.replace('https://www.procyclingstats.com/race/','') #adding the race_code
    }
    
    infolist = soup.find('ul', {'class': 'infolist'})
    
    # check if infolist is None (i.e., the element was not found)
    if infolist is None:
        return race_info
    
 #   for li in infolist.find_all('li'):
 #       key_div, value_div = li.find_all('div')
 #       key = key_div.get_text().strip()
 #       value = value_div.get_text().strip()
 #       race_info[key] = value
        
    for li in infolist.find_all('li'):
        key_div, value_div = li.find_all('div')
        key = key_div.get_text().strip()
        # parcours type is a bit particular: it has no text, but only icon, so we will get the name of the icon instead
        if key == 'Parcours type:':
            value_span = li.find('span', {'class': 'icon'})
            if value_span is not None:
                value = value_span.get('class')[2]
                #p0: no value => will need to look for each race what same race of other years have for values
                # or look at profile score & stage profile image 
                #p1: Flat
                #p2: Hills, flat finish
                #p3: Hills, uphill finish
                #p4: Mountains, flat finish
                #p5: Mountains, uphill finish
            else:
                value = ''

        else:
            value = value_div.get_text().strip()
        race_info[key] = value
        
    return race_info

In [508]:
def create_race_info_df(df, url_column):
    num_urls = len(df[url_column])
    race_info_list = []
    for i, url in enumerate(df[url_column]):
        print(f"\rProcessing URL {i+1}/{num_urls}: {url}", end="")
        # exception: to ignore team time trials that have different structures
        try:
            race_info = extract_race_info(url)
            race_info_list.append(race_info)
        except ValueError:
            print(f"\rNot processing this url (Team time trial): {url} ")
            continue
        timer = 0.5 + 0.5*random.random()
        time.sleep(timer)
        
    race_info_df = pd.DataFrame(race_info_list)
    
    # removing the first columns that returns None  (in website those information appears twice: one empty, and one with the values )
    race_info_df.drop(columns=['Date', 'Start time', 'Avg. speed winner',
                               'Race category', 'Distance', 'Points scale', 'UCI scale',
                               'Parcours type', 'ProfileScore', 'Vert. meters', 'Departure', 'Arrival',
                               'Race ranking', 'Startlist quality score', 'Won how',
                               'Avg. temperature'], inplace=True)
    return race_info_df

In [474]:
df_tour = pd.read_csv('data/race_urls_final/stage_gt_urls_tour.csv')
df_giro = pd.read_csv('data/race_urls_final/stage_gt_urls_giro.csv')
df_vuelta = pd.read_csv('data/race_urls_final/stage_gt_urls_vuelta.csv')
df_one_day_2019 = pd.read_csv('data/race_urls_final/one_day_race_urls_2019.csv')
df_one_day_2020 = pd.read_csv('data/race_urls_final/one_day_race_urls_2020.csv')
df_one_day_2021 = pd.read_csv('data/race_urls_final/one_day_race_urls_2021.csv')
df_one_day_2022 = pd.read_csv('data/race_urls_final/one_day_race_urls_2022.csv')
df_one_day_2023 = pd.read_csv('data/race_urls_final/one_day_race_urls_2023.csv')
df_stage_2019 = pd.read_csv('data/race_urls_final/stage_race_2019.csv')
df_stage_2020 = pd.read_csv('data/race_urls_final/stage_race_2020.csv')
df_stage_2021 = pd.read_csv('data/race_urls_final/stage_race_2021.csv')
df_stage_2022 = pd.read_csv('data/race_urls_final/stage_race_2022.csv')
df_stage_2023 = pd.read_csv('data/race_urls_final/stage_race_2023.csv')

In [512]:
# In case there is an issue with one of the links, splitting the process
# generating the race info with Tour de France stages
df_race_tour=create_race_info_df(df_tour, "race_url")
df_race_tour.to_csv('data/race_info_tour.csv')

Processing URL 84/84: https://www.procyclingstats.com/race/tour-de-france/2022/stage-21

In [513]:
# generating the race info with Giro d'Italia stages
df_race_giro=create_race_info_df(df_giro, "race_url")
df_race_giro.to_csv('data/race_info_giro.csv')

Processing URL 84/84: https://www.procyclingstats.com/race/giro-d-italia/2022/stage-21

In [514]:
# generating the race info with Vuelta a Espana stages
df_race_vuelta=create_race_info_df(df_vuelta, "race_url")
df_race_vuelta.to_csv('data/race_info_vuelta.csv')

Processing URL 81/81: https://www.procyclingstats.com/race/vuelta-a-espana/2022/stage-21

In [515]:
# generating the race info with other stage races of world tour series 2019
df_race_stage_2019=create_race_info_df(df_stage_2019, "stage_url")
df_race_stage_2019.to_csv('data/race_info_stage_2019.csv')

Processing URL 95/95: https://www.procyclingstats.com/race/tour-de-suisse/2019/stage-87e-7ge-6

In [516]:
# generating the race info with other stage races of world tour series 2020
df_race_stage_2020=create_race_info_df(df_stage_2020, "stage_url")
df_race_stage_2020.to_csv('data/race_info_stage_2020.csv')

Processing URL 41/41: https://www.procyclingstats.com/race/tirreno-adriatico/2020/stage-8

In [517]:
# generating the race info with other stage races of world tour series 2021
df_race_stage_2021=create_race_info_df(df_stage_2021, "stage_url")
df_race_stage_2021.to_csv('data/race_info_stage_2021.csv')

Processing URL 70/70: https://www.procyclingstats.com/race/tour-de-suisse/2021/stage-87-7age-6

In [518]:
# generating the race info with other stage races of world tour series 2022
df_race_stage_2022=create_race_info_df(df_stage_2022, "stage_url")
df_race_stage_2022.to_csv('data/race_info_stage_2022.csv')

Processing URL 63/63: https://www.procyclingstats.com/race/tour-de-suisse/2022/stage-87-7age-6

In [525]:
# generating the race info with other stage races of world tour series 2023
df_race_stage_2023=create_race_info_df(df_stage_2023, "stage_url")
df_race_stage_2023.to_csv('data/race_info_stage_2023.csv')

Processing URL 12/12: https://www.procyclingstats.com/race/uae-tour/2023/stage-7tage-5

In [520]:
# generating the race info for one day race 2019
df_race_one_2019=create_race_info_df(df_one_day_2019, "race_url")
df_race_one_2019.to_csv('data/race_info_one_day_2019.csv')

Processing URL 48/48: https://www.procyclingstats.com/race/l-etoile-d-or/2019/20191919190192019

In [521]:
# generating the race info for one day race 2020
df_race_one_2020=create_race_info_df(df_one_day_2020, "race_url")
df_race_one_2020.to_csv('data/race_info_one_day_2020.csv')

Processing URL 29/29: https://www.procyclingstats.com/race/race-torquay/202020i/20202020nne/2020

In [522]:
# generating the race info for one day race 2021
df_race_one_2021=create_race_info_df(df_one_day_2021, "race_url")
df_race_one_2021.to_csv('data/race_info_one_day_2021.csv')

Processing URL 103/103: https://www.procyclingstats.com/race/olympic-games-itt/20212121212021lde/20211

In [523]:
# generating the race info for one day race 2022
df_race_one_2022=create_race_info_df(df_one_day_2022, "race_url")
df_race_one_2022.to_csv('data/race_info_one_day_2022.csv')

Processing URL 110/110: https://www.procyclingstats.com/race/japan-cup/20222222/2022202222022lde/20222

In [524]:
# generating the race info for one day race 2023
df_race_one_2023=create_race_info_df(df_one_day_2023, "race_url")
df_race_one_2023.to_csv('data/race_info_one_day_2023.csv')

Processing URL 20/20: https://www.procyclingstats.com/race/trofeo-laigueglia/202320233or/2023cia/2023

# Getting team info

In [184]:
# function for getting team info 
def extract_team_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # initialize all keys to None
    team_gear = {
        'Bike': None,
        'Groupset': None,
        'Wheels': None,
        'Saddle': None,
        'Tyres': None,
        'Pedals': None,
        'Powermeter': None,
        'Sunglasses': None,
        'Helmets': None,
        'Shoes': None,
        'Bartape': None,
        'Kit': None,
        'Sports nutrition': None,
        'Cycling computer': None,
        'Hometrainer': None,
    }
    
    table = soup.find('ul', {'class': 'list lines fs14 pad4'})
    
    # check if infolist is None (i.e., the element was not found)
    
    for row in table.find_all('li')[1:]:
        cells = row.find_all('div')
        gear_type = cells[0].text.strip()
        brand = cells[1].text.strip()
        if gear_type in team_gear:
            team_gear[gear_type] = brand
        else:
            team_gear['Other'] += f"{gear_type} - {brand}\n"
    
    # Store the data in a pandas DataFrame
    df = pd.DataFrame(team_gear, index=[0])
 #   df = pd.DataFrame(team_gear.items(), columns=['Gear Type', 'Brand'])
  #  df = df.pivot(columns='Gear Type', values='Brand').reset_index(drop=True)
    return df

In [188]:
df_teams = pd.read_csv("data/team_urls_2023.csv")
gear_df = pd.concat(df_teams['more_gear_url'].apply(extract_team_info).tolist())

In [196]:
gear_df.to_csv("gear_info.csv")

In [197]:
gear_df

Unnamed: 0,Bike,Groupset,Wheels,Saddle,Tyres,Pedals,Powermeter,Sunglasses,Helmets,Shoes,Bartape,Kit,Sports nutrition,Cycling computer,Hometrainer,Avg. temperature
0,SpecializedSpecialized,Shimano,Roval,Specialized,,,,100%,Specialized,,,Le ColRoeckl,,Wahoo,,
0,LOOK,Shimano,Corima,Selle Italia,Michelin,,,EKOÏ,EKOÏ,,,Van Rysel,,Wahoo,,
0,Cannondale,Shimano,Vision,Prologo,Vittoria,,,POC,POC,,,Rapha,,Wahoo,Wahoo,
0,Lapierre,Shimano,Shimano,Prologo,Continental,,,Julbo,Giro,,,Alé,Apurna,Garmin,,
0,Pinarello,Shimano,Shimano,Fizik,Continental,,,SunGod,Kask,,,Bioracer,SiS,Garmin,,
0,Cube,Shimano,ShimanoNewmen,Prologo,ContinentalContinental,,,Uvex Sports,Uvex Sports,Gaerne,Prologo,Nalini,,Bryton,Bryton,
0,Cervélo,SRAM,Reserve Wheels,Fizik,Vittoria,Speedplay,,Oakley,Lazer,Nimbl,FSA,AGU,Amacx,Garmin,,
0,Canyon,SRAM,ZippZipp,Fizik,ContinentalContinental,LOOK,Quarq,100%,Abus,Fizik,Lizard Skins,GOBIK,226ERS,Garmin,Elite,
0,Specialized,Shimano,Roval,Specialized,Specialized,Shimano,Shimano,Oakley,Specialized,Specialized,Supacaz,Castelli,6D Sports Nutrition,Garmin,Tacx,
0,Bianchi,Shimano,Shimano,Selle Italia,Continental,,,EKOÏ,EKOÏ,,,Jinga,,Wahoo,,


In [458]:
# scrapping previous names of team
def extract_team_history(urls):
    dfs = []
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        select_elem = soup.find('select')
        option_values = [option['value'] for option in select_elem.find_all('option')]

        df = pd.DataFrame({'Option Value': option_values, 'URL': url})

        dfs.append(df)

    result_df = pd.concat(dfs)

    return result_df

In [459]:
team_url= pd.read_csv('data/teams/team_urls_2023.csv')
team_url["team_url"]
df_team_history = extract_team_history(team_url["team_url"])
df_team_history

Unnamed: 0,Option Value,URL
0,team/bora-hansgrohe-2027/overview/,https://www.procyclingstats.com/team/bora-hans...
1,team/bora-hansgrohe-2026/overview/,https://www.procyclingstats.com/team/bora-hans...
2,team/bora-hansgrohe-2025/overview/,https://www.procyclingstats.com/team/bora-hans...
3,team/bora-hansgrohe-2024/overview/,https://www.procyclingstats.com/team/bora-hans...
4,team/bora-hansgrohe-2023/overview/,https://www.procyclingstats.com/team/bora-hans...
...,...,...
13,team/team-mtn-qhubeka-2012/overview/,https://www.procyclingstats.com/team/q365-pro-...
14,team/team-mtn-qhubeka-2011/overview/,https://www.procyclingstats.com/team/q365-pro-...
15,team/mtn-energade-road-team-2010/overview/,https://www.procyclingstats.com/team/q365-pro-...
16,team/mtn-energade-road-team-2009/overview/,https://www.procyclingstats.com/team/q365-pro-...


In [460]:
df_team_history["URL"]=df_team_history["URL"].str.replace('https://www.procyclingstats.com/team/','')
df_team_history

  df_team_history["URL"]=df_team_history["URL"].str.replace('https://www.procyclingstats.com/team/','')


Unnamed: 0,Option Value,URL
0,team/bora-hansgrohe-2027/overview/,bora-hansgrohe-2023
1,team/bora-hansgrohe-2026/overview/,bora-hansgrohe-2023
2,team/bora-hansgrohe-2025/overview/,bora-hansgrohe-2023
3,team/bora-hansgrohe-2024/overview/,bora-hansgrohe-2023
4,team/bora-hansgrohe-2023/overview/,bora-hansgrohe-2023
...,...,...
13,team/team-mtn-qhubeka-2012/overview/,q365-pro-cycing-team
14,team/team-mtn-qhubeka-2011/overview/,q365-pro-cycing-team
15,team/mtn-energade-road-team-2010/overview/,q365-pro-cycing-team
16,team/mtn-energade-road-team-2009/overview/,q365-pro-cycing-team


In [462]:
keep_year = ['2023','2022','2021','2020','2019']
mask = df_team_history['Option Value'].str.contains('|'.join(keep_year))
df_team_history_2018_23 = df_team_history[mask]

In [463]:
df_team_history_2018_23

Unnamed: 0,Option Value,URL
4,team/bora-hansgrohe-2023/overview/,bora-hansgrohe-2023
5,team/bora-hansgrohe-2022/overview/,bora-hansgrohe-2023
6,team/bora-hansgrohe-2021/overview/,bora-hansgrohe-2023
7,team/bora-hansgrohe-2020/overview/,bora-hansgrohe-2023
8,team/bora-hansgrohe-2019/overview/,bora-hansgrohe-2023
...,...,...
9,team/uno-x-norwegian-development-team-2019/ove...,uno-x-pro-cycling-team-2023
3,team/team-qhubeka-nexthash-2021/overview/,q365-pro-cycing-team
4,team/team-qhubeka-assos-2021/overview/,q365-pro-cycing-team
5,team/ntt-pro-cycling-2020/overview/,q365-pro-cycing-team


In [464]:
df_team_history_2018_23.to_csv('data/teams/team_history_name.csv')

In [526]:
# scrapping wins and performances per season from 2019 to 2022
def get_team_performance(urls):

    df_list = []
    
    for url in urls:
        response = requests.get(url)

        soup = BeautifulSoup(response.content, 'html.parser')
        
        title = soup.title.text.strip()

        wins = soup.find('li', {'class': 'title'}, text='Victories').find_next_sibling().find('a').text
        points = soup.find('li', {'class': 'title'}, text='Points').find_next_sibling().find('a').text
        pcs_ranking = soup.find('li', {'class': 'title'}, text='PCS#').find_next_sibling().find('a').text
        uci_ranking = soup.find('li', {'class': 'title'}, text='UCI#').find_next_sibling().find('a').text

        data = {'URL': [url], 'Team Name': [title], 'Wins': [wins], 'Points': [points], 'PCS Ranking': [pcs_ranking], 'UCI Ranking': [uci_ranking]}

        df = pd.DataFrame(data)

        df_list.append(df)
    
    result_df = pd.concat(df_list, ignore_index=True)
    
    return result_df

In [527]:
team_df=pd.read_csv('data/teams/team_history_name.csv')
team_df["team_history_url"]

0      https://www.procyclingstats.com/team/bora-hans...
1      https://www.procyclingstats.com/team/bora-hans...
2      https://www.procyclingstats.com/team/bora-hans...
3      https://www.procyclingstats.com/team/bora-hans...
4      https://www.procyclingstats.com/team/cofidis-2022
                             ...                        
135    https://www.procyclingstats.com/team/uno-x-nor...
136    https://www.procyclingstats.com/team/team-qhub...
137    https://www.procyclingstats.com/team/team-qhub...
138    https://www.procyclingstats.com/team/ntt-pro-c...
139    https://www.procyclingstats.com/team/team-dime...
Name: team_history_url, Length: 140, dtype: object

In [528]:
result_team_df = get_team_performance(team_df["team_history_url"])
result_team_df

Unnamed: 0,URL,Team Name,Wins,Points,PCS Ranking,UCI Ranking
0,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe 2022,30,9936,4,4
1,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe 2021,30,7761,6,-
2,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe 2020,21,6557,6,-
3,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe 2019,47,12501,2,-
4,https://www.procyclingstats.com/team/cofidis-2022,Cofidis 2022,19,7626,11,10
...,...,...,...,...,...,...
135,https://www.procyclingstats.com/team/uno-x-nor...,Uno-X Norwegian Development Team 2019,5,444,67,-
136,https://www.procyclingstats.com/team/team-qhub...,Team Qhubeka NextHash 2021,5,3874,21,-
137,https://www.procyclingstats.com/team/team-qhub...,Team Qhubeka ASSOS 2021,5,3874,21,-
138,https://www.procyclingstats.com/team/ntt-pro-c...,NTT Pro Cycling 2020,8,3075,21,-


In [529]:
result_team_df["Team Name"]=result_team_df["Team Name"].str.replace(' 2022', '')
result_team_df["Team Name"]=result_team_df["Team Name"].str.replace(' 2021', '')
result_team_df["Team Name"]=result_team_df["Team Name"].str.replace(' 2020', '')
result_team_df["Team Name"]=result_team_df["Team Name"].str.replace(' 2019', '')
team_history_performance = pd.concat([team_df, result_team_df], axis=1)
team_history_performance

Unnamed: 0.1,Unnamed: 0,team_code_history,team_history_url,team_url_2023,URL,Team Name,Wins,Points,PCS Ranking,UCI Ranking
0,5,bora-hansgrohe-2022,https://www.procyclingstats.com/team/bora-hans...,bora-hansgrohe-2023,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe,30,9936,4,4
1,6,bora-hansgrohe-2021,https://www.procyclingstats.com/team/bora-hans...,bora-hansgrohe-2023,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe,30,7761,6,-
2,7,bora-hansgrohe-2020,https://www.procyclingstats.com/team/bora-hans...,bora-hansgrohe-2023,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe,21,6557,6,-
3,8,bora-hansgrohe-2019,https://www.procyclingstats.com/team/bora-hans...,bora-hansgrohe-2023,https://www.procyclingstats.com/team/bora-hans...,BORA - hansgrohe,47,12501,2,-
4,3,cofidis-2022,https://www.procyclingstats.com/team/cofidis-2022,cofidis-2023,https://www.procyclingstats.com/team/cofidis-2022,Cofidis,19,7626,11,10
...,...,...,...,...,...,...,...,...,...,...
135,9,uno-x-norwegian-development-team-2019,https://www.procyclingstats.com/team/uno-x-nor...,uno-x-pro-cycling-team-2023,https://www.procyclingstats.com/team/uno-x-nor...,Uno-X Norwegian Development Team,5,444,67,-
136,3,team-qhubeka-nexthash-2021,https://www.procyclingstats.com/team/team-qhub...,q365-pro-cycing-team,https://www.procyclingstats.com/team/team-qhub...,Team Qhubeka NextHash,5,3874,21,-
137,4,team-qhubeka-assos-2021,https://www.procyclingstats.com/team/team-qhub...,q365-pro-cycing-team,https://www.procyclingstats.com/team/team-qhub...,Team Qhubeka ASSOS,5,3874,21,-
138,5,ntt-pro-cycling-2020,https://www.procyclingstats.com/team/ntt-pro-c...,q365-pro-cycing-team,https://www.procyclingstats.com/team/ntt-pro-c...,NTT Pro Cycling,8,3075,21,-


In [530]:
team_history_performance.to_csv('data/teams/team_history_performance.csv')

# Scrapping Riders info

In [198]:
# function that scraps all info from riders
def extract_rider_info(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # initialize all keys to None
    rider_info = {
        'Name': None,
        'Team': None,
        'Date of birth': None,
        'Nationality': None,
        'Weight': None,
        'Height': None,
        'Place of birth': None,
        'One day races': None,
        'GC': None,
        'Time trial': None,
        'Sprint': None,
        'Climber': None,
        'PCS Ranking': None,
        'UCI World': None,
        'Number Win': None,
        'GC Win': None,
        'Oneday races': None,
        'ITT': None,
        'PCS Point 2023': None,
        'PCS Ranking 2023': None,
        'PCS Point 2022': None,
        'PCS Ranking 2022': None,
        'PCS Point 2021': None,
        'PCS Ranking 2021': None,
        'PCS Point 2020': None,
        'PCS Ranking 2020': None,
        'PCS Point 2019': None,
        'PCS Ranking 2019': None,
    }
    
    table = soup.find('ul', {'class': 'clear'})
    table_2
    # check if infolist is None (i.e., the element was not found)
    
    for row in table.find_all('li')[1:]:
        cells = row.find_all('div')
        gear_type = cells[0].text.strip()
        brand = cells[1].text.strip()
        if gear_type in team_gear:
            team_gear[gear_type] = brand
        else:
            team_gear['Other'] += f"{gear_type} - {brand}\n"
    
    # Store the data in a pandas DataFrame
    df = pd.DataFrame(team_gear, index=[0])
 #   df = pd.DataFrame(team_gear.items(), columns=['Gear Type', 'Brand'])
  #  df = df.pivot(columns='Gear Type', values='Brand').reset_index(drop=True)
    return df

In [357]:
# function to get info from each rider
def get_uniquerider_infos(rider_url):
    print('Starting get_uniquerider_infos('+rider_url+')')

    rider_soup = download_soup("https://www.procyclingstats.com/rider/" + rider_url)
    rider_infos = pd.Series()
    
#     try:
#         fullname = rider_soup.find('title').text.encode('latin-1', 'ignore').decode('utf-8', 'ignore')
#     except(UnicodeEncodeError):
#         fullname = rider_soup.find('title').text
    rider_infos["fullname"] = rider_soup.find('title').text
    
#     try:
#         team = rider_soup.find('span', class_='red').text.encode('latin-1', 'ignore').decode('utf-8', 'ignore')
#     except(UnicodeEncodeError):
#         team = rider_soup.find('span', class_='red').text
    try:
        team = rider_soup.find('span', class_='red').text
    except:
        team = 'noteam'
    rider_infos["team"] = team    
        
    inter_soup = rider_soup.find('div', {"class":"rdr-info-cont"})
    list_birthdate = inter_soup.contents[1:4]
    try:
        birthdate_tmp = pd.datetime.strptime(list_birthdate[0] + list_birthdate[2][:-5], " %d %B %Y").date()
    except(TypeError):
        print(list_birthdate)
        birthdate_tmp = list_birthdate
    except:
        birthdate_tmp = pd.datetime.strptime(list_birthdate[0] + list_birthdate[2], " %d %B %Y").date()
    rider_infos["birthdate"] = birthdate_tmp
    #birthdate = int(birthdate_tmp.year*10000 + birthdate_tmp.month*100 + birthdate_tmp.day)
    
    try:
        country = rider_soup.find('a', class_='black').text.encode('latin-1', 'ignore').decode('utf-8', 'ignore')
    except(UnicodeEncodeError):
        country = rider_soup.find('a', class_='black').text
    rider_infos["country"] = country
    
    try:
        height = float(rider_soup.find(text='Height:').next.split()[0])
    except:
        height = 0
    rider_infos["height"] = height    
        
    try:
        weight = float(rider_soup.find(text='Weight:').next.split()[0])
    except:
        weight = 0
    rider_infos["weight"] = weight
    
    try:
        birthplace = rider_soup.find('b', text='Place of birth:').find_next_sibling('a').text.strip() 
    except:
        birthplace = "None" 
    rider_infos["birthplace"] = birthplace   

    try:
        bg_green_div = rider_soup.find('div', class_='bg green')
        pnt_div_one = bg_green_div.find_next('div', class_='pnt')
    except:
        pnt_div_one = 0
    rider_infos["one_day_race_pts"] = pnt_div_one

    try:
        bg_red_div = rider_soup.find('div', class_='bg red')
        pnt_div_gc = bg_red_div.find_next('div', class_='pnt')
    except:
        pnt_div_gc = 0
    rider_infos["gc_pts"] = pnt_div_gc

    try:
        bg_blue_div = rider_soup.find('div', class_='bg blue')
        pnt_div_tt = bg_blue_div.find_next('div', class_='pnt')
    except:
        pnt_div_tt = 0
    rider_infos["tt_pts"] = pnt_div_tt
    
    try:
        bg_orange_div = rider_soup.find('div', class_='bg orange')
        pnt_div_sprint = bg_orange_div.find_next('div', class_='pnt')
    except:
        pnt_div_sprint = 0
    rider_infos["sprint_pts"] = pnt_div_sprint
 
    try:
        bg_purple_div = rider_soup.find('div', class_='bg purple')
        pnt_div_climb = bg_purple_div.find_next('div', class_='pnt')
    except:
        pnt_div_climb = 0
    rider_infos["climb_pts"] = pnt_div_climb
 
    try:
        rnk_div = rider_soup.find('div', text='PCS Ranking').find_next_sibling('div', class_='rnk')
        pcs_rnk = rnk_div.text.strip()
        pcs_rnk = int(pcs_rnk)
    except:
        pcs_rnk = 0
    rider_infos["pcs_ranking"] = pcs_rnk

    try:
        uci_rnk_div = rider_soup.find('div', text='UCI World').find_next_sibling('div', class_='rnk')
        uci_rnk = uci_rnk_div.text.strip()
        uci_rnk = int(uci_rnk)
    except:
        uci_rnk = 0
    rider_infos["uci_ranking"] = uci_rnk
    
    try:
        nr_div = rider_soup.find('div', class_='nr')
        value_nr = nr_div.text.strip()
        value_nr = int(value_nr)
    except:
        value_nr = 0
    rider_infos["number_wins"] = value_nr

#    try:
#        nr_gc = soup.find(text='GC').find_next('div', {'class': 'info'}).text.replace('GC (', '').replace(')', '')
#    except:
#        nr_gc = 0
#    rider_infos["gc_wins"] = nr_gc

#    try:
#        oneday_div = soup.find(text='Oneday races').find_next('div', {'class': 'info fs11'})
#        oneday_count = oneday_div.text.replace('Oneday races (', '').replace(')', '') if oneday_div else 0
#    except:
#        oneday_count = 0
#    rider_infos["one_day_wins"] = oneday_count

#    try:
#        itt_nr = kpi_list.find('div', text='ITT ').find_next('div', {'class': 'info'}).text
#    except:
#        itt_nr = 0
#    rider_infos["itt_wins"] = itt_nr

    try:
        row_2023 = rider_soup.find('td', {'class': 'season'}, text='2023').find_parent('tr')
        pts_pcs_2023 = row_2023.find('div', {'class': 'bg left green2'}).find_next_sibling('span').text
        rank_pcs_2023 = row_2023.find('td', {'class': 'ac'}).text
    except:
        pts_pcs_2023 = 0
        rank_pcs_2023 = 0
    rider_infos["pts_pcs_2023"] = pts_pcs_2023
    rider_infos["rank_pcs_2023"] = rank_pcs_2023
    
    try:
        row_2022 = rider_soup.find('td', {'class': 'season'}, text='2022').find_parent('tr')
        pts_pcs_2022 = row_2022.find('div', {'class': 'bg left green2'}).find_next_sibling('span').text
        rank_pcs_2022 = row_2022.find('td', {'class': 'ac'}).text
    except:
        pts_pcs_2022 = 0
        rank_pcs_2022 = 0
    rider_infos["pts_pcs_2022"] = pts_pcs_2022
    rider_infos["rank_pcs_2022"] = rank_pcs_2022
    try:
        row_2021 = rider_soup.find('td', {'class': 'season'}, text='2021').find_parent('tr')
        pts_pcs_2021 = row_2021.find('div', {'class': 'bg left green2'}).find_next_sibling('span').text
        rank_pcs_2021 = row_2021.find('td', {'class': 'ac'}).text
    except:
        pts_pcs_2021 = 0
        rank_pcs_2021 = 0
    rider_infos["pts_pcs_2021"] = pts_pcs_2021
    rider_infos["rank_pcs_2021"] = rank_pcs_2021

    try:
        row_2020 = rider_soup.find('td', {'class': 'season'}, text='2020').find_parent('tr')
        pts_pcs_2020 = row_2020.find('div', {'class': 'bg left green2'}).find_next_sibling('span').text
        rank_pcs_2020 = row_2020.find('td', {'class': 'ac'}).text
    except:
        pts_pcs_2020 = 0
        rank_pcs_2020 = 0
    rider_infos["pts_pcs_2020"] = pts_pcs_2020
    rider_infos["rank_pcs_2020"] = rank_pcs_2020

    try:
        row_2019 = rider_soup.find('td', {'class': 'season'}, text='2019').find_parent('tr')
        pts_pcs_2019 = row_2019.find('div', {'class': 'bg left green2'}).find_next_sibling('span').text
        rank_pcs_2019 = row_2019.find('td', {'class': 'ac'}).text
    except:
        pts_pcs_2019 = 0
        rank_pcs_2019 = 0
    rider_infos["pts_pcs_2019"] = pts_pcs_2019
    rider_infos["rank_pcs_2019"] = rank_pcs_2019
        
    rider_infos["rider_url"] = rider_url
    
    print('Finished get_uniquerider_inf('+rider_url+')')
    return rider_infos

In [358]:
# testing if the function works
get_uniquerider_infos("valentin-madouas")

Starting get_uniquerider_infos(valentin-madouas)
Starting request for: https://www.procyclingstats.com/rider/valentin-madouas
Successful request!
Finished get_uniquerider_inf(valentin-madouas)


  rider_infos = pd.Series()
  birthdate_tmp = pd.datetime.strptime(list_birthdate[0] + list_birthdate[2][:-5], " %d %B %Y").date()


fullname            Valentin  Madouas 
team                    Groupama - FDJ
birthdate                   1996-07-12
country                         France
height                            1.79
weight                            71.0
birthplace                       Brest
one_day_race_pts                [1853]
gc_pts                           [924]
tt_pts                            [90]
sprint_pts                       [136]
climb_pts                       [1814]
pcs_ranking                         27
uci_ranking                         40
number_wins                          5
pts_pcs_2023                       193
rank_pcs_2023                       30
pts_pcs_2022                       936
rank_pcs_2022                       38
pts_pcs_2021                       525
rank_pcs_2021                       92
pts_pcs_2020                       364
rank_pcs_2020                       94
pts_pcs_2019                       525
rank_pcs_2019                      112
rider_url             val

In [359]:
# function that will run a list of riders url and create the df
def get_allriders_info():
    print('Starting get_allriders_info')
    
    df_rider_urls = pd.read_csv("data/rider_urls_code.csv")
    n_riders = len(df_rider_urls)
    df_rider_infos = pd.DataFrame(columns=["fullname", "team", "birthdate", "country", "height", "weight",
                                           "birthplace","one_day_race_pts ","gc_pts","tt_pts","sprint_pts","climb_pts",
                                           "pcs_ranking","uci_ranking","number_wins","pts_pcs_2023","rank_pcs_2023",
                                           "pts_pcs_2022","rank_pcs_2022","pts_pcs_2021","rank_pcs_2021",
                                           "pts_pcs_2020","rank_pcs_2020","pts_pcs_2019","rank_pcs_2019","rider_url"])
    
    for idx, row in df_rider_urls.iterrows():
        if idx < 0:
            pass
        else:
            # timer to not make too many requests too fast
            timer = 0.5 + 0.5 * random.random()
            time.sleep(timer)
            clear_output(wait=True)
            add = get_uniquerider_infos(row["rider_code"])
            print("Rider n° : " + str(idx) + " / " + str(n_riders), flush=True) # should have put idx+1 (as it takes index)
            df_rider_infos = df_rider_infos.append(add, ignore_index=True)   
            if idx % 100 == 0:
                df_rider_infos.to_csv("data/rider_infos.csv")
    
    df_rider_infos.to_csv("data/rider_infos.csv")
    
    return df_rider_infos

In [360]:
# generating the outputs
output = get_allriders_info()

Starting get_uniquerider_infos(marcel-camprubi)
Starting request for: https://www.procyclingstats.com/rider/marcel-camprubi
Successful request!
Finished get_uniquerider_inf(marcel-camprubi)
Rider n° : 935 / 936


  rider_infos = pd.Series()
  birthdate_tmp = pd.datetime.strptime(list_birthdate[0] + list_birthdate[2][:-5], " %d %B %Y").date()
  df_rider_infos = df_rider_infos.append(add, ignore_index=True)


# Scrapping race results

In [3]:
# function to get the race results with following information: rank of race, GC rank (not in one day race), 
# timelag on GC (not in one day race), BIB (identificaiton number specific to the race), rider name, 
#age of rider, team of rider, uci point, pcs point, time (or difference vs #1), race_code
# Note, this code ignores Team Time Trial which are too specific exercices, have different format on PCS website

def create_dataframe_from_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None
    
    soup = BeautifulSoup(response.content, "html.parser")
    
    rows = soup.find_all('tr')
    
    columns = [th.text.strip() for th in rows[0].find_all('th')]
    
    data = []
    
    for row in rows:
        values = [td.text.strip() for td in row.find_all('td')]
    
        # if the row contains a rider (i.e., it's not a header or footer row), add it to the data list
        
        # following line does not apply to one day race => to put in comment when scrapping for one day races and remove indentation on "data.append(values)"
        if (values and values[2].startswith('+')) or (values and values[0] == 'DNF') or (values and values[0] == 'DNS'):
            data.append(values)
        
    df = pd.DataFrame(data, columns=columns)
    df['race_code'] = url.replace('https://www.procyclingstats.com/race/','')
    
    return df

In [4]:
# function that runs the scrapping of races results for list of urls
def create_dataframe_from_urls(df, url_column):
    num_urls = len(df[url_column])
    dataframes=[]
    for i, url in enumerate(df[url_column]):
        print(f"\rProcessing URL {i+1}/{num_urls}: {url}", end="")
        dataframes.append(create_dataframe_from_url(url))
        # random delay to avoid getting banned
        timer = 0.5 + 0.5 * random.random()
        time.sleep(timer)
    # concatenate all the dataframes in the list into a single dataframe
    result_df = pd.concat(dataframes, ignore_index=True)
    return result_df

In [5]:
# uploading the dataframe
df_tour = pd.read_csv('data/race_urls_final/stage_gt_urls_tour.csv')
df_giro = pd.read_csv('data/race_urls_final/stage_gt_urls_giro.csv')
df_vuelta = pd.read_csv('data/race_urls_final/stage_gt_urls_vuelta.csv')
df_one_day_2019 = pd.read_csv('data/race_urls_final/one_day_race_urls_2019.csv')
df_one_day_2020 = pd.read_csv('data/race_urls_final/one_day_race_urls_2020.csv')
df_one_day_2021 = pd.read_csv('data/race_urls_final/one_day_race_urls_2021.csv')
df_one_day_2022 = pd.read_csv('data/race_urls_final/one_day_race_urls_2022.csv')
df_one_day_2023 = pd.read_csv('data/race_urls_final/one_day_race_urls_2023.csv')
df_stage_2019 = pd.read_csv('data/race_urls_final/stage_race_2019.csv')
df_stage_2020 = pd.read_csv('data/race_urls_final/stage_race_2020.csv')
df_stage_2021 = pd.read_csv('data/race_urls_final/stage_race_2021.csv')
df_stage_2022 = pd.read_csv('data/race_urls_final/stage_race_2022.csv')
df_stage_2023 = pd.read_csv('data/race_urls_final/stage_race_2023.csv')

In [None]:
create_dataframe_from_url()

In [409]:
# In case there is an issue with one of the links, splitting the process
# generating the result with Tour de France stages
df_result_tour=create_dataframe_from_urls(df_tour, "race_url")
df_result_tour.to_csv('data/result_tour.csv')

Processing URL 84/84: https://www.procyclingstats.com/race/tour-de-france/2022/stage-21

In [410]:
# generating the result with Giro d'Italia stages
df_result_giro=create_dataframe_from_urls(df_giro, "race_url")
df_result_giro.to_csv('data/result_giro.csv')

Processing URL 81/81: https://www.procyclingstats.com/race/giro-d-italia/2022/stage-21

In [6]:
# generating the result with Vuelta a Espana stages
df_result_vuelta=create_dataframe_from_urls(df_vuelta, "race_url")
df_result_vuelta.to_csv('data/result_vuelta.csv')

Processing URL 81/81: https://www.procyclingstats.com/race/vuelta-a-espana/2022/stage-21

In [423]:
# generating the result with other stage races of world tour series 2019
df_result_stage_2019=create_dataframe_from_urls(df_stage_2019, "stage_url")
df_result_stage_2019.to_csv('data/result_stage_2019.csv')

Processing URL 95/95: https://www.procyclingstats.com/race/tour-de-suisse/2019/stage-87e-7ge-6

In [424]:
# generating the result with other stage races of world tour series 2020
df_result_stage_2020=create_dataframe_from_urls(df_stage_2020, "stage_url")
df_result_stage_2020.to_csv('data/result_stage_2020.csv')

Processing URL 41/41: https://www.procyclingstats.com/race/tirreno-adriatico/2020/stage-8

In [425]:
# generating the result with other stage races of world tour series 2021
df_result_stage_2021=create_dataframe_from_urls(df_stage_2021, "stage_url")
df_result_stage_2021.to_csv('data/result_stage_2021.csv')

Processing URL 70/70: https://www.procyclingstats.com/race/tour-de-suisse/2021/stage-87-7age-6

In [426]:
# generating the result with other stage races of world tour series 2022
df_result_stage_2022=create_dataframe_from_urls(df_stage_2022, "stage_url")
df_result_stage_2022.to_csv('data/result_stage_2022.csv')

Processing URL 63/63: https://www.procyclingstats.com/race/tour-de-suisse/2022/stage-87-7age-6

In [6]:
# generating the result with other stage races of world tour series 2023
df_result_stage_2023=create_dataframe_from_urls(df_stage_2023, "stage_url")
df_result_stage_2023.to_csv('data/result_stage_2023.csv')

Processing URL 12/12: https://www.procyclingstats.com/race/uae-tour/2023/stage-7tage-5

In [436]:
# generating the result for one day race 2019
df_result_one_2019=create_dataframe_from_urls(df_one_day_2019, "race_url")
df_result_one_2019.to_csv('data/result_one_day_2019.csv')

Processing URL 48/48: https://www.procyclingstats.com/race/l-etoile-d-or/2019/20191919190192019

In [439]:
# generating the result for one day race 2020
df_result_one_2020=create_dataframe_from_urls(df_one_day_2020, "race_url")
df_result_one_2020.to_csv('data/result_one_day_2020.csv')

Processing URL 29/29: https://www.procyclingstats.com/race/race-torquay/202020i/20202020nne/2020

In [440]:
# generating the result for one day race 2021
df_result_one_2021=create_dataframe_from_urls(df_one_day_2021, "race_url")
df_result_one_2021.to_csv('data/result_one_day_2021.csv')

Processing URL 103/103: https://www.procyclingstats.com/race/olympic-games-itt/20212121212021lde/20211

In [444]:
# generating the result for one day race 2022
df_result_one_2022=create_dataframe_from_urls(df_one_day_2022, "race_url")
df_result_one_2022.to_csv('data/result_one_day_2022.csv')

Processing URL 110/110: https://www.procyclingstats.com/race/japan-cup/20222222/2022202222022lde/20222

In [445]:
# generating the result for one day race 2023
df_result_one_2023=create_dataframe_from_urls(df_one_day_2023, "race_url")
df_result_one_2023.to_csv('data/result_one_day_2023.csv')

Processing URL 20/20: https://www.procyclingstats.com/race/trofeo-laigueglia/202320233or/2023cia/2023