In [1]:
import numpy as np
import pandas as pd
import re
import requests
import time
from collections import defaultdict
import datetime
from bs4 import BeautifulSoup

In [2]:
# Accumulator for every scraped result row; get_results() appends to it.
# It is created empty (columns only). Passing `index = [0]` here would add a
# placeholder row filled with NaN that every consumer then has to filter out.
df_race_results = pd.DataFrame(columns = [
    'RACE',
    'YEAR',
    'STAGE_ID',
    'STAGE_DATE',
    'STAGE_NAME',
    'STAGE_PROFILE',
    'STAGE_TYPE',
    'RANK',
    'RIDER',
    'TEAM',
    'UCI_POINTS',
    'PCS_POINTS',
    'UPDATE_TIMESTAMP'
])

In [None]:
def _fetch_soup(url, timeout):
    """Download ``url`` and return the response body parsed with ``html.parser``."""
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, "html.parser")


def _parse_stage_rows(rows, year):
    """Build one description dict per stage from the ``<tr>`` rows of the route table."""
    stages = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 3:
            continue
        stage_name = cols[2].text.strip()
        if not stage_name:
            continue

        # Text in parentheses is taken as the stage type; a row named exactly
        # 'Prologue' is a prologue; everything else is a normal stage.
        type_search = re.findall(r"\((.+)\)", stage_name)
        if type_search:
            stage_type = type_search[0]
        elif stage_name == 'Prologue':
            stage_type = 'Prologue'
        else:
            stage_type = 'Normal'

        if stage_type == 'Prologue':
            stage_id = 'stage-0'
        else:
            # Appending ' (' guarantees a space after the stage number even when
            # the name is just "Stage N".
            number_search = re.findall(r"Stage (\w+) ", stage_name + ' (')
            if not number_search:
                # No "Stage N" label, so no result page can be derived from this
                # row (presumably rest days or similar - TODO confirm).
                continue
            stage_id = 'stage-' + number_search[0]

        # The date cell holds day/month only; the year comes from the race link.
        stage_date = datetime.datetime.strptime(
            cols[0].text.strip() + '/' + year, "%d/%m/%Y"
        ).strftime("%Y-%m-%d")
        profile_search = re.findall(r"icon profile (p\d)", str(cols[1]))

        stages.append({
            'StageID': stage_id,
            'StageDate': stage_date,
            'StageName': stage_name,
            'StageProfile': (profile_search[0] if profile_search else 'Unknown'),
            'StageType': stage_type
        })
    return stages


def _parse_ttt_rows(rows):
    """Flatten a ``results-ttt`` table: a row with a non-empty first cell sets the
    current rank/team, every other row is a rider of that team."""
    results = []
    rank = ''
    team = ''
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 2:
            continue
        if cols[0].text != '':
            rank = cols[0].text.strip()
            team = cols[1].text.strip()
            continue

        rider = cols[1].text.strip()
        # Keep only the part before " <nbsp>" when that separator is present.
        name_search = re.findall(r"(.+) \xa0", rider)
        if name_search:
            rider = name_search[0]
        results.append({
            'RANK': rank,
            'RIDER': rider.title(),
            'TEAM': team,
            'UCI_POINTS': 0,
            'PCS_POINTS': 0
        })
    return results


def _pick_results_table(tables, want_gc):
    """Return the first table whose header has 'UCI' and 'Pnt' and whose 'Prev'
    column is present exactly when ``want_gc`` is true.

    Falls back to the first table of the page (or None when there is none). The
    choice is made afresh for every page, never carried over from a previous one.
    """
    for table in tables:
        thead = table.find("thead")
        if thead is None:
            continue
        headers = [th.text.strip() for th in thead.find_all("th")]
        if "UCI" in headers and "Pnt" in headers and ("Prev" in headers) == want_gc:
            return table
    return tables[0] if tables else None


def _clean_rider_name(raw_rider, team):
    """Remove the team name and the 'fav_gc' marker from a rider cell and title-case it."""
    name = raw_rider.replace(team, "").strip().title()
    # After title-casing the marker reads 'fav_Gc' when glued to the name and
    # ' Fav_Gc' when separated by a space; both spellings are removed.
    name = name.replace("fav_Gc", "").strip().title()
    name = name.replace(" Fav_Gc", "").strip().title()
    return name


def _parse_results_table(table, with_points):
    """Extract rank / rider / team (and points when ``with_points``) from a results table.

    Columns are located by header text ('Rnk', 'Rider', 'Team', 'UCI', 'Pnt').
    Returns an empty list when the table lacks a header, a body, or one of the
    three mandatory columns.
    """
    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        return []

    col_index = {}
    for position, th in enumerate(thead.find_all("th")):
        col_index.setdefault(th.text.strip(), position)
    if not all(name in col_index for name in ('Rnk', 'Rider', 'Team')):
        return []
    last_needed = max(col_index['Rnk'], col_index['Rider'], col_index['Team'])

    def points(cols, header):
        # Points are only read where requested and available; otherwise 0.
        position = col_index.get(header)
        if not with_points or position is None or position >= len(cols):
            return 0
        return cols[position].text.strip()

    results = []
    for row in tbody.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 3 or len(cols) <= last_needed:
            continue
        team = cols[col_index['Team']].text.strip()
        results.append({
            'RANK': cols[col_index['Rnk']].text.strip(),
            'RIDER': _clean_rider_name(cols[col_index['Rider']].text.strip(), team),
            'TEAM': team,
            'UCI_POINTS': points(cols, 'UCI'),
            'PCS_POINTS': points(cols, 'Pnt')
        })
    return results


def get_results(link, timeout=30):
    """Scrape all results of one race and append them to the global ``df_race_results``.

    The race's ``/route/`` page decides whether it is a stage race (more than two
    rows in the first ``table.basic``) or a one day race. For a stage race every
    stage page plus ``/gc`` is fetched; for a one day race only ``/result``.
    The path of each page is printed before it is requested.

    Args:
        link: Race URL of the form ``.../race/<race-name>/<yyyy>`` with no
            trailing slash (an IndexError is raised otherwise).
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the notebook.

    Returns:
        ``0`` when the route page is a "Page not found" page or contains no
        route table; otherwise the updated global ``df_race_results``.
    """
    global df_race_results

    # Base values
    RACE = re.findall(r"/race/(.+)/\d{4}", link)[0]
    YEAR = re.findall(r"\d{4}$", link)[0]
    UPDATE_TIMESTAMP = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Find out if race is one day or stage race
    soup = _fetch_soup(link + '/route/', timeout)
    title = soup.title.text if soup.title is not None else ''
    if title.startswith("Page not found"):
        return 0

    route_tables = soup.find_all("table", class_="basic")
    route_body = route_tables[0].find("tbody") if route_tables else None
    if route_body is None:
        print("No route table found for", link)
        return 0
    route_rows = route_body.find_all("tr")

    if len(route_rows) > 2:
        stages = _parse_stage_rows(route_rows, YEAR)
        result_paths = ['/' + stage['StageID'] for stage in stages] + ['/gc']
    else:
        stages = []
        result_paths = ['/result']

    # Get stage results
    all_results = {}
    for result in result_paths:
        print(result)
        print("=============")

        # Result pages hang directly off the race link; the prologue has its own
        # page name instead of 'stage-0'.
        page_path = "/prologue" if result == "/stage-0" else result
        soup = _fetch_soup(link + page_path, timeout)

        ttt_tables = soup.find_all("table", class_="results-ttt")
        if ttt_tables:
            ttt_body = ttt_tables[0].find("tbody")
            ttt_rows = ttt_body.find_all("tr") if ttt_body is not None else []
            all_results[result] = _parse_ttt_rows(ttt_rows)
            continue

        table = _pick_results_table(soup.find_all("table"), want_gc=(result == "/gc"))
        if table is None:
            all_results[result] = []
        else:
            all_results[result] = _parse_results_table(
                table, with_points=(result in ('/gc', '/result'))
            )

    # Stage id -> [date, name, profile, type]
    stage_lookup = {
        stage['StageID']: [
            stage['StageDate'],
            stage['StageName'],
            stage['StageProfile'],
            stage['StageType']
        ]
        for stage in stages
    }

    max_date = ''
    records = []
    for result_path, result_rows in all_results.items():
        stage_id = result_path.replace('/', '')
        if stage_id.count('-') == 2:
            stage_id = re.findall(r".+-.+-(.+)$", stage_id)[0]
        if stage_id not in ('gc', 'points', 'kom', 'youth', 'result'):
            stage_info = stage_lookup[stage_id]
            # ISO dates compare correctly as strings.
            max_date = max(max_date, stage_info[0])
        else:
            # Classification rows carry the date of the latest stage seen so far.
            stage_info = [max_date, stage_id, '', '']
        for entry in result_rows:
            records.append({
                'RACE': RACE,
                'YEAR': YEAR,
                'STAGE_ID': stage_id,
                'STAGE_DATE': stage_info[0],
                'STAGE_NAME': stage_info[1],
                'STAGE_PROFILE': stage_info[2],
                'STAGE_TYPE': stage_info[3],
                'RANK': entry['RANK'],
                'RIDER': entry['RIDER'],
                'TEAM': entry['TEAM'],
                'UCI_POINTS': entry['UCI_POINTS'],
                'PCS_POINTS': entry['PCS_POINTS'],
                'UPDATE_TIMESTAMP': UPDATE_TIMESTAMP
            })

    # One concat per race instead of one per result row.
    if records:
        df_race_results = pd.concat(
            [df_race_results, pd.DataFrame(records)], ignore_index = True
        )

    return df_race_results

In [4]:
# Race calendar; the start/end columns are parsed in place so they can be
# compared with datetimes below.
races = pd.read_csv("../data/races.csv")

# Selection window: races that have started, ended less than 10 days ago and
# end less than 30 days from now.
NOW = datetime.datetime.now()
FROM = NOW - datetime.timedelta(days = 10)
TO = NOW + datetime.timedelta(days = 30)

for date_column in ("RaceStart", "RaceEnd"):
    races[date_column] = pd.to_datetime(races[date_column], format = "%Y-%m-%d %H:%M:00")

has_started = races["RaceStart"] < NOW
ended_after_from = races["RaceEnd"] > FROM
ends_before_to = races["RaceEnd"] < TO
RACES_TO_GET = races[has_started & ended_after_from & ends_before_to]

# Scrape each selected race, waiting 15 seconds after every one.
for row_label, race in RACES_TO_GET.iterrows():
    url = race['RaceLink_PCS']
    print(row_label, url)
    race_df = get_results(url)
    time.sleep(15)

43 https://www.procyclingstats.com/race/tour-de-france/2025
/stage-1
/stage-2
/stage-3


KeyboardInterrupt: 

In [None]:
# Previously stored results. RANK is read as text so that stored values compare
# equal to the freshly scraped ones (which are strings) in the top-25 filter
# below, whatever dtype pandas would otherwise infer for the column.
RESULTS = pd.read_csv("../data/results/results_2025_full.csv", dtype = {"RANK": str})

# Replace the stored rows of every race that was just scraped with the new rows.
races_scraped = list(df_race_results["RACE"].dropna().unique())
RESULTS = RESULTS[~RESULTS["RACE"].isin(races_scraped)]
RESULTS = pd.concat([RESULTS, df_race_results])

# Drop rows without a race name, then work on an explicit copy so the column
# assignment below does not target a slice of another frame.
RESULTS = RESULTS[RESULTS["RACE"].notna()].copy()
RESULTS["YEAR"] = RESULTS["YEAR"].astype(int)

# Short version: only ranks 1 to 25.
TOP_RANKS = [str(rank) for rank in range(1, 26)]
RESULTS_SHORT = RESULTS[RESULTS["RANK"].isin(TOP_RANKS)]

In [None]:
# Persist the merged results: the full table and the short table, both written
# without the DataFrame index.
RESULTS.to_csv(path_or_buf = "../data/results/results_2025_full.csv", index = False)
RESULTS_SHORT.to_csv(path_or_buf = "../data/results/results_2025_short.csv", index = False)