In [1]:
import csv
import datetime
import duckdb
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def _fetch_soup(url):
    """Download `url` and return its content parsed with BeautifulSoup's html.parser."""
    # A timeout keeps a stalled connection from blocking the notebook indefinitely.
    page = requests.get(url, timeout=30)
    return BeautifulSoup(page.content, "html.parser")


def _header_texts(table):
    """Return the stripped <th> texts of a table's <thead>, or [] if it has no <thead>."""
    thead = table.find("thead")
    if thead is None:
        return []
    return [th.text.strip() for th in thead.find_all("th")]


def _body_rows(table):
    """Return the <tr> elements of a table's <tbody>, or [] if it has no <tbody>."""
    tbody = table.find("tbody")
    if tbody is None:
        return []
    return tbody.find_all("tr")


def _classify_stage(stage_name):
    """Derive the stage type from the stage label shown in the route table."""
    if "Prologue" in stage_name:
        return "Prologue"
    if "ITT" in stage_name:
        return "ITT"
    if "TTT" in stage_name:
        return "TTT"
    return "Normal stage"


def _is_result_table(stage_type, headers):
    """Tell whether a table with these column headers is the result table for `stage_type`."""
    if "UCI" not in headers:
        return False
    if stage_type == "Normal stage":
        return "Time won/lost" not in headers
    if stage_type == "GC":
        return "Time won/lost" in headers
    if stage_type in ("ITT", "Prologue"):
        return "Avg" in headers
    if stage_type == "Result":
        return "Time" in headers
    return False


def _one_day_race_date(url_race):
    """Read the 'Startdate:' value from the race's main page; '' when it is not present."""
    soup = _fetch_soup(url_race)
    race_date = ""
    info_list = soup.find("ul", class_="keyvalueList")
    if info_list is None:
        return race_date
    for li in info_list.find_all("li"):
        title_div = li.find("div", class_="title")
        if title_div is not None and "Startdate:" in title_div.text:
            value_div = li.find("div", class_="value")
            if value_div is not None:
                race_date = value_div.text.strip()
    return race_date


def _results_to_scrape(route_soup, url_race):
    """Build the list of result pages to fetch from the parsed route page.

    Only the first table that has a "Date" column is considered. If it has body
    rows the race is treated as a stage race (one entry per stage plus a GC
    entry); otherwise it is treated as a one day race (a single "Result" entry).
    Returns [] when no table has a "Date" column.
    """
    for table in route_soup.find_all("table"):
        headers = _header_texts(table)
        if "Date" not in headers:
            continue

        idx_date = headers.index("Date")
        idx_stage = headers.index("#")
        idx_distance = headers.index("Distance")
        idx_vertical = headers.index("Vertical meters")
        rows = _body_rows(table)

        if not rows:
            # One day race: the date comes from the main page of the race.
            return [{
                "Stage number": 99,
                "Stage name": "Result",
                "Stage link": url_race + "/result",
                "Stage type": "Result",
                "Stage date": _one_day_race_date(url_race),
                "Stage distance": "",
                "Stage vertical meters": ""
            }]

        results = []
        for row_num, row in enumerate(rows):
            cols = row.find_all("td")
            stage_name = cols[idx_stage].text

            # Ignore the final row in the table of all stages
            if stage_name == "":
                continue

            anchor = cols[idx_stage].find("a")
            stage_link = None
            if anchor is not None:
                stage_link = "https://www.procyclingstats.com/" + anchor["href"]

            results.append({
                "Stage number": row_num + 1,
                "Stage name": stage_name,
                "Stage link": stage_link,
                "Stage type": _classify_stage(stage_name),
                "Stage date": cols[idx_date].text,
                "Stage distance": cols[idx_distance].text,
                "Stage vertical meters": cols[idx_vertical].text
            })

        # Also add GC to the list of results to get
        results.append({
            "Stage number": 99,
            "Stage name": "GC",
            "Stage link": url_race + "/gc",
            "Stage type": "GC",
            "Stage date": "",
            "Stage distance": "",
            "Stage vertical meters": ""
        })
        return results

    return []


def _ttt_rider_rows(soup):
    """Extract (rank, rider, team, uci, pcs) tuples from a team time trial result page."""
    results = []
    info_list = soup.find("ul", class_="ttt-results")
    if info_list is None:
        return results

    # Rank and team are carried over from the most recent "w10"/"w90" div seen,
    # so they are initialised once here rather than per list item.
    rank = ""
    team = ""
    for li in info_list.find_all("li"):
        for div in li.find_all("div"):
            # A div without a class attribute yields None; treat it as "no classes".
            classes = div.get("class") or []
            if "w10" in classes:
                rank = div.text.strip()
            elif "w90" in classes:
                team = div.text.strip()
            elif "w100" in classes:
                table = div.find("table")
                if table is None:
                    continue
                for row in table.find_all("tr"):
                    cols = row.find_all("td")
                    # Skip rows that do not carry rider, UCI and PCS cells.
                    if len(cols) < 3:
                        continue
                    results.append((
                        rank,
                        cols[0].text.strip(),
                        team,
                        cols[1].text.strip(),
                        cols[2].text.strip()
                    ))
    return results


def _table_rider_rows(soup, stage_type):
    """Extract (rank, rider, team, uci, pcs) tuples from the first matching result table."""
    for table in soup.find_all("table"):
        headers = _header_texts(table)
        if not _is_result_table(stage_type, headers):
            continue

        idx_rank = headers.index("Rnk")
        idx_rider = headers.index("Rider")
        idx_team = headers.index("Team")
        idx_uci = headers.index("UCI")
        idx_pcs = headers.index("Pnt")
        needed_cols = max(idx_rank, idx_rider, idx_team, idx_uci, idx_pcs) + 1

        results = []
        for row in _body_rows(table):
            cols = row.find_all("td")
            # Rows that are not actual result rows (e.g. info on a relegation)
            # have too few cells and are skipped entirely.
            if len(cols) < 2 or len(cols) < needed_cols:
                continue
            results.append((
                cols[idx_rank].text.strip(),
                cols[idx_rider].text.strip(),
                cols[idx_team].text.strip(),
                cols[idx_uci].text.strip(),
                cols[idx_pcs].text.strip()
            ))
        return results

    return []


def get_results(racename, raceyear):
    """Scrape the results of one race from procyclingstats.com and write them to a CSV file.

    The route page of the race is used to decide whether the race is a stage
    race or a one day race. Every stage result (plus the GC), or the single one
    day result, is then fetched, collected in an in-memory DuckDB table and
    exported to '../data/results_races/results_<racename>-<raceyear>.csv'.

    Args:
        racename: Race identifier as used in the procyclingstats URL.
        raceyear: Edition year; appended to the URL and stored in the RaceYear column.

    Returns:
        0 when the route page has no title or reports "Page not found";
        None otherwise (after the CSV has been written).

    Raises:
        ValueError: if an expected column header is missing from a selected table.
        Errors raised by requests (including timeouts) and DuckDB are not caught.
    """
    # Sleep a little
    time.sleep(3)

    # Set the time and date for the output table
    update_timestamp = str(datetime.datetime.now())[0:19]

    # URLs for race and route
    url_race = "https://www.procyclingstats.com/race/" + racename + "/" + str(raceyear)
    url_route = url_race + "/route/"

    print()
    print("URL_RACE:", url_race)

    # The route page tells whether the race link is valid and what kind of race it is
    route_soup = _fetch_soup(url_route)
    if route_soup.title is None or route_soup.title.text.startswith("Page not found"):
        print("RACE LINK NOT VALID!")
        return 0

    results_to_scrape = _results_to_scrape(route_soup, url_race)

    conn = duckdb.connect()
    try:
        conn.execute("""
    CREATE TABLE ScrapedResults (
        RaceName            TEXT,
        RaceYear            INTEGER,
        StageNumber         INTEGER,
        StageName           TEXT,
        StageType           TEXT,
        StageDate           TEXT,
        RiderRank           TEXT,
        RiderName           TEXT,
        TeamName            TEXT,
        UCIPoints           TEXT,
        PCSPoints           TEXT,
        UpdateTimeStamp     TEXT
    )
    """)

        for r in results_to_scrape:
            time.sleep(2)
            stage_type = r["Stage type"]
            print(r["Stage name"])

            # A stage row without a link has no result page to fetch.
            if not r["Stage link"]:
                continue

            soup = _fetch_soup(r["Stage link"])
            if stage_type == "TTT":
                rider_rows = _ttt_rider_rows(soup)
            else:
                rider_rows = _table_rider_rows(soup, stage_type)

            records = [
                [
                    racename,
                    raceyear,
                    r["Stage number"],
                    r["Stage name"],
                    stage_type,
                    r["Stage date"],
                    rank,
                    rider,
                    team,
                    uci,
                    pcs,
                    update_timestamp
                ]
                for rank, rider, team, uci, pcs in rider_rows
            ]
            # Only call executemany when there is at least one row to insert.
            if records:
                conn.executemany("INSERT INTO ScrapedResults VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", records)

        csv_path = "../data/results_races/results_" + racename + "-" + str(raceyear) + ".csv"
        # The path is embedded in the SQL text, so single quotes are doubled to keep the literal valid.
        conn.execute("COPY ScrapedResults TO '" + csv_path.replace("'", "''") + "' (HEADER, DELIMITER ',')")
    finally:
        # Release the in-memory database even when scraping fails half-way.
        conn.close()

In [3]:
# Time window used to select races: already started, and ending between
# 10 days ago and 30 days from now.
NOW = datetime.datetime.now()
FROM = NOW - datetime.timedelta(days=10)
TO = NOW + datetime.timedelta(days=30)

# Collect the PCS race identifiers of all races inside the window.
RACES = []
with open("../data/races.csv", newline="") as csvfile:
    for row in csv.DictReader(csvfile):
        race_start = datetime.datetime.strptime(row["RaceStart"], "%Y-%m-%d %H:%M:00")
        # Races that have not started yet are skipped before the end date is looked at.
        if race_start >= NOW:
            continue
        race_end = datetime.datetime.strptime(row["RaceEnd"], "%Y-%m-%d %H:%M:00")
        if FROM < race_end < TO:
            RACES.append(row["RaceName_PCS"])

# NOTE(review): the edition year is fixed to 2025 here — confirm it should not follow the race dates.
for race in RACES:
    get_results(race, 2025)


URL_RACE: https://www.procyclingstats.com/race/cyclassics-hamburg/2025
Result

URL_RACE: https://www.procyclingstats.com/race/renewi-tour/2025
Stage 1
Stage 2
Stage 3
Stage 4
Stage 5
GC

URL_RACE: https://www.procyclingstats.com/race/deutschland-tour/2025
Prologue
Stage 1
Stage 2
Stage 3
Stage 4
GC

URL_RACE: https://www.procyclingstats.com/race/vuelta-a-espana/2025
Stage 1
Stage 2
Stage 3
Stage 4
Stage 5 (TTT)
Stage 6
Stage 7
Stage 8
Stage 9
Stage 10
Stage 11
Stage 12
Stage 13
Stage 14
Stage 15
Stage 16
Stage 17
Stage 18 (ITT)
Stage 19
Stage 20
Stage 21
GC
