In [1]:
!pip install procyclingstats



In [2]:
import pandas as pd
# Read the race list and keep only editions from 2016 onwards, in one chained
# expression so the unfiltered frame never lingers under the same name.
world_tour_races = (
    pd.read_csv('../data/pcs_worldtour_races.csv')
    .query('year >= 2016')
)
world_tour_races

Unnamed: 0,race_name,year,stage_race,race_url
83,tour-down-under,2016,True,race/tour-down-under/2016
84,paris-nice,2016,True,race/paris-nice/2016
85,tirreno-adriatico,2016,True,race/tirreno-adriatico/2016
86,milano-sanremo,2016,False,race/milano-sanremo/2016
87,volta-a-catalunya,2016,True,race/volta-a-catalunya/2016
...,...,...,...,...
322,benelux-tour,2022,True,race/benelux-tour/2022
323,gp-quebec,2022,False,race/gp-quebec/2022
324,gp-montreal,2022,False,race/gp-montreal/2022
325,il-lombardia,2022,False,race/il-lombardia/2022


In [3]:
from procyclingstats import Stage, Race
from tqdm import tqdm

# Empty module-level accumulator tables; add_results_from_stage() appends to
# them via pd.concat as each stage is scraped.
riders = pd.DataFrame(
    columns=['rider_url', 'rider_name', 'nationality'],
)
teams = pd.DataFrame(
    columns=['team_url', 'team_name'],
)
results = pd.DataFrame(
    columns=[
        'rider_url', 'team_url', 'rank', 'status',
        'age', 'time', 'pcs_points', 'uci_points',
    ],
)

def add_results_from_stage(race_url):
    """Scrape one stage (or one-day race) and append it to the global tables.

    Builds a ``Stage`` for ``race_url``, parses it, and concatenates the parsed
    rows onto the module-level ``riders``, ``teams`` and ``results`` frames.
    Any failure to build or parse the stage is reported with ``print`` and the
    stage is skipped, so one bad page does not abort the whole scrape.

    Parameters
    ----------
    race_url : str
        URL handed to ``Stage``, e.g. ``race/paris-nice/2016/stage-3``.

    Returns
    -------
    None
        The function works purely through its effect on the global frames.
    """
    global riders, teams, results

    try:
        stage = Stage(race_url)
    except ValueError:
        print(f"Unable to add results from {race_url} - bad URL")
        return

    try:
        parsed = stage.parse()
    except IndexError:
        print(f"Unable to add results from {race_url} - bad HTML (possibly TTT or cancelled)")
        return
    except Exception:
        # Deliberately best-effort, but NOT a bare ``except:``: that would also
        # swallow KeyboardInterrupt / SystemExit and make this long-running
        # scrape impossible to interrupt from the notebook.
        print(f"Unable to add results from {race_url} - bad HTML")
        return

    full_df = pd.DataFrame(parsed['results'])
    if full_df.empty:
        # An empty results list yields a frame without columns; the column
        # lookups below would raise KeyError and abort the whole run.
        print(f"Unable to add results from {race_url} - no results")
        return

    # Clean links: store bare slugs without the 'rider/' and 'team/' prefixes.
    full_df['rider_url'] = full_df['rider_url'].str.removeprefix('rider/')
    full_df['team_url'] = full_df['team_url'].str.removeprefix('team/')
    full_df['race_url'] = race_url

    # Add stage-level information, broadcast onto every result row.
    full_df['profile_icon'] = parsed['profile_icon']
    full_df['profile_score'] = parsed['profile_score']

    # Riders and teams recur across stages, so de-duplicate after each append.
    rider = full_df[['rider_url', 'rider_name', 'nationality']]
    riders = pd.concat([riders, rider]).drop_duplicates()
    team = full_df[['team_url', 'team_name']]
    teams = pd.concat([teams, team]).drop_duplicates()

    # Results are one row per rider per stage; nothing to de-duplicate.
    result = full_df[[
        'race_url', 'rider_url', 'team_url', 'rank', 'status', 'age', 'time',
        'pcs_points', 'uci_points', 'profile_icon', 'profile_score'
    ]]
    results = pd.concat([results, result])
    
def get_stage_urls(race_url):
    """Return the ``stage_url`` of every stage listed for a race.

    Parses ``Race(f"{race_url}/overview")`` and collects the ``stage_url``
    entry of each item under its ``'stages'`` key, in the order given.
    """
    overview = Race(f"{race_url}/overview").parse()
    stage_urls = []
    for entry in overview['stages']:
        stage_urls.append(entry['stage_url'])
    return stage_urls

# One-day races: the race URL itself is the single "stage" to scrape.
# Iterating the Series directly (rather than .items()) gives tqdm a length, so
# it can show a real progress bar with total and ETA, and drops the unused index.
for race_url in tqdm(world_tour_races.query('not stage_race').race_url,
                     desc='one-day races'):
    add_results_from_stage(race_url)

# Stage races: look up each race's stage URLs, then scrape every stage.
for race_url in tqdm(world_tour_races.query('stage_race').race_url,
                     desc='stage races'):
    for stage_url in get_stage_urls(race_url):
        add_results_from_stage(stage_url)

# Persist the accumulated tables.
# NOTE(review): the default index is written too and is not unique after the
# repeated concats; confirm what downstream readers expect before changing it.
riders.to_csv('../data/pcs_worldtour_riders.csv')
teams.to_csv('../data/pcs_worldtour_teams.csv')
results.to_csv('../data/pcs_worldtour_results.csv')

2it [00:00,  1.98it/s]

Unable to add results from race/e3-harelbeke/2016 - bad HTML


35it [00:20,  1.04s/it]

Unable to add results from race/great-ocean-race/2018 - bad HTML


47it [00:27,  2.10it/s]

Unable to add results from race/eschborn-frankfurt/2018 - bad HTML


79it [00:43,  2.99it/s]

Unable to add results from race/e3-harelbeke/2020 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/eschborn-frankfurt/2020 - bad HTML (possibly TTT or cancelled)


80it [00:43,  2.69it/s]

Unable to add results from race/san-sebastian/2020 - bad HTML (possibly TTT or cancelled)


84it [00:45,  2.13it/s]

Unable to add results from race/ride-london-classic/2020 - bad HTML (possibly TTT or cancelled)


86it [00:46,  1.97it/s]

Unable to add results from race/gp-quebec/2020 - bad HTML (possibly TTT or cancelled)


87it [00:47,  2.36it/s]

Unable to add results from race/gp-montreal/2020 - bad HTML (possibly TTT or cancelled)


89it [00:47,  2.50it/s]

Unable to add results from race/cyclassics-hamburg/2020 - bad HTML (possibly TTT or cancelled)


91it [00:48,  2.19it/s]

Unable to add results from race/amstel-gold-race/2020 - bad HTML (possibly TTT or cancelled)


93it [00:49,  2.59it/s]

Unable to add results from race/dwars-door-vlaanderen/2020 - bad HTML (possibly TTT or cancelled)


96it [00:50,  2.17it/s]

Unable to add results from race/paris-roubaix/2020 - bad HTML (possibly TTT or cancelled)


97it [00:51,  2.13it/s]

Unable to add results from race/great-ocean-race/2021 - bad HTML (possibly TTT or cancelled)


110it [00:57,  2.44it/s]

Unable to add results from race/cyclassics-hamburg/2021 - bad HTML (possibly TTT or cancelled)


112it [00:58,  3.00it/s]

Unable to add results from race/gp-quebec/2021 - bad HTML (possibly TTT or cancelled)


113it [00:58,  3.39it/s]

Unable to add results from race/gp-montreal/2021 - bad HTML (possibly TTT or cancelled)


117it [01:00,  2.60it/s]

Unable to add results from race/great-ocean-race/2022 - bad HTML (possibly TTT or cancelled)


136it [01:08,  1.98it/s]
1it [00:05,  5.44s/it]

Unable to add results from race/paris-nice/2016/stage-3 - bad HTML (possibly TTT or cancelled)


2it [00:10,  5.43s/it]

Unable to add results from race/tirreno-adriatico/2016/stage-5 - bad HTML (possibly TTT or cancelled)


6it [00:28,  4.55s/it]

Unable to add results from race/giro-d-italia/2016/stage-8 - bad HTML


10it [01:12,  9.92s/it]

Unable to add results from race/tour-de-pologne/2016/stage-6 - bad HTML (possibly TTT or cancelled)


17it [01:50,  5.07s/it]

Unable to add results from race/volta-a-catalunya/2017/stage-2 - bad HTML (possibly TTT or cancelled)


33it [03:32,  5.17s/it]

Unable to add results from race/tirreno-adriatico/2018/stage-1 - bad HTML (possibly TTT or cancelled)


35it [03:42,  5.10s/it]

Unable to add results from race/itzulia-basque-country/2018/stage-4 - bad HTML


37it [03:50,  4.45s/it]

Unable to add results from race/giro-d-italia/2018/stage-20 - bad HTML


39it [04:12,  7.17s/it]

Unable to add results from race/dauphine/2018/stage-3 - bad HTML


40it [04:19,  7.03s/it]

Unable to add results from race/tour-de-suisse/2018/stage-1 - bad HTML (possibly TTT or cancelled)


41it [04:26,  7.03s/it]

Unable to add results from race/tour-de-france/2018/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-de-france/2018/stage-9 - bad HTML


48it [05:20,  6.51s/it]

Unable to add results from race/uae-tour/2019/stage-1 - bad HTML (possibly TTT or cancelled)


50it [05:30,  5.81s/it]

Unable to add results from race/tirreno-adriatico/2019/stage-1 - bad HTML (possibly TTT or cancelled)


59it [06:23,  6.01s/it]

Unable to add results from race/tour-de-france/2019/stage-2 - bad HTML (possibly TTT or cancelled)


60it [06:38,  8.63s/it]

Unable to add results from race/tour-de-pologne/2019/stage-4 - bad HTML (possibly TTT or cancelled)


68it [07:21,  4.01s/it]

Unable to add results from race/paris-nice/2020/stage-8 - bad HTML (possibly TTT or cancelled)


72it [07:28,  2.70s/it]

Unable to add results from race/tour-de-france/2020/stage-8 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-de-france/2020/stage-9 - bad HTML (possibly TTT or cancelled)


74it [07:52,  6.95s/it]

Unable to add results from race/benelux-tour/2020/stage-2 - bad HTML (possibly TTT or cancelled)


77it [08:24, 10.21s/it]

Unable to add results from race/tour-of-guangxi/2020/stage-1 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2020/stage-2 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2020/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2020/stage-4 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2020/stage-5 - bad HTML (possibly TTT or cancelled)


78it [08:26,  7.81s/it]

Unable to add results from race/tour-of-guangxi/2020/stage-6 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2021/stage-1 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2021/stage-2 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2021/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2021/stage-4 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2021/stage-5 - bad HTML (possibly TTT or cancelled)


79it [08:29,  6.24s/it]

Unable to add results from race/tour-down-under/2021/stage-6 - bad HTML (possibly TTT or cancelled)


93it [10:08,  6.56s/it]

Unable to add results from race/tour-down-under/2022/stage-1 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2022/stage-2 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2022/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2022/stage-4 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-down-under/2022/stage-5 - bad HTML (possibly TTT or cancelled)


94it [10:10,  5.17s/it]

Unable to add results from race/tour-down-under/2022/stage-6 - bad HTML (possibly TTT or cancelled)


105it [11:22,  7.93s/it]

Unable to add results from race/vuelta-a-espana/2022/stage-1 - bad HTML


106it [11:39, 10.81s/it]

Unable to add results from race/benelux-tour/2022/stage-1 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/benelux-tour/2022/stage-2 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/benelux-tour/2022/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/benelux-tour/2022/stage-4 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/benelux-tour/2022/stage-5 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/benelux-tour/2022/stage-6 - bad HTML (possibly TTT or cancelled)


107it [11:45,  9.19s/it]

Unable to add results from race/benelux-tour/2022/stage-7 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2022/stage-1 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2022/stage-2 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2022/stage-3 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2022/stage-4 - bad HTML (possibly TTT or cancelled)
Unable to add results from race/tour-of-guangxi/2022/stage-5 - bad HTML (possibly TTT or cancelled)


108it [11:48,  6.56s/it]

Unable to add results from race/tour-of-guangxi/2022/stage-6 - bad HTML (possibly TTT or cancelled)



