In [1]:
!pip install procyclingstats



In [6]:
import pandas as pd
# Load the race list, derive each race's URL path ('race/<race_name>/<year>'),
# and keep only the editions from 2016 onwards.
world_tour_races = (
    pd.read_csv('world_tour_races.csv')
    .assign(race_url=lambda races: 'race/' + races['race_name'] + '/' + races['year'].astype(str))
    .query('year >= 2016')
)
world_tour_races

Unnamed: 0,race_name,year,stage_race,race_url
83,tour-down-under,2016,True,race/tour-down-under/2016
84,paris-nice,2016,True,race/paris-nice/2016
85,tirreno-adriatico,2016,True,race/tirreno-adriatico/2016
86,milano-sanremo,2016,False,race/milano-sanremo/2016
87,volta-a-catalunya,2016,True,race/volta-a-catalunya/2016
...,...,...,...,...
322,benelux-tour,2022,True,race/benelux-tour/2022
323,gp-quebec,2022,False,race/gp-quebec/2022
324,gp-montreal,2022,False,race/gp-montreal/2022
325,il-lombardia,2022,False,race/il-lombardia/2022


In [11]:
from procyclingstats import Stage, Race
from tqdm import tqdm

# Accumulator tables that add_results_from_stage appends to.
riders = pd.DataFrame(columns=['rider_url', 'rider_name', 'nationality'])
teams = pd.DataFrame(columns=['team_url', 'team_name'])
# 'race_url' is declared up front (and first) so the schema matches the rows that
# are appended later; every appended result frame carries a 'race_url' column.
results = pd.DataFrame(
    columns=['race_url', 'rider_url', 'team_url', 'rank', 'status', 'age', 'time', 'pcs_points', 'uci_points']
)

def add_results_from_stage(race_url):
    """Scrape one result page and append its rows to the global tables.

    The page is loaded with ``Stage(race_url)`` and parsed with ``.parse()``;
    the ``'results'`` entry of the parsed dict is turned into a DataFrame and
    split into the module-level ``riders``, ``teams`` and ``results`` frames.

    This is best-effort: a failure is reported with ``print`` and the page is
    skipped, leaving the global tables untouched.

    Args:
        race_url: URL path of a one-day race or a single stage, e.g.
            ``'race/milano-sanremo/2016'`` or ``'race/paris-nice/2016/stage-3'``.

    Returns:
        None. The function works by rebinding the ``riders``, ``teams`` and
        ``results`` globals.
    """
    global riders, teams, results

    try:
        stage = Stage(race_url)
    except ValueError:
        print(f"Unable to add results from {race_url} - bad URL")
        return

    try:
        parsed = stage.parse()
    except IndexError:
        # The message guesses at a team time trial; an IndexError is all we know.
        print(f"Unable to add results from {race_url} - bad HTML (possibly TTT)")
        return
    except Exception:
        # Deliberately broad (any parsing failure skips the page), but not a bare
        # ``except:`` so KeyboardInterrupt / SystemExit still stop a long scrape.
        print(f"Unable to add results from {race_url} - bad HTML")
        return

    full_df = pd.DataFrame(parsed['results'])
    if full_df.empty:
        # A column-less frame would raise KeyError on the lookups below.
        print(f"Unable to add results from {race_url} - no results")
        return

    # Clean links: keep only the identifier after the 'rider/' / 'team/' prefix.
    full_df['rider_url'] = full_df['rider_url'].str.removeprefix('rider/')
    full_df['team_url'] = full_df['team_url'].str.removeprefix('team/')
    full_df['race_url'] = race_url

    # Riders and teams recur across pages, so de-duplicate after every append.
    rider = full_df[['rider_url', 'rider_name', 'nationality']]
    riders = pd.concat([riders, rider]).drop_duplicates()
    team = full_df[['team_url', 'team_name']]
    teams = pd.concat([teams, team]).drop_duplicates()

    # One result row per rider per page; these are never de-duplicated.
    result = full_df[['race_url', 'rider_url', 'team_url', 'rank', 'status', 'age', 'time', 'pcs_points', 'uci_points']]
    results = pd.concat([results, result])
    
def get_stage_urls(race_url):
    """Return the stage URLs listed on a stage race's overview page.

    Loads ``Race(f"{race_url}/overview")``, parses it, and collects the
    ``'stage_url'`` of every entry under the parsed ``'stages'`` key, in the
    order they appear.
    """
    overview = Race(f"{race_url}/overview")
    stage_urls = []
    for entry in overview.parse()['stages']:
        stage_urls.append(entry['stage_url'])
    return stage_urls

# One-day races: the race URL itself is passed to add_results_from_stage.
# Iterating the Series directly (instead of .items()) lets tqdm read its length,
# so the bar shows a total and an ETA.
one_day_urls = world_tour_races.query('not stage_race').race_url
for race_url in tqdm(one_day_urls, desc='one-day races'):
    add_results_from_stage(race_url)

# Stage races: look up the stage URLs from the race overview, then scrape each stage.
stage_race_urls = world_tour_races.query('stage_race').race_url
for race_url in tqdm(stage_race_urls, desc='stage races'):
    for stage_url in get_stage_urls(race_url):
        add_results_from_stage(stage_url)

# Persist the accumulated tables.
riders.to_csv('pcs_riders.csv')
teams.to_csv('pcs_teams.csv')
results.to_csv('pcs_results.csv')

1it [00:00,  1.44it/s]

Unable to add results from race/e3-harelbeke/2016 - bad URL


19it [00:09,  2.41it/s]

Unable to add results from race/e3-harelbeke/2017 - bad URL


35it [00:16,  2.35it/s]

Unable to add results from race/great-ocean-race/2018 - bad HTML


38it [00:17,  2.56it/s]

Unable to add results from race/e3-harelbeke/2018 - bad URL


47it [00:20,  2.92it/s]

Unable to add results from race/eschborn-frankfurt/2018 - bad HTML


59it [00:24,  3.26it/s]

Unable to add results from race/e3-harelbeke/2019 - bad URL


77it [00:34,  2.10it/s]

Unable to add results from race/e3-harelbeke/2020 - bad URL


80it [00:34,  3.09it/s]

Unable to add results from race/eschborn-frankfurt/2020 - bad HTML (possibly TTT)
Unable to add results from race/san-sebastian/2020 - bad HTML (possibly TTT)


84it [00:36,  2.86it/s]

Unable to add results from race/ride-london-classic/2020 - bad HTML (possibly TTT)


87it [00:37,  3.62it/s]

Unable to add results from race/gp-quebec/2020 - bad HTML (possibly TTT)
Unable to add results from race/gp-montreal/2020 - bad HTML (possibly TTT)


89it [00:37,  3.82it/s]

Unable to add results from race/cyclassics-hamburg/2020 - bad HTML (possibly TTT)


91it [00:38,  3.97it/s]

Unable to add results from race/amstel-gold-race/2020 - bad HTML (possibly TTT)


93it [00:39,  2.97it/s]

Unable to add results from race/dwars-door-vlaanderen/2020 - bad HTML (possibly TTT)


97it [00:40,  3.72it/s]

Unable to add results from race/paris-roubaix/2020 - bad HTML (possibly TTT)
Unable to add results from race/great-ocean-race/2021 - bad HTML (possibly TTT)


101it [00:42,  2.10it/s]

Unable to add results from race/e3-harelbeke/2021 - bad URL


110it [00:45,  3.16it/s]

Unable to add results from race/cyclassics-hamburg/2021 - bad HTML (possibly TTT)


112it [00:46,  2.71it/s]

Unable to add results from race/gp-quebec/2021 - bad HTML (possibly TTT)


113it [00:46,  3.12it/s]

Unable to add results from race/gp-montreal/2021 - bad HTML (possibly TTT)


117it [00:48,  2.48it/s]

Unable to add results from race/great-ocean-race/2022 - bad HTML (possibly TTT)


121it [00:49,  2.41it/s]

Unable to add results from race/e3-harelbeke/2022 - bad URL


136it [00:57,  2.38it/s]
1it [00:03,  3.28s/it]

Unable to add results from race/paris-nice/2016/stage-3 - bad HTML (possibly TTT)


2it [00:10,  5.32s/it]

Unable to add results from race/tirreno-adriatico/2016/stage-5 - bad HTML (possibly TTT)


6it [00:27,  4.31s/it]

Unable to add results from race/giro-d-italia/2016/stage-8 - bad HTML


10it [01:10,  9.62s/it]

Unable to add results from race/tour-de-pologne/2016/stage-6 - bad HTML (possibly TTT)


17it [01:51,  5.55s/it]

Unable to add results from race/volta-a-catalunya/2017/stage-2 - bad HTML (possibly TTT)


33it [03:33,  4.60s/it]

Unable to add results from race/tirreno-adriatico/2018/stage-1 - bad HTML (possibly TTT)


35it [03:42,  4.60s/it]

Unable to add results from race/itzulia-basque-country/2018/stage-4 - bad HTML


37it [03:49,  4.08s/it]

Unable to add results from race/giro-d-italia/2018/stage-20 - bad HTML


39it [04:09,  6.58s/it]

Unable to add results from race/dauphine/2018/stage-3 - bad HTML


40it [04:15,  6.28s/it]

Unable to add results from race/tour-de-suisse/2018/stage-1 - bad HTML (possibly TTT)


41it [04:21,  6.17s/it]

Unable to add results from race/tour-de-france/2018/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/tour-de-france/2018/stage-9 - bad HTML


48it [05:10,  5.63s/it]

Unable to add results from race/uae-tour/2019/stage-1 - bad HTML (possibly TTT)


50it [05:18,  4.83s/it]

Unable to add results from race/tirreno-adriatico/2019/stage-1 - bad HTML (possibly TTT)


59it [06:09,  6.19s/it]

Unable to add results from race/tour-de-france/2019/stage-2 - bad HTML (possibly TTT)


60it [06:24,  9.02s/it]

Unable to add results from race/tour-de-pologne/2019/stage-4 - bad HTML (possibly TTT)


67it [07:01,  4.80s/it]

Unable to add results from race/paris-nice/2020/stage-8 - bad HTML (possibly TTT)


72it [07:08,  2.54s/it]

Unable to add results from race/tour-de-france/2020/stage-8 - bad HTML (possibly TTT)
Unable to add results from race/tour-de-france/2020/stage-9 - bad HTML (possibly TTT)


74it [07:31,  6.53s/it]

Unable to add results from race/benelux-tour/2020/stage-2 - bad HTML (possibly TTT)


77it [08:03,  9.73s/it]

Unable to add results from race/tour-of-guangxi/2020/stage-1 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2020/stage-2 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2020/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2020/stage-4 - bad HTML (possibly TTT)


78it [08:05,  7.41s/it]

Unable to add results from race/tour-of-guangxi/2020/stage-5 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2020/stage-6 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2021/stage-1 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2021/stage-2 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2021/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2021/stage-4 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2021/stage-5 - bad HTML (possibly TTT)


79it [08:07,  5.77s/it]

Unable to add results from race/tour-down-under/2021/stage-6 - bad HTML (possibly TTT)


93it [09:37,  5.64s/it]

Unable to add results from race/tour-down-under/2022/stage-1 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2022/stage-2 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2022/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2022/stage-4 - bad HTML (possibly TTT)
Unable to add results from race/tour-down-under/2022/stage-5 - bad HTML (possibly TTT)


94it [09:40,  4.77s/it]

Unable to add results from race/tour-down-under/2022/stage-6 - bad HTML (possibly TTT)


106it [11:08,  9.80s/it]

Unable to add results from race/benelux-tour/2022/stage-1 - bad HTML (possibly TTT)
Unable to add results from race/benelux-tour/2022/stage-2 - bad HTML (possibly TTT)
Unable to add results from race/benelux-tour/2022/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/benelux-tour/2022/stage-4 - bad HTML (possibly TTT)
Unable to add results from race/benelux-tour/2022/stage-5 - bad HTML (possibly TTT)


107it [11:09,  7.42s/it]

Unable to add results from race/benelux-tour/2022/stage-6 - bad HTML (possibly TTT)
Unable to add results from race/benelux-tour/2022/stage-7 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2022/stage-1 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2022/stage-2 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2022/stage-3 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2022/stage-4 - bad HTML (possibly TTT)
Unable to add results from race/tour-of-guangxi/2022/stage-5 - bad HTML (possibly TTT)


108it [11:12,  6.22s/it]

Unable to add results from race/tour-of-guangxi/2022/stage-6 - bad HTML (possibly TTT)



