In [32]:
import pandas as pd
from time import sleep
from tqdm import tqdm
import requests
from io import StringIO
import os

In [2]:
# Year to download, as a 4-digit string; it is passed to get_events() and
# embedded in the names of the CSV files written at the end of the notebook.
TARGET_YEAR = '2023'

In [3]:
def check_YYYY(YYYY):
    """Validate that ``YYYY`` is a four-digit year given as a string.

    Parameters
    ----------
    YYYY : str
        Candidate year, e.g. ``'2023'``.

    Returns
    -------
    bool
        Always ``True``; invalid input raises instead of returning ``False``.

    Raises
    ------
    ValueError
        If ``YYYY`` is not a ``str`` made of exactly four ASCII digits.
    """
    # str.isdigit() on its own also accepts non-ASCII digits (superscripts,
    # Arabic-Indic numerals, ...), so ASCII is required explicitly.
    is_good = (
        isinstance(YYYY, str)
        and len(YYYY) == 4
        and YYYY.isascii()
        and YYYY.isdigit()
    )
    if not is_good:
        raise ValueError(
            f"Bad args: expected a 4-digit year string like '2023', got {YYYY!r}"
        )
    return is_good

In [4]:
def clean_names(old_cols):
    """Return the column names lower-cased and with every '#' removed.

    ``old_cols`` is any iterable of strings; a new list is returned and the
    input is left unmodified.
    """
    cleaned = []
    for name in old_cols:
        cleaned.append(name.replace('#', '').lower())
    return cleaned

In [5]:
def read_html_clean(url, timeout=30):
    """Download ``url`` and return its HTML text with a known-bad character replaced.

    Parameters
    ----------
    url : str
        Page to fetch with an HTTP GET.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled server would block
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The response body with every ``'\\x93'`` replaced by a space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # HTML table parser downstream.
    response.raise_for_status()
    # replacing bad character, which will cause read_html to fail directly, even with encoding='utf8'
    return response.text.replace('\x93', ' ')


In [6]:
def get_events(YYYY):
    """Fetch the rusa.org results-search event listing for one year.

    Parameters
    ----------
    YYYY : str
        Four-digit year string; validated with ``check_YYYY``.

    Returns
    -------
    pandas.DataFrame
        The first HTML table of the search page with lower-cased column
        names, plus four added columns: ``rt_url`` / ``event_url`` (the hrefs
        found in the 'Route' and 'select' cells) and ``rtid`` / ``eid`` (the
        id values parsed out of those hrefs). The row labelled 0 and the
        'select' column are removed.

    Raises
    ------
    KeyError
        If the table has no 'Route' or 'select' column, i.e. the page layout
        is not the one this parser was written for.
    """
    check_YYYY(YYYY)
    url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date={YYYY}&type=&dist=&rtid=&esortby=cert&collapse=0"
    print(url)

    html_input = read_html_clean(url)

    # get versions with and without links
    dat_links = pd.read_html(StringIO(str(html_input)), extract_links='all')[0]
    dat = pd.read_html(StringIO(str(html_input)))[0]

    # With extract_links='all' the header labels are (text, href) tuples, so
    # the columns are addressed as ('Route', None) / ('select', None).
    try:
        route_cells = dat_links[('Route', None)]
        select_cells = dat_links[('select', None)]
    except KeyError as err:
        raise KeyError(
            "expected 'Route' and 'select' columns in the events table; "
            f"found {list(dat_links.columns)}"
        ) from err

    # break apart the (text, href) tuples: column 0 is the text, column 1 the url
    route_ids = pd.DataFrame(route_cells.tolist(), index=dat_links.index)
    event_ids = pd.DataFrame(select_cells.tolist(), index=dat_links.index)

    # Extract ids from the link urls. The patterns capture only the value of
    # the named query parameter (up to the next '&' or the end of the url), so
    # extra or reordered parameters cannot leak into the id.
    dat['rt_url'] = route_ids[1]
    dat['rtid'] = route_ids[1].str.extract(r'[?&]rtid=([^&]*)', expand=False)
    dat['event_url'] = event_ids[1]
    dat['eid'] = event_ids[1].str.extract(r'[?&]eid=([^&]*)', expand=False)

    # clean col names
    dat.columns = [c.lower() for c in dat.columns]

    # drop first row and the 'select' column
    # NOTE(review): row 0 is presumably not a real event row -- confirm against the page.
    dat = dat.drop(0, axis=0)
    dat = dat.drop('select', axis=1)

    return dat

In [7]:
# Download the event listing for TARGET_YEAR (network call; get_events also
# prints the URL it queries) and preview the first rows.
events = get_events(TARGET_YEAR)
events.head()

https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date=2023&type=&dist=&rtid=&esortby=cert&collapse=0


Unnamed: 0,region,club,type,distance,date,route,rt_url,rtid,event_url,eid
1,CA: Los Angeles,Pacific Coast Highway Randonneurs / 905051,ACPB,200,2023/01/01,Santa Barbara Easy 200k,/cgi-bin/routesearch_PF.pl?rtid=1977,1977,/cgi-bin/resultsearch_PF.pl?eid=14213&esortby=...,14213
2,CA: Davis,Davis Bike Club / 905014,ACPB,200,2023/01/01,Colusa - Davis,/cgi-bin/routesearch_PF.pl?rtid=2159,2159,/cgi-bin/resultsearch_PF.pl?eid=14474&esortby=...,14474
3,CA: Davis,Davis Bike Club / 905014,RUSAP,108,2023/01/01,Elmira Ramble,/cgi-bin/routesearch_PF.pl?rtid=2501,2501,/cgi-bin/resultsearch_PF.pl?eid=14473&esortby=...,14473
4,OR: Portland,Oregon Randonneurs / 937020,ACPB,200,2023/01/01,Wine Country 200,/cgi-bin/routesearch_PF.pl?rtid=54,54,/cgi-bin/resultsearch_PF.pl?eid=14436&esortby=...,14436
5,OR: Portland,Oregon Randonneurs / 937020,RUSAP,109,2023/01/01,Wine Country,/cgi-bin/routesearch_PF.pl?rtid=53,53,/cgi-bin/resultsearch_PF.pl?eid=14692&esortby=...,14692


In [8]:
# Preview the last rows of the listing as well.
events.tail()

Unnamed: 0,region,club,type,distance,date,route,rt_url,rtid,event_url,eid
1078,CA: San Francisco,San Francisco Randonneurs / 905030,RUSAP,173,2023/12/24,Del Puerto Canyon Junior,/cgi-bin/routesearch_PF.pl?rtid=3233,3233,/cgi-bin/resultsearch_PF.pl?eid=15816&esortby=...,15816
1079,AZ: Phoenix,Arizona Randonneurs / 903020,ACPB,300,2023/12/30,Vulture Mine 300k,/cgi-bin/routesearch_PF.pl?rtid=1697,1697,/cgi-bin/resultsearch_PF.pl?eid=15817&esortby=...,15817
1080,FL: Northeast,Northeast Florida Randonneurs / 909034,RUSAP,100,2023/12/30,Touch the St George POP,/cgi-bin/routesearch_PF.pl?rtid=3020,3020,/cgi-bin/resultsearch_PF.pl?eid=15831&esortby=...,15831
1081,CA: Davis,Davis Bike Club / 905014,ACPB,200,2023/12/31,Davis - Pope Valley - Davis,/cgi-bin/routesearch_PF.pl?rtid=1200,1200,/cgi-bin/resultsearch_PF.pl?eid=14493&esortby=...,14493
1082,CA: Davis,Davis Bike Club / 905014,RUSAP,102,2023/12/31,Bella Bru 102k,/cgi-bin/routesearch_PF.pl?rtid=2011,2011,/cgi-bin/resultsearch_PF.pl?eid=14492&esortby=...,14492


In [9]:
def get_event_details_results(eid):
    """Fetch the details table and the finisher-results table for one event.

    Parameters
    ----------
    eid : str or int
        Event id; interpolated into the rusa.org results-search URL.

    Returns
    -------
    list
        ``[event_details, event_results]``. Both are DataFrames with cleaned
        column names (see ``clean_names``) and an added ``eid`` column.
        ``event_results`` is an empty DataFrame (without an ``eid`` column)
        when the page contains only one table.
    """
    event_url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?eid={eid}&esortby=cert"
    two_dats = pd.read_html(event_url)

    event_details = two_dats[0]
    event_details.columns = clean_names(list(event_details.columns))
    event_details['eid'] = eid

    if len(two_dats) < 2:
        # only one table came back: no finisher results for this event
        event_results = pd.DataFrame()
    else:
        event_results = two_dats[1]
        event_results.columns = clean_names(list(event_results.columns))
        # drop the rows 'x nonmembers also finished'
        # The column is cast to str first: when no such row is present pandas
        # may parse 'cert' as numeric (no .str accessor), and NaN cells would
        # otherwise yield a NaN mask that cannot be used for filtering.
        if 'cert' in event_results.columns:
            is_summary_row = (
                event_results['cert']
                .astype(str)
                .str.contains('also finished', regex=False)
            )
            event_results = event_results.loc[~is_summary_row].copy()
        event_results['eid'] = eid

    return [event_details, event_results]

In [10]:
# Number of events per event type code.
events.groupby('type').size()

type
ACPB     653
ACPF      14
ACPR       2
RM         6
RUSAB    113
RUSAF     22
RUSAP    262
UAFB      10
dtype: int64

In [11]:
# Event type codes treated as non-team events; only these are kept when
# selecting the events whose results get downloaded.
# Type definitions: https://rusa.org/pages/eventtypes
non_team_types = ['ACPB', 'RM', 'RUSAB', 'RUSAP']

In [12]:
# Event ids of every event whose type is one of non_team_types; these are the
# events for which details and results are downloaded.
event_ids = events.loc[events['type'].isin(non_team_types), 'eid'].tolist()
print(len(event_ids))

1034


In [13]:
# Accumulators filled by the download loop: one details DataFrame and one
# results DataFrame per event. Re-running this cell discards anything
# collected so far.
e_detail_list = []
e_results_list = []

In [22]:
# Ids still to be fetched; initially all of them. This binds the same list
# object as event_ids (no copy is made).
remaining_event_ids = event_ids

In [25]:
# this can fail with too many event_ids (eg 1034 for 2023)
# Both lists are appended to only after a fetch succeeds, so they stay the same
# length if an iteration raises part-way through.
for n_done, e in enumerate(tqdm(remaining_event_ids), start=1):
    this_d_e = get_event_details_results(e)
    e_detail_list.append(this_d_e[0])
    e_results_list.append(this_d_e[1])
    # Pause after every 10th request. Counting iterations keeps the throttle
    # regular; the ids themselves are not consecutive once filtered by type.
    if n_done % 10 == 0:
        sleep(.5)

100%|████████████████████████████████████████████████████████████████████████████| 328/328 [01:47<00:00,  3.05it/s]


In [26]:
# should all be the same: one details table and one results table per
# requested event id
print(len(e_detail_list), len(e_results_list), len(event_ids))

1034 1034 1034


In [24]:
# If the download loop fails before completion, uncomment the line below and run
# this cell, then rerun the loop cell to continue with the ids not yet fetched.
#remaining_event_ids = event_ids[len(e_detail_list):]

'14385'

In [27]:
# Stack the per-event tables into one DataFrame each. pd.concat keeps each
# table's own row labels here (ignore_index is not set).
event_results = pd.concat(e_results_list)
event_details = pd.concat(e_detail_list)

In [None]:
# For when I forgot to filter out team events before getting results:
# their format is different and messes stuff up, plus I don't care about those results right now.
# erl2 = list(compress(e_results_list, 
#                      events.type.isin(non_team_types)))

In [28]:
# Replace the row labels of both combined tables with a fresh 0..n-1 range,
# discarding the old labels instead of keeping them as a column.
event_details = event_details.reset_index(drop=True)
event_results = event_results.reset_index(drop=True)

In [29]:
# Preview the combined finisher results.
event_results.head()

Unnamed: 0,cert,rusa,name,club / acp code,time,medal,eid
0,865373,13488,"AKIYAMA, Bill",Pacific Coast Highway Randonneurs / 905051,12:30,,14213
1,865374,13560,"ALAVA, Victor",Randonneurs USA / 905095,09:28,,14213
2,865375,15640,"ANDREWS, Bradford Todd",Davis Bike Club / 905014,10:54,,14213
3,865376,14547,"ANDREWS, Justin Hale",Pacific Coast Highway Randonneurs / 905051,10:54,,14213
4,865377,7895,"ARITA, Jeffrey Glenn",Pacific Coast Highway Randonneurs / 905051,11:12,,14213


In [30]:
# Preview the combined per-event details.
event_details.head()

Unnamed: 0,region,club,type,distance,date,finishers,dnf,eid
0,CA: Los Angeles,Pacific Coast Highway Randonneurs / 905051,ACPB,200,2023/01/01,26,2.0,14213
1,CA: Davis,Davis Bike Club / 905014,ACPB,200,2023/01/01,17,0.0,14474
2,CA: Davis,Davis Bike Club / 905014,RUSAP,108,2023/01/01,0,,14473
3,OR: Portland,Oregon Randonneurs / 937020,ACPB,200,2023/01/01,5,0.0,14436
4,OR: Portland,Oregon Randonneurs / 937020,RUSAP,109,2023/01/01,3,0.0,14692


In [36]:
# Create the relative 'raw_data' output directory if nothing exists at that
# path yet; the message is printed only when the directory is actually created.
if not os.path.exists('raw_data'):
    os.makedirs('raw_data')
    print('created output dir')
    

created output dir


In [37]:
# Write the per-event details to raw_data/, without the row index.
event_details.to_csv(f'raw_data/event_details_{TARGET_YEAR}.csv', index=False)

In [38]:
# Write the finisher results to raw_data/, without the row index.
event_results.to_csv(f'raw_data/event_results_{TARGET_YEAR}.csv', index=False)

In [39]:
# Write the full event listing (all event types) to raw_data/, without the row index.
events.to_csv(f'raw_data/events_{TARGET_YEAR}.csv', index=False)