In [1]:
from pathlib import Path
from time import sleep

import pandas as pd
from tqdm import tqdm
#from itertools import compress

In [2]:
# year to scrape, as a four-character digit string (the format check_YYYY enforces)
TARGET_YEAR = '2020'

In [3]:
def check_YYYY(YYYY):
    """Validate that ``YYYY`` is a four-digit year given as a string.

    Parameters
    ----------
    YYYY : str
        Year to validate, e.g. ``'2020'``.

    Returns
    -------
    bool
        Always ``True``; invalid input raises instead of returning ``False``.

    Raises
    ------
    ValueError
        If ``YYYY`` is not a ``str`` made of exactly four ASCII digits.
        ``ValueError`` is a subclass of ``Exception``, so callers that catch
        ``Exception`` keep working.
    """
    # isascii() is needed because str.isdigit() on its own also accepts
    # characters such as superscript or non-Latin digits, which get_events
    # would interpolate verbatim into the request URL.
    is_good = (
        isinstance(YYYY, str)
        and len(YYYY) == 4
        and YYYY.isascii()
        and YYYY.isdigit()
    )
    if not is_good:
        raise ValueError(
            f"Bad args: expected a 4-digit year string like '2020', got {YYYY!r}"
        )
    return is_good

In [4]:
def clean_names(old_cols):
    """Return cleaned copies of the given column names.

    Each name is lower-cased and every '#' character is removed. The input
    is not modified and the order of names is preserved.

    Parameters
    ----------
    old_cols : iterable of str
        Original column names.

    Returns
    -------
    list of str
        The cleaned names.
    """
    cleaned = []
    for name in old_cols:
        lowered = name.lower()
        cleaned.append(lowered.replace('#', ''))
    return cleaned

In [5]:
def get_events(YYYY):
    """Download the RUSA event list for one year as a DataFrame.

    The results-search page is parsed twice with ``pandas.read_html``: once
    as plain text and once with ``extract_links='all'`` so the hrefs behind
    the 'Route' and 'select' cells can be recovered.

    Parameters
    ----------
    YYYY : str
        Four-digit year; validated with ``check_YYYY``.

    Returns
    -------
    pandas.DataFrame
        The first table on the page with lower-cased column names, its first
        row and the 'select' column removed, plus four added columns:
        'rt_url' / 'event_url' (raw hrefs) and 'rtid' / 'eid' (the numeric ids
        extracted from those hrefs, as strings).

    Raises
    ------
    Exception
        Whatever ``check_YYYY`` raises for a malformed year.
    KeyError
        If the link-bearing table has no 'Route' or 'select' column.
    ValueError
        If the two parses disagree on the number of rows, or if rows remain
        but no event id could be extracted from any of them.
    """
    check_YYYY(YYYY)
    url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date={YYYY}&type=&dist=&rtid=&esortby=cert&collapse=0"
    print(url)

    # get versions with and without links
    # NOTE: this requests the page twice, once per read_html call.
    dat_links = pd.read_html(url, extract_links='all')[0]
    dat = pd.read_html(url)[0]

    # The link columns are attached to `dat` by row position, so both parses
    # must describe the same rows.
    if len(dat) != len(dat_links):
        raise ValueError(
            f"Event table changed between requests: {len(dat)} rows without "
            f"links vs {len(dat_links)} rows with links"
        )

    # The link-bearing parse is keyed by (text, href) tuples, hence the
    # ('Route', None) / ('select', None) column labels.
    try:
        route_cells = dat_links[('Route', None)]
        select_cells = dat_links[('select', None)]
    except KeyError as err:
        raise KeyError(
            "Expected 'Route' and 'select' columns in the event table; "
            f"found columns {list(dat_links.columns)}"
        ) from err

    # break apart tuples to get urls. we'll have columns called 0 and 1
    route_ids = pd.DataFrame(route_cells.tolist(), index=dat_links.index)
    event_ids = pd.DataFrame(select_cells.tolist(), index=dat_links.index)

    # extract ids from link urls in column 1
    # Match only the digits of the query parameter: the id must not swallow
    # any parameters that follow it, and a trailing '&' is not required.
    dat['rt_url'] = route_ids[1]
    dat['rtid'] = route_ids[1].str.extract(r'[?&]rtid=(\d+)', expand=False)
    dat['event_url'] = event_ids[1]
    dat['eid'] = event_ids[1].str.extract(r'[?&]eid=(\d+)', expand=False)

    # clean col names
    dat.columns = [c.lower() for c in dat.columns]

    # drop first row and the 'select' column
    dat = dat.drop(index=0).drop(columns='select')

    # If rows remain but none yielded an event id, the link format has
    # probably changed; fail here rather than let callers request "eid=nan".
    if len(dat) > 0 and dat['eid'].isna().all():
        raise ValueError(
            "Could not extract any event id from the 'select' links; "
            "the URL format may have changed"
        )

    return dat

In [6]:
# download the event list for the target year, then preview the first rows
events = get_events(TARGET_YEAR)
events.head()

https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date=2020&type=&dist=&rtid=&esortby=cert&collapse=0


Unnamed: 0,region,club,type,distance,date,route,rt_url,rtid,event_url,eid
1,MD: Capital Region,DC Randonneurs / 946012,RUSAB,207,2020/01/01,Woodbine-Dillsburg,/cgi-bin/routesearch_PF.pl?rtid=625,625,/cgi-bin/resultsearch_PF.pl?eid=11579&esortby=...,11579
2,WA: Seattle,Seattle International Randonneurs / 947018,RUSAP,100,2020/01/01,Shortest Ride on the shortest day,/cgi-bin/routesearch_PF.pl?rtid=1679,1679,/cgi-bin/resultsearch_PF.pl?eid=11521&esortby=...,11521
3,OR: Portland,Oregon Randonneurs / 937020,ACPB,200,2020/01/01,Wine Country 200,/cgi-bin/routesearch_PF.pl?rtid=54,54,/cgi-bin/resultsearch_PF.pl?eid=11347&esortby=...,11347
4,TX: Dallas,Lone Star Randonneurs / 943026,ACPB,200,2020/01/01,Rip-Snortin' Rando Ride,/cgi-bin/routesearch_PF.pl?rtid=1495,1495,/cgi-bin/resultsearch_PF.pl?eid=10874&esortby=...,10874
5,CA: Davis,Davis Bike Club / 905014,RUSAP,103,2020/01/01,Moskowite Corner 100Km,/cgi-bin/routesearch_PF.pl?rtid=1322,1322,/cgi-bin/resultsearch_PF.pl?eid=11275&esortby=...,11275


In [7]:
def get_event_details_results(eid):
    """Fetch the detail table and the finisher-result table for one event.

    Parameters
    ----------
    eid : str or int
        Event id; interpolated into the results-search URL.

    Returns
    -------
    list of pandas.DataFrame
        ``[event_details, event_results]``. Both frames get cleaned column
        names (see ``clean_names``) and an added 'eid' column.
        ``event_results`` is an empty DataFrame (no columns) when the page
        contains only one table.
    """
    event_url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?eid={eid}&esortby=cert"
    tables = pd.read_html(event_url)

    event_details = tables[0]
    event_details.columns = clean_names(list(event_details.columns))
    event_details['eid'] = eid

    if len(tables) < 2:
        # only one table came back: there are no finisher results to return
        event_results = pd.DataFrame()
    else:
        event_results = tables[1]
        event_results.columns = clean_names(list(event_results.columns))
        # drop the rows 'x nonmembers also finished'
        # Skipped when there is no 'cert' column. astype(str) keeps the mask
        # strictly boolean even when 'cert' holds NaN or non-string values,
        # so matching rows are always removed instead of being left in after
        # a swallowed error.
        if 'cert' in event_results.columns:
            is_summary_row = (
                event_results['cert']
                .astype(str)
                .str.contains('also finished', regex=False)
            )
            event_results = event_results.loc[~is_summary_row].copy()
        event_results['eid'] = eid

    return [event_details, event_results]

In [8]:
# number of events per event type
events.groupby('type').size()

type
ACPB     164
RUSAB    150
RUSAP    192
dtype: int64

In [9]:
# event-type codes treated as non-team events; type definitions are listed at
# https://rusa.org/pages/eventtypes
non_team_types = ['ACPB', 'RM', 'RUSAB', 'RUSAP']

In [10]:
# ids of the events whose type is one of the non-team types; these are the
# events whose results/details get downloaded
event_ids = events.loc[events['type'].isin(non_team_types), 'eid'].to_list()
print(len(event_ids))

506


In [11]:
# accumulators for the scraping loop: one details frame and one results frame
# is appended per event
e_detail_list = []
e_results_list = []

In [12]:
# start from empty lists so that re-running this cell does not append a second
# copy of every event
e_detail_list = []
e_results_list = []

# Throttle: pause for one second after every sixth request. The pause is keyed
# on the number of calls made, not on the event id, so it is regular and does
# not require ids to be integers.
for n_calls, e in enumerate(tqdm(event_ids), start=1):
    details, results = get_event_details_results(e)
    e_detail_list.append(details)
    e_results_list.append(results)
    if n_calls % 6 == 0:
        sleep(1)

100%|█████████████████████████████████████████| 506/506 [04:41<00:00,  1.79it/s]


In [13]:
# sanity check: the loop appends one details frame and one results frame per
# event id, so these three counts should all be the same
print(len(e_detail_list), len(e_results_list), len(event_ids))

506 506 506


In [14]:
# stack the per-event finisher tables into one frame (each table's own row
# index is kept at this point)
event_results = pd.concat(e_results_list)

In [15]:
# stack the per-event detail tables into one frame (each table's own row
# index is kept at this point)
event_details = pd.concat(e_detail_list)

In [None]:
# For when I forget to filter out team events before fetching results:
# their result format is different and breaks things, and I don't need those results right now.
# erl2 = list(compress(e_results_list, 
#                      events.type.isin(non_team_types)))

In [16]:
# replace the repeated per-event row labels left over from concat with a fresh
# 0..n-1 index; the old labels are discarded rather than kept as a column
event_details = event_details.reset_index(drop=True)
event_results = event_results.reset_index(drop=True)

In [17]:
# preview the first rows of the combined finisher results
event_results.head()

Unnamed: 0,cert,rusa,name,club / acp code,time,medal,eid
0,RUSA-B15026,3319,"WINKERT, George",Northern Virginia Randonneurs / 946020,13:24,,11579
1,RUSA-B15027,10659,"SCHOENFELDER, Steven J",Pennsylvania Randonneurs / 938017,12:08,,11579
2,RUSA-B15028,7203,"NICHOLSON, Jack",Severna Park Peloton / 920025,12:08,,11579
3,RUSA-B15029,3446,"BECK, William A",DC Randonneurs / 946012,12:08,,11579
4,RUSA-B15030,10323,"BEATY, Robert M",DC Randonneurs / 946012,12:08,,11579


In [18]:
# preview the first rows of the combined event details
event_details.head()

Unnamed: 0,region,club,type,distance,date,finishers,dnf,eid
0,MD: Capital Region,DC Randonneurs / 946012,RUSAB,207,2020/01/01,5,1.0,11579
1,WA: Seattle,Seattle International Randonneurs / 947018,RUSAP,100,2020/01/01,14,2.0,11521
2,OR: Portland,Oregon Randonneurs / 937020,ACPB,200,2020/01/01,3,0.0,11347
3,TX: Dallas,Lone Star Randonneurs / 943026,ACPB,200,2020/01/01,10,0.0,10874
4,CA: Davis,Davis Bike Club / 905014,RUSAP,103,2020/01/01,1,0.0,11275


In [19]:
# to_csv does not create missing folders, so make sure the output directory
# exists before the first file is written
Path('data').mkdir(parents=True, exist_ok=True)
# write the combined event details, without the row index
event_details.to_csv(f'data/event_details_{TARGET_YEAR}.csv', index=False)

In [20]:
# write the combined finisher results, without the row index
event_results.to_csv(f'data/event_results_{TARGET_YEAR}.csv', index=False)

In [21]:
# write the event list for the target year, without the row index
events.to_csv(f'data/events_{TARGET_YEAR}.csv', index=False)