In [1]:
import os
from time import sleep

import pandas as pd
from tqdm import tqdm
#from itertools import compress

In [2]:
# Year to scrape, as a four-digit string: passed to get_events() and embedded
# in the names of the CSV files written at the end of the notebook.
TARGET_YEAR = '2009'

In [3]:
def check_YYYY(YYYY):
    """Validate that ``YYYY`` is a four-digit year string such as ``'2009'``.

    Parameters
    ----------
    YYYY : str
        Candidate year; must be exactly four ASCII digits.

    Returns
    -------
    bool
        Always ``True``; invalid input raises instead of returning ``False``.

    Raises
    ------
    ValueError
        If ``YYYY`` is not a ``str`` made of exactly four ASCII digits.
    """
    # str.isdigit() on its own also accepts non-ASCII digits (superscripts,
    # Arabic-Indic digits, ...); isascii() rules those out so that only 0-9
    # can pass validation.
    is_good = (
        isinstance(YYYY, str)
        and len(YYYY) == 4
        and YYYY.isascii()
        and YYYY.isdigit()
    )
    if not is_good:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError(
            f"Bad args: expected a 4-digit year string like '2009', got {YYYY!r}"
        )
    return is_good

In [4]:
def clean_names(old_cols):
    """Return the given column labels lower-cased and with every '#' removed.

    ``old_cols`` is any iterable of strings; the result is a new list in the
    same order.
    """
    new_cols = []
    for label in old_cols:
        new_cols.append(label.lower().replace('#', ''))
    return new_cols

In [5]:
def get_events(YYYY):
    """Download the RUSA results-search table for one year and add link-derived ids.

    Parameters
    ----------
    YYYY : str
        Four-digit year string (validated with ``check_YYYY``).

    Returns
    -------
    pandas.DataFrame
        The first table on the search page with lower-cased column names, the
        row labelled 0 and the 'select' column removed, plus four added
        columns: 'rt_url' / 'event_url' (hrefs of the Route / select cells)
        and 'rtid' / 'eid' (the id query parameter taken from those hrefs, as
        strings; NaN where the href is missing or has no such parameter).

    Raises
    ------
    Exception
        Propagated from ``check_YYYY`` when ``YYYY`` is not a valid year string.
    KeyError
        If the table does not have the expected 'Route' and 'select' columns.
    """
    check_YYYY(YYYY)
    url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date={YYYY}&type=&dist=&rtid=&esortby=cert&collapse=0"
    print(url)

    # get versions with and without links
    # NOTE: this downloads the page twice. The two tables are matched up by row
    # label below, so both requests are assumed to return the same rows in the
    # same order.
    dat_links = pd.read_html(url, extract_links='all')[0]
    dat = pd.read_html(url)[0]

    # With extract_links='all' every cell, header cells included, comes back as
    # a (text, href) tuple - hence the tuple column labels. Fail with a readable
    # message if the page layout changed and these columns are gone.
    try:
        route_cells = dat_links[('Route', None)]
        select_cells = dat_links[('select', None)]
    except KeyError as err:
        raise KeyError(
            f"results table is missing expected column {err}; "
            f"columns found: {list(dat_links.columns)}"
        ) from err

    # break apart tuples to get urls. we'll have columns called 0 (text) and 1 (href)
    route_ids = pd.DataFrame(route_cells.tolist(), index=dat_links.index)
    event_ids = pd.DataFrame(select_cells.tolist(), index=dat_links.index)

    # extract ids from link urls in column 1
    # Each pattern captures exactly one query-parameter value (up to the next
    # '&' or '#'), so the id stays correct if parameters are added, removed or
    # reordered in the link. expand=False makes extract return a Series.
    dat['rt_url'] = route_ids[1]
    dat['rtid'] = route_ids[1].str.extract(r'[?&]rtid=([^&#]*)', expand=False)
    dat['event_url'] = event_ids[1]
    dat['eid'] = event_ids[1].str.extract(r'[?&]eid=([^&#]*)', expand=False)

    # clean col names
    dat.columns = [c.lower() for c in dat.columns]

    # drop first row and the 'select' column
    # (row 0 is presumably not a real event row - TODO confirm against the page)
    dat = dat.drop(0, axis=0)
    dat = dat.drop('select', axis=1)

    return dat

In [6]:
# Fetch the event listing for TARGET_YEAR (get_events prints the request URL)
# and preview the first rows.
events = get_events(TARGET_YEAR)
events.head()

https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date=2009&type=&dist=&rtid=&esortby=cert&collapse=0


Unnamed: 0,region,club,type,distance,date,route,rt_url,rtid,event_url,eid
1,TX: Dallas,Lone Star Randonneurs / 943026,ACPB,200,2009/01/01,This Ain't no Club Ride 204,/cgi-bin/routesearch_PF.pl?rtid=398,398,/cgi-bin/resultsearch_PF.pl?eid=2213&esortby=cert,2213
2,GA: Atlanta,Audax Atlanta / 910004,RUSAB,200,2009/01/01,Augusta Georgia 200k,/cgi-bin/routesearch_PF.pl?rtid=533,533,/cgi-bin/resultsearch_PF.pl?eid=2089&esortby=cert,2089
3,NC: High Point,Bicycle For Life Club / 933057,ACPB,200,2009/01/03,route 693 (unnamed),/cgi-bin/routesearch_PF.pl?rtid=693,693,/cgi-bin/resultsearch_PF.pl?eid=2196&esortby=cert,2196
4,CA: San Diego,San Diego Randonneurs / 905140,ACPB,200,2009/01/03,Rainbow 200K,/cgi-bin/routesearch_PF.pl?rtid=253,253,/cgi-bin/resultsearch_PF.pl?eid=1990&esortby=cert,1990
5,TX: Houston,Houston Randonneurs / 943030,ACPB,200,2009/01/03,route 289 (unnamed),/cgi-bin/routesearch_PF.pl?rtid=289,289,/cgi-bin/resultsearch_PF.pl?eid=2005&esortby=cert,2005


In [7]:
def get_event_details_results(eid):
    """Fetch the details table and the finisher-results table for one event.

    Parameters
    ----------
    eid : str or int
        Event id; interpolated into the results-search URL and stored in an
        'eid' column on both returned frames.

    Returns
    -------
    list
        ``[event_details, event_results]``. ``event_details`` is the first
        table on the page. ``event_results`` is the second table with the
        'x nonmembers also finished' rows removed, or an empty DataFrame (no
        columns at all) when the page has only one table. Column names of both
        are passed through ``clean_names``.
    """
    event_url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?eid={eid}&esortby=cert"
    two_dats = pd.read_html(event_url)

    event_details = two_dats[0]
    event_details.columns = clean_names(list(event_details.columns))
    event_details['eid'] = eid

    if len(two_dats) < 2:
        # only one table came back: no finisher results for this event
        event_results = pd.DataFrame()
    else:
        event_results = two_dats[1]
        event_results.columns = clean_names(list(event_results.columns))
        # drop the rows 'x nonmembers also finished'
        if 'cert' in event_results.columns:
            # astype(str) lets the text match work when 'cert' was parsed as
            # numbers (no such row present) or contains NaN. Those are the
            # cases that used to raise inside a bare `except: pass`, which
            # left the rows in place and hid every other error as well.
            is_summary_row = (
                event_results['cert']
                .astype(str)
                .str.contains('also finished', regex=False)
            )
            event_results = event_results.drop(
                index=event_results.index[is_summary_row.to_numpy()]
            )
        event_results['eid'] = eid

    return [event_details, event_results]

In [8]:
# Number of listed events per event type
events.groupby('type').size()

type
ACPB     227
ACPF      12
RM         4
RUSAB    145
RUSAF      5
RUSAP     35
dtype: int64

In [9]:
# Event types to keep, i.e. the non-team ones; the type codes are described at
# https://rusa.org/pages/eventtypes
non_team_types = ['ACPB', 'RM', 'RUSAB', 'RUSAP']

In [10]:
# Event ids of the non-team event types only; these are the events whose
# results/details get fetched.
event_ids = events.loc[events['type'].isin(non_team_types), 'eid'].tolist()
print(len(event_ids))

411


In [11]:
# Accumulators for the download loop: one details frame and one results frame
# get appended per event.
e_detail_list, e_results_list = [], []

In [12]:
# (Re)start from empty accumulators so that re-running this cell cannot append
# a second copy of every event to lists left over from a previous run.
e_detail_list = []
e_results_list = []

# enumerate wraps tqdm (not the other way round) so the progress bar still
# knows the total number of events.
for n_done, eid in enumerate(tqdm(event_ids), start=1):
    details, results = get_event_details_results(eid)
    e_detail_list.append(details)
    e_results_list.append(results)
    # Pause briefly after every 10th request. Counting requests, rather than
    # testing whether the id itself is a multiple of 10, gives an even pause
    # rate even though the ids are not consecutive, and does not need the id
    # to be convertible to int.
    if n_done % 10 == 0:
        sleep(.5)

100%|█████████████████████████████████████████| 411/411 [03:30<00:00,  1.95it/s]


In [13]:
# should all be the same: one details frame and one results frame per requested event
print(len(e_detail_list), len(e_results_list), len(event_ids))
# Enforce the expectation instead of relying on a visual check; a mismatch
# usually means the download cell was run more than once or was interrupted.
assert len(e_detail_list) == len(e_results_list) == len(event_ids), (
    "number of downloaded detail/result frames does not match the number of event ids"
)

411 411 411


In [14]:
# Stack the per-event finisher tables into one frame; each piece keeps its own
# row labels at this point (the index is reset in a later cell).
event_results = pd.concat(e_results_list)

In [15]:
# Stack the per-event details tables into one frame; each piece keeps its own
# row labels at this point (the index is reset in a later cell).
event_details = pd.concat(e_detail_list)

In [16]:
# Leftover from a run where I forgot to filter out team events before fetching results:
# their result format is different and messes things up, and I don't need those results right now.
# erl2 = list(compress(e_results_list, 
#                      events.type.isin(non_team_types)))

In [17]:
# Replace the repeated per-event row labels left by concat with a fresh
# 0..n-1 index, discarding the old labels.
event_details = event_details.reset_index(drop=True)
event_results = event_results.reset_index(drop=True)

In [18]:
# Preview the first rows of the combined finisher results
event_results.head()

Unnamed: 0,cert,rusa,name,club / acp code,time,medal,eid
0,277877,4561,"APPLEWHAITE, John O",Hill Country Randonneurs / 943025,10:40,,2213
1,277878,2987,"ARMSTRONG, Shanna",Lone Star Randonneurs / 943026,08:10,,2213
2,277879,2362,"BARNELL, Brenda",Lone Star Randonneurs / 943026,08:31,,2213
3,277880,736,"BREAUD, Debbie",Lone Star Randonneurs / 943026,08:54,,2213
4,277881,390,"DRISCOLL, Dan",Lone Star Randonneurs / 943026,08:10,,2213


In [19]:
# Preview the first rows of the combined event details
event_details.head()

Unnamed: 0,region,club,type,distance,date,finishers,dnf,eid
0,TX: Dallas,Lone Star Randonneurs / 943026,ACPB,200,2009/01/01,33,3,2213
1,GA: Atlanta,Audax Atlanta / 910004,RUSAB,200,2009/01/01,18,2,2089
2,NC: High Point,Bicycle For Life Club / 933057,ACPB,200,2009/01/03,22,1,2196
3,CA: San Diego,San Diego Randonneurs / 905140,ACPB,200,2009/01/03,42,5,1990
4,TX: Houston,Houston Randonneurs / 943030,ACPB,200,2009/01/03,23,2,2005


In [20]:
# Create the output folder first: to_csv does not create missing directories,
# and a missing raw_data/ would otherwise only surface here, after the whole
# download has already run.
os.makedirs('raw_data', exist_ok=True)
event_details.to_csv(f'raw_data/event_details_{TARGET_YEAR}.csv', index=False)

In [21]:
# Write the combined finisher results for TARGET_YEAR, without the index column
event_results.to_csv(f'raw_data/event_results_{TARGET_YEAR}.csv', index=False)

In [22]:
# Write the event listing for TARGET_YEAR (all event types), without the index column
events.to_csv(f'raw_data/events_{TARGET_YEAR}.csv', index=False)