In [1]:
import pandas as pd
from time import sleep
from tqdm import tqdm
#from itertools import compress

In [2]:
# Year of events to scrape, as a 4-digit string; it is passed to get_events(),
# which validates it with check_YYYY(), and is used in the output CSV names.
TARGET_YEAR = '2019'

In [3]:
def check_YYYY(YYYY):
    """Validate that ``YYYY`` is a four-digit year string such as ``'2019'``.

    Args:
        YYYY: Value to validate.

    Returns:
        bool: ``True`` when the value is valid. The function never returns
        ``False``; invalid input raises instead.

    Raises:
        ValueError: If ``YYYY`` is not a ``str`` made of exactly four ASCII
            digits. ``ValueError`` is a subclass of ``Exception``, so code
            catching ``Exception`` still handles it.
    """
    is_good = (
        isinstance(YYYY, str)
        and len(YYYY) == 4
        # str.isdigit() alone also accepts characters such as superscript
        # digits, which must not end up in a request URL.
        and YYYY.isascii()
        and YYYY.isdigit()
    )
    if not is_good:
        raise ValueError(
            f"Bad args: expected a 4-digit year string like '2019', got {YYYY!r}"
        )
    return is_good

In [4]:
def clean_names(old_cols):
    """Return lower-cased copies of the given column names with '#' removed.

    Args:
        old_cols: Iterable of column-name strings.

    Returns:
        list: One cleaned name per input name, in the same order.
    """
    cleaned = []
    for name in old_cols:
        lowered = name.lower()
        cleaned.append(lowered.replace('#', ''))
    return cleaned

In [5]:
def get_events(YYYY):
    """Scrape the RUSA result-search page listing the events of one year.

    Args:
        YYYY: Four-digit year string (validated with ``check_YYYY``).

    Returns:
        pandas.DataFrame: The first HTML table on the page with lower-cased
        column names, the ``select`` column and the row labelled 0 removed,
        and four added columns: ``rt_url``/``rtid`` (route link and the value
        of its ``rtid`` query parameter) and ``event_url``/``eid`` (event link
        and the value of its ``eid`` query parameter).

    Raises:
        Exception: Propagated from ``check_YYYY`` when ``YYYY`` is invalid.
        KeyError: If the scraped table has no ``Route`` or ``select`` column.
    """
    check_YYYY(YYYY)
    url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date={YYYY}&type=&dist=&rtid=&esortby=cert&collapse=0"
    print(url)

    # get versions with and without links
    # NOTE: the page is downloaded once per read_html call, i.e. twice.
    dat_links = pd.read_html(url, extract_links='all')[0]
    dat = pd.read_html(url)[0]

    # With extract_links='all' the header labels and cells are (text, href)
    # tuples, so the link-bearing columns are keyed ('Route', None) and
    # ('select', None). Fail with an explanatory message if the layout changed.
    try:
        route_cells = dat_links[('Route', None)]
        select_cells = dat_links[('select', None)]
    except KeyError as err:
        raise KeyError(
            f"events table is missing expected column {err}; "
            f"found columns {list(dat_links.columns)}"
        ) from err

    # break apart tuples to get urls. we'll have columns called 0 and 1
    route_ids = pd.DataFrame(route_cells.tolist(), index=dat_links.index)
    event_ids = pd.DataFrame(select_cells.tolist(), index=dat_links.index)

    # extract ids from link urls in column 1
    # The patterns capture only the value of the named query parameter, up to
    # the next '&' or '#', so extra or reordered parameters cannot leak into
    # the id. expand=False yields a Series rather than a one-column DataFrame.
        # TODO validate regex, in case format changes
    dat['rt_url'] = route_ids[1]
    dat['rtid'] = route_ids[1].str.extract(r'[?&]rtid=([^&#]*)', expand=False)
    dat['event_url'] = event_ids[1]
    dat['eid'] = event_ids[1].str.extract(r'[?&]eid=([^&#]*)', expand=False)

    # clean col names
    dat.columns = [c.lower() for c in dat.columns]

    # drop first row and the 'select' column
    # (row 0 is presumably a non-data row of the scraped table - TODO confirm)
    dat = dat.drop(0, axis=0)
    dat = dat.drop('select', axis=1)

    return dat

In [6]:
# Scrape the event listing for TARGET_YEAR (performs network requests) and
# preview the first rows.
events = get_events(TARGET_YEAR)
events.head()

https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date=2019&type=&dist=&rtid=&esortby=cert&collapse=0


Unnamed: 0,region,club,type,distance,date,route,rt_url,rtid,event_url,eid
1,TX: Dallas,Lone Star Randonneurs / 943026,RUSAP,100,2019/01/01,The Doctor's Daughter 100,/cgi-bin/routesearch_PF.pl?rtid=1491,1491,/cgi-bin/resultsearch_PF.pl?eid=10077&esortby=...,10077
2,CA: Los Angeles,Channel Islands Bike Club / 905082,ACPB,200,2019/01/01,Orange County Pendleton 200K,/cgi-bin/routesearch_PF.pl?rtid=995,995,/cgi-bin/resultsearch_PF.pl?eid=10423&esortby=...,10423
3,CA: Davis,Davis Bike Club / 905014,ACPB,200,2019/01/01,Davis-Calistoga-Davis 200Km,/cgi-bin/routesearch_PF.pl?rtid=1163,1163,/cgi-bin/resultsearch_PF.pl?eid=9450&esortby=cert,9450
4,CA: Davis,Davis Bike Club / 905014,RUSAP,103,2019/01/01,Moskowite Corner 100Km,/cgi-bin/routesearch_PF.pl?rtid=1322,1322,/cgi-bin/resultsearch_PF.pl?eid=9451&esortby=cert,9451
5,VA: Tidewater,Tidewater Randonneurs / 946002,RUSAP,100,2019/01/01,Suffolk 100K,/cgi-bin/routesearch_PF.pl?rtid=2034,2034,/cgi-bin/resultsearch_PF.pl?eid=10050&esortby=...,10050


In [7]:
def get_event_details_results(eid):
    """Scrape the details table and the finisher-results table of one event.

    Args:
        eid: Event id, interpolated into the result-search URL.

    Returns:
        list: ``[event_details, event_results]``, both pandas DataFrames with
        names cleaned by ``clean_names`` and an added ``eid`` column.
        ``event_results`` is an empty DataFrame (without an ``eid`` column)
        when the page contains fewer than two tables.
    """
    event_url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?eid={eid}&esortby=cert"
    two_dats = pd.read_html(event_url)

    event_details = two_dats[0]
    event_details.columns = clean_names(list(event_details.columns))
    event_details['eid'] = eid

    if len(two_dats) < 2:
        # Only one table came back: no finisher results for this event.
        event_results = pd.DataFrame()
    else:
        event_results = two_dats[1]
        event_results.columns = clean_names(list(event_results.columns))
        # drop the rows 'x nonmembers also finished'
        # Best effort, without a blanket except: skip the filter when there is
        # no 'cert' column, and cast to str so NaN or non-string cells cannot
        # make the match fail (they simply never match).
        if 'cert' in event_results.columns:
            is_summary = (
                event_results['cert']
                .astype(str)
                .str.contains('also finished', regex=False)
            )
            event_results = event_results.drop(event_results[is_summary].index)
        event_results['eid'] = eid

    return [event_details, event_results]

In [8]:
# Count the number of events of each type.
events.groupby('type').size()

type
ACPB     611
ACPF      15
ACPR       3
RM         3
RUSAB    137
RUSAF      9
RUSAP    173
dtype: int64

In [9]:
# Event types treated as non-team events; only events of these types are
# selected for scraping below.
# Type codes are described at https://rusa.org/pages/eventtypes
non_team_types = ['ACPB', 'RM', 'RUSAB', 'RUSAP']

In [10]:
# Ids of the events whose type is one of the non-team types; these are the
# events for which results/details get fetched.
is_non_team = events['type'].isin(non_team_types)
event_ids = events.loc[is_non_team, 'eid'].tolist()
print(len(event_ids))

924


In [11]:
# Accumulators for the per-event details and results DataFrames.
e_detail_list, e_results_list = [], []

In [12]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every event to the accumulators.
e_detail_list = []
e_results_list = []

# Fetch the details and finisher results of every selected event.
for request_no, e in enumerate(tqdm(event_ids), start=1):
    details, results = get_event_details_results(e)
    e_detail_list.append(details)
    e_results_list.append(results)
    # Throttle: pause one second after every 6th request. Counting requests
    # (rather than testing the id value) spaces the pauses evenly and does not
    # require the id to be convertible to int.
    if request_no % 6 == 0:
        sleep(1)

100%|█████████████████████████████████████████| 924/924 [08:38<00:00,  1.78it/s]


In [13]:
# should all be the same
print(len(e_detail_list), len(e_results_list), len(event_ids))
# Enforce the invariant instead of relying on a visual check; a mismatch means
# the scrape loop was run partially or more than once.
assert len(e_detail_list) == len(e_results_list) == len(event_ids), \
    "detail/result lists are out of sync with event_ids"

924 924 924


In [14]:
# Stack the per-event finisher tables into one DataFrame (each table keeps its
# own row labels here; the index is reset in a later cell).
event_results = pd.concat(e_results_list)

In [15]:
# Stack the per-event detail tables into one DataFrame (each table keeps its
# own row labels here; the index is reset in a later cell).
event_details = pd.concat(e_detail_list)

In [124]:
# erl2 = list(compress(e_results_list, 
#                      events.type.isin(non_team_types)))

In [16]:
# Renumber the rows of both combined frames from 0, discarding the row labels
# inherited from the individual per-event tables.
event_details = event_details.reset_index(drop=True)
event_results = event_results.reset_index(drop=True)

In [17]:
# Preview the first rows of the combined finisher results.
event_results.head()

Unnamed: 0,cert,rusa,name,club / acp code,time,medal,eid
0,RUSA-P15298,413,"MYERS, Mike",Lone Star Randonneurs / 943026,05:40,,10077
1,RUSA-P15299,414,"MYERS, Nancy",Lone Star Randonneurs / 943026,05:40,,10077
2,RUSA-P15300,8958,"HARE, Karen",Lone Star Randonneurs / 943026,05:00,,10077
3,RUSA-P15301,11103,"HALL, David D",Lone Star Randonneurs / 943026,05:15,,10077
4,RUSA-P15302,4495,"TYER, Vickie",Lone Star Randonneurs / 943026,05:15,,10077


In [18]:
# Preview the first rows of the combined event details.
event_details.head()

Unnamed: 0,region,club,type,distance,date,finishers,dnf,eid
0,TX: Dallas,Lone Star Randonneurs / 943026,RUSAP,100,2019/01/01,5,0.0,10077
1,CA: Los Angeles,Channel Islands Bike Club / 905082,ACPB,200,2019/01/01,8,2.0,10423
2,CA: Davis,Davis Bike Club / 905014,ACPB,200,2019/01/01,24,1.0,9450
3,CA: Davis,Davis Bike Club / 905014,RUSAP,103,2019/01/01,6,0.0,9451
4,VA: Tidewater,Tidewater Randonneurs / 946002,RUSAP,100,2019/01/01,3,0.0,10050


In [19]:
# Write the combined event details to event_details_<TARGET_YEAR>.csv in the
# working directory, without the row index.
event_details.to_csv(f'event_details_{TARGET_YEAR}.csv', index=False)

In [20]:
# Write the combined finisher results to event_results_<TARGET_YEAR>.csv in
# the working directory, without the row index.
event_results.to_csv(f'event_results_{TARGET_YEAR}.csv', index=False)

In [153]:
#events.reset_index(inplace=True, drop=True)

In [21]:
# Write the full event listing (all event types) to events_<TARGET_YEAR>.csv
# in the working directory, without the row index.
events.to_csv(f'events_{TARGET_YEAR}.csv', index=False)