In [4]:
import pandas as pd
from time import sleep
from tqdm import tqdm

In [5]:
TARGET_YEAR = '2023'

In [6]:
def check_YYYY(YYYY):
    is_good = isinstance(YYYY, str) and len(YYYY) == 4 and YYYY.isdigit()
    if(not(is_good)):
       raise Exception("Bad args")
    return(is_good)

In [7]:
def clean_names(old_cols):
    new_cols = [c.lower().replace('#', '') for c in old_cols]
    return(new_cols)

In [8]:
def get_events(YYYY):
    check_YYYY(YYYY)
    url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date={YYYY}&type=&dist=&rtid=&esortby=cert&collapse=0"
    print(url)
    
    # get versions with and without links
    dat_links = pd.read_html(url, extract_links='all')[0]
    dat = pd.read_html(url)[0]
    
    # break apart tuples to get urls. we'll have columns called 0 and 1
        # TODO: validate it contains Route and select columns
    route_ids = pd.DataFrame(dat_links[('Route', None)].tolist(), 
                             index=dat_links.index)
    event_ids = pd.DataFrame(dat_links[('select', None)].tolist(), 
                             index=dat_links.index)

    # extract ids from link urls in column 1
        # TODO validate regex, in case format changes
    dat['rt_url'] = route_ids[1]
    dat['rtid'] = route_ids[1].str.extract('^.*rtid=(.*)')
    dat['event_url'] = event_ids[1]
    dat['eid'] = event_ids[1].str.extract('^.*eid=(.*)&')
    
    # clean col names
    new_cols = [c.lower() for c in list(dat.columns)]
    dat.columns = new_cols

    # drop first row and the 'select' column
    dat = dat.drop(0, axis=0)
    dat = dat.drop('select', axis=1)

    return(dat)

In [12]:
events = get_events(TARGET_YEAR)
events.head()

https://rusa.org/cgi-bin/resultsearch_PF.pl?regid=&date=2023&type=&dist=&rtid=&esortby=cert&collapse=0


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x93 in position 41968: invalid start byte

In [10]:
events.tail()

NameError: name 'events' is not defined

In [8]:
def get_event_details_results(eid):
    
    event_url = f"https://rusa.org/cgi-bin/resultsearch_PF.pl?eid={eid}&esortby=cert"
    #print(event_url)
    two_dats = pd.read_html(event_url)
    
    event_details = two_dats[0]
    event_details.columns = clean_names(list(event_details.columns))
    event_details['eid'] = eid

    # warn if we only get one table back
    if(len(two_dats)<2):
        #print(f'Warning: no finisher results for event {eid}!')
        event_results = pd.DataFrame()
    else:
        event_results = two_dats[1]
        event_results.columns = clean_names(list(event_results.columns))
        # drop the rows 'x nonmembers also finished'
        try:
            event_results = event_results.drop(
                event_results[event_results.cert.str.contains('also finished')].index
            )
        except:
            pass
            #print(f"eid {eid} - couldn't drop 'non member' result rows")
        event_results['eid'] = eid
    
    return([event_details,event_results])

In [9]:
events.groupby('type').size()

type
ACPB     553
ACPF      12
ACPR       2
RM         8
RUSAB    111
RUSAF     23
RUSAP    208
UAFB      11
dtype: int64

In [10]:
# https://rusa.org/pages/eventtypes
non_team_types = ['ACPB', 'RM', 'RUSAB', 'RUSAP']

In [11]:
# list of event ids for the non team event types, for which we'll get results/details
event_ids = events[events.type.isin(non_team_types)].eid.to_list()
print(len(event_ids))

880


In [12]:
e_detail_list = []
e_results_list = []

In [13]:
for e in tqdm(event_ids):
    #print(e)
    this_d_e = get_event_details_results(e)
    e_detail_list.append(this_d_e[0])
    e_results_list.append(this_d_e[1])
    # sleep roughly every nth call (assuming event ids are consecutive ints)
    if(int(e) % 10 == 0):
        #print('Sleeping')
        sleep(.5)

100%|██████████████████████████████████████████████████████████████████████████████████████| 880/880 [06:42<00:00,  2.18it/s]


In [14]:
# should all be the same
print(len(e_detail_list), len(e_results_list), len(event_ids))

880 880 880


In [15]:
event_results = pd.concat(e_results_list)

In [16]:
event_details = pd.concat(e_detail_list)

In [16]:
# for when i forgot to filter team events before getting results; 
# the format is different and messes stuff up, plus i don't care about those results right now
# erl2 = list(compress(e_results_list, 
#                      events.type.isin(non_team_types)))

In [17]:
event_details.reset_index(inplace=True, drop=True)
event_results.reset_index(inplace=True, drop=True)

In [18]:
event_results.head()

Unnamed: 0,cert,rusa,name,club / acp code,time,medal,eid
0,RUSA-P18340,390,"DRISCOLL, Dan",Lone Star Randonneurs / 943026,05:15,,12810
1,RUSA-P18341,11575,"ANDREWS, Delwin",Lone Star Randonneurs / 943026,05:42,,12810
2,RUSA-P18342,3205,"WRIGHT, Pamela",Lone Star Randonneurs / 943026,05:15,,12810
3,RUSA-P18343,736,"BREAUD, Debbie",Lone Star Randonneurs / 943026,05:10,,12810
4,RUSA-P18344,1212,"FLICKNER, Brad",Lone Star Randonneurs / 943026,05:07,,12810


In [19]:
event_details.head()

Unnamed: 0,region,club,type,distance,date,finishers,dnf,eid
0,TX: Dallas,Lone Star Randonneurs / 943026,RUSAP,107,2022/01/01,11,0.0,12810
1,OR: Portland,Oregon Randonneurs / 937020,RUSAP,109,2022/01/01,8,0.0,13567
2,CA: Davis,Davis Bike Club / 905014,RUSAP,102,2022/01/01,0,2.0,13156
3,GA: Atlanta,Audax Atlanta / 910004,ACPB,200,2022/01/01,9,0.0,12795
4,CA: Davis,Davis Bike Club / 905014,ACPB,200,2022/01/01,6,0.0,13155


In [20]:
event_details.to_csv(f'raw_data/event_details_{TARGET_YEAR}.csv', index=False)

In [21]:
event_results.to_csv(f'raw_data/event_results_{TARGET_YEAR}.csv', index=False)

In [22]:
events.to_csv(f'raw_data/events_{TARGET_YEAR}.csv', index=False)