In [None]:
'''
# Scrape and tidy the RUSA results table for one route.
#
# Args:
#   rtid: route id, interpolated into the results-search URL.
#
# Returns:
#   The first table on the results page with the "nonmember(s) also finished"
#   note rows and the per-event divider rows removed, plus two added columns:
#   `event_date` (Date taken from the divider row above each rider) and
#   `time_hours` (numeric). `Time` is converted with hm(). The route id is
#   attached as the "rtid" attribute.
#
# glue(), read_html(), html_nodes(), html_table(), %>%, ymd() and hm() are
# called unqualified, so their packages must already be attached; stringr and
# zoo are reached through their namespaces.
get_results <- function(rtid) {

  # SCRAPE ----
  # build the results-search URL for a given route ("rtid")
  route_url <- function(rtid) {
    glue("https://rusa.org/cgi-bin/resultsearch_PF.pl?",
         "regid=&date=&type=&dist=&",
         "rtid={rtid}",
         "&esortby=cert&",
         "collapse=1", rtid = rtid)
  }
  route_results <- read_html(route_url(rtid))
  dat <- route_results %>%
    html_nodes("table") %>%
    html_table() %>%
    .[[1]]

  # CLEAN -----
  # remove the "x nonmember(s) also finished this event" rows
  bad_rows <- grepl("also finished this event$", dat[[1]])
  dat <- dat[!bad_rows, ]

  # find "event divider rows" -- the remaining ones where every column is the
  # same; isTRUE() turns the NA produced by a row with missing cells into
  # FALSE so the result is always usable as a logical index
  all_same <- function(x) isTRUE(all(x == x[1]))
  event_rows <- apply(dat, 1, all_same)

  # add event date as own field:
  # 1) pull the dates; dat[[1]] always yields a plain vector, whereas
  #    dat[event_rows, 1] stays a one-column table when dat is a tibble
  divider_text <- dat[[1]][event_rows]
  extracted_dates <- ymd(
    stringr::str_extract(divider_text, "[0-9]{4}/[0-9]{2}/[0-9]{2}")
  )

  # 2) add empty date column
  dat[["event_date"]] <- as.Date(NA)

  # 3) fill in values at each position where a new event starts
  dat[["event_date"]][event_rows] <- extracted_dates

  # 4) lastly, carry those dates forward over missing values; na.rm = FALSE
  #    keeps any leading NA so the result has the same length as the column
  dat[["event_date"]] <- zoo::na.locf(dat[["event_date"]], na.rm = FALSE)

  # now we can drop these "event divider" rows
  dat <- dat[!event_rows, ]

  # type conversion: as.numeric() on the hm() result is in seconds
  dat$Time <- hm(dat$Time)
  dat$time_hours <- as.numeric(dat$Time) / 60 / 60

  # append rtid as attribute
  attr(dat, "rtid") <- rtid

  dat
}
'''


In [1]:
def rusa_result_url(rtid):
    """Return the RUSA results-search URL for route ``rtid`` with ``collapse=1`` set."""
    return f"https://rusa.org/cgi-bin/resultsearch_PF.pl?rtid={rtid}&collapse=1"

In [3]:
import pandas as pd
import numpy as np

In [4]:
# route whose results page is scraped
rtid = 1158
# read_html returns a list of tables; keep just the first (the only one on this page)
dat = pd.read_html(rusa_result_url(rtid))[0]

In [5]:
# remove the 'x nonmember(s) also finished this event' rows
# na=False: a missing Cert# counts as "not a note row", keeping the mask strictly boolean so ~ works
bad_rows = dat['Cert#'].str.contains('also finished this event', na=False)
# copy so that later column assignments act on an independent frame rather than a slice of the raw table
dat = dat[~bad_rows].copy()
dat.shape

(1118, 5)

In [6]:
# 'event divider rows' are the remaining rows whose cells all hold the same value,
# i.e. the row collapses to a single distinct element
event_rows = dat.apply(lambda row: len(set(row)) == 1, axis="columns")

In [7]:
# pull the yyyy/mm/dd date out of each divider row's text; the result is a
# one-column DataFrame that keeps the divider rows' index labels
extracted_dates = (
    dat.loc[event_rows]
    .iloc[:, 1]
    .str.extract(r'([0-9]{4}/[0-9]{2}/[0-9]{2})')
)

In [8]:
# new, empty column: np.nan (np.NaN no longer exists in NumPy 2.0) held as object dtype,
# because the values written below are date strings rather than floats;
# assign() returns a fresh frame, so the write below never lands on a slice
dat = dat.assign(event_date=pd.Series(np.nan, index=dat.index, dtype=object))

# fill in values at each position where a new event starts (aligned on the row index)
dat.loc[event_rows, 'event_date'] = extracted_dates.iloc[:, 0]

In [9]:
# carry each event's date forward over the rider rows beneath its divider row;
# only event_date is filled, so values genuinely missing in other columns
# are left missing instead of being copied from the previous rider
dat = dat.assign(event_date=dat['event_date'].ffill())
dat

Unnamed: 0,Cert#,RUSA#,Name,Club,Time,event_date
0,CA: San Francisco RUSAB 200 km 2011/11/05 fini...,CA: San Francisco RUSAB 200 km 2011/11/05 fini...,CA: San Francisco RUSAB 200 km 2011/11/05 fini...,CA: San Francisco RUSAB 200 km 2011/11/05 fini...,CA: San Francisco RUSAB 200 km 2011/11/05 fini...,2011/11/05
1,RUSA-B07069,7033,"ALEXANDER, Cory E",Randonneurs USA / 950095,08:21,2011/11/05
2,RUSA-B07070,6056,"ALLEN, Heath",Maui Randonneurs / 911003,09:29,2011/11/05
3,RUSA-B07071,6566,"ALLEN, Scott Wayne",Santa Rosa Cycling Club / 905048,13:06,2011/11/05
4,RUSA-B07073,7184,"BARKER, Christopher D",San Francisco Randonneurs / 905030,11:09,2011/11/05
...,...,...,...,...,...,...
1122,826190,6373,"Wilson, Charles",San Francisco Randonneurs / 905030,07:36,2022/02/19
1123,826191,7268,"Wilson, James",San Francisco Randonneurs / 905030,11:16,2022/02/19
1124,826192,15118,"Wu, Jasmine",San Francisco Randonneurs / 905030,08:36,2022/02/19
1125,826193,14760,"Zabell, Howard",Cycle Folsom / 905169,09:43,2022/02/19


In [10]:
# now we can drop these 'event divider' rows; copy so the result is an
# independent frame that later cells can add columns to without touching a slice
dat = dat[~event_rows].copy()

In [11]:
# display the frame as it stands after the divider rows were dropped
dat

Unnamed: 0,Cert#,RUSA#,Name,Club,Time,event_date
1,RUSA-B07069,7033,"ALEXANDER, Cory E",Randonneurs USA / 950095,08:21,2011/11/05
2,RUSA-B07070,6056,"ALLEN, Heath",Maui Randonneurs / 911003,09:29,2011/11/05
3,RUSA-B07071,6566,"ALLEN, Scott Wayne",Santa Rosa Cycling Club / 905048,13:06,2011/11/05
4,RUSA-B07073,7184,"BARKER, Christopher D",San Francisco Randonneurs / 905030,11:09,2011/11/05
5,RUSA-B07074,4269,"BEATO, Keith",San Francisco Randonneurs / 905030,11:09,2011/11/05
...,...,...,...,...,...,...
1122,826190,6373,"Wilson, Charles",San Francisco Randonneurs / 905030,07:36,2022/02/19
1123,826191,7268,"Wilson, James",San Francisco Randonneurs / 905030,11:16,2022/02/19
1124,826192,15118,"Wu, Jasmine",San Francisco Randonneurs / 905030,08:36,2022/02/19
1125,826193,14760,"Zabell, Howard",Cycle Folsom / 905169,09:43,2022/02/19


In [15]:
# add route id as column; assign() builds a new frame, so this does not write
# into what may still be a filtered slice of an earlier frame
dat = dat.assign(route_id=rtid)
dat

Unnamed: 0,Cert#,RUSA#,Name,Club,Time,event_date,route_id
1,RUSA-B07069,7033,"ALEXANDER, Cory E",Randonneurs USA / 950095,08:21,2011/11/05,1158
2,RUSA-B07070,6056,"ALLEN, Heath",Maui Randonneurs / 911003,09:29,2011/11/05,1158
3,RUSA-B07071,6566,"ALLEN, Scott Wayne",Santa Rosa Cycling Club / 905048,13:06,2011/11/05,1158
4,RUSA-B07073,7184,"BARKER, Christopher D",San Francisco Randonneurs / 905030,11:09,2011/11/05,1158
5,RUSA-B07074,4269,"BEATO, Keith",San Francisco Randonneurs / 905030,11:09,2011/11/05,1158
...,...,...,...,...,...,...,...
1122,826190,6373,"Wilson, Charles",San Francisco Randonneurs / 905030,07:36,2022/02/19,1158
1123,826191,7268,"Wilson, James",San Francisco Randonneurs / 905030,11:16,2022/02/19,1158
1124,826192,15118,"Wu, Jasmine",San Francisco Randonneurs / 905030,08:36,2022/02/19,1158
1125,826193,14760,"Zabell, Howard",Cycle Folsom / 905169,09:43,2022/02/19,1158
