# Enduro World Series (EWS) web scraping and analysis

First off, what is enduro? Basically, it's downhill mountain biking where you have to pedal your way to each stage. Racers are timed on the downhill portion, and then have to pedal their way to the next stage (instead of taking a chair lift, etc.). It looks like:

![https://images.app.goo.gl/64AV4ZtHASXin8Ru9](img/muddy_enduro.gif)

But it also means a long day in the saddle. For example, here's the Strava summary of a race by pro enduro racer [Jesse Melamed](https://www.strava.com/activities/7260508291), who took 2nd place (less than half a second behind first!). On the clock, his time was 03:00.67 - whereas the total pedaling time was over three and a half hours!

![](img/example_ews.png)

Enduro racing at the world stage happens in the Enduro World Series, where the best of the best earn points by winning stages and races. At the end of the season a victor is crowned based on the number of points earned. We're going to take a look at the results in these races and look for trends that identify the types of performances that can crown a winner.

## Gather the data - web scraping
The following cells of this notebook download the results from the EWS for 2022. Like any good data science project, data wrangling takes 80% of the time...

First, we begin by downloading results and scraping the files from https://www.enduroworldseries.com/

In [229]:
import bs4
import requests
import typing_extensions
import re
import copy
import csv
import os
import traceback
import json

import pandas as pd

from PyPDF2 import PdfWriter, PdfReader

Some functions to make web scraping more pretty. We're using the `requests` package to make requests to the server.

In [None]:
# Connection settings shared by every API request made in this notebook.
# TODO: find out the file structure for races - it seems that each result is sorted by class in the form //race_results/class/class#
# TODO: determine the classes and class numbers present

import requests

# NOTE(review): the URL ends with a double slash; endpoint paths are appended directly to it - confirm the server needs it.
base_url = "https://a23ea854a37f.arangodb.cloud:8529/_db/EWSDB/api_production//"

# Empty request body sent with each GET.
payload = ""
# Request headers sent with each GET.
# SECURITY NOTE(review): the "Authorization" value below is a hard-coded Basic-auth credential stored in the notebook;
# consider loading it from an environment variable or a local, untracked config file instead.
headers = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.9",
    "Authorization": "Basic QVBJX0VXUzpJRG9BUElUaGluZ3NGb3JQZW9wbGUuITI=",
    "Connection": "keep-alive",
    "DNT": "1",
    "Origin": "https://www.enduroworldseries.com",
    "Referer": "https://www.enduroworldseries.com/",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "cross-site",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "macOS",
    "sec-gpc": "1"
}

In [None]:
def url_to_json_dict(url, payload=payload, headers=headers, save=False, folder="", filename="", year="2022"):
	"""Send a GET request to `url` and return the decoded JSON body.

	Args:
		url: Full URL to request.
		payload: Request body, forwarded as `data`.
		headers: HTTP headers sent with the request.
		save: When true, also write the decoded JSON to disk.
		folder: Middle part of the output path (used only when `save` is true).
		filename: Last part of the output path, without extension.
		year: First part of the output path.

	Returns:
		The object obtained by decoding the response text as JSON.

	Raises:
		json.JSONDecodeError: If the response body is not valid JSON.
	"""
	response = requests.request("GET", url, data=payload, headers=headers)
	parsed = json.loads(response.text)

	if save:
		# Dump the decoded data, not the Response object: a Response is not
		# JSON-serializable, so passing it to json.dump raised TypeError.
		with open(year+folder+filename+".json", 'w') as f:
			json.dump(parsed, f)

	return parsed

In [None]:
def url_to_json_string(url, payload=payload, headers=headers, save=False, folder="", filename="", year="2022"):
	"""Send a GET request to `url` and return the raw response text.

	Args:
		url: Full URL to request.
		payload: Request body, forwarded as `data`.
		headers: HTTP headers sent with the request.
		save: When true, also write the response text to disk.
		folder: Middle part of the output path (used only when `save` is true).
		filename: Last part of the output path, without extension.
		year: First part of the output path.

	Returns:
		The response body as a string (not decoded).
	"""
	response = requests.request("GET", url, data=payload, headers=headers)
	body = response.text

	if save:
		# Write the text as received: the Response object itself cannot be
		# passed to json.dump (it is not JSON-serializable and raised TypeError).
		with open(year+folder+filename+".json", 'w') as f:
			f.write(body)

	return body

In [None]:
# Endpoint path (appended to base_url) that is queried for the 2022 race list
url_races_2022 = "race_names/2022"
race_information = url_to_json_dict(f"{base_url}{url_races_2022}")

In [None]:
from urllib.parse import quote

# Human-readable name of every race returned by the API
race_names_2022 = [race['description'] for race in race_information]

# Map each race name to its URL path-segment form. quote() percent-encodes
# spaces as '%20' (as before) and also any other character that is not valid
# inside a URL path segment, so unusual race names cannot break the request.
race_url_strings_2022 = {race: quote(race, safe='') for race in race_names_2022}

In [None]:
# For every race, request the classes from the race_classes/2022/ endpoint
race_classes_2022 = {}
for race, race_string in race_url_strings_2022.items():
	race_classes_2022[race] = url_to_json_dict(f"{base_url}race_classes/2022/{race_string}")

In [None]:
# Individual rider query: race_results/rider/[rider class]/[rider #]
# Only the first element of the response is kept for this sample rider.
rider_result_test = url_to_json_dict(f"{base_url}race_results/rider/80467121/22930")[0]

# Fields copied from each stage entry when flattening a rider's results
results_format = [
	'time',
	'stage_result',
	'cumulative_result',
	'cumulative_behind',
	'overall_time',
]

In [None]:
print(rider_result_test[0]['stage'][:7].lower().replace(" ", "_"))

In [None]:
# create custom unpacking of data - convert data to columns
def unpack_stage_results(rider_results, rider_id, results_format=results_format, save=False):
	"""Flatten one rider's per-stage results into a header row and a value row.

	Args:
		rider_results: Sequence of per-stage mappings; each has a 'stage' label
			(e.g. 'Stage 1' or 'Stage 1PRO') and the keys named in `results_format`.
		rider_id: Identifier placed in the first column of the value row.
		results_format: Keys copied from every stage entry, in column order.
		save: Unused; kept so existing calls keep working.

	Returns:
		A `(header, values)` tuple of equal-length lists. `header` starts with
		'rider_id' followed by '<stage>_<field>' names such as 'stage_1_time';
		`values` starts with `rider_id` followed by the matching values.
	"""
	header = ['rider_id']
	values = [rider_id]

	for stage in rider_results:
		stage_label = stage['stage']
		if len(stage_label) > 7:
			# Drop suffixes such as 'PRO' (pro/queen stage) by keeping the text up
			# to the end of the stage number. A fixed 7-character cut turned
			# 'Stage 10' into 'Stage 1', colliding with the real stage 1.
			numbered = re.match(r'\D*\d+', stage_label)
			stage_label = numbered.group() if numbered else stage_label[:7]

		# 'Stage 1' -> 'stage_1_'
		prefix = stage_label.lower().replace(" ", "_") + "_"

		for field in results_format:
			header.append(prefix + field)
			values.append(stage[field])

	return header, values

In [None]:
# Download the results of every class of every 2022 race, attach each rider's
# per-stage results, and store one DataFrame (and one CSV file) per race/class.
#
# Example of a single entry of race_classes_2022['EWS Burke']:
#   {'name': 'MEN', '_key': '80467139'}

results_dict = dict()

for race_name, race_classes in race_classes_2022.items():

	for race_class in race_classes:

		# race information for a specific race class
		race_class_key = race_class['_key']
		race_class_desc = race_class['name']

		# download race results for a race class
		rider_class_results = url_to_json_dict(base_url+"race_results/class/"+race_class_key+"/1000/0")
		rider_class_df = pd.json_normalize(rider_class_results, 'results')

		# a class without any result has no 'rider_id' column to work with
		if rider_class_df.empty:
			continue

		# One dict per rider, keyed by column name. Building the frame from
		# dicts aligns values by stage name, so riders who did not record every
		# stage get empty cells instead of shifted or mismatched columns.
		rider_rows = []

		# the ID for riders in each class is used to download specific results
		for rider_id in rider_class_df['rider_id']:
			individual_results = url_to_json_dict(f"{base_url}race_results/rider/{race_class_key}/{rider_id}")

			# no stage data returned for this rider: the left merge below
			# leaves the stage columns empty for them
			if not individual_results:
				continue

			header, results = unpack_stage_results(individual_results[0], rider_id)
			rider_rows.append(dict(zip(header, results)))

		if rider_rows:
			stage_class_df = pd.DataFrame(rider_rows)
			full_rider_results = pd.merge(rider_class_df, stage_class_df, how='left', on='rider_id')
		else:
			full_rider_results = rider_class_df.copy()

		# remove the '_key' column
		full_rider_results.drop('_key', inplace=True, axis=1)

		# add in the race class
		full_rider_results.insert(0, 'race_class', value=race_class_desc)

		# add in the race name at the beginning of the dataframe
		full_rider_results.insert(0, 'race_name', value=race_name)

		results_dict.update({race_name + "_" + race_class_desc : full_rider_results})

		full_rider_results.to_csv(race_name + "_2022_" + race_class_desc + ".csv", index=False)

In [None]:
# Stack every per-class table whose key contains 'EWS ' into a single table
# and write it to one CSV file.
ews_frames = [frame for key, frame in results_dict.items() if 'EWS ' in key]
EWS_2022_results = pd.concat(ews_frames)
EWS_2022_results.to_csv('EWS_2022_results_by_race.csv', index=False)

# EWSE_2022_results = pd.concat([results for race_class, results in results_dict.items() if 'EWS-E' in race_class])
# EWSE_2022_results.to_csv('EWS-E_2022_results_by_race.csv',index=False)

In [None]:
results_dict

In [None]:
len(stage_results[-1])

In [None]:
stage_df = pd.DataFrame(stage_results[1:], columns=stage_results[0])
stage_df.rider_id = stage_df.rider_id.astype('str')

In [None]:
stage_df

In [None]:
riders.rider_id = riders.rider_id.astype('str')

In [None]:
# merging data together

pd.merge(riders, stage_df, how='left', on='rider_id')

In [None]:
# first stage entry of the sample rider (the variable is `rider_result_test`;
# the previous spelling `rider_results_test` is never defined in this notebook)
rider_result_test[0]

In [None]:
pd.json_normalize(rider_result_test[0])

In [None]:
def download_page(url):
	"""Download `url` and return its HTML parsed with BeautifulSoup.

	An HTTP error status is reported with a printed message but does not stop
	execution: whatever body the server returned is still parsed and returned.

	Args:
		url: Address of the page to download.

	Returns:
		A `bs4.BeautifulSoup` object built from the response text with the
		"html.parser" parser.
	"""
	headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

	response = requests.get(url, headers=headers)

	try:
		response.raise_for_status()
	except requests.HTTPError as e:
		# raise_for_status only raises HTTPError, so catch exactly that
		# instead of hiding unrelated failures behind a broad handler
		print(f'Downloading failed: {e}')

	return bs4.BeautifulSoup(response.text, "html.parser")

We then download the pages for each of the race years for standard EWS race results and EMTB race results

In [None]:
# Base addresses of the yearly result listings (EMTB and standard EWS)
emtb_result = "https://www.enduroworldseries.com/races/6/"
ews_result = "https://www.enduroworldseries.com/races/1/"

# Seasons to download for each series
emtb_years = [2020, 2021, 2022]
ews_years = [2018, 2019, 2020, 2021, 2022]

# One parsed page per season, in the same order as the year lists
list_of_emtb_soup = [download_page(f"{emtb_result}{year}/") for year in emtb_years]
list_of_ews_soup = [download_page(f"{ews_result}{year}/") for year in ews_years]

Now on to collecting the actual URLs which contain the results for each race.

In [None]:
def get_result_links(results_page, result_url, year):
	"""Collect the result links of one season from a parsed listing page.

	Args:
		results_page: Parsed page exposing `find_all('a', href=True)`.
		result_url: Listing URL; its last 9 characters are removed and the
			remainder is prefixed to every link (presumably stripping a
			'/races/N/' suffix to get the site root - verify against callers).
		year: Season to keep; a link is kept when `str(year)` occurs in its URL.

	Returns:
		A list of `(url, event_name)` tuples, where `event_name` is the
		fourth-from-last '/'-separated piece of the URL.
	"""
	site_root = result_url[:-9]
	year_text = str(year)

	selected = []
	for anchor in results_page.find_all('a', href=True):
		href = anchor.get('href')
		if 'results' not in href:
			continue

		full_url = site_root + str(href)
		if year_text in full_url:
			selected.append((full_url, full_url.split('/')[-4]))

	return selected

In [None]:
# Pair every season page with its year (both lists were built in year order)
links_to_emtb_results = [
	get_result_links(soup, emtb_result, year)
	for soup, year in zip(list_of_emtb_soup, emtb_years)
]
links_to_ews_results = [
	get_result_links(soup, ews_result, year)
	for soup, year in zip(list_of_ews_soup, ews_years)
]

In [None]:
def find_pdf_links(soup):
	"""Return `(href, link_text)` for every anchor whose href contains '.pdf'.

	Args:
		soup: Parsed page exposing `find_all('a', href=True)`.

	Returns:
		A list of tuples, in document order, pairing each matching href with
		the text of its anchor.
	"""
	pdf_links = []
	for anchor in soup.find_all('a', href=True):
		target = anchor.get('href')
		if '.pdf' in target:
			pdf_links.append((target, anchor.get_text()))
	return pdf_links

def download_pdf(url, folder_path, filename):
	"""Download `url` and save the response body to `folder_path + filename`.

	Args:
		url: Address of the file to download.
		folder_path: Prefix of the destination path (normally a folder name
			ending in '/').
		filename: Name appended to `folder_path`.

	Raises:
		requests.HTTPError: If the server answers with an error status.
	"""
	response = requests.get(url)
	response.raise_for_status()

	destination = folder_path + filename

	# Create the destination folder on first use; nothing else creates it,
	# so opening the file failed with FileNotFoundError on a fresh run.
	parent = os.path.dirname(destination)
	if parent:
		os.makedirs(parent, exist_ok=True)

	with open(destination, 'wb') as f:
		f.write(response.content)

In [None]:
def get_pdf_downloads(result_links, years, folder_path):
	"""Download every PDF linked from the given race result pages.

	Args:
		result_links: One list per year (same order as `years`) of
			`(race_link, race_name)` tuples.
		years: Seasons being downloaded; used as the file name prefix.
		folder_path: Destination prefix handed to `download_pdf`.

	Files are named '<year>_<race_name>_<link text with spaces as underscores>.pdf'.
	"""
	for position, year in enumerate(years):
		for race_link, race_name in result_links[position]:
			race_page = download_page(race_link)

			for pdf_link, pdf_text in find_pdf_links(race_page):
				name_parts = [str(year), race_name, pdf_text.replace(' ', '_')]
				filename = '_'.join(name_parts) + '.pdf'
				download_pdf(pdf_link, folder_path, filename)

The next two cells download all the pdfs for the desired years based off of the links

In [None]:
get_pdf_downloads(links_to_emtb_results, emtb_years, 'emtb_results/')

In [None]:
get_pdf_downloads(links_to_ews_results, ews_years, 'ews_results/')

## Reading the PDF and placing into dataframes
Now that we have downloaded the various PDFs, we need to pull the data out into a usable format. PDFs are tricky beasts, so we're going to rely on the `PyPDF2` package to read these data in. Unfortunately, these PDFs are not set up as tables (otherwise this would be a trivial import using the `camelot` package), so we need to use a bunch of regular expressions to extract the desired data. Then, we place the data in a pandas DataFrame.

In [None]:
# Regular expressions used to pull fields out of the text of a results PDF.

# --- header / page markers ---
# stage numbers in the PDF header
stage_numbers_regex = re.compile(r'Stage \d')
penalties_details_regex = re.compile(r'Penalties details')
penalty_line_regex = re.compile(r'DNF: did not finish   ·   DNS: did not start   ·   DSQ: disqualified')

# --- rider identification ---
# position, plate and name of a classified rider
position_plate_name_regex = re.compile(r'\d+\s\d+\s[^a-z]+[a-z]+\s')
# same, for a DNF/DNS/DSQ rider
dnf_dns_plate_name_regex = re.compile(r'(DNF|DNS|DSQ)\s\d+\s[^a-z]+[a-z]+\s')
# position and plate only
position_plate_regex = re.compile(r'\d+\s\d+')
# DNF/DNS/DSQ marker and plate only
dnf_dns_plate_regex = re.compile(r'(DNF|DNS|DSQ)\s\d+')
# last name: sits between plate and first name, no lowercase letters or digits
lastname_regex = re.compile(r'\s[^a-z0-9]+')
# first name: a capital letter followed by lowercase letters
# TODO make sure this matches correctly
firstname_regex = re.compile(r'([A-Z][a-z]+\s)+')
# rider ID
rider_id_regex = re.compile(r'\w{3}\.[\d\w\s]+\.[\d\w]+')

# --- times ---
# a single stage time (assumes all stages < 10 hours)
stage_time_regex = re.compile(r'(\d:\d\d:\d\d\.\d\d)')
# a stage time followed by the position on that stage
stage_position_regex = re.compile(r'\d:\d\d:\d\d\.\d\d \d+')
# gap from the overall leader
gap_regex = re.compile(r'\+\d:\d\d:\d\d\.\d\d')
# penalty value, which occurs right before the overall result
penalty_regex = re.compile(r'\d:\d\d:\d\d\.\d\d\s+\d:\d\d:\d\d\.\d\d')

#TODO need better regex for firstname/lastname - see G.T. CLYNE


This is ugly, but this is the function which reads the results PDF files and converts them to CSV. We convert to CSV for easy storage and loading into a pandas DataFrame.

In [None]:
def ews_pdf_to_csv(pdf_location, csv_location='csv_output/'):
	"""Parse an EWS results PDF and write one CSV row per rider.

	The first five lines of the first page are treated as the header (column
	names on the first two lines, then race location and date). On every page
	whose first line equals the header's first line, each rider is read from
	two consecutive text lines and picked apart with the module-level regular
	expressions; any other line is taken as the category of the riders that
	follow. Parsing stops at the first page containing the DNF/DNS/DSQ legend.

	Args:
		pdf_location: Path of the results PDF to read.
		csv_location: Prefix (normally a folder ending in '/') prepended to
			the output name, which is the PDF's file name with a '.csv'
			extension.

	Returns:
		None. The rows are written to the CSV file.

	Raises:
		IndexError: If a line does not have the expected layout, e.g. a
			classified rider whose line contains no time at all.
	"""
	# read in PDF and convert each page to list of strings
	reader = PdfReader(pdf_location)
	pages = [page.extract_text().split('\n') for page in reader.pages] # newline separates lines on all pages

	pdf_header = pages[0][:5]
	columns = pdf_header[0] + pdf_header[1] # first two lines are the column names for the file

	# race information
	num_stages = len(stage_numbers_regex.findall(columns)) # total number of stages based upon header
	race_date = pdf_header[3]
	race_location = pdf_header[2]
	race_type = 'standard'

	race_info = [race_date, race_location, race_type]

	header_race_info = ['date', 'race_location', 'race_type']
	header_rider_info = ['rider_category','rider_plate', 'rider_lastname', 'rider_firstname', 'rider_id', 'rider_final_position',
		'rider_penalties' , 'rider_final_time' , 'gap_from_first']
	header_rider_stage_results = ['stage_'+str(i)+'_time' for i in range(1,num_stages+1)] + ['stage_'+str(i)+'_pos' for i in range(1,num_stages+1)]

	# first row of the output is the column header
	all_results = [header_race_info + header_rider_info + header_rider_stage_results]

	# The current category is tracked across the whole document, not per page:
	# a category line applies to every rider row after it, including rows that
	# continue on the next page. Initialising it here also covers a rider line
	# that appears before any category line.
	rider_category = ''

	for page in pages:

		is_results_page = pdf_header[0] == page[0] # checks if the first line of the page matches the header

		# the abbreviation legend marks the penalties section: stop parsing there
		if any(penalty_line_regex.search(line) for line in page):
			break

		if not is_results_page:
			continue

		i = 5 # start after header

		# iterate over all lines except final (which contains metadata)
		while i < len(page) - 1:

			ppnr = position_plate_name_regex.search(page[i])
			ddr = dnf_dns_plate_name_regex.search(page[i])

			if not (ppnr or ddr):
				# not a rider line: this is category information for the following riders
				rider_category = page[i]
				i += 1
				continue

			# race_info only holds strings, so a shallow copy is enough
			result = list(race_info)

			# a rider's data spans two consecutive lines
			line1 = page[i]
			i += 1
			line2 = page[i]

			fix = stage_time_regex.sub(r' \1', line1+line2) # adds space before each stage time - used to fix issue with formatting of underlines
			fix = fix.replace('+ ', '+') # removes space before gap time

			if ppnr:
				info = ppnr.group()
				ppr = position_plate_regex.search(info)
			else:
				info = ddr.group()
				ppr = dnf_dns_plate_regex.search(info)

			position, plate = ppr.group().split(' ')

			lastname = lastname_regex.search(info).group()
			lastname = lastname[1:-2] # strip the leading space and the first letter of the first name
			firstname = firstname_regex.search(info).group()

			# [time, position] pairs, one per completed stage
			spr = [s.split(' ') for s in stage_position_regex.findall(fix)]

			rir = rider_id_regex.search(fix)
			pr = penalty_regex.search(line1)
			gr = gap_regex.search(fix)
			st = stage_time_regex.findall(line1)

			rider_num = rir.group() if rir else None
			penalty_time = pr.group().split(' ')[0] if pr else None

			result += [rider_category, plate, lastname, firstname, rider_num, position, penalty_time]

			if ppnr:
				if gr:
					# the last two times on the first line are final time and gap
					final_time = st[-2]
					gap = st[-1]
				else:
					if not st: # no time found: print the line before the lookup below fails
						print(f'string: {page[i]}\nline: {i}')
					final_time = st[-1]
					gap = '0:00:00.00'

				result += [final_time, gap]
				result += [stage_time for stage_time, stage_pos in spr]
				result += [stage_pos for stage_time, stage_pos in spr]

			else:
				result += [None, None] # no gap or final time for DNF/DNS/DSQ
				stage_diff = num_stages - len(spr) # how many stages were not completed

				result += [stage_time for stage_time, stage_pos in spr] + [None] * stage_diff
				result += [stage_pos for stage_time, stage_pos in spr] + [None] * stage_diff

			all_results.append(result)
			i += 1

	# output name: the PDF's file name with its extension removed
	pdf_filename = os.path.splitext(os.path.basename(pdf_location))[0]
	csv_path = csv_location + pdf_filename + '.csv'

	# create the output folder on first use so a fresh run does not fail
	csv_folder = os.path.dirname(csv_path)
	if csv_folder:
		os.makedirs(csv_folder, exist_ok=True)

	with open(csv_path, 'w', newline='') as cw:
		csv.writer(cw).writerows(all_results)


### Running the script and saving to .csv

Now that we've got something to convert these standardized PDFs to CSV, we run them through the script and create a new CSV for each file.

In [None]:
# Convert every '*_Results*.pdf' in ews_results/; a file that cannot be
# parsed is reported with its traceback and the loop moves on to the next one.
for file in os.listdir('ews_results'):
	is_results_pdf = file.endswith('.pdf') and '_Results' in file
	if not is_results_pdf:
		continue

	try:
		ews_pdf_to_csv('ews_results/'+file)
	except IndexError as e:
		print(f'IndexError: {e}\nFile {file}')
		print(traceback.format_exc())

### Loading into a DataFrame

In [None]:
from operator import index


# Paths of the converted CSV files. os.listdir returns every entry of the
# folder in arbitrary order, so keep only '.csv' files (other files would be
# handed to read_csv) and sort them to make the order reproducible.
csv_directory = sorted('csv_output/'+ file for file in os.listdir('csv_output') if file.endswith('.csv'))
df_list = []

df = pd.read_csv(csv_directory[0], header=0)

# one DataFrame per race file
for file in csv_directory:
	df_list.append(pd.read_csv(file,index_col=False))


# df = pd.concat(df_list,ignore_index=True)

In [None]:
csv_directory[0]

In [None]:
df_list[0].head()

In [None]:
ews_pdf_to_csv("raw_pdf/test3.pdf")

In [None]:
os.path.split("raw_pdf/test3.pdf")

In [None]:
df = pd.read_csv('csv_output/test3.csv')

In [None]:
pages[-1][0] == pdf_header[0]

In [None]:
len(all_results[0]) == len(all_results[1])

In [None]:
all_results[1]

In [None]:
page2 = page2text.split('\n')

In [None]:
# Scratch cell: sample rider lines used to try out the regular expressions.
# NOTE: this first pair is overwritten by the second pair below, so only the
# DA SILVA example is actually used.
line1 = '4 13 HILL Sam 0:03:28.58 22 0:05:44.14 28 0:04:55.69 190:06:44.10 1 0:37:40.33 +0:00:08.61'
line2 = '  Chain Reaction Cycles Mavic AUS.HILS.1985 0:07:19.73 7 0:04:17.81 20:05:10.28 1'

# second sample: a line with two adjacent times before the gap (the shape penalty_regex looks for)
line1 = '147 122 DA SILVA Goncalo 0:04:11.38 125 0:09:38.38 157 0:07:50.37 154 0:11:31.81 153 0:01:00.00 1:10:34.72 +0:33:03.00'
line2 = ' POR.DA G.1987 0:19:28.41 147 0:08:13.09 148 0:08:41.28 148'

fix = stage_time_regex.sub(r' \1', line1+line2) # adds space before each stage time - used to fix issue with formatting of underlines
fix = fix.replace('+ ', '+') # removes space before gap time

In [None]:
fix

In [None]:
ppnr = position_plate_name_regex.search(fix)
spr = stage_position_regex.findall(fix)
rir = rider_id_regex.search(fix)

In [None]:
pr = penalty_regex.search(fix)

In [None]:
pr.group()

In [None]:
ppnr.group()

In [None]:
spr = [s.split(' ') for s in spr]

In [None]:
spr

In [None]:
rir.group()