# Enduro World Series (EWS) web scraping and analysis

This notebook downloads the Enduro World Series (EWS) race results, from the series' inception to the present, and analyzes them.

First, we scrape the result files from https://www.enduroworldseries.com/

In [1]:
import re
from pathlib import Path
from urllib.parse import urljoin

import bs4
import pandas
import requests
import typing_extensions
from PyPDF2 import PdfWriter, PdfReader


Some helper functions to make the web scraping tidier. We use the `requests` package to download page information and `bs4` to return it in a cleaned-up format.

In [2]:
def download_page(url, timeout=30):
	"""Download a web page and return it parsed as HTML.

	Parameters
	----------
	url : str
		Address of the page to fetch.
	timeout : float, optional
		Seconds to wait for the server before giving up (default 30).
		Without a timeout `requests.get` may block indefinitely.

	Returns
	-------
	bs4.BeautifulSoup
		The response body parsed with the built-in "html.parser". When the
		server answers with an HTTP error status the failure is printed and
		the body of that response is still parsed and returned.

	Raises
	------
	requests.exceptions.RequestException
		For connection-level failures (including timeouts); these are not
		handled here.
	"""
	# Browser-like User-Agent header sent with the request.
	headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

	req = requests.get(url, headers=headers, timeout=timeout)

	try:
		req.raise_for_status()
	except requests.exceptions.HTTPError as e:
		# Best effort: report the bad status and carry on with whatever was returned.
		print(f'Downloading failed: {e}')

	return bs4.BeautifulSoup(req.text, "html.parser")

In [4]:
# test download of 2016 results
# The parsed page is kept in `ews_soup` so the following cells can search it for links.

test_url = "https://www.enduroworldseries.com/races/1/2016/" 

ews_soup = download_page(test_url)

In [5]:
# Show the parsed page to inspect its markup.
ews_soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Enduro World Series | EWS Race Series</title>
<meta content="" name="description"/>
<meta content="Enduro World Series Race Labs" name="author"/>
<meta content="Enduro World Series | EWS Race Series" property="og:title">
<meta content="List of all the events in the EWS calendar for the current year" property="og:description">
<meta content="https://admin.enduroworldseries.com/uploads/pagebanners/EWS_banner.png" property="og:image">
<!-- Mobile Meta -->
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="/images/favicon.png" rel="shortcut icon"/>
<link href="https://fonts.googleapis.com/css?family=Open+Sans:400italic,700italic,400,700,300&amp;subset=latin,latin-ext" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=PT+Serif" rel="stylesheet" type="text/css"/>
<link href="https://fonts.googleapis.com/css?family=Exo:100,200,300,400,500,500i,600,700,8

In [6]:
# Collect the target of every anchor on the downloaded page.
links = [anchor.get('href') for anchor in ews_soup.find_all('a', href=True)]

# Keep only the links that point at a results page and resolve each one against
# the page URL. urljoin handles root-relative and absolute hrefs alike, so no
# hand-counted slice of `test_url` is needed.
race_list_URL = [urljoin(test_url, link) for link in links if 'results' in link]

In [116]:
# Show the collected result-page URLs.
race_list_URL

['https://www.enduroworldseries.com/race/ews-round-1/ews-tweed-valley/202186/results/',
 'https://www.enduroworldseries.com/race/ews-round-2/ews-petzen-jamnica/202187/results/',
 'https://www.enduroworldseries.com/race/ews-round-3/ews-val-di-fassa-trentino/202188/results/',
 'https://www.enduroworldseries.com/race/ews-round-4/ews-whistler/202189/results/',
 'https://www.enduroworldseries.com/race/ews-round-5/ews-burke/202190/results/',
 'https://www.enduroworldseries.com/race/ews-round-6/ews-sugarloaf/202191/results/',
 'https://www.enduroworldseries.com/race/ews-round-7/ews-crans-montana/202192/results/',
 'https://www.enduroworldseries.com/race/ews-round-8/ews-loudenvielle/202193/results/',
 'https://www.enduroworldseries.com/race/ews-round-1/MontenbaikEnduroWorldSeries/201601/results/',
 'https://www.enduroworldseries.com/race/ews-round-2/CerroCatedralMontenbaikEnduroWorldSeriespresentedbyShimano/201602/results/',
 'https://www.enduroworldseries.com/race/ews-round-3/EmeraldEnduropre

16


In [7]:
# Download the first result page as a sample to look for PDF links in.
test_race = download_page(race_list_URL[0])

In [8]:
# Pair the URL of every PDF linked from the sample race page with the text of its link.
test_pdf = [
	(anchor.get('href'), anchor.get_text())
	for anchor in test_race.find_all('a', href=True)
	if '.pdf' in anchor.get('href')
]

In [9]:
# Show the (PDF URL, link text) pairs found on the sample race page.
test_pdf

[('https://admin.enduroworldseries.com/uploads/documents/races/1654354218.pdf',
  'Results Pro Stage'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654368581.pdf',
  'Results EWS100'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654368628.pdf',
  'Results EWS80'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654469829.pdf',
  'Results EWS'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654469868.pdf',
  'Rankings Series'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654469962.pdf',
  'Rankings Teams'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654535565.pdf',
  'Split S1'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654535596.pdf',
  'Split S3'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654535624.pdf',
  'Split S5'),
 ('https://admin.enduroworldseries.com/uploads/documents/races/1654535653.pdf',
  'Split S6')]

In [117]:
# Break the first result URL into its '/'-separated components.
race_list_URL[0].split('/')

['https:',
 '',
 'www.enduroworldseries.com',
 'race',
 'ews-round-1',
 'ews-tweed-valley',
 '202186',
 'results',
 '']

In [37]:
# Download one results PDF and store it under raw_pdf/.
pdf_path = Path('raw_pdf/test3.pdf')
pdf_path.parent.mkdir(parents=True, exist_ok=True)  # open() will not create a missing directory

# A timeout keeps the cell from hanging if the server never answers.
req = requests.get('https://admin.enduroworldseries.com/uploads/documents/races/1584572245.pdf', timeout=30)
req.raise_for_status()  # stop here rather than save an error response as a .pdf

with open(pdf_path, 'wb') as f:
	f.write(req.content)

In [2]:
# Open a stored results PDF and turn every page into a list of its text lines
# (the extracted text of a page uses '\n' between lines).
reader = PdfReader("raw_pdf/test2.pdf")
pages = [text.split('\n') for text in (pdf_page.extract_text() for pdf_page in reader.pages)]

In [68]:
# Regular expressions that pull the individual fields out of the text of a results PDF.

# --- page header ---------------------------------------------------------
# 'Stage N' labels in the column header (counted to get the number of stages)
stage_numbers_regex = re.compile(r'Stage \d')

# --- rider identity ------------------------------------------------------
# position, plate and name at the start of a classified rider's line
position_plate_name_regex = re.compile(r'\d+\s\d+\s[^a-z]+[a-z]+\s')
# same, for riders listed as DNF / DNS / DSQ instead of a position
dnf_dns_plate_name_regex = re.compile(r'(DNF|DNS|DSQ)\s\d+\s[^a-z]+[a-z]+\s')
# just the position and plate numbers
position_plate_regex = re.compile(r'\d+\s\d+')
# just the DNF / DNS / DSQ marker and the plate number
dnf_dns_plate_regex = re.compile(r'(DNF|DNS|DSQ)\s\d+')
# last name: sits between plate and first name, contains no lowercase letters or digits
lastname_regex = re.compile(r'\s[^a-z0-9]+')
# first name: one capital letter followed by lowercase letters
firstname_regex = re.compile(r'[A-Z][a-z]+')
# rider ID, three dot-separated parts
rider_id_regex = re.compile(r'\w{3}\.[\d\w\s]+\.[\d\w]+')

# --- times ---------------------------------------------------------------
# a single time value (assumes every time is under 10 hours)
stage_time_regex = re.compile(r'(\d:\d\d:\d\d\.\d\d)')
# a stage time together with the position that follows it
stage_position_regex = re.compile(r'\d:\d\d:\d\d\.\d\d \d+')
# two consecutive times: the penalty value appears directly before the overall time
penalty_regex = re.compile(r'\d:\d\d:\d\d\.\d\d\s+\d:\d\d:\d\d\.\d\d')
# gap to the overall leader, written with a leading '+'
gap_regex = re.compile(r'\+\d:\d\d:\d\d\.\d\d')

#TODO need better regex for firstname/lastname - see G.T. CLYNE


In [59]:
# The first five lines of the first page are treated as the header block.
pdf_header = pages[0][:5]
columns = pdf_header[0] + pdf_header[1] # first two lines are the column names for the file

# race information
num_stages = len(stage_numbers_regex.findall(columns)) # store the total number of stages based upon header
# NOTE(review): in the page text displayed elsewhere in this notebook the header
# lines read [2] location, [3] date, [4] 'RESULTS OVERALL' - confirm that the
# indices 4 / 3 used here match the layout of the PDF being parsed.
race_date = pdf_header[4]
race_location = pdf_header[3]
race_type = 'standard'

# Values for the three race-level columns. Kept under its own name so they are
# not lost when `race_info` is set to the column header below.
race_metadata = [race_date, race_location, race_type]

header_race_info = ['date', 'race_location', 'race_type']
header_rider_info = ['rider_category','rider_plate', 'rider_lastname', 'rider_firstname', 'rider_id', 'rider_final_position',
	      'rider_penalties' , 'rider_final_time' , 'gap_from_first']
# one time column per stage, followed by one position column per stage
header_rider_stage_results = ['stage_'+str(i)+'_time' for i in range(1,num_stages+1)] + ['stage_'+str(i)+'_pos' for i in range(1,num_stages+1)]

# A list whose single row is the complete column header.
race_info = [header_race_info + header_rider_info + header_rider_stage_results]

In [53]:
# Show the assembled column header.
race_info

[['date',
  'race_location',
  'race_type',
  'rider_category',
  'rider_plate',
  'rider_lastname',
  'rider_firstname',
  'rider_id',
  'rider_final_position',
  'rider_penalties',
  'rider_final_time',
  'gap_from_first',
  'stage_1_time',
  'stage_2_time',
  'stage_3_time',
  'stage_4_time',
  'stage_5_time',
  'stage_6_time',
  'stage_1_pos',
  'stage_2_pos',
  'stage_3_pos',
  'stage_4_pos',
  'stage_5_pos',
  'stage_6_pos']]

In [72]:
# Walk through every page of the PDF and print one parsed row per rider.
for page in pages:

	is_results_page = pdf_header[0] == page[0] # checks if the first line of the page matches the header

	if is_results_page:
		i = 5 # start after header
		rider_category = '' # reset for every page; filled by the first non-rider line

		# iterate over all lines except final (which contains metadata)
		while i < len(page) - 1:

			ppnr = position_plate_name_regex.search(page[i])
			ddr = dnf_dns_plate_name_regex.search(page[i])

			if ppnr or ddr: # check if line contains rider information 
				# a rider's data is read from this line and the one that follows it
				line1 = page[i]
				i += 1
				line2 = page[i]

				fix = stage_time_regex.sub(r' \1', line1+line2) # adds space before each stage time - used to fix issue with formatting of underlines
				fix = fix.replace('+ ', '+') # removes space before gap time

				if ppnr:
					# classified rider: numeric position followed by plate
					info = ppnr.group()
					ppr = position_plate_regex.search(info)
				else:
					# DNF / DNS / DSQ takes the place of the position
					info = ddr.group()
					ppr = dnf_dns_plate_regex.search(info)

				# split on any whitespace, matching the \s used in the regexes
				position, plate = ppr.group().split()

				# the lastname match carries a leading space and runs into the first
				# letter of the firstname, e.g. ' DA SILVA G' -> 'DA SILVA'
				lastname = lastname_regex.search(info).group()
				lastname = lastname[1:-2]
				firstname = firstname_regex.search(info).group()

				# stage results and gap are extracted but not yet part of the row (see TODO)
				spr = stage_position_regex.findall(fix)
				rir = rider_id_regex.search(fix)
				pr = penalty_regex.search(fix)
				gr = gap_regex.search(fix)

				# the rider ID is not present for every rider
				rider_num = None
				if rir:
					rider_num = rir.group()

				# first of the two consecutive times matched by penalty_regex
				penalty_time = None
				if pr:
					penalty_time = pr.group().split(' ')[0]

				result = [rider_category, plate, lastname, firstname, rider_num, position, penalty_time]
				print(result)
				i += 1

				#TODO add in the final results

			else:	# otherwise, this is category information for the following riders
				rider_category = page[i]
				i += 1
			

MEN
['MEN', '1', 'HILL', 'Sam', 'AUS.1985.21775', '1', None]
['MEN', '6', 'MAES', 'Martin', 'BEL.1997.21636', '2', None]
['MEN', '11', 'WALLNER', 'Robin', 'SWE.1988.21594', '3', None]
['MEN', '63', 'WILDHABER', 'Gusti', 'SUI.1988.21848', '4', None]
['MEN', '12', 'MELAMED', 'Jesse', 'CAN.1992.21596', '5', None]
['MEN', '24', 'JOHANSEN', 'Zakarias', 'NOR.1992.21863', '6', None]
['MEN', '29', 'CARLSON', 'Josh', 'AUS.1986.21659', '7', None]
['MEN', '17', 'OSBORNE', 'Marco', 'USA.1992.21762', '8', None]
['MEN', '30', 'TORDO', 'Dimitri', 'FRA.1993.21764', '9', None]
['MEN', '50', 'NEER', 'Shawn', 'USA.1991.21712', '10', None]
['MEN', '5', 'OTON', 'Damien', 'FRA.1987.21728', '11', None]
['MEN', '7', 'NICOLAI', 'Florian', 'FRA.1993.21749', '12', None]
['MEN', '64', 'BURNS', 'Pedro', 'CHI.1997.21729', '13', None]
['MEN', '37', 'DENIAUD', 'Youn', 'FRA.1996.22994', '14', None]
['MEN', '79', 'ACUNA', 'Maurico', None, '15', None]
['MEN', '23', 'BARELLI', 'Yoann', 'FRA.1985.21889', '16', None]
MEN
[

In [122]:
# Does the last page start with the same line as the header, i.e. is it a results page?
pages[-1][0] == pdf_header[0]

False

In [10]:
# Raw extracted text of the first page, before it is split into lines.
reader.pages[0].extract_text()

'Pos Plate Name Nat Stage 1 Pos Stage 2 Pos Stage 3 Pos Stage 4 Pos Penalties Time Gap\n   Team EMBA Stage 5 Pos Stage 6 Pos Stage 7 PosCrankworx Rotorua Giant Toa Enduro\nRotorua, New Zealand\nMarch 26, 2017\nRESULTS OVERALL\nMEN \n1 39 MASTERS Wyn 0:03:16.65 2 0:05:37.35 5 0:04:54.78 17 0:06:48.68 6 0:37:31.72\n  GT Factory Racing NZL.MASW.1987 0:07:06.31 2 0:04:26.45 19 0:05:21.50 15\n2 62 WALKER Matt 0:03:22.02 8 0:05:31.33 2 0:04:39.74 4 0:06:53.18 12 0:37:35.40 +0:00:03.68\n NZL.WALM.1990 0:07:27.86 18 0:04:24.55 11 0:05:16.72 6\n3 63 MASTERS Eddie 0:03:32.63 330:05:29.15 10:04:27.36 1 0:06:53.40 13 0:37:37.95 +0:00:06.23\n NZL.MASE.1989 0:07:11.22 4 0:04:46.00 59 0:05:18.19 11\n4 13 HILL Sam 0:03:28.58 22 0:05:44.14 28 0:04:55.69 190:06:44.10 1 0:37:40.33 +0:00:08.61\n  Chain Reaction Cycles Mavic AUS.HILS.1985 0:07:19.73 7 0:04:17.81 20:05:10.28 1\n5 7 CALLAGHAN Greg 0:03:37.79 52 0:05:38.72 10 0:04:49.21 8 0:06:48.44 5 0:37:58.21 +0:00:26.49\n  Cube Action Team IRL.CALG.1991 0

In [11]:
# Split the text of one page into its lines.
# NOTE(review): `page2text` is not assigned anywhere in the visible notebook -
# presumably the extract_text() output of a single page; confirm, otherwise
# this cell raises NameError on a fresh kernel.
page2 = page2text.split('\n')

In [94]:
# note the issues with the underlines 
# `pages` already stores each page as a list of lines, so it is shown as is
# (calling .split on that list would raise AttributeError).
pages[11]

['Pos Plate Name Nat Stage 1 Pos Stage 2 Pos Stage 3 Pos Stage 4 Pos Penalties Time Gap',
 '   Team EMBA Stage 5 Pos Stage 6 Pos Stage 7 PosCrankworx Rotorua Giant Toa Enduro',
 'Rotorua, New Zealand',
 'March 26, 2017',
 'RESULTS OVERALL',
 'MEN | Master 40+',
 '17 724 TAGUE Alex 0:04:08.14 10 0:06:10.11 9 0:07:12.94 27 0:07:44.67 11 0:50:45.24 +0:11:11.53',
 ' NZL.TAGA.1976 0:11:52.82 21 0:06:53.99 24 0:06:42.57 14',
 '18 729 DAWSON Guy 0:04:43.30 21 0:07:14.50 24 0:06:32.52 20 0:09:08.07 22 0:51:33.69 +0:11:59.98',
 ' 0:10:44.76 15 0:06:12.16 17 0:06:58.38 18',
 '19 730 SCHAUT Duncan 0:05:04.41 25 0:07:12.12 23 0:06:01.95 16 0:08:48.45 21 0:51:37.63 +0:12:03.92',
 ' 0:10:54.67 17 0:06:29.37 20 0:07:06.66 19',
 '20 717 PIRES Armando 0:04:40.32 20 0:07:00.89 20 0:06:28.87 18 0:08:35.49 20 0:51:50.61 +0:12:16.90',
 ' BRA.PIRA.1974 0:11:32.84 20 0:06:22.62 18 0:07:09.58 21',
 '21 728 SPRAGUE Steve 0:04:48.68 23 0:07:02.20 21 0:06:47.25 23 0:09:09.64 23 0:54:27.44 +0:14:53.73',
 ' 0:12:5

In [63]:
# Two sample riders, each given as the two raw lines taken from the PDF text.
# Both are kept under their own name so neither is silently overwritten.

# Sample with times fused to the preceding position ('190:06:44.10', '20:05:10.28').
fused_times_sample = (
	'4 13 HILL Sam 0:03:28.58 22 0:05:44.14 28 0:04:55.69 190:06:44.10 1 0:37:40.33 +0:00:08.61',
	'  Chain Reaction Cycles Mavic AUS.HILS.1985 0:07:19.73 7 0:04:17.81 20:05:10.28 1',
)

# Sample with two consecutive times before the gap and a rider ID that contains a space.
penalty_sample = (
	'147 122 DA SILVA Goncalo 0:04:11.38 125 0:09:38.38 157 0:07:50.37 154 0:11:31.81 153 0:01:00.00 1:10:34.72 +0:33:03.00',
	' POR.DA G.1987 0:19:28.41 147 0:08:13.09 148 0:08:41.28 148',
)

# The sample inspected in the following cells.
line1, line2 = penalty_sample

fix = stage_time_regex.sub(r' \1', line1+line2) # adds space before each stage time - used to fix issue with formatting of underlines
fix = fix.replace('+ ', '+') # removes space before gap time

In [26]:
# Show the joined sample text after the spacing fixes.
fix

'147 122 DA SILVA Goncalo  0:04:11.38 125  0:09:38.38 157  0:07:50.37 154  0:11:31.81 153  0:01:00.00  1:10:34.72 +0:33:03.00 POR.DA G.1987  0:19:28.41 147  0:08:13.09 148  0:08:41.28 148'

In [64]:
# Try the regexes on the sample text.
ppnr = position_plate_name_regex.search(fix)  # first position/plate/name match, or None
spr = stage_position_regex.findall(fix)  # list of every 'time position' pair
rir = rider_id_regex.search(fix)  # first rider ID match, or None

In [28]:
# Look for two consecutive times in the sample text (None when there are none).
pr = penalty_regex.search(fix)

In [29]:
# Text matched by penalty_regex: two times separated only by whitespace.
pr.group()

'0:01:00.00  1:10:34.72'

In [65]:
# Text matched by position_plate_name_regex.
ppnr.group()

'147 122 DA SILVA Goncalo '

In [93]:
# Every 'time position' pair found in the sample text.
spr

['0:04:11.38 125',
 '0:09:38.38 157',
 '0:07:50.37 154',
 '0:11:31.81 153',
 '0:19:28.41 147',
 '0:08:13.09 148',
 '0:08:41.28 148']

In [39]:
# Text matched by rider_id_regex.
rir.group()

'POR.DA G.1987'