# Scraping trail running races 2021

This notebook scrapes trail running race data from https://itra.run/races. The environment is prepared by one of the following scripts:
- WindowsEnvironment.ps1
- MacOSEnvironment.sh

In [None]:
import json
import re
import sys
import time

import pandas as pd
import whois
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from tqdm import tqdm

# Print the path of the Python interpreter running this kernel, to confirm
# which environment the notebook is using.
print(sys.executable)

Obrim un navegador i accedim a la plana que volem treballar.

In [None]:
# Start a Firefox session driven by selenium and load the ITRA race listing
driver = webdriver.Firefox()
driver.get("https://itra.run/races")

# Read the title of the loaded page and print it as a quick check that the
# navigation worked
get_title = driver.title
print(get_title)


Un cop oberta la plana canviem l'idioma a anglès i seleccionem les dates d'inici i de fi.

In [None]:
# Switch the site language: open the language dropdown in the navigation bar,
# then click its first entry (EN).
LANGUAGE_MENU_XPATH = "/html/body/div[1]/div[1]/nav/div[4]"
driver.find_element(By.XPATH, LANGUAGE_MENU_XPATH).click()
driver.find_element(By.XPATH, LANGUAGE_MENU_XPATH + "/div/div[1]").click()

# Wait for page load after click
time.sleep(5)


def select_date(driver, pickerClass, year, month, day, firstYearShown=2020):
    """Pick a date in one of the page's date pickers by clicking through it.

    The picker is opened, zoomed out from the day view to the month view and
    then to the year view, and finally the year, month and day cells are
    clicked in turn.

    Parameters
    ----------
    driver : selenium WebDriver showing the races page.
    pickerClass : CSS class identifying the picker ("dp1" for the start date,
        "dp2" for the end date).
    year, month, day : the date to select (month and day are 1-based).
    firstYearShown : year displayed in the first cell of the year view.
        NOTE(review): 2020 is the value the index arithmetic relied on when
        this was written; confirm it still matches what the picker shows.

    Raises
    ------
    selenium NoSuchElementException if the picker controls are not found,
    IndexError if the requested cell is not among the rendered cells.
    """
    scope = "." + pickerClass + " "

    # Open the picker, then go up from days to months and from months to years
    driver.find_element(By.CSS_SELECTOR, "div.vdp-datepicker." + pickerClass).click()
    driver.find_element(By.CSS_SELECTOR, scope + "span.day__month_btn.up").click()
    driver.find_element(By.CSS_SELECTOR, scope + "span.month__year_btn.up").click()

    # Year cells are addressed by their offset from the first year shown
    yearCells = driver.find_elements(By.CSS_SELECTOR, scope + "span.cell.year")
    yearCells[year - firstYearShown].click()

    monthCells = driver.find_elements(By.CSS_SELECTOR, scope + "span.cell.month")
    monthCells[month - 1].click()

    # The day grid starts with blank cells standing in for the previous
    # month's days in the first week; skip over them when indexing.
    dayCells = driver.find_elements(By.CSS_SELECTOR, scope + "span.cell.day")
    blankCells = driver.find_elements(By.CSS_SELECTOR, scope + "span.cell.day.blank")
    dayCells[day + len(blankCells) - 1].click()


# Select init date
initYear = 2021
initMonth = 1
initDay = 1
select_date(driver, "dp1", initYear, initMonth, initDay)

# Select end date
endYear = 2021
endMonth = 12
endDay = 31
select_date(driver, "dp2", endYear, endMonth, endDay)

Carreguem totes les curses fins que no n'hi hagi més.

In [None]:
# Keep clicking the "more races" button so the full list ends up on screen.
maxIterations = 10  # maximum number of clicks; -1 for ALL
SEE_MORE_SELECTOR = 'button.btn-itra-black[type="button"]'


def find_see_more_button(driver):
    """Return the "more races" button element, or None when it is not on the page."""
    try:
        return driver.find_element(By.CSS_SELECTOR, SEE_MORE_SELECTOR)
    except NoSuchElementException:
        return None


i = 0
btnSeeMore = find_see_more_button(driver)
if btnSeeMore is None:
    print("No button")

# A negative maxIterations means "no limit": click until the button is gone.
while btnSeeMore is not None and (maxIterations < 0 or i < maxIterations):
    i = i + 1
    btnSeeMore.click()
    # Give the next batch of races time to load BEFORE looking the button up
    # again, so the next click never targets an element from the old page state.
    time.sleep(5)
    btnSeeMore = find_see_more_button(driver)
    if btnSeeMore is None:
        print("No button")

print("No more results")

Ara que ja tenim totes les curses carregades obtenim les dades analitzant el codi HTML amb BeautifulSoup.

In [None]:
# Getting current URL source code 
get_source = driver.page_source
time.sleep(2)

In [None]:
# Scraping race names with BeautifulSoup
soup = BeautifulSoup(get_source, 'html')
#print(soup.h5)
#soup.find_all('h5')

In [None]:
# Find the race names
racesList = re.findall(r'(?<=<h5 data-v-f3c4ac1c="" class="itra-green">)(.*?)(?=</h5>)', get_source)
print(len(racesList))
print(racesList)

In [None]:
# Find the mouse over link pointing to the race site
links = [a['href'] for a in soup.find_all('a',"card ontop", href=True)]
print(len(links))
print(links)

In [None]:
# Scraping the data for distance, elevation gain and loss
myList = re.findall(r'(?<=<span class="icon-text-grey icon-bold">)(.*?)(?=</span>)', get_source)
#print(myList)
#len(myList)

# Find the race distance
distancesList = myList[0::3]
print(len(distancesList))

# Find the race elevation gain
gainList = myList[1::3]
print(len(gainList))

# Find the race elevation loss
lossList = myList[2::3]
print(len(lossList))

# Find the race date
datesList = re.findall(r'(?<=<span data-v-f3c4ac1c="" class="itra-grey" style="margin-top: 0.2rem; margin-left: 0.2rem; margin-right: 2rem; font-size: 80%;">)(.*?)(?=</span>)', get_source)
#print(datesList)
print(len(datesList))

# Loop
# Visit race page in Itra
# Scraping www, place, topology, number of participants

In [None]:
# Find the number of finishers of the race
#<span class="icon-finisher icon-bold">370</span>
finishersList = re.findall(r'(?<=<span class="icon-finisher icon-bold">)(.*?)(?=</span>)', get_source)
print(finishersList)
len(finishersList)

In [None]:
# Assign data to tuples: # get the list of tuples from two lists and merge them by using zip(). 
list_of_tuples = list(zip(racesList, links, distancesList, gainList, lossList, datesList)) 
# Converting lists of tuples into pandas Dataframe. 
df = pd.DataFrame(list_of_tuples, columns = ['Name', 'Link', 'Distance', 'Gain', 'Loss', 'Date'])
df



In [None]:
# TODO: the data must be saved in a suitable format. JSON is used here, but
# the assignment statement requires CSV, so this may need to change.

# Convert the races table to a list of records (one dict per race) and save
# it as JSON. The previous version dumped a name that was never defined.
with open('result.json', 'w') as fp:
    json.dump(df.to_dict(orient='records'), fp)

In [None]:





# NOTE(review): iList, iiList and iiList_of_tuples are not assigned anywhere
# in the visible notebook, so on a fresh kernel this cell raises NameError.
# Presumably leftover debugging output - confirm and remove, or define them.
print(iList)
print(iiList)
print(iiList_of_tuples)

In [None]:
# Now scrape the data from the dedicated page of each race.

# List with race websites
websitesList = []

# List with race place and country
locationList = []

# List containing the table for each website (as a list)
tableList = []

# Links whose page was scraped successfully. Races that time out are skipped,
# so entry k of the three lists above belongs to scrapedLinks[k], not links[k].
scrapedLinks = []

LANGUAGE_MENU_XPATH = "/html/body/div[1]/div[1]/nav/div[4]"

for raceLink in tqdm(links):
    print(raceLink)
    # Open a separate Firefox window for this race page
    driver2 = webdriver.Firefox()

    try:
        # The page load is inside the try so a load time-out is handled too
        driver2.get(raceLink)
        time.sleep(3)

        # Click dropdown menu for language selection, then select language EN
        driver2.find_element(By.XPATH, LANGUAGE_MENU_XPATH).click()
        driver2.find_element(By.XPATH, LANGUAGE_MENU_XPATH + "/div/div[1]").click()

        time.sleep(15)

        # Getting the source code of the race page
        get_source2 = driver2.page_source
        time.sleep(2)
    except TimeoutException:
        # If the link is broken, skip this race and go on with the next one
        print("Time out exception.")
        continue
    finally:
        # Always shut the browser down, including on the skip path and on
        # unexpected errors, so windows do not pile up across iterations.
        driver2.quit()

    # Reaching this point means the page was loaded and captured
    soup2 = BeautifulSoup(get_source2, 'html')

    # Links marked rel="ugc": the race website (when available) plus
    # "facebook", "twitter" and other info
    hrefList = [a['href'] for a in soup2.find_all('a', {'rel': "ugc"}, href=True)]

    # Remove the links that contain "facebook" or "twitter" or "@"
    unwantedTokens = ("facebook", "twitter", "@")
    hrefList = [x for x in hrefList if not any(token in x for token in unwantedTokens)]

    print(len(hrefList))
    print(hrefList)
    websitesList.append(hrefList)

    # Scrape the table with additional data: data labels (first) and content
    labelsList = re.findall(r'(?<=<div class="colinforace1">)(.*?)(?=</div>)', get_source2)
    contentList = re.findall(r'(?<=<div class="colinforace2 mbb">)(.*?)(?=</div>)', get_source2)

    # Only the first 13 label/content pairs are kept
    labels_content_list_of_tuples = list(zip(labelsList[:13], contentList[:13]))
    print(labels_content_list_of_tuples)
    tableList.append(labels_content_list_of_tuples)

    # Scrape the location from the first <p> of the page; store None when the
    # page has no <p>, so the lists keep the same length
    firstParagraph = soup2.find('p')
    location = firstParagraph.getText() if firstParagraph is not None else None
    print(location)
    locationList.append(location)

    scrapedLinks.append(raceLink)

    # Check scraped data is saved in a manner that can be directly matched
    print(len(websitesList))
    print(len(locationList))
    print(len(tableList))

print(websitesList)