# Scraping trail running races 2021

This notebook scraps the trail running races' data from https://itra.run/races. Environment preparation is managed by the following scripts:
- WindowsEnvironment.ps1
- MacOSEnvironment.sh

In [None]:
import logging
# Logging format
format = "%(asctime)s: %(message)s"
#logging.basicConfig(filename='example.log', format=format, level=logging.INFO,datefmt="%H:%M:%S")
logging.basicConfig(format=format, level=logging.INFO,datefmt="%H:%M:%S")
logging.info("Log started")

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import concurrent.futures
import time
import re
import json
import pandas as pd
import whois
import sys
import datetime
from tqdm import tqdm
from datetime import timedelta


# Logging format
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.DEBUG,
                    datefmt="%H:%M:%S")


Per tal de ser més eficients, crarem diferents instàncies d'exploradors que facin la mateixa feina però amb rangs de dates diferents. Primer definim una funció que permet partir un període de temps en subperiodes.

In [None]:
def splitTimePeriod(n, start, end):
    """
    splitTimePeriod gets an array of dates [n,2] with the period splatted in n subperiods

    :param n: number of subperiods
    :param start: Start date 
    :param end: End date 
    :return: returns a [n,2] matrix with the subperiods dates
    """     
    logging.info("splitTimePeriod %s / %s period in %s subperiods: starting", start, end, n)

    duration = end - start

    durationDays = duration.total_seconds() / 60 / 60 / 24
    logging.info("splitTimePeriod %s / %s period in %s subperiods: total duration in days: %s", start, end, n, durationDays)

    subperiodDuration = durationDays / n
    logging.info("splitTimePeriod %s / %s period in %s subperiods: subperiod duration in days: %s", start, end, n, subperiodDuration)

    result = []
    
    if subperiodDuration < 2:
        # Too small period return only one array
        result = [[start,end]]
    else:
        # Split the period
        init = start
        
        # Remove decimals
        periodDays = int(subperiodDuration)

        for x in range(n-1):
            result.append([init,init+timedelta(periodDays-1)])
            init = init+timedelta(periodDays)

        #Last subperiod
        result.append([init,end])
    
    return result    

Definim també una funció per a obtenir les dades que ens interessen amb la llibreria BeautifulSoup.

In [None]:
def getData(n, htmlSource):
    """
    getData gets the races from ITRA in the given period time

    :param htmlSource: Source HTML for the apge 
    :return: data
    """
    logging.info("Thread %s: Getting Data", n) 
    
    # Scraping race names with BeautifulSoup
    soup = BeautifulSoup(htmlSource, 'html')
    logging.info("Thread %s: html extracted", n) 
    
    # Find the race names
    racesList = re.findall(r'(?<=<h5 data-v-f3c4ac1c="" class="itra-green">)(.*?)(?=</h5>)', htmlSource)
    logging.info("Thread %s: Number of races %s",n,len(racesList))
    #print(racesList)

    # Find the mouse over link pointing to the race site
    links = [a['href'] for a in soup.find_all('a',"card ontop", href=True)]
    logging.info("Thread %s: Number of links %s",n,len(links))

    # Scraping the data for distance, elevation gain and loss
    myList = re.findall(r'(?<=<span class="icon-text-grey icon-bold">)(.*?)(?=</span>)', htmlSource)
    logging.info("Thread %s: Number of measures %s",n,len(myList))

    # Find the race distance
    distancesList = myList[0::3]
    
    # Find the race elevation gain
    gainList = myList[1::3]
    
    # Find the race elevation loss
    lossList = myList[2::3]
    
    # Find the race date
    datesList = re.findall(r'(?<=<span data-v-f3c4ac1c="" class="itra-grey" style="margin-top: 0.2rem; margin-left: 0.2rem; margin-right: 2rem; font-size: 80%;">)(.*?)(?=</span>)', htmlSource)
    #print(datesList)
    logging.info("Thread %s: Number of races %s",n,len(datesList))

    # Assign data to tuples: # get the list of tuples from two lists and merge them by using zip(). 
    list_of_tuples = list(zip(racesList, links, distancesList, gainList, lossList, datesList)) 
    # Converting lists of tuples into pandas Dataframe. 
    df = pd.DataFrame(list_of_tuples, columns = ['Name', 'Link', 'Distance', 'Gain', 'Loss', 'Date'])
    logging.info("Thread %s: Get Data terminated", n) 
    return df

Ara definim una nova funció que obtingui amb selenium el codi font de la plana web de curses. 

In [None]:
def getRaces(n, period):
    """
    getRaces gets the races from ITRA in the given period time

    :param n: Thread context execution
    :param period: Start and End date 
    :return: All races with high level data for the given period
    """ 
    logging.info("Thread %s for %s period: starting getRaces", n, period)
    
    time.sleep(2)
    
    # Using selenium, open firefox window with the ITRA website
    driver = webdriver.Firefox()
    driver.get("https://itra.run/races")

    # Getting current URL source code 
    get_title = driver.title 

    # Click dropdown menu for language selection
    driver.find_element_by_xpath("/html/body/div[1]/div[1]/nav/div[4]").click()

    # Select language EN
    driver.find_element_by_xpath("/html/body/div[1]/div[1]/nav/div[4]/div/div[1]").click()

    # Wait for page load after click
    time.sleep(5)

    # Select init date
    initYear=period[0].year
    initMonth=period[0].month
    initDay=period[0].day

    # Picker de la data d'inici
    dpkStartdate = driver.find_element_by_css_selector("div.vdp-datepicker.dp1")
    dpkStartdate.click()

    # Picker per mostrar anys
    spnMonthSelector = driver.find_element_by_css_selector(".dp1 span.day__month_btn.up")
    spnMonthSelector.click()

    # Picker per mostrar anys
    spnYearSelector = driver.find_element_by_css_selector(".dp1 span.month__year_btn.up")
    spnYearSelector.click()

    # Picker per seleccionar any
    divYears = driver.find_elements_by_css_selector(".dp1 span.cell.year")
    divYears[initYear-2020].click()

    # Picker per seleccionar mes
    divMonths = driver.find_elements_by_css_selector(".dp1 span.cell.month")
    divMonths[initMonth-1].click()

    # Picker per seleccionar dia
    divDays = driver.find_elements_by_css_selector(".dp1 span.cell.day")
    # Get the days from the previous month in the current mont first week
    divBlankDays = driver.find_elements_by_css_selector(".dp1 span.cell.day.blank")
    divDays[initDay+len(divBlankDays)-1].click()  

    # Select end date
    endYear=period[1].year
    endMonth=period[1].month
    endDay=period[1].day

    # Picker de la data d'inici
    dpkStartdate = driver.find_element_by_css_selector("div.vdp-datepicker.dp2")
    dpkStartdate.click()

    # Picker per mostrar anys
    spnMonthSelector = driver.find_element_by_css_selector(".dp2 span.day__month_btn.up")
    spnMonthSelector.click()

    # Picker per mostrar anys
    spnYearSelector = driver.find_element_by_css_selector(".dp2 span.month__year_btn.up")
    spnYearSelector.click()

    # Picker per seleccionar any
    divYears = driver.find_elements_by_css_selector(".dp2 span.cell.year")
    divYears[endYear-2020].click()

    # Picker per seleccionar mes
    divMonths = driver.find_elements_by_css_selector(".dp2 span.cell.month")
    divMonths[endMonth-1].click()

    # Picker per seleccionar dia
    divDays = driver.find_elements_by_css_selector(".dp2 span.cell.day")
    # Get the days from the previous month in the current mont first week
    divBlankDays = driver.find_elements_by_css_selector(".dp2 span.cell.day.blank")
    divDays[endDay+len(divBlankDays)-1].click()

    # Wait for page load
    time.sleep(5)  

    # Get the number of total races for the given subperiod
    logging.info("Thread %s: Retrieving number of races",n) 
    totalRacesText = re.findall(r'(?<=<h1 class="itra-green text-center">)(.*?)(?= races found</h1>)', driver.page_source)
    totalRaces = int(totalRacesText[0])
    logging.info("Thread %s: Number of races %s",n,totalRaces)    

    # Click on More Races to get the full list on the screen & Wait for Visibility of Races until we list all races
    maxIterations = int(totalRaces / 50)

    logging.info("Thread %s: We will use %s iterations for %s races",n,maxIterations,totalRaces)    

    if totalRaces % 50 > 0:
        maxIterations = maxIterations + 1

    i = 0
    try:
        logging.info("Thread %s for %s period: %s iteration", period, n, i)
        btnSeeMore = driver.find_element_by_css_selector('button.btn-itra-black[type="button"]')    

    except:
        btnSeeMore = None
        logging.info("Thread %s for %s period: No more races", period, n)

    while btnSeeMore is not None and i != maxIterations-1:
        i = i+1
        btnSeeMore.click()

        try:
            logging.info("Thread %s for %s period: %s iteration", period, n, i)
            btnSeeMore = driver.find_element_by_css_selector('button.btn-itra-black[type="button"]') 
            
        except:            
            btnSeeMore = None
            logging.info("Thread %s for %s period: No more races", period, n)
            break

        finally:
            time.sleep(5)
    
    # Obtnim les dades amb BaeatifulSoup 
    resultPeriod = getData(n, driver.page_source)
    logging.info("Thread %s for %s period: Data frame returned with %s records", period, n, len(resultPeriod))
    driver.quit()
    return resultPeriod

Definim una funció que obten les curses limitant el número de dies del periode a consultar. En cas que hi hagi més dies executa el procés de forma iterativa per evitar problemes de rendiment i estabilitat en execucions massa llargues

In [None]:
def getRacesSeq(n, period, maxDuration=30):
    """
    getRacesSeq gets the races from ITRA in the given period time. Opens a new browser session for each maxDuration days to avoid issues with the borwser when too much races needs to bre retrieved.

    :param n: Thread context execution
    :param period: Start and End date 
    :param maxDuration: Maximum days to query to the browser
    :return: All races with high level data for the given period
    """ 
    logging.info("Thread %s for %s period: starting getRacesSeq", n, period)

    # Get the number of subperiods
    duration = period[1] - period[0]
    durationDays = int(duration.total_seconds() / 60 / 60 / 24)
    
    logging.info("Thread %s for %s period: Period duration is %s days", n, period,durationDays)
    subperiodsSeq = splitTimePeriod(int(durationDays/maxDuration), period[0], period[1])
    logging.info("Thread %s for %s period: Periods %s", n, period,subperiodsSeq)
    
    i = 0
    result = []

    while i < len(subperiodsSeq):
        logging.info("Thread %s for %s period: Processing period", n, subperiodsSeq[i])
        resultAux = getRaces(n,subperiodsSeq[i])
        logging.info("Thread %s for %s period: Processed period with %s races", n, subperiodsSeq[i], len(resultAux))
        result.append(resultAux)
        i = i + 1
    result  = pd.concat(result)
    logging.info("Thread %s for %s period: terminated getRacesSeq with %s records", n, period, len(result))
    return result

Amb les funcions anteriors podem fer l'execució on inicialitzem les variables d'entrada.

In [None]:
# Period definition
startDate = datetime.datetime(2021, 1, 1, 0, 0, 0)
endDate = datetime.datetime(2021, 12, 31, 0, 0, 0)

# Number of threads
threads = 3

# Get periods
periods = splitTimePeriod(threads, startDate, endDate)

# Parallel execution
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
    resultPeriods = list(executor.map(getRacesSeq, range(threads),periods))

# Concatene all results
resultFullPeriod = pd.concat(resultPeriods)
logging.info("Races for period %s to %s is %s", startDate, endDate, len(resultFullPeriod))
print(resultFullPeriod)


In [60]:
# CAL DESAR LES DADES EN UN FORMAT ADIENT: JSON? CSV directament ja que l'enunciat de la PRACTICA requereix CSV

# Convert results to json and save the file
with open('result.json', 'w') as fp:
    json.dump(sample, fp)

NameError: name 'sample' is not defined

In [None]:





print(iList)
print(iiList)
print(iiList_of_tuples)

In [None]:
# Ara ataquem les dades de la pàgina específica de cada cursa


# List with race websites
websitesList = []

# List with race place and country
locationList = []

# List containing the table for each website (as a list)
tableList = []


#slicedLinks = links[:5]

for i in tqdm(links):
    print(i)
    # Using selenium, open firefox window with the ITRA website
    driver2 = webdriver.Firefox()
    #driver2.get("https://itra.run/race/13893")
    driver2.get(i)
    
    
    try:
        time.sleep(3)   

        # Click dropdown menu for language selection
        driver2.find_element_by_xpath("/html/body/div[1]/div[1]/nav/div[4]").click()

        # Select language EN
        driver2.find_element_by_xpath("/html/body/div[1]/div[1]/nav/div[4]/div/div[1]").click()

        time.sleep(15)
        
        # if link is broken, go back
    except TimeoutException:
        driver2.back()
        print("Time out exception.")
        # continue so we can return to beginning of loop
        continue

    # if you reach this point, the link is valid, and you can 'do stuff' on the page
    
    # Getting current URL source code 
    get_source2 = driver2.page_source
    time.sleep(2)   
    driver2.close()

    # Scraping race names with BeautifulSoup
    soup2 = BeautifulSoup(get_source2, 'html')

    # List containing the race website (when available) and "facebook", "twitter" and other info
    hrefList = [a['href'] for a in soup2.find_all('a', {'rel': "ugc"}, href=True)]

    # Remove the links that contain "facebook" or "twitter" or "@"
    hrefList[:] = [x for x in hrefList if "facebook" not in x]
    hrefList[:] = [x for x in hrefList if "twitter" not in x]
    hrefList[:] = [x for x in hrefList if "@" not in x]

    print(len(hrefList))
    print(hrefList) 
    websitesList.append(hrefList)
    
    # Srape the table with additional data: data labels (first) and content
    labelsList  = re.findall(r'(?<=<div class="colinforace1">)(.*?)(?=</div>)', get_source2)
    contentList = re.findall(r'(?<=<div class="colinforace2 mbb">)(.*?)(?=</div>)', get_source2)

    labels_content_list_of_tuples = list(zip(labelsList[:13], contentList[:13])) 
    print(labels_content_list_of_tuples)
    tableList.append(labels_content_list_of_tuples)
    
    # Scrape the location
    location = soup2.find('p').getText()
    print(location)
    locationList.append(location)
    
    # Check scraped data is saved in a manner that can be directly matched
    print(len(websitesList))
    print(len(locationList))
    print(len(tableList))
    
print(websitesList)