In [29]:
import requests
import html5lib
import pandas as pd
from bs4 import BeautifulSoup
import os
import json
import numpy as np
import datetime

In [2]:
# Root of the site; the links to the unstyled pages holding the data tables are built on top of it
base_url = "https://www.letour.fr"
# Classification codes as they appear in the DOM, mapped to a readable name
tab_dict = dict(ite='Stage', itg='General Classification')
# The same codes as a plain list, for easy iterating.
short_list = list(tab_dict)

In [4]:
# Stage numbers to scrape. Only stage 1 while testing; set last_stage to 21 to get all 21 stages.
first_stage, last_stage = 1, 1
stages = np.arange(first_stage, last_stage + 1)

In [8]:
# This cell scrapes the rankings page of every stage in `stages` and extracts the tab links whose
# `data-ajax-stack` attribute carries the raw URLs coded with the classification code.
# Scraping with requests and Beautiful Soup for the correct div.
#   links_list - one entry per successfully scraped stage: the matching <a> tags of that stage
#   all_links  - the tags found for the most recently scraped stage (read by the parsing cell below)
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can wait forever on a stalled server
links_list = []
all_links = []
for item in stages:
    # The URL depends on the stage number, so it has to be built inside the loop. Building it
    # before the loop raised NameError on a fresh kernel and would have requested the same page
    # for every stage.
    start_url = f"{base_url}/en/rankings/stage-{item}" #URL to stage
    print(start_url)

    try:
        page = requests.get(start_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # Network-level failure (DNS, connection, timeout): report it and move on to the next stage.
        print(f"Could not fetch {start_url}. {e}")
        continue
    if page.status_code != 200:
        # Report the failed stage instead of skipping it silently.
        print(f"That does not appear to be a valid results URL. HTTP status {page.status_code}")
        continue

    content = page.content
    soup = BeautifulSoup(content, "html5lib")

    # Pull out a specific block of code with two sets of coded URLs from the soup.
    # find_all returns an empty result instead of raising when nothing matches, so an
    # explicit emptiness check replaces the old except clause (which named an undefined exception).
    all_links = soup.find_all(class_="tabs__link js-tabs-ranking")
    if not all_links:
        print(f"That does not appear to be a valid results URL. No ranking tabs found at {start_url}")
        continue
    links_list.append(all_links)

https://www.letour.fr/en/rankings/stage-1


In [10]:
# Take a look at that ugliness: links_list holds, per scraped stage, the <a> tab tags whose
# data-ajax-stack attribute carries the coded URLs.
# We can extract just the stage results (ite) or overall ranking (itg)
links_list

[[<a class="tabs__link js-tabs-ranking" data-ajax-stack='{"itg":"\/en\/ajax\/ranking\/1\/itg\/bd845dfbbdf0630f11da790f0cad3096\/none","ipg":"\/en\/ajax\/ranking\/1\/ipg\/2e50858f0d77639053cc601af0b6cbdb\/none","etg":"\/en\/ajax\/ranking\/1\/etg\/f1b2d0eb8693db6e4027ff95deca603d\/none","img":"\/en\/ajax\/ranking\/1\/img\/d265b181815ffb4a503d6e0ed9506005\/none","ijg":"\/en\/ajax\/ranking\/1\/ijg\/ca5d1eea68fc753c67ebe11a618c2580\/none","icg":"\/en\/ajax\/ranking\/1\/icg\/5a819d202911cc09d579b429a8e2c43e\/none"}' data-type="g" data-xtclick="ranking::tab::overall" href="it">General classification</a>,
  <a class="tabs__link js-tabs-ranking" data-ajax-stack='{"ite":"\/en\/ajax\/ranking\/1\/ite\/72d3750b88cc75c1e386e3a8f55a9d96\/none","ipe":"\/en\/ajax\/ranking\/1\/ipe\/146f21c83187983073965fa20531ed94\/none","ete":"\/en\/ajax\/ranking\/1\/ete\/eac6423eb988564ee16aa681402f83e2\/none","ime":"\/en\/ajax\/ranking\/1\/ime\/e2baaa6161138846207204bd4e4f9146\/none","ije":"\/en\/ajax\/ranking\/1\/ij

In [13]:
# Parsing out the list of json-ish links from the DOM into a dictionary of functional URLs
# url_dict maps each classification code (e.g. 'ite', 'itg') to an absolute URL; myurl ends up
# holding the stage-results ('ite') URL that the next cell loads.

url_dict = {}

# NOTE: all_links only holds the tabs of the last stage scraped above.
for item in all_links:
    # data-ajax-stack is a JSON object mapping classification code -> site-relative path.
    # json.loads already decodes the escaped slashes ("\/"), so no manual clean-up is needed
    # (the old .replace('\/', '/') also relied on an invalid string escape sequence).
    myurls = json.loads(item['data-ajax-stack'])
    for key, value in myurls.items():
        url_dict[key] = f"{base_url}{value}"
# TODO loop through each stage and get the results. For now just look at a single stage, not any more or other results
# Look the stage-results code up directly and fail loudly when it is missing; previously a missing
# 'ite' left myurl pointing at a raw JSON string (or undefined) for the read_html cell.
if 'ite' not in url_dict:
    raise KeyError("No 'ite' (stage results) URL was found in the scraped ranking tabs")
myurl = url_dict['ite'] #This would change to adding the list of URLs to a list or dictionary later.
print(myurl, 'ite')

https://www.letour.fr/en/ajax/ranking/1/ite/72d3750b88cc75c1e386e3a8f55a9d96/none ite


In [24]:
# Go to the link parsed out of the DOM and it is a plain URL page with a single, nested table
# table_list = [] #Not used here, we will have to append the data to a list or do a direct load to SQL database from here
# pd.read_html returns a list of DataFrames, one per <table> found on the page.
try:
    table = pd.read_html(myurl)
except ValueError as e:
    # read_html signals "no tables found" with ValueError, not KeyError. Re-raise with the URL
    # for context rather than swallowing it, which would leave df undefined or stale.
    raise ValueError(f"No results table could be parsed from {myurl}") from e
df = table[0] #Change this when we start iterating


In [26]:
# Preview the first rows of the results table read above
df.head()

Unnamed: 0,Rank,Rider,Rider No.,Team,Times,Gap,B,P
0,1,FERNANDO GAVIRIA RENDON,103,QUICK - STEP FLOORS,04h 23' 32'',-,B : 10'',-
1,2,PETER SAGAN,111,BORA - HANSGROHE,04h 23' 32'',-,B : 6'',-
2,3,MARCEL KITTEL,144,TEAM KATUSHA ALPECIN,04h 23' 32'',-,B : 4'',-
3,4,ALEXANDER KRISTOFF,95,UAE TEAM EMIRATES,04h 23' 32'',-,-,-
4,5,CHRISTOPHE LAPORTE,201,"COFIDIS, SOLUTIONS CREDITS",04h 23' 32'',-,-,-


In [28]:
# Reformat the site's time notation (e.g. 04h 23' 32'') into HH:MM:SS with pandas string methods,
# stored in a new 'Result' column. The three replacements run in this fixed order.
raw_times = df['Times']
hours_separated = raw_times.str.replace('h ', ':')          # "04h 23' 32''" -> "04:23' 32''"
seconds_mark_dropped = hours_separated.str.replace("''", '')  # -> "04:23' 32"
df['Result'] = seconds_mark_dropped.str.replace("' ", ':')    # -> "04:23:32"
df.head()

Unnamed: 0,Rank,Rider,Rider No.,Team,Times,Gap,B,P,Result
0,1,FERNANDO GAVIRIA RENDON,103,QUICK - STEP FLOORS,04h 23' 32'',-,B : 10'',-,04:23:32
1,2,PETER SAGAN,111,BORA - HANSGROHE,04h 23' 32'',-,B : 6'',-,04:23:32
2,3,MARCEL KITTEL,144,TEAM KATUSHA ALPECIN,04h 23' 32'',-,B : 4'',-,04:23:32
3,4,ALEXANDER KRISTOFF,95,UAE TEAM EMIRATES,04h 23' 32'',-,-,-,04:23:32
4,5,CHRISTOPHE LAPORTE,201,"COFIDIS, SOLUTIONS CREDITS",04h 23' 32'',-,-,-,04:23:32


In [None]:
# The Result column is still a string, so we could use datetime.timedelta to convert it to seconds for easy
# calculations later on. Here's an example I found on Stack Overflow:
# https://stackoverflow.com/questions/6402812/how-to-convert-an-hmmss-time-string-to-seconds-in-python
# t = '10:15:30'
# h,m,s = t.split(':')
# print(int(datetime.timedelta(hours=int(h),minutes=int(m),seconds=int(s)).total_seconds()))

In [None]:
# From the pandas documentation, here are the options for DataFrame.to_json:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html
# df.to_json(path_or_buf=None, orient=None, date_format=None, double_precision=10, 
# force_ascii=True, date_unit='ms', default_handler=None, lines=False, compression='infer', index=True)

In [None]:
# We could output the data to CSV or json here
#     tables = pd.read_html(url)
#     with open(f'output.csv','a+') as output:
#         for i in tables:
#             i.to_csv(output, header=False, sep = "\t", index_label=False)