In [23]:
import requests
import html5lib
import pandas as pd
from bs4 import BeautifulSoup
import os
import json
import numpy as np
import datetime
from config import user, pwd

In [24]:
# SQL Alchemy
from sqlalchemy import create_engine

# PyMySQL 
import pymysql
pymysql.install_as_MySQLdb()

In [6]:
# Root of the Tour de France site. scrape_pages() appends the rankings path and
# the site-relative result links found in the DOM to this value.
base_url = "https://www.letour.fr"
# The DOM keys each result link with a classification code:
# 'ite': 'Stage'
# 'itg': 'General Classification'


In [32]:
# # stages = np.arange(1,2) #We can change the range to 1,22 to get all 21 stages. Testing 1 for now.
# stages = [6]

In [47]:
# This cell defines scrape_pages, which scrapes the rankings page for a given stage and extracts the raw result URLs keyed by classification code.
# Scraping is done with requests and Beautiful Soup, which locates the ranking tab links.

def scrape_pages(stage_id):
    """Fetch the rankings page for one stage and load its result tables.

    Downloads ``{base_url}/en/rankings/stage-{stage_id}``, reads the result
    URLs stored in the ``data-ajax-stack`` attribute of the ranking tab links,
    and calls ``get_results`` for the stage ('ite') and general classification
    ('itg') links. Links with any other key are ignored.

    Parameters
    ----------
    stage_id : int
        Stage number used to build the rankings URL; also passed on to
        ``get_results``.

    Returns
    -------
    None
        Results are handed to ``get_results``; nothing is returned.
    """
    start_url = f"{base_url}/en/rankings/stage-{stage_id}"  # URL to stage
    print(f"Getting result links for {start_url}")

    page = requests.get(start_url)

    # Bail out early on a failed request instead of continuing with no links.
    if page.status_code != 200:
        print(f"That does not appear to be a valid results URL. Status code: {page.status_code}")
        return

    soup = BeautifulSoup(page.content, "html5lib")

    # Pull out the tab links that carry the coded result URLs.
    # find_all returns an empty result when nothing matches; it does not raise.
    all_links = soup.find_all(class_="tabs__link js-tabs-ranking")
    if not all_links:
        print("That does not appear to be a valid results URL. No ranking links found.")
        return

    # Parse the JSON stored on each link into a dictionary of functional URLs.
    url_dict = {}
    for item in all_links:
        # Undo the escaped slashes ("\/") before decoding the JSON payload.
        myurl = item['data-ajax-stack'].replace('\\/', '/')
        for key, value in json.loads(myurl).items():
            url_dict[key] = f"{base_url}{value}"

    # Classification code -> race_result_type_id expected by get_results.
    result_type_ids = {'ite': 1, 'itg': 2}

    for key, value in url_dict.items():
        if key not in result_type_ids:
            continue
        try:
            print(f"Getting results from {value}")
            get_results(value, result_type_ids[key], stage_id)
        except ValueError:
            # Best effort: report the failing link and carry on with the next one.
            print(f"Error getting results from {value}")

In [48]:
# Each link parsed out of the DOM points to a plain page containing a single, nested table
# table_list = [] #Not used here, we will have to append the data to a list or do a direct load to SQL database from here

def get_results(myurl, race_result_type_id, stage_id):
    """Scrape one results table and append it to the ``race_results`` table.

    Reads the first table found at ``myurl``, adds the foreign-key columns,
    converts the bonus/penalty and time columns, drops the columns that are
    not stored, renames the remaining ones and appends the rows to
    ``race_results`` in the ``letour_db`` MySQL database.

    Parameters
    ----------
    myurl : str
        Location of the results page; passed to ``pd.read_html``.
    race_result_type_id : int
        Value written to the ``race_result_type_id`` column of every row.
    stage_id : int
        Value written to the ``stage_id`` column of every row.

    Raises
    ------
    ValueError
        Not caught here, so a ``ValueError`` from reading the table or from
        parsing a time reaches the caller (``scrape_pages`` handles it).
    """
    # Imported locally: the notebook's import cell only brings in create_engine.
    from sqlalchemy.exc import InternalError, OperationalError

    try:
        table = pd.read_html(myurl)
        df = table[0]
        print(f"Table scraped for stage {stage_id}")
    except (KeyError, IndexError):
        # Without a table there is nothing to transform or store.
        print(f"Error with stage {stage_id}")
        return

    # Put foreign keys into the dataframe before the insert into MySQL.
    df["stage_id"] = stage_id
    df["race_result_type_id"] = race_result_type_id

    # Reformat the times into colon-separated text.
    df['Result'] = df['Times'].str.replace('h ', ':').str.replace("''", '').str.replace("' ", ':')

    # Calculate bonus/penalty; a penalty is stored as negative seconds.
    for index, row in df.iterrows():
        bonus_cell = row['B']
        if isinstance(bonus_cell, str) and 'B' in bonus_cell:
            df.loc[index, 'rider_bonus'] = bonus_cell.split(' : ')[1].replace("''", '')
            continue
        penalty_cell = row['P']
        # Non-string cells (e.g. NaN for an empty cell) carry no penalty.
        if isinstance(penalty_cell, str) and 'P' in penalty_cell:
            parts = penalty_cell.replace('P : ', '').replace("'", '').split(' ')
            penalty_seconds = int(parts[0]) * 60 + int(parts[1])
            df.loc[index, 'rider_bonus'] = int(penalty_seconds * -1)

    # Calculate time in seconds.
    for index, row in df.iterrows():
        result = row['Result']
        if isinstance(result, str) and ':' in result:
            h, m, s = result.split(':')
            elapsed = datetime.timedelta(hours=int(h), minutes=int(m), seconds=int(s))
            df.loc[index, 'rider_time'] = int(elapsed.total_seconds())

    # Remove extra columns.
    df = df.drop(['Rider', 'Team', 'Gap', 'B', 'P', 'Times', 'Result'], axis=1)
    # Rename columns to match the database table.
    df = df.rename({'Rank': 'ranking', 'Rider No.': 'rider_id'}, axis='columns')

    try:
        engine = create_engine(f"mysql://{user}:{pwd}@localhost/letour_db")
        df.to_sql(name='race_results', con=engine, if_exists='append', index=False)
    except (InternalError, OperationalError) as e:
        # Which of the two is raised for a missing database depends on the driver version.
        print(f"Could not find database. {e}")
    

In [49]:
# Stage numbers 1 through 21 (np.arange excludes the upper bound, 22).
stages = np.arange(1,22)
# Each call scrapes one stage; get_results() appends the scraped rows to the database.
for i in stages:
    scrape_pages(i)

Getting result links for https://www.letour.fr/en/rankings/stage-1
Getting results from https://www.letour.fr/en/ajax/ranking/1/itg/bd845dfbbdf0630f11da790f0cad3096/none
Table scraped for stage 1
Getting results from https://www.letour.fr/en/ajax/ranking/1/ite/72d3750b88cc75c1e386e3a8f55a9d96/none
Table scraped for stage 1
Getting result links for https://www.letour.fr/en/rankings/stage-2
Getting results from https://www.letour.fr/en/ajax/ranking/2/itg/fca07fb6b35baa3d6dfe2d4500ea91ed/none
Table scraped for stage 2
Getting results from https://www.letour.fr/en/ajax/ranking/2/ite/6f21d73eddddbb74e35c0113a43935ca/none
Table scraped for stage 2
Getting result links for https://www.letour.fr/en/rankings/stage-3
Getting results from https://www.letour.fr/en/ajax/ranking/3/itg/8cc975d0be03e5d50e78530e0a4b6f7d/none
Table scraped for stage 3
Getting results from https://www.letour.fr/en/ajax/ranking/3/ite/77864547454b8433e7ab54f08c7c64d0/none
Error getting results from https://www.letour.fr/en

In [32]:
# Stage results (race_result_type_id=1) with a recorded speed, joined to the
# rider and stage tables and ordered by stage, then by ranking.
select = (
    "select r.stage_id, r.rider_speed, r.ranking, rs.rider_name, s.stage_type, s.stage_distance "
    "from race_results r, race_starters rs, race_stages s "
    "WHERE r.stage_id=s.stage_id AND r.rider_id=rs.rider_id "
    "and r.race_result_type_id=1 and rider_speed IS NOT NULL "
    "ORDER BY r.stage_id ASC, r.ranking ASC;"
)
engine = create_engine(f"mysql://{user}:{pwd}@localhost/letour_db")

# Raw result of executing the query directly on the engine.
data = engine.execute(select)

# The same query loaded into a DataFrame; show the first rows.
df = pd.read_sql_query(select, engine)
df.head()


Unnamed: 0,stage_id,rider_speed,ranking,rider_name,stage_type,stage_distance
0,1,45.7627,1,Fernando Gaviria,Flat stage,201
1,1,45.7627,2,Peter Sagan,Flat stage,201
2,1,45.7627,3,Marcel Kittel,Flat stage,201
3,1,45.7627,4,Alexander Kristoff,Flat stage,201
4,1,45.7627,5,Christophe Laporte,Flat stage,201


In [35]:
# Column-oriented payload built directly from the query DataFrame.
# Every value is a whole-column list, so iterating over the result rows adds
# nothing; building it here also keeps the cell safe to re-run.
data_output = {
    "stage_id": df["stage_id"].tolist(),
    "rider_speed": df["rider_speed"].tolist(),
    "rider_rank": df["ranking"].tolist(),
    "stage_type": df["stage_type"].tolist(),
    "stage_length": df["stage_distance"].tolist(),
}
    

    


In [36]:
# Display the assembled dictionary (each list holds one entry per result row, so the output is long).
data_output

{'stage_id': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2

In [None]:
# Distinct stage ids present in the query result, as a plain Python list.
df.stage_id.unique().tolist()

In [10]:
def add_stage_3(filename):
    """Load stage 3 results from a CSV file and append them to ``race_results``.

    Every row is tagged with ``race_result_type_id`` 1 and ``stage_id`` 3.
    ``rider_time`` values written as ``minutes:seconds`` (optionally followed
    by a ``.`` and a fractional part, which is discarded) are converted to
    whole seconds before the rows are appended to the ``letour_db`` database.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read; it must contain a ``rider_time`` column.
    """
    stage_3 = pd.read_csv(filename)

    stage_3['race_result_type_id'] = 1
    stage_3['stage_id'] = 3

    for index, row in stage_3.iterrows():
        raw_time = row['rider_time']
        # Skip non-string cells (e.g. NaN) instead of failing on the `in` test.
        if isinstance(raw_time, str) and ':' in raw_time:
            m, s = raw_time.split('.')[0].split(':')
            elapsed = datetime.timedelta(minutes=int(m), seconds=int(s))
            stage_3.loc[index, 'rider_time'] = int(elapsed.total_seconds())

    # Same database as the other cells in this notebook (letour_db).
    engine = create_engine(f"mysql://{user}:{pwd}@localhost/letour_db")
    stage_3.to_sql(name='race_results', con=engine, if_exists='append', index=False)


In [11]:
# Append the stage 3 results stored in the CSV file (the path is relative to the working directory).
add_stage_3("../sql/stage-3.csv")

FileNotFoundError: File b'../sql/stage-3.csv' does not exist