In [1]:
import pprint as pp
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import simplejson as json
import time
import random
import os
import glob
import re
import unidecode
import datetime as dt
from collections import Counter
import sys
import pickle
import codecs
import urllib

In [141]:
#Generating the dataframe, even before we have the data, so we see what kind of data will we need
df = pd.DataFrame(columns=['rider_name','nation','season','team','race','racetype', 'racedays'])

In [3]:
#These are the races that I will use to collect the data
race_url_list = ["tour-de-france"]

In [4]:
#Here I define how to get the startlist of a race
user_agent = {'User-agent': 'Mozilla/5.0'}

def get_rider_urls(race_url,start_year,years_back):
    base_url = 'https://www.procyclingstats.com/race/' + str(race_url) + '/'
    print(str(race_url).upper(),'starting...')
    
    for year in range(int(start_year),int(start_year-(years_back+1)),-1):
        url = base_url + str(year) + '/startlist'
        response = requests.get(url, headers=user_agent)

        if response.status_code == 200:
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            startlist = soup.find_all('a', class_='blue')
            urls = [x['href'] for x in startlist]
            urls = ['https://www.procyclingstats.com/' + u for u in urls]
            file = str(race_url) + '_' + str(year) + '.json'
            with open(file, 'w') as f:
                json.dump(urls, f)
            timer = 2 + 2 * random.random()
            print(year,'done, sleeping for',np.round(timer,2),'sec')
            time.sleep(timer)
        else:
            print('unsuccessful request!')
            print('status code:',response.status_code)
            break
    print('finished!\n')

In [5]:
#I collect all the startlist 
#from 2019 to 5 years back until 2014
for race_url in race_url_list:
    get_rider_urls(race_url,2019,5)

TOUR-DE-FRANCE starting...
2019 done, sleeping for 3.73 sec
2018 done, sleeping for 3.16 sec
2017 done, sleeping for 3.08 sec
2016 done, sleeping for 2.75 sec
2015 done, sleeping for 3.68 sec
2014 done, sleeping for 2.43 sec
finished!

GIRO-D-ITALIA starting...
2019 done, sleeping for 3.22 sec
2018 done, sleeping for 2.32 sec
2017 done, sleeping for 3.57 sec
2016 done, sleeping for 2.15 sec
2015 done, sleeping for 2.38 sec
2014 done, sleeping for 2.84 sec
finished!

VUELTA-A-ESPANA starting...
2019 done, sleeping for 2.92 sec
2018 done, sleeping for 2.39 sec
2017 done, sleeping for 3.39 sec
2016 done, sleeping for 3.76 sec
2015 done, sleeping for 2.16 sec
2014 done, sleeping for 3.04 sec
finished!

STRADE-BIANCHI starting...
2019 done, sleeping for 2.99 sec
2018 done, sleeping for 3.86 sec
2017 done, sleeping for 2.17 sec
2016 done, sleeping for 3.25 sec
2015 done, sleeping for 3.79 sec
2014 done, sleeping for 2.02 sec
finished!

PARIS-NICE starting...
2019 done, sleeping for 3.8 sec
2

KeyboardInterrupt: 

In [6]:
#I check how many unique riders have participated on those races
unique_riders = []
filenames = []

for race_url in race_url_list:
    for i in range (2014,2020):
        filename = race_url +'_' +  str(i) + '.json'
        filenames.append(filename)

for filename in filenames:
    with open(filename) as f:
        riders = json.load(f)
        unique_riders.extend(riders)

print('number of unique rider urls:',len(set(unique_riders)))
unique_riders = list(set(unique_riders))

number of unique rider urls: 1444


In [7]:
#I download each rider's each season's html from the 1444 sample pool 
#This takes about 1 full day with 100% capacity with 8GB RAM

## user_agent = {'User-agent': 'Mozilla/5.0'}
print(dt.datetime.now(),'Downloading web pages...')
start = 846
end = 1444
all_filenames = []

for rdx, first_url in enumerate(unique_riders[start:end]):
    # save first season and get list of remaining ones
    response = requests.get(first_url, headers=user_agent)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    seasons = soup.find_all('a', class_='seasonResults')
    u = 0
    for i in range (len(seasons)):
        seasons[i] = str(seasons[i])[38:42]
    seasons = list(dict.fromkeys(seasons))
    save_name = first_url.split('/')[4] + '_' + seasons[u] + '.html'
    u+=1
    all_filenames.append(save_name)
    rdx_str = 'R[' + str(rdx+1) + ']'
    sdx_str = 'S[1]'
    
    with open(os.path.join('riders/',save_name), 'w', encoding = 'utf-8') as file:
        file.write(str(soup))
    print(dt.datetime.now(),rdx_str,sdx_str,'Saved:',save_name,end='\r')
    
    random_wait = 2 + 2 * random.random()
    time.sleep(random_wait)
    
    
    # save the remaining seasons
    if len(seasons) > 1:
        remaining_seasons = seasons[1:len(seasons)]
        for sdx, year in enumerate(remaining_seasons):
            next_url = first_url + '/' + year
            response = requests.get(next_url, headers=user_agent)
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            save_name = first_url.split('/')[4] + '_' + seasons[u]+ '.html'
            u+=1
            sdx_str = 'S[' + str(sdx+2) + ']'
            
            with open(os.path.join('riders/',save_name), 'w', encoding = 'utf-8') as file:
                file.write(str(soup))
            print(dt.datetime.now(),rdx_str,sdx_str,'Saved:',save_name,end='\r')
            all_filenames.append(save_name)
            
            random_wait = 2 + 2 * random.random()
            time.sleep(random_wait)
    else:
        continue2

print(dt.datetime.now(),'Done')

2020-12-10 15:33:03.832669 Downloading web pages...
2020-12-10 15:33:08.777393 R[1] S[2] Saved: kevin-rivera_2019.html

KeyboardInterrupt: 

In [142]:
#In this code I iterate over all the html files
#Each html file contains data for 1 year of racing for a rider


for filename in os.listdir(os.getcwd()+'/riders'):
    soup = BeautifulSoup(open(os.getcwd()+'/riders/'+filename, encoding="utf8"), 'html.parser')
    
    #Here I generate a lowercased_name that will be stored in the dataframe
    rider_name = str(soup.title)
    rider_name = rider_name[7:-8]
    rider_name = rider_name.lower()
    rider_name = rider_name.replace(" ", "_")
    
    nation = soup.find_all('span', class_='flag')
    nation = str(nation[0])[22:24]
    
    #This collects the data of which year of racing the html file collects f.e. 2004
    season = filename[-9:-5]
    
    print(season)
    print(rider_name)
    
    #Here I collect al the links which contain any link
    #All races have a link at Procyclingstats
    links = soup.find_all('a', href=True)
    links
    urls = [x['href'] for x in links]
    
    results = []
    races = []
    racetype = []
    racedays = []
    team = 0
    
    for i in range (len(urls)):
        if 'team' in urls[i] and str(season) in urls[i]:
            team = urls[i][5:]
            season = int(season)
    
    #Every race has a "Filter" tag after the race's link in the html
    for i in range(len(urls)):
        if urls[i][-6:] == "Filter":
            results.append(urls[i-1]) #So I chose the (i-1)th element from the links

    #Now we have all the races, let's put them into the dataframe
    for i in range(len(results)):
        #One race type is gc
        if results[i][-2:] == 'gc':
            races.append(results[i][5:-8])
            racetype.append('gc')
            if i+1 < len(results):
                racedays.append(results[i+1][-1:])
            else:
                racedays.append(0)
        #The other race type is 1 day race, which has an ending link of "result" instead gc 
        #Logically, last 2 words will be --> 'lt' 
        if results[i][-2:] == 'lt':
            races.append(results[i][5:-12])
            racetype.append('lt')
            racedays.append(1)
    #I put the new row into the dataframe
    #elements: name, season, race, racetype, racedays
    for i in range (len(races)):
        df.loc[len(df.index)] = [rider_name, nation, season, team , races[i], racetype[i], racedays[i]]

2009
aaron_gate
2010
aaron_gate
2011
aaron_gate
2012
aaron_gate
2013
aaron_gate
2014
aaron_gate
2015
aaron_gate
2016
aaron_gate
2017
aaron_gate
2018
aaron_gate
2019
aaron_gate
2020
aaron_gate
2014
aaron_verwilst
2015
aaron_verwilst
2016
aaron_verwilst
2017
aaron_verwilst
2018
aaron_verwilst
2019
aaron_verwilst
2020
aaron_verwilst
2011
adam_de_vos
2012
adam_de_vos
2013
adam_de_vos
2014
adam_de_vos
2015
adam_de_vos
2016
adam_de_vos
2017
adam_de_vos
2018
adam_de_vos
2019
adam_de_vos
2020
adam_de_vos
2009
adam_yates
2010
adam_yates
2011
adam_yates
2012
adam_yates
2013
adam_yates
2014
adam_yates
2015
adam_yates
2016
adam_yates
2017
adam_yates
2018
adam_yates
2019
adam_yates
2020
adam_yates
2005
adriano_malori
2006
adriano_malori
2007
adriano_malori
2008
adriano_malori
2009
adriano_malori
2010
adriano_malori
2011
adriano_malori
2012
adriano_malori
2013
adriano_malori
2014
adriano_malori
2015
adriano_malori
2016
adriano_malori
2017
adriano_malori
2013
adrien_garel
2014
adrien_garel
2015
adrie

KeyboardInterrupt: 

In [143]:
# Get names of indexes for which there is no team data
indexNames = df[ df['team'] == 0 ].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)

df.tail(30)

Unnamed: 0,rider_name,nation,season,team,race,racetype,racedays
2244,alejandro_valverde,es,2019,movistar-team-2019,uae-tour,gc,7
2245,alejandro_valverde,es,2019,movistar-team-2019,vuelta-ciclista-a-la-region-de-murcia,gc,2
2246,alejandro_valverde,es,2019,movistar-team-2019,vuelta-a-la-comunidad-valenciana,gc,5
2247,alejandro_valverde,es,2019,movistar-team-2019,deia-trophy,lt,1
2248,alejandro_valverde,es,2019,movistar-team-2019,trofeo-andratx-mirador-d-es-colomer,lt,1
2249,alejandro_valverde,es,2019,movistar-team-2019,trofeo-cala-millor,lt,1
2250,alejandro_valverde,es,2020,movistar-team-2020,vuelta-a-espana,gc,8
2251,alejandro_valverde,es,2020,movistar-team-2020,world-championship,lt,1
2252,alejandro_valverde,es,2020,movistar-team-2020,tour-de-france,gc,1
2253,alejandro_valverde,es,2020,movistar-team-2020,dauphine,gc,5


In [144]:
#Let's save the data
df.to_csv('networktest.csv') 

In [135]:
#Let's make a new network from the given data
df = pd.read_csv("networktest.csv")
df = df.drop(['Unnamed: 0'], 1)
# Get names of indexes for which race wasn't in 2017
season_check = df[ df['season'] != 2017 ].index
# Delete these row indexes from dataFrame
df.drop(season_check , inplace=True)

df

Unnamed: 0,rider_name,nation,season,team,race,racetype,racedays
0,aaron_gate,nz,2017,aqua-blue-sport-2017,circuit-franco-belge,lt,1
1,aaron_gate,nz,2017,aqua-blue-sport-2017,omloop-eurometropool,lt,1
2,aaron_gate,nz,2017,aqua-blue-sport-2017,vuelta-a-espana,gc,1
3,aaron_gate,nz,2017,aqua-blue-sport-2017,vuelta-a-burgos,gc,5
4,aaron_gate,nz,2017,aqua-blue-sport-2017,tour-de-wallonie,gc,5
5,aaron_gate,nz,2017,aqua-blue-sport-2017,gp-cerami,lt,1
6,aaron_gate,nz,2017,aqua-blue-sport-2017,tour-de-suisse,gc,9
7,aaron_gate,nz,2017,aqua-blue-sport-2017,tour-of-belgium,gc,5
8,aaron_gate,nz,2017,aqua-blue-sport-2017,4-jours-de-dunkerque,gc,6
9,aaron_gate,nz,2017,aqua-blue-sport-2017,Eschborn-Frankfurt,lt,1


In [136]:
#If there is a hickup in the data, I do manually clear it
df.loc[df['racedays'] =='a', 'racedays'] = 1
df.loc[df['racedays'] =='e', 'racedays'] = 1
df.loc[df['racedays'] =='d', 'racedays'] = 1
df.loc[df['racedays'] =='e', 'racedays'] = 1
df.loc[df['racedays'] =='s', 'racedays'] = 1
df.loc[df['racedays'] =='b', 'racedays'] = 1
df.loc[df['racedays'] =='h', 'racedays'] = 1
df.loc[df['racedays'] =='c', 'racedays'] = 1
df.loc[df['racedays'] =='t', 'racedays'] = 1
df.loc[df['racedays'] =='B', 'racedays'] = 1
df.loc[df['racedays'] =='m', 'racedays'] = 1
df.loc[df['racedays'] =='A', 'racedays'] = 1
df['racedays'] = df['racedays'].apply(pd.to_numeric)

In [137]:
#I create a network for weighted data
#If a race consist of 6 racedays, the rider's name and the data will pop up 6 times
weight_df = pd.DataFrame(np.repeat(df.values,df.racedays,axis=0))
weight_df.columns = df.columns

In [138]:
weight_df.columns

Index(['rider_name', 'nation', 'season', 'team', 'race', 'racetype',
       'racedays'],
      dtype='object')

In [139]:
#Let's see: did it work? 
weight_df = weight_df.drop('racetype', axis=1)

weight_df

Unnamed: 0,rider_name,nation,season,team,race,racedays
0,aaron_gate,nz,2017,aqua-blue-sport-2017,circuit-franco-belge,1
1,aaron_gate,nz,2017,aqua-blue-sport-2017,omloop-eurometropool,1
2,aaron_gate,nz,2017,aqua-blue-sport-2017,vuelta-a-espana,1
3,aaron_gate,nz,2017,aqua-blue-sport-2017,vuelta-a-burgos,5
4,aaron_gate,nz,2017,aqua-blue-sport-2017,vuelta-a-burgos,5
...,...,...,...,...,...,...
90,adam_de_vos,ca,2017,rally-cycling-2017,volta-ao-algarve,5
91,adam_de_vos,ca,2017,rally-cycling-2017,volta-ao-algarve,5
92,adam_de_vos,ca,2017,rally-cycling-2017,volta-ao-algarve,5
93,adam_de_vos,ca,2017,rally-cycling-2017,volta-ao-algarve,5


In [140]:
#saving the weighted data
weight_df.to_csv('weight_networktest.csv') 