In [None]:
import requests as rq
from bs4 import BeautifulSoup as bfs
from IPython.display import HTML
import pandas as pd
import numpy as np
import time
import random
import os.path

# Get all acodes (unique runner codes) in a run event

In [None]:
# Embed an example Datasport results page (Transviamala 2015) for visual reference.
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala/alfaw.htm width=1000 height=350></iframe>')

In [None]:
def get_all_acode(url):
    """Collect the acode of every runner listed on one result page.

    Downloads ``url``, parses it with html5lib and reads every
    ``<span class="myds">`` element found in the page body.

    Parameters
    ----------
    url : str
        Address of the listing page to scrape.

    Returns
    -------
    dict
        Maps each span's ``acode`` attribute to the span's text with the
        surrounding whitespace removed.
    """
    response = rq.get(url)
    soup = bfs(response.text, 'html5lib')

    acode_to_name = {}
    for span in soup.body.find_all('span', attrs={'class': 'myds'}):
        # A repeated acode keeps the text of its last occurrence.
        acode_to_name[span['acode']] = span.text.strip()

    return acode_to_name

def get_all_acode_from_run_event(url, file=False):
    """Collect the acode of every runner of one run event, with an optional CSV cache.

    When ``file`` is given and already exists, the acodes are read back from
    it and no network request is made. Otherwise the event page at ``url`` is
    downloaded, every link matching ``font > a[href*=ALF]`` is followed and
    scraped with ``get_all_acode``, and the merged result is written to
    ``file`` (when given).

    Parameters
    ----------
    url : str
        Address of the run event page. A trailing ``/`` is appended if
        missing before the relative links are joined to it.
    file : str or False, optional
        Path of the CSV cache (columns ``acode`` and ``name``). ``False``
        (the default) disables both reading and writing.

    Returns
    -------
    dict
        Maps acode (str) to runner name (str).
    """
    if file and os.path.isfile(file):
        print('Read acode from file: ' + file)
        # Read everything as text and disable NA detection so that the cached
        # result matches a fresh scrape: acodes such as "007" must not become
        # the integer 7 and a name such as "NA" must not become NaN.
        cached = pd.read_csv(file, dtype=str, keep_default_na=False)
        return dict(zip(cached['acode'], cached['name']))

    page = rq.get(url)
    soup = bfs(page.text, 'html5lib')
    table_links = soup.select('font > a[href*=ALF]')

    # The hrefs are joined to the event URL by plain concatenation.
    if not url.endswith('/'):
        url += '/'

    all_acode = {}
    total = len(table_links)
    for idx, link in enumerate(table_links):
        full_link = url + link['href']

        print(str(idx+1) + '/' + str(total) + ' - Processing ' + full_link)

        # In-place merge; later pages override earlier ones on duplicate acodes.
        all_acode.update(get_all_acode(full_link))

        # Random pause between requests.
        time.sleep(random.uniform(0.5, 3))

    if file:
        print('Write acode in file: ' + file)
        # to_csv does not create missing directories; make sure the target
        # exists so a finished crawl is not lost to an OSError.
        directory = os.path.dirname(file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        pd.Series(all_acode).to_csv(file, header=['name'], index_label='acode')

    return all_acode

In [None]:
# Scrape one event; no cache file is passed, so nothing is read from or written to disk.
acode_list = get_all_acode_from_run_event('https://services.datasport.com/2009/diverse/trophy/')

In [None]:
# Show the first ten (acode, name) pairs as a sanity check.
list(acode_list.items())[:10]

# Get run events

In [None]:
# Embed the Datasport calendar page that the following cells query.
HTML('<iframe src=https://www.datasport.com/en/Calendar/ width=1000 height=350></iframe>')

## Get all params

In [None]:
def get_all_params_calendar():
    """Read the filter options offered by the Datasport calendar page.

    Downloads the calendar page, locates the table with id
    ``ds-calendar-header`` and inspects every ``<select>`` inside it.

    Returns
    -------
    dict
        One entry per ``<select>``, keyed by its ``name`` attribute. Each
        value is a dict mapping an option's displayed text to its ``value``
        attribute.
    """
    response = rq.get('https://www.datasport.com/en/Calendar/')
    soup = bfs(response.text, 'html5lib')
    header_table = soup.find('table', attrs={'id': 'ds-calendar-header'})

    return {
        select['name']: {
            option.text: option['value'] for option in select.find_all('option')
        }
        for select in header_table.find_all('select')
    }

In [None]:
# Fetch the filter options once; get_all_run_events reads this module-level dict.
calendar_available_params = get_all_params_calendar()
# Bare last expression so the notebook displays the dict.
calendar_available_params

## Get all run event URLs in a calendar

In [None]:
def get_run_events_url(year=2009, month=1, country='CCH', sport='Running'):
    """Query the Datasport calendar and return the events that carry a link.

    Posts the search form to the calendar page, then walks the rows of the
    table with id ``ds-calendar-body``. A row is kept only when it has a
    fifth cell and that cell contains an ``<a>`` element.

    Parameters
    ----------
    year, month : int or str
        Sent as the ``eventyear`` / ``eventmonth`` form fields.
    country : str
        Sent as the ``eventlocation`` form field.
    sport : str
        Sent as the ``etyp`` form field.

    Returns
    -------
    dict
        Maps the link's ``href`` to a dict with the keys ``year``, ``month``,
        ``country``, ``sport`` (the arguments, echoed back), ``full_date``
        (stripped text of the first cell) and ``name`` (stripped text of the
        second cell).
    """
    calendar_params = {
        'dr': '',
        # NOTE(review): opaque hard-coded value, presumably copied from a
        # browser session -- confirm the site still accepts it.
        'lastQuery': 'D147BC896417D2D2B96FA1AADD893731',
        'eventsearch': '',
        'eventservice': 'all',
        'start': 1,
        'etyp': sport,
        'eventlocation': country,
        'eventmonth': month,
        'eventyear': year,
    }

    calendar_page = rq.post('https://www.datasport.com/en/Calendar/', data=calendar_params)
    calendar_soup = bfs(calendar_page.text, 'html5lib')

    table = calendar_soup.find('table', attrs={'id': 'ds-calendar-body'})

    all_event_url = {}
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        # The link is read from the fifth cell (index 4), so the row needs at
        # least five cells; the former `>= 4` check let four-cell rows through
        # and raised IndexError on columns[4].
        if len(columns) > 4:
            url = columns[4].find('a')
            if url:
                all_event_url[url['href']] = {
                    'year': year,
                    'month': month,
                    'country': country,
                    'sport': sport,
                    'full_date': str.strip(columns[0].text),
                    'name': str.strip(columns[1].text),
                }

    return all_event_url

In [None]:
# Try the query with its defaults (year 2009, month 1, country 'CCH', sport 'Running').
get_run_events_url()

## Get all run event urls

In [None]:
def get_all_run_events():
    """Query the calendar for every concrete year/month combination.

    Iterates over the ``eventyear`` and ``eventmonth`` option values stored in
    the module-level ``calendar_available_params`` dict, skipping the ``'all'``
    entries, and calls ``get_run_events_url`` for each pair (other arguments
    left at their defaults). A random pause of 0.5 to 2 seconds follows every
    request.

    Returns
    -------
    dict
        Union of all per-month results, keyed by event URL. On duplicate
        URLs the most recently fetched entry wins.
    """
    collected = {}
    for year_value in calendar_available_params['eventyear'].values():
        if year_value == 'all':
            continue
        for month_value in calendar_available_params['eventmonth'].values():
            if month_value == 'all':
                continue
            print('Processing: Year ' + year_value + ' / Month ' + month_value)
            collected.update(get_run_events_url(year_value, month_value))
            time.sleep(random.uniform(0.5, 2))
    return collected

In [None]:
# Crawl every year/month combination of the calendar. Slow: each request is
# followed by a random 0.5-2 s pause.
run_events = get_all_run_events()

In [None]:
# The dict is keyed by URL, so DataFrame() puts URLs in the columns;
# transpose to get one row per event.
run_events_df = pd.DataFrame(run_events).T
run_events_df.index.name = 'url'

In [None]:
# Display the first rows as a sanity check.
run_events_df.head()

In [None]:
# Persist the events; the 'url' index is written as the first column.
# The 'Data' directory must already exist.
run_events_df.to_csv('Data/run_events.csv')

In [None]:
# Peek at the written file (shell `head`, so this needs a Unix-like environment).
!head Data/run_events.csv

## Get all acodes for all runs from 2009 to 2015

In [None]:
# Reload the events from disk. No index_col is given, so 'url' comes back as
# an ordinary column and the frame gets a default integer index; this rebinds
# run_events_df to a frame shaped differently from the one built earlier.
run_events_df = pd.read_csv('Data/run_events.csv')
run_events_df.head()

In [None]:
# Keep only the events whose year lies in the half-open range [2009, 2016).
run_events_df_2009_2015 = run_events_df[
    (run_events_df['year'] >= 2009) & (run_events_df['year'] < 2016)
]
# Note: .size is rows x columns, not the number of events.
run_events_df_2009_2015.size

In [None]:
# Save the filtered events. The row index is written under the column name
# 'acode_index'; these index values are the ones used to name the per-event
# cache files in get_all_acode_from_list_run_event.
run_events_df_2009_2015.to_csv('Data/run_events_2009_2015.csv', index_label='acode_index')
# Peek at the written file (shell `head`).
!head Data/run_events_2009_2015.csv

In [None]:
def get_all_acode_from_list_run_event(run_events):
    """Scrape the acodes of every event in a DataFrame and merge them.

    Each row is passed to ``get_all_acode_from_run_event`` together with a
    per-event cache path ``Data/acode_2009_2015/<row index>.csv``.

    Parameters
    ----------
    run_events : pandas.DataFrame
        One row per event. The columns ``name``, ``full_date`` and ``url``
        are read, as well as the row index.

    Returns
    -------
    dict
        Union of the per-event results. On duplicate acodes the entry from
        the later event wins.
    """
    merged = {}
    for event in run_events.itertuples():
        print('Processing: Run "' + event.name + '" / Date ' + event.full_date + ' / Url ' + event.url)
        cache_file = 'Data/acode_2009_2015/' + str(event.Index) + '.csv'
        merged.update(get_all_acode_from_run_event(event.url, cache_file))
    return merged

In [None]:
# Scrape (or reload from the per-event CSV caches) the acodes of every 2009-2015 event.
all_acode_2009_2015 = get_all_acode_from_list_run_event(run_events_df_2009_2015)