# DataSport Scraping

In [None]:
import requests as rq
from bs4 import BeautifulSoup as bfs
from IPython.display import HTML
import pandas as pd
import numpy as np
import time
import random
from astropy.io import ascii
import math
import os
from urllib.parse import urlparse

## Scraping a table

In [None]:
# Embed the single results table that the functions below scrape, so the raw
# page can be compared with the parsed DataFrame.
HTML('<iframe src=https://services.datasport.com/2016/lauf/lamara/alfaa.htm width=1000 height=350></iframe>')

In [None]:
def get_content(bfs_page):
    """Return the lines of text held in the ``<pre>`` blocks of a page.

    The text of every ``<font>`` element that is a direct child of a
    ``<pre>`` element is concatenated, in the order ``select`` returns
    them, and the result is split on newline characters.

    Args:
        bfs_page: Parsed page exposing a ``select(css_selector)`` method
            whose results have a ``text`` attribute.

    Returns:
        A list of strings, one per line of the concatenated text. A page
        without any matching element yields ``['']``.
    """
    pieces = [font.text for font in bfs_page.select('pre > font')]
    return ''.join(pieces).split('\n')

def get_probability_of_split(table, column):
    """Return the fraction of rows that have a space at position ``column``.

    Rows shorter than or equal to ``column`` characters count as
    "not blank" at that position.

    Args:
        table: Sequence of text lines (the data rows of the table).
        column: Zero-based character position to inspect.

    Returns:
        A float between 0 and 1. An empty ``table`` yields ``0.0`` instead
        of raising ``ZeroDivisionError``, so a page with a header but no
        data rows can still be processed.
    """
    if not table:
        return 0.0

    nb_blank = sum(
        1 for line in table if len(line) > column and line[column] == ' '
    )
    return nb_blank / len(table)

def split_separator(probabilities, separator):
    """Blank out one character of ``separator`` at the most likely split.

    Args:
        probabilities: Non-empty sequence of dicts with the keys
            ``'index'`` (character position) and ``'probability'``.
        separator: The separator line to modify.

    Returns:
        A copy of ``separator`` in which the character at the index of the
        highest-probability candidate is replaced by a space. When several
        candidates share the highest probability, the first one is used.

    Raises:
        ValueError: If ``probabilities`` is empty.
    """
    if not probabilities:
        raise ValueError('No probability given')

    # max() keeps the first of several equal maxima, like a strict ">" scan.
    best = max(probabilities, key=lambda candidate: candidate['probability'])
    cut = best['index']
    return separator[:cut] + ' ' + separator[cut + 1:]

def read_table(content):
    """Parse the fixed-width text table in ``content`` into a DataFrame.

    ``content[0]`` is taken as the header line, ``content[1]`` as the
    separator line and everything after that as data rows.

    For every run of spaces in the header that is followed by a non-space
    character, the position inside that run where the largest fraction of
    data rows is blank is overwritten with a space in the separator line.
    A run of spaces at the very end of the header is left alone because no
    non-space character follows it. The adjusted lines are then handed to
    astropy's ``fixed_width_two_line`` reader.

    Args:
        content: List of text lines, header first.

    Returns:
        The parsed table as a pandas DataFrame.
    """
    header, separator = content[0], content[1]
    table = content[2:]

    in_gap = False
    candidates = []

    for position, char in enumerate(header):
        if char == ' ':
            # Still inside a gap between two header labels: record how
            # likely this position is to be a real column boundary.
            in_gap = True
            candidates.append({
                'index': position,
                'probability': get_probability_of_split(table, position),
            })
        elif in_gap:
            # First character of the next label: settle the gap just left.
            in_gap = False
            separator = split_separator(candidates, separator)
            candidates = []

    final_content = [header, separator] + table

    # Dash-only cells and empty cells are passed as (bad value, replacement)
    # pairs so the reader treats them as missing -- presumably; confirm
    # against the astropy fill_values documentation.
    fill_values = [('-----', ''), ('---', ''), ('--', ''), ('-', ''), ('', '')]

    # A column named '¦' is dropped from the result.
    exclude_names = ['¦']

    parsed = ascii.read(
        final_content,
        format='fixed_width_two_line',
        exclude_names=exclude_names,
        fill_values=fill_values,
    )
    return parsed.to_pandas()

def add_acodes(df, soup, check_names=False):
    """Add an ``'acode'`` column to ``df`` from the page's runner spans.

    The acodes are read from the ``acode`` attribute of every ``<span>``
    with class ``myds`` found in ``soup``.

    Args:
        df: DataFrame of the scraped table; modified in place.
        soup: Parsed page the table came from.
        check_names: If true, each acode is assigned to the rows whose
            ``'nom'`` column equals the stripped span text, and rows
            without a match keep an empty string. If false, the acodes are
            assigned by position.

    Returns:
        The same ``df`` object, with the ``'acode'`` column set.
    """
    spans = soup.findAll('span', attrs={'class': 'myds'})

    if not check_names:
        # Fast path: relies on the spans appearing in the same order as the
        # rows. One empty acode is appended for the extra last line of the
        # table.
        df['acode'] = [span['acode'] for span in spans] + ['']
        return df

    # Slow path: one pass over the frame per span, but every acode is
    # guaranteed to sit on a row carrying the matching name.
    df['acode'] = ''
    for span in spans:
        same_name = df['nom'] == span.text.strip()
        df.loc[same_name, 'acode'] = span['acode']
    return df

def read_page(url, acodes=None, timeout=30):
    """Download one results page and return its table as a DataFrame.

    Args:
        url: Address of the page holding the table.
        acodes: ``'no-check'`` adds the acode column by position,
            ``'check'`` adds it by matching runner names; any other value
            leaves the table without an acode column.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the scrape forever.

    Returns:
        The parsed table as a pandas DataFrame.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status, instead of parsing the error page as if it were a table.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    page = rq.get(url, timeout=timeout)
    page.raise_for_status()

    soup = bfs(page.text, 'html5lib')
    df = read_table(get_content(soup))

    if acodes == 'no-check':
        df = add_acodes(df, soup, False)
    elif acodes == 'check':
        df = add_acodes(df, soup, True)
    return df

In [None]:
# Scrape one table, attaching acodes by position ('no-check'), and show its
# first rows.
read_page('https://services.datasport.com/2016/lauf/lamara/alfaa.htm', 'no-check').head()

## Scraping a run

In [None]:
# Embed the event's index page, whose links lead to the individual tables.
HTML('<iframe src=https://services.datasport.com/2016/lauf/lamara/ width=1000 height=350></iframe>')

In [None]:
def get_all_data_from_page(url, directory=False, timeout=30):
    """Scrape every result table linked from an event index page.

    Links are the ``<a>`` elements directly inside a ``<font>`` element
    whose ``href`` contains ``ALF``. Each linked page is read with
    ``read_page(..., 'no-check')``.

    Args:
        url: Address of the index page. Link targets are appended to it
            as-is, so it is expected to end with ``/``.
        directory: Optional directory in which each table is also saved as
            a CSV file named after the host and path of its URL. A falsy
            value disables saving.
        timeout: Seconds to wait for the index page before giving up. It
            only applies to the request made here.

    Returns:
        A list with one DataFrame per linked table, in link order.

    Raises:
        requests.exceptions.HTTPError: If the index page request returns an
            error status.
    """
    page = rq.get(url, timeout=timeout)
    page.raise_for_status()
    soup = bfs(page.text, 'html5lib')

    table_links = soup.select('font > a[href*=ALF]')
    total = len(table_links)

    # Decide once whether CSV files can be written, and say so when they
    # cannot instead of silently dropping every file.
    save_csv = bool(directory) and os.access(directory, os.W_OK)
    if directory and not save_csv:
        print('Cannot write to directory, CSV files will not be saved: ' + str(directory))

    data = []
    for idx, link in enumerate(table_links, start=1):
        full_link = url + link['href']

        print(str(idx) + '/' + str(total) + ' - Processing ' + full_link)
        df = read_page(full_link, 'no-check')
        data.append(df)

        if save_csv:
            url_parsed = urlparse(full_link)
            # e.g. host + '_2016_lauf_event_table.htm' + '.csv'
            file_name = url_parsed.netloc + url_parsed.path.replace('/', '_') + '.csv'
            file_path = os.path.join(directory, file_name)
            df.to_csv(file_path)
            print('Write file: ' + file_path)

        # NOTE: a random delay between requests,
        # time.sleep(random.uniform(0, 0.5)), is disabled here; re-enable it
        # to throttle the scrape.
    return data

In [None]:
# Scrape every table of the event; CSV copies are written to the given
# directory only if it is writable.
data = get_all_data_from_page('https://services.datasport.com/2016/lauf/lamara/', './Data/Lausanne_Marathon_2016')

In [None]:
# Sum the row counts of all scraped DataFrames.
nb_runners = 0
for df in data:
    nb_runners += len(df)

# Bare expression so the notebook displays the total.
nb_runners

### Not used - Get data from all run events

In [None]:
# Load the list of run events; get_data_from_run_events reads the name,
# full_date and url fields of each row.
run_events_df = pd.read_csv('Data/run_events.csv')
run_events_df.head()

In [None]:
def get_data_from_run_events(run_events, path='Data/ScrappingAcodeByRuns/', max_events=10):
    """Scrape the result tables of several run events, one CSV per event.

    For each event, all tables found on ``<url>/`` are concatenated and
    written to ``<path>/<event name>.csv``.

    Args:
        run_events: DataFrame with at least the columns ``name``,
            ``full_date`` and ``url``.
        path: Directory receiving the CSV files.
        max_events: Maximum number of events to attempt, counted from the
            top of ``run_events``. ``None`` processes every event.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    for attempted, run_event in enumerate(run_events.itertuples()):
        # Check before fetching so that exactly max_events events are
        # attempted, not one more.
        if max_events is not None and attempted >= max_events:
            break

        print('Processing: Run "' + run_event.name + '" / Date ' + run_event.full_date + ' / Url ' + run_event.url)
        print('Processing the url: ' + run_event.url)
        data_run_events = get_all_data_from_page(run_event.url + '/')

        # pd.concat raises on an empty list, so skip events without tables.
        if not data_run_events:
            print('No result table found for: ' + run_event.url)
            continue

        # Merge the tables of the event into a single frame.
        result_race_event = pd.concat(data_run_events)
        print(len(result_race_event))

        # os.path.join copes with a path given with or without a trailing
        # separator.
        result_race_event.to_csv(os.path.join(path, run_event.name + '.csv'))


In [None]:
#get_data_from_run_events(run_events_df)

> The URLs below cause problems and need a closer look.

In [None]:
#data_run_event = get_all_data_from_page('http://services.datasport.com/2009/diverse/trophy/')

In [None]:
#data = get_all_data_from_page('http://services.datasport.com/1999/lauf/Greifenseelauf/')