# DataSport Scraping

In [None]:
import time
from random import randint
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as bfs
from IPython.display import HTML

## Scraping a table

In [None]:
# Embed the example results page inline (iframe) so it can be inspected next to the code.
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala/alfaw.htm width=1000 height=350></iframe>')

In [None]:
def get_header(bfs_page):
    """Return the element used as the table header.

    Every ``<font>`` that is a direct child of a ``<pre>`` is collected and
    the one with the smallest ``len()`` is returned. On a tie, the first
    match returned by ``select`` wins.

    :param bfs_page: parsed page exposing a CSS ``select`` method
    :return: the ``pre > font`` match with the smallest ``len()``
    :raises ValueError: if the page has no ``pre > font`` match
    """
    return min(bfs_page.select('pre > font'), key=len)

def get_raw_table(bfs_page):
    """Return the element used as the table body.

    Every ``<font>`` that is a direct child of a ``<pre>`` is collected and
    the one with the largest ``len()`` is returned. On a tie, the first
    match returned by ``select`` wins.

    :param bfs_page: parsed page exposing a CSS ``select`` method
    :return: the ``pre > font`` match with the largest ``len()``
    :raises ValueError: if the page has no ``pre > font`` match
    """
    return max(bfs_page.select('pre > font'), key=len)

In [None]:
def clean_raw_table(raw, min_len=10, max_len=-1):
    """Keep only the lines whose length lies in the accepted range.

    :param raw: iterable of lines
    :param min_len: a line is kept only if it is strictly longer than this
    :param max_len: a line is kept only if it is strictly shorter than this;
        a negative value disables the upper bound
    :return: list of the retained lines, in their original order
    """
    kept = []
    for entry in raw:
        size = len(entry)

        # Too short to be a data row.
        if size <= min_len:
            continue

        # Upper bound only applies when max_len is non-negative.
        if 0 <= max_len <= size:
            continue

        kept.append(entry)
    return kept

In [None]:
def split_raw_table(raw, header=''):
    """Split a fixed-width text table into per-row column values.

    The text is cut into lines and filtered with ``clean_raw_table`` (default
    settings). The remaining lines are then scanned one character position at
    a time. A position is a column boundary when it is blank, or past the end,
    in every line and at least one line already holds text in the current
    column.

    :param raw: the whole table as one string, rows separated by ``'\\n'``
    :param header: header line aligned with the table; the stripped slice
        between two consecutive boundaries becomes the column name
    :return: ``(values, column_names)``. ``values`` holds one list of raw
        column strings per retained line. The strings are not stripped and
        may carry trailing blanks, and shorter lines yield fewer columns.
        ``column_names`` gets one entry per boundary found, so text after
        the last boundary has no name. Both are empty lists when no line
        survives the filtering.
    """
    lines = clean_raw_table(raw.split('\n'))

    # Nothing to split: previously max() raised ValueError on an empty list.
    if not lines:
        return [], []

    nb_char_max = max(len(line) for line in lines)

    values = [[] for _ in lines]

    column_char_idx = 0
    column_names = []
    column_idx = 0

    for char_idx in range(nb_char_max):

        # If every line has a blank at the same character index,
        # this is a separator and a new column starts after it.
        nb_blank = 0

        # To avoid adding a completely blank column, check that
        # at least one line contributed a character at this index.
        has_value = False

        for line_idx, line in enumerate(lines):

            if len(line) > char_idx:

                if line[char_idx] == ' ':
                    nb_blank += 1

                # Start the new column for this line
                if len(values[line_idx]) == column_idx:
                    values[line_idx].append('')

                # Add the character if it is not blank, or if the column already
                # holds a value (blanks in the middle of a column are kept)
                if line[char_idx] != ' ' or len(values[line_idx][column_idx]) > 0:
                    values[line_idx][column_idx] += line[char_idx]
                    has_value = True

            else:
                # A line that ends before this index counts as blank here
                nb_blank += 1

        if nb_blank == len(lines) and has_value:
            column_idx += 1
            # Name the finished column after the header text above it
            column_name = header[column_char_idx:char_idx].strip()
            column_names.append(column_name)
            column_char_idx = char_idx

    return values, column_names

In [None]:
def _clean_cell(value):
    """Normalise one raw cell: strip it, then drop decorations in order.

    After stripping surrounding whitespace, the following are applied in
    sequence: a trailing ``.`` is removed (as found after a rank), one pair
    of enclosing parentheses is removed, and a value that both starts and
    ends with ``-`` is treated as "no real value" and becomes ``''``.
    """
    clean_value = value.strip()

    # startswith/endswith are safe on an empty string. Indexing [0] was not:
    # a cell holding only "." or "()" used to raise IndexError.
    if clean_value.endswith('.'):
        clean_value = clean_value[:-1]

    if clean_value.startswith('(') and clean_value.endswith(')'):
        clean_value = clean_value[1:-1]

    if clean_value.startswith('-') and clean_value.endswith('-'):
        clean_value = ''

    return clean_value


def create_clean_dataframe(raw, header):
    """Build a dataframe of cleaned string cells from a raw text table.

    The table is split with ``split_raw_table``. The last column of every
    row and the last column name are discarded, and every remaining cell is
    cleaned (see ``_clean_cell``).

    TODO:
        - Set types
        - Translate column names?
        - Fix: sometimes the algorithm makes unnecessary splits, see
          https://services.datasport.com/2015/lauf/transviamala/alfaw.htm

    :param raw: the whole table as one string
    :param header: header line aligned with the table, used for column names
    :return: ``pandas.DataFrame`` whose cells are all strings
    """
    data, columns = split_raw_table(raw, header)

    # Drop the last column of each row and clean what remains
    clean_data = [[_clean_cell(value) for value in row[:-1]] for row in data]

    # The last column name is dropped to match the rows above
    return pd.DataFrame(data=clean_data, columns=columns[:-1])

In [None]:
# Download one result page and parse it with the html5lib parser.
page = rq.get('https://services.datasport.com/2015/lauf/transviamala/alfac.htm')
soup = bfs(page.text, 'html5lib')

# Pick the header element and the table element out of the parsed page.
header = get_header(soup)
raw_table = get_raw_table(soup)

# Last expression of the cell: the resulting dataframe is the cell output.
create_clean_dataframe(raw_table.text, header.text)

In [None]:
# Same pipeline as above on a page from a different event (lamara),
# reusing (and overwriting) the page/soup/header/raw_table names.
page = rq.get('https://services.datasport.com/2015/lauf/lamara/alfan.htm')
soup = bfs(page.text, 'html5lib')

header = get_header(soup)
raw_table = get_raw_table(soup)

# Last expression of the cell: the resulting dataframe is the cell output.
create_clean_dataframe(raw_table.text, header.text)

## Scraping a run

In [None]:
# Embed the event's index page inline (iframe); it is the page whose table links are followed below.
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala width=1000 height=350></iframe>')

In [None]:
def get_all_data_from_page(url, timeout=30):
    """Scrape every result table linked from an event index page.

    The index page is downloaded and each ``font > a`` link whose ``href``
    contains ``ALF`` is followed. Each linked page is turned into a dataframe
    with ``create_clean_dataframe``. Progress is printed for each table, and
    the function sleeps 1 to 3 seconds (random) after each one.

    :param url: address of the index page
    :param timeout: seconds to wait for each HTTP request before giving up
    :return: list of dataframes, one per linked table, in link order
    :raises requests.HTTPError: if a page answers with an error status
    :raises requests.Timeout: if a request exceeds ``timeout``
    """
    page = rq.get(url, timeout=timeout)
    page.raise_for_status()
    soup = bfs(page.text, 'html5lib')
    table_links = soup.select('font > a[href*=ALF]')

    data = []
    for idx, link in enumerate(table_links, start=1):
        # Resolve the href against the URL that was actually served. Plain
        # string concatenation breaks for absolute hrefs and when the index
        # URL was redirected (e.g. a trailing slash was added).
        full_link = urljoin(page.url, link['href'])

        print('{}/{} - Processing {}'.format(idx, len(table_links), full_link))

        alpha_page = rq.get(full_link, timeout=timeout)
        alpha_page.raise_for_status()
        alpha_bfs = bfs(alpha_page.text, 'html5lib')
        df = create_clean_dataframe(get_raw_table(alpha_bfs).text, get_header(alpha_bfs).text)
        data.append(df)

        # Random pause between requests
        time.sleep(randint(1, 3))
    return data

In [None]:
# Scrape every table linked from the event index page (one HTTP request per table, with pauses).
data = get_all_data_from_page('https://services.datasport.com/2015/lauf/transviamala/')

In [None]:
# Preview the first rows of the third scraped table.
data[2].head()

In [None]:
# Print the column labels of each scraped table so they can be compared across tables.
for df in data:
    print(df.columns)