# DataSport Scraping

In [None]:
import requests as rq
from bs4 import BeautifulSoup as bfs
from IPython.display import HTML
import pandas as pd
import numpy as np
import collections

## Scraping a table

In [None]:
# Display the results page inline in the notebook, inside an iframe
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala/alfaw.htm width=1000 height=350></iframe>')

In [None]:
# Download the results page. The timeout keeps the cell from hanging forever
# on an unresponsive server, and raise_for_status() stops on an HTTP error
# instead of parsing an error page as if it were the results.
page = rq.get('https://services.datasport.com/2015/lauf/transviamala/alfaw.htm', timeout=30)
page.raise_for_status()
soup = bfs(page.text, 'html5lib')

# All <font> elements that are direct children of a <pre> element
fonts = soup.select('pre > font')

In [None]:
def clean_raw_table(raw, min_len=10, max_len=-1):
    """Keep only the lines whose length lies within the accepted bounds.

    Args:
        raw: Iterable of lines (any items supporting ``len``).
        min_len: A line is kept only if it is strictly longer than this.
        max_len: A line is kept only if it is strictly shorter than this.
            A negative value disables the upper bound.

    Returns:
        A new list with the kept lines, in their original order.
    """
    kept = []
    for line in raw:
        size = len(line)

        # Too short: skip without looking at the upper bound
        if size <= min_len:
            continue

        within_upper_bound = max_len < 0 or size < max_len
        if within_upper_bound:
            kept.append(line)

    return kept

In [None]:
def split_raw_table(raw, header=''):
    """Split a fixed-width text table into columns.

    The text is cut into lines, filtered through ``clean_raw_table``, and
    scanned one character position at a time. A position is treated as a
    column separator when every kept line has a space at that position and
    at least one line already holds text in the current column. A line that
    is too short to reach a position does not count as blank, so no
    separator is detected past the end of the shortest kept line.

    Args:
        raw: The whole table as one newline-separated string.
        header: Optional header line. It is sliced at the detected separator
            positions to produce the column names.

    Returns:
        A ``(values, column_names)`` tuple. ``values`` holds one list of cell
        strings per kept line; leading blanks of a cell are dropped, trailing
        ones are kept. ``column_names`` holds the (unstripped) header slices;
        a name is recorded only for separators located inside ``header``.
        Both lists are empty when no line survives the cleaning step.
    """
    lines = clean_raw_table(raw.split('\n'))

    # Nothing left to split: return empty results rather than letting max()
    # fail on an empty sequence.
    if not lines:
        return [], []

    nb_char_max = max(len(line) for line in lines)

    values = [[] for _ in lines]
    column_names = []
    column_idx = 0
    # Index in the header where the name of the current column starts
    name_start = 0

    for char_idx in range(nb_char_max):

        # If all lines have a blank at the same character index,
        # this is a separator and we need to split into two columns
        nb_blank = 0

        # To avoid adding a fully blank column, check that at least one
        # line received a character at this index
        has_value = False

        for line, row in zip(lines, values):

            # This line ends before the current position
            if char_idx >= len(line):
                continue

            char = line[char_idx]
            if char == ' ':
                nb_blank += 1

            # First character seen for this column on this line
            if len(row) == column_idx:
                row.append('')

            # Skip the blanks located before the value
            if char != ' ' or row[column_idx]:
                row[column_idx] += char
                has_value = True

        if nb_blank == len(lines) and has_value:
            column_idx += 1
            if len(header) > char_idx:
                column_names.append(header[name_start:char_idx])
                name_start = char_idx

    return values, column_names

In [None]:
def _clean_value(value):
    """Trim one raw cell and remove its decorative characters."""
    cleaned = value.strip()

    # Remove the point present in last character of the value (like in rank).
    # startswith/endswith are used throughout because they are safe on an
    # empty string, which a previous step may have produced (e.g. "." or "()").
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]

    # Remove surrounding parentheses
    if cleaned.startswith('(') and cleaned.endswith(')'):
        cleaned = cleaned[1:-1]

    # Set empty value when there is no real value (dash placeholder)
    if cleaned.startswith('-') and cleaned.endswith('-'):
        cleaned = ''

    return cleaned


def create_clean_dataframe(raw, header):
    """Build a dataframe of cleaned cell values from a raw text table.

    Args:
        raw: The table as one newline-separated string, passed on to
            ``split_raw_table``.
        header: The header line, passed on to ``split_raw_table`` to get the
            column names.

    Returns:
        A ``pandas.DataFrame`` of strings. The last value of every row and
        the last column name are dropped; each remaining cell is stripped,
        loses a trailing point and surrounding parentheses, and becomes an
        empty string when it starts and ends with a dash.
    """
    data, columns = split_raw_table(raw, header)

    # Remove the last column of every row and clean the remaining cells
    clean_data = [[_clean_value(value) for value in row[:-1]] for row in data]

    # Create the dataframe (we remove the last column like rows above).
    # NOTE(review): split_raw_table appears to record a name only at each
    # detected separator, so verify that the number of names left here
    # matches the row width for the scraped pages.
    # TODO: set column types; translate column names?
    return pd.DataFrame(data=clean_data, columns=columns[:-1])

In [None]:
# Build the dataframe, passing the text of fonts[2] as the raw table and the text of fonts[0] as the header line
create_clean_dataframe(fonts[2].text, fonts[0].text)