# DataSport Scraping

In [16]:
import time
from random import randint
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as bfs
from IPython.display import HTML

## Scraping a table

In [17]:
# Embed the results page in an iframe so its layout can be inspected from the notebook
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala/alfaw.htm width=1000 height=350></iframe>')

In [18]:
def get_header(bfs_page):
    """Return the header element of a parsed results page.

    Every ``font`` element that is a direct child of a ``pre`` element is
    selected, and the one with the smallest ``len()`` is returned (the first
    such element when several tie).

    Raises
    ------
    ValueError
        If the page contains no ``pre > font`` element.
    """
    return min(bfs_page.select('pre > font'), key=len)

def get_raw_table(bfs_page):
    """Return the element of a parsed results page that holds the table body.

    Every ``font`` element that is a direct child of a ``pre`` element is
    selected, and the one with the largest ``len()`` is returned (the first
    such element when several tie).

    Raises
    ------
    ValueError
        If the page contains no ``pre > font`` element.
    """
    return max(bfs_page.select('pre > font'), key=len)

In [19]:
def clean_raw_table(raw, min_len=10, max_len=-1):
    """Filter an iterable of lines by length.

    A line is kept when it is strictly longer than ``min_len`` and, if
    ``max_len`` is not negative, strictly shorter than ``max_len``.
    A negative ``max_len`` disables the upper bound.

    Returns a new list with the kept lines in their original order.
    """
    kept = []
    for line in raw:
        size = len(line)
        if size <= min_len:
            continue
        if max_len >= 0 and size >= max_len:
            continue
        kept.append(line)
    return kept

In [20]:
def split_raw_table(raw, header=''):
    """Split a fixed-width text table into per-row column values.

    The text is split on newlines and filtered through ``clean_raw_table``.
    A character index at which every kept line has a blank is treated as a
    column separator, provided at least one character was stored at that
    index (this keeps a run of blanks from producing empty columns).

    Parameters
    ----------
    raw : str
        Text of the table, one row per line.
    header : str, optional
        Header line; each column name is the stripped slice of this string
        between two consecutive separator positions.

    Returns
    -------
    tuple of (list of list of str, list of str)
        ``values`` holds one list of column strings per kept line; leading
        blanks of a column are dropped, trailing blanks are kept.
        ``column_names`` holds one name per detected separator, so the text
        following the last separator has no name.
        Both lists are empty when no line survives the cleaning step.
    """
    lines = clean_raw_table(raw.split('\n'))

    # Nothing to split: computing the longest line below would raise
    # ValueError on an empty list
    if not lines:
        return [], []

    nb_char_max = max(len(line) for line in lines)

    values = [[] for _ in lines]

    column_char_idx = 0
    column_names = []
    column_idx = 0

    for char_idx in range(nb_char_max):

        # If all lines have a blank at the same character index,
        # this is a separator and we need to split into two columns.
        # Lines that end before char_idx never count as blank, so no
        # separator can be found past the end of the shortest line.
        nb_blank = 0

        # To avoid adding a fully blank column, check that a character
        # was stored for at least one line at this index
        has_value = False

        for line_idx, line in enumerate(lines):

            if len(line) <= char_idx:
                continue

            char = line[char_idx]
            row = values[line_idx]

            if char == ' ':
                nb_blank += 1

            # First character seen for the current column on this line
            if len(row) == column_idx:
                row.append('')

            # Skip blanks that come before the value; keep the ones after it
            if char != ' ' or len(row[column_idx]) > 0:
                row[column_idx] += char
                has_value = True

        if nb_blank == len(lines) and has_value:
            column_idx += 1
            column_name = header[column_char_idx:char_idx].strip()
            column_names.append(column_name)
            column_char_idx = char_idx

    return values, column_names

In [21]:
def _clean_value(value):
    """Trim one raw cell and strip its decorations (trailing point, parentheses, dashes)."""
    clean_value = value.strip()

    # Remove the point present in the last character of the value (like in rank)
    if clean_value.endswith('.'):
        clean_value = clean_value[:-1]

    # Remove enclosing parentheses
    if clean_value.startswith('(') and clean_value.endswith(')'):
        clean_value = clean_value[1:-1]

    # A value that starts and ends with a dash is a placeholder, not a real value
    if clean_value.startswith('-') and clean_value.endswith('-'):
        clean_value = ''

    return clean_value


def create_clean_dataframe(raw, header):
    """Build a DataFrame of cleaned strings from a fixed-width text table.

    The table is split with ``split_raw_table``; the last value of every row
    and the last column name are dropped, and each remaining value is
    stripped of surrounding whitespace, a trailing point, enclosing
    parentheses, and replaced by ``''`` when it starts and ends with a dash.

    Values that become empty during cleaning (for example ``'.'`` or
    ``'()'``) yield ``''`` instead of raising ``IndexError``.

    Parameters
    ----------
    raw : str
        Text of the table, one row per line.
    header : str
        Header line used to derive the column names.

    Returns
    -------
    pandas.DataFrame
        One row per table line; every cell is a string.
    """
    data, columns = split_raw_table(raw, header)

    # Remove the last column of every row
    clean_data = [[_clean_value(value) for value in row[:-1]] for row in data]

    # TODO:
    #   - Set types
    #   - Translate column names ?
    #   - Fix: sometimes the algorithm makes unnecessary splits, see:
    #     https://services.datasport.com/2015/lauf/transviamala/alfaw.htm

    # Create the dataframe (the last column name is removed like the values above)
    return pd.DataFrame(data=clean_data, columns=columns[:-1])

In [22]:
# Download one results page and parse it with the html5lib parser
page = rq.get('https://services.datasport.com/2015/lauf/transviamala/alfac.htm')
soup = bfs(page.text, 'html5lib')

# Pick the header and the table body among the page's `pre > font` elements
header = get_header(soup)
raw_table = get_raw_table(soup)

# Last expression of the cell: the resulting DataFrame is displayed as output
create_clean_dataframe(raw_table.text, header.text)

Unnamed: 0,Kate,gorie Rang,Name,Jg,Land Ort,Unnamed: 6,Team,Zeit,Rückstand,Stnr,Unnamed: 11,Unnamed: 12
0,W-TD,,Caduff Irena,1966,Morissen,,US Lumnezia,"2:26.51,3",,3015,Diplom,Foto
1,W-TH,,Caduff Ueli,1963,Morissen,,US Lumnezia,"2:17.26,7",,3016,Diplom,Foto
2,C-H2,31.0,Caflisch Michael,1967,Chur,,,"1:37.00,2","34.40,4",2010,Diplom,Foto
3,J-F2,1.0,Caluori Ginia,2002,Bonaduz,,SC Beverin,"19.27,6",,5008,Diplom,Foto
4,W-TD,,Caluori Lucia,1970,Bonaduz,,,"2:51.42,3",,3017,Diplom,Foto
5,J-K3,6.0,Caluori Matteo,2004,Bonaduz,,,"21.28,3","3.43,4",5009,Diplom,Foto
6,T-H1,134.0,Camenisch Matias,1981,Kloten,,Egetswiler Lauffründe,"2:12.54,5","52.13,8",1073,Diplom,Foto
7,W-CF,,Caminada-Paier Maria,1956,Rhäzüns,,,"2:07.52,8",,4005,Diplom,Foto
8,N-C,3.0,Candrian Alamo,1966,Bubendorf,,,"5:03.28,0","17.15,5",106,Diplom,Foto
9,N-F2,3.0,Cantieni Christine,1966,Donat,,transviamala.ch,"4:39.24,1","30.17,6",258,Diplom,Foto


## Scraping a run

In [23]:
# Embed the run's index page in an iframe so its links can be inspected from the notebook
HTML('<iframe src=https://services.datasport.com/2015/lauf/transviamala width=1000 height=350></iframe>')

In [45]:
def get_all_data_from_page(url, timeout=30):
    """Scrape every table linked from a run's index page.

    The index page is downloaded, every ``font > a`` link whose ``href``
    contains ``ALF`` is followed, and each linked page is converted to a
    DataFrame with ``create_clean_dataframe``. Progress is printed for each
    page and a random pause of 1 to 3 seconds is made between requests.

    Parameters
    ----------
    url : str
        Address of the run's index page.
    timeout : float or None, optional
        Timeout in seconds passed to every HTTP request, so that a stalled
        connection cannot block the notebook forever. ``None`` disables it.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per linked page, in link order.

    Raises
    ------
    requests.HTTPError
        If the index page or a linked page answers with an error status.
    """
    page = rq.get(url, timeout=timeout)
    page.raise_for_status()
    soup = bfs(page.text, 'html5lib')
    table_links = soup.select('font > a[href*=ALF]')

    data = []
    nb_links = len(table_links)
    for idx, link in enumerate(table_links, start=1):
        # Resolve the link against the URL the index page was finally served
        # from: this stays correct when `url` has no trailing slash but the
        # server redirects to one, and when the href is already absolute
        full_link = urljoin(page.url, link['href'])

        print('{}/{} - Processing {}'.format(idx, nb_links, full_link))

        alpha_page = rq.get(full_link, timeout=timeout)
        alpha_page.raise_for_status()
        alpha_bfs = bfs(alpha_page.text, 'html5lib')
        df = create_clean_dataframe(get_raw_table(alpha_bfs).text, get_header(alpha_bfs).text)
        data.append(df)

        # Pause between requests to spare the server; no need after the last one
        if idx < nb_links:
            time.sleep(randint(1, 3))
    return data

In [None]:
# Scrape every table linked from the run's index page; slow, since the function sleeps between requests
data = get_all_data_from_page('https://services.datasport.com/2015/lauf/transviamala/')

1/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAA.HTM
2/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAB.HTM
3/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAC.HTM
4/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAD.HTM
5/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAE.HTM
6/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAF.HTM
7/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAG.HTM
8/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAH.HTM
9/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAI.HTM
10/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAJ.HTM
11/25 - Processing https://services.datasport.com/2015/lauf/transviamala/ALFAK.HTM


In [28]:
# Peek at the third scraped table
# NOTE(review): raises IndexError when `data` holds fewer than three tables
data[2].head()

IndexError: list index out of range

In [29]:
# Print the column labels of every scraped table
for df in data:
    print(df.columns)