In [None]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import math
import os

# Clean every dataframe and merge them

## Marathon Lausanne 2016

In [None]:
DATA_DIR = './Data/Lausanne_Marathon_2016'
!ls -hl './Data/Lausanne_Marathon_2016'

In [None]:
def _strip_parentheses(value):
    """Return ``value`` without enclosing parentheses; non-matching values are returned as is."""
    if isinstance(value, str) and value.startswith('(') and value.endswith(')'):
        return value[1:-1]
    return value


def _parse_times(values):
    """Parse a column of time strings, trying ``H:MM.SS,f`` first and ``MM.SS,f`` second.

    Entries matching neither format (including missing values) become ``NaT``.
    """
    with_hours = pd.to_datetime(values, format='%H:%M.%S,%f', errors='coerce')
    without_hours = pd.to_datetime(values, format='%M.%S,%f', errors='coerce')
    return with_hours.fillna(without_hours)


def _parse_birth_years(values):
    """Turn a column of birth years into timestamps set to 1 January of that year.

    Whole numbers from 10 to 99 are read as two-digit years and shifted by 1900.
    Anything that is not a whole four-digit year pandas can represent becomes ``NaT``.
    """
    years = pd.to_numeric(values, errors='coerce')
    # Fractional (or infinite) values cannot be a year.
    years = years.where(years % 1 == 0)
    years = years.where(~years.between(10, 99), years + 1900)
    # Format as zero-padded text so that '%Y' (exactly four digits) decides validity.
    year_labels = years.map(lambda year: '%04d' % year, na_action='ignore')
    return pd.to_datetime(year_labels, format='%Y', errors='coerce')


def clean_dataframe(df):
    """Return a cleaned copy of one raw result table; ``df`` itself is not modified.

    Steps:
      1. Rename the first matching source header of each entry in
         ``selected_columns`` to its common name.
      2. Drop the last row when more than half of its cells are missing.
      3. Keep only the renamed columns.
      4. ``number``: remove enclosing parentheses from string values.
      5. ``time``: parse ``H:MM.SS,f`` or ``MM.SS,f`` strings to timestamps
         (``NaT`` when neither format matches).
      6. ``birthday``: parse the birth year to a timestamp (``NaT`` when invalid).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table as read from one CSV file. It may be empty.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the recognised columns, in the order of
        ``selected_columns``.
    """
    # Common column name -> header variants found in the source files.
    # Source columns deliberately left out: 'lieu', 'pénalité', 'retard', 'overall',
    # 'moyenne', 'Rückstand', 'pays/lieu', 'temps-net'.
    selected_columns = {
        'category': ['catégorie', 'Kategorie'],
        'name': ['nom', 'nom/lieu', 'Name/Ort'],
        'rank': ['Rang', 'rang'],
        'team': ['équipe', 'Team/Ortschaft', 'équipe/lieu'],
        'acode': ['acode'],
        'time': ['Zeit', 'temps'],
        'birthday': ['an', 'Jg'],
        'number': ['Stnr', 'doss'],
    }

    # Only the first variant present is renamed, so a common name never appears twice.
    rename_table = {}
    for final_column_name, equivalent_columns in selected_columns.items():
        for column in equivalent_columns:
            if column in df.columns:
                rename_table[column] = final_column_name
                break

    dataframe = df.rename(columns=rename_table)

    # Drop the last row if most of it is missing. This is done by position, so a
    # duplicated index label cannot remove more than that single row.
    if len(dataframe) > 0 and dataframe.iloc[-1].isnull().sum() > len(dataframe.columns) / 2:
        dataframe = dataframe.iloc[:-1]

    dataframe = dataframe[list(rename_table.values())].copy()

    # Whole columns are replaced at once instead of writing cell by cell.
    if 'number' in dataframe.columns:
        dataframe['number'] = dataframe['number'].map(_strip_parentheses)
    if 'time' in dataframe.columns:
        dataframe['time'] = _parse_times(dataframe['time'])
    if 'birthday' in dataframe.columns:
        dataframe['birthday'] = _parse_birth_years(dataframe['birthday'])

    return dataframe

### Runners: A

In [None]:
# Peek at the first lines of one raw CSV file (shell `head`) before parsing it with pandas.
!head ./Data/Lausanne_Marathon_2016/services.datasport.com_2016_lauf_lamara_ALFAA.HTM.csv

In [None]:
# Load a single raw file; its first CSV column is used as the row index.
alfaa_csv_path = DATA_DIR + '/services.datasport.com_2016_lauf_lamara_ALFAA.HTM.csv'
df_2016_lauf_lamara_ALFAA = pd.read_csv(alfaa_csv_path, index_col=0)
df_2016_lauf_lamara_ALFAA.head()

In [None]:
# Try the cleaning on that single file and show the first rows of the result.
cleaned_preview = clean_dataframe(df_2016_lauf_lamara_ALFAA)
cleaned_preview.head()

### All runners

In [None]:
dataframes = []
# os.listdir() returns entries in arbitrary order; sorting makes the row order of
# the merged result reproducible from one run to the next.
for filename in sorted(os.listdir(DATA_DIR)):
    # Skip anything that is not a CSV file (hidden files, checkpoint folders, ...),
    # which pd.read_csv could not parse.
    if not filename.lower().endswith('.csv'):
        continue
    print('Read and clean: ' + filename)
    uncleaned_df = pd.read_csv(DATA_DIR + '/' + filename, index_col=0)
    cleaned_df = clean_dataframe(uncleaned_df)
    dataframes.append(cleaned_df)

In [None]:
# Stack the cleaned frames row-wise; each frame keeps its own row labels.
merged_df = pd.concat(dataframes, axis=0, ignore_index=False)
merged_df.head(n=5)

In [None]:
# Save the merged table next to the source directory, using the directory path as
# the file stem so the location is defined by DATA_DIR only.
merged_df.to_csv(DATA_DIR + '.csv')
merged_df.to_pickle(DATA_DIR + '.pickle')

In [None]:
# Show the first lines of the merged CSV file to check what was written.
!head './Data/Lausanne_Marathon_2016.csv'

## All Marathon Lausanne

In [None]:
# Sub-directories of ./Data to process, one per race edition, newest first.
lauf_directories = [
    'Lausanne_Marathon_2016',
    'Lausanne_Marathon_2015',
    'Lausanne_Marathon_2014',
    'Lausanne_Marathon_2013',
    'Lausanne_Marathon_2012',
    'Lausanne_Marathon_2011',
    'Lausanne_Marathon_2010',
    'Lausanne_Marathon_2009',
    'Lausanne_Marathon_2008',
    'Lausanne_Marathon_2007',
    'Lausanne_Marathon_2006',
    'Lausanne_Marathon_2005',
    'Lausanne_Marathon_2004',
    'Lausanne_Marathon_2003',
    'Lausanne_Marathon_2002',
    'Lausanne_Marathon_2001',
    'Lausanne_Marathon_2000',
    'Lausanne_Marathon_1999',
]

# For each edition: clean every CSV file of its directory, merge the results and
# write them to '<directory>.csv' and '<directory>.pickle'.
for directory in lauf_directories:
    directory_path = './Data/' + directory
    print('Start ' + directory_path)
    dataframes = []
    # Sorted because os.listdir() returns entries in arbitrary order; this keeps
    # the row order of each merged file reproducible.
    for filename in sorted(os.listdir(directory_path)):
        # Skip entries that are not CSV files; pd.read_csv could not parse them.
        if not filename.lower().endswith('.csv'):
            continue
        print('    Read and clean: ' + filename)
        uncleaned_df = pd.read_csv(directory_path + '/' + filename, index_col=0)
        dataframes.append(clean_dataframe(uncleaned_df))
    merged_df = pd.concat(dataframes)
    merged_df.to_csv(directory_path + '.csv')
    merged_df.to_pickle(directory_path + '.pickle')
    print('End ' + directory_path + '\n')