In [None]:
# Data analysis 
import pandas as pd
import numpy as np

# Utils
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import FloatProgress
import collections
from IPython.display import display
from datetime import date
from dateutil.relativedelta import relativedelta

# wrangling data


In [None]:
# Relative path of the pickle file holding the Lausanne Marathon 2016 results.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'
# Name of the column that assign_sex() adds to the race DataFrame.
SEX_COLUMN_NAME = 'Sex'

In [None]:
# Load the race results from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
lausanne_race = pd.read_pickle(PATH_TO_DATA)

In [None]:
# Preview the first rows of the raw race data.
lausanne_race.head()

In [None]:
def assign_sex (data_race, name_column):
    '''
    Assign a sex label to every runner, derived from the category code.

    The category is searched for capital letters: 'H' or 'G' -> 'Men',
    'F' or 'D' -> 'Women', 'M' -> 'Mixte'. A runner gets None when the
    category matches no letter, when it matches two letters of the same
    sex (ambiguous code), or when it is not a string (missing value).
    When letters of several sexes appear, the last matching entry of the
    map wins.

    variables
        data_race : DataFrame describing the race; must have a 'catégorie' column.
        name_column : name of the column that will hold the sex label.

    returns
        A copy of data_race with the extra column; data_race itself is
        left untouched.
    '''
    map_letter_sex =   {
                        'Men' : ['H', 'G'],
                        'Women': ['F', 'D'],
                        'Mixte': ['M']
                    }

    # related to progress bar.
    progress_bar = FloatProgress(min=0, max=len(data_race))
    display(progress_bar)

    # Exactly one label is produced per row, so the list always has the
    # same length as the DataFrame it is assigned to.
    sex_runner_list = []
    for runner in data_race.itertuples() :
        sex_runner_list.append(_sex_from_category(runner.catégorie, map_letter_sex))
        progress_bar.value += 1

    data_race_copy = data_race.copy()
    data_race_copy[name_column] = sex_runner_list
    return data_race_copy


def _sex_from_category(category, map_letter_sex):
    '''Return the sex encoded in one category code, or None if it cannot be decided.'''
    # Missing categories (None / NaN) carry no information.
    if not isinstance(category, str):
        return None

    sex_runner = None
    for sex, letters in map_letter_sex.items():
        matches = sum(letter in category for letter in letters)
        if matches > 1:
            # Two letters of the same sex in one code: ambiguous, leave undecided.
            return None
        if matches == 1:
            sex_runner = sex
    return sex_runner


def compute_difference_Junior_adult (data_race):
    '''
    Label every runner as 'Junior' or 'Adult' in a new 'type' column.

    A runner is a junior when the category code contains 'Jun';
    everybody else is an adult.

    variables
        data_race : DataFrame describing the race; must have a 'catégorie' column.

    returns
        A copy of data_race with the 'type' column added.
    '''
    junior_marker = 'Jun'

    # One progress-bar tick per runner.
    bar = FloatProgress(min=0, max=len(data_race))
    display(bar)

    labels = []
    for row in data_race.itertuples():
        has_marker = row.catégorie.find(junior_marker) >= 0
        labels.append('Junior' if has_marker else 'Adult')
        bar.value += 1

    labelled_race = data_race.copy()
    labelled_race['type'] = labels
    return labelled_race


def compute_age (data_race, reference_date=None):
    '''
    Add an 'Age' column holding each runner's age in whole years.

    variables
        data_race : DataFrame describing the race; the 'an' column is passed
                    to calculate_age as the birth date, so its values need
                    year, month and day attributes.
        reference_date : date at which the ages are evaluated. Defaults to
                         today's date; pass the race day to get results
                         that do not change with the day the notebook runs.

    returns
        A copy of data_race with the 'Age' column added.
    '''
    if reference_date is None:
        reference_date = date.today()

    # related to progress bar.
    progress_bar = FloatProgress(min=0, max=len(data_race))
    display(progress_bar)

    age_list = []
    # Loop on each row of the dataframe.
    for runner in data_race.itertuples() :
        age_list.append(calculate_age(runner.an, reference_date))
        progress_bar.value += 1

    data_race_copy = data_race.copy()
    data_race_copy['Age'] = age_list
    return data_race_copy

def calculate_age(born,today):
    '''
    Return the number of full years between born and today.

    Both arguments need year, month and day attributes. One year is
    subtracted when the birthday has not yet been reached in today's year.
    '''
    elapsed_years = today.year - born.year
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return elapsed_years - birthday_pending

In [None]:
# Add the sex column, derived from each runner's category code.
lausanne_race_sex_runner = assign_sex(lausanne_race,SEX_COLUMN_NAME)

In [None]:
# Show the first 100 rows to spot-check the assigned sex labels.
lausanne_race_sex_runner.head(100)

In [None]:
# List the distinct sex labels that were assigned.
lausanne_race_sex_runner[SEX_COLUMN_NAME].unique()

In [None]:
# Categories of the runners that were labelled neither 'Men' nor 'Women'.
lausanne_race_sex_runner.loc[
    ~lausanne_race_sex_runner[SEX_COLUMN_NAME].isin(['Men', 'Women']),
    'catégorie'
].unique()

According to the race schedule (http://fr.lausanne-marathon.com/info-course/horaires/), the Pink_ch, 10W-Walk and 10W-NW categories are walking events, not running races. They are not relevant for our study, so we can simply remove the people belonging to these categories.

In [None]:
# Keep only the runners labelled 'Men' or 'Women'.
lausanne_race_sex_runner = lausanne_race_sex_runner[
    lausanne_race_sex_runner[SEX_COLUMN_NAME].isin(['Men', 'Women'])
]

In [None]:
# Number of runners left after the filtering step.
len(lausanne_race_sex_runner)

We have a total of 11248 runners for the Lausanne Marathon 2016.

We need to compute the exact time for each runner.

In [None]:
# Disabled clean-up step, kept for reference: it would strip the "1900-01-01 "
# date prefix from the string values of the 'temps' and 'retard' columns.
#lausanne_race_sex_runner['temps'] = lausanne_race_sex_runner['temps'].apply(lambda x: x.replace("1900-01-01 ", ""))
#lausanne_race_sex_runner['retard'] = lausanne_race_sex_runner['retard'].apply(lambda x: x.replace("1900-01-01 ", ""))

In [None]:
# Preview the filtered data.
lausanne_race_sex_runner.head()

In [None]:
# Convert the rank to a plain integer; int(float(x)) also accepts values
# such as 12.0 or '12.0'.
# The column is rebuilt with .assign (which returns a new DataFrame) instead
# of being written into the filtered slice, which avoids pandas'
# SettingWithCopyWarning.
lausanne_race_sex_runner = lausanne_race_sex_runner.assign(
    rang=lausanne_race_sex_runner['rang'].apply(lambda x : int(float(x)))
)
lausanne_race_sex_runner.head()

# Statistical study

## Overall Study

In [None]:
# Number of runners per sex, as a bar chart.
# The trailing ';' keeps the notebook from printing the Axes repr.
lausanne_race_sex_runner[SEX_COLUMN_NAME].value_counts().plot.bar(title='Distribution men/women for the Lausanne Marathon 2016');

The Lausanne Marathon had a significantly higher percentage of male than female runners.

In [None]:
# Number of junior and adult runners, as a bar chart.
# The trailing ';' keeps the notebook from printing the Axes repr.
compute_difference_Junior_adult(lausanne_race_sex_runner)['type'].value_counts().plot.bar(title='Distribution adult/junior for the Lausanne Marathon 2016');

In [None]:
# Runners whose 'équipe' value is null are counted as individual entries,
# all the others as team members.
# The trailing ';' keeps the notebook from printing the Axes repr.
pd.Series(
    {
        'Individual' : lausanne_race_sex_runner['équipe'].isnull().sum(),
        'Team' : lausanne_race_sex_runner['équipe'].notnull().sum()
    }
).plot.bar(title='Distribution Team/individual for the Lausanne Marathon 2016');

## Demographic Study

--> study the age of the runners
--> study the places the runners come from.

In [None]:
# Add the 'Age' column.
# NOTE(review): compute_age evaluates the ages at today's date by default,
# not at the race date, so the values change with the day the notebook is run.
lausanne_race_age = compute_age(lausanne_race_sex_runner)

Display the age distribution of the runners of the Lausanne Marathon 2016.

In [None]:
# Histogram of the runners' ages, cast to integers before plotting.
# Missing ages are dropped first because int() cannot convert NaN.
lausanne_race_age['Age'].dropna().apply(lambda x : int(float(x))).hist()

## Category Study

In [None]:
# List the distinct race categories that remain after filtering.
lausanne_race_sex_runner['catégorie'].unique()

--> make a study for each running event --> 10K, 21K, 42K
--> make a study over all distances.