In [None]:
# Data analysis 
import pandas as pd
import numpy as np
import re

# Utils
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import FloatProgress
import collections
from IPython.display import display
from datetime import date
from dateutil.relativedelta import relativedelta

In [None]:
# Pickle file loaded with pd.read_pickle to build the runners table.
# NOTE(review): relative path — presumably resolved against the notebook's working directory; confirm.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'
# Patterns searched with re.search in a runner's category by get_sex_of_runner.
# re.search matches anywhere in the string, so a single 'W' or 'D' (resp. 'H')
# in the category is enough to classify the runner as female (resp. male).
FEMALE_CATEGORY_REGEX = '([WD])|(Fille)|(JunF)'
MALE_CATEGORY_REGEX = '([H])|(Gar)|(JunG)'

In [None]:
def get_sex_of_runner(runner):
    '''
    Returns the sex of runner based on the category in which runner has done the marathon.

    Parameters
        - runner: row representing the runner (must expose a 'category' entry)

    Return
        - string ('female'/'male') or None if sex was not retrieved
          (ignored walking category, missing category, or no pattern matched)
    '''

    category = runner['category']

    # A missing category (None/NaN) cannot be classified; without this guard
    # re.search would raise a TypeError on the non-string value.
    if not isinstance(category, str):
        return None

    # We ignore these specific categories as they are not useful and can be misleading
    # (their 'W' would otherwise match the female pattern).
    if category in ('10W-NW', '10W-Walk'):
        return None

    # The female pattern is tested first, so a category matching both patterns
    # is reported as 'female'.
    if re.search(FEMALE_CATEGORY_REGEX, category) is not None:
        return 'female'
    if re.search(MALE_CATEGORY_REGEX, category) is not None:
        return 'male'
    return None


def get_type_of_runner(runner):
    '''
    Classifies the runner as junior or adult from the name of the category.

    Parameters
        - runner: row representing the runner

    Return
        - 'junior' when the category contains 'Jun', 'adult' otherwise
    '''

    # str.find returns -1 when the marker is absent from the category.
    jun_position = runner['category'].find('Jun')
    return 'adult' if jun_position == -1 else 'junior'
    

def compute_age_of_runner(runner, reference_date=None):
    '''
    Returns the age of runner in whole years, based on their date of birth.

    Parameters
        - runner: row representing the runner; runner['birthday'] must expose
          year, month and day attributes (e.g. datetime.date or pandas Timestamp)
        - reference_date: date at which the age is evaluated (optional).
          Defaults to today's date, which keeps the historical behaviour; pass a
          fixed date (e.g. the race date) to get a result that does not change
          with the day on which the notebook is run.

    Return
        - age of runner (int) at reference_date
    '''

    if reference_date is None:
        reference_date = date.today()

    birthday = runner['birthday']
    # One year is subtracted when the birthday has not yet occurred in the reference year.
    birthday_not_reached = (reference_date.month, reference_date.day) < (birthday.month, birthday.day)
    return reference_date.year - birthday.year - int(birthday_not_reached)

# Data wrangling

In [None]:
# Load the pickled runners table and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
lausanne_marathon_2016 = pd.read_pickle(PATH_TO_DATA)
lausanne_marathon_2016.head()

In [None]:
# Derive each runner's sex from the category, row by row (None when it cannot be determined).
lausanne_marathon_2016['sex'] = lausanne_marathon_2016.apply(get_sex_of_runner, axis=1)

In [None]:
# Count runners per sex; dropna=False also reports the rows whose sex could not be determined.
lausanne_marathon_2016['sex'].value_counts(dropna=False)

Note: Here, we remove the runners registered in the *Pink_Ch*, *10W-Walk* or *10W-NW* categories, as these categories are not part of the Lausanne Marathon races themselves and their names would be misleading for sex attribution (for instance, the `W` in *10W-Walk* and *10W-NW* would otherwise match the female pattern). For more information about these categories, check <a href="http://fr.lausanne-marathon.com/info-course/horaires/">Lausanne Marathon's official website</a>.

In [None]:
# Keep only the runners whose sex could be determined, as an independent copy of the raw table.
lausanne_marathon_2016_cleaned = lausanne_marathon_2016.dropna(subset=['sex']).copy()

In [None]:
# Number of runners kept after removing the rows without a sex.
len(lausanne_marathon_2016_cleaned)

We have a total of 12060 runners on whom we can run our analysis for Lausanne Marathon 2016.

We also tag each runner as junior or adult (*type* column) and convert the *rank* column to integers.

In [None]:
# Tag each runner as 'junior' or 'adult' from the category, row by row.
lausanne_marathon_2016_cleaned['type'] = lausanne_marathon_2016_cleaned.apply(get_type_of_runner, axis=1)

In [None]:
# Convert the rank to integers with vectorized casts instead of a per-element Python lambda.
# Going through float first mirrors the previous int(float(x)) conversion
# (presumably ranks are stored as floats or numeric strings — confirm).
lausanne_marathon_2016_cleaned['rank'] = lausanne_marathon_2016_cleaned['rank'].astype(float).astype('int64')
lausanne_marathon_2016_cleaned.head()

# Statistical study

## Overall study

In [None]:
# Bar chart of the number of runners per sex (title typo fixed, axes labelled).
ax = lausanne_marathon_2016_cleaned['sex'].value_counts().plot.bar(
    title='Distribution men/women for the Lausanne Marathon 2016'
)
ax.set_xlabel('Sex')
# The trailing semicolon hides the textual repr of the last expression in the notebook output.
ax.set_ylabel('Number of runners');

The Lausanne marathon race had a significantly higher percentage of male runners than female ones.

In [None]:
# Bar chart of the number of runners per type, junior or adult (title typo fixed, axes labelled).
ax = lausanne_marathon_2016_cleaned['type'].value_counts().plot.bar(
    title='Distribution adult/junior for the Lausanne Marathon 2016'
)
ax.set_xlabel('Type of runner')
# The trailing semicolon hides the textual repr of the last expression in the notebook output.
ax.set_ylabel('Number of runners');

Not surprisingly, only a very small share of the runners in the Lausanne Marathon 2016 were juniors.

In [None]:
# Runners without a team value are counted as individual runners, the others as team members.
team_distribution = pd.Series(
    {
        'Individual runners': lausanne_marathon_2016_cleaned['team'].isnull().sum(),
        'Runners in teams': lausanne_marathon_2016_cleaned['team'].notnull().sum()
    }
)
# Title typo fixed and y axis labelled so that the figure can be read on its own.
ax = team_distribution.plot.bar(title='Distribution Team/individual for the Lausanne Marathon 2016')
# The trailing semicolon hides the textual repr of the last expression in the notebook output.
ax.set_ylabel('Number of runners');

Interestingly, a non-negligible share of the runners were part of a team.

## Demographic study

In this part, we focus on the age of the runners, as well as the place they come from.

In [None]:
# Compute each runner's age from the birthday, row by row.
# NOTE(review): ages are evaluated against the current date (date.today()), not the race date,
# so the values change with the day on which the notebook is run.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned.apply(compute_age_of_runner, axis=1)
# Force plain integer ages; int(float(x)) raises ValueError on NaN, so a missing age stops the cell here.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned['age'].apply(lambda x : int(float(x)))

Display the age distribution of the runners in the Lausanne Marathon 2016.

In [None]:
# Histogram of the runners' ages, with a title and axis labels so the figure can be read on its own.
ax = lausanne_marathon_2016_cleaned['age'].hist()
ax.set_title('Age distribution of runners for the Lausanne Marathon 2016')
ax.set_xlabel('Age (years)')
# The trailing semicolon hides the textual repr of the last expression in the notebook output.
ax.set_ylabel('Number of runners');

## Category study

In [None]:
# List the distinct category values present in the cleaned table.
lausanne_marathon_2016_cleaned['category'].unique()

TODO: Run the study separately for each race event:
<br>
10K, 21K, 42K
<br>
TODO: Run the study across all distances combined.