In [None]:
# Data analysis 
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from ipywidgets import FloatProgress
import collections
from IPython.display import display
from datetime import date
from dateutil.relativedelta import relativedelta
from matplotlib.pyplot import show

In [None]:
# Relative path of the pickle file holding the Lausanne Marathon 2016 data.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'
# Patterns searched (re.search) in a runner's category string to infer the sex.
# NOTE(review): 'W'/'D' and 'H' are presumably the women's/men's category letters - confirm against the data.
FEMALE_CATEGORY_REGEX = '([WD])|(JunF)'
MALE_CATEGORY_REGEX = '([H])|(JunG)'
# Patterns searched (re.search) in the category string to infer the race distance (42, 21 or 10).
MARATHON_DISTANCE_REGEX = '42-'
SEMI_MARATHON_DISTANCE_REGEX = '21-'
QUARTER_MARATHON_DISTANCE_REGEX = '10-'

In [None]:
def get_sex_of_runner(runner):
    '''
    Infers the sex of a runner from the category in which the runner competed.
    
    Parameters
        - runner: row representing the runner (its 'category' entry is read)
    
    Return
        - string ('female'/'male') or None if the sex could not be inferred
    '''
    
    category = runner['category']
    
    # These two categories are skipped on purpose: they are not useful here and
    # their names would be misleading for the pattern search below.
    if category in ('10W-NW', '10W-Walk'):
        return None
    
    # The female pattern is tried first, then the male one.
    patterns_by_sex = (
        ('female', FEMALE_CATEGORY_REGEX),
        ('male', MALE_CATEGORY_REGEX),
    )
    for sex, pattern in patterns_by_sex:
        if re.search(pattern, category) is not None:
            return sex
    
    return None


def get_type_of_runner(runner):
    '''
    Classifies the runner as junior or adult.
    
    Parameters
        - runner: row representing the runner (its 'category' entry is read)
        
    Return
        - string: 'junior' when the category contains 'Jun', 'adult' otherwise
    '''
    
    is_junior = runner['category'].find('Jun') >= 0
    return 'junior' if is_junior else 'adult'
    
def compute_distance_from_category(runner):
    '''
    Infers the race distance of the runner from the category string.
    
    Parameters
        - runner: row representing the runner (its 'category' entry is read)
        
    Return
        - distance of runner (int: 42, 21 or 10), or None when no distance pattern matches
    '''
    category = runner['category']
    
    # Patterns are tried in this order; the first one found wins.
    distance_by_pattern = (
        (MARATHON_DISTANCE_REGEX, 42),
        (SEMI_MARATHON_DISTANCE_REGEX, 21),
        (QUARTER_MARATHON_DISTANCE_REGEX, 10),
    )
    for pattern, distance in distance_by_pattern:
        if re.search(pattern, category) is not None:
            return distance
    
    return None

def compute_age_of_runner(runner, reference_date=None):
    '''
    Returns the age of runner, based on the date of birth.
    
    Parameters
        - runner: row representing the runner (its 'birthday' entry must expose
          year, month and day attributes)
        - reference_date: date at which the age is evaluated (anything exposing
          year, month and day). Defaults to today's date. Pass the race date to
          get the age on race day and to make the result reproducible.
        
    Return
        - age of runner in full years at reference_date (int)
    '''
    
    if reference_date is None:
        reference_date = date.today()
    
    birthday = runner['birthday']
    
    # One year is removed when the birthday has not yet been reached in the reference year.
    birthday_not_reached = (reference_date.month, reference_date.day) < (birthday.month, birthday.day)
    return reference_date.year - birthday.year - birthday_not_reached

def compute_run_in_Team(runner):
    '''
    Tells whether the runner took part alone or within a team.
    
    Parameters
        - runner: row representing the runner (its 'team' entry is read)
        
    Return
        - string: 'Individual runners' when 'team' is null, 'Runners in teams' otherwise
    '''
    
    has_no_team = pd.isnull(runner['team'])
    return 'Individual runners' if has_no_team else 'Runners in teams'

def format_time(runner):
    '''
    Converts the time of a runner into a number of seconds.
    
    Parameters
        - runner: row representing the runner (its 'time' entry is a datetime-like
          value exposing a time() method, or a missing value)
        
    Return
        - hours, minutes and seconds of runner['time'] expressed in seconds (float),
          or None if the time is missing
    '''
    finish_time = runner['time']
    
    # Missing values (None/NaN/NaT) must be caught before calling .time() on them.
    if pd.isnull(finish_time):
        return None
    
    # A plain number is taken as already being in seconds, so that applying the
    # conversion a second time on the same column does not fail.
    if isinstance(finish_time, (int, float)):
        return float(finish_time)
    
    clock = finish_time.time()
    return datetime.timedelta(hours=clock.hour, minutes=clock.minute, seconds=clock.second).total_seconds()
    

def convert_seconds_to_time(seconds):
    '''
    Formats a duration given in seconds as 'H:MM:SS'.
    
    Parameters
        - seconds: duration in seconds (number)
        
    Return
        - string with hours (not zero-padded), minutes and seconds (both on two digits)
    '''
    total_minutes, remaining_seconds = divmod(seconds, 60)
    hours, remaining_minutes = divmod(total_minutes, 60)
    return '{:d}:{:02d}:{:02d}'.format(int(hours), int(remaining_minutes), int(remaining_seconds))

# Data wrangling

In [None]:
# Load the Lausanne Marathon 2016 data from the pickle file at PATH_TO_DATA.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files coming from a trusted source.
lausanne_marathon_2016 = pd.read_pickle(PATH_TO_DATA)
# Preview the first rows of the table.
lausanne_marathon_2016.head()

In [None]:
# Infer the sex of each runner row by row from the category string (None when it cannot be inferred).
lausanne_marathon_2016['sex'] = lausanne_marathon_2016.apply(get_sex_of_runner, axis=1)

In [None]:
# Number of runners per inferred sex; dropna=False also counts the runners for whom no sex was found.
lausanne_marathon_2016['sex'].value_counts(dropna=False)

Note: Here, we remove people that are part of *Pink_Ch*, *10W-Walk* or *10W-NW* categories as these categories do not belong to Lausanne Marathon, and as their name can be misleading with sex attribution. For more information about these categories, check <a href="http://fr.lausanne-marathon.com/info-course/horaires/">Lausanne Marathon's official website</a>.

In [None]:
# Keep only the runners whose sex could be inferred; work on a copy so that the
# columns added later do not touch the raw table.
has_known_sex = lausanne_marathon_2016['sex'].notnull()
lausanne_marathon_2016_cleaned = lausanne_marathon_2016.loc[has_known_sex].copy()

In [None]:
# Number of runners kept for the analysis.
lausanne_marathon_2016_cleaned.shape[0]

We have a total of 12060 runners on whom we can run our analysis for Lausanne Marathon 2016.

We also convert the *rank* column to an integer.

In [None]:
# Label each runner as 'junior' or 'adult' from the category string.
lausanne_marathon_2016_cleaned['type'] = lausanne_marathon_2016_cleaned.apply(get_type_of_runner, axis=1)

In [None]:
# Convert 'rank' to int; going through float() first also accepts values such as 12.0 or '12.0'.
# NOTE(review): presumably ranks are stored as floats or numeric strings - confirm against the data.
lausanne_marathon_2016_cleaned['rank'] = lausanne_marathon_2016_cleaned['rank'].apply(lambda x : int(float(x)))
# Preview the first rows after the conversion.
lausanne_marathon_2016_cleaned.head()

In [None]:
# Age of each runner computed row by row from 'birthday' (see compute_age_of_runner).
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned.apply(compute_age_of_runner, axis=1)
# Force every age to a plain int.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned['age'].apply(lambda x : int(float(x)))

In [None]:
# Replace 'time' by its value in seconds (see format_time).
# The column is overwritten in place: on a second run, format_time receives the seconds
# produced by the first run instead of the original values.
lausanne_marathon_2016_cleaned['time'] = lausanne_marathon_2016_cleaned.apply(format_time, axis=1)

# Statistical study

## Overall study

In [None]:
# Number of runners per sex, each bar annotated with its share of all runners.
plot = sns.countplot(x='sex', data=lausanne_marathon_2016_cleaned)
total = len(lausanne_marathon_2016_cleaned)
for bar in plot.patches:
    share = bar.get_height() * 100 / total
    # The label is placed slightly right of the bar's left edge and just above its top.
    label_position = (bar.get_x() + 0.1, bar.get_height() + 50)
    plot.annotate('%{:.1f}'.format(share), label_position)

The Lausanne marathon race had a significantly higher percentage of male runners than female ones.

In [None]:
# Number of runners per type (junior/adult), each bar annotated with its share of all runners.
plot = sns.countplot(x='type', data=lausanne_marathon_2016_cleaned)
total = len(lausanne_marathon_2016_cleaned)
for bar in plot.patches:
    share = bar.get_height() * 100 / total
    # The label is placed slightly right of the bar's left edge and just above its top.
    label_position = (bar.get_x() + 0.1, bar.get_height() + 50)
    plot.annotate('%{:.1f}'.format(share), label_position)

Not surprisingly, only a very small share of the runners in the 2016 Lausanne Marathon were juniors.

An interesting fact is that a non-negligible share of the runners ran as part of a team.

## Demographic study

In this part, we plan to focus on age of runners, but also the place from which they come.

We first display the age distribution of runners in the marathon 2016 of Lausanne.

In [None]:
# Ages of all runners, and the sex used to split the statistics.
ages = lausanne_marathon_2016_cleaned['age']
sexes = lausanne_marathon_2016_cleaned['sex']

fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
ax.hist(ages, bins=30)

# Display the overall mean age as a dashed vertical line, and the titles.
# ymin/ymax are left at their defaults so that the line spans the whole axes height.
mean_age_all = np.mean(ages)
ax.axvline(mean_age_all, color='r', linestyle='--')
ax.set_title('Age Distribution of Runners')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Runners')

# Calculate age distribution statistics, overall and by gender.
# Each label is paired with the ages it describes so that they cannot be mixed up.
age_groups = [
    ('Mean Age', ages),
    ('Mean Age (Female)', ages[sexes == 'female']),
    ('Mean Age (Male)', ages[sexes == 'male']),
]
age_stats = '\n'.join(
    '{}: {}\nSD: {}'.format(label, round(np.mean(values), 2), round(np.std(values), 2))
    for label, values in age_groups
)

# Add the legend text in a box in the top-right corner of the axes.
props = dict(boxstyle='round', facecolor='white', alpha=0.8)
ax.text(.95, .95, age_stats, fontsize=11, transform=ax.transAxes, va='top', ha='right', bbox=props, multialignment='left');

## Performance according to age of participants

In [None]:
# Box plot of the time (in seconds) for each age.
# NOTE(review): factorplot/size are the names used by older seaborn releases; newer ones
# call them catplot/height - confirm against the installed version.
grid = sns.factorplot(data=lausanne_marathon_2016_cleaned, x='age', y='time', kind='box', size=10, aspect=1.5)

# Label every y tick with the time found at that tick position. Building the labels from
# the runners' times instead would attach unrelated values to the ticks.
for axis in grid.axes.flat:
    axis.yaxis.set_major_formatter(plt.FuncFormatter(lambda seconds, position: convert_seconds_to_time(seconds)))

grid.fig.suptitle('Distibution time by age')

## Category study

In [None]:
# Race distance (42, 21 or 10) inferred from the category string; None when no distance pattern matches.
lausanne_marathon_2016_cleaned['distance (Km)'] = lausanne_marathon_2016_cleaned.apply(compute_distance_from_category, axis=1)

Now that we have the distance run by each runner, we need to compute the average speed (in m/s) of each runner.

In [None]:
# Average speed in m/s: the distance is converted from km to m, then divided by the time in seconds.
# NOTE(review): the nominal distances 42/21/10 are used, not the exact course lengths - confirm this is intended.
lausanne_marathon_2016_cleaned['Speed by kilometer (m/s)'] = lausanne_marathon_2016_cleaned['distance (Km)']*1000/lausanne_marathon_2016_cleaned['time']

In [None]:
# Box plot of the speed for each race distance.
ax = sns.factorplot(
    x='distance (Km)',
    y='Speed by kilometer (m/s)',
    data=lausanne_marathon_2016_cleaned,
    kind='box',
    size=5,
    aspect=1.5,
)
ax.fig.suptitle('Distibution speed by kilometer')

As expected, the fastest runners are found in the 10 km race: since the distance is shorter, a higher speed can be sustained.
Unexpectedly, however, the average speed appears very similar across races despite the difference in distance. To understand this phenomenon, we need to go further and study the runners' past results.

In [None]:
# Box plot of the speed per sex, with one panel per race distance.
g = sns.factorplot(
    x='sex',
    y='Speed by kilometer (m/s)',
    col='distance (Km)',
    data=lausanne_marathon_2016_cleaned,
    kind='box',
    size=5,
)

In [None]:
# Flag each runner as 'Individual runners' or 'Runners in teams', depending on whether 'team' is null.
lausanne_marathon_2016_cleaned['type_team'] = lausanne_marathon_2016_cleaned.apply(compute_run_in_Team, axis=1)

In [None]:
# Number of individual/team runners, with one bar per race distance.
plot = sns.countplot(data=lausanne_marathon_2016_cleaned, hue='distance (Km)', x='type_team')
plt.title('Distibution indivudal/team runners for the lausanne race 2016')
plot.set_xlabel('')
plot.set_ylabel('Number of Runners')

# Number of runners in each race (10, 21 and 42 km, in that order).
distances = lausanne_marathon_2016_cleaned['distance (Km)']
totals = [len(lausanne_marathon_2016_cleaned[distances == km]) for km in (10, 21, 42)]

# Annotate every bar with its share of the runners of its own race.
# Bars are taken two by two: the first pair is divided by the 10 km total, the second
# by the 21 km total and the third by the 42 km total.
# NOTE(review): this relies on plot.patches listing the two bars of each race next to
# each other, in increasing distance order - confirm with the installed seaborn version.
for position, bar in enumerate(plot.patches):
    race_total = totals[position // 2]
    plot.annotate('%{:.1f}'.format(bar.get_height()*100/race_total), (bar.get_x()+0.1, bar.get_height()+50))

The split between individual and team runners looks fairly similar across the different races offered by the event.
Teams are most present in the 10K. This distance seems better suited to teams: it is short enough to be a good challenge among beginner friends.

Let's see if being in a team has a real impact on performance.

In [None]:
# Box plot of the speed for individual/team runners, with one panel per race distance.
g = sns.factorplot(
    x='type_team',
    y='Speed by kilometer (m/s)',
    col='distance (Km)',
    data=lausanne_marathon_2016_cleaned,
    kind='box',
    size=5,
)

On average, runners in teams actually perform better than individual runners, but the best performances come from individual runners.
From the graph above, we can see that for "average" runners being in a team seems to be a motivational factor, whereas excellent runners prefer to race alone.