In [None]:
# Data analysis 
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from matplotlib.pyplot import show


# our code (marked for autoreload at every cell execution - useful in development mode)
%load_ext autoreload
%autoreload 1
%aimport utils

In [None]:
# Location of the pickled race results, relative to this notebook.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'

# Resident figures for the Canton of Vaud; they can be found on the official government website:
# https://www.bfs.admin.ch/bfs/fr/home/statistiques/population.assetdetail.1500543.html.
TOTAL_RESIDENT_MALE = 381864
TOTAL_RESIDENT_FEMALE = 396501
# The cantonal total is the sum of both groups (381864 + 396501 = 778365).
TOTAL_RESIDENT_VAUD = TOTAL_RESIDENT_MALE + TOTAL_RESIDENT_FEMALE

# Data wrangling

In this first part, we retrieve data of Lausanne Marathon 2016 and we manipulate the different columns in order to exploit them during the analysis part.

First, we use the pickle format to allow faster import of the data, and correct formatting of some columns as date columns.

Note: Same data are also available in CSV (readable format).

In [None]:
# Load the raw race results from the pickle file referenced by PATH_TO_DATA.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
lausanne_marathon_2016 = pd.read_pickle(PATH_TO_DATA)
# Preview the first rows of the raw data.
lausanne_marathon_2016.head()

Some participants are part of categories which must be excluded as these categories are not representative or can be misleading for the analysis part.

In [None]:
# Row-wise predicate from utils: True for the participants that are kept.
is_participant_kept = lausanne_marathon_2016.apply(utils.filter_participants, axis=1)
# Take an independent copy so that columns added later do not touch the raw frame.
lausanne_marathon_2016_cleaned = lausanne_marathon_2016[is_participant_kept].copy()

Here, we know the categories of the runners, and these categories contain sex information about the participants (for a given type of race, there are two subcategories, one for male and one for female runners, as usual in such sports). Thus, we use this information to create a new column giving the sex of each participant.

In [None]:
# Add a 'sex' column computed row by row with utils.get_sex_of_runner.
# NOTE(review): the helper presumably derives the sex from the runner's category - confirm in utils.
lausanne_marathon_2016_cleaned['sex'] = lausanne_marathon_2016_cleaned.apply(utils.get_sex_of_runner, axis=1)

In [None]:
# Number of runners per sex; dropna=False also counts rows whose 'sex' value is missing (NaN).
lausanne_marathon_2016_cleaned['sex'].value_counts(dropna=False)

In [None]:
# Number of runners left after the filtering step.
len(lausanne_marathon_2016_cleaned)

At the end of this stage, we have a total of 11248 runners on whom we can run our analysis for Lausanne Marathon 2016.

We also extract from the category the fact that a runner is either junior or adult.

In [None]:
# Add a 'type' column computed row by row with utils.get_type_of_runner.
# NOTE(review): presumably yields junior/adult labels derived from the category - confirm in utils.
lausanne_marathon_2016_cleaned['type'] = lausanne_marathon_2016_cleaned.apply(utils.get_type_of_runner, axis=1)

We also transform the *number* and *rank* columns, as they contain integers.

In [None]:
# Convert BIB numbers to integers; going through float first accepts values such as '12.0'.
lausanne_marathon_2016_cleaned['number'] = lausanne_marathon_2016_cleaned['number'].map(
    lambda raw_number: int(float(raw_number))
)

In [None]:
# Convert ranks to integers; going through float first accepts values such as '3.0'.
lausanne_marathon_2016_cleaned['rank'] = lausanne_marathon_2016_cleaned['rank'].map(
    lambda raw_rank: int(float(raw_rank))
)

In [None]:
# Preview the cleaned data after the column conversions above.
lausanne_marathon_2016_cleaned.head()

We also compute the age of runner based on the birthyear, to easily manipulate such information later.

In [None]:
# Age of each runner, computed row by row with utils.compute_age_of_runner ...
runner_ages = lausanne_marathon_2016_cleaned.apply(utils.compute_age_of_runner, axis=1)
# ... then stored as integers (via float so that values such as 34.0 are accepted).
lausanne_marathon_2016_cleaned['age'] = runner_ages.map(lambda raw_age: int(float(raw_age)))

In [None]:
# Bin edges of the age groups. With right=False every bin is [low, high),
# so the first bin [15, 26) holds the ages 15 to 25 inclusive.
age_bin_edges = [15, 26, 31, 36, 41, 46, 51, 56, 61, 66, 100]
age_bin_labels = [
    '15-25 years', '26-30 years', '31-35 years', '36-40 years', '41-45 years',
    '46-50 years', '51-55 years', '56-60 years', '61-65 years', '65+ years',
]
# NOTE(review): the last bin is [66, 100) although it is labelled '65+ years';
# ages below 15 or from 100 upwards fall outside every bin and become NaN.
lausanne_marathon_2016_cleaned['age category'] = pd.cut(
    lausanne_marathon_2016_cleaned['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
)

If the time is in readable-format, it is difficult to use it this way for analysis as comparison can be misleading. For this reason, we decide to store time in raw format (i.e. number of seconds, without any format).

In [None]:
# Replace the 'time' column with the value returned row by row by utils.format_time.
# NOTE(review): presumably the running time as a raw number of seconds - confirm in utils.
lausanne_marathon_2016_cleaned['time'] = lausanne_marathon_2016_cleaned.apply(utils.format_time, axis=1)

Finally, we store the type of running the participants did, in order to make three distinct sets of data (10 km, semi-marathon and marathon).

In [None]:
# Add a 'distance (km)' column computed row by row with utils.compute_distance_from_category.
# The following cells compare this column against 10, 21 and 42.
lausanne_marathon_2016_cleaned['distance (km)'] = lausanne_marathon_2016_cleaned.apply(utils.compute_distance_from_category, axis=1)

In [None]:
# One subset per race distance (10 km, semi-marathon, marathon).
race_distance_km = lausanne_marathon_2016_cleaned['distance (km)']
lausanne_marathon_2016_10km = lausanne_marathon_2016_cleaned[race_distance_km == 10]
lausanne_marathon_2016_21km = lausanne_marathon_2016_cleaned[race_distance_km == 21]
lausanne_marathon_2016_42km = lausanne_marathon_2016_cleaned[race_distance_km == 42]

# Statistical study

## Overall study

In [None]:
# Building of DataFrame for plotting
total_runners = len(lausanne_marathon_2016_cleaned)
runner_sex = lausanne_marathon_2016_cleaned['sex']
total_runners_male = len(lausanne_marathon_2016_cleaned[runner_sex == 'male'])
total_runners_female = len(lausanne_marathon_2016_cleaned[runner_sex == 'female'])

# Share (in %) of each gender among the residents of Vaud and among the runners
vaud_information_population = pd.Series({
    'male': TOTAL_RESIDENT_MALE / TOTAL_RESIDENT_VAUD * 100,
    'female': TOTAL_RESIDENT_FEMALE / TOTAL_RESIDENT_VAUD * 100,
})
marathon_information_runner = pd.Series({
    'male': total_runners_male / total_runners * 100,
    'female': total_runners_female / total_runners * 100,
})
information_population = pd.DataFrame({
    'Canton of Vaud': vaud_information_population,
    'Lausanne Marathon': marathon_information_runner,
})
# Descending index order puts 'male' before 'female', which the annotation list below relies on
information_population = information_population.sort_index(ascending=False)

# Displaying data
plot = information_population.plot.bar(figsize=(10, 6), rot=0)
plot.set_title('Gender distribution Lausanne Marathon vs Canton of Vaud')
plot.set_ylabel('Percentage (%)')
plot.set_xlabel('Gender')

# One annotation per bar, in the order of plot.patches:
# the two resident bars (male, female) first, then the two runner bars (male, female)
annotations = [
    '{}\nresidents'.format(TOTAL_RESIDENT_MALE),
    '{}\nresidents'.format(TOTAL_RESIDENT_FEMALE),
    '{} runners'.format(total_runners_male),
    '{} runners'.format(total_runners_female),
]
for index, bar in enumerate(plot.patches):
    # From the third bar on (the runner bars) the label is shifted slightly to the right
    x_position = bar.get_x() + 0.012 if index > 1 else bar.get_x()
    plot.annotate(annotations[index], (x_position * 1.005, bar.get_height() * 1.005))

plot.legend(loc='upper left')

# Box summarising the total number of residents and runners
residents_line = 'Total residents: ' + str(TOTAL_RESIDENT_VAUD)
runners_line = 'Total runners: ' + str(total_runners)
stats = ' \n'.join([residents_line, runners_line])

# Add the box in the top-right corner of the axes
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
plot.text(.95, .95, stats, fontsize=12, transform=plot.transAxes, va='top', ha='right', bbox=props, multialignment='left')

As we can see, the Lausanne Marathon had a significantly higher percentage of male runners than female runners, compared to the gender distribution of the Canton of Vaud, which is nearly balanced between men and women.

In [None]:
# Building of DataFrame for plotting
total_runners_10 = len(lausanne_marathon_2016_10km)
total_runners_21 = len(lausanne_marathon_2016_21km)
total_runners_42 = len(lausanne_marathon_2016_42km)

# Number of male / female runners in each race
race_information_10 = pd.Series({
    sex: len(lausanne_marathon_2016_10km[lausanne_marathon_2016_10km['sex'] == sex])
    for sex in ('male', 'female')
})
race_information_21 = pd.Series({
    sex: len(lausanne_marathon_2016_21km[lausanne_marathon_2016_21km['sex'] == sex])
    for sex in ('male', 'female')
})
race_information_42 = pd.Series({
    sex: len(lausanne_marathon_2016_42km[lausanne_marathon_2016_42km['sex'] == sex])
    for sex in ('male', 'female')
})

# One column per race, ordered from the longest to the shortest distance
information_gender_race = pd.DataFrame({
    'Marathon': race_information_42,
    'Semi-marathon': race_information_21,
    '10 km': race_information_10,
})[['Marathon', 'Semi-marathon', '10 km']]

# Displaying of data
plot = information_gender_race.plot.bar(figsize=(10, 6), rot=0)
plot.set_title('Gender distribution by distance')
plot.set_ylabel('Number of runners')
plot.set_xlabel('Gender')

# Displaying of the percentage for each race.
# The bars come race by race (same order as the columns), two bars (one per gender) for each race,
# so the total to divide by changes every two bars.
totals = [total_runners_42, total_runners_21, total_runners_10]
bars_per_race = 2
for bar_position, bar in enumerate(plot.patches):
    race_total = totals[bar_position // bars_per_race]
    share = bar.get_height() * 100 / race_total
    plot.annotate('{:.1f}%'.format(share), (bar.get_x(), bar.get_height() + 10))

plot.legend(loc='upper left')

# Box summarising the number of runners in each race
stats = ' \n'.join([
    'Marathon: ' + str(total_runners_42) + ' runners',
    'Semi-marathon: ' + str(total_runners_21) + ' runners',
    '10 km: ' + str(total_runners_10) + ' runners',
])

# Add the box to the graph.
# NOTE(review): with ha='left' at x=.95 the box starts near the right edge of the axes and extends
# beyond it (the other figures use ha='right') - confirm this placement is intended.
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
plot.text(.95, .95, stats, fontsize=12, transform=plot.transAxes, va='top', ha='left', bbox=props, multialignment='left')

In [None]:
# Number of runners per type
plot = sns.countplot(x='type', data=lausanne_marathon_2016_cleaned)
plot.figure.set_size_inches(10, 6)

# add legends
plot.set_title('Distribution of runners (types)')
plot.set_xlabel('')
plot.set_ylabel('Number of runners')

# Write above each bar the share of all runners it represents
total = len(lausanne_marathon_2016_cleaned)
for bar in plot.patches:
    share = bar.get_height() * 100 / total
    plot.annotate('{:.1f}%'.format(share), (bar.get_x() + 0.35, bar.get_height() + 50))

Not surprisingly, only a very small share of the runners in the 2016 Lausanne Marathon were young runners.

An interesting fact is that we have a non-negligible part of runners who were in team.

## Demographic study

In this part, we plan to focus on age of runners, but also the place from which they come.

We first display the age distribution of runners in the marathon 2016 of Lausanne.

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(10, 6)

# Ages of all runners, and of each gender separately
ages_all = lausanne_marathon_2016_cleaned['age']
ages_male = ages_all[lausanne_marathon_2016_cleaned['sex'] == 'male']
ages_female = ages_all[lausanne_marathon_2016_cleaned['sex'] == 'female']
ax.hist(ages_all, bins=30)

# Computing of the mean age, overall and by gender
mean_age_M = np.mean(ages_male)
mean_age_W = np.mean(ages_female)
mean_age_all = np.mean(ages_all)

# Display of the overall mean (dashed red line) and titles.
# axvline's ymin/ymax are fractions of the axes height (0..1); the defaults span the full height.
ax.axvline(mean_age_all, color='r', linestyle='--')
ax.set_title('Age Distribution of Runners')
ax.set_xlabel('Age')
ax.set_ylabel('Number of runners')

# Summary statistics (mean and standard deviation), overall and by gender.
# Each label is paired with the mean of the matching gender.
age_stats = 'Mean Age: ' + str(round(mean_age_all, 2)) + ' years\n' + 'SD: ' + str(round(np.std(ages_all), 2))
age_statsf = 'Mean Age (Female): ' + str(round(mean_age_W, 2)) + ' years\n' + 'SD: ' + str(round(np.std(ages_female), 2))
age_statsm = 'Mean Age (Male): ' + str(round(mean_age_M, 2)) + ' years\n' + 'SD: ' + str(round(np.std(ages_male), 2))
age_stats = age_stats + '\n' + age_statsf + '\n' + age_statsm

# Add of legend text
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(.95, .95, age_stats, fontsize=11, transform=ax.transAxes, va='top', ha='right', bbox=props, multialignment='left')

In [None]:
# One figure holding one age-distribution panel per race distance
fig = plt.figure()
fig.set_size_inches(10, 10)
fig.suptitle('Age Distribution of Runners selected by distance', fontsize=12)
# The last argument is the subplot position code handed to the helper (3 rows, 1 column)
utils.plot_distribution_age_distance(fig, lausanne_marathon_2016_10km, '10KM', 311)
utils.plot_distribution_age_distance(fig, lausanne_marathon_2016_21km, '21KM', 312)
utils.plot_distribution_age_distance(fig, lausanne_marathon_2016_42km, '42KM', 313)
# tight_layout has to be called (a bare `fig.tight_layout` does nothing) and only once the
# panels exist; rect keeps the top strip of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])

## Performance according to age of participants

### Overall analysis

In [None]:
# Plot the running times grouped by the 'age category' column, for all distances together.
# The drawing itself is done by utils.plot_time_distribution_by_age.
utils.plot_time_distribution_by_age(lausanne_marathon_2016_cleaned, 'age category')

### Performance for 10 km running

In [None]:
# Performance of the 10 km runners against their exact age.
# NOTE(review): size/aspect are forwarded to the helper, presumably to size the figure - confirm in utils.
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_10km, 10, 'age', size=10, aspect=1.5)

In [None]:
# Performance of the 10 km runners grouped by age category (helper defaults are used for the figure size).
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_10km, 10, 'age category')

### Performance for semi-marathon

In [None]:
# Performance of the semi-marathon (21 km) runners against their exact age.
# NOTE(review): size/aspect are forwarded to the helper, presumably to size the figure - confirm in utils.
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_21km, 21, 'age', size=10, aspect=1.5)

In [None]:
# Performance of the semi-marathon (21 km) runners grouped by age category.
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_21km, 21, 'age category')

### Performance for marathon

In [None]:
# Performance of the marathon (42 km) runners against their exact age.
# NOTE(review): size/aspect are forwarded to the helper, presumably to size the figure - confirm in utils.
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_42km, 42, 'age', size=10, aspect=1.5)

In [None]:
# Performance of the marathon (42 km) runners grouped by age category.
utils.plot_performance_according_to_running_type(lausanne_marathon_2016_42km, 42, 'age category')

### Statistical tests

In [None]:
# ANOVA and Tukey HSD on the running time of the 10 km runners, grouped by age category
results_10km = utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_10km, 'age category', 'time')
anova_summary_10km = 'F value: ' + str(results_10km['f_value']) + '\nP value: ' + str(results_10km['p_value']) + '\n'
print(anova_summary_10km)

# Strip surrounding whitespace from the column names of the Tukey table (modified in place),
# then display it indexed by the pair of compared groups
tukey_table_10km = results_10km['tukey_hsd']
tukey_table_10km.rename(columns=lambda column_name: column_name.strip(), inplace=True)
tukey_table_10km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on the running time of the semi-marathon runners, grouped by age category
results_21km = utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_21km, 'age category', 'time')
anova_summary_21km = 'F value: ' + str(results_21km['f_value']) + '\nP value: ' + str(results_21km['p_value']) + '\n'
print(anova_summary_21km)

# Strip surrounding whitespace from the column names of the Tukey table (modified in place),
# then display it indexed by the pair of compared groups
tukey_table_21km = results_21km['tukey_hsd']
tukey_table_21km.rename(columns=lambda column_name: column_name.strip(), inplace=True)
tukey_table_21km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on the running time of the marathon runners, grouped by age category
results_42km = utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_42km, 'age category', 'time')
anova_summary_42km = 'F value: ' + str(results_42km['f_value']) + '\nP value: ' + str(results_42km['p_value']) + '\n'
print(anova_summary_42km)

# Strip surrounding whitespace from the column names of the Tukey table (modified in place),
# then display it indexed by the pair of compared groups
tukey_table_42km = results_42km['tukey_hsd']
tukey_table_42km.rename(columns=lambda column_name: column_name.strip(), inplace=True)
tukey_table_42km.set_index(['group1', 'group2'])

## Effective performance according to BIB number

In [None]:
# Running time (y) against BIB number (x) for every runner
ax = lausanne_marathon_2016_cleaned.plot(kind='scatter', x='number', y='time', xlim=(-1000, 18000))

# Label every y tick with the time that the tick itself marks.
# Building the labels from the rows of the 'time' column would attach the times of the first
# few runners to the ticks, i.e. labels unrelated to the tick positions.
y_min, y_max = ax.get_ylim()
# Keep only the ticks inside the visible range, so the helper never receives values outside the plotted range
tick_positions = [tick for tick in ax.get_yticks() if y_min <= tick <= y_max]
# NOTE(review): ticks are passed as integer seconds - confirm utils.convert_seconds_to_time accepts ints
formatted_labels = [utils.convert_seconds_to_time(int(tick)) for tick in tick_positions]
ax.set_yticks(tick_positions)
ax.set_yticklabels(formatted_labels)
# Pinning the ticks must not change the visible range
ax.set_ylim(y_min, y_max)

plt.title('Running time according to BIB number of participants')
plt.show()

We can recognize three major patterns following the BIB number of participants. Indeed, the higher the BIB number is, the lower the performance time is, globally.

Notice that BIB numbers are assigned by the organizers of the Lausanne Marathon and, as we can read on <a href="http://fr.lausanne-marathon.com/inscription/inscriptions/inscription-online/">the official website</a>, each participant is asked to indicate their "estimated time of running for the attribution of start blocks", in order to categorize the runners accordingly.

## Category study

Now that we have the distance run by each runner, we compute the average speed (in metres per second) of each runner.

In [None]:
# Average speed = distance in metres divided by the value of the 'time' column
distance_in_metres = lausanne_marathon_2016_cleaned['distance (km)'] * 1000
lausanne_marathon_2016_cleaned['Speed (m/s)'] = distance_in_metres / lausanne_marathon_2016_cleaned['time']

# Team membership label of each runner, computed row by row with utils.compute_run_in_team
team_labels = lausanne_marathon_2016_cleaned.apply(utils.compute_run_in_team, axis=1)
lausanne_marathon_2016_cleaned['type_team'] = team_labels

In [None]:
# Creation of sets according of running type
lausanne_marathon_2016_cleaned_10 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 10]
lausanne_marathon_2016_cleaned_21 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 21]
lausanne_marathon_2016_cleaned_42 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 42]

In [None]:
# Creation of figure: one speed-distribution panel per race distance
fig = plt.figure()
fig.set_size_inches(10, 10)
fig.suptitle('Speed distribution selected by distance', fontsize=14)
# NOTE(review): the 4th argument looks like the panel position and the np.arange ranges like
# axis ticks for the runner counts - confirm against utils.plot_speed_distribution_by_running
utils.plot_speed_distribution_by_running(fig, lausanne_marathon_2016_cleaned_10, '10 km', 1, np.arange(0,900, 100))
utils.plot_speed_distribution_by_running(fig, lausanne_marathon_2016_cleaned_21, '21 km', 2, np.arange(0,700, 100))
utils.plot_speed_distribution_by_running(fig, lausanne_marathon_2016_cleaned_42, '42 km', 3, np.arange(0,250, 50))
# tight_layout has to be called (a bare `fig.tight_layout` does nothing) and only once the
# panels exist; rect keeps the top strip of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])

As we can expect, the fastest runners ran the 10 km: as the distance is shorter, the speed can be higher.

But unexpectedly, the average speed seems very close between runners despite the difference in distance. In order to understand this phenomenon, we need to go further in the study and look at the runners' history.

In [None]:
# Preview the data, now including the 'Speed (m/s)' and 'type_team' columns.
lausanne_marathon_2016_cleaned.head()

In [None]:
# Box plots of the speed per gender, one panel (column) per race distance.
# NOTE(review): seaborn 0.9 deprecated `factorplot` in favour of `catplot` and renamed `size` to
# `height`; switch to those names when the seaborn version in use no longer accepts the old ones.
plot = sns.factorplot(data=lausanne_marathon_2016_cleaned, x='sex', y='Speed (m/s)', col = 'distance (km)', size=5, kind='box')
plot.set_xlabels('Gender')
# Resize the whole grid of panels
plot.fig.set_size_inches(10, 6)

In [None]:
fig = plt.figure()
fig.set_size_inches(10, 6)

# Count of team / individual runners, with one bar colour per race distance.
# The axes are passed explicitly instead of relying on the "current axes".
ax = fig.add_subplot(111)
sns.countplot(x='type_team', hue='distance (km)', data=lausanne_marathon_2016_cleaned, ax=ax)
ax.set_xlabel('')
ax.set_ylabel('Number of Runners')
ax.set_title('Team/individual runners composition')

total_10 = len(lausanne_marathon_2016_cleaned_10)
total_21 = len(lausanne_marathon_2016_cleaned_21)
total_42 = len(lausanne_marathon_2016_cleaned_42)
totals = [total_10, total_21, total_42]

# Displaying of the percentages: each bar is labelled with its share of the runners of its race.
# NOTE(review): this assumes ax.patches lists the bars distance by distance (10, 21, 42), with
# two bars (one per 'type_team' value) for each distance - confirm for the seaborn version in use.
bars_per_distance = 2
for bar_position, bar in enumerate(ax.patches):
    race_total = totals[bar_position // bars_per_distance]
    ax.annotate('{:.1f}%'.format(bar.get_height()*100/race_total), (bar.get_x()+0.05, bar.get_height()+50))

# tight_layout has to be called (a bare `fig.tight_layout` does nothing), once the plot is drawn
fig.tight_layout()
        

The individual/team distribution seems quite similar across the different races offered by the event.
Teams are most present in the 10 km. This distance seems more appropriate for teams: it is short enough to be a good challenge among beginner friends.

Let's see if being in team has a real impact on the performance.

In [None]:
# Box plots of the speed for each 'type_team' value, one panel (column) per race distance.
# NOTE(review): seaborn 0.9 deprecated `factorplot` in favour of `catplot`; switch to that name
# when the seaborn version in use no longer accepts the old one.
plot = sns.factorplot(data=lausanne_marathon_2016_cleaned, x='type_team', y='Speed (m/s)', col = 'distance (km)', kind='box')
plot.set_xlabels('')
plot.fig.suptitle('Speed distribution by types of runners', fontsize=14)
# Resize the whole grid of panels
plot.fig.set_size_inches(10,6)

In [None]:
# Show speed information for the cleaned data set; the content and layout are defined in utils.display_information_speed.
utils.display_information_speed(lausanne_marathon_2016_cleaned)

Actually the teams are better on average compared to individual runners, but the best performance comes from individual runners.
From the graph above, we can notice that for "average" people, to be part of a team seems to be a motivational factor, compared to excellent runners who prefer doing the race alone.

In [None]:
# Runners whose 'type_team' label marks them as running in a team
runs_in_team = lausanne_marathon_2016_cleaned['type_team'] == 'Runners in teams'
team_selected = lausanne_marathon_2016_cleaned[runs_in_team]

# For every runner, utils.compute_time_to_best_in_team receives the row plus the team runners.
# NOTE(review): presumably returns the time gap to the best runner of the same team - confirm in utils.
lausanne_marathon_2016_cleaned['time difference team'] = lausanne_marathon_2016_cleaned.apply(
    utils.compute_time_to_best_in_team,
    axis=1,
    args=(team_selected,),
)

In [None]:
# Independent copy of the team runners, now including the 'time difference team' column
is_team_runner = lausanne_marathon_2016_cleaned['type_team'] == 'Runners in teams'
lausanne_marathon_team = lausanne_marathon_2016_cleaned[is_team_runner].copy()
# Distribution of the time differences within teams (drawn by the utils helper)
utils.plot_time_difference_distribution(lausanne_marathon_team)

In [None]:
# One scatter panel per race distance (10, 21 and 42 km)
fig = plt.figure()
fig.set_size_inches(15, 15)
fig.suptitle('Team representation', fontsize=14)
# Optional annotation settings handed to the helper: [subplot code, label, two (x, y) points].
# NOTE(review): the two points are presumably the annotated location and the text location -
# confirm against utils.plot_scatter_difference_time_number
annotation_plot1 = [311,'Runner Pair',(0,2000), (0,2500)]
annotation_plot2 = [313,'individual runners',(2,4000), (2,6000)]
utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 10, 1000, 311, annotation_plot1)
utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 21, 1000, 312)
utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 42, 1000, 313, annotation_plot2)
# tight_layout has to be called (a bare `fig.tight_layout` does nothing) and only once the
# panels exist; rect keeps the top strip of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])