In [None]:
import pandas as pd
import numpy as np
import re
import collections
import ipywidgets as widgets
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pyplot import show
from IPython.display import display, clear_output
import plotly
plotly.offline.init_notebook_mode()
import sys
sys.path.append('..')
%load_ext autoreload
%autoreload 1
%aimport study_utils
%aimport lausanne_2016_plotly_utils

In [None]:
# --- Notebook-wide constants ---

# Location of the raw (scraped) Lausanne Marathon 2016 results.
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'

# Choices offered by the dropdown widgets further down.
ALL_PERFORMANCE_CRITERIA = ['Time', 'Speed (m/s)']
ALL_RUNNINGS = ['10 km', 'Semi-marathon', 'Marathon']
ALL_COLUMN_CATEGORIES_AGE = ['Age', 'Age category']

# Labels of the age bins; also used as `labels=` when the 'age category' column is built.
ALL_AGE_CATEGORIES = [
    '10-25 years',
    '26-30 years',
    '31-35 years',
    '36-40 years',
    '41-45 years',
    '46-50 years',
    '51-55 years',
    '56-60 years',
    '61-65 years',
    '65+ years',
]

# Static captions displayed next to the dropdown widgets.
AGE_LABEL = widgets.Label(value='Age category')
RUNNING_LABEL = widgets.Label(value='Running')
COLUMN_CATEGORIES_AGE_LABEL = widgets.Label(value='Age/Age category')
PERFORMANCE_LABEL = widgets.Label(value='Performance')

**NOTE:**
This Notebook contains the same results as *study_lausanne_2016.ipynb* but uses Plotly to display plots.

# Data wrangling

In this first part, we retrieve data of Lausanne Marathon 2016 and we manipulate the different columns in order to exploit them during the analysis part.

First, we use the pickle format to allow faster import of the data, and correct formatting of some columns as date columns.

Note: Same data are also available in CSV (readable format).

In [None]:
# Load the raw results from the pickle file referenced by PATH_TO_DATA.
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by this project.
lausanne_marathon_2016 = pd.read_pickle(PATH_TO_DATA)
# Quick preview of the first rows.
lausanne_marathon_2016.head()

Some participants are part of categories which must be excluded as these categories are not representative or can be misleading for the analysis part.

In [None]:
# Row-wise mask produced by the project helper; rows for which it is truthy are kept.
participants_to_keep = lausanne_marathon_2016.apply(study_utils.filter_participants, axis=1)
# Work on an explicit copy so that later column assignments do not touch the raw frame.
lausanne_marathon_2016_cleaned = lausanne_marathon_2016[participants_to_keep].copy()

Here, we know the categories of the runners, and these categories contain sex information about the participants (for a given type of running, there are two subcategories, one for male and one for female runners, as is usual in such sports). Thus, we use this information to create a new column giving the sex of each participant.

In [None]:
# Derive the 'sex' column row by row with the project helper (see the explanation above the cell).
lausanne_marathon_2016_cleaned['sex'] = lausanne_marathon_2016_cleaned.apply(study_utils.get_sex_of_runner, axis=1)

In [None]:
# Sanity check: count runners per sex; dropna=False also shows rows where no sex could be derived.
lausanne_marathon_2016_cleaned['sex'].value_counts(dropna=False)

In [None]:
# Number of runners remaining after filtering.
len(lausanne_marathon_2016_cleaned)

At the end of this stage, we have a total of 11248 runners on whom we can run our analysis for Lausanne Marathon 2016.

We also extract from the category the fact that a runner is either junior or adult.

In [None]:
# Derive the 'type' column (junior or adult, per the text above) row by row with the project helper.
lausanne_marathon_2016_cleaned['type'] = lausanne_marathon_2016_cleaned.apply(study_utils.get_type_of_runner, axis=1)

We also convert the *number* and *rank* columns, as they contain integers.

In [None]:
# Cast BIB numbers to int. The detour through float() is kept from the original code
# (presumably the raw values are float-like, e.g. 12.0 or '12.0' -- TODO confirm against the raw data).
raw_bib_numbers = lausanne_marathon_2016_cleaned['number']
lausanne_marathon_2016_cleaned['number'] = raw_bib_numbers.apply(lambda raw_value: int(float(raw_value)))

In [None]:
# Cast ranks to int. The detour through float() is kept from the original code
# (presumably the raw values are float-like -- TODO confirm against the raw data).
raw_ranks = lausanne_marathon_2016_cleaned['rank']
lausanne_marathon_2016_cleaned['rank'] = raw_ranks.apply(lambda raw_value: int(float(raw_value)))

In [None]:
# Preview the frame after the 'sex', 'type', 'number' and 'rank' transformations.
lausanne_marathon_2016_cleaned.head()

We also compute the age of runner based on the birthyear, to easily manipulate such information later.

In [None]:
# Age of every runner as returned by the project helper (one value per row).
computed_ages = lausanne_marathon_2016_cleaned.apply(study_utils.compute_age_of_runner, axis=1)
lausanne_marathon_2016_cleaned['age'] = computed_ages
# Store ages as plain integers.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned['age'].apply(lambda raw_age: int(float(raw_age)))

In [None]:
# Bin edges for the age categories. With right=False every bin is left-closed:
# [10, 26), [26, 31), ..., [61, 66), [66, 100). Ages outside [10, 100) get NaN.
# Note that the last label ('65+ years') is therefore applied from 66 upwards.
age_bin_edges = [10, 26, 31, 36, 41, 46, 51, 56, 61, 66, 100]
lausanne_marathon_2016_cleaned['age category'] = pd.cut(
    lausanne_marathon_2016_cleaned['age'],
    bins=age_bin_edges,
    labels=ALL_AGE_CATEGORIES,
    right=False,
)

If the time is in readable-format, it is difficult to use it this way for analysis as comparison can be misleading. For this reason, we decide to store time in raw format (i.e. number of seconds, without any format).

In [None]:
# Replace the readable time by its raw value (number of seconds, per the text above) using the project helper.
lausanne_marathon_2016_cleaned['time'] = lausanne_marathon_2016_cleaned.apply(study_utils.format_time, axis=1)

Moreover, we store the type of running the participants did.

In [None]:
# Derive the race distance from the runner's category; later cells compare this column with 10, 21 and 42.
lausanne_marathon_2016_cleaned['distance (km)'] = lausanne_marathon_2016_cleaned.apply(study_utils.compute_distance_from_category, axis=1)

We also need to compute the average speed (in m/s) of each runner, and we record in a *profile* column whether the runner ran as part of a team.

In [None]:
# Average speed = distance in metres / time ('time' was converted to seconds in an earlier cell).
distance_in_meters = lausanne_marathon_2016_cleaned['distance (km)'] * 1000
lausanne_marathon_2016_cleaned['speed (m/s)'] = distance_in_meters / lausanne_marathon_2016_cleaned['time']

# Team membership of each runner, as determined by the project helper.
lausanne_marathon_2016_cleaned['profile'] = lausanne_marathon_2016_cleaned.apply(
    study_utils.compute_run_in_team,
    axis=1,
)

Finally, we make three distinct sets of data (10 km, semi-marathon and marathon).

In [None]:
# One subset per race, selected on the 'distance (km)' column.
race_distance = lausanne_marathon_2016_cleaned['distance (km)']
lausanne_marathon_2016_10km = lausanne_marathon_2016_cleaned[race_distance == 10]
lausanne_marathon_2016_21km = lausanne_marathon_2016_cleaned[race_distance == 21]
lausanne_marathon_2016_42km = lausanne_marathon_2016_cleaned[race_distance == 42]

Note that for some graphs, it will be necessary to use time as datetime, so we create a DataFrame accordingly.

In [None]:
# Convert every raw time back to an 'HH:MM:SS' string with the project helper...
readable_times = [
    study_utils.convert_seconds_to_time(seconds)
    for seconds in lausanne_marathon_2016_cleaned['time']
]

# ...and store the parsed datetimes in a separate copy, leaving the numeric 'time' column of the
# original frame untouched. Only the time-of-day part of the parsed values is meaningful.
lausanne_marathon_2016_cleaned_with_datetime = lausanne_marathon_2016_cleaned.copy()
lausanne_marathon_2016_cleaned_with_datetime['time'] = pd.to_datetime(readable_times, format='%H:%M:%S')

In [None]:
# Persist the cleaned frame. Note: the 'time difference team' column is added in a later cell,
# so it is not part of this saved file.
lausanne_marathon_2016_cleaned.to_pickle('./Data/Lausanne_Marathon_2016_cleaned.pickle')

# Complete study

## Overall study

In [None]:
# Gender distribution figure; the returned object is kept for the JSON export at the end of the notebook.
gender_distribution = lausanne_2016_plotly_utils.plot_gender_distributions(lausanne_marathon_2016_cleaned)

As we can see, the Lausanne Marathon had a significantly higher percentage of male runners than female ones, compared to the gender distribution of the Canton of Vaud, which is balanced between men and women.

In [None]:
# Gender distribution per type of running; the returned object is kept for the JSON export at the end of the notebook.
gender_distribution_runnings = lausanne_2016_plotly_utils.plot_gender_distribution_according_to_running_type(lausanne_marathon_2016_cleaned)

Interestingly, for the 10 km race, the distribution between male and female runners is similar, while for the semi-marathon and the marathon we observe a clear difference in the distribution! According to the new direction of the Lausanne Marathon, there is a reason behind this observation.

> "Women don't even dare start the race unless they are totally sure about mastering the distance, unlike men who sometimes finish completely exhausted."
>
> Source: <a href="https://www.rts.ch/sport/athletisme/1258113-le-marathon-de-lausanne-change-dorientation.html">RTSSport (FR)</a>

In [None]:
# Distribution between types of participants; the returned object is kept for the JSON export at the end of the notebook.
distribution_by_types = lausanne_2016_plotly_utils.plot_distribution_between_types_of_participants(lausanne_marathon_2016_cleaned)

Not surprisingly, only a very small share of the runners in the 2016 Lausanne Marathon were young runners. Indeed, these races require a lot of effort and are very difficult for junior runners.

## Demographic study

In this part, we plan to focus on age of runners, but also the place from which they come.

We first display the age distribution of runners in the marathon 2016 of Lausanne.

In [None]:
# Age distribution figure; the returned object is kept for the JSON export at the end of the notebook.
age_distribution = lausanne_2016_plotly_utils.plot_age_distribution(lausanne_marathon_2016_cleaned)

We can notice that, globally, the average age of runners is 39 years. We also observe the conclusion we previously made (i.e. male runners are younger than female ones, as women wait before trying to participate in a semi-marathon or a marathon).

In [None]:
# Age distribution per race distance; the returned object is kept for the JSON export at the end of the notebook.
distribution_age_category_distance = lausanne_2016_plotly_utils.plot_distribution_age_distance(data=lausanne_marathon_2016_cleaned)

Here, the important fact is that the average age of runners tends to increase as the distance of the race increases. This can be explained by the fact that it takes time to master races such as semi-marathons and marathons.

### Performance according to age of participants

#### Overall analysis

In [None]:
# Pre-compute the time-distribution figures. The result is indexed by age-category label
# by the plotting function of the next cell, and is also exported to JSON at the end of the notebook.
# The datetime variant of the frame is used because the 'time' axis needs datetime values.
time_distribution_by_age = lausanne_2016_plotly_utils.plot_time_distribution_by_age(data=lausanne_marathon_2016_cleaned_with_datetime, age_column_name='age category')

In [None]:
def plot_time_distribution():
    """Redraw the time-distribution figure of the currently selected age category.

    Clears the current output, then plots the entry of `time_distribution_by_age`
    keyed by the module-level `selected_age_category_time_distribution`.
    """
    clear_output()
    data = time_distribution_by_age[selected_age_category_time_distribution]
    plotly.offline.iplot(data)

def update_age_category_time_distribution(change):
    """Dropdown handler: store the newly selected age category and redraw.

    Args:
        change: change dictionary passed by ipywidgets; only `change['new']` is read.
    """
    global selected_age_category_time_distribution
    # Defensive guard: ignore anything that is not a known age category.
    if change['new'] not in ALL_AGE_CATEGORIES:
        return
    selected_age_category_time_distribution = change['new']
    plot_time_distribution()

# Default selection
selected_age_category_time_distribution = '10-25 years'

# Widget for age categories
age_categories_time_distribution_widget = widgets.Dropdown(options=ALL_AGE_CATEGORIES, value=selected_age_category_time_distribution)
# Observe only the 'value' trait. An unfiltered observer is notified for every trait change
# (index, label, ...); since the option labels are the option strings themselves, the label
# change also passed the guard above and the figure was redrawn more than once per selection.
age_categories_time_distribution_widget.observe(update_age_category_time_distribution, names='value')

In [None]:
# Show the caption and the dropdown, then draw the figure for the default selection.
# NOTE(review): plot_time_distribution() starts with clear_output(); depending on the ipywidgets
# version this may also remove the widgets displayed just above -- verify in the target environment.
display(AGE_LABEL, age_categories_time_distribution_widget)
plot_time_distribution()

#### Performance by runnings

First we create the figures.

In [None]:
# Pre-compute the performance figures. The result is indexed by running name
# ('10 km', 'Semi-marathon', 'Marathon') by the plotting function below, and is also exported to JSON
# at the end of the notebook.
performance_by_age_and_age_category = lausanne_2016_plotly_utils.generate_performance_by_age_and_age_category(lausanne_marathon_2016_cleaned_with_datetime)

Then we create widget with its associated handler.

In [None]:
def plot_performance_distribution():
    """Redraw the performance figure of the currently selected running.

    Clears the current output, then plots the entry of `performance_by_age_and_age_category`
    keyed by the module-level `selected_running_performance_distribution`.
    """
    clear_output()
    data = performance_by_age_and_age_category[selected_running_performance_distribution]
    plotly.offline.iplot(data)

def update_running_performance_distribution(change):
    """Dropdown handler: store the newly selected running and redraw.

    Args:
        change: change dictionary passed by ipywidgets; only `change['new']` is read.
    """
    global selected_running_performance_distribution
    # Defensive guard: ignore anything that is not a known running.
    if change['new'] not in ALL_RUNNINGS:
        return
    selected_running_performance_distribution = change['new']
    plot_performance_distribution()

# Default selection
selected_running_performance_distribution = 'Marathon'

# Widget for runnings
running_performance_distribution_widget = widgets.Dropdown(options=ALL_RUNNINGS, value=selected_running_performance_distribution)
# Observe only the 'value' trait. An unfiltered observer is notified for every trait change
# (index, label, ...); since the option labels are the option strings themselves, the label
# change also passed the guard above and the figure was redrawn more than once per selection.
running_performance_distribution_widget.observe(update_running_performance_distribution, names='value')

Finally, we display the graph.

In [None]:
# Show the caption and the dropdown, then draw the figure for the default selection.
# NOTE(review): plot_performance_distribution() starts with clear_output(); depending on the ipywidgets
# version this may also remove the widgets displayed just above -- verify in the target environment.
display(RUNNING_LABEL, running_performance_distribution_widget)
plot_performance_distribution()

#### Analysis of results

Globally, we observe more real differences at the extremes. But are these differences significant? We decide to compute ANOVA and Tukey HSD to verify it.

In [None]:
# ANOVA and Tukey HSD on 'time' across age categories, 10 km runners only.
results_10km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_10km, 'age category', 'time')
print('F value: ' + str(results_10km['f_value']), 'P value: ' + str(results_10km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_10km = results_10km['tukey_hsd']
tukey_hsd_10km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_10km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on 'time' across age categories, semi-marathon runners only.
results_21km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_21km, 'age category', 'time')
print('F value: ' + str(results_21km['f_value']), 'P value: ' + str(results_21km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_21km = results_21km['tukey_hsd']
tukey_hsd_21km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_21km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on 'time' across age categories, marathon runners only.
results_42km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_42km, 'age category', 'time')
print('F value: ' + str(results_42km['f_value']), 'P value: ' + str(results_42km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_42km = results_42km['tukey_hsd']
tukey_hsd_42km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_42km.set_index(['group1', 'group2'])

In all cases, the p-value is extremely low. This means that we can reject the null hypothesis H0 and safely conclude that there are significant differences between age categories!

## Effective performance according to BIB number

First we create figures.

In [None]:
# Pre-compute the BIB-number figures. The result is indexed by performance criterion
# by the plotting function below, and is also exported to JSON at the end of the notebook.
time_distribution_by_bib_numbers = lausanne_2016_plotly_utils.generate_time_distribution_by_bib_numbers(data=lausanne_marathon_2016_cleaned_with_datetime, performance_criteria=ALL_PERFORMANCE_CRITERIA)

Then we create widget accordingly.

In [None]:
def plot_bib_distribution():
    """Redraw the BIB-number figure for the currently selected performance criterion.

    Clears the current output, then plots the entry of `time_distribution_by_bib_numbers`
    keyed by the module-level `selected_performance_criterion_bib_distribution`.
    """
    clear_output()
    data = time_distribution_by_bib_numbers[selected_performance_criterion_bib_distribution]
    plotly.offline.iplot(data)

def update_performance_criterion_bib_distribution(change):
    """Dropdown handler: store the newly selected performance criterion and redraw.

    Args:
        change: change dictionary passed by ipywidgets; only `change['new']` is read.
    """
    global selected_performance_criterion_bib_distribution
    # Defensive guard: ignore anything that is not a known performance criterion.
    if change['new'] not in ALL_PERFORMANCE_CRITERIA:
        return
    selected_performance_criterion_bib_distribution = change['new']
    plot_bib_distribution()

# Default selection
selected_performance_criterion_bib_distribution = 'Time'

# Widget for performance criteria
bib_distribution_widget = widgets.Dropdown(options=ALL_PERFORMANCE_CRITERIA, value=selected_performance_criterion_bib_distribution)
# Observe only the 'value' trait. An unfiltered observer is notified for every trait change
# (index, label, ...); since the option labels are the option strings themselves, the label
# change also passed the guard above and the figure was redrawn more than once per selection.
bib_distribution_widget.observe(update_performance_criterion_bib_distribution, names='value')

Finally, we display graph.

In [None]:
# Show the caption and the dropdown, then draw the figure for the default selection.
# NOTE(review): plot_bib_distribution() starts with clear_output(); depending on the ipywidgets
# version this may also remove the widgets displayed just above -- verify in the target environment.
display(PERFORMANCE_LABEL, bib_distribution_widget)
plot_bib_distribution()

We can recognize three major patterns following the BIB number of participants. Indeed, within each subdivision, the higher the BIB number is, the lower the performance time is, globally.

Notice that BIB numbers are given by the organizers of the Lausanne Marathon, and as we can read on <a href="http://fr.lausanne-marathon.com/inscription/inscriptions/inscription-online/">the official website</a>, each participant is asked to indicate his "estimated time of running for the attribution of start blocks", in order to categorize the runners accordingly.

## Study by sex

In [None]:
# Speed distribution per running; the returned object is kept for the JSON export at the end of the notebook.
speed_distribution_by_running = lausanne_2016_plotly_utils.plot_speed_distribution_by_running(lausanne_marathon_2016_cleaned_with_datetime)

As we can expect, the fastest runners ran the 10 km: as the distance is shorter, the average speed can be greater.

But unexpectedly, the average speed seems very close between runners despite the difference in distance. In order to understand this phenomenon, we need to go further in the study and look at the runners' history.

### Statistical analysis

**Overview**

In [None]:
# Performance comparison figure (boxplots, per the text below); the returned object is kept for the JSON export at the end of the notebook.
performance_comparison = lausanne_2016_plotly_utils.generate_performance_comparison(data=lausanne_marathon_2016_cleaned_with_datetime)

Following the boxplots, differences seem to be significant.

**10 km running**

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between sexes, 10 km runners only.
# Note: this rebinds `results_10km`, previously used for the age-category analysis.
results_10km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_10km, 'sex', 'speed (m/s)')
print('F value: ' + str(results_10km['f_value']), 'P value: ' + str(results_10km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_10km = results_10km['tukey_hsd']
tukey_hsd_10km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_10km.set_index(['group1', 'group2'])

**Semi-marathon**

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between sexes, semi-marathon runners only.
# Note: this rebinds `results_21km`, previously used for the age-category analysis.
results_21km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_21km, 'sex', 'speed (m/s)')
print('F value: ' + str(results_21km['f_value']), 'P value: ' + str(results_21km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_21km = results_21km['tukey_hsd']
tukey_hsd_21km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_21km.set_index(['group1', 'group2'])

**Marathon**

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between sexes, marathon runners only.
# Note: this rebinds `results_42km`, previously used for the age-category analysis.
results_42km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_42km, 'sex', 'speed (m/s)')
print('F value: ' + str(results_42km['f_value']), 'P value: ' + str(results_42km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_42km = results_42km['tukey_hsd']
tukey_hsd_42km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_42km.set_index(['group1', 'group2'])

As we can see, no matter which race we consider, the difference between male and female runners is significant, as we can reject the H0 hypothesis. Notice that the p-value is higher in the case of the marathon, as the distributions seem to be more similar (see previous plots).

## Focus on teams and single runners

First, let's display the distribution of runners with and without teams.

In [None]:
# Team/individual distribution per type of running; the returned object is kept for the JSON export at the end of the notebook.
runners_composition = lausanne_2016_plotly_utils.plot_runners_teams_individual_distribution_according_to_running_type(df=lausanne_marathon_2016_cleaned_with_datetime)

The individual/team distribution seems quite similar across the different races offered by the event.
Teams are more present in the 10 km. This distance seems more appropriate for teams: it is short enough to be a good challenge between beginner friends.

### Statistical analysis

Let's see if being in team has a real impact on the performance.

In [None]:
# Performance comparison by runner profile; the returned object is kept for the JSON export at the end of the notebook.
performance_comparison_by_profile = lausanne_2016_plotly_utils.generate_performance_comparison_by_profile(data=lausanne_marathon_2016_cleaned_with_datetime)

In [None]:
# Display speed information for the cleaned frame (numeric 'time' variant); the result is not stored.
lausanne_2016_plotly_utils.display_information_speed(lausanne_marathon_2016_cleaned)

Actually the teams are better on average compared to individual runners, but the best performance comes from individual runners.
From the graph above, we can notice that for "average" people, to be part of a team seems to be a motivational factor, compared to excellent runners who prefer doing the race alone.

Running ANOVA allows us to draw the same conclusion, as shown in the following results. Notice that for the marathon, the p-value is higher than for the other races, as medians tend to be similar between the two categories (runners with and without team), even if it is not sufficiently high to retain the H0 hypothesis (i.e. similar distributions).

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between runner profiles, 10 km runners only.
results_teams_10km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_10km, 'profile', 'speed (m/s)')
print('F value: ' + str(results_teams_10km['f_value']), 'P value: ' + str(results_teams_10km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_teams_10km = results_teams_10km['tukey_hsd']
tukey_hsd_teams_10km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_teams_10km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between runner profiles, semi-marathon runners only.
results_teams_21km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_21km, 'profile', 'speed (m/s)')
print('F value: ' + str(results_teams_21km['f_value']), 'P value: ' + str(results_teams_21km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_teams_21km = results_teams_21km['tukey_hsd']
tukey_hsd_teams_21km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_teams_21km.set_index(['group1', 'group2'])

In [None]:
# ANOVA and Tukey HSD on 'speed (m/s)' between runner profiles, marathon runners only.
results_teams_42km = study_utils.compute_anova_and_tukey_hsd(lausanne_marathon_2016_42km, 'profile', 'speed (m/s)')
print('F value: ' + str(results_teams_42km['f_value']), 'P value: ' + str(results_teams_42km['p_value']), '', sep='\n')

# Strip surrounding whitespace from the column headers (in place) so that
# 'group1'/'group2' can be used as index; the indexed table is the cell output.
tukey_hsd_teams_42km = results_teams_42km['tukey_hsd']
tukey_hsd_teams_42km.rename(columns=lambda header: header.strip(), inplace=True)
tukey_hsd_teams_42km.set_index(['group1', 'group2'])

In [None]:
# Rows of runners whose profile is 'Team-mate'; handed to the helper as extra positional argument.
is_team_mate = lausanne_marathon_2016_cleaned['profile'] == 'Team-mate'
team_selected = lausanne_marathon_2016_cleaned[is_team_mate]

# New column computed row by row by the project helper (per its name, the time gap to the
# best runner of the same team -- verify against study_utils).
lausanne_marathon_2016_cleaned['time difference team'] = lausanne_marathon_2016_cleaned.apply(
    study_utils.compute_time_to_best_in_team,
    axis=1,
    args=(team_selected,),
)

In [None]:
# Independent copy of the team runners (now including the 'time difference team' column).
team_mate_rows = lausanne_marathon_2016_cleaned['profile'] == 'Team-mate'
lausanne_marathon_2016_runners_in_team = lausanne_marathon_2016_cleaned[team_mate_rows].copy()

# Figure of the time differences; the returned object is kept for the JSON export at the end of the notebook.
time_difference_distribution = lausanne_2016_plotly_utils.plot_time_difference_distribution(
    df=lausanne_marathon_2016_runners_in_team,
)

We observe that when runners are in a team, they globally share similar performance (the time difference is between 0 and 5 minutes for the majority of runners). This indicates that each team is composed of similar people who ran together, as we would expect.

In [None]:
# Matplotlib figure holding three stacked scatter plots (subplot positions 311, 312, 313),
# one per race distance (10, 21 and 42 km).
fig = plt.figure(figsize=(15, 15))
fig.suptitle('Team representation', fontsize=14)

# Annotation descriptors handed to the plotting helper; the first element is the subplot position.
# (presumably followed by the annotation text and two coordinate pairs -- verify against the helper)
annotation_plot1 = [311, 'Runner Pair', (0, 2000), (0, 2500)]
annotation_plot2 = [313, 'individual runner', (2, 4000), (2, 6000)]

lausanne_2016_plotly_utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 10, 311, annotation_plot1)
lausanne_2016_plotly_utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 21, 312)
lausanne_2016_plotly_utils.plot_scatter_difference_time_number(fig, lausanne_marathon_2016_cleaned, 42, 313, annotation_plot2)

# The original cell only referenced `fig.tight_layout` without calling it, which did nothing.
# The layout must be computed once the subplots exist; `rect` keeps a strip at the top
# free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.97])

As we can notice from the previous study, most pairs/groups of runners finished the race together. In order to go further, we need to study the situation of those who finished later (the study above considers runners who finished at least 1000 seconds after the best runner of their team).

We consider runners to form a pair/group if their time difference is smaller than 30 seconds. It is very unlikely that any two runners would have the exact same start and end time (within 30 seconds) unless they ran together, so these visual clusters should be very accurate in showing running pairs/groups. Based on the graph, there are distinct differences in what "belonging to a team" means across the different races. The 10 km seems to be the most solidary one, where most people in a team ran in a pair/group (326 runners compared to 60 runners who finished alone). In the 21 km, about half of the runners ran in a pair/group, whereas the 42 km seems to be the most individual one, with 51 runners finishing alone compared to 17 runners in pairs/groups. 42 km is a long distance; the challenge is often individual and seen as a personal reward for long hours of training, which could explain the difference in the type of runners belonging to a team.

# Save of (dictionaries containing) Plotly figures to JSON

We define dictionary containing all data to export.

In [None]:
# Every figure object produced above, keyed by the base name of the JSON file it is exported to.
data_to_export = dict(
    gender_distribution=gender_distribution,
    gender_distribution_runnings=gender_distribution_runnings,
    distribution_by_types=distribution_by_types,
    age_distribution=age_distribution,
    distribution_age_category_distance=distribution_age_category_distance,
    time_distribution_by_age=time_distribution_by_age,
    performance_by_age_and_age_category=performance_by_age_and_age_category,
    time_distribution_by_bib_numbers=time_distribution_by_bib_numbers,
    speed_distribution_by_running=speed_distribution_by_running,
    performance_comparison=performance_comparison,
    runners_composition=runners_composition,
    performance_comparison_by_profile=performance_comparison_by_profile,
    time_difference_distribution=time_difference_distribution,
)

Finally, we export all data.

**IMPORTANT: Generating JSON of all data takes some time and requires high resources.<br>By default, code is commented and existing files are not overwritten.**

In [None]:
# NOTE(review): the markdown above states that this export is commented out by default,
# but the loop below is active and runs on "Run All" -- confirm which behaviour is intended.
for figure_name, figure_data in data_to_export.items():
    # indent=None minifies the generated JSON.
    study_utils.convert_to_JSON(
        object=figure_data,
        file_name=figure_name,
        path='./Data',
        encoder=plotly.utils.PlotlyJSONEncoder,
        indent=None,
    )