In [None]:
# Data analysis 
import collections
import datetime
import re
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.relativedelta import relativedelta
from IPython.display import display
from ipywidgets import FloatProgress
from matplotlib.pyplot import show
from matplotlib.ticker import FuncFormatter

%matplotlib inline

In [None]:
# Pickled results table of the 2016 edition (path relative to this notebook).
PATH_TO_DATA = '../../Scraping/DataSport/Data/Lausanne_Marathon_2016.pickle'
# Patterns searched anywhere in the category name by get_sex_of_runner:
# a category containing 'W', 'D' or 'JunF' is treated as female,
# one containing 'H' or 'JunG' as male (the female pattern is tested first).
FEMALE_CATEGORY_REGEX = '([WD])|(JunF)'
MALE_CATEGORY_REGEX = '([H])|(JunG)'
# Patterns searched anywhere in the category name by compute_distance_from_category
# to map a category to a 42, 21 or 10 km race.
MARATHON_DISTANCE_REGEX = '42-'
SEMI_MARATHON_DISTANCE_REGEX = '21-'
QUARTER_MARATHON_DISTANCE_REGEX = '10-'

In [None]:
def get_sex_of_runner(runner):
    '''
    Returns the sex of runner based on the category in which runner has done the marathon.
    
    Parameters
        - runner: row representing the runner (must expose a 'category' entry)
    
    Return
        - string ('female'/'male') or None if sex was not retrieved
          (missing category, excluded walking category, or no pattern matched)
    '''
    
    category = runner['category']
    
    # A missing category (None/NaN) cannot be passed to re.search: report the sex as
    # unknown instead of aborting the whole DataFrame.apply with a TypeError.
    if pd.isnull(category):
        return None
    
    # We ignore these specific categories as they are not useful and as they can be
    # misleading (their names contain a 'W', which the female pattern would match).
    if category in ('10W-NW', '10W-Walk'):
        return None
    
    # The female pattern is tested first, so a category matching both patterns is 'female'.
    if re.search(FEMALE_CATEGORY_REGEX, category) is not None:
        return 'female'
    if re.search(MALE_CATEGORY_REGEX, category) is not None:
        return 'male'
    return None


def get_type_of_runner(runner):
    '''
    Classifies the runner as junior or adult from the category name.
    
    Parameters
        - runner: row representing the runner
        
    Return
        - string: 'junior' when the category contains 'Jun', 'adult' otherwise
    '''
    
    is_junior = runner['category'].find('Jun') >= 0
    return 'junior' if is_junior else 'adult'
    
    
def compute_distance_from_category(runner):
    '''
    Derives the race distance of the runner from the category name.
    
    Parameters
        - runner: row representing the runner
        
    Return
        - distance of runner in km (int: 42, 21 or 10), or None when the category
          contains none of the distance markers
    '''
    
    category = runner['category']
    
    # Patterns are tried in this order; the first one found in the category wins.
    distance_by_pattern = (
        (MARATHON_DISTANCE_REGEX, 42),
        (SEMI_MARATHON_DISTANCE_REGEX, 21),
        (QUARTER_MARATHON_DISTANCE_REGEX, 10),
    )
    for pattern, distance in distance_by_pattern:
        if re.search(pattern, category) is not None:
            return distance
    
    return None


def compute_age_of_runner(runner, reference_date=None):
    '''
    Returns the age of runner, in completed years, on a reference date.
    
    Parameters
        - runner: row representing the runner; runner['birthday'] must expose
          year, month and day attributes
        - reference_date: date on which the age is evaluated (any object exposing
          year, month and day). Defaults to today's date, in which case the result
          depends on the day the notebook is run; pass the race date to get
          reproducible ages.
        
    Return
        - age of runner (int)
    '''
    
    if reference_date is None:
        reference_date = date.today()
    birthday = runner['birthday']
    
    # One year is subtracted when the birthday has not yet been reached in the reference year.
    birthday_not_reached = (reference_date.month, reference_date.day) < (birthday.month, birthday.day)
    return reference_date.year - birthday.year - int(birthday_not_reached)


def compute_run_in_team(runner):
    '''
    Tells whether the runner took part alone or as a member of a team.
    
    Parameters
        - runner: row representing the runner
        
    Return
        - string: 'Individual runners' when the 'team' entry is null,
          'Runners in teams' otherwise
    '''
    
    has_no_team = pd.isnull(runner['team'])
    return 'Individual runners' if has_no_team else 'Runners in teams'

    
def format_time(runner):
    '''
    Returns the number of seconds of running time of current runner
    
    Parameters
        - runner: row representing the runner; runner['time'] is expected to be a
          datetime-like value exposing .time()
        
    Return
        - total running time in seconds (float, as returned by timedelta.total_seconds()),
          or None when no time is recorded for the runner
    '''
    
    race_time = runner['time']
    
    # The missing-value check must come before .time() is called, otherwise a runner
    # without a recorded time raises instead of yielding None.
    if pd.isnull(race_time):
        return None
    
    clock = race_time.time()
    # Only hours, minutes and seconds are kept; any sub-second part is dropped.
    return datetime.timedelta(hours=clock.hour, minutes=clock.minute, seconds=clock.second).total_seconds()
    

def convert_seconds_to_time(seconds):
    '''
    Formats a duration given in seconds as a clock-like string.
    
    Parameters
        - seconds: duration to format, in seconds
        
    Return
        - formatted time (H:MM:SS string; hours are not zero-padded and may exceed 23)
    '''
    
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)


def plot_performance_according_to_running_type(data, nb_km):
    '''
    Plots the performance according to age of participants for a given running
    (one box of running times per age).
    
    Parameters
        - data: DataFrame containing records for a given running; must have an 'age'
          column and a 'time' column expressed in seconds
        - nb_km: km of the running (only used in the title)
    '''
    
    g = sns.factorplot(data=data, x='age', y='time', kind='box', size=10, aspect=1.5)
    for ax in g.axes.flat:
        # Format the tick values (seconds) as H:MM:SS through a formatter rather than
        # freezing label texts: the labels stay correct whatever ticks matplotlib
        # finally picks, and no private attribute of the tick labels is read.
        # One formatter per axis, as a formatter is bound to the axis it is set on.
        ax.yaxis.set_major_formatter(
            FuncFormatter(lambda value, position: convert_seconds_to_time(int(value)))
        )
    plt.title('Distribution of time of ' + str(nb_km) + 'km running following age of participants')
    plt.show()

# Data wrangling

In [None]:
# Load the pickled results table.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
lausanne_marathon_2016 = pd.read_pickle(PATH_TO_DATA)
lausanne_marathon_2016.head()

In [None]:
# Derive each runner's sex from the category name (None when it cannot be determined).
lausanne_marathon_2016['sex'] = lausanne_marathon_2016.apply(get_sex_of_runner, axis=1)

In [None]:
# Count runners per sex; dropna=False keeps the runners with no sex visible in the count.
lausanne_marathon_2016['sex'].value_counts(dropna=False)

Note: Here, we remove runners belonging to the *Pink_Ch*, *10W-Walk* or *10W-NW* categories, as these categories are not part of the Lausanne Marathon races and their names can be misleading for sex attribution. For more information about these categories, check <a href="http://fr.lausanne-marathon.com/info-course/horaires/">Lausanne Marathon's official website</a>.

In [None]:
# Keep only the runners whose sex was determined; .copy() makes an independent frame
# so that the columns added in the following cells do not touch the raw table.
lausanne_marathon_2016_cleaned = lausanne_marathon_2016[lausanne_marathon_2016['sex'].notnull()].copy()

In [None]:
# Number of runners kept for the analysis.
len(lausanne_marathon_2016_cleaned)

We have a total of 12060 runners on whom we can run our analysis for Lausanne Marathon 2016.

We also add the runner type (junior/adult) and convert the *rank* column to an integer.

In [None]:
# Tag each runner as 'junior' or 'adult' based on the category name.
lausanne_marathon_2016_cleaned['type'] = lausanne_marathon_2016_cleaned.apply(get_type_of_runner, axis=1)

In [None]:
# Convert 'rank' to int. Going through float() first also accepts float values and
# float-like strings (presumably how the scraped ranks are stored — TODO confirm).
lausanne_marathon_2016_cleaned['rank'] = lausanne_marathon_2016_cleaned['rank'].apply(lambda x : int(float(x)))
lausanne_marathon_2016_cleaned.head()

In [None]:
# Age in completed years, computed by compute_age_of_runner from the 'birthday' column.
# NOTE: the age is relative to today's date, so it changes with the day the notebook is run.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned.apply(compute_age_of_runner, axis=1)
# Coerce the ages to plain ints.
lausanne_marathon_2016_cleaned['age'] = lausanne_marathon_2016_cleaned['age'].apply(lambda x : int(float(x)))

In [None]:
# Convert the BIB number to int (via float(), so float values and float-like strings are accepted).
lausanne_marathon_2016_cleaned['number'] = lausanne_marathon_2016_cleaned['number'].apply(lambda x : int(float(x)))

In [None]:
# Replace the 'time' column by the running time in seconds.
# NOTE: this cell is not idempotent — format_time calls .time() on each value, which
# fails once the column already holds numbers; re-run from the loading cell instead.
lausanne_marathon_2016_cleaned['time'] = lausanne_marathon_2016_cleaned.apply(format_time, axis=1)

In [None]:
# Race distance in km (42, 21 or 10) inferred from the category name;
# None when the category contains no distance marker.
lausanne_marathon_2016_cleaned['distance (km)'] = lausanne_marathon_2016_cleaned.apply(compute_distance_from_category, axis=1)

# Statistical study

## Overall study

In [None]:
# Number of runners per sex; each bar is annotated with its share of all runners.
plot = sns.countplot(data=lausanne_marathon_2016_cleaned, x='sex')
total = len(lausanne_marathon_2016_cleaned)
for p in plot.patches:
    share = p.get_height() * 100 / total
    # The percent sign goes after the value (e.g. '57.3%').
    # The label is placed slightly right of the bar's left edge and just above its top.
    plot.annotate('{:.1f}%'.format(share), (p.get_x() + 0.1, p.get_height() + 50))

The Lausanne marathon race had a significantly higher percentage of male runners than female ones.

In [None]:
# Number of runners per type (junior/adult); each bar is annotated with its share of all runners.
plot = sns.countplot(data=lausanne_marathon_2016_cleaned, x='type')
total = len(lausanne_marathon_2016_cleaned)
for p in plot.patches:
    share = p.get_height() * 100 / total
    # The percent sign goes after the value (e.g. '2.1%').
    # The label is placed slightly right of the bar's left edge and just above its top.
    plot.annotate('{:.1f}%'.format(share), (p.get_x() + 0.1, p.get_height() + 50))

Not surprisingly, only a very small share of the runners in the 2016 Lausanne Marathon were young (junior) runners.

An interesting fact is that a non-negligible share of the runners ran as part of a team.

## Demographic study

In this part, we plan to focus on age of runners, but also the place from which they come.

We first display the age distribution of runners in the marathon 2016 of Lausanne.

In [None]:
ages = lausanne_marathon_2016_cleaned['age']
ages_male = ages[lausanne_marathon_2016_cleaned['sex'] == 'male']
ages_female = ages[lausanne_marathon_2016_cleaned['sex'] == 'female']

fig, ax = plt.subplots()
fig.set_size_inches(10,5)
ax.hist(ages, bins=30)

# Compute the mean of age selected by gender.
mean_age_M = np.mean(ages_male)
mean_age_W = np.mean(ages_female)
mean_age_all = np.mean(ages)

# Display the overall mean age (dashed line over the full height of the axes) and titles.
# ymin/ymax of axvline are fractions of the axes height, so the defaults (0 and 1) are used.
ax.axvline(mean_age_all, color='r', linestyle='--')
ax.set_title('Age Distribution of Runners')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Runners')

# Calculate age distribution statistics by gender
# (each label is paired with the mean and SD of the same group).
age_stats = 'Mean Age: ' + str(round(mean_age_all, 2)) + '\n' + 'SD: ' + str(round(np.std(ages), 2))
age_statsf = 'Mean Age (Female): ' + str(round(mean_age_W, 2)) + '\n' + 'SD: ' + str(round(np.std(ages_female), 2))
age_statsm = 'Mean Age (Male): ' + str(round(mean_age_M, 2)) + '\n' + 'SD: ' + str(round(np.std(ages_male), 2))
age_stats = age_stats + '\n' + age_statsf + '\n' + age_statsm

# Add legend text.
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax.text(.95, .95, age_stats, fontsize=11, transform=ax.transAxes, va='top', ha='right', bbox=props, multialignment='left');

## Performance according to age of participants

In [None]:
# One frame per race distance, used by the per-age performance plots below.
lausanne_marathon_2016_10km = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 10]
lausanne_marathon_2016_21km = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 21]
lausanne_marathon_2016_42km = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 42]

### Performance for 10 km running

In [None]:
# Running time by age for the 10 km race.
plot_performance_according_to_running_type(lausanne_marathon_2016_10km, 10)

### Performance for half-marathon

In [None]:
# Running time by age for the 21 km race.
plot_performance_according_to_running_type(lausanne_marathon_2016_21km, 21)

### Performance for marathon

In [None]:
# Running time by age for the 42 km race.
plot_performance_according_to_running_type(lausanne_marathon_2016_42km, 42)

## Effective performance according to BIB number

In [None]:
# Running time (seconds) against BIB number, one point per runner.
ax = lausanne_marathon_2016_cleaned.plot(kind='scatter', x='number', y='time', xlim=(-1000, 18000))

# Format the y tick *values* (seconds) as H:MM:SS. Labelling the ticks with the runners'
# own times, taken in row order, would show labels unrelated to the tick positions.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, position: convert_seconds_to_time(int(value))))
ax.set_xlabel('BIB number')
ax.set_ylabel('Running time (h:mm:ss)')
plt.title('Running time according to BIB number of participants')
plt.show()

We can recognize three major patterns following the BIB number of participants. Indeed, the higher the BIB number is, the lower the performance time is, globally.

Notice that BIB numbers are assigned by the organizers of the Lausanne Marathon and, as we can read on <a href="http://fr.lausanne-marathon.com/inscription/inscriptions/inscription-online/">the official website</a>, each participant is asked to indicate their "estimated time of running for the attribution of start blocks", in order to group the runners accordingly.

## Category study

Now that we have the distance for each runner, we compute each runner's average speed (in metres per second).

In [None]:
# Average speed in m/s: distance converted from km to m, divided by the running time
# ('time' holds seconds once the format_time cell above has been run).
lausanne_marathon_2016_cleaned['Speed (m/s)'] = lausanne_marathon_2016_cleaned['distance (km)']*1000/lausanne_marathon_2016_cleaned['time']
# Label each runner as an individual or a team runner (based on whether 'team' is null).
lausanne_marathon_2016_cleaned['type_team'] = lausanne_marathon_2016_cleaned.apply(compute_run_in_team, axis=1)

In [None]:
# make selection on distance.
lausanne_marathon_2016_cleaned_10 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 10]
lausanne_marathon_2016_cleaned_21 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 21]
lausanne_marathon_2016_cleaned_42 = lausanne_marathon_2016_cleaned[lausanne_marathon_2016_cleaned['distance (km)'] == 42]

In [None]:
# Create figure.
fig = plt.figure()
fig.tight_layout
fig.set_size_inches(10,10)

# 10 K --------------------------------------------------------------------------------------------
ax1 = fig.add_subplot(311)
race_speed_10 = lausanne_marathon_2016_cleaned_10['Speed (m/s)'].tolist()
ax1.hist(race_speed_10, bins=25)
ax1.set_ylabel('Number of Runners')
ax1.set_title('Distance = 10K')
ax1.set_xlabel('Speed (m/s)')
ax1.xaxis.set_label_coords(1.15,-0.025)

# Set axis
x = np.arange(0,6.5,0.5)
y = np.arange(0,900,100)
plt.xticks(x)
plt.yticks(y)

#Compute important informations
avg_time_10 = round(np.mean(race_speed_10), 4)
med_time_10 = round(np.median(race_speed_10), 4)
max_speed_10 = round(np.max(race_speed_10), 2)
min_speed_10 = round(np.min(race_speed_10), 2)
total_10 = len(race_speed_10)
ax1.axvline(med_time_10, 0, 1750, color='r', linestyle='--')

#Create string with statistics:
med_10 = 'Median: ' + str(avg_time_10) + ' m/s'
mean_10 = 'Mean: ' + str(med_time_10)  + ' m/s'
max_10 = 'Max: ' + str(max_speed_10)   + ' m/s'
min_10 = 'Min: ' + str(min_speed_10)   + ' m/s'
total_10 = 'Total: ' + str(total_10)   + ' runners'
std_10 = 'SD: ' + str(round(np.std(race_speed_10), 2)) + 's'
stats10 = total_10 + ' \n' + med_10 + ' \n' + mean_10 + ' \n' + max_10 + ' \n' + min_10 + ' \n' + std_10

# Add information to graph
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax1.text(.95, .95, stats10, fontsize=12, transform=ax1.transAxes, va='top', ha='right', bbox=props, multialignment='left')

# 21 K --------------------------------------------------------------------------------------------
ax2 = fig.add_subplot(312)
race_speed_21 = lausanne_marathon_2016_cleaned_21['Speed (m/s)'].tolist()
ax2.hist(race_speed_21, bins=25)
ax2.set_ylabel('Number of Runners')
ax2.xaxis.set_label_coords(1.15,-0.025)
ax2.set_xlabel('Speed (m/s)')
ax2.set_title('Distance = 21K')

# Set axis
x = np.arange(0,6.5,0.5)
y = np.arange(0,700,100)
plt.xticks(x)
plt.yticks(y)

#Compute important informations
avg_time_21 = round(np.mean(race_speed_21), 4)
med_time_21 = round(np.median(race_speed_21) ,4)
max_speed_21 = round(np.max(race_speed_21), 2)
min_speed_21 = round(np.min(race_speed_21), 2)
total_21 = len(race_speed_21)
ax2.axvline(med_time_21, 0, 1750, color='r', linestyle='--')

#Create string with statistics:
med_21 = 'Median: ' + str(avg_time_21) + ' m/s'
mean_21 = 'Mean: ' + str(med_time_21)  + ' m/s'
max_21 = 'Max: ' + str(max_speed_21)   + ' m/s'
min_21 = 'Min: ' + str(min_speed_21)   + ' m/s'
total_21 = 'Total: ' + str(total_21)   + ' runners'
std_21 = 'SD: ' + str(round(np.std(race_speed_21), 2)) + 's'
stats21 = total_21 + ' \n' + med_21 + ' \n' + mean_21 + ' \n' +  max_21 + ' \n' + min_21 + ' \n' + std_21

# Add information to graph
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax2.text(.95, .95, stats21, fontsize=12, transform=ax2.transAxes, va='top', ha='right', bbox=props, multialignment='left')


# 42 K --------------------------------------------------------------------------------------------
ax3 = fig.add_subplot(313)
race_speed_42 = lausanne_marathon_2016_cleaned_42['Speed (m/s)'].tolist()
ax3.hist(race_speed_42, bins=25)
ax3.set_ylabel('Number of Runners')
ax3.set_xlabel('Speed (m/s)')
ax3.xaxis.set_label_coords(1.15,-0.025)
ax3.set_title('Distance = 42K')

# set axis
x = np.arange(0,6.5,0.5)
y = np.arange(0,250,50)
plt.xticks(x)
plt.yticks(y)

# Compute important informations
avg_time_42 = round(np.mean(race_speed_42), 4)
med_time_42 = round(np.median(race_speed_42), 4)
total_42 = len(race_speed_42)
max_speed_42 = round(np.max(race_speed_42), 2)
min_speed_42 =  round(np.min(race_speed_42) ,2)
ax3.axvline(med_time_42, 0, 1750, color='r', linestyle='--')

# Create string for displaying important informations
med_42 = 'Median: ' + str(avg_time_42) + ' m/s'
mean_42 = 'Mean: ' + str(med_time_42)  + ' m/s'
total_42 = 'Total: ' + str(total_42)   + ' runners'
max_42 = 'Max: ' + str(max_speed_42)   + ' m/s'
min_42 = 'Min: ' + str(min_speed_42)   + ' m/s'
std_42 = 'SD: ' + str(round(np.std(race_speed_42), 2)) + 's'
stats42 = total_42 + ' \n' + med_42 + ' \n' + mean_42 + ' \n' + max_42 + ' \n' + min_42 + ' \n' + std_42

# Add information to graph
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
ax3.text(.95, .95, stats42, fontsize=12, transform=ax3.transAxes, va='top', ha='right', bbox=props, multialignment='left')

As we could expect, the fastest runners are found in the 10 km race: since the distance is shorter, the average speed can be higher.
Unexpectedly, however, the average speed seems very close across races despite the difference in distance. To understand this phenomenon, we need to go further and study the running history of the participants.

In [None]:
# Box plots of speed by sex, one panel (column) per race distance.
# NOTE(review): factorplot and its `size` argument were renamed catplot / `height` in
# later seaborn releases — confirm against the installed version before upgrading.
plot = sns.factorplot(data=lausanne_marathon_2016_cleaned, x='sex', y='Speed (m/s)', col = 'distance (km)',size=5, kind='box')
plot.set_xlabels('Gender')
# Overrides the figure size derived from size=5 above.
plot.fig.set_size_inches(10,6)

In [None]:
fig, ax1 = plt.subplots()
fig.set_size_inches(6,6)

# Number of individual/team runners, one bar per race distance in each group.
sns.countplot(x='type_team', hue='distance (km)', data=lausanne_marathon_2016_cleaned, ax=ax1)
ax1.set_title('Distribution of individual/team runners for the Lausanne race 2016')
ax1.set_xlabel('')
ax1.set_ylabel('Number of Runners')

# Number of runners of each race, in the order of the hue levels (10, 21, 42 km).
totals = [
    len(lausanne_marathon_2016_cleaned_10),
    len(lausanne_marathon_2016_cleaned_21),
    len(lausanne_marathon_2016_cleaned_42),
]
# Number of bars drawn per race: one per 'type_team' value.
bars_per_race = lausanne_marathon_2016_cleaned['type_team'].nunique()

# Display on each bar its share of the runners of the same race.
# This assumes the bars are stored race by race (all the bars of the 10 km race first,
# then 21 km, then 42 km) — NOTE(review): confirm with the installed seaborn version.
for position, bar in enumerate(ax1.patches):
    race_total = totals[position // bars_per_race]
    ax1.annotate('{:.1f}%'.format(bar.get_height() * 100 / race_total), (bar.get_x() + 0.1, bar.get_height() + 50))

# tight_layout must actually be called, and only once all artists exist.
fig.tight_layout()
        

The split between individual and team runners seems fairly similar across the different races offered by the event.
Teams are most present in the 10 km race. This distance seems better suited to teams: it is short enough to be a good challenge among beginner friends.

Let's see whether being in a team has a real impact on performance.

In [None]:
# Box plots of speed for individual vs team runners, one panel (column) per race distance.
# NOTE(review): factorplot was renamed catplot in later seaborn releases — confirm
# against the installed version before upgrading.
plot = sns.factorplot(data=lausanne_marathon_2016_cleaned, x='type_team', y='Speed (m/s)', col = 'distance (km)', kind='box')
plot.set_xlabels('')
# Overrides the default figure size of the grid.
plot.fig.set_size_inches(10,6)

Teams are indeed faster on average than individual runners, but the best performances come from individual runners.
From the graph above, we can see that for "average" runners, being in a team seems to be a motivating factor, whereas excellent runners prefer to race alone.