In [None]:
# Data analysis 
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import collections
from matplotlib.pyplot import show
import sys
sys.path.append('..')

# our code (marked for autoreload before every cell execution - useful in development mode)
%load_ext autoreload
%autoreload 1
%aimport study_utils
%aimport lausanne_1999_2016_utils

In [None]:
def get_data(path_to_data='../../Scraping/DataSport/Data/Lausanne_Marathon_',
             extension='.pickle', first_year=1999, last_year=2016):
    '''
    Load the pickled runner tables, one file per year, and stack them
    into a single DataFrame.

    The file read for a given year is path_to_data + <year> + extension.
    A 'year' column holding that year is added to each table before stacking.

    Parameters
        - path_to_data: path prefix shared by all the yearly files
        - extension: suffix appended after the year in each file name
        - first_year: first year to load (inclusive)
        - last_year: last year to load (inclusive)

    return: dataframe with all data. The row index of every yearly table is
            kept as-is, so index labels may repeat across years.

    Raises ValueError (from pd.concat) when the year range is empty.
    '''
    # NOTE: unpickling can execute arbitrary code - only point this at trusted files.
    yearly_frames = []

    for year in range(first_year, last_year + 1):
        yearly_frame = pd.read_pickle(path_to_data + str(year) + extension)
        # remember which edition each record comes from.
        yearly_frame['year'] = year
        yearly_frames.append(yearly_frame)

    return pd.concat(yearly_frames)

def apply_computation(lausanne_marathon):
    '''
    Derive the analysis columns from the raw runner records.

    Steps, in order:
        - 'sex' via study_utils.get_sex_of_runner; rows with no sex are dropped
        - 'type' via study_utils.get_type_of_runner
        - the 'rank' column is dropped
        - rows with no 'birthday' are dropped; 'age' via
          study_utils.compute_age_of_runner, then cast to int
        - rows with no 'time' are dropped; 'time' is rewritten by
          study_utils.format_time
        - 'distance (km)' via study_utils.compute_distance_from_category
        - 'speed (m/s)' = 'distance (km)' * 1000 / 'time'
          (presumably 'time' is in seconds at that point - confirm in study_utils)
        - 'type_team' via study_utils.compute_run_in_team
        - per-year ranking via compute_overall_rank_all; 'overall_rank' cast to int

    Parameters
        - lausanne_marathon: DataFrame of raw records; it must provide at least
          the 'rank', 'birthday', 'time' and 'year' columns, plus whatever the
          study_utils helpers read from each row.

    return: a new DataFrame with the columns above. The input frame is left
            untouched.
    '''
    # Work on a private copy so the caller's frame does not silently gain a 'sex' column.
    lausanne_marathon = lausanne_marathon.copy()

    # Silence SettingWithCopyWarning for this function only. option_context puts
    # the previous setting back on exit, even when one of the steps raises.
    with pd.option_context('mode.chained_assignment', None):

        # compute gender of the runner.
        lausanne_marathon['sex'] = lausanne_marathon.apply(study_utils.get_sex_of_runner, axis=1)
        lausanne_marathon = lausanne_marathon[lausanne_marathon['sex'].notnull()]

        # Adult / Junior
        lausanne_marathon['type'] = lausanne_marathon.apply(study_utils.get_type_of_runner, axis=1)

        # clean the rank attribute
        lausanne_marathon = lausanne_marathon.drop('rank', axis=1)

        # compute age from birthdate.
        lausanne_marathon = lausanne_marathon[lausanne_marathon['birthday'].notnull()] # 192 values removed.
        lausanne_marathon['age'] = lausanne_marathon.apply(study_utils.compute_age_of_runner, axis=1)
        lausanne_marathon['age'] = lausanne_marathon['age'].apply(lambda x : int(float(x)))

        # Compute time
        lausanne_marathon = lausanne_marathon[lausanne_marathon['time'].notnull()] # 706 values removed.
        lausanne_marathon['time'] = lausanne_marathon.apply(study_utils.format_time, axis=1)

        # Compute distance
        lausanne_marathon['distance (km)'] = lausanne_marathon.apply(study_utils.compute_distance_from_category, axis=1)

        # compute speed (km -> m before dividing by the time)
        lausanne_marathon['speed (m/s)'] = lausanne_marathon['distance (km)']*1000/lausanne_marathon['time']

        # compute whether the runner ran in a team.
        lausanne_marathon['type_team'] = lausanne_marathon.apply(study_utils.compute_run_in_team, axis=1)

        # rank the runners year by year, then store the rank as a plain int.
        lausanne_marathon = compute_overall_rank_all(lausanne_marathon)
        lausanne_marathon['overall_rank'] = lausanne_marathon['overall_rank'].apply(lambda x : int(float(x)))

    return lausanne_marathon
    
def compute_overall_rank_all(lausanne_marathon):
    '''
    Rank the runners separately for every year found in the data.

    The frame is split on its 'year' column; each yearly slice is handed to
    lausanne_1999_2016_utils.compute_overall_rank (the ranking rule itself
    lives there) and the results are stacked back together.

    Parameters
        - lausanne_marathon: DataFrame of records with a 'year' column

    return: the concatenation of the per-year results, in order of first
            appearance of each year.
    '''

    year_column = lausanne_marathon['year']
    ranked_editions = []

    for edition in year_column.unique():
        print('computing rank for: ' + str(edition))
        records_of_edition = lausanne_marathon[year_column == edition]
        ranked = lausanne_1999_2016_utils.compute_overall_rank(records_of_edition)
        ranked_editions.append(ranked)

    # stack the yearly results back into one dataframe.
    return pd.concat(ranked_editions)

In [None]:
# Load the yearly pickles into one raw DataFrame (one 'year' value per source file).
lausanne_marathon = get_data()

In [None]:
# Preview the first rows of the raw data.
lausanne_marathon.head()

In [None]:
# NOTE: this rebinds lausanne_marathon to the processed frame, so the raw data is no
# longer reachable under this name. apply_computation drops the 'rank' column, so
# running this cell a second time is expected to fail - re-run the loading cell first.
lausanne_marathon = apply_computation(lausanne_marathon)

In [None]:
# Preview the first rows of the processed data, including the derived columns.
lausanne_marathon.head()