In [52]:
%%HTML
<style>
    body {
        --vscode-font-family: "Inter";
        font-size: 15px;``
    }
</style>

# **Dependencies**
* Pandas
* Plotly
* nbformat

# **Distance Running Predictions**

The objective of this project is to create a model that predicts race time given the distance and your history of running performance

In [53]:
import plotly.graph_objects as go
import pandas as pd

onedark_theme = {
    'red': '#e06c75',
    'bg': '#23272e',
    'txt': '#8b8d90',
    'txt_dark': '#1e2227'
    }
theme = onedark_theme

def choose_label(_name,dotname,axis):

    if _name is not None and dotname is not None:
        return _name
    elif _name is not None and dotname is None:
        return _name
    elif _name is None and dotname is not None:
        return dotname
    else:
        return axis

def my_scatter(x,y,z,
               c,
               cmap='viridis',
               aspectratio=dict(x=1,y=1,z=1),
               height=None,
               width=None,
               markersize=5
               ):
    '''
    Helper function that plots a scatter using plotly

    c: list or np.ndarray or pd.Series - the numerical value associated with the colour map
    cmap: string - colour map
    aspectratio: dict - aspect ratio 

    '''

    x = pd.Series(x)
    y = pd.Series(y)
    z = pd.Series(z)


    trace_data = go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode='markers',
        marker=dict(
            size=markersize,
            color=c,
            colorscale=cmap,
            opacity=0.8
        )
    )

    fig = go.Figure(data = trace_data)

    fig.update_layout(
        autosize=False if not (height is None and width is None) else True,
        width=width,
        height=height,
        margin = dict(l=0,r=0,b=0,t=0),
        paper_bgcolor=theme['bg'],
        font_color = theme['txt'],
        scene = dict(
            xaxis = dict(     
                title = 'x' if x.name is None else x.name,
                backgroundcolor=theme['bg'],
                gridcolor=theme['txt'],
                showbackground=True,
                zerolinecolor=theme['txt'],
                titlefont=dict(family='Inter')
                # range=
            ),
            yaxis = dict(
                title = 'y' if y.name is None else y.name,
                backgroundcolor=theme['bg'],
                gridcolor=theme['txt'],
                showbackground=True,
                zerolinecolor=theme['txt'],
                titlefont=dict(family='Inter')
                # range=
            ),
            zaxis = dict(
                title = 'z' if z.name is None else z.name,
                backgroundcolor=theme['bg'],
                gridcolor=theme['txt'],
                showbackground=True,
                zerolinecolor=theme['txt'],
                titlefont=dict(family='Inter')
                # range=
            ),
            aspectratio=aspectratio,
            camera = dict(projection_type="orthographic")
        )
    )

    return fig

import plotly.express as px

def my_barchart(x: pd.Series,                
                y: pd.Series,
                x_name=None,
                y_name=None,
                width=None,
                height=None,
                ):
    
    x=pd.Series(x)
    y=pd.Series(y)
    x_name = choose_label(x_name,x.name,'x')
    y_name = choose_label(y_name,y.name,'y')
    
    df= pd.DataFrame({x.name: x, y.name: y})

    fig = px.bar(df,x=x,y=y)

    fig.update_xaxes(gridcolor=theme['txt_dark'], title_text=x_name, title_font=dict(family='Inter'), linecolor=theme['txt'], linewidth=3)
    fig.update_yaxes(gridcolor=theme['txt_dark'], title_text=y_name, title_font=dict(family='Inter'))
    fig.update_traces(marker=dict(line=dict(color=theme['txt_dark'])))

    fig.update_layout(
        autosize=False if not (height is None and width is None) else True,
        width=width,
        height=height,
        margin = dict(l=0,r=0,b=0,t=0),
        paper_bgcolor=theme['bg'],
        plot_bgcolor=theme['bg'],
        font_color = theme['txt'],
    )

    return fig

def my_cumulative(x, x_name = None, y_name = None):

    x = pd.Series(x)
    x_name = choose_label(x_name,x.name,'x')
    df = pd.DataFrame({x_name: x})

    y_name = 'Cumulative Distribution' if y_name is None else y_name

    fig = px.ecdf(df,x=x_name)

    fig.update_xaxes(gridcolor=theme['txt'], title_text=x_name, title_font=dict(family='Inter'), linecolor=theme['txt'], linewidth=3)
    fig.update_yaxes(gridcolor=theme['txt'], title_text=y_name, title_font=dict(family='Inter'))
    fig.update_traces(line=dict(color=theme['red'],width=5))


    fig.update_layout(
        margin = dict(l=50,r=50,b=50,t=50),
        paper_bgcolor=theme['bg'],
        plot_bgcolor=theme['bg'],
        font_color = theme['txt'],
    )

    return fig

def my_freqdist(x,x_name=None,height=None,width=None,n_bins=None):

    x=pd.Series(x)
    x_name = choose_label(x_name,x.name,'x')
    df=pd.DataFrame({x_name:x.values})

    fig = px.histogram(df,x_name,nbins=n_bins)
    
    fig.update_xaxes(gridcolor=theme['txt'], title_text=x_name, title_font=dict(family='Inter'), linecolor=theme['txt'], linewidth=3)
    fig.update_yaxes(gridcolor=theme['txt'], title_font=dict(family='Inter'))

    fig.update_traces(marker_color=theme['red'])
    

    fig.update_layout(
        margin = dict(l=50,r=50,b=50,t=50),
        autosize=True if (height is None and width is None) else False,
        width=width,
        height=height,
        paper_bgcolor=theme['bg'],
        plot_bgcolor=theme['bg'],
        font_color = theme['txt'],
    )

    return fig

In [54]:
import numpy as np
from datetime import datetime

# import the dataset as a DataFrame
data = pd.read_csv("data/raw-data-kaggle.csv", delimiter=";")

data.head()

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm)
0,18042525,M,15/12/2019 09:08,2965.8,812,17.4,150.3
1,18042525,M,10/12/2019 19:27,10020.8,3290,52.2,160.8
2,18042525,M,03/12/2019 19:46,12132.2,4027,249.0,148.9
3,18042525,M,26/11/2019 19:46,11631.5,4442,194.0,136.2
4,18042525,M,19/11/2019 19:45,11708.1,4022,250.7,146.0


In [55]:
print(f'Number of unique athletes: {data["athlete"].nunique()}')
print(f'Number of genders: {data["gender"].nunique()}')
print(f"Precision of elevation gain: {data['elevation gain (m)'].apply(lambda x: len(str(x).split('.')[1]) if '.' in str(x) else 0).max()} m")

Number of unique athletes: 116
Number of genders: 2
Precision of elevation gain: 1 m


The raw data contains 42,116 data points. There are 7 variables in the data. 

Some observations:
* ```gender``` appears is boolean in nature (only either Male or Female)
* ```athlete``` is categorical data recorded as the numerical user ID. 
* Despite there being so many data points, there are only 116 different athletes - this means there is a lot of data per athlete.
* The other 5 columns are continuous numerical data. However, elevation gain is recorded as an integer.

# **Exploratory Data Analysis (EDA)**

The objectives of EDA are:
1. Confirm if the data is suitable for the application
1. Discover and resolve data quality issues (missing data, duplicates, incorrect values, anomalies, incorrect data types)
1. Check the data's mean, median, mode skewness, range, count etc.
1. Detect outliers and anomalies
1. Understand data patterns and correlations between variables

## **Data Cleaning**

### **Extracting Useful Columns**


I would like to make the running time predictions based solely on an individual's performance. Therefore, I will be excluding gender as a parameter for the model.

In [56]:
data = data.drop(columns = 'gender')

All the other columns should be useful and will be kept

### **Variable Formatting**

Here I'll check if every row of each column has the same data type, and if each column has the appropriate data type.

In [57]:
data.dtypes

athlete                       int64
timestamp                    object
distance (m)                float64
elapsed time (s)              int64
elevation gain (m)          float64
average heart rate (bpm)    float64
dtype: object

In [58]:
# convert timestamp to dtype datetime 
data["timestamp"] = pd.to_datetime(data["timestamp"])

In [59]:
data.dtypes

athlete                              int64
timestamp                   datetime64[ns]
distance (m)                       float64
elapsed time (s)                     int64
elevation gain (m)                 float64
average heart rate (bpm)           float64
dtype: object

### **Missing Values**

Let's check if there are null, nan or missing values anywhere

In [60]:
data.isnull().sum()

athlete                         0
timestamp                       0
distance (m)                    0
elapsed time (s)                0
elevation gain (m)              0
average heart rate (bpm)    18384
dtype: int64

In [61]:
print(f'Average number of rows per athlete: {data.shape[0]/data["athlete"].nunique():.2f}')
print(f'Percentage of rows that are missing heart rate: {23732/42116*100:.1f} %')

Average number of rows per athlete: 363.07
Percentage of rows that are missing heart rate: 56.3 %


From the above, we see that there are null values in the gender and heart rate columns. 

The more problematic column is average heart rate, which has a null value in the majority of its rows. <!-- While this would have been an extremely useful metric to infer athlete capability and race times, there is insufficient data to safely impute the missing values. Therefore, heart rate should be omitted as a variable in the model on the basis of insufficient data. -->

<!-- Let's not exclude heart rate just yet! Let's check the proportion of athletes with heart rate data entirely missing. If too many athletes are missing heart rate data entirely, it is not a useful metric. If each athlete has a proportionally small amount of heart rate data missing, we can impute it per-athlete, given the large amount of data per-athlete. -->

**Feature Engineering: Pace (minutes per km)**

Before I analyse heart rate, I'll create a new feature for each row: pace. This is the average speed of the run in units of minutes per kilometre. This will allow be to do further data analysis based on the average speed of the runs.

In [62]:
data["pace (min/km)"] = (data["elapsed time (s)"]/60)/(data["distance (m)"]/1000)

#### **Missing Heart Rate Data**

In [63]:
# count the number of athletes that are entirely missing any heart rate data
allnull = data.groupby('athlete')['average heart rate (bpm)'].apply(lambda x: x.isnull().all()).sum()

print(f'There are {allnull} athletes that have no heart rate data at all.')

# count the number of rows with missing heart rate per athlete, as a proportion of their total number of rows
null_counts = data[data['average heart rate (bpm)'].isnull()].groupby('athlete').size()
row_per_athlete = data.groupby('athlete').size()
nullpc_per_athlete = null_counts/row_per_athlete

print(f'{(nullpc_per_athlete>0.4).sum()/len(nullpc_per_athlete)*100:.1f}% of athletes have no heart rate data in more than 40% of their recorded runs')

There are 10 athletes that have no heart rate data at all.
50.0% of athletes have no heart rate data in more than 40% of their recorded runs


In [64]:
my_cumulative(nullpc_per_athlete, x_name="Percentage of athlete's runs that are missing HR")

##### **Discussion on Heart Rate**
Half of the athletes are missing HR data on >40% of their runs. There is insufficient data to impute the missing values. One option is to omit heart rate from the variables on which the models will be trained.

Alternatively, I could ignore the 56% of data that is missing heart rate, and only keep the remaining data. While I would be throwing away 56% of the data, there are still 26 thousand data points, which could be ample for my application. Heart rate is a very useful metric as it is the only objective variable in the dataset that indicates the level of effort exerted on a run.

On the basis of the importance of heart rate as a indicative parameter, I will choose to exclude all the data without heart rate.

In [65]:
data = data.dropna(subset=['average heart rate (bpm)']) # drop rows with missing heart rate
data.drop(data[data['average heart rate (bpm)'] == 0.0].index, inplace=True)
print(f"I've now got {data.shape[0]} data points with heart rate data")

I've now got 23708 data points with heart rate data


With the dropped rows, lets see how many runs each athelete has recorded.

In [66]:
row_per_athlete = data.groupby('athlete').size()
my_cumulative(row_per_athlete, x_name="Rows per athlete")
# my_freqdist(row_per_athlete,x_name="Rows per athlete",n_bins=100)

70% of athletes have at least 100 data points. 

### **Drop Duplicates**

In [67]:
data = data.drop_duplicates()

print(f'There are now {data.shape[0]} rows that are unique')

There are now 23702 rows that are unique


## **Rubrik for Analysis**


In [76]:
data.head()

Unnamed: 0,athlete,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
0,18042525,2019-12-15 09:08:00,2965.8,812,17.4,150.3,4.563131
1,18042525,2019-10-12 19:27:00,10020.8,3290,52.2,160.8,5.471952
2,18042525,2019-03-12 19:46:00,12132.2,4027,249.0,148.9,5.53211
3,18042525,2019-11-26 19:46:00,11631.5,4442,194.0,136.2,6.3649
4,18042525,2019-11-19 19:45:00,11708.1,4022,250.7,146.0,5.725381


In [80]:
data.max()

athlete                                45799771
timestamp                   2020-06-01 05:24:00
distance (m)                           218950.0
elapsed time (s)                          96372
elevation gain (m)                      11128.0
average heart rate (bpm)                  237.0
pace (min/km)                               inf
dtype: object

In [81]:
data.min()

athlete                                  771514
timestamp                   2000-04-01 12:06:00
distance (m)                                0.0
elapsed time (s)                              1
elevation gain (m)                          0.0
average heart rate (bpm)                   26.8
pace (min/km)                          0.050444
dtype: object

In [79]:
maxmin = dict()
maxmin["timestamp"] = {"max": datetime.today()}

## **Univariate Analysis**

### **Univariate Analysis: Heart Rate**

The world record for the slowest resting heart rate in a healthy human is 27 BPM. Therefore, runs with average heart rate below this value are practically impossible. Some athletes have resting heart rates as low as 40 BPM, while this would be 80 BPM for the average person.

In [149]:
my_freqdist(data['average heart rate (bpm)'])

In [150]:
data[data['average heart rate (bpm)']<40]

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
806,10825360,M,30/12/2019 09:34,17393.0,12745,807.0,36.9,12.212768
31292,12642956,M,04/01/2019 14:14,2471.0,9327,0.0,26.8,62.909753


The low heart rates above do not appear to be errors, as they are accompanied by extremely slow paces, but I will remove them as these paces will not be beneficial to the dataset. The upper end of the maximum heart rate is not implausible.

In [151]:
data.drop(data[data['average heart rate (bpm)']<40].index, inplace=True)
print(f"I've now got {data.shape[0]} data points")

I've now got 23700 data points


### **Looking into Distance**

From the previous step, I know that no distance values are missing. Let's check if any distance values are 0.

In [152]:
data[data["elevation gain (m)"] == 0]

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
114,18042525,M,09/01/2018 19:46,2126.2,726,0.0,123.0,5.690904
161,18042525,M,04/05/2017 16:09,9539.6,2972,0.0,143.4,5.192391
163,18042525,M,27/04/2017 16:37,6177.7,1679,0.0,158.0,4.529733
167,18042525,M,17/04/2017 15:17,5160.3,1405,0.0,154.0,4.537850
170,18042525,M,31/03/2017 16:49,5253.1,1419,0.0,158.7,4.502104
...,...,...,...,...,...,...,...,...
41292,5035018,F,14/11/2018 18:47,447.5,204,0.0,151.3,7.597765
41304,5035018,F,17/10/2018 17:43,548.6,215,0.0,150.3,6.531778
41519,5035018,F,02/07/2017 08:36,1483.1,431,0.0,161.2,4.843459
41559,5035018,F,20/03/2017 12:40,2725.1,803,0.0,161.9,4.911135


In [153]:
print(f'There are {data[data["distance (m)"] == 0].shape[0]} rows without distance data')
print(f'All rows missing distance are also missing elevation: {np.all(data[data["distance (m)"] == 0] == data[(data["distance (m)"] == 0) & (data["elevation gain (m)"] == 0)])}')
data[(data["distance (m)"] == 0) & (data["elapsed time (s)"] < 10)]

There are 37 rows without distance data
All rows missing distance are also missing elevation: True


Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
5566,3539760,M,17/02/2019 19:43,0.0,7,0.0,76.9,inf
8032,6215611,M,21/04/2018 14:32,0.0,4,0.0,111.3,inf
13893,12740383,F,18/12/2019 06:38,0.0,2,0.0,119.0,inf
16570,17639804,M,15/11/2016 16:53,0.0,5,0.0,73.8,inf
19847,18736169,M,14/03/2018 14:07,0.0,2,0.0,99.5,inf
25022,2081128,M,28/04/2017 07:54,0.0,2,0.0,75.0,inf
26861,17199625,M,06/11/2016 08:50,0.0,1,0.0,89.0,inf
40981,5688059,M,21/01/2014 20:10,0.0,4,0.0,123.0,inf
40986,5688059,M,15/01/2014 19:42,0.0,4,0.0,114.0,inf


Many of the rows without distance also have unrealistically short elapsed time. All of the rows without distance are also missing elevation gain. As the rows missing distance are also missing other data, I'll delete them instead of imputing the missing data. This will prevent the creation of data points that are too artificial.

In [154]:
data.drop(data[data["distance (m)"] == 0.0].index, inplace=True)
print(f"I've now got {data.shape[0]} data points")

I've now got 23663 data points


**Remove sprint distances, too** - these could distort the model by creating a more complicated 'surface'

Above distances of 1500m (~1 mile), there is a significantly higher proportion of aerobic activity compared to shorter distances. Anecdotally, this is also the distance above which most non-runners cannot complete while running throughout. Therefore, I'll use this as our cut-off point. 

<[source](https://readysetmarathon.com/what-is-considered-distance-running/)>


In [155]:
data.drop(data[data["distance (m)"] < 1500].index, inplace=True)
print(f"I've now got {data.shape[0]} data points")

I've now got 23180 data points


In [157]:
print(f'Longest distance: {data["distance (m)"].max()}')
data[data["distance (m)"] == data["distance (m)"].max()]

Longest distance: 218950.0


Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
16127,17639804,M,18/10/2018 19:59,218950.0,96372,9729.0,134.0,7.335921


The data row with the longest distance is implausible. The elapsed time is greater than a day, and the elevation gain is greater than Mount Everest. I'll go through time and elevation later on, so I'll leave this row in here for now.

### **Looking into Elevation**

In [158]:
my_freqdist(data["elevation gain (m)"])

In [101]:
print(f'Highest elevation gain: {data["elevation gain (m)"].max()} m')
print(f'{data[data["elevation gain (m)"] > 8849].shape[0]} rows with elevation gain higher than Mount Everest')

Highest elevation gain: 11128.0 m
2 rows with elevation gain higher than Mount Everest


The highest elevation gain in the data is 11,128 m. This is taller than Mount Everest, which is "only" 8,849 m high. I'll delete the row

### **Looking into Pace**

In [124]:
print(f'Fastest pace {data["pace (min/km)"].min():.5f} min/km')
print(f'Slowest pace {data["pace (min/km)"].max():.5f} min/km')
print(f"World record 100m pace (Usain Bolt): {9.58/60/0.1:.5f} min/km")
print(f"World record 60m pace (Christian Coleman): {6.34/60/0.06:.5f} min/km")

Fastest pace 0.17581 min/km
Slowest pace 71.37915 min/km
World record 100m pace (Usain Bolt): 1.59667 min/km
World record 60m pace (Christian Coleman): 1.76111 min/km


In [125]:
print(f"Implausibly fast rows: {data[data['pace (min/km)'] < 1.59].size}")
print(f"Percentage of total: {data[data['pace (min/km)'] < 1.59].size/data.shape[0]*100:.2f} %")
data[data["pace (min/km)"] < 1.59]

Implausibly fast rows: 48
Percentage of total: 0.21 %


Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km)
11128,3460571,M,26/05/2019 07:09,33105.1,2752,935.6,144.4,1.385486
12261,2913078,F,17/10/2016 17:03,2130.7,189,0.0,127.8,1.478387
13953,12740383,F,23/08/2019 23:22,41746.6,2001,2513.0,123.6,0.798867
19643,18736169,M,05/02/2019 16:59,10683.0,399,47.6,135.3,0.622484
31301,12642956,M,01/12/2018 06:36,25866.0,543,148.2,70.8,0.34988
37941,22891135,M,23/06/2018 04:31,36593.1,386,347.4,144.7,0.175807


There are 48 rows with athletes running faster than Usain Bolt's world record for distances greater than 1.5 km. I can't impute time as it's the dependent variable, but I _can_ impute distance.


Calculate absolute time for the timestamp, with 1 Jan 2000 as datum - the datum doesn't matter as I'll be normalising it


In [19]:
data["time"] = data["timestamp"].apply(lambda x: (datetime.strptime(x, "%d/%m/%Y %H:%M") - datetime(2000, 1, 1)).total_seconds())

data.head()

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km),time
0,18042525,M,15/12/2019 09:08,2965.8,812,17.4,150.3,4.563131,629716080.0
1,18042525,M,10/12/2019 19:27,10020.8,3290,52.2,160.8,5.471952,629321220.0
2,18042525,M,03/12/2019 19:46,12132.2,4027,249.0,148.9,5.53211,628717560.0
3,18042525,M,26/11/2019 19:46,11631.5,4442,194.0,136.2,6.3649,628112760.0
4,18042525,M,19/11/2019 19:45,11708.1,4022,250.7,146.0,5.725381,627507900.0


## **Feature Engineering**

In this step, I will implement my domain knowledge of distance running to select and transform the most relevant variables from raw data.


### **Frequency Distributions**

Let's plot the frequency distribution of the columns, to decide whether I need to transform any of the columns

In [20]:
my_freqdist(data['time'])

In [21]:
my_freqdist(data['pace (min/km)'])

In [22]:
my_freqdist(data['elevation gain (m)'])

In [23]:
my_freqdist(data['elapsed time (s)'])

In [24]:
my_freqdist(data['distance (m)'])

In [25]:
my_freqdist(data['average heart rate (bpm)'])

In [26]:
data.loc[data['distance (m)']>200000]

# you also need to throw out anything with an unrealistic pace (e.g. faster than 2 minutes per km)

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km),time
16127,17639804,M,18/10/2018 19:59,218950.0,96372,9729.0,134.0,7.335921,593207940.0


Time is heavily skewed to the right and should be transformed towards the left.

Pace, elevation, elapsed time and distance are all highly skewed to the left and should be transformed towards the right

### **Outliers**

### **Normalise**

In [27]:
def normalise(series):
    '''
    Normalises the values of a pd.Series object
    '''

    return (series - series.min()) / (series.max() - series.min())

# create new columns for normalised data
data["distance"] = normalise(data["distance (m)"])
data["elevation"] = normalise(data["elevation gain (m)"])
data["hr"] = normalise(data["average heart rate (bpm)"])
data["pace"] = normalise(data["pace (min/km)"])

data.head()

Unnamed: 0,athlete,gender,timestamp,distance (m),elapsed time (s),elevation gain (m),average heart rate (bpm),pace (min/km),time,distance,elevation,hr,pace
0,18042525,M,15/12/2019 09:08,2965.8,812,17.4,150.3,4.563131,629716080.0,0.006736,0.001564,0.587536,0.061617
1,18042525,M,10/12/2019 19:27,10020.8,3290,52.2,160.8,5.471952,629321220.0,0.03918,0.004691,0.637488,0.074381
2,18042525,M,03/12/2019 19:46,12132.2,4027,249.0,148.9,5.53211,628717560.0,0.04889,0.022376,0.580875,0.075225
3,18042525,M,26/11/2019 19:46,11631.5,4442,194.0,136.2,6.3649,628112760.0,0.046587,0.017434,0.520457,0.086921
4,18042525,M,19/11/2019 19:45,11708.1,4022,250.7,146.0,5.725381,627507900.0,0.04694,0.022529,0.567079,0.07794


Let's start by plotting average heart rate (HR) on the z-axis, against distance and pace on the x and y axes, for a few individuals. We might use more than these 3 variables to perform the prediction, but using 3 variables allows us to easily visualise the data. This visualisation will give us an idea of how well the data is correlated.

In [28]:
my_scatter(x=data.loc[data["athlete"] == 18042525]["pace (min/km)"],
           y=data.loc[data["athlete"] == 18042525]["distance (m)"],
           z=data.loc[data["athlete"] == 18042525]["average heart rate (bpm)"],
           c=data.loc[data["athlete"] == 18042525]["average heart rate (bpm)"],
           height=600)

Now that the data is processed and normalised, I'll create an initial Neural Network (NN) to see the minimum number of nodes to capture the input/output map with at least 90% accuracy for a given user. This will allow me to decide whether a helper model for fine-tuning is feasible.

The possible outcomes will be:
* Low number of nodes: the output of the helper fine-tuning model can be the changes to weights and biases for each node
* High number of nodes: there are too many weights and biases to create a neural network with these as outputs - I'll have to fine tune base models directly, and probably use a greater number of clusters 
* Accuracy can't be captured with 90% accuracy: this indicates that there is poor correlation of the input with the output