In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import matplotlib as mpl 
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import chart_studio as py 
from plotly import tools
from chart_studio.plotly import plot, iplot
import plotly.express as px 

## Read csv file

In [2]:
def read_csv(filename):
    df_1 = pd.read_csv(filename)
    # Convert timstamp to datetime
    df_1['datetime'] = pd.to_datetime(df_1['timestamp'])
    df_1.drop('timestamp', axis=1, inplace=True)
    # Drop NaN values rows
    df_1.dropna(how='all', inplace=True)
    # Remove useless columns
    columns = ['Cadence', 'unknown_87', 'unknown_88', 'unknown_90', 
                   'Air Power', 'Form Power', 'Ground Time', 'Leg Spring Stiffness', 
                   'Power', 'Vertical Oscillation', 'enhanced_altitude', 'enhanced_speed', 
                   'fractional_cadence', 'altitude', 'position_lat', 'position_long', 'speed','datafile']

    d = {0:'Monday',1:'Tuesday',2:'Wednesday',3:'Thursday',4:'Friday',5:'Saturday',6:'Sunday'}
    df_1['day'] = df_1['datetime'].dt.weekday
    df_1['day'].replace(d, inplace=True)
    df_1.drop(columns, axis=1, inplace=True)
    return df_1

In [3]:
df = read_csv('assets/strava.csv')

In [4]:
df.head()

Unnamed: 0,cadence,distance,heart_rate,datetime,day
0,0.0,0.0,68.0,2019-07-08 21:04:03,Monday
1,0.0,0.0,68.0,2019-07-08 21:04:04,Monday
2,54.0,1.32,71.0,2019-07-08 21:04:07,Monday
3,77.0,12.19,77.0,2019-07-08 21:04:14,Monday
4,77.0,14.08,80.0,2019-07-08 21:04:15,Monday


In [5]:
df = df[~(df['distance'] == 0)]
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,cadence,distance,heart_rate,datetime,day
0,54.0,1.32,71.0,2019-07-08 21:04:07,Monday
1,77.0,12.19,77.0,2019-07-08 21:04:14,Monday
2,77.0,14.08,80.0,2019-07-08 21:04:15,Monday
3,77.0,14.08,83.0,2019-07-08 21:04:16,Monday
4,77.0,14.99,83.0,2019-07-08 21:04:17,Monday


In [6]:
df_resample = df.resample('D', on='datetime').mean()

# Explanation of Strava dataset

* Running **cadence** is a number of steps that you will do in one minute. More steps in a minute mean that you are running more effectively.
* Air power, Cadence, Form Power, Ground Time, Leg Spring Stiffness are variables of calculating the quantity of power.
* Altitude is a distance measurement, usually in the vertical or "up" direction, between a reference datum and a point or object.

**Units of the data**
* Cadence: rpm
* Ground time: milliseconds
* Vertical oscillation: centimeters
* Distance, Altitude, and Enhanced Altitude: meters
* Longitude and Latitude: semicircles (radians)
* Air and Form Power: watts
* Leg Spring Stiffness: kN/m
* Speed: m/s

## Data Cleaning

* Calculate average speed
* Resample data to get mean values to variables
* Remove NaN values

In [7]:
df_resample.dropna(inplace=True)

In [8]:
def avg_speed(df, datetime):
    mask = (df['datetime'].dt.year == datetime.year) & (df['datetime'].dt.month == datetime.month) & (df['datetime'].dt.day == datetime.day)
    df = df[mask]
    min_time = df.iloc[0]['datetime']
    max_time = df.iloc[-1]['datetime']
    duration = (max_time - min_time).total_seconds()
    dist_min = df.iloc[0]['distance']
    dist_max = df.iloc[-1]['distance']
    dist = dist_max - dist_min
    avg_speed = dist / duration
    return duration, dist, avg_speed, min_time, max_time

values = []
for i in range(df_resample.shape[0]):
    values.append(avg_speed(df, df_resample.index[i]))
df_resample['time'] = [x for x, _, _, _, _ in values]
df_resample['distance'] = [x for _, x, _, _, _ in values]
df_resample['avg_speed'] = [x for _, _, x, _, _ in values]
df_resample.head()

Unnamed: 0_level_0,cadence,distance,heart_rate,time,avg_speed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-07-08,75.016393,2616.14,118.75,1188.0,2.202138
2019-07-10,79.818182,834.2,130.511364,1156.0,0.721626
2019-07-12,74.135016,7471.84,117.687157,3786.0,1.973545
2019-07-14,75.553472,2612.05,113.642361,13463.0,0.194017
2019-07-16,76.550296,1698.83,119.136095,1105.0,1.537403


## Scatterplot

The Scatterplot will be used to explore the main variables from the dataset. It allows us to see where the relationship may be between the variables and visualize the trends.

In [9]:
fig = go.Figure(data=go.Splom(
    dimensions=[dict(label='Cadence', values=df_resample['cadence']),
                dict(label='Distance', values=df_resample['distance']),
                dict(label='Heart rate', values=df_resample['heart_rate']),
                dict(label='Avg speed', values=df_resample['avg_speed'])],
    showupperhalf=False
))

fig.update_layout(
    title='Sports analysis',
    width=800,
    height=800,
    plot_bgcolor='rgb(247, 246, 245)'
)

fig.show()

## Line Chart

* Use line charts to create the visualization among variables of *Cadence, Heart rate, Distance, and Average Speed.*
* Line charts can be visually interpreted by human and describe relationships with variables. 
* Training heart rate would increase your athletic performance and overall level of fitness. To find a steady heart rate -- a level at which you feel like you're working hard, but your heart rate doesn't jump up over the time you're trainning.

In [10]:
trace1 = go.Scatter(
    x = df_resample.heart_rate.index,
    y = df_resample.heart_rate,
    name="Heart rate"
)

trace2 = go.Scatter(
    x=df_resample.cadence.index,
    y=df_resample.cadence,
    name="Cadence"
)


# layout = go.Layout(dict(title='Distance by date', xaxis=dict(title='Date'),yaxis=dict(title='Distance')))
fig=go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    height=500, width=800,
    title_text = "Distance and average speed by date",
    plot_bgcolor='white'
)
fig.show()

**Brief Summary**

From this chart, we can see that the heart rate of Mr. Brooks' heart rate had been changing along with the change of cadences. However, there's really nothing more the chart can talk about.

In [11]:
trace1 = go.Scatter(
    x=df_resample.distance.index,
    y=df_resample.distance,
    name="Distance"
)

trace2 = go.Scatter(
    x=df_resample.heart_rate.index,
    y=df.heart_rate,
    name="Heart rate"
)

trace3 = go.Scatter(
    x=df_resample.avg_speed.index,
    y=df_resample.avg_speed,
    name="Average speed"
)

fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.add_trace(trace3, row=3, col=1)

fig.update_layout(height=600, width=800, title_text="Distance and Heart rate", plot_bgcolor='white')
fig.show()


**Brief Summary**

From this chart, we can tell that the heart rate remains stably while he's been doing work-out continuelly👏 and after running around the same distance for few months, he changed to do cycling, combined with running after September. This is the reason of the sharp increasing of distance.

## Bar Chart and Mixed bar-line Chart

* Bar Chart are used to compare things between differnt groups or to track changes over time.

In [12]:
a = df.groupby('day',as_index=False)['distance'].mean()
b = df.groupby('day', as_index=False)['cadence'].mean()
c = df.groupby('day', as_index=False)['heart_rate'].mean()

weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
a['day'] = pd.Categorical(a['day'], categories=weekdays, ordered=True)
a = a.sort_values('day')
b['day'] = pd.Categorical(b['day'], categories=weekdays, ordered=True)
b = b.sort_values('day')
c['day'] = pd.Categorical(c['day'], categories=weekdays, ordered=True)
c = c.sort_values('day')

In [13]:
fig = px.bar(a, x='day',y='distance', title='Average distance by weekday')

fig.update_layout(
    xaxis=dict(title='Weekday'),
    yaxis=dict(title='Average distance'),
    plot_bgcolor='white'
)
fig.show()

**Brief Summary**

From this chart, we can see that Mr. Brooks oftenly work out on Monday and Thursday.

In [14]:
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=c['day'],
        y=c['heart_rate'],
        name='Heart rate',
        marker_color='pink'
    )
)

fig.add_trace(
    go.Scatter(
        x=b['day'],
        y=b['cadence'],
        name='Cadence',
        marker_color='orange'
    )
)

fig.update_layout(
    xaxis=dict(title='Weekday'),
    title_text='Heart rate and Cadence by weekday',
    plot_bgcolor='white'
)
fig.show()

**Brief Summary**

This chart tells us that even though Mr. Brooks usually works out on Monday and Thursday, he's running in nearly the same steps in a minute.

In [15]:
d = df[['day','distance','heart_rate']]
d['day'] = pd.Categorical(d['day'], categories=weekdays, ordered=True)
d = d.sort_values('day')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Box Plot

* Box plot shows the five key elements of a dataset, the minimum, the maximum, the median, the upper quartile, and the lower quartile.

In [16]:
fig = px.box(d, x='day', y='heart_rate', color='day', notched=True, title="Distance of running/cycling by weekday", width=800, height=600)

fig.update_layout(
    xaxis=dict(title="Weekday", zeroline=False, showgrid=False),
    yaxis=dict(title='Heart rate', showgrid=False),
    plot_bgcolor='white'
)
fig.show()

**Brief Summary**

It would be better to know the type of work out, like running, jogging, and cycling, to analyst rather than organized by weekday.

## Summary

In a conclusion, Mr. Brooks started to work out since July 2019 until now. He usually does work-out on Monday and Thursday. From the scatterplot, I can see that he switched different types of work-out, but I cannot point out which sport he's been doing since I miss the variable of work-out type. Besides that, he's doing good on heart-rate trainning which stays at a stable level, around 130BPM. 