<a href="https://colab.research.google.com/github/anw-g01/strava-data-analysis/blob/main/etl_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get Strava Client

- Manage the setup and [OAuth authentication](https://developers.strava.com/docs/authentication/) workflow for accessing the [Strava API](https://developers.strava.com/).

- Initialise an authenticated client capable of securely fetching athlete data.

In [None]:
!pip install stravalib --quiet
from stravalib import Client

from google.colab import userdata
import sys
import logging
import warnings
from itertools import cycle
import pandas as pd
import numpy as np

# Notebook-wide display/logging configuration (runs once, affects every later cell).
# Raise the `stravalib.util.limiter` logger to ERROR so lower-severity messages from it are hidden:
logging.getLogger("stravalib.util.limiter").setLevel(logging.ERROR)    # suppress stravalib warnings
# Ignore DeprecationWarning instances whose originating module matches "jupyter_client":
warnings.filterwarnings("ignore", category=DeprecationWarning, module="jupyter_client")   # suppress deprecation warnings
pd.set_option("display.max_columns", None)  # view ALL columns
# pd.reset_option("display.max_columns")      # to reset to default (if needed)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/125.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/306.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.8/306.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Purpose: clone the project repo, import its oauth helper, and build the
# module-level CLIENT that the extract/transform cells below rely on.

# GitHub access token and Strava credentials are read from Colab's `userdata` store:
token = userdata.get("GITHUB_STRAVA_REPO_TOKEN")
username = "anw-g01"
repo = "strava-data-analysis"

# clone repo to utilise get_athlete function from oauth.py module (see GitHub):
# NOTE(review): the token is interpolated into the clone URL; git normally records the
# remote URL in the clone's .git/config, so the token may persist on disk — confirm this is acceptable.
# NOTE(review): re-running this cell presumably makes the clone fail because the target
# directory already exists — verify, or remove the directory first.
!git clone https://{token}@github.com/{username}/{repo}

# assumes the clone lands in /content (the usual Colab working directory) — TODO confirm
sys.path.append(f"/content/{repo}")    # add directory containing oauth.py
from oauth import get_athlete

# refresh an access token, authenticate the athlete, and return an authorised client:
# (CLIENT is read as a global by extract() and transform())
CLIENT = get_athlete(
    client_id=userdata.get("CLIENT_ID"),
    client_secret=userdata.get("CLIENT_SECRET"),
    refresh_token=userdata.get("REFRESH_TOKEN1"),
    verbose=False
)

Cloning into 'strava-data-analysis'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 24 (delta 8), reused 16 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (24/24), 6.95 KiB | 3.47 MiB/s, done.
Resolving deltas: 100% (8/8), done.

Successfully authenticated athlete: Anwarat Gurung


# Extract

- Retrieve all activity data from the authenticated Strava client.

- Organise raw activity records into a clean, structured dataset suitable for processing.

In [None]:
def extract(detailled: bool = False) -> list[dict]:
    """Fetch every activity of the authenticated athlete as a list of flat records.

    Uses the globally authenticated ``CLIENT`` object (created in the
    authentication cell) for all API calls.

    Args:
        detailled: When True, one extra ``CLIENT.get_activity`` request is made
            per activity to add ``description``, ``calories`` and
            ``device_name`` to its record.

    Returns:
        One dict per activity, in the order yielded by
        ``CLIENT.get_activities``. An empty list is returned (and the summary
        statistics are skipped) when the athlete has no activities.
    """

    # extract all activities (public + private):
    activities = CLIENT.get_activities(limit=None)
    print("extracting and storing all activites...\n")

    records = []    # store all activities as a list of dicts
    for i, a in enumerate(activities, start=1):

        # "\r" rewrites the same console line so the counter updates in place:
        print(f"\rno. of activities extracted: {i}", end="")

        # these two are converted below, so guard against missing values first
        # (every other optional field is simply passed through as None):
        distance_m = getattr(a, "distance", None)
        elev_gain_m = getattr(a, "total_elevation_gain", None)

        rec = {
            # general activity metrics:
            "activity_id": a.id,
            "name": a.name,
            "type": a.type,
            "date": a.start_date.date(),                                # UTC date
            "start_time": a.start_date.time(),                          # UTC time only
            "start_date_local": getattr(a, "start_date_local", None),
            "timezone": getattr(a, "timezone", None),
            "distance_km": distance_m / 1000 if distance_m is not None else None,   # m -> km
            "moving_time_s": a.moving_time,                             # s
            "elapsed_time_s": a.elapsed_time,                           # s
            "avg_speed_mps": getattr(a, "average_speed", None),         # m/s
            "max_speed_mps": getattr(a, "max_speed", None),             # m/s
            "total_elev_gain": float(elev_gain_m) if elev_gain_m is not None else None,  # m
            "highest_elev": getattr(a, "elev_high", None),              # m
            "lowest_elev": getattr(a, "elev_low", None),                # m
            "visibility": getattr(a, "visibility", None),
            "num_comments": getattr(a, "comment_count", None),
            "num_achievements": getattr(a, "achievement_count", None),
            "num_kudos": getattr(a, "kudos_count", None),
            "is_manual": getattr(a, "manual", None),    # auto-recorded vs manually entered activities.
            # running specific metrics:
            "avg_hr": getattr(a, "average_heartrate", None),            # bpm
            "max_hr": getattr(a, "max_heartrate", None),                # bpm
            "avg_cadence_spm": getattr(a, "average_cadence", None),     # as reported by the API; runs are doubled later in transform()
            "gear_id": getattr(a, "gear_id", None),     # use shoe mapping from client.get_athlete().shoes for shoe names
            # note: average_pace not exposed directly by API - compute later as moving_time/distance
        }

        if detailled:
            # extra detailled metrics (costs one additional API request per activity):
            d = CLIENT.get_activity(a.id)
            rec["description"] = getattr(d, "description", None)  # activity description
            rec["calories"] = getattr(d, "calories", None)
            rec["device_name"] = getattr(d, "device_name", None)

        records.append(rec)

    if not records:
        # nothing to summarise, and the percentage below would divide by zero
        print("no activities found")
        return records

    # get statistics on no. of total public activities:
    # NOTE(review): only run, ride and swim totals are summed, so other activity
    # types (e.g. walks) are not counted — confirm this is the intended figure.
    athlete = CLIENT.get_athlete()
    stats = CLIENT.get_athlete_stats(athlete.id)
    runs, rides, swims = stats.all_run_totals, stats.all_ride_totals, stats.all_swim_totals
    public_activities = runs.count + rides.count + swims.count

    print(f"\n\n{public_activities}/{len(records)} ({public_activities / len(records) * 100:.1f}% are public)")

    return records

In [None]:
# summary-level extraction: the default detailled=False skips the extra per-activity request
records = extract()

extracting and storing all activites...

no. of activities extracted: 410

394/410 (96.1% are public)


View the extracted data as a `DataFrame` object:

In [None]:
# one row per extracted activity, one column per key of the record dicts:
df = pd.DataFrame(records)

df.sample(n=5)  # preview five randomly chosen rows

Unnamed: 0,activity_id,name,type,date,start_time,start_date_local,timezone,distance_km,moving_time_s,elapsed_time_s,avg_speed_mps,max_speed_mps,total_elev_gain,highest_elev,lowest_elev,visibility,num_comments,num_achievements,num_kudos,is_manual,avg_hr,max_hr,avg_cadence_spm,gear_id
307,3443798855,Morning Run,root='Run',2020-05-13,09:40:16,2020-05-13 10:40:16+00:00,(GMT+00:00) Europe/London,6.7439,2389,2984,2.823,5.1,19.6,23.2,12.7,followers_only,0,3,3,False,,,,
300,3492236741,Morning Run,root='Run',2020-05-22,08:34:38,2020-05-22 09:34:38+00:00,(GMT+00:00) Europe/London,3.0497,994,1020,3.068,4.3,0.0,13.7,12.6,followers_only,0,0,4,False,,,,
14,9068722015,AJ Bell Great Bristol Run 10K,root='Run',2023-05-14,08:37:34,2023-05-14 09:37:34+00:00,(GMT+00:00) Europe/London,9.8357,3464,3468,2.839,4.402,32.0,19.9,8.9,everyone,1,5,5,False,186.2,203.0,84.0,g13287393
222,4281481843,Docks Run (with Geunyeong and Ethan W.),root='Run',2020-11-03,17:02:45,2020-11-03 17:02:45+00:00,(GMT+00:00) Europe/London,8.0096,2951,3042,2.714,10.7,11.0,-76.6,-103.8,followers_only,0,0,4,False,178.4,195.0,80.5,g6900542
43,8443112275,Morning Swim,root='Swim',2023-01-23,09:48:50,2023-01-23 09:48:50+00:00,(GMT+00:00) GMT,0.8,898,1396,0.891,1.282,0.0,,,followers_only,0,0,2,False,,,21.1,


# Transform

- Perform unit conversions, derive additional metrics, and apply formatting for personalisation.

In [None]:
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, unit-converted copy of the raw activities DataFrame.

    The input frame is left untouched; all work happens on a copy. The globally
    authenticated ``CLIENT`` is queried once to resolve gear IDs to shoe names.

    Steps performed:
        * speed in km/h and mph, distance in miles (rounded to 2 d.p.);
        * ``moving_time`` / ``elapsed_time`` as timedeltas, ``date`` and
          ``start_date_local`` as datetimes, plus ``end_time_local``;
        * ``start_time`` is overwritten with the local start time and
          ``end_time`` is added;
        * ``visibility`` relabelled, ``shoe_used`` mapped from ``gear_id``,
          ``type`` reduced to the bare activity name;
        * ``avg_cadence_spm`` doubled for runs;
        * pace columns (timedeltas) derived from the rounded speed columns,
          with non-positive speeds replaced by NaN first.

    Args:
        df: DataFrame built from the records returned by ``extract()``.

    Returns:
        A new DataFrame with the added/cleaned columns and numeric columns
        rounded to 2 decimal places.
    """
    # work on a copy so the caller's DataFrame is never modified in place
    df = df.copy()

    # ------------ UNIT CONVERSIONS ------------ #

    # speed (m/s to km/h and mph)
    df["avg_speed_km_h"] = (df["avg_speed_mps"] * 3.6).round(2)     # m/s -> km/h
    df["max_speed_km_h"] = (df["max_speed_mps"] * 3.6).round(2)
    df["avg_speed_mph"] = (df["avg_speed_mps"] * 2.23694).round(2)  # m/s -> mph
    df["max_speed_mph"] = (df["max_speed_mps"] * 2.23694).round(2)

    # distance (km to miles)
    df["distance_miles"] = (df["distance_km"] * 0.621371).round(2)  # km -> miles

    # time (timedelta objects)
    df["moving_time"] = pd.to_timedelta(df["moving_time_s"], unit="s")      # s -> timedelta
    df["elapsed_time"] = pd.to_timedelta(df["elapsed_time_s"], unit="s")    # s -> timedelta

    # date (datetime objects)
    df["date"] = pd.to_datetime(df["date"])                                 # convert dates to datetime for ordering
    df["start_date_local"] = pd.to_datetime(df["start_date_local"])         # convert to datetime first
    df["end_time_local"] = df["start_date_local"] + df["elapsed_time"]      # compute end datetime

    # optional: extract just the time components (LOCAL);
    # this replaces the UTC "start_time" produced by extract():
    df["start_time"] = df["start_date_local"].dt.time
    df["end_time"] = df["end_time_local"].dt.time

    # ------------ FORMATTING + CLEANING ------------ #

    # values not listed here become NaN:
    df["visibility"] = df["visibility"].map({
        "everyone": "Everyone",
        "followers_only": "Followers Only",
        "only_me": "Only Me",
    })

    # map the gear IDs to the shoe name (an athlete without shoes yields an empty mapping):
    athlete = CLIENT.get_athlete()
    shoe_mapping = {gear.id: gear.name for gear in (athlete.shoes or [])}
    df["shoe_used"] = df["gear_id"].map(shoe_mapping)

    # clean the activity "type" column, RelaxedActivityType:
    # its string form looks like "root='Run'"; keep only the quoted name
    df["type"] = df["type"].astype(str).str.extract(r"root='([^']+)'", expand=False)  # any character except ', match 1+

    # average running cadence (only runs are doubled as it's per foot initially);
    # must run after "type" has been cleaned above:
    df.loc[df["type"] == "Run", "avg_cadence_spm"] *= 2

    # ------ DERIVED METRICS ------ #

    # pace (as time deltas):
    for speed_col, pace_col in zip(
        ["avg_speed_km_h", "max_speed_km_h", "avg_speed_mph", "max_speed_mph"],
        ["avg_pace_km", "max_pace_km", "avg_pace_mile", "max_pace_mile"]
    ):
        # mask zeros to avoid zero-division errors:
        df.loc[df[speed_col] <= 0, speed_col] = np.nan

        # create new pace column (converts to min/km and min/mile):
        # 1 / (distance per hour) = hours per unit distance; * 60 -> minutes
        df[pace_col] = pd.to_timedelta(1 / df[speed_col] * 60, unit="min", errors="coerce")

    return df.round(2)  # all numerics to 2 d.p.

In [None]:
df = transform(df)    # overwrite existing DataFrame

df.sample(n=5)    # preview five randomly chosen transformed rows

Unnamed: 0,activity_id,name,type,date,start_time,start_date_local,timezone,distance_km,moving_time_s,elapsed_time_s,avg_speed_mps,max_speed_mps,total_elev_gain,highest_elev,lowest_elev,visibility,num_comments,num_achievements,num_kudos,is_manual,avg_hr,max_hr,avg_cadence_spm,gear_id,avg_speed_km_h,max_speed_km_h,avg_speed_mph,max_speed_mph,distance_miles,moving_time,elapsed_time,end_time_local,end_time,shoe_used,avg_pace_km,max_pace_km,avg_pace_mile,max_pace_mile
47,8148025638,Morning Swim,Swim,2022-11-21,09:10:50,2022-11-21 09:10:50+00:00,(GMT+00:00) GMT,1.7,2029,4177,0.84,1.17,0.0,,,Followers Only,0,0,2,False,,,22.4,,3.02,4.22,1.87,2.62,1.06,0 days 00:33:49,0 days 01:09:37,2022-11-21 10:20:27+00:00,10:20:27,,0 days 00:19:52.052980134,0 days 00:14:13.080568722,0 days 00:32:05.133689838,0 days 00:22:54.045801528
33,8712966389,Morning Swim,Swim,2023-03-13,09:37:54,2023-03-13 09:37:54+00:00,(GMT+00:00) Africa/Abidjan,1.2,1356,1933,0.88,1.16,0.0,,,Only Me,0,0,0,False,,,21.4,,3.19,4.19,1.98,2.6,0.75,0 days 00:22:36,0 days 00:32:13,2023-03-13 10:10:07+00:00,10:10:07,,0 days 00:18:48.526645770,0 days 00:14:19.188544152,0 days 00:30:18.181818180,0 days 00:23:04.615384614
242,3966045529,Storm Francis - Tailwind,Ride,2020-08-25,21:04:16,2020-08-25 21:04:16+00:00,(GMT+00:00) Europe/London,1.19,139,139,8.54,11.5,0.0,39.8,11.8,Followers Only,0,0,2,False,105.5,120.0,,,30.75,41.4,19.11,25.72,0.74,0 days 00:02:19,0 days 00:02:19,2020-08-25 21:06:35+00:00,21:06:35,,0 days 00:01:57.073170732,0 days 00:01:26.956521738,0 days 00:03:08.383045524,0 days 00:02:19.968895800
228,4133318718,1st run in Bristol,Run,2020-09-30,17:02:36,2020-09-30 17:02:36+00:00,(GMT+00:00) Europe/London,2.17,920,957,2.36,3.7,57.0,7.0,-63.8,Followers Only,0,0,4,False,170.7,186.0,154.0,g6900542,8.49,13.32,5.27,8.28,1.35,0 days 00:15:20,0 days 00:15:57,2020-09-30 17:18:33+00:00,17:18:33,ASICS Gel Excite 7,0 days 00:07:04.028268551,0 days 00:04:30.270270269,0 days 00:11:23.111954460,0 days 00:07:14.782608696
408,2051477394,Evening Run,Run,2019-01-02,20:55:05,2019-01-02 20:55:05+00:00,(GMT+00:00) Europe/London,3.48,1137,1155,3.06,5.2,9.6,16.6,12.3,Followers Only,0,0,0,False,,,,,11.02,18.72,6.85,11.63,2.16,0 days 00:18:57,0 days 00:19:15,2019-01-02 21:14:20+00:00,21:14:20,,0 days 00:05:26.678765880,0 days 00:03:12.307692306,0 days 00:08:45.547445256,0 days 00:05:09.544282032


# Load

Export the processed DataFrame to a suitable file format such as CSV, pickle, or Parquet.

- CSV files store every value as plain text, so pandas-specific data types like `datetime64[ns]` or `timedelta64[ns]` are not preserved and must be re-parsed on load.

- Parquet is a columnar storage format that is very efficient for large datasets and preserves data types.



In [None]:
# sort by most recent activities first:
# sort_values returns a new DataFrame, so the result must be assigned back;
# a stable sort keeps activities from the same day in their existing relative order.
df = df.sort_values(by="date", ascending=False, kind="stable").reset_index(drop=True)

# export as a CSV file:
df.to_csv("all_activities.csv", index=False)
print("DataFrame saved to 'all_activities.csv'")

DataFrame saved to 'all_activities.csv'


In [None]:
# export as a Parquet file (index=False leaves the row index out of the file):
df.to_parquet('all_activities.parquet', index=False)
print("DataFrame saved to 'all_activities.parquet'")

DataFrame saved to 'all_activities.parquet'


Preview exported dataset:

In [None]:
# reload the Parquet export (replacing the in-memory df) and list each column's dtype and non-null count:
df = pd.read_parquet('all_activities.parquet')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 38 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   activity_id       410 non-null    int64              
 1   name              410 non-null    object             
 2   type              410 non-null    object             
 3   date              410 non-null    datetime64[ns]     
 4   start_time        410 non-null    object             
 5   start_date_local  410 non-null    datetime64[ns, UTC]
 6   timezone          410 non-null    object             
 7   distance_km       410 non-null    float64            
 8   moving_time_s     410 non-null    int64              
 9   elapsed_time_s    410 non-null    int64              
 10  avg_speed_mps     410 non-null    float64            
 11  max_speed_mps     410 non-null    float64            
 12  total_elev_gain   410 non-null    float64            
 13  highe

In [None]:
# Subset (and ordering) of columns shown in the preview below; the DataFrame itself is unchanged.
column_order = [
    # when
    "date",
    "start_time",
    "end_time",
    # what
    "name",
    "type",
    # how far / how long
    "distance_km",
    "distance_miles",
    "moving_time",
    "elapsed_time",
    # averages
    "avg_pace_km",
    "avg_pace_mile",
    "avg_speed_mph",
    "avg_cadence_spm",
    "avg_hr",
    "max_hr",
    # peaks
    "max_pace_km",
    "max_speed_mph",
    # elevation
    "total_elev_gain",
    "lowest_elev",
    "highest_elev",
    # gear + identifier
    "shoe_used",
    "activity_id",
    # newly merged columns (not available from extract() function):
    # "desc",
    # "relative_effort",
    # "wind_speed",
    # "max_grade",
    # "calories",
    # "avg_temp",
    # "humidity",
]

# share of all activities that are runs:
is_run = df["type"] == "Run"
num_runs = int(is_run.sum())
pct_runs = 100 * num_runs / len(df)

print(f"no. of run activities: {num_runs}/{df.shape[0]} ({pct_runs:.1f}%)\n")

# five random rows, restricted to the columns listed above:
df.loc[:, column_order].sample(n=5)

no. of run activities: 352/410 (85.9%)



Unnamed: 0,date,start_time,end_time,name,type,distance_km,distance_miles,moving_time,elapsed_time,avg_pace_km,avg_pace_mile,avg_speed_mph,avg_cadence_spm,avg_hr,max_hr,max_pace_km,max_speed_mph,total_elev_gain,lowest_elev,highest_elev,shoe_used,activity_id
339,2020-02-26,21:21:27,21:41:20,Night Run,Run,3.48,2.16,0 days 00:19:19,0 days 00:19:53,0 days 00:05:33.333333336,0 days 00:08:56.512667658,6.71,,,,0 days 00:03:28.333333332,10.74,9.1,12.4,17.3,,3134826169
268,2020-07-12,14:28:26,16:10:20,Millets Farm,Walk,2.96,1.84,0 days 00:54:09,0 days 01:41:54,0 days 00:18:17.560975608,0 days 00:29:24.705882354,2.04,,,,0 days 00:04:03.902439024,9.17,19.1,59.0,68.9,,3753808561
288,2020-06-13,21:12:52,21:53:09,Night Run (with mum),Run,5.49,3.41,0 days 00:39:19,0 days 00:40:17,0 days 00:07:10.107526884,0 days 00:11:32.307692310,5.2,,,,0 days 00:03:01.818181818,12.3,6.3,12.6,16.6,,3610076907
67,2022-06-10,22:44:36,00:06:00,Night Run,Run,10.01,6.22,0 days 01:08:50,0 days 01:21:24,0 days 00:06:52.371134022,0 days 00:11:04.206642066,5.42,166.0,166.3,182.0,0 days 00:05:14.136125652,7.12,18.9,8.4,17.7,Brooks Ghost,7287776130
131,2021-08-08,22:25:32,23:11:32,Night Run,Run,7.01,4.35,0 days 00:42:07,0 days 00:46:00,0 days 00:06:00.721442886,0 days 00:09:40.645161288,6.2,163.4,163.1,177.0,0 days 00:04:45.714285714,7.83,20.0,13.0,23.2,Brooks Ghost,5761223529
