# Strava data ingestion

This notebook authenticates with the Strava API, fetches athletes data and activities and store them in JSON, CSV, PostgreSQL Database.

---

## Dependencies and configuration

Import everything needed and set up configuration (constants ...)

In [38]:
import os
import json
import math
import time
import datetime as dt
from pathlib import Path
from typing import List, Dict, Any
import threading
from functools import lru_cache
import requests
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import pprint

### Load environment variables from .env file
load_dotenv()
STRAVA_CLIENT_ID = os.getenv("STRAVA_CLIENT_ID")
STRAVA_CLIENT_SECRET = os.getenv("STRAVA_CLIENT_SECRET")
STRAVA_REFRESH_TOKEN = os.getenv("STRAVA_REFRESH_TOKEN")
STRAVA_ACCESS_TOKEN = os.getenv("STRAVA_ACCESS_TOKEN")
STRAVA_USER_AUTHORIZATION_CODE = os.getenv("STRAVA_USER_AUTHORIZATION_CODE")

print(f"Loaded STRAVA_CLIENT_ID: {STRAVA_CLIENT_ID}")
print(f"Loaded STRAVA_CLIENT_SECRET: {STRAVA_CLIENT_SECRET}")
print(f"Loaded STRAVA_REFRESH_TOKEN: {STRAVA_REFRESH_TOKEN}")
print(f"Loaded STRAVA_ACCESS_TOKEN: {STRAVA_ACCESS_TOKEN}")
print(f"Loaded STRAVA_USER_AUTHORIZATION_CODE: {STRAVA_USER_AUTHORIZATION_CODE}")

### Constants
BASE_URL = 'https://www.strava.com/api/v3'
TOKEN_URL = 'https://www.strava.com/oauth/token'

# Token store
TOKEN_FILE = Path("./tmp/strava_token.json")

# Strava data store
DATA_PATH = "./data/"
STREAM_PATH = f"{DATA_PATH}streams/"
ATHLETE_DATA = Path(f"{DATA_PATH}athlete_profile.json")
ATHLETE_ACTIVITIES = Path(f"{DATA_PATH}athlete_activities.json")

Loaded STRAVA_CLIENT_ID: 176459
Loaded STRAVA_CLIENT_SECRET: 1a9d963bec85c4de91bd28526331a13ef578a524
Loaded STRAVA_REFRESH_TOKEN: b873396b75c9504b9ebd23bac3216e476cb9d2a4
Loaded STRAVA_ACCESS_TOKEN: f4bc0ac3563ab45187161065726ccacb984d8bac
Loaded STRAVA_USER_AUTHORIZATION_CODE: bf2f91140a82b2f8ac64a06f98f5179d6682eede


In [35]:
_token_lock = threading.Lock()

def _save_token(data: Dict[str, Any]):
    TOKEN_FILE.write_text(json.dumps(data))

def _load_token():
    if TOKEN_FILE.exists():
        try:
            return json.loads(TOKEN_FILE.read_text())
        except json.JSONDecodeError:
            return None
    return None

def refresh_access_token(force=False) -> str:
    with _token_lock:
        cached = _load_token()
        now = time.time()
        if cached and not force and cached.get('expires_at', 0) - 30 > now:
            return cached['access_token']
        resp = requests.post(
            TOKEN_URL,
            data={
                'client_id': STRAVA_CLIENT_ID,
                'client_secret': STRAVA_CLIENT_SECRET,
                'grant_type': 'refresh_token',
                'refresh_token': STRAVA_REFRESH_TOKEN,
            }, timeout=30
        )
        if resp.status_code != 200:
            raise RuntimeError(f'Token refresh failed: {resp.status_code} {resp.text}')
        data = resp.json()
        token_record = {
            'access_token': data['access_token'],
            'expires_at': data['expires_at']
        }
        _save_token(token_record)
        return token_record['access_token']
    
def get_authorization_code() -> str:
    resp = requests.post(
        TOKEN_URL,
        data={
            'client_id': STRAVA_CLIENT_ID,
            'client_secret': STRAVA_CLIENT_SECRET,
            'code': STRAVA_USER_AUTHORIZATION_CODE,
            'grant_type': 'authorization_code',
        }, timeout=30
    )
    if resp.status_code != 200:
        raise RuntimeError(f'Authorization code exchange failed: {resp.status_code} {resp.text}')
    data = resp.json()
    token_record = {
        'access_token': data['access_token'],
        'expires_at': data['expires_at']
    }
    _save_token(token_record)
    return token_record['access_token']

# -----------------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------------

SESSION = requests.Session()

def api_get(path: str, params: Dict[str, Any] = None, retries: int = 3):
    url = f"{BASE_URL.rstrip('/')}/{path.lstrip('/')}"
    for attempt in range(retries):
        token = refresh_access_token()
        resp = SESSION.get(url, params=params, headers={'Authorization': f'Bearer {token}'}, timeout=60)

        print(f"GET {url} - Status: {resp.status_code}")

        if resp.status_code == 401:
            # Unauthorized, possibly token expired
            token = refresh_access_token(force=True)
            continue

        elif resp.status_code >= 500:
            # Server error, retry
            time.sleep(2 ** attempt)
            continue

        elif resp.status_code != 200:
            raise RuntimeError(f'API request failed: {resp.status_code} {resp.text}')

        return resp.json()
    raise RuntimeError(f'Exceeded retries for {url}')

In [6]:
get_authorization_code()

'382ed956796d623300ca93671cafb672a4b7e61c'

## Connect to the API and fetches data.

Data is then stored in CSV or JSON files.

### 1: Athlete data

fetc basic athelte data - profile.

In [41]:
athlete_data = pd.json_normalize(api_get('athlete'))  # triggers headers
print(f"Athlete: {athlete_data})")

athlete_data.to_json(ATHLETE_DATA)

GET https://www.strava.com/api/v3/athlete - Status: 200
Athlete:          id username  resource_state firstname   lastname bio      city  \
0  10097604     None               3  Baptiste  Dubillaud      Jurançon   

       state country sex  ...  friend_count  mutual_friend_count athlete_type  \
0  Aquitaine  France   M  ...            51                    0            1   

  date_preference  measurement_preference  \
0        %m/%d/%Y                  meters   

                                               clubs postable_clubs_count  \
0  [{'id': 671279, 'resource_state': 2, 'name': '...                    2   

   ftp                                              bikes  \
0  230  [{'id': 'b7438340', 'primary': False, 'name': ...   

                                               shoes  
0  [{'id': 'g22167902', 'primary': False, 'name':...  

[1 rows x 33 columns])


### 2. Activities data

Get all activities for this athlete by 200 batches.

The 'all activities' DF is concatanated with the one containing fetched activities through the last request.

In [42]:
ACTIVITIES_PER_PAGE = 200

activities_df = None
athlete_activities = pd.DataFrame()
page = 1

while True:
    print(f"Fetching activities page {len(athlete_activities) // ACTIVITIES_PER_PAGE + 1} - {ACTIVITIES_PER_PAGE} per page")
    activities_df = pd.json_normalize(api_get('athlete/activities', params={'page': page, 'per_page': ACTIVITIES_PER_PAGE}))
    page += 1
    
    athlete_activities = pd.concat([athlete_activities, activities_df])

    if len(activities_df) < ACTIVITIES_PER_PAGE:
        break

print(f"Fetched {len(athlete_activities)} activities")

athlete_activities.to_json(ATHLETE_ACTIVITIES, orient='records', lines=True)

Fetching activities page 1 - 200 per page
GET https://www.strava.com/api/v3/athlete/activities - Status: 200
Fetching activities page 2 - 200 per page
GET https://www.strava.com/api/v3/athlete/activities - Status: 200
Fetching activities page 3 - 200 per page
GET https://www.strava.com/api/v3/athlete/activities - Status: 200
Fetching activities page 4 - 200 per page
GET https://www.strava.com/api/v3/athlete/activities - Status: 200
Fetched 709 activities


### 3. Stream data

Get for each activities all available stream. This includes :

- Heart rate
- Pace / Speed
- Elevation
- Cadance (bike)
- Watts

In [45]:
activity_ids = athlete_activities['id'].tolist()


def fetch_activity_streams(activity_id: int) -> Dict[str, Any]:
    types = ['time', 'latlng', 'distance', 'altitude', 'velocity_smooth', 'heartrate', 'cadence', 'watts']

    path = f'activities/{activity_id}/streams'
    params = {'keys': ','.join(types), 'key_by_type': 'true'}
    return api_get(path, params=params)


for activity_id in activity_ids:
    # If activity is not manual, get the streams
    if athlete_activities.loc[athlete_activities['id'] == activity_id, 'manual'].values[0]:
        print(f"Skipping manual activity {activity_id}")
        continue

    # if stream file already exists, skip
    if Path(f"{STREAM_PATH}{activity_id}_streams.json").exists():
        print(f"Skipping existing streams for activity {activity_id}")
        continue

    streams = fetch_activity_streams(activity_id)
    
    streams = pd.json_normalize(streams)

    streams.to_json(f"{STREAM_PATH}{activity_id}_streams.json", orient='records', lines=True)

Skipping existing streams for activity 15863024854
Skipping existing streams for activity 15843027348
Skipping existing streams for activity 15820804606
Skipping existing streams for activity 15812832711
Skipping existing streams for activity 15799475039
Skipping existing streams for activity 15784031938
Skipping existing streams for activity 15733260383
Skipping existing streams for activity 15697711082
Skipping existing streams for activity 15695740011
Skipping existing streams for activity 15663331065
Skipping existing streams for activity 15645363840
Skipping existing streams for activity 15619585116
Skipping existing streams for activity 15615402964
Skipping existing streams for activity 15596597633
Skipping existing streams for activity 15584588481
Skipping existing streams for activity 15583796805
Skipping existing streams for activity 15575054877
Skipping existing streams for activity 15571944727
Skipping existing streams for activity 15560503652
Skipping existing streams for a

KeyboardInterrupt: 