# Strava data ingestion

This notebook authenticates with the Strava API, fetches the athlete's profile and activities, and stores them on disk.

---

## Dependencies and configuration

Import everything needed and set up the configuration: credentials loaded from the environment, API URLs, and local file paths.

In [6]:
import os
import json
import math
import time
import datetime as dt
from pathlib import Path
from typing import List, Dict, Any
import threading
from functools import lru_cache
import requests
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns

# --- Credentials: pulled from the process environment once .env is loaded ---
load_dotenv()
STRAVA_CLIENT_ID = os.environ.get("STRAVA_CLIENT_ID")
STRAVA_CLIENT_SECRET = os.environ.get("STRAVA_CLIENT_SECRET")
STRAVA_REFRESH_TOKEN = os.environ.get("STRAVA_REFRESH_TOKEN")
STRAVA_ACCESS_TOKEN = os.environ.get("STRAVA_ACCESS_TOKEN")
# NOTE(review): "xxx" looks like a placeholder — confirm the intended value.
STRAVA_ATHLETE_ID = "xxx"

# --- Strava endpoints ---
BASE_URL = "https://www.strava.com/api/v3"
TOKEN_URL = "https://www.strava.com/oauth/token"

# --- Local files ---
# OAuth token record cached between runs
TOKEN_FILE = Path("./tmp/strava_token.json")

# Athlete profile exported as JSON
ATHLETE_DATA = Path("./data/athlete_profile.json")

In [None]:
from functools import lru_cache

# Held by refresh_access_token() for its whole body, so the cached-token read,
# the refresh request and the token write happen under a single lock.
_token_lock = threading.Lock()

def _save_token(data: Dict[str, Any]) -> None:
    """Write the token record to TOKEN_FILE as JSON.

    Args:
        data: JSON-serialisable token record to persist.

    The parent directory is created when missing, so the first run on a fresh
    checkout does not fail with FileNotFoundError.
    """
    TOKEN_FILE.parent.mkdir(parents=True, exist_ok=True)
    TOKEN_FILE.write_text(json.dumps(data))

def _load_token():
    """Return the token record stored in TOKEN_FILE, or None when unavailable.

    None is returned both when the file does not exist and when its content
    is not valid JSON.
    """
    if not TOKEN_FILE.exists():
        return None
    raw = TOKEN_FILE.read_text()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return None

def refresh_access_token(force=False) -> str:
    """Return a usable Strava access token, refreshing it when necessary.

    The cached token from TOKEN_FILE is returned as long as it is still valid
    for more than 30 seconds. Otherwise a refresh request is sent to TOKEN_URL
    and the new token record is written back to TOKEN_FILE.

    Args:
        force: When True, skip the cached access token and always refresh.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If the token endpoint does not answer with HTTP 200.
    """
    with _token_lock:
        cached = _load_token()
        if not isinstance(cached, dict):
            # Missing, corrupt or unexpected cache content: start from scratch.
            cached = {}
        now = time.time()
        if not force and cached.get('access_token') and cached.get('expires_at', 0) - 30 > now:
            return cached['access_token']

        # Strava may hand back a different refresh token on each refresh and
        # expects the most recent one to be used, so prefer the persisted token.
        # The environment token stays as a fallback (e.g. after re-authorising
        # the app and updating .env).
        stored = cached.get('refresh_token')
        if stored and stored != STRAVA_REFRESH_TOKEN:
            candidates = [stored, STRAVA_REFRESH_TOKEN]
        else:
            candidates = [STRAVA_REFRESH_TOKEN]

        for refresh_token in candidates:
            resp = requests.post(
                TOKEN_URL,
                data={
                    'client_id': STRAVA_CLIENT_ID,
                    'client_secret': STRAVA_CLIENT_SECRET,
                    'grant_type': 'refresh_token',
                    'refresh_token': refresh_token
                }, timeout=30
            )
            if resp.status_code == 200:
                break
        else:
            # No candidate was accepted; report the last response received.
            raise RuntimeError(f'Token refresh failed: {resp.status_code} {resp.text}')

        data = resp.json()
        token_record = {
            'access_token': data['access_token'],
            'expires_at': data['expires_at'],
            # Keep the newest refresh token for the next refresh.
            'refresh_token': data.get('refresh_token', refresh_token)
        }
        _save_token(token_record)
        return token_record['access_token']

# Shared HTTP session; api_get() sends every request through it.
SESSION = requests.Session()

# Latest rate-limit figures reported by the API; every value stays None until
# api_get() receives a response carrying the X-RateLimit-Limit header.
RATE_LIMIT = dict.fromkeys(('short_limit', 'short_used', 'long_limit', 'long_used'))

def _record_rate_limit(headers) -> None:
    """Copy the rate-limit response headers into RATE_LIMIT.

    Missing or malformed headers are ignored so that bookkeeping can never
    make an otherwise valid API call fail.
    """
    limit = headers.get('X-RateLimit-Limit')
    if not limit:
        return
    usage = headers.get('X-RateLimit-Usage', '0,0')
    try:
        short_l, long_l = (int(v) for v in limit.split(','))
        short_u, long_u = (int(v) for v in usage.split(','))
    except ValueError:
        # Wrong number of fields or non-numeric values.
        return
    RATE_LIMIT.update({
        'short_limit': short_l, 'long_limit': long_l,
        'short_used': short_u, 'long_used': long_u
    })


def api_get(path: str, params: Dict[str, Any] = None, retries: int = 3):
    """Send an authenticated GET request to the Strava API and return its JSON.

    Args:
        path: Endpoint path relative to BASE_URL (leading slash optional).
        params: Optional query-string parameters.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON body of the first HTTP 200 response.

    Raises:
        RuntimeError: On a non-retryable HTTP status, or when every attempt
            ended with a retryable one (401, 429 or 5xx).
    """
    url = f"{BASE_URL.rstrip('/')}/{path.lstrip('/')}"
    force_refresh = False
    for attempt in range(retries):
        token = refresh_access_token(force=force_refresh)
        force_refresh = False
        resp = SESSION.get(url, params=params, headers={'Authorization': f'Bearer {token}'}, timeout=60)
        # Record rate-limit usage whatever the outcome of the request.
        _record_rate_limit(resp.headers)

        status = resp.status_code
        if status == 200:
            return resp.json()
        if status == 401:
            # The token was rejected: force a refresh on the next attempt.
            force_refresh = True
            continue
        if status == 429 or status >= 500:
            # Rate limiting or server-side failure: back off, then retry.
            if attempt < retries - 1:
                time.sleep(min(2 ** attempt, 30))
            continue
        raise RuntimeError(f'GET {url} failed {resp.status_code}: {resp.text[:200]}')
    raise RuntimeError(f'Exceeded retries for {url}')

## Connect to the API and fetch data

Data is then stored in CSV or JSON files.

### 1: Athlete data

In [7]:
# Fetch the authenticated athlete's profile and flatten it into a one-row frame.
# The request also refreshes RATE_LIMIT from the response headers.
athlete_data = pd.json_normalize(api_get('athlete'))
print(f"Athlete: {athlete_data}")

# Create the output directory when missing so a fresh checkout can run this cell.
ATHLETE_DATA.parent.mkdir(parents=True, exist_ok=True)
athlete_data.to_json(ATHLETE_DATA)

Athlete:          id username  resource_state firstname   lastname bio      city  \
0  10097604     None               2  Baptiste  Dubillaud      Jurançon   

       state country sex  premium  summit            created_at  \
0  Aquitaine  France   M     True    True  2015-07-05T18:13:31Z   

             updated_at  badge_type_id  weight  \
0  2025-06-09T14:43:39Z              1    83.0   

                                      profile_medium  \
0  https://dgalywyr863hv.cloudfront.net/pictures/...   

                                             profile friend follower  
0  https://dgalywyr863hv.cloudfront.net/pictures/...   None     None  )
