In [None]:
from pathlib import Path

# Sanity check: confirm the activity export folder resolves where we expect and
# see how many GPX files it holds before running the slower parsing cell.
REPO_ROOT    = Path("/Users/amlim/triathlon-performance")
ACTIVITY_DIR = REPO_ROOT / "data" / "activities"

print("ACTIVITY_DIR exists?  ", ACTIVITY_DIR.exists())
print("ACTIVITY_DIR path:    ", ACTIVITY_DIR.resolve())

# Match the extension case-insensitively in ONE pass. Globbing "*.gpx" and
# "*.GPX" separately misses mixed-case names such as "ride.Gpx" and lists every
# file twice on platforms whose glob matching ignores case (e.g. Windows).
# Sorting makes the listing deterministic across runs and filesystems.
all_gpx = sorted(
    p for p in ACTIVITY_DIR.rglob("*")
    if p.is_file() and p.suffix.lower() == ".gpx"
)
print("Found GPX files:", len(all_gpx))
print("First 10:", [p.name for p in all_gpx[:10]])

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import gpxpy
import math

# --- Paths: everything is derived from the repository root ---
REPO_ROOT    = Path("/Users/amlim/triathlon-performance")
ACTIVITY_DIR = REPO_ROOT.joinpath("data", "activities")                    # searched recursively for GPX files
OUT_CSV      = REPO_ROOT.joinpath("data", "cleaned_running_from_gpx.csv")  # per-activity summary is written here

# --- Optional filename filter ---
# With ONLY_RUN_FILES = True, only files whose lower-cased name contains one of
# RUN_KEYWORDS are parsed; with False, every GPX file found is parsed.
ONLY_RUN_FILES = False
RUN_KEYWORDS   = ("run", "running", "treadmill")

def time_of_day_from_hour(h):
    """Map an hour of the day to a coarse label.

    Hours 5-11 (inclusive) are "Morning", 12-17 (inclusive) are "Afternoon",
    and anything else (including 0-4 and 18-23) is "Evening".
    """
    # (first hour, last hour, label) -- both bounds are inclusive.
    buckets = (
        (5, 11, "Morning"),
        (12, 17, "Afternoon"),
    )
    for first, last, label in buckets:
        if first <= h <= last:
            return label
    return "Evening"

def summarize_gpx(gpx_path: Path) -> dict | None:
    """Summarize a GPX by using GPXPy's built-in track stats (robust & fast)."""
    try:
        with gpx_path.open("r", encoding="utf-8") as f:
            gpx = gpxpy.parse(f)
    except Exception as e:
        print(f"[parse error] {gpx_path.name}: {e}")
        return None

    if not gpx.tracks:
        # Some GPX files store routes/waypoints but no tracks — skip those
        # (Strava activity exports should have tracks)
        return None

    # Aggregate across all tracks in the file (most files have 1)
    total_distance_m = 0.0
    moving_time_s    = 0.0
    total_up_m       = 0.0
    start_time       = None
    end_time         = None

    # Try to collect basic HR stats from point extensions (coarse fallback)
    hr_vals = []

    for track in gpx.tracks:
        # Distance & moving time
        try:
            md = track.get_moving_data()
            if md:
                # moving_distance can be None; fall back to length_2d()
                if md.moving_distance is not None:
                    total_distance_m += float(md.moving_distance)
                moving_time_s += float(md.moving_time or 0.0)
        except Exception:
            pass

        if total_distance_m == 0.0:
            # fall back to geometric length (2D)
            try:
                total_distance_m += float(track.length_2d())
            except Exception:
                pass

        # Elevation gain
        try:
            up, down = track.get_uphill_downhill()  # returns (uphill, downhill) in meters
            if up is not None:
                total_up_m += float(up)
        except Exception:
            pass

        # Time bounds
        try:
            tb = track.get_time_bounds()  # .start_time, .end_time (datetime)
            if tb and tb.start_time:
                if start_time is None or tb.start_time < start_time:
                    start_time = tb.start_time
            if tb and tb.end_time:
                if end_time is None or tb.end_time > end_time:
                    end_time = tb.end_time
        except Exception:
            pass

        # Optional coarse HR scan (iterate track points; quick but not as heavy as full export)
        for seg in track.segments:
            for p in seg.points:
                if getattr(p, "extensions", None):
                    for ext in p.extensions:
                        # Look for hr in extension tags (Garmin gpxtpx namespace)
                        tag = getattr(ext, "tag", "") or ""
                        txt = getattr(ext, "text", None)
                        if txt and tag.lower().endswith("hr"):
                            try:
                                hr_vals.append(float(txt))
                            except Exception:
                                pass
                        it = getattr(ext, "iter", None)
                        if callable(it):
                            for child in it():
                                ctag = getattr(child, "tag", "") or ""
                                ctext = getattr(child, "text", None)
                                if ctext and ctag.lower().endswith("hr"):
                                    try:
                                        hr_vals.append(float(ctext))
                                    except Exception:
                                        pass

    # sanity check: need a duration and some distance
    if (start_time is None) or (end_time is None):
        # try fallback duration: if we have moving_time only
        if moving_time_s <= 0 or total_distance_m <= 0:
            return None

    duration_min = (end_time - start_time).total_seconds() / 60.0 if (start_time and end_time) else (moving_time_s / 60.0)
    if duration_min <= 0:
        return None

    distance_km = total_distance_m / 1000.0
    pace_min_per_km = np.nan if distance_km <= 0 else (duration_min / distance_km)

    # Date/time buckets
    if start_time is not None:
        date_str = start_time.date().isoformat()
        time_str = start_time.strftime("%H:%M:%S")
        tod      = time_of_day_from_hour(start_time.hour)
    else:
        # Very rare edge case: no start_time but nonzero moving_time
        date_str = ""
        time_str = ""
        tod      = np.nan

    avg_hr = float(np.nanmean(hr_vals)) if hr_vals else np.nan
    max_hr = float(np.nanmax(hr_vals))  if hr_vals else np.nan

    return {
        "file": gpx_path.name,
        "date": date_str,
        "start_time": time_str,
        "time_of_day": tod,
        "distance_km": round(distance_km, 3),
        "duration_min": round(duration_min, 2),
        "avg_pace_min_per_km": round(pace_min_per_km, 2) if np.isfinite(pace_min_per_km) else np.nan,
        "elev_gain_m": round(total_up_m, 1),
        "average_heartrate": round(avg_hr, 1) if np.isfinite(avg_hr) else np.nan,
        "max_heartrate": round(max_hr, 0) if np.isfinite(max_hr) else np.nan,
        "indoor_outdoor": "outdoor"  # GPX implies GPS
    }

# ---- find files ----
# Match the extension case-insensitively in ONE pass. Globbing "*.gpx" and
# "*.GPX" separately misses mixed-case names and parses every file twice on
# platforms whose glob matching ignores case (e.g. Windows). Sorting keeps the
# row order of the output CSV deterministic across runs and filesystems.
gpx_files = sorted(
    p for p in ACTIVITY_DIR.rglob("*")
    if p.is_file() and p.suffix.lower() == ".gpx"
)
if ONLY_RUN_FILES:
    before = len(gpx_files)
    gpx_files = [p for p in gpx_files if any(k in p.name.lower() for k in RUN_KEYWORDS)]
    print(f"Filtered to run-like filenames: {len(gpx_files)} / {before}")

print(f"Total GPX files to parse: {len(gpx_files)}")

# ---- parse all ----
# summarize_gpx() returns None both for files it cannot parse and for files
# without usable tracks/duration, so both kinds land in `empties`.
# `errors` is never filled here; it is kept only so the name stays defined.
summaries, errors, empties = [], [], []
for p in gpx_files:
    s = summarize_gpx(p)
    if s is None:
        empties.append(p.name)
    else:
        summaries.append(s)

print(f"Parsed summaries: {len(summaries)}")
print(f"Empty/invalid:   {len(empties)} (showing first 8): {empties[:8]}")

# One row per successfully summarized activity.
df_sum = pd.DataFrame(summaries)
if df_sum.empty:
    raise ValueError("No valid GPX activities were parsed. Check empties list above.")

# Label effectiveness: top 25% fastest. Lower pace (min/km) means faster, so
# the fastest quarter is everything at or below the 25th percentile of pace.
# Rows with a NaN pace compare False and are labelled 0.
# NOTE(review): the threshold is computed over ALL parsed activities, i.e.
# before any pace plausibility filtering done later in the notebook -- confirm
# that this is the intended population for the label.
q25 = df_sum["avg_pace_min_per_km"].quantile(0.25)
df_sum["high_effectiveness_run"] = (df_sum["avg_pace_min_per_km"] <= q25).astype(int)

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df_sum.to_csv(OUT_CSV, index=False)
print(f"✅ Saved {len(df_sum)} activities → {OUT_CSV}")

df_sum.head()

In [None]:
import pandas as pd

# Reload the per-activity summary from disk so the steps below work from the
# saved CSV rather than from in-memory state.
df = pd.read_csv("/Users/amlim/triathlon-performance/data/cleaned_running_from_gpx.csv")

# Quick structural overview: dimensions, column names, then summary statistics
# (one row per column, first 15 columns only).
print(df.shape)
print(list(df.columns))
df.describe(include="all").transpose().head(15)

In [None]:
# Derived speed features (added to `df` in place):
#   speed_km_per_min = distance / duration
#   pace_inv         = 1 / pace, so that higher means faster (optional)
df["speed_km_per_min"] = df["distance_km"].div(df["duration_min"])
df["pace_inv"] = df["avg_pace_min_per_km"].rdiv(1)

In [None]:
# Keep only rows whose average pace lies strictly between 3 and 12 min/km;
# rows with a missing pace fail both comparisons and are dropped as well.
df = df.loc[df["avg_pace_min_per_km"].gt(3) & df["avg_pace_min_per_km"].lt(12)]

In [None]:
# Persist the filtered, feature-augmented table (row index is not written).
df.to_csv(
    path_or_buf="/Users/amlim/triathlon-performance/data/cleaned_running_final.csv",
    index=False,
)