In [None]:
# === running_03_ml_combined.ipynb — v3 models from merged data ===
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, r2_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow.keras import Sequential, layers, callbacks

# -------------------------
# 0) Paths
# -------------------------
# Project root; every input and output location below is derived from it.
ROOT = Path("/Users/amlim/triathlon-performance")

# Inputs (two cleaned run summaries) and the merged file written by this notebook.
_DATA_DIR = ROOT / "data"
SRC_EXCL = _DATA_DIR / "cleaned_running.csv"             # Excel-based (earlier)
SRC_GPX = _DATA_DIR / "cleaned_running_from_gpx.csv"     # GPX-derived (new)
MERGED = _DATA_DIR / "cleaned_running_merged.csv"

# Outputs: the metrics log and the directory holding saved models/scalers.
_RESULTS_DIR = ROOT / "results"
LOG_CSV = _RESULTS_DIR / "model_performance_log.csv"
OUT_DIR = _RESULTS_DIR / "models"
# Create the model directory (and any missing parents) up front.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# -------------------------
# 1) Load & merge (union of columns)
# -------------------------
def safe_read(p):
    """Read a CSV file, treating a missing or unusable file as "no data".

    Parameters
    ----------
    p : str or path-like
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame when the file cannot be
        opened, is empty, is not decodable, or cannot be parsed as CSV.

    Notes
    -----
    Only file/data problems are absorbed, and each one is reported on
    stdout so a skipped source is visible in the notebook output. Any other
    exception (for example an invalid argument type) propagates, because it
    indicates a programming error rather than absent data.
    """
    try:
        return pd.read_csv(p)
    except (
        OSError,                     # missing file, permission problem, ...
        UnicodeDecodeError,          # file is not valid text in the expected encoding
        pd.errors.EmptyDataError,    # zero-byte / header-less file
        pd.errors.ParserError,       # malformed CSV
    ) as err:
        print(f"[safe_read] skipping {p}: {type(err).__name__}: {err}")
        return pd.DataFrame()

# Load both sources; a source that cannot be read comes back as an empty frame.
df1, df2 = safe_read(SRC_EXCL), safe_read(SRC_GPX)

if all(frame.empty for frame in (df1, df2)):
    raise ValueError("Both source files are empty/missing. Provide at least one.")

# Column names that may differ between the two sources -> canonical name.
rename_map = {
    "start_time": "time",           # GPX summary uses start_time
    "elev_gain": "elev_gain_m",     # defensive alias for the elevation column
}

# Columns the merged dataset is expected to expose, in output order.
expected = [
    "date","time","time_of_day","indoor_outdoor",
    "distance_km","duration_min","avg_pace_min_per_km",
    "elev_gain_m","average_heartrate","max_heartrate","calories"
]

# Bring each frame onto the shared schema: canonical names first, then any
# expected column the source lacks is added as all-NaN.
_aligned = []
for _frame in (df1, df2):
    _frame = _frame.rename(columns=rename_map)
    _absent = [col for col in expected if col not in _frame.columns]
    for col in _absent:
        _frame[col] = np.nan
    _aligned.append(_frame)
df1, df2 = _aligned

# Stack the two sources row-wise, keeping only the expected columns.
df = pd.concat([frame[expected] for frame in (df1, df2)], ignore_index=True)

# Coerce numerics: unparseable entries become NaN, which the range filters
# below treat as "not plausible" (comparisons with NaN are False).
num_cols = ["distance_km","duration_min","avg_pace_min_per_km","elev_gain_m",
            "average_heartrate","max_heartrate","calories"]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Basic sanity filters (tweak as needed): positive distance and duration,
# and an average pace strictly between 3 and 12 min/km.
plausible = (
    (df["distance_km"] > 0)
    & (df["duration_min"] > 0)
    & (df["avg_pace_min_per_km"] > 3)
    & (df["avg_pace_min_per_km"] < 12)
)
# Take an explicit copy: the column assignments below would otherwise write
# to a filtered slice of the original frame and raise SettingWithCopyWarning.
df = df.loc[plausible].copy()

if df.empty:
    # Fail here with a specific message instead of writing an empty merged
    # file and failing later during feature preparation.
    raise ValueError("No rows left after the sanity filters; check distance, duration and pace columns.")

# Normalise categorical spellings.
# NOTE: astype(str) renders missing values as the string "nan", so after this
# step they show up as their own "Nan" / "nan" category rather than as NaN.
df["time_of_day"]     = df["time_of_day"].astype(str).str.strip().str.title()
df["indoor_outdoor"]  = df["indoor_outdoor"].astype(str).str.strip().str.lower().replace(
    {"indoors":"indoor","outdoors":"outdoor"}
)

# Re-label effectiveness on the MERGED set: a run is "high effectiveness"
# when its pace is at or below the 25th percentile (i.e. the fastest quarter).
q25 = df["avg_pace_min_per_km"].quantile(0.25)
df["high_effectiveness_run"] = (df["avg_pace_min_per_km"] <= q25).astype(int)

# Save merged dataset
df.to_csv(MERGED, index=False)
print(f"✅ Merged dataset saved: {MERGED}  rows={len(df)}")

# -------------------------
# 2) Features & encoding (robust)
# -------------------------
# One-hot encode a categorical only if it has more than one category;
# a constant column would produce no dummy columns after drop_first.
cat_cols = [c for c in ("time_of_day", "indoor_outdoor") if df[c].nunique() > 1]

df_ml = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Core features that should exist and be fairly complete
core_feats = [c for c in ["distance_km","duration_min","elev_gain_m"] if c in df_ml.columns]

# Optional features — include only if they have enough non-nulls.
# Rows with a NaN in any selected feature are dropped further down, so a
# sparse optional feature would cost more rows than it is worth.
MIN_NONNULL_RATIO = 0.60
optional_pool = [c for c in ["average_heartrate","max_heartrate","calories"] if c in df_ml.columns]
nonnull_ratio = (
    df_ml[optional_pool].notna().mean().sort_values(ascending=False)
    if optional_pool
    else pd.Series(dtype=float)
)
# Reuse the ratios computed above instead of recomputing them per column.
optional_kept = [c for c in optional_pool if nonnull_ratio[c] >= MIN_NONNULL_RATIO]

# Available dummies (after drop_first some may not exist)
dummy_feats = [c for c in df_ml.columns if c.startswith(("time_of_day_", "indoor_outdoor_"))]

X_cols = core_feats + optional_kept + dummy_feats

print("---- Feature diagnostics ----")
print("Candidate optional features and non-null ratio:")
print(nonnull_ratio if not nonnull_ratio.empty else "  (none)")
print("Core kept:", core_feats)
print(f"Optional kept (≥{MIN_NONNULL_RATIO:.0%} non-null):", optional_kept)
print("Dummies kept:", dummy_feats)

if not X_cols:
    raise ValueError("No usable feature columns after checks. Inspect df_ml.columns and data completeness.")

# NOTE(review): distance_km and duration_min are both inputs while the targets
# are derived from avg_pace_min_per_km. If pace is duration / distance, the
# targets are a deterministic function of the inputs — confirm this is intended.
X     = df_ml[X_cols].apply(pd.to_numeric, errors="coerce")
y_reg = pd.to_numeric(df_ml["avg_pace_min_per_km"], errors="coerce")
y_clf = df_ml["high_effectiveness_run"].astype(int)

# Show null rates before masking
print("\nNull rates in selected X features:")
print(X.isna().mean().sort_values())

# Drop rows with any NaN in selected X or y
mask = X.notna().all(axis=1) & y_reg.notna() & y_clf.notna()
print("\nRows before mask:", len(X), " | after mask:", mask.sum())

X, y_reg, y_clf = X.loc[mask], y_reg.loc[mask], y_clf.loc[mask]

if len(X) == 0:
    raise ValueError("All rows dropped after NaN filtering. Consider lowering threshold, imputing, or removing sparse features.")

# -------------------------
# 3) Train/test split & scale
# -------------------------
# Both tasks use an 80/20 split with the same seed; only the classification
# split is stratified on its label.
_split_kwargs = dict(test_size=0.2, random_state=42)
X_tr, X_te, y_tr_reg, y_te_reg = train_test_split(X, y_reg, **_split_kwargs)
X_tr_c, X_te_c, y_tr_clf, y_te_clf = train_test_split(
    X, y_clf, stratify=y_clf, **_split_kwargs
)

# Each task gets its own scaler, fitted on that task's training rows only and
# then applied unchanged to the matching test rows.
scaler_reg = StandardScaler()
X_tr_s = scaler_reg.fit_transform(X_tr)
X_te_s = scaler_reg.transform(X_te)

scaler_clf = StandardScaler()
X_tr_c_s = scaler_clf.fit_transform(X_tr_c)
X_te_c_s = scaler_clf.transform(X_te_c)

# -------------------------
# 4A) Regression — v3
# -------------------------
# Small fully connected network that predicts pace (min/km) from the scaled
# regression features.
tf.keras.utils.set_random_seed(42)  # fix the seed before any model is built
reg_model = Sequential([
    layers.Input(shape=(X_tr_s.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1)                      # single linear output: predicted pace
])
reg_model.compile(optimizer="adam", loss="mse", metrics=["mae"])
# Stop once validation loss has not improved for 10 epochs and keep the best weights.
early = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
# 20% of the training rows are held out as the validation set that drives early stopping.
reg_model.fit(X_tr_s, y_tr_reg, validation_split=0.2, epochs=200, batch_size=16, verbose=0, callbacks=[early])

# Evaluate on the held-out test split.
yhat = reg_model.predict(X_te_s).ravel()
reg_mae = mean_absolute_error(y_te_reg, yhat)
reg_r2  = r2_score(y_te_reg, yhat)
print("\n=== Regression v3 (min/km) ===")
print("MAE:", round(reg_mae, 4))
print("R² :", round(reg_r2, 4))

# -------------------------
# 4B) Classification — v3
# -------------------------
# Balanced class weights computed from the training labels, keyed by class id.
classes = np.unique(y_tr_clf)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr_clf)
class_weight = {int(label): float(weight) for label, weight in zip(classes, cw)}

# Same hidden architecture as the regressor, with a sigmoid output that
# yields a probability for the positive ("high effectiveness") class.
_clf_layers = [
    layers.Input(shape=(X_tr_c_s.shape[1],)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
]
clf_model = Sequential(_clf_layers)
clf_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Early stopping on validation loss, restoring the best weights seen.
early_c = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
clf_model.fit(
    X_tr_c_s,
    y_tr_clf,
    validation_split=0.2,
    epochs=150,
    batch_size=16,
    verbose=0,
    callbacks=[early_c],
    class_weight=class_weight,
)

# Test-set probabilities, thresholded at 0.5 for hard predictions.
probs = clf_model.predict(X_te_c_s).ravel()
preds = (probs >= 0.5).astype(int)

print("\n=== Classification v3 (High Effectiveness) ===")
print(classification_report(y_te_clf, preds, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_te_clf, preds))

# roc_auc_score raises ValueError when the test labels contain a single class.
try:
    clf_auc = roc_auc_score(y_te_clf, probs)
except ValueError:
    clf_auc = np.nan
    print("ROC AUC: not defined (one class in y_true).")
else:
    print("ROC AUC:", round(clf_auc, 4))

# -------------------------
# 5) Save artifacts
# -------------------------
# Keras models first, each to its own .keras file under OUT_DIR.
reg_path = OUT_DIR / "run_pace_reg_v3.keras"
clf_path = OUT_DIR / "run_effectiveness_clf_v3.keras"
for _model, _target in ((reg_model, reg_path), (clf_model, clf_path)):
    _model.save(_target)

# The fitted scalers are persisted with joblib next to the models so the same
# scaling can be re-applied when the models are loaded.
import joblib
for _scaler, _filename in (
    (scaler_reg, "run_reg_scaler_v3.joblib"),
    (scaler_clf, "run_clf_scaler_v3.joblib"),
):
    joblib.dump(_scaler, OUT_DIR / _filename)

print(f"\nSaved models:\n  {reg_path}\n  {clf_path}")
print(f"Saved scalers in: {OUT_DIR}")

# -------------------------
# 6) Log metrics (append)
# -------------------------
# One row per run. An undefined AUC is stored as NaN (written to CSV as an
# empty field) so the column stays numeric instead of mixing floats and "".
log_row = pd.DataFrame([{
    "model_version": "v3",
    "n_rows": len(X),
    "features": ",".join(X_cols),
    "reg_mae": reg_mae,
    "reg_r2": reg_r2,
    "clf_auc": float(clf_auc),
}])

# Make sure the log's directory exists even if this step is run on its own.
LOG_CSV.parent.mkdir(parents=True, exist_ok=True)

# Load any previous log; a file that exists but is empty counts as no history.
old = None
if LOG_CSV.exists():
    try:
        old = pd.read_csv(LOG_CSV)
    except pd.errors.EmptyDataError:
        old = None

# Append via concat so rows logged with a different column set still align.
if old is not None:
    pd.concat([old, log_row], ignore_index=True).to_csv(LOG_CSV, index=False)
else:
    log_row.to_csv(LOG_CSV, index=False)

print(f"📈 Metrics logged to: {LOG_CSV}")

In [None]:
from sklearn.metrics import f1_score, precision_recall_curve

# Precision and recall of the classifier at every candidate threshold.
# NOTE(review): the threshold is selected on the test split (y_te_clf / probs),
# so scores computed with it on that same split will be optimistic.
pr, rc, th = precision_recall_curve(y_te_clf, probs)

# F1 per point on the curve; the small epsilon avoids 0/0 when both are zero.
f1 = (2 * pr * rc) / (pr + rc + 1e-9)

# The final curve point is excluded before taking the argmax, so the index
# always refers to an entry of `th`.
best_t = th[np.argmax(f1[:-1])]
preds_opt = (probs >= best_t).astype(int)
print("Best F1 threshold:", best_t)

In [None]:
# Derived columns added to the merged frame.
_distance_km = df["distance_km"]
# Speed in km per minute: a linear alternative to pace as a target.
df["speed_km_per_min"] = _distance_km.div(df["duration_min"])
# Reciprocal of pace: a monotonic transform of the original target.
df["pace_inv"] = df["avg_pace_min_per_km"].rdiv(1)
# Elevation gain per km; distance is floored at 1e-6 to avoid division by zero.
df["elev_gain_per_km"] = df["elev_gain_m"].div(_distance_km.clip(lower=1e-6))