In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the cleaned running dataset. The location below is absolute; swap in a
# relative path when the notebook is executed from notebooks/.
df = pd.read_csv("/Users/amlim/triathlon-performance/data/cleaned_running.csv")

# Fail fast if any column the plots in this cell rely on is absent.
expected = ["avg_pace_min_per_km", "time_of_day", "indoor_outdoor"]
missing = list(filter(lambda name: name not in df.columns, expected))
if len(missing) > 0:
    raise ValueError(f"Missing columns in cleaned_running.csv: {missing}")

# --- Normalize indoor_outdoor values ---
# Trim whitespace, lowercase, and collapse the plural spellings so labels
# compare cleanly against the fixed hue order used by the plots.
# Casting to str may turn missing entries into the literal text "nan"
# (depending on the pandas version), which would then be counted as a real
# category. The final .where() masks every originally-missing entry back to
# NaN so value_counts(dropna=False) and .dropna() behave as intended.
df["indoor_outdoor"] = (
    df["indoor_outdoor"]
      .astype(str)
      .str.strip()
      .str.lower()
      .replace({"indoors":"indoor", "outdoors":"outdoor"})
      .where(df["indoor_outdoor"].notna())  # RHS still sees the un-normalized column
)

print("indoor_outdoor value counts:\n", df["indoor_outdoor"].value_counts(dropna=False), "\n")

# One fixed level order and colour per level, shared by the hue-based plots
# so the colours stay stable from figure to figure.
hue_order = ["indoor", "outdoor"]  # will silently ignore missing classes
custom_palette = dict(zip(hue_order, ["#1f77b4", "#ff7f0e"]))

# 1) Histogram of average pace (min/km, so smaller values mean faster runs)
plt.figure(figsize=(7, 5))
pace_ax = sns.histplot(df["avg_pace_min_per_km"], bins=30, color="#4c78a8")
pace_ax.set_xlabel("Average Pace (min/km)")
pace_ax.set_title("Running Pace Distribution")
plt.tight_layout()
plt.show()

# 2) Box plot: pace per time-of-day bucket, with indoor and outdoor runs
#    drawn as separate hue levels.
plt.figure(figsize=(8, 6))
box_ax = sns.boxplot(
    x="time_of_day",
    y="avg_pace_min_per_km",
    hue="indoor_outdoor",
    data=df,
    palette=custom_palette,
    hue_order=hue_order,
)
box_ax.set_title("Pace (min/km) by Time of Day & Indoor/Outdoor")
box_ax.set_ylabel("Pace (min/km)  (lower = better)")
box_ax.set_xlabel("Time of Day")
plt.tight_layout()
plt.show()

# 3) Violin plot (distribution shape)
plt.figure(figsize=(8,6))
# Use split=True only if BOTH levels in hue_order occur in the data; otherwise
# fall back to side-by-side violins.
# This is a containment test rather than set equality: any additional label in
# the column (an unexpected category, or a stringified missing value) is not
# one of the hue_order levels and must not switch splitting off when indoor
# and outdoor are both present.
has_both = set(hue_order).issubset(df["indoor_outdoor"].dropna().unique())
sns.violinplot(
    data=df,
    x="time_of_day",
    y="avg_pace_min_per_km",
    hue="indoor_outdoor",
    hue_order=hue_order,
    palette=custom_palette,
    split=has_both,   # split only if both indoor & outdoor present
    inner="quartile"
)
plt.title("Pace (min/km) Distribution by Time of Day")
plt.ylabel("Pace (min/km)  (lower = better)")
plt.xlabel("Time of Day")
plt.tight_layout()
plt.show()

In [None]:
# ==========================================
# Running — Feature-Engineered TF Models
# ==========================================
import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, r2_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib

import tensorflow as tf
from tensorflow.keras import Sequential, layers, callbacks

# -------------------------
# 0) Load cleaned dataset
# -------------------------
# Read the cleaned CSV, then stop immediately if a column needed by the
# feature engineering or by either target is not there.
csv_path = "/Users/amlim/triathlon-performance/data/cleaned_running.csv"
df = pd.read_csv(csv_path)

required = [
    "duration_min",
    "distance_km",
    "avg_pace_min_per_km",
    "time_of_day",
    "indoor_outdoor",
    "high_effectiveness_run",
]
missing = [name for name in required if not (name in df.columns)]
if len(missing) != 0:
    raise ValueError(f"Missing columns in cleaned_running.csv: {missing}")

# Canonicalise the two categorical columns before encoding:
#   time_of_day    -> trimmed, Title Case
#   indoor_outdoor -> trimmed, lower case, plural spellings collapsed
# NOTE(review): both columns are cast to str first, so depending on the pandas
# version a missing value may become ordinary text and end up as its own
# category — confirm that is the intended treatment of missing labels.
df["time_of_day"] = (
    df["time_of_day"]
    .astype(str)
    .str.strip()
    .str.title()
)
df["indoor_outdoor"] = (
    df["indoor_outdoor"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"indoors":"indoor","outdoors":"outdoor"})
)

# -------------------------------------
# 1) Feature engineering (improves regression)
# -------------------------------------
# Pace recomputed from raw duration and distance. Non-finite results (for
# example from a zero distance) are blanked to NaN.
# NOTE(review): if avg_pace_min_per_km in the CSV was itself derived as
# duration / distance, this column reproduces the regression target — confirm
# that is intended before trusting the reported metrics.
with np.errstate(divide="ignore", invalid="ignore"):
    df["pace_baseline_min_per_km"] = df["duration_min"].div(df["distance_km"])
df["pace_baseline_min_per_km"] = df["pace_baseline_min_per_km"].where(
    np.isfinite(df["pace_baseline_min_per_km"])
)

# Elevation gain per kilometre. When the source column is absent the feature
# is still created, filled entirely with NaN.
if "elev_gain_m" not in df.columns:
    df["elev_per_km"] = np.nan
else:
    with np.errstate(divide="ignore", invalid="ignore"):
        df["elev_per_km"] = df["elev_gain_m"].div(df["distance_km"])
    df["elev_per_km"] = df["elev_per_km"].where(np.isfinite(df["elev_per_km"]))

# log(1 + distance) compresses the long tail of distances.
df["log_distance"] = df["distance_km"].pipe(np.log1p)

# Bucket distance into race-style ranges; intervals are closed on the right
# (pd.cut's default), so exactly 5 km lands in "<=5K".
labels = ["<=5K","5K-10K","10K-HM","HM-Marathon",">Marathon"]
bins = [-np.inf, 5, 10, 21.1, 42.2, np.inf]
df["distance_bin"] = pd.cut(df["distance_km"], bins=bins, labels=labels, right=True)

# -------------------------------------
# 2) One-hot encode categoricals (safe)
# -------------------------------------
# Encode a categorical only when it has at least two distinct non-missing
# values (nunique() ignores NaN, so an all-missing column is skipped too).
# With drop_first=True a single-valued column would yield no dummy columns.
cat_cols = [
    name
    for name in ("time_of_day", "indoor_outdoor", "distance_bin")
    if df[name].nunique() > 1
]

df_ml = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# -------------------------------------
# 3) Build feature matrix robustly
# -------------------------------------
# Numeric base features (raw + engineered); keep only the ones present in df_ml.
base_feats = [
    "duration_min", "distance_km",
    "pace_baseline_min_per_km", "elev_per_km", "log_distance"
]
base_feats = [c for c in base_feats if c in df_ml.columns]

# Collect all dummy columns that were created by the one-hot encoding step.
dummy_feats = [
    c for c in df_ml.columns
    if c.startswith(("time_of_day_", "indoor_outdoor_", "distance_bin_"))
]

X_cols = base_feats + dummy_feats
if not X_cols:
    raise ValueError("No feature columns found. Check your cleaned CSV and one-hot encoding.")

# Feature matrix: coerce everything to numeric (unparseable cells become NaN).
X = df_ml[X_cols].apply(pd.to_numeric, errors="coerce")

# Drop features that carry no usable value at all (for example elev_per_km when
# the CSV has no elevation data). Left in, such a column would make the
# complete-row filter below discard every single row.
X = X.loc[:, X.notna().any(axis=0)]
X_cols = list(X.columns)
if not X_cols:
    raise ValueError("No feature columns found. Check your cleaned CSV and one-hot encoding.")

# Targets
y_reg = df_ml["avg_pace_min_per_km"]     # Regression target (lower = better)
# Classification target (per the original note: 1 = top 25% fastest). The int
# cast is deferred until after the NaN filter: casting first raises on missing
# labels and would make the notna() check below meaningless.
y_clf = df_ml["high_effectiveness_run"]

# Keep only rows that are complete in every feature and in both targets.
mask = X.notna().all(axis=1) & y_reg.notna() & y_clf.notna()
if not mask.any():
    raise ValueError(
        "No complete rows remain after dropping NaNs in features/targets; "
        "check cleaned_running.csv for missing values."
    )
X, y_reg = X.loc[mask], y_reg.loc[mask]
y_clf = y_clf.loc[mask].astype(int)

# Train/test splits. The two tasks are split independently; the classification
# split is stratified on the label, so its partition can differ from the
# regression one even though both use the same random_state.
X_tr, X_te, y_tr_reg, y_te_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)
X_tr_c, X_te_c, y_tr_clf, y_te_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42, stratify=y_clf)

# -------------------------------------
# 4) Scale features (fit on train only)
# -------------------------------------
# Each task has its own scaler because the two train/test partitions are
# produced by separate splits. Statistics come from the training portion only
# and are then applied to both portions.
reg_scaler = StandardScaler().fit(X_tr)
X_tr_s = reg_scaler.transform(X_tr)
X_te_s = reg_scaler.transform(X_te)

clf_scaler = StandardScaler().fit(X_tr_c)
X_tr_c_s = clf_scaler.transform(X_tr_c)
X_te_c_s = clf_scaler.transform(X_te_c)

# -------------------------------------
# 5A) TensorFlow Regression (predict pace)
# -------------------------------------
# Seed Python, NumPy and TensorFlow in one call before any layer is created.
tf.keras.utils.set_random_seed(42)

# Dense network: two ReLU hidden layers (128 then 64 units) and a single
# linear output unit.
reg_model = Sequential(
    [layers.Input(shape=(X_tr_s.shape[1],))]
    + [layers.Dense(width, activation="relu") for width in (128, 64)]
    + [layers.Dense(1)]
)
reg_model.compile(loss="mse", optimizer="adam", metrics=["mae"])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
hist_reg = reg_model.fit(
    X_tr_s,
    y_tr_reg,
    epochs=200,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early],
    verbose=0,
)

# Evaluate on the held-out regression test set.
yhat_reg = reg_model.predict(X_te_s).reshape(-1)
print("\n=== Regression (Pace min/km) ===")
print("MAE:", round(mean_absolute_error(y_te_reg, yhat_reg), 4))
print("R² :", round(r2_score(y_te_reg, yhat_reg), 4))

# -------------------------------------
# 5B) TensorFlow Classification (high effectiveness)
# -------------------------------------
# "balanced" class weights computed from the training labels, passed to fit()
# as a {label: weight} mapping.
classes = np.unique(y_tr_clf)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr_clf)
class_weight = dict(zip(map(int, classes), map(float, cw)))

# Dense network: two ReLU hidden layers (64 then 32 units) and one sigmoid
# output unit producing a probability.
clf_model = Sequential(
    [layers.Input(shape=(X_tr_c_s.shape[1],))]
    + [layers.Dense(width, activation="relu") for width in (64, 32)]
    + [layers.Dense(1, activation="sigmoid")]
)
clf_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Same early-stopping policy as the regression model: patience of 10 epochs on
# validation loss, restoring the best weights.
early_c = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
hist_clf = clf_model.fit(
    X_tr_c_s,
    y_tr_clf,
    epochs=150,
    batch_size=16,
    validation_split=0.2,
    class_weight=class_weight,
    callbacks=[early_c],
    verbose=0,
)

# Predicted probabilities on the held-out set, thresholded at 0.5.
probs = clf_model.predict(X_te_c_s).reshape(-1)
preds = (probs >= 0.5).astype(int)

print("\n=== Classification (High Effectiveness) ===")
from sklearn.metrics import precision_recall_fscore_support
print(classification_report(y_te_clf, preds, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_te_clf, preds))
# roc_auc_score raises ValueError when y_true holds a single class; report
# that case instead of failing the cell.
try:
    print("ROC AUC:", round(roc_auc_score(y_te_clf, probs), 4))
except ValueError:
    print("ROC AUC: not defined (only one class present in y_true).")

# -------------------------------------
# 6) Save artifacts (models + scalers)
# -------------------------------------
# Create the output folder (and any missing parents) if it is not there yet.
out_dir = "/Users/amlim/triathlon-performance/results/models"
os.makedirs(out_dir, exist_ok=True)

# Models are written in the native .keras format rather than HDF5.
reg_model_path = os.path.join(out_dir, "run_pace_reg_v2.keras")
reg_model.save(reg_model_path)
clf_model_path = os.path.join(out_dir, "run_effectiveness_clf_v2.keras")
clf_model.save(clf_model_path)

# Persist the fitted scalers next to the models so the same scaling can be
# reapplied later.
for scaler, filename in (
    (reg_scaler, "run_reg_scaler_v2.joblib"),
    (clf_scaler, "run_clf_scaler_v2.joblib"),
):
    joblib.dump(scaler, os.path.join(out_dir, filename))

print(f"\nSaved models:\n  {reg_model_path}\n  {clf_model_path}")
print(f"Saved scalers in {out_dir}")
print("\nFeature columns used:", X_cols)