In [None]:
# =====================================
# Swimming — ML Models (Reg + Class)
# =====================================
import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, r2_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.utils.class_weight import compute_class_weight
import joblib

import tensorflow as tf
from tensorflow.keras import Sequential, layers, callbacks

# -------------------------
# 0) Load cleaned dataset
# -------------------------
# The cleaned swim log is read from a fixed absolute path; everything below
# works on the resulting DataFrame `df`.
csv_path = "/Users/amlim/triathlon-performance/data/cleaned_swimming.csv"
df = pd.read_csv(csv_path)

# Columns that must be present in the CSV before any processing starts.
required = [
    "date", "start_time", "time_of_day", "indoor_outdoor",
    "distance_m", "duration_min", "pace_min_per_100m",
    "high_effectiveness_swim"
]
# Fail fast, reporting absent columns in the order they are listed above.
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns in cleaned_swimming.csv: {missing}")

# Normalize categoricals.
# astype(str) also turns missing values into the literal text "nan", so they
# survive as their own category ("Nan" / "nan") rather than staying NaN.
tod_text = df["time_of_day"].astype(str)
df["time_of_day"] = tod_text.str.strip().str.title()

# Lower-case the venue and collapse the plural spellings onto the singular.
venue_text = df["indoor_outdoor"].astype(str).str.strip().str.lower()
df["indoor_outdoor"] = venue_text.replace({"indoors":"indoor","outdoors":"outdoor"})

# -------------------------------------
# 1) Feature engineering (for regression)
# -------------------------------------
# Baseline pace in min/100m, computed as duration divided by the number of
# 100 m units swum. The errstate guard suppresses NumPy divide-by-zero and
# invalid-value warnings raised inside the block.
with np.errstate(divide="ignore", invalid="ignore"):
    hundreds_of_m = df["distance_m"] / 100.0
    df["pace_baseline_min_per_100m"] = df["duration_min"] / hundreds_of_m

# Any non-finite result (+/-inf or NaN) is stored as NaN so it reads as an
# ordinary missing value.
pace_is_finite = np.isfinite(df["pace_baseline_min_per_100m"])
df["pace_baseline_min_per_100m"] = df["pace_baseline_min_per_100m"].where(pace_is_finite)

# log(1 + distance) compresses the range of the raw distance feature.
df["log_distance_m"] = np.log1p(df["distance_m"])

# Five distance bins; pd.cut uses right-closed intervals by default, so a
# swim of exactly 800 m lands in "<=800m".
inner_edges = [800, 1500, 2500, 4000]
bins = [-np.inf, *inner_edges, np.inf]
labels = ["<=800m", "800-1500m", "1500-2500m", "2500-4000m", ">4000m"]
df["distance_bin"] = pd.cut(df["distance_m"], bins=bins, labels=labels)

# -------------------------------------
# 2) One-hot encode categoricals (safe)
# -------------------------------------
# A categorical is encoded only when it takes more than one distinct value.
# nunique() ignores NaN, so a column that is entirely missing (or constant)
# is skipped instead of producing useless dummy columns.
cat_cols = [
    col
    for col in ("time_of_day", "indoor_outdoor", "distance_bin")
    if df[col].nunique() > 1
]

# drop_first=True removes one level per encoded column, leaving k-1 dummies.
df_ml = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# -------------------------------------
# 3) Build feature matrices
# -------------------------------------
reg_target = "pace_min_per_100m"                 # lower is better
clf_target = "high_effectiveness_swim"           # 1 = top 25% fastest

# numeric base features (add more later if you collect them)
# NOTE(review): "pace_baseline_min_per_100m" is duration / (distance / 100),
# which looks like the same quantity as the regression target, and the
# classification label is described above as derived from pace. Together with
# duration and distance this is probably target leakage -- confirm before
# trusting the reported metrics. The feature set is left unchanged here so
# saved models/scalers keep the same input layout.
base_feats = [
    "distance_m", "duration_min",
    "pace_baseline_min_per_100m", "log_distance_m"
]
base_feats = [c for c in base_feats if c in df_ml.columns]

# collect the dummy columns created by the one-hot encoding step
dummy_prefixes = ("time_of_day_", "indoor_outdoor_", "distance_bin_")
dummy_feats = [c for c in df_ml.columns if c.startswith(dummy_prefixes)]

X_cols = base_feats + dummy_feats
if not X_cols:
    raise ValueError("No feature columns found. Check your cleaned CSV and one-hot encoding.")

# Non-numeric feature values are coerced to NaN and removed by the mask below.
X = df_ml[X_cols].apply(pd.to_numeric, errors="coerce")
y_reg = df_ml[reg_target]
# Do NOT cast the label to int yet: casting a column that contains NaN raises,
# which would make the notna() filter below unreachable for missing labels.
y_clf = df_ml[clf_target]

# Keep only rows where every feature and both targets are present.
mask = X.notna().all(axis=1) & y_reg.notna() & y_clf.notna()
X, y_reg = X.loc[mask], y_reg.loc[mask]
y_clf = y_clf.loc[mask].astype(int)   # safe now: no missing labels remain

if X.empty:
    raise ValueError("No complete rows left after dropping missing features/targets.")

# Train/test splits (keep classification stratified)
# The two calls partition the rows independently; the stratified split is not
# guaranteed to select the same test rows as the plain one.
X_tr, X_te, y_tr_reg, y_te_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)
X_tr_c, X_te_c, y_tr_clf, y_te_clf = train_test_split(X, y_clf, test_size=0.2, random_state=42, stratify=y_clf)

# -------------------------------------
# 4) Scale features (fit on train only)
# -------------------------------------
# One scaler per task, matching the two separate train/test splits. Each
# scaler learns its statistics from the training rows only and is then
# applied to both the training and the test partition.
reg_scaler = StandardScaler().fit(X_tr)
X_tr_s, X_te_s = reg_scaler.transform(X_tr), reg_scaler.transform(X_te)

clf_scaler = StandardScaler().fit(X_tr_c)
X_tr_c_s, X_te_c_s = clf_scaler.transform(X_tr_c), clf_scaler.transform(X_te_c)

# -------------------------------------
# 5A) TensorFlow Regression (predict pace_min_per_100m)
# -------------------------------------
# Fix the random seed so weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

# Small fully connected network: two ReLU hidden layers and a single linear
# output unit for the predicted pace.
n_reg_features = X_tr_s.shape[1]
reg_layers = [
    layers.Input(shape=(n_reg_features,)),
    layers.Dense(128, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(1),
]
reg_model = Sequential(reg_layers)
reg_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
reg_fit_kwargs = dict(
    validation_split=0.2,   # 20% of the training rows held out by Keras
    epochs=200,
    batch_size=16,
    verbose=0,
    callbacks=[early],
)
hist_reg = reg_model.fit(X_tr_s, y_tr_reg, **reg_fit_kwargs)

# Evaluate on the held-out test rows; predict() returns shape (n, 1), so
# flatten it to line up with the 1-D target.
yhat_reg = reg_model.predict(X_te_s).ravel()
print("\n=== Swimming Regression (Pace min/100m) ===")
reg_mae = mean_absolute_error(y_te_reg, yhat_reg)
print("MAE:", round(reg_mae, 4))
reg_r2 = r2_score(y_te_reg, yhat_reg)
print("R² :", round(reg_r2, 4))

# -------------------------------------
# 5B) TensorFlow Classification (high effectiveness)
# -------------------------------------
# "balanced" class weights, computed on the training labels only, passed to
# Keras as a {label: weight} mapping with plain Python int/float values.
classes = np.unique(y_tr_clf)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr_clf)
class_weight = dict(zip(map(int, classes), map(float, cw)))

# Two ReLU hidden layers followed by a single sigmoid unit that outputs the
# probability of the positive class.
n_clf_features = X_tr_c_s.shape[1]
clf_layers = [
    layers.Input(shape=(n_clf_features,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
]
clf_model = Sequential(clf_layers)
clf_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Same early-stopping policy as the regression model.
early_c = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
clf_fit_kwargs = dict(
    validation_split=0.2,
    epochs=150,
    batch_size=16,
    verbose=0,
    callbacks=[early_c],
    class_weight=class_weight,
)
hist_clf = clf_model.fit(X_tr_c_s, y_tr_clf, **clf_fit_kwargs)

# Predicted probabilities on the test rows, thresholded at 0.5 for hard labels.
probs = clf_model.predict(X_te_c_s).ravel()
preds = (probs >= 0.5).astype(int)

print("\n=== Swimming Classification (High Effectiveness) ===")
print(classification_report(y_te_clf, preds, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_te_clf, preds))
# roc_auc_score raises ValueError when the test labels contain one class only.
try:
    test_auc = roc_auc_score(y_te_clf, probs)
except ValueError:
    print("ROC AUC: not defined (only one class present in y_true).")
else:
    print("ROC AUC:", round(test_auc, 4))

# -------------------------------------
# 6) (Optional) Threshold tuning helpers
# -------------------------------------
# Plotting is best-effort: if matplotlib is unavailable or a curve cannot be
# computed, the reason is printed and the script carries on.
try:
    from sklearn.metrics import roc_curve, precision_recall_curve, auc
    import matplotlib.pyplot as plt

    # ROC curve with the chance diagonal for reference.
    fpr, tpr, thr = roc_curve(y_te_clf, probs)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, label=f"AUC={roc_auc:.2f}")
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title("ROC — Swim Effectiveness")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Precision-recall curve.
    prec, rec, pr_thr = precision_recall_curve(y_te_clf, probs)
    plt.figure(figsize=(6,6))
    plt.plot(rec, prec)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("PR Curve — Swim Effectiveness")
    plt.tight_layout()
    plt.show()
except Exception as e:
    print("Threshold plotting skipped:", e)

# Example alt threshold
# Kept outside the try block: this report needs no plotting, so it should
# still be printed when the plots above are skipped. The value is a fixed
# example, not one derived from the curves.
opt_thresh = 0.65
preds_alt = (probs >= opt_thresh).astype(int)
print(f"\nClassification report @ threshold {opt_thresh}")
print(classification_report(y_te_clf, preds_alt, digits=3))

# -------------------------------------
# 7) Save artifacts (models + scalers)
# -------------------------------------
# Everything is written to one fixed output directory, created on demand
# (including any missing parents).
out_dir = "/Users/amlim/triathlon-performance/results/models"
os.makedirs(out_dir, exist_ok=True)

# Keras models, saved in the native .keras format.
reg_model_path = os.path.join(out_dir, "swim_pace_reg_v1.keras")
clf_model_path = os.path.join(out_dir, "swim_effectiveness_clf_v1.keras")
for trained_model, destination in ((reg_model, reg_model_path), (clf_model, clf_model_path)):
    trained_model.save(destination)

# The fitted scalers are stored next to the models so the same scaling can
# be re-applied when the models are loaded.
scaler_files = (
    (reg_scaler, "swim_reg_scaler_v1.joblib"),
    (clf_scaler, "swim_clf_scaler_v1.joblib"),
)
for fitted_scaler, file_name in scaler_files:
    joblib.dump(fitted_scaler, os.path.join(out_dir, file_name))

print(f"\nSaved models:\n  {reg_model_path}\n  {clf_model_path}")
print(f"Saved scalers in {out_dir}")
print("\nFeature columns used:", X.columns.tolist())