In [None]:
# 🏊‍♂️🚴‍♂️🏃 Triathlon Performance — Cross-Sport Effectiveness Summary
# --------------------------------------------------------------------
# Unified analysis across cycling, running, swimming with robust cycling handling.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import statsmodels.api as sm
from statsmodels.formula.api import ols

sns.set_theme(style="whitegrid")

# ---------------------
# 1) Load Data
# ---------------------
# Each sport has one cleaned CSV under BASE; sports whose file is absent are
# reported and simply left out of `dfs`.
BASE = Path("/Users/amlim/triathlon-performance/data")
files = {
    sport: BASE / filename
    for sport, filename in (
        ("cycling", "cleaned_cycling.csv"),
        ("running", "cleaned_running_merged.csv"),
        ("swimming", "cleaned_swimming.csv"),
    )
}

dfs = {}
for sport, path in files.items():
    if not path.exists():
        print(f"[WARN] Missing file for {sport}")
        continue
    print(f"[INFO] Loading {sport}: {path.name}")
    df = pd.read_csv(path)
    dfs[sport] = df
    print("  Rows:", len(df), "| Cols:", len(df.columns))

# ---------------------
# 2) Helpers
# ---------------------
def norm_cols(df):
    """Map each lower-cased column name of ``df`` to its original spelling.

    If several columns differ only by case, the one that appears last in
    ``df.columns`` is the one kept.
    """
    lookup = {}
    for original in df.columns:
        lookup[original.lower()] = original
    return lookup

def find_col(df, candidates):
    """Return the first column of ``df`` that matches a candidate, ignoring case.

    Candidates are tried in the order given. The column's original spelling is
    returned, or ``None`` when no candidate matches.
    """
    by_lower = norm_cols(df)
    return next(
        (by_lower[name.lower()] for name in candidates if name.lower() in by_lower),
        None,
    )

def parse_duration_to_minutes(series):
    """Parse HH:MM:SS or MM:SS strings to minutes; pass through numeric minutes.

    Each element is converted independently:

    * ``"H:MM:SS"``  -> ``H*60 + MM + SS/60``
    * ``"MM:SS"``    -> ``MM + SS/60``
    * anything ``float()`` accepts (already-numeric minutes) -> that float
    * everything else (including colon strings with 1 or 4+ fields) -> NaN

    The seconds field may be fractional (``"0:30.5"``); hours and minutes must
    be integers.

    Parameters
    ----------
    series : pandas.Series
        Raw duration values (strings and/or numbers).

    Returns
    -------
    pandas.Series
        Durations in minutes, NaN where the value could not be parsed.
    """
    def to_min(x):
        s = str(x).strip()
        if ":" in s:
            parts = s.split(":")
            try:
                if len(parts) == 3:
                    return int(parts[0]) * 60 + int(parts[1]) + float(parts[2]) / 60
                if len(parts) == 2:
                    return int(parts[0]) + float(parts[1]) / 60
            except ValueError:
                # Non-numeric field: treat the whole value as unparseable.
                return np.nan
            # Unsupported number of ":"-separated fields.
            return np.nan
        try:
            return float(s)
        except ValueError:
            return np.nan
    return series.apply(to_min)

def bucket_time_of_day_from_value(v):
    """Coerce a label or clock time to "Morning", "Afternoon" or "Evening".

    Accepted inputs (anything else yields ``np.nan``):

    * a label, case-insensitive and whitespace-tolerant
      ("morning", " EVENING ", ...);
    * a clock time or full timestamp containing ``H:`` -- e.g. "06:30",
      "2023-05-01 06:30:00", "2023-05-01T06:30:00", "6:30 PM".

    Buckets: hours 5-11 -> Morning, 12-17 -> Afternoon, every other valid hour
    (18-23 and the night hours 0-4) -> Evening. Hours above 23 are rejected.

    Parameters
    ----------
    v : object
        Raw value; it is converted with ``str()`` before inspection.

    Returns
    -------
    str or float
        One of the three labels, or ``np.nan`` when no hour can be extracted.
    """
    import re  # function-scope import: `re` is not imported at module level

    low = str(v).strip().lower()
    if low in {"morning", "afternoon", "evening"}:
        return low.title()

    # The hour is the first 1-2 digit run that directly precedes a ":" and is
    # not the tail of a longer number. This skips the date part of timestamps
    # ("2023-05-01 06:30" -> 6) and rejects values such as "100:30".
    hour_match = re.search(r"(?<!\d)(\d{1,2}):", low)
    if hour_match is None:
        return np.nan
    hour = int(hour_match.group(1))

    # 12-hour clock: honour a standalone am/pm (or a.m./p.m.) marker.
    meridiem = re.search(r"(?<![a-z])([ap])\.?m(?![a-z])", low)
    if meridiem is not None and 1 <= hour <= 12:
        hour = hour % 12 + (12 if meridiem.group(1) == "p" else 0)

    if hour > 23:
        return np.nan
    if 5 <= hour <= 11:
        return "Morning"
    if 12 <= hour <= 17:
        return "Afternoon"
    return "Evening"

def rebuild_time_of_day_from_dummies(df, prefix="time_of_day_"):
    """Map dummies time_of_day_Morning/Evening to labels (Afternoon is baseline).

    Column names are matched case-insensitively against ``prefix + "morning"``
    and ``prefix + "evening"``. Rows where the morning dummy equals 1 become
    "Morning"; rows where the evening dummy equals 1 become "Evening" (evening
    wins if both are set); all remaining rows are the "Afternoon" baseline.

    If neither dummy column exists there is nothing to decode, so an all-NaN
    series is returned rather than labelling every row "Afternoon". Callers
    can then treat the time of day as unknown (e.g. ``.fillna("Unknown")``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain the dummy columns.
    prefix : str, default "time_of_day_"
        Common prefix of the dummy column names.

    Returns
    -------
    pandas.Series
        Object-dtype series of labels (or NaN) aligned to ``df.index``.
    """
    lower_map = {str(c).lower(): c for c in df.columns}
    mcol = lower_map.get((prefix + "morning").lower())
    ecol = lower_map.get((prefix + "evening").lower())

    if mcol is None and ecol is None:
        # No dummy encoding present: do not invent an "Afternoon" label.
        return pd.Series(np.nan, index=df.index, dtype=object)

    out = pd.Series("Afternoon", index=df.index, dtype=object)
    if mcol is not None:
        out.loc[df[mcol] == 1] = "Morning"
    if ecol is not None:
        # Applied last so that evening overrides morning when both are flagged.
        out.loc[df[ecol] == 1] = "Evening"
    return out

def normalize(series, ascending=False):
    """Rank-normalize to (0, 1] so that a higher score always means better.

    Values are coerced to numeric; non-numeric entries and +/-inf become NaN
    and stay NaN in the result. The remaining values are replaced by their
    percentile rank (ties share the average rank).

    Parameters
    ----------
    series : pandas.Series
        Raw metric values.
    ascending : bool, default False
        Direction in which the raw metric improves when sorted best-first:

        * ``False`` -- larger raw values are better (e.g. output per minute);
          the largest value scores 1.0.
        * ``True``  -- smaller raw values are better (e.g. pace);
          the smallest value scores 1.0.

    Returns
    -------
    pandas.Series
        Scores in (0, 1], NaN where the input was missing or not finite.
    """
    s = pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan)
    # pandas ranks the *first* element of the chosen sort order as 1 (lowest
    # percentile). To give the best value the highest percentile, rank in the
    # opposite direction to the "best-first" order the caller describes.
    return s.rank(pct=True, ascending=not ascending)

# ---------------------
# 3) Build sport frames
# ---------------------
# Per-sport frames (columns: sport, eff_norm, time_of_day) are collected here
# and concatenated later.
records = []

# ---- CYCLING (uses your columns: Calories, Training Stress Score, Duration, Time of Day/Time) ----
# Builds one effectiveness series for cycling from the first usable metric,
# rank-normalizes it, attaches a time-of-day label and appends the frame to
# `records`. Skipped entirely when the cycling file was not loaded or is empty.
cyc = dfs.get("cycling")
if cyc is not None and not cyc.empty:
    # Parse duration -> minutes
    dur_col = find_col(cyc, ["Duration_min", "duration_min", "Duration", "duration"])
    if dur_col is None:
        print("[WARN] Cycling: no duration column found.")
        # All-NaN placeholder so the per-minute candidates below are skipped.
        duration_min = pd.Series(np.nan, index=cyc.index)
    else:
        # A column whose name contains "min" is read as numeric minutes;
        # any other duration column is parsed as a clock-style string.
        if "min" in dur_col.lower():
            duration_min = pd.to_numeric(cyc[dur_col], errors="coerce")
        else:
            duration_min = parse_duration_to_minutes(cyc[dur_col])
    # Compute per-minute metrics if possible
    cal_col = find_col(cyc, ["Calories","calories"])
    tss_col = find_col(cyc, ["Training Stress Score","training stress score","TSS","tss"])
    avg_speed_col = find_col(cyc, ["Avg Speed","Average Speed","avg_speed","average_speed"])
    dist_col = find_col(cyc, ["Distance_km","distance_km","Distance","distance"])

    # Candidate (name, series) pairs in priority order: calories/min, TSS/min,
    # distance/min, then raw average speed.
    candidates = []
    if cal_col is not None and duration_min.notna().any():
        # Strip thousands separators ("1,234") before numeric conversion.
        cal = pd.to_numeric(cyc[cal_col].astype(str).str.replace(",",""), errors="coerce")
        candidates.append(("Calories_per_min", cal / duration_min))
    if tss_col is not None and duration_min.notna().any():
        tssv = pd.to_numeric(cyc[tss_col], errors="coerce")
        candidates.append(("TSS_per_min", tssv / duration_min))
    if dist_col is not None and duration_min.notna().any():
        dist = pd.to_numeric(cyc[dist_col], errors="coerce")
        candidates.append(("Distance_per_min", dist / duration_min))
    if avg_speed_col is not None:
        spd = pd.to_numeric(cyc[avg_speed_col], errors="coerce")
        candidates.append((avg_speed_col, spd))

    # Pick the first candidate that has at least one value and is not all zeros.
    eff_series, eff_name = None, None
    for name, ser in candidates:
        # Division by a zero duration yields +/-inf; treat those as missing.
        ser = ser.replace([np.inf,-np.inf], np.nan)
        if ser.notna().sum() > 0 and ser.fillna(0).sum() != 0:
            eff_series, eff_name = ser, name
            break

    if eff_series is None:
        print("[WARN] Cycling: effectiveness still empty after attempts.")
    else:
        # Get time_of_day: prefer an explicit label/clock column, otherwise
        # fall back to decoding one-hot dummy columns.
        tod_col = find_col(cyc, ["Time of Day","time of day","time_of_day","Time_of_Day","Time","time","start_time_local"])
        if tod_col:
            tod = cyc[tod_col].apply(bucket_time_of_day_from_value)
        else:
            tod = rebuild_time_of_day_from_dummies(cyc)
        tod = tod.fillna("Unknown")

        rec = pd.DataFrame({
            "sport": "cycling",
            # Intent: higher raw value = better.
            # NOTE(review): verify that normalize(ascending=False) really
            # scores larger raw values higher.
            "eff_norm": normalize(eff_series, ascending=False),  # higher is better for intensity
            "time_of_day": tod
        })
        print(f"[INFO] Cycling effectiveness source: {eff_name} | non-null: {rec['eff_norm'].notna().mean():.1%}")
        records.append(rec)

# ---- RUNNING (lower pace is better) ----
# Builds the running frame (sport, eff_norm, time_of_day) from a pace column,
# computing pace from duration and distance when no pace column exists.
# Skipped when the running file was not loaded or is empty.
run = dfs.get("running")
if run is not None and not run.empty:
    pace_col = find_col(run, ["avg_pace_min_per_km","Average Pace (min/km)","average_pace_min_per_km"])
    if pace_col is None:
        # compute pace if possible
        dur = find_col(run, ["duration_min"])
        dist = find_col(run, ["distance_km"])
        if dur and dist:
            # A zero distance gives inf here; normalize() turns inf into NaN.
            pace = pd.to_numeric(run[dur], errors="coerce") / pd.to_numeric(run[dist], errors="coerce")
            # Adds a column to the loaded frame in place (also visible via dfs["running"]).
            run["computed_pace_min_per_km"] = pace
            pace_col = "computed_pace_min_per_km"
            print("[INFO] Running: computed pace = duration_min / distance_km")
    # Without any pace source the sport is silently left out of `records`.
    if pace_col:
        tod_col = find_col(run, ["time_of_day","Time_of_Day","start_time_local"])
        # No time-of-day column -> every row is labelled "Unknown".
        tod = run[tod_col].apply(bucket_time_of_day_from_value) if tod_col else pd.Series("Unknown", index=run.index)
        rec = pd.DataFrame({
            "sport": "running",
            # Intent: lower pace = better.
            # NOTE(review): verify that normalize(ascending=True) really
            # scores smaller raw values higher.
            "eff_norm": normalize(run[pace_col], ascending=True),  # lower pace = better
            "time_of_day": tod.fillna("Unknown")
        })
        print(f"[INFO] Running effectiveness source: {pace_col} | non-null: {rec['eff_norm'].notna().mean():.1%}")
        records.append(rec)

# ---- SWIMMING (lower pace per 100m is better) ----
# Builds the swimming frame (sport, eff_norm, time_of_day) from a per-100m
# pace column, computing it from duration and distance when none exists.
# Skipped when the swimming file was not loaded or is empty.
swi = dfs.get("swimming")
if swi is not None and not swi.empty:
    # Either candidate is accepted as-is; presumably one is in minutes and the
    # other in seconds (by name) -- the score is rank-based, so the unit should
    # not matter. TODO confirm.
    pace100_col = find_col(swi, ["pace_min_per_100m","pace_s_per_100m"])
    if pace100_col is None:
        dur = find_col(swi, ["duration_min"])
        dist_m = find_col(swi, ["distance_m","distance"])
        if dur and dist_m:
            # A zero distance gives inf here; normalize() turns inf into NaN.
            pace_min_per_m = pd.to_numeric(swi[dur], errors="coerce") / pd.to_numeric(swi[dist_m], errors="coerce")
            # Adds a column to the loaded frame in place (also visible via dfs["swimming"]).
            swi["computed_pace_min_per_100m"] = pace_min_per_m * 100
            pace100_col = "computed_pace_min_per_100m"
            print("[INFO] Swimming: computed pace_min_per_100m from duration_min & distance_m")
    # Without any pace source the sport is silently left out of `records`.
    if pace100_col:
        tod_col = find_col(swi, ["time_of_day","Time_of_Day","start_time_local"])
        # No time-of-day column -> every row is labelled "Unknown".
        tod = swi[tod_col].apply(bucket_time_of_day_from_value) if tod_col else pd.Series("Unknown", index=swi.index)
        rec = pd.DataFrame({
            "sport": "swimming",
            # Intent: lower pace = better.
            # NOTE(review): verify that normalize(ascending=True) really
            # scores smaller raw values higher.
            "eff_norm": normalize(swi[pace100_col], ascending=True),  # lower pace = better
            "time_of_day": tod.fillna("Unknown")
        })
        print(f"[INFO] Swimming effectiveness source: {pace100_col} | non-null: {rec['eff_norm'].notna().mean():.1%}")
        records.append(rec)

# ---------------------
# 4) Combine & Save
# ---------------------
# Stack the per-sport frames into one long table, print a short summary and
# persist it next to the data directory under results/.
if not records:
    # None of the sports produced a usable effectiveness series.
    raise ValueError("No valid sports data found for analysis.")
df_all = pd.concat(records, ignore_index=True)

print("\n[INFO] Combined dataset summary:")
print(df_all["sport"].value_counts())
print(df_all.groupby("sport")["eff_norm"].describe()[["count","mean","std","min","max"]])

out_path = BASE.parent / "results" / "triathlon_all_effectiveness.csv"
# Create results/ on first run; harmless if it already exists.
out_path.parent.mkdir(parents=True, exist_ok=True)
df_all.to_csv(out_path, index=False)
# The arrow in this message was previously stored as mis-decoded bytes.
print(f"[OK] Saved combined dataset → {out_path}")

# ---------------------
# 5) Visualizations
# ---------------------
# Figure 1: distribution of the normalized score per sport, with the
# individual sessions overlaid as faint points.
plt.figure(figsize=(7,5))
sns.boxplot(data=df_all, x="sport", y="eff_norm", palette="Set2")
sns.stripplot(data=df_all, x="sport", y="eff_norm", color="black", size=2, alpha=0.25)
plt.title("Normalized Effectiveness by Sport")
plt.ylim(0,1)
plt.tight_layout()
plt.show()

# Figure 2: score distribution per time-of-day bucket, one violin per sport.
if df_all["time_of_day"].notna().any():
    plt.figure(figsize=(9,6))
    violin_kwargs = dict(data=df_all, x="time_of_day", y="eff_norm", hue="sport",
                         inner="quartile", palette="Set3")
    try:
        sns.violinplot(split=True, **violin_kwargs)
    except ValueError:
        # seaborn < 0.13 only allows split violins with exactly two hue levels
        # and rejects the call before drawing anything; fall back to regular
        # (dodged) violins. Any unrelated ValueError is raised again by this
        # second call.
        sns.violinplot(split=False, **violin_kwargs)
    plt.title("Effectiveness by Time of Day Across Sports")
    plt.ylim(0,1)
    plt.tight_layout()
    plt.show()

# ---------------------
# 6) ANOVA
# ---------------------
# Main-effects (additive) ANOVA on the rows that have a score. The model has
# no sport:time_of_day interaction term.
ana = df_all.dropna(subset=["eff_norm"]).copy()
# time_of_day is filled with "Unknown" upstream, so a non-null check is always
# true. What matters is whether the factor actually varies: a factor with a
# single level carries no information and leaves its term with no degrees of
# freedom, so fall back to the sport-only model in that case.
has_tod = "time_of_day" in ana.columns and ana["time_of_day"].nunique(dropna=True) > 1
if has_tod:
    model = ols("eff_norm ~ C(sport) + C(time_of_day)", data=ana).fit()
    heading = "\n=== Two-Way ANOVA (Sport × Time of Day) ==="
else:
    model = ols("eff_norm ~ C(sport)", data=ana).fit()
    heading = "\n=== One-Way ANOVA (Sport) ==="
# Type II sums of squares: each factor is tested after the other main effects.
anova_table = sm.stats.anova_lm(model, typ=2)
print(heading)
display(anova_table)

In [None]:
# Summary statistics of the normalized score for each sport (cell output).
df_all["eff_norm"].groupby(df_all["sport"]).describe()