In [None]:
# -----------------------------------------
# Swimming EDA — pace-focused visualizations
# -----------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# 0) Read the cleaned swimming sessions into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, while the figure
# output directory further down is relative — consider making this relative
# to the project as well (confirm where this notebook is run from).
df = pd.read_csv("/Users/amlim/triathlon-performance/data/cleaned_swimming.csv")

# Fail fast if any column used by the sections below is absent, rather than
# hitting a KeyError halfway through the plots.
expected = [
    "pace_min_per_100m",
    "time_of_day",
    "indoor_outdoor",
    "distance_m",
    "duration_min",
]
# Keep the order of `expected` so the error message lists columns predictably.
missing = [name for name in expected if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing columns in cleaned_swimming.csv: {missing}")

# 1) Normalize categories
# astype(str) is needed so the .str accessors work whatever dtype the column
# was parsed as, but it also turns missing values into the literal text "nan"
# ("Nan" after title-casing). Left alone, that text would show up as a real
# category in the counts and plots and would slip past every later dropna().
# Each cleaned column is therefore masked with the original not-null
# positions, so missing entries stay NaN. (The right-hand side is evaluated
# before the assignment, so .notna() still sees the raw column.)
df["time_of_day"] = (df["time_of_day"]
                     .astype(str).str.strip().str.title()
                     .where(df["time_of_day"].notna()))
df["indoor_outdoor"] = (df["indoor_outdoor"]
                        .astype(str).str.strip().str.lower()
                        # fold plural spellings into the two canonical labels
                        .replace({"indoors": "indoor", "outdoors": "outdoor"})
                        .where(df["indoor_outdoor"].notna()))

# dropna=False so genuinely missing values are reported as their own row.
print("Counts — time_of_day:\n", df["time_of_day"].value_counts(dropna=False), "\n")
print("Counts — indoor_outdoor:\n", df["indoor_outdoor"].value_counts(dropna=False), "\n")

# 2) Shared plot configuration
# One fixed colour per venue label (blue = indoor, orange = outdoor) so every
# figure below uses the same colours.
custom_palette = dict(indoor="#1f77b4", outdoor="#ff7f0e")
# Dicts preserve insertion order, so this yields ["indoor", "outdoor"].
hue_order = list(custom_palette)

# Apply one seaborn theme to all figures.
sns.set_theme(context="notebook", style="whitegrid")

# Output directory for saved figures; created (with parents) if missing.
fig_dir = Path("..") / "results" / "figs"
fig_dir.mkdir(exist_ok=True, parents=True)

# 3) Pace distribution (min/100m) — lower is better
# 30-bin histogram of pace with a KDE curve overlaid, drawn on an explicit
# Axes, saved as a PNG and then displayed.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    df["pace_min_per_100m"],
    bins=30,
    color="#4c78a8",
    kde=True,
    ax=ax,
)
ax.set_xlabel("Pace (min/100m)")
ax.set_title("Swimming Pace Distribution")
fig.tight_layout()
fig.savefig(fig_dir / "swim_pace_hist.png", dpi=150)
plt.show()

# 4) Pace by time of day & indoor/outdoor — Boxplot
# One box per (time of day, venue) pair; hue_order keeps the venue colours
# and their left/right placement fixed.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=df,
    x="time_of_day",
    y="pace_min_per_100m",
    hue="indoor_outdoor",
    hue_order=hue_order,
    palette=custom_palette,
    ax=ax,
)
ax.set_title("Pace (min/100m) by Time of Day & Indoor/Outdoor")
ax.set_ylabel("Pace (min/100m)  (lower = better)")
ax.set_xlabel("Time of Day")
fig.tight_layout()
fig.savefig(fig_dir / "swim_pace_box_tod_inout.png", dpi=150)
plt.show()

# 5) Pace distribution shape — Violin
# Split violins need both venue labels to be present. hue_order already
# restricts the plot to "indoor"/"outdoor", so any other label in the column
# (for example a stringified missing value) must not disable the split:
# check that both plotted labels occur instead of requiring the set of
# labels to equal hue_order exactly.
has_both = set(hue_order).issubset(df["indoor_outdoor"].dropna().unique())
plt.figure(figsize=(8,6))
sns.violinplot(
    data=df,
    x="time_of_day",
    y="pace_min_per_100m",
    hue="indoor_outdoor",
    hue_order=hue_order,
    palette=custom_palette,
    split=has_both,       # only split if both indoor/outdoor present
    inner="quartile",     # draw quartile lines inside each violin
    cut=0                 # do not extend the density past the observed range
)
plt.title("Pace (min/100m) Distribution by Time of Day")
plt.ylabel("Pace (min/100m)  (lower = better)")
plt.xlabel("Time of Day")
plt.tight_layout()
plt.savefig(fig_dir / "swim_pace_violin_tod_inout.png", dpi=150)
plt.show()

# 6) Optional: Pace vs. Distance (scatter) — quick sanity relationship
# Each row is one point, coloured by venue; alpha below 1 keeps overlapping
# points distinguishable.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df,
    x="distance_m",
    y="pace_min_per_100m",
    hue="indoor_outdoor",
    hue_order=hue_order,
    palette=custom_palette,
    alpha=0.8,
    ax=ax,
)
ax.set_title("Pace (min/100m) vs Distance (m)")
ax.set_ylabel("Pace (min/100m)")
ax.set_xlabel("Distance (m)")
fig.tight_layout()
fig.savefig(fig_dir / "swim_pace_vs_distance.png", dpi=150)
plt.show()

# 7) Optional: Two-way ANOVA (time_of_day × indoor_outdoor)
# Best-effort step: any failure inside the try (including the statsmodels
# imports) is printed and the script carries on.
try:
    import statsmodels.formula.api as smf
    from statsmodels.stats.anova import anova_lm

    factor_cols = ["time_of_day", "indoor_outdoor"]

    # Work on a copy holding only rows where the response and both factors
    # are non-null.
    dd = df.dropna(subset=["pace_min_per_100m", *factor_cols]).copy()

    # The interaction model needs at least two levels in each factor.
    if any(dd[col].nunique() <= 1 for col in factor_cols):
        print("\nANOVA skipped (need >1 level for time_of_day and indoor_outdoor).")
    else:
        dd = dd.astype({col: "category" for col in factor_cols})

        # Main effects plus interaction, evaluated with a type-2 ANOVA table.
        model = smf.ols(
            "pace_min_per_100m ~ C(time_of_day) * C(indoor_outdoor)",
            data=dd,
        ).fit()
        anova_table = anova_lm(model, typ=2)
        print("\nTwo-way ANOVA (pace_min_per_100m):")
        print(anova_table)
except Exception as err:
    print("\nANOVA step skipped due to error:", err)