# In [None]:
import pandas as pd
import numpy as np

# -----------------------
# 1) Load synthetic swim data
# -----------------------
raw_path = "/Users/amlim/triathlon-performance/data/synthetic_swimming.csv"
df = pd.read_csv(raw_path)

print("Raw columns:", df.columns.tolist())
print("Rows:", len(df))

# -----------------------
# 2) Normalize column names
# -----------------------
# Trim, lower-case, then collapse every run of characters outside a-z/0-9
# into a single "_" and drop leading/trailing underscores, so the headers
# can be addressed with plain snake_case keys below.
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace(r"[^a-z0-9]+", "_", regex=True)
              .str.strip("_"))

print("Normalized columns:", df.columns.tolist())

# -----------------------
# 3) Parse date & time
# -----------------------
# Unparseable dates are coerced to NaT. Format with strftime instead of
# `.dt.date.astype(str)`: the latter turns NaT into the literal text "NaT",
# which would end up in the saved CSV as if it were a real value, whereas
# strftime leaves those rows missing. Valid dates still render as YYYY-MM-DD.
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.strftime("%Y-%m-%d")

def bucket_tod(tstr):
    """Bucket a clock-time value into a coarse time-of-day label.

    The hour is read from the text before the first ":" of ``str(tstr)``,
    so values such as ``"06:30"``, ``"6:30:00"`` or a ``datetime.time``
    are accepted.

    Args:
        tstr: Start time; any object whose string form begins with the
            hour, optionally followed by ":" and the rest of the time.

    Returns:
        "Morning" for hours 5-11, "Afternoon" for hours 12-17, "Evening"
        for every other valid hour (18-23 and 0-4), or ``np.nan`` when the
        hour cannot be parsed or lies outside 0-23.
    """
    try:
        hour = int(str(tstr).split(":")[0])
    except ValueError:
        # Missing or malformed values (NaN, "", free text) carry no hour.
        return np.nan
    if not 0 <= hour <= 23:
        # An impossible hour is bad data, not an evening session.
        return np.nan
    if 5 <= hour <= 11:
        return "Morning"
    if 12 <= hour <= 17:
        return "Afternoon"
    return "Evening"

# Label each session with its coarse time-of-day bucket.
df["time_of_day"] = df["start_time"].apply(bucket_tod)

# -----------------------
# 4) Effectiveness proxy
# -----------------------
# Pace is in min/100m, so smaller is faster: the 25th percentile of pace is
# the cut-off for the fastest quarter of sessions.
thr = df["pace_min_per_100m"].quantile(q=0.25)
# 1 when the session is at or below the cut-off, 0 otherwise.
df["high_effectiveness_swim"] = df["pace_min_per_100m"].le(thr).astype(int)

# -----------------------
# 5) Select cleaned columns
# -----------------------
# Columns carried into the cleaned dataset, in output order.
keep = [
    "date",
    "start_time",
    "time_of_day",
    "indoor_outdoor",
    "distance_m",
    "duration_min",
    "pace_min_per_100m",
    "high_effectiveness_swim",
]
clean = df.loc[:, keep].copy()

# -----------------------
# 6) Save cleaned dataset
# -----------------------
clean_path = "/Users/amlim/triathlon-performance/data/cleaned_swimming.csv"
clean.to_csv(clean_path, index=False)
print("Saved:", clean_path, "rows:", len(clean))

# Notebook preview of the first five cleaned rows.
clean.head()