# 03 — Machine Learning (Regression & Classification)
- feature prep & scaling
- TF models
- evaluation (MAE/R², confusion matrix, ROC AUC)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the ride-level CSV that every later cell reads and extends as `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
df = pd.read_csv(
    filepath_or_buffer="/Users/amlim/cycling-effectiveness/data/cleaned_cycling.csv"
)


In [None]:
# Show the first 20 rows to eyeball the columns and raw value formats.
df.head(n=20)

In [None]:
def clean_numeric_column(df, cols):
    """
    Convert text-formatted numeric columns of a DataFrame to float, in place.

    For each requested column the values are turned into strings, commas and
    surrounding whitespace are removed, and placeholder entries become NaN
    before the column is cast to float. Treated as missing:
    - values that were already missing (None / NaN)
    - the placeholder '--'
    - the literal text 'nan'
    - empty strings

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    cols : str or list of str
        A single column name or a list of column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns stored as float.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    ValueError
        If a value is neither numeric text nor one of the placeholders above.
    """
    if isinstance(cols, str):
        cols = [cols]   # allow single column name

    placeholders = ["--", "nan", ""]

    for col in cols:
        raw = df[col]
        text = (
            raw.astype(str)                            # ensure string
               .str.replace(",", "", regex=False)      # remove commas
               .str.strip()                            # drop stray whitespace
        )
        # Record missingness on the raw values as well: once stringified, None
        # reads as the text "None", which the float cast below would reject.
        is_missing = raw.isna() | text.isin(placeholders)
        df[col] = text.mask(is_missing, np.nan).astype(float)
    return df

In [None]:
# Convert these three columns to float via the helper above (it strips commas
# and maps the '--' placeholder to NaN).
df = clean_numeric_column(df=df, cols=["Calories", "Avg HR", "Avg Speed"])

In [None]:
# Derive an indoor/outdoor label from "Activity Type" when the data does not
# already carry one.
if "Session_Type" not in df.columns and "Activity Type" in df.columns:
    def classify_session(activity):
        """Return "Indoors" for 'indoor_cycling' / 'virtual_ride', else "Outdoors"."""
        return "Indoors" if activity in ("indoor_cycling", "virtual_ride") else "Outdoors"

    # Write to "Session_Type" (underscore): that is the name tested by the
    # guard above and the name the later one-hot encoding step looks for.
    # The previous "Session Type" (space) was never picked up downstream.
    df["Session_Type"] = df["Activity Type"].apply(classify_session)

In [None]:
# Rename the '®'-suffixed column to the underscore name that the guard below
# and the later cells look up. The previous target, "Training Stress Score"
# (with spaces), matched neither, so the column was never used downstream.
if "Training Stress Score®" in df.columns and "Training_Stress_Score" not in df.columns:
    df.rename(columns={"Training Stress Score®": "Training_Stress_Score"}, inplace=True)
    # With the name fixed, this column is divided by Duration_min in a later
    # cell, so it must be numeric: strip commas and turn anything unparseable
    # (e.g. '--') into NaN.
    df["Training_Stress_Score"] = pd.to_numeric(
        df["Training_Stress_Score"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

In [None]:
# Re-inspect the first 10 rows after the cleaning and renaming cells above.
df.head(n=10)

## ML and Regression

In [None]:
# Build "Duration_min" (ride length in minutes) from the text "Duration"
# column when it is not already present.
if "Duration_min" not in df.columns and "Duration" in df.columns:
    def parse_duration(val):
        """Convert a duration string to minutes; return None if it cannot be parsed.

        Two-part values ("A:B") get a "00:" prefix before parsing, so they are
        read as minutes:seconds rather than hours:minutes.
        """
        try:
            if len(val.split(":")) == 2:
                val = "00:" + val
            return pd.to_timedelta(val).total_seconds() / 60
        except (ValueError, TypeError, AttributeError, OverflowError):
            # Best effort: an unparseable entry becomes missing and is dropped
            # below. Unlike a bare `except:`, this does not swallow
            # KeyboardInterrupt / SystemExit.
            return None

    df["Duration_min"] = df["Duration"].astype(str).apply(parse_duration)

if "Duration_min" not in df.columns:
    # Same exception type dropna() would raise, with an actionable message.
    raise KeyError(
        "Duration_min is missing and there is no 'Duration' column to derive it from."
    )

# Rows without a usable duration cannot produce per-minute rates. The explicit
# copy keeps later column assignments on an independent frame.
df = df.dropna(subset=["Duration_min"]).copy()

In [None]:
# Per-minute intensity features.
# A zero duration would make the division return +/-inf, which is not NaN and
# so passes the later notna() row filter. Treat zero durations as missing so
# those rows end up NaN instead.
duration_nonzero = df["Duration_min"].replace(0, np.nan)

if "Calories_per_min" not in df.columns:
    df["Calories_per_min"] = df["Calories"] / duration_nonzero
if "Training_Stress_Score" in df.columns and "TSS_per_min" not in df.columns:
    df["TSS_per_min"] = df["Training_Stress_Score"] / duration_nonzero

In [None]:
# Binary label: 1 when a ride's Calories_per_min is at or above the 75th
# percentile of that column, else 0.
if "High_Effectiveness" not in df.columns:
    thr = df["Calories_per_min"].quantile(0.75)
    df["High_Effectiveness"] = df["Calories_per_min"].ge(thr).astype(int)

# One-hot encode the categorical columns that are present and still stored as
# object dtype. drop_first=True drops the first category of each column.
for categorical in ("Session_Type", "Time_of_Day"):
    if categorical in df.columns and df[categorical].dtype == "object":
        df = pd.get_dummies(df, columns=[categorical], drop_first=True)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# If you haven't already in a prior cell, ensure df exists:
# df = pd.read_csv("../data/cleaned_cycling.csv")

# If you did one-hot encoding in earlier cells on df, great.
# Now create a modeling copy:
df_ml = df.copy()

# --- Build feature list safely from what actually exists ---
num_feats = [
    "Duration_min", "Distance", "Avg HR", "Avg Speed",
    "Avg Bike Cadence", "Max HR", "Max Speed",
    "Power", "Max Power", "Max Avg Power (20 min)",
    "Elev Gain", "Elev Loss", "Training_Stress_Score"
]
num_feats = [c for c in num_feats if c in df_ml.columns]

# One-hot (dummy) columns that may exist (Afternoon is implicit baseline)
dummy_feats = [
    "Session_Type_Outdoors",
    "Time_of_Day_Morning",
    "Time_of_Day_Evening"
]
dummy_feats = [c for c in dummy_feats if c in df_ml.columns]

X_cols = num_feats + dummy_feats
if not X_cols:
    # Fail here with a clear message instead of deep inside the scaler/model.
    raise ValueError("None of the expected feature columns were found in df.")
print("Using features:", X_cols)

# Targets
if "Calories_per_min" not in df_ml.columns:
    raise ValueError("Calories_per_min not found. Compute it before this step.")
y_reg = df_ml["Calories_per_min"]

if "High_Effectiveness" not in df_ml.columns:
    # create label here if your cleaned file doesn't have it yet
    thr = y_reg.quantile(0.75)
    df_ml["High_Effectiveness"] = (y_reg >= thr).astype(int)
y_clf = df_ml["High_Effectiveness"]

# Assemble X and coerce to numeric.
# - astype(float): dummy columns can be boolean; casting gives one homogeneous
#   float matrix for the scaler.
# - +/-inf -> NaN: infinities are not NaN, so without this they would pass the
#   notna() filter below and reach scaling and training.
X = (
    df_ml[X_cols]
    .apply(pd.to_numeric, errors="coerce")
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)
y_reg = y_reg.replace([np.inf, -np.inf], np.nan)

# Drop rows with any NaNs in features or targets
mask = X.notna().all(axis=1) & y_reg.notna() & y_clf.notna()
X = X.loc[mask]
y_reg = y_reg.loc[mask]
y_clf = y_clf.loc[mask]

# Train/test splits (80/20, fixed seed). The classification split is
# stratified on the label so both parts keep the same class balance.
X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)
X_train_c, X_test_c, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# Scale features: each scaler is fitted on its own training split only and
# then applied to the matching test split.
scaler_reg = StandardScaler()
X_train_s = scaler_reg.fit_transform(X_train)
X_test_s  = scaler_reg.transform(X_test)

scaler_clf = StandardScaler()
X_train_c_s = scaler_clf.fit_transform(X_train_c)
X_test_c_s  = scaler_clf.transform(X_test_c)

In [None]:
import tensorflow as tf
from tensorflow.keras import Input, Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error, r2_score

# Fix the Python / NumPy / TensorFlow seeds so weight init and shuffling repeat.
tf.keras.utils.set_random_seed(42)

# Feed-forward regressor: two ReLU hidden layers and one linear output unit.
# The input width is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer is deprecated in current Keras.
model_reg = Sequential([
    Input(shape=(X_train_s.shape[1],)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(1)
])

model_reg.compile(optimizer="adam", loss="mse", metrics=["mae"])
# The last 20% of the training rows are held out as validation data.
hist_reg = model_reg.fit(
    X_train_s, y_train_reg,
    validation_split=0.2, epochs=100, batch_size=16, verbose=0
)

# Evaluate on the untouched test split
y_pred_reg = model_reg.predict(X_test_s).ravel()  # (n, 1) -> (n,)
print("Regression — MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("Regression — R^2:", r2_score(y_test_reg, y_pred_reg))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras import Input

# Re-seed so this model's initialisation does not depend on the previous cell.
tf.keras.utils.set_random_seed(42)

# Binary classifier: same hidden layers as the regressor, with a sigmoid
# output giving the probability of the positive (High_Effectiveness == 1) class.
# The input width is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer is deprecated in current Keras.
model_clf = Sequential([
    Input(shape=(X_train_c_s.shape[1],)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid")
])
model_clf.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# The last 20% of the training rows are held out as validation data.
hist_clf = model_clf.fit(
    X_train_c_s, y_train_clf,
    validation_split=0.2, epochs=60, batch_size=16, verbose=0
)

y_prob = model_clf.predict(X_test_c_s).ravel()  # (n, 1) -> (n,)
y_pred = (y_prob >= 0.5).astype(int)            # hard labels at a 0.5 cut-off

print(classification_report(y_test_clf, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test_clf, y_pred))
# ROC AUC is computed from the probabilities, not the thresholded labels.
print("ROC AUC:", roc_auc_score(y_test_clf, y_prob))

In [None]:
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error

# Naive reference: predict the training-set mean target for every test row.
train_mean = y_train_reg.mean()
y_pred_baseline = np.full(y_test_reg.shape, train_mean, dtype=float)

print("Baseline MAE:", mean_absolute_error(y_test_reg, y_pred_baseline))
print("Baseline R²:", r2_score(y_test_reg, y_pred_baseline))

In [None]:
import matplotlib.pyplot as plt

# Residual (actual minus predicted) against the actual value for each test
# row, with a dashed zero line for reference.
residuals = y_test_reg - y_pred_reg
fig, ax = plt.subplots()
ax.scatter(y_test_reg, residuals, s=12)
ax.axhline(0, ls="--")
ax.set_xlabel("Actual")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs Actual")
plt.show()

In [None]:
# Persist both trained networks to the results folder as .h5 files.
# NOTE(review): absolute, machine-specific paths.
for trained_model, target_path in (
    (model_reg, "/Users/amlim/cycling-effectiveness/results/model_reg.h5"),
    (model_clf, "/Users/amlim/cycling-effectiveness/results/model_clf.h5"),
):
    trained_model.save(target_path)