# 01 — Data Cleaning & Feature Engineering
- load raw data
- fix columns and types
- engineer Session_Type, Time_of_Day, per-minute metrics
- save cleaned dataset

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load the raw CSV; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="Data Cycling.csv")


## Data Cleaning

In [None]:
# Show the first 20 raw rows to inspect column names and value formats.
df.head(n=20)

In [None]:
# Rename the raw "Time" column to "Duration". A later cell creates a new "Time"
# column (the clock time split out of "Date"), so this rename must only happen
# once: without the guard, re-running the cell would rename that new column as
# well and leave two columns called "Duration".
if "Duration" not in df.columns:
    df = df.rename(columns={"Time": "Duration"})

In [None]:
# Split "Date" on the first run of whitespace into a date part and an
# optional clock-time part.
parts = df["Date"].astype(str).str.split(r"\s+", n=1, expand=True)
df["Date"] = parts[0]                      # short date

# `parts` only has a second column when at least one row contained a time.
# Use a real branch here: np.where() evaluates all of its arguments eagerly,
# so `parts[1]` would raise KeyError exactly in the case the check guards.
if parts.shape[1] > 1:
    df["Time"] = parts[1]                  # time if present (None for rows without one)
elif "Time" not in df.columns:
    # No time component anywhere; create the column, but never overwrite an
    # existing "Time" column (e.g. when this cell is re-run after the split).
    df["Time"] = np.nan

In [None]:
def clean_numeric_column(df, cols):
    """
    Convert text-formatted numeric columns of a DataFrame to float, in place.

    For each requested column:
    - thousands separators (commas) are removed
    - surrounding whitespace is stripped
    - missing-value placeholders ('--', empty string, 'nan', 'None', '<NA>')
      become NaN
    - everything else is parsed as a number and stored as float

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place and also returned.
    cols : str or list of str
        A single column name or a list of column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the given columns converted to float.

    Raises
    ------
    ValueError
        If a value is neither a recognised placeholder nor parseable as a
        number. Unknown garbage is reported, not silently turned into NaN.
    """
    if isinstance(cols, str):
        cols = [cols]   # allow single column name

    # astype(str) renders missing values as text, so the string forms of
    # NaN / None / pd.NA must be treated as placeholders as well.
    placeholders = {"--", "", "nan", "None", "<NA>"}

    for col in cols:
        text = (
            df[col].astype(str)                          # ensure string
                   .str.replace(",", "", regex=False)    # remove commas
                   .str.strip()                          # tolerate padded values such as " -- "
        )
        text = text.mask(text.isin(placeholders))        # placeholders -> NaN
        df[col] = pd.to_numeric(text, errors="raise").astype(float)
    return df

In [None]:
# Convert the text-formatted metric columns to floats.
numeric_cols = ["Calories", "Avg HR", "Avg Speed"]
df = clean_numeric_column(df, numeric_cols)

In [None]:
# Classify each activity as strictly indoors or outdoors and store the label in a new "Session Type" column.

def classify_session(activity):
    """Label an activity type as "Indoors" or "Outdoors".

    Only the exact values "indoor_cycling" and "virtual_ride" count as
    indoors; every other value (including missing ones) is "Outdoors".
    """
    indoor_activities = ("indoor_cycling", "virtual_ride")
    return "Indoors" if activity in indoor_activities else "Outdoors"

# Derive the indoor/outdoor label of every row from its activity type.
activity_types = df["Activity Type"]
df["Session Type"] = activity_types.apply(classify_session)

In [None]:
def classify_time(timestamp):
    """Bucket a clock time into "Morning", "Afternoon" or "Evening".

    Parameters
    ----------
    timestamp : str or datetime-like
        Value parseable by ``pd.to_datetime``; only its hour is used.

    Returns
    -------
    str or float
        "Morning" for hours 0-11, "Afternoon" for 12-17, "Evening" for
        18-23, or NaN when the value is missing or cannot be parsed.
    """
    parsed = pd.to_datetime(timestamp, errors="coerce")

    # Missing input comes back as None or NaT (unparseable text is coerced to
    # NaT). Check before touching any datetime attribute: calling .time() on
    # these values raises instead of returning None.
    if pd.isna(parsed):
        return np.nan

    hour = parsed.hour

    if hour < 12:
        return "Morning"
    elif 12 <= hour < 18:
        return "Afternoon"
    else:
        return "Evening"
    
# Bucket each row's clock time into Morning / Afternoon / Evening.
clock_times = df["Time"]
df["Time of Day"] = clock_times.apply(classify_time)
    
        

In [None]:
# Drop the registered-trademark sign from the TSS column name.
tss_rename = {"Training Stress Score®": "Training Stress Score"}
df.rename(columns=tss_rename, inplace=True)

In [None]:
# Snapshot the cleaned frame and write it to
# <home>/cycling-effectiveness/data/cleaned_cycling.csv.
# The location is built from the current user's home directory instead of a
# hard-coded absolute path, so the notebook also runs on other machines.
df_clean = df.copy()
cleaned_path = Path.home() / "cycling-effectiveness" / "data" / "cleaned_cycling.csv"
cleaned_path.parent.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists
df_clean.to_csv(cleaned_path, index=False)

# The preview goes last: only the final expression of a cell is displayed.
df_clean.head(10)

In [None]:
# Switch to underscore column names; the plots and formula-based models in
# the following cells refer to the columns by these names.
snake_case_names = {
    "Training Stress Score": "Training_Stress_Score",
    "Session Type": "Session_Type",
    "Time of Day": "Time_of_Day",
}
df = df.rename(columns=snake_case_names)

# Columns of interest: Session Type, Time of Day, Calories, Distance, Training Stress Score
sns.set(style="whitegrid", palette="muted", font_scale=1.2)

ax = sns.boxplot(data=df, x="Time_of_Day", y="Calories", hue="Session_Type")
ax.set_title("Calories by Session Type and Time of Day")
plt.show()

In [None]:
# Training Stress Score per time of day, one box per session type.
ax = sns.boxplot(data=df, x="Time_of_Day", y="Training_Stress_Score", hue="Session_Type")
ax.set_title("Training Stress Score by Session Type and Time of Day")
plt.show()

In [None]:
# Calories distribution per time of day; split=True draws the two session types as half violins.
ax = sns.violinplot(data=df, x="Time_of_Day", y="Calories", hue="Session_Type", split=True)
ax.set_title("Calories Distribution by Session Type & Time of Day")
plt.show()

In [None]:
# Calories against distance: colour encodes session type, marker style encodes time of day.
ax = sns.scatterplot(data=df, x="Distance", y="Calories", hue="Session_Type", style="Time_of_Day")
ax.set_title("Calories vs. Distance by Session Type & Time of Day")
plt.show()

# Pairwise relationships between the three metrics, coloured by session type.
pair_vars = ["Calories", "Distance", "Training_Stress_Score"]
sns.pairplot(df, vars=pair_vars, hue="Session_Type")
plt.show()

In [None]:
# Mean Calories and TSS for every Session_Type / Time_of_Day combination.
group_keys = ["Session_Type", "Time_of_Day"]
metric_cols = ["Calories", "Training_Stress_Score"]
df.groupby(group_keys)[metric_cols].mean()

In [None]:

# Two-way ANOVA (typ=2) on Calories: both main effects plus their interaction.
calories_formula = "Calories ~ C(Session_Type) * C(Time_of_Day)"
model_cal = smf.ols(calories_formula, data=df).fit()
anova_cal = sm.stats.anova_lm(model_cal, typ=2)
print("ANOVA for Calories\n", anova_cal)



Session Type is not significant (p=0.14). Calories burned doesn't differ much just by being indoors or outdoors.

Time of Day is highly significant (p<0.001). Calories burned does vary across Morning, Afternoon, and Evening.

Interaction is not significant. Session Type and Time of Day don't combine in a meaningful way.

In [None]:
# Same two-way ANOVA (typ=2) with Training Stress Score as the response.
tss_formula = "Training_Stress_Score ~ C(Session_Type) * C(Time_of_Day)"
model_tss = smf.ols(tss_formula, data=df).fit()
anova_tss = sm.stats.anova_lm(model_tss, typ=2)
print("ANOVA for TSS\n", anova_tss)

Session Type is not significant. TSS doesn't differ by indoor vs. outdoor.

Time of Day is highly significant.

Interaction is not significant.

In [None]:
def parse_duration(val):
    """Convert a duration string to minutes.

    A two-field value ("mm:ss") is prefixed with "00:" so that it is read as
    hours:minutes:seconds; anything else is handed to ``pd.to_timedelta``
    unchanged.

    Parameters
    ----------
    val : str
        Duration text such as "45:30" or "01:15:00".

    Returns
    -------
    float or None
        Duration in minutes, or None when ``val`` is not a string, cannot be
        parsed, or parses to a missing value (NaT).
    """
    try:
        text = val
        if len(val.split(":")) == 2:
            text = "00:" + val
        delta = pd.to_timedelta(text)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Only parsing failures are swallowed; a bare `except:` would also
        # hide KeyboardInterrupt and genuine programming errors.
        return None

    # Text such as "nan" parses to NaT; report it like any other missing value.
    if pd.isna(delta):
        return None
    return delta.total_seconds() / 60

# Parse every duration to minutes, then discard the rows whose duration is missing.
duration_text = df["Duration"].astype(str)
df["Duration_min"] = duration_text.apply(parse_duration)

df = df.dropna(subset=["Duration_min"])

In [None]:
# Per-minute intensity metrics.
# A duration of zero would turn the division into +/-inf, and inf is not
# removed by the `notna()` row filter applied before modelling. Non-positive
# durations are therefore treated as missing, so the metrics come out as NaN.
valid_minutes = df["Duration_min"].where(df["Duration_min"] > 0)
df["Calories_per_min"] = df["Calories"] / valid_minutes
df["TSS_per_min"] = df["Training_Stress_Score"] / valid_minutes

In [None]:
# Rows at or above the 75th percentile of Calories_per_min get label 1, all others 0.
thr = df["Calories_per_min"].quantile(0.75)
is_high = df["Calories_per_min"].ge(thr)
df["High_Effectiveness"] = is_high.astype(int)

# One-hot encode the two categorical columns, dropping the first level of each.
df_ml = pd.get_dummies(df, columns=["Session_Type", "Time_of_Day"], drop_first=True)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Candidate numeric features; any that are absent from df_ml are skipped below.
num_feats = [
    "Duration_min", "Distance", "Avg HR", "Avg Speed",
    "Avg Bike Cadence", "Max HR", "Max Speed",
    # optional power-related if populated:
    "Power", "Max Power", "Max Avg Power (20 min)",
    # elevation if outdoor rides have them:
    "Elev Gain", "Elev Loss",
    # if present and meaningful:
    "Training_Stress_Score"
]

# Use only those that actually exist AND hold at least one numeric value.
# A column that is entirely missing after numeric coercion would make the
# row filter below reject every row and leave nothing to split.
present_feats = [c for c in num_feats if c in df_ml.columns]
coerced_feats = df_ml[present_feats].apply(pd.to_numeric, errors="coerce")
num_feats = [c for c in present_feats if coerced_feats[c].notna().any()]

# Categorical dummies you already created (Afternoon is implicitly the baseline)
cat_feats = [c for c in ["Session_Type_Outdoors", "Time_of_Day_Morning", "Time_of_Day_Evening"] if c in df_ml.columns]

X_cols = num_feats + cat_feats
print("Using features:", X_cols)

# Targets (you already created these)
y_reg = df_ml["Calories_per_min"]
y_clf = df_ml["High_Effectiveness"]

# Assemble X and ensure numeric dtype (non-numeric text becomes NaN)
X = df_ml[X_cols].apply(pd.to_numeric, errors="coerce")

# Drop rows with any NaNs in features or targets
mask = X.notna().all(axis=1) & y_reg.notna() & y_clf.notna()
X = X.loc[mask]
y_reg = y_reg.loc[mask]
y_clf = y_clf.loc[mask]

# Train/test split (regression)
X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

# Train/test split (classification) — stratify so both parts keep the class ratio
X_train_c, X_test_c, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# Scale features (fit on train, apply to test)
scaler_reg = StandardScaler()
X_train_s = scaler_reg.fit_transform(X_train)
X_test_s  = scaler_reg.transform(X_test)

scaler_clf = StandardScaler()
X_train_c_s = scaler_clf.fit_transform(X_train_c)
X_test_c_s  = scaler_clf.transform(X_test_c)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

# Seed before any layer is created so the run is repeatable.
tf.keras.utils.set_random_seed(42)

# Fully connected regressor: 64 -> 32 -> 1 units, linear output.
n_features_reg = X_train_s.shape[1]
model_reg = Sequential([
    Dense(64, activation="relu", input_shape=(n_features_reg,)),
    Dense(32, activation="relu"),
    Dense(1),
])

model_reg.compile(optimizer="adam", loss="mse", metrics=["mae"])
hist_reg = model_reg.fit(
    X_train_s,
    y_train_reg,
    validation_split=0.2,
    epochs=100,
    batch_size=16,
    verbose=0,
)

# Evaluate on the held-out test split
from sklearn.metrics import mean_absolute_error, r2_score
y_pred_reg = model_reg.predict(X_test_s).ravel()
reg_mae = mean_absolute_error(y_test_reg, y_pred_reg)
print("Regression — MAE:", reg_mae)
reg_r2 = r2_score(y_test_reg, y_pred_reg)
print("Regression — R^2:", reg_r2)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Seed before any layer is created so the run is repeatable.
tf.keras.utils.set_random_seed(42)

# Fully connected binary classifier: 64 -> 32 -> 1 units, sigmoid output.
n_features_clf = X_train_c_s.shape[1]
model_clf = Sequential([
    Dense(64, activation="relu", input_shape=(n_features_clf,)),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model_clf.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
hist_clf = model_clf.fit(
    X_train_c_s,
    y_train_clf,
    validation_split=0.2,
    epochs=60,
    batch_size=16,
    verbose=0,
)

# Predicted probabilities on the test split, turned into labels at 0.5.
y_prob = model_clf.predict(X_test_c_s).ravel()
y_pred = (y_prob >= 0.5).astype(int)

clf_report = classification_report(y_test_clf, y_pred)
print(clf_report)
clf_confusion = confusion_matrix(y_test_clf, y_pred)
print("Confusion matrix:\n", clf_confusion)
clf_auc = roc_auc_score(y_test_clf, y_prob)
print("ROC AUC:", clf_auc)

In [None]:
import numpy as np
# Naive baseline: predict the training-set mean for every test row.
from sklearn.metrics import r2_score, mean_absolute_error
train_mean = y_train_reg.mean()
y_pred_baseline = np.full_like(y_test_reg, train_mean, dtype=float)
baseline_mae = mean_absolute_error(y_test_reg, y_pred_baseline)
print("Baseline MAE:", baseline_mae)
baseline_r2 = r2_score(y_test_reg, y_pred_baseline)
print("Baseline R²:", baseline_r2)

In [None]:
import matplotlib.pyplot as plt
# Residuals (actual minus predicted) against the actual values; the dashed line marks zero error.
residuals = y_test_reg - y_pred_reg
plt.scatter(y_test_reg, residuals, s=12)
plt.axhline(0, ls="--")
plt.xlabel("Actual")
plt.ylabel("Residuals")
plt.title("Residuals vs Actual")
plt.show()