In [3]:
# ============================================================
# FULL PIPELINE: Load → Preprocess → Models → What-if Output
# ============================================================

import pandas as pd
import numpy as np
from datetime import timedelta

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ------------------------------------------------------------
# 1. Load processed runs (from data.ipynb)
# ------------------------------------------------------------

# Read the processed run log from the working directory
# (use "Data/runs_processed.csv" instead if the file was saved there).
runs = pd.read_csv(filepath_or_buffer="runs_processed.csv")
print("Loaded runs_processed.csv with shape:", runs.shape)

# ------------------------------------------------------------
# 2. Helper: convert "mm:ss" or "h:mm:ss" strings to seconds
# ------------------------------------------------------------

def time_to_seconds(t):
    """Convert a "mm:ss" or "h:mm:ss" clock string to a number of seconds.

    Returns NaN when the value is missing, when any colon-separated field is
    not a plain integer, or when there are not exactly two or three fields.
    """
    if pd.isna(t):
        return np.nan

    try:
        fields = [int(piece) for piece in str(t).split(":")]
    except ValueError:
        return np.nan

    if len(fields) not in (2, 3):
        return np.nan

    # Fold the fields left to right in base 60:
    # (h * 60 + m) * 60 + s == h * 3600 + m * 60 + s.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total

# ------------------------------------------------------------
# 3. Preprocessing & feature engineering (was Cells 4–5)
# ------------------------------------------------------------

# Work on a copy so the loaded `runs` frame is left as read from disk.
df = runs.copy()

# Parse timestamps, then keep runs only (should already hold, but just in case).
df["Activity Date"] = pd.to_datetime(df["Activity Date"], format="mixed")
is_run = df["Activity Type"] == "Run"
df = df[is_run].copy()

# Clock-style text columns -> seconds.
for sec_col, raw_col in [
    ("elapsed_sec", "Elapsed Time"),
    ("moving_sec", "Moving Time"),
    ("pace_sec_per_mi", "Pace"),
]:
    df[sec_col] = df[raw_col].apply(time_to_seconds)

# Force measurement columns to numeric; unparseable entries become NaN.
numeric_cols = [
    "Distance",
    "Max Speed", "Average Speed",
    "Elevation Gain", "Elevation Loss",
    "Elevation Low", "Elevation High",
    "Max Grade", "Average Grade", "Average Grade Adjusted Pace",
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Chronological order within each person, with a fresh 0..n-1 index.
df = df.sort_values(by=["Person", "Activity Date"], ignore_index=True)

# Calendar features.
activity_dt = df["Activity Date"].dt
df["day_of_week"] = activity_dt.dayofweek
df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype(int)
df["month"] = activity_dt.month

# Whole days elapsed since the same person's previous run (NaN on their first run).
df["prev_date"] = df.groupby("Person")["Activity Date"].shift()
df["days_since_prev_run"] = (df["Activity Date"] - df["prev_date"]).dt.days

# Per-person rolling means over the current and up to two preceding runs.
group = df.groupby("Person")
for rolled_col, base_col in [
    ("rolling_dist", "Distance"),
    ("rolling_pace", "pace_sec_per_mi"),
    ("rolling_gap_days", "days_since_prev_run"),
]:
    windowed_mean = group[base_col].rolling(window=3, min_periods=1).mean()
    # Drop the "Person" level so the result aligns with df's own index.
    df[rolled_col] = windowed_mean.reset_index(level=0, drop=True)

# ------------------------------------------------------------
# 4. Next-run targets (Prediction 1)
# ------------------------------------------------------------

# Shift each per-person series one step back so that every row also carries
# the values of that person's NEXT run (NaN on each person's final run).
for col, new_col in [
    ("Activity Date", "next_date"),
    ("Distance", "next_distance"),
    ("elapsed_sec", "next_elapsed_sec"),
    ("moving_sec", "next_moving_sec"),
    ("Elevation Gain", "next_elev_gain"),
    ("Elevation Loss", "next_elev_loss"),
    ("pace_sec_per_mi", "next_pace_sec"),
]:
    df[new_col] = df.groupby("Person")[col].shift(-1)

df["days_until_next_run"] = (df["next_date"] - df["Activity Date"]).dt.days

# Drop rows that lack ANY modelled target, not just the first three.
# The regressors and the error metrics used later reject NaN in y, so a single
# unparseable pace / moving time / elevation value would otherwise abort
# training or evaluation.
required_targets = [
    "days_until_next_run",
    "next_distance",
    "next_elapsed_sec",
    "next_moving_sec",
    "next_pace_sec",
    "next_elev_gain",
    "next_elev_loss",
]
df_model = df.dropna(subset=required_targets).copy()
print("Rows with valid next-run targets:", df_model.shape)

# ------------------------------------------------------------
# 5. Train/val/test split (by time)
# ------------------------------------------------------------

# Chronological 70 / 15 / 15 split on the activity date (shared across people).
activity_dates = df_model["Activity Date"]
cutoff_date = activity_dates.quantile(0.7)
val_cutoff = activity_dates.quantile(0.85)

in_train = activity_dates <= cutoff_date
in_val = activity_dates.between(cutoff_date, val_cutoff, inclusive="right")
in_test = activity_dates > val_cutoff

train = df_model[in_train]
val = df_model[in_val]
test = df_model[in_test]

print("Train size:", train.shape)
print("Val size:", val.shape)
print("Test size:", test.shape)

# ------------------------------------------------------------
# 6. Features & targets
# ------------------------------------------------------------

# Inputs: the current run's measurements, calendar features and rolling history.
feature_cols = [
    "Distance", "elapsed_sec", "moving_sec",
    "Elevation Gain", "Elevation Loss",
    "day_of_week", "is_weekend", "month",
    "days_since_prev_run",
    "rolling_dist", "rolling_pace", "rolling_gap_days",
]

# Targets, split into "when / how far" and "how it will go".
forecast_targets = ["days_until_next_run", "next_distance"]
performance_targets = [
    "next_elapsed_sec", "next_moving_sec", "next_pace_sec",
    "next_elev_gain", "next_elev_loss",
]
target_cols = [*forecast_targets, *performance_targets]

# Slice every split the same way: features first, then all targets.
X_train, X_val, X_test = (split[feature_cols] for split in (train, val, test))
y_train, y_val, y_test = (split[target_cols] for split in (train, val, test))

# ------------------------------------------------------------
# 7. Baselines (global & per-user means)
# ------------------------------------------------------------

# Training-set target means: one row overall, and one row per person
# (grouped by the "Person" values that share y_train's index).
global_means = y_train.mean()
user_means = y_train.groupby(train["Person"]).mean()

def predict_global_mean(X, target_list):
    """Baseline: predict the training-set mean of each target for every row of X.

    Reads the module-level `global_means`; only the length of X is used.
    Returns an array of shape (len(X), len(target_list)).
    """
    mean_row = global_means[target_list].values
    n_rows = len(X)
    return np.tile(mean_row, (n_rows, 1))

def predict_user_mean(df_subset, target_list):
    """Baseline: predict each person's own training-set target means.

    Reads the module-level `user_means` (indexed by person) and `global_means`.
    People absent from `user_means` fall back to the global means.

    Parameters
    ----------
    df_subset : DataFrame with a "Person" column; one prediction row per input row.
    target_list : target column names to predict, in output column order.

    Returns
    -------
    Array of shape (len(df_subset), len(target_list)). An empty subset yields
    an empty (0, k) array instead of the ValueError that stacking nothing raises.
    """
    persons = df_subset["Person"].to_numpy()

    # Start from the fallback everywhere, then overwrite rows for known people.
    fallback = global_means[target_list].to_numpy()
    preds = np.tile(fallback, (len(persons), 1))

    known = np.asarray(pd.Series(persons).isin(user_means.index))
    if known.any():
        preds[known] = user_means.loc[persons[known], target_list].to_numpy()
    return preds

# ------------------------------------------------------------
# 8. Train separate forecast & performance models
# ------------------------------------------------------------

# Forecast model: days_until_next_run, next_distance
# Forecast model: days_until_next_run, next_distance.
# Each model gets its own copy of the shared feature matrices.
X_train_forecast, X_val_forecast, X_test_forecast = (
    X_train.copy(), X_val.copy(), X_test.copy()
)
y_train_forecast, y_val_forecast, y_test_forecast = (
    split[forecast_targets] for split in (train, val, test)
)

# Median-impute missing features, standardise, then fit ordinary least squares.
reg_forecast = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler(), LinearRegression(),
)
reg_forecast.fit(X_train_forecast, y_train_forecast)
y_pred_forecast_test = reg_forecast.predict(X_test_forecast)

print("Forecast model trained.")

# Performance model: elapsed time, pace, elevation.
X_train_perf, X_val_perf, X_test_perf = (
    X_train.copy(), X_val.copy(), X_test.copy()
)
y_train_perf, y_val_perf, y_test_perf = (
    split[performance_targets] for split in (train, val, test)
)

# Same preprocessing + estimator as the forecast model, fitted on other targets.
reg_perf = make_pipeline(
    SimpleImputer(strategy="median"), StandardScaler(), LinearRegression(),
)
reg_perf.fit(X_train_perf, y_train_perf)
y_pred_perf_test = reg_perf.predict(X_test_perf)

print("Performance model trained.")

# ------------------------------------------------------------
# 9. Evaluation helpers & model comparison
# ------------------------------------------------------------

def eval_multi_output(y_true_df, y_pred_array, target_list, name):
    """Print MAE and RMSE for every target of a multi-output prediction.

    Column `i` of `y_pred_array` is compared against `y_true_df[target_list[i]]`.
    `name` is only used as the printed section heading. Returns nothing.
    """
    print(f"\n=== {name} ===")
    for idx, col in enumerate(target_list):
        pair = (y_true_df[col], y_pred_array[:, idx])
        mae = mean_absolute_error(*pair)
        # RMSE is the square root of the mean squared error.
        rmse = mean_squared_error(*pair) ** 0.5
        print(f"{col}: MAE={mae:.3f}, RMSE={rmse:.3f}")
    print()

# Compare both baselines with the fitted model on the held-out test split,
# first for the forecast targets and then for the performance targets.
print("\n--- FORECAST TARGETS (days_until_next_run, next_distance) ---")
y_pred_global_forecast = predict_global_mean(X_test, forecast_targets)
y_pred_user_forecast = predict_user_mean(test, forecast_targets)

for _preds, _label in (
    (y_pred_global_forecast, "Global mean baseline (forecast)"),
    (y_pred_user_forecast, "Per-user mean baseline (forecast)"),
    (y_pred_forecast_test, "Forecast model (imputed + scaled LR)"),
):
    eval_multi_output(y_test_forecast, _preds, forecast_targets, _label)

print("\n--- PERFORMANCE TARGETS (time, pace, elevation) ---")
y_pred_global_perf = predict_global_mean(X_test, performance_targets)
y_pred_user_perf = predict_user_mean(test, performance_targets)

for _preds, _label in (
    (y_pred_global_perf, "Global mean baseline (performance)"),
    (y_pred_user_perf, "Per-user mean baseline (performance)"),
    (y_pred_perf_test, "Performance model (imputed + scaled LR)"),
):
    eval_multi_output(y_test_perf, _preds, performance_targets, _label)

# ------------------------------------------------------------
# 10. What-if predictions for each person (2 / 4 / 6 miles)
# ------------------------------------------------------------

def predict_scenarios_for_user(person, distances=(2.0, 4.0, 6.0)):
    """Predict what-if outcomes for one person's next run.

    1) The person's most recent run is fed to the forecast model to predict
       days_until_next_run and next_distance (identical for every scenario).
    2) For each hypothetical distance, the same feature row is reused with
       only its "Distance" entry replaced, and the performance model predicts
       elapsed/moving/pace/elevation. All other features (times, rolling
       stats, calendar) stay at the latest run's values.

    Parameters
    ----------
    person : value matched against the "Person" column of the module-level `df`.
    distances : iterable of hypothetical distances; consumed exactly once.

    Returns
    -------
    DataFrame with one row per distance: "Person", "scenario_distance", the
    `performance_targets` columns, "pred_days_until_next_run" (rounded,
    clipped at 0) and "pred_next_distance_forecast_model". An empty
    `distances` yields an empty frame with those columns.

    Raises
    ------
    ValueError if `df` has no rows for `person`.
    """
    # Materialise once: distances is used both to build inputs and as a column,
    # which would silently break for a generator.
    distances = list(distances)

    user_hist = df[df["Person"] == person].sort_values("Activity Date")
    if user_hist.empty:
        raise ValueError(f"No data found for person: {person}")

    latest_features = user_hist.iloc[-1][feature_cols]

    # 1) Forecast timing (same for all scenarios)
    forecast_pred = reg_forecast.predict(pd.DataFrame([latest_features]))[0]
    pred_days_until_next, pred_next_dist = forecast_pred
    pred_days_rounded = int(max(0, round(pred_days_until_next)))

    # 2) Performance predictions per hypothetical distance
    if distances:
        X_scenarios = pd.DataFrame([latest_features] * len(distances))
        X_scenarios["Distance"] = distances
        perf_preds = reg_perf.predict(X_scenarios)
    else:
        # Nothing to predict; the estimator rejects zero-row input.
        perf_preds = np.empty((0, len(performance_targets)))

    scenario_df = pd.DataFrame(perf_preds, columns=performance_targets)
    scenario_df.insert(0, "scenario_distance", distances)
    scenario_df.insert(0, "Person", person)
    scenario_df["pred_days_until_next_run"] = pred_days_rounded
    scenario_df["pred_next_distance_forecast_model"] = pred_next_dist

    return scenario_df

# ------------------------------------------------------------
# 11. Formatting for human-readable output
# ------------------------------------------------------------

def seconds_to_time_str(seconds):
    """Format a number of seconds as "m:ss", or "h:mm:ss" from one hour up.

    The value is rounded to the nearest whole second first. Missing values
    (None / NaN) return None. Negative values, which an unconstrained
    regression can produce, are rendered with a leading "-" on the magnitude
    rather than being wrapped around by floor division (e.g. -5 -> "-0:05",
    not "59:55").
    """
    if pd.isna(seconds):
        return None

    total = int(round(seconds))
    sign = "-" if total < 0 else ""

    # Split the magnitude so floor division / modulo never see a negative.
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{sign}{hours}:{minutes:02d}:{secs:02d}"
    return f"{sign}{minutes}:{secs:02d}"

def format_scenarios_for_user(person, distances=(2.0, 4.0, 6.0)):
    """Return the what-if predictions for `person` as a display-ready table.

    Calls `predict_scenarios_for_user` and converts its raw output: seconds
    become clock strings, elevations are rounded to 0.1, and the predicted
    activity date is the person's latest "Activity Date" plus the predicted
    number of days until the next run.

    Parameters
    ----------
    person : value matched against the "Person" column of the module-level `df`.
    distances : iterable of hypothetical distances; consumed exactly once.

    Returns
    -------
    DataFrame with one row per distance and human-readable column names.
    """
    # Pass a concrete list so a one-shot iterable cannot be exhausted downstream.
    raw = predict_scenarios_for_user(person, distances=list(distances)).copy()

    user_hist = df[df["Person"] == person].sort_values("Activity Date")
    last_date = user_hist.iloc[-1]["Activity Date"]

    raw["Days Until Next Run"] = raw["pred_days_until_next_run"].astype(int)
    raw["Predicted Activity Date"] = [
        last_date + timedelta(days=int(d)) for d in raw["Days Until Next Run"]
    ]

    raw["Predicted Distance (mi)"] = raw["scenario_distance"].round(2)
    raw["Predicted Elapsed Time"] = raw["next_elapsed_sec"].apply(seconds_to_time_str)
    raw["Predicted Moving Time"] = raw["next_moving_sec"].apply(seconds_to_time_str)
    raw["Predicted Pace"] = raw["next_pace_sec"].apply(seconds_to_time_str)
    raw["Predicted Elevation Gain (ft)"] = raw["next_elev_gain"].round(1)
    raw["Predicted Elevation Loss (ft)"] = raw["next_elev_loss"].round(1)

    # Output column order; "scenario_distance" is renamed just below.
    cols = [
        "Person",
        "Predicted Activity Date",
        "scenario_distance",
        "Predicted Distance (mi)",
        "Days Until Next Run",
        "Predicted Elapsed Time",
        "Predicted Moving Time",
        "Predicted Pace",
        "Predicted Elevation Gain (ft)",
        "Predicted Elevation Loss (ft)",
    ]

    formatted = raw[cols].rename(columns={"scenario_distance": "Scenario Distance (mi)"})
    return formatted

# ------------------------------------------------------------
# 12. Run formatted what-if predictions for ALL people
# ------------------------------------------------------------

# Build the formatted what-if table for every person present in the data
# and stack the per-person tables into one frame.
all_people = sorted(df["Person"].unique())
formatted_results = pd.concat(
    map(format_scenarios_for_user, all_people),
    ignore_index=True,
)

formatted_results

# ============================================================
# FINAL PREDICTION FORMAT: REQUIRED ASSIGNMENT OUTPUT STRUCTURE
# ============================================================

def _format_perf_prediction(perf_values):
    """Turn one performance-model output row into display-ready columns."""
    # Resolve outputs by target name so the mapping cannot drift if the order
    # of `performance_targets` ever changes.
    by_target = dict(zip(performance_targets, perf_values))
    return {
        "Predicted Elapsed Time": seconds_to_time_str(by_target["next_elapsed_sec"]),
        "Predicted Moving Time": seconds_to_time_str(by_target["next_moving_sec"]),
        "Predicted Pace": seconds_to_time_str(by_target["next_pace_sec"]),
        "Predicted Elevation Gain (ft)": round(by_target["next_elev_gain"], 1),
        "Predicted Elevation Loss (ft)": round(by_target["next_elev_loss"], 1),
    }


def generate_final_predictions(distances=(2.0, 4.0, 6.0)):
    """Build the two final prediction tables for every person in `df`.

    Part 1: the forecast model predicts the date and distance of the next run
    from the person's latest run; that predicted distance then replaces the
    "Distance" feature and the performance model predicts time/pace/elevation.
    Part 2: the same, but with "Distance" replaced by each hypothetical
    distance; the predicted date is shared with part 1.

    Parameters
    ----------
    distances : iterable of hypothetical distances; consumed exactly once.

    Returns
    -------
    (part1, part2) : tuple of DataFrames. Part 1 has one row per person;
    part 2 has one row per person and distance.
    """
    distances = list(distances)  # reused for every person

    final_rows_part1 = []  # actual next run prediction
    final_rows_part2 = []  # what-if scenarios

    for person in sorted(df["Person"].unique()):
        user_hist = df[df["Person"] == person].sort_values("Activity Date")
        latest = user_hist.iloc[-1]
        latest_features = latest[feature_cols]
        last_date = latest["Activity Date"]

        # -------- Forecast: when and how far --------
        pred_days, pred_dist = reg_forecast.predict(pd.DataFrame([latest_features]))[0]
        pred_days = int(max(0, round(pred_days)))
        predicted_run_date = (last_date + timedelta(days=pred_days)).date()

        # -------- Performance: one batched call per person --------
        # Row 0 uses the forecast distance (part 1); the remaining rows use
        # the hypothetical distances (part 2), in order.
        perf_inputs = pd.DataFrame([latest_features] * (1 + len(distances)))
        perf_inputs["Distance"] = [float(pred_dist), *distances]
        perf_preds = reg_perf.predict(perf_inputs)

        final_rows_part1.append({
            "Person": person,
            "Predicted Next Run Date": predicted_run_date,
            "Predicted Distance (mi)": round(pred_dist, 2),
            **_format_perf_prediction(perf_preds[0]),
        })

        for d, perf_alt in zip(distances, perf_preds[1:]):
            final_rows_part2.append({
                "Person": person,
                "Scenario Distance (mi)": d,
                "Predicted Next Run Date (same as part 1)": predicted_run_date,
                **_format_perf_prediction(perf_alt),
            })

    return (
        pd.DataFrame(final_rows_part1),  # Part 1 table
        pd.DataFrame(final_rows_part2)   # Part 2 table
    )

# Call it:
# Build both final tables with the default scenario distances; the trailing
# bare tuple is what the notebook cell displays.
_final_tables = generate_final_predictions()
prediction_part1, prediction_part2 = _final_tables
prediction_part1, prediction_part2


Loaded runs_processed.csv with shape: (241, 17)
Rows with valid next-run targets: (238, 36)
Train size: (166, 36)
Val size: (37, 36)
Test size: (35, 36)
Forecast model trained.
Performance model trained.

--- FORECAST TARGETS (days_until_next_run, next_distance) ---

=== Global mean baseline (forecast) ===
days_until_next_run: MAE=7.628, RMSE=8.492
next_distance: MAE=1.770, RMSE=2.466


=== Per-user mean baseline (forecast) ===
days_until_next_run: MAE=6.226, RMSE=8.851
next_distance: MAE=1.597, RMSE=2.336


=== Forecast model (imputed + scaled LR) ===
days_until_next_run: MAE=8.544, RMSE=10.900
next_distance: MAE=1.567, RMSE=2.144


--- PERFORMANCE TARGETS (time, pace, elevation) ---

=== Global mean baseline (performance) ===
next_elapsed_sec: MAE=8876.229, RMSE=9085.490
next_moving_sec: MAE=1839.984, RMSE=1976.737
next_pace_sec: MAE=440.529, RMSE=442.587
next_elev_gain: MAE=83.089, RMSE=98.226
next_elev_loss: MAE=117.368, RMSE=146.952


=== Per-user mean baseline (performance) ===
n

(   Person Predicted Next Run Date  Predicted Distance (mi)  \
 0    Alex              2025-12-01                     4.61   
 1  Karina              2025-12-21                     2.57   
 2   Zubin              2025-12-14                     6.26   
 
   Predicted Elapsed Time Predicted Moving Time Predicted Pace  \
 0                6:42:22               2:48:15          42:06   
 1               22:33:05               4:29:30        1:08:31   
 2                  34:34                 56:36          36:40   
 
    Predicted Elevation Gain (ft)  Predicted Elevation Loss (ft)  
 0                           90.7                          120.3  
 1                           91.5                           54.8  
 2                          276.6                          332.1  ,
    Person  Scenario Distance (mi) Predicted Next Run Date (same as part 1)  \
 0    Alex                     2.0                               2025-12-01   
 1    Alex                     4.0                   