# Validation and Robustness Checks

This lightweight scaffold loads processed artifacts, runs sensitivity analyses and validation utilities from [`src/validation.py`](../src/validation.py), and saves CSV summaries in `outputs/reports/` and quick figures in `outputs/figures/`.

## Imports and Setup

In [None]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from src.validation import (
    sensitivity_stop_detection,
    sensitivity_trip_segmentation,
    cross_user_validation,
    temporal_generalization_check,
    evaluate_stay_detection_comparison,
    evaluate_map_matching_quality,
    evaluate_mode_model_extended,
)

# Output locations. Figures are saved here by the plotting cells; the report
# directory is where the CSV summaries are expected to land (presumably
# written by the src.validation helpers -- confirm there).
FIG_DIR, REP_DIR = Path("outputs/figures"), Path("outputs/reports")
for _out_dir in (FIG_DIR, REP_DIR):
    # Create eagerly so later saves never fail on a missing directory.
    _out_dir.mkdir(parents=True, exist_ok=True)

# Input locations: processed artifacts consumed by the validation cells.
PROCESSED_DIR = Path("data/processed")
PATH_POINTS = PROCESSED_DIR.joinpath("01_trajectories_cleaned.parquet")
PATH_TRIPS = PROCESSED_DIR.joinpath("02_trips.parquet")

def _read_parquet_safe(p: Path) -> pd.DataFrame:
    """Load a parquet file, falling back to an empty DataFrame.

    Args:
        p: Location of the parquet artifact.

    Returns:
        The loaded frame, or an empty ``pd.DataFrame`` when ``p`` does not
        exist or when reading it raises. Both fallback cases print a
        ``[nb]`` note explaining why.
    """
    if p.exists():
        try:
            frame = pd.read_parquet(p)
        except Exception as e:
            # Deliberately broad: the notebook should keep running whatever
            # the reader raises (corrupt file, missing engine, ...).
            print(f"[nb] Could not read {p}: {e}. Returning empty DataFrame.")
        else:
            return frame
    else:
        print(f"[nb] Missing: {p}; proceeding with empty DataFrame.")
    return pd.DataFrame()

# Load the cleaned points and the trips table; either one degrades to an
# empty DataFrame when its file is missing or unreadable.
df_points, df_trips = map(_read_parquet_safe, (PATH_POINTS, PATH_TRIPS))
print(f"[nb] points={len(df_points)}, trips={len(df_trips)}")

## Sensitivity: Stay-Point Detection
Runs DBSCAN at multiple (eps, min_samples) settings. Saves `outputs/reports/sensitivity_staypoints.csv` and a figure.

In [None]:
# Sweep stay-point detection over a small (eps, min_samples) grid on the
# cleaned points and preview the resulting summary table.
sens_stays = sensitivity_stop_detection(
    df_points, eps_list=[100, 150, 200], min_samples_list=[3, 5, 8]
)
display(sens_stays.head())

# Plot n_stays against eps_m with one line per min_samples value; skipped
# entirely when the sweep returned no rows.
if not sens_stays.empty:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.lineplot(
        data=sens_stays,
        x="eps_m",
        y="n_stays",
        hue="min_samples",
        marker="o",
        ax=ax,
    )
    ax.set_title("Stay-Points Sensitivity")
    fig.tight_layout()
    out_path = FIG_DIR / "sensitivity_staypoints.png"
    fig.savefig(out_path, dpi=150)
    # Close explicitly: the figure is only needed on disk.
    plt.close(fig)
    print(f"[nb] Figure saved to {out_path}")

## Sensitivity: Trip Segmentation
Varies the gap threshold (minutes) and the minimum trip distance (meters). Saves `outputs/reports/sensitivity_trips.csv` and a figure.

In [None]:
# Sweep trip segmentation over gap thresholds and minimum trip distances on
# the cleaned points and preview the resulting summary table.
sens_trips = sensitivity_trip_segmentation(
    df_points, gap_minutes_list=[10, 20, 30, 45], min_dist_list=[50, 100, 250]
)
display(sens_trips.head())

# Plot trips_count against gap_min with one line per min_trip_dist_m value;
# skipped entirely when the sweep returned no rows.
if not sens_trips.empty:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.lineplot(
        data=sens_trips,
        x="gap_min",
        y="trips_count",
        hue="min_trip_dist_m",
        marker="o",
        ax=ax,
    )
    ax.set_title("Trip Segmentation Sensitivity")
    fig.tight_layout()
    out_path = FIG_DIR / "sensitivity_trips.png"
    fig.savefig(out_path, dpi=150)
    # Close explicitly: the figure is only needed on disk.
    plt.close(fig)
    print(f"[nb] Figure saved to {out_path}")

## Cross-User Validation
Assesses the stability of trip distributions across user folds using the Kolmogorov–Smirnov (KS) statistic. Saves `outputs/reports/cross_user_validation.csv` and a figure.

In [None]:
# Run the cross-user check on the trips table with 5 splits and a fixed
# random_state, then preview the per-fold results.
cross_user = cross_user_validation(df_trips, n_splits=5, random_state=42)
display(cross_user.head())

# Bar chart of the ks_distance_km column per fold; skipped when the check
# returned no rows.
if not cross_user.empty:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.barplot(
        data=cross_user, x="fold", y="ks_distance_km", color="#4C78A8", ax=ax
    )
    ax.set_title("Cross-User KS (Distance)")
    fig.tight_layout()
    out_path = FIG_DIR / "cross_user_validation.png"
    fig.savefig(out_path, dpi=150)
    # Close explicitly: the figure is only needed on disk.
    plt.close(fig)
    print(f"[nb] Figure saved to {out_path}")

## Temporal Generalization
Produces distribution summaries and KS comparisons between consecutive periods. Saves `outputs/reports/temporal_generalization.csv` and its `_ks.csv` counterpart, plus a figure.

In [None]:
# Run the temporal generalization check on the trips table and preview the
# per-period summary.
temporal = temporal_generalization_check(df_trips)
display(temporal.head())

# Plot the median of the distance_km metric across periods; skipped when the
# check returned no rows.
if not temporal.empty:
    fig, ax = plt.subplots(figsize=(8, 5))
    # Keep only the rows describing trip distance.
    is_distance = temporal["metric"] == "distance_km"
    tmp = temporal.loc[is_distance].copy()
    sns.lineplot(data=tmp, x="period", y="median", marker="o", ax=ax)
    # Slanted, right-aligned tick labels for the period axis.
    plt.xticks(rotation=45, ha="right")
    ax.set_title("Median Trip Distance by Period")
    fig.tight_layout()
    out_path = FIG_DIR / "temporal_generalization.png"
    fig.savefig(out_path, dpi=150)
    # Close explicitly: the figure is only needed on disk.
    plt.close(fig)
    print(f"[nb] Figure saved to {out_path}")

## Stays: DBSCAN vs HDBSCAN comparison

In [None]:
# Stay-point artifacts, one per clustering method, under the processed-data
# directory.
STAYS_DB_PATH = PROCESSED_DIR.joinpath("03_stay_points_dbscan.parquet")
STAYS_HDB_PATH = PROCESSED_DIR.joinpath("03_stay_points_hdbscan.parquet")

def _read_stays_safe(p: Path) -> pd.DataFrame:
    """Load a stay-points parquet file, falling back to an empty DataFrame.

    Args:
        p: Location of the stay-points artifact.

    Returns:
        The loaded frame, or an empty ``pd.DataFrame`` when ``p`` does not
        exist or when reading it raises. Both fallback cases print a
        ``[nb]`` note so an absent or broken artifact is visible in the
        notebook output.
    """
    if not p.exists():
        # Report the absent artifact rather than returning silently: the
        # comparison still runs when only one stays file is present, so this
        # note is the only indication of which input was missing.
        print(f"[nb] Missing: {p}; proceeding with empty DataFrame.")
        return pd.DataFrame()
    try:
        return pd.read_parquet(p)
    except Exception as e:
        # Deliberately broad: keep the notebook running whatever the reader
        # raises.
        print(f"[nb] Could not read stays from {p}: {e}")
        return pd.DataFrame()

# Load both stay-point artifacts (DBSCAN first, then HDBSCAN).
stays_db, stays_hdb = map(_read_stays_safe, (STAYS_DB_PATH, STAYS_HDB_PATH))

# Skip only when BOTH frames are empty; the comparison still runs when just
# one of them has rows.
# NOTE(review): confirm evaluate_stay_detection_comparison copes with one
# empty input.
if stays_db.empty and stays_hdb.empty:
    print("[nb] Stays artifacts missing; skipping stay detection comparison.")
else:
    comp_df = evaluate_stay_detection_comparison(df_points, stays_db, stays_hdb)
    display(comp_df.head())

## Map-Matching Quality (enhanced)

In [None]:
# Snapped trajectories; an empty DataFrame is used when the file is missing
# or unreadable.
PATH_SNAPPED = PROCESSED_DIR.joinpath("01_trajectories_snapped.parquet")
snapped_df = _read_parquet_safe(PATH_SNAPPED)

# Evaluate map matching against the cleaned points, passing the two report
# paths, and show the summary as a one-row table.
mm_summary = evaluate_map_matching_quality(
    df_points,
    snapped_df,
    report_path="outputs/reports/map_matching_quality.csv",
    by_trip_report_path="outputs/reports/map_matching_quality_by_trip.csv",
)
display(pd.DataFrame([mm_summary]))

# Plot only when an average lateral offset is available (present and not NaN).
if pd.notna(mm_summary.get("average_lateral_offset_m", np.nan)):
    fig, ax = plt.subplots(figsize=(7, 4))
    # Bar label -> summary key; the delta entries fall back to NaN if absent.
    vals = {
        label: mm_summary.get(key, np.nan)
        for label, key in (
            ("avg_offset_m", "average_lateral_offset_m"),
            ("delta_half_offset_m", "delta_half_offset_m"),
            ("delta_third_offset_m", "delta_third_offset_m"),
        )
    }
    sns.barplot(x=list(vals), y=list(vals.values()), ax=ax, color="#72B7B2")
    ax.set_title("Map-Matching Offset and Downsampling Deltas")
    fig.tight_layout()
    out_path = FIG_DIR / "map_matching_quality_offsets.png"
    fig.savefig(out_path, dpi=150)
    # Close explicitly: the figure is only needed on disk.
    plt.close(fig)
    print(f"[nb] Figure saved to {out_path}")

## Mode Model Extended Metrics Summary

In [None]:
# Metrics files for the mode model; the baseline file is optional.
EXT_METRICS_JSON = Path("outputs/reports/mode_model_metrics_extended.json")
BASE_METRICS_JSON = Path("outputs/reports/mode_model_metrics_baseline.json")

if not EXT_METRICS_JSON.exists():
    # Nothing to summarise without the extended metrics file.
    print(f"[nb] Extended metrics JSON not found at {EXT_METRICS_JSON}; skipping.")
else:
    # Pass the baseline path only when that file is actually present.
    baseline_arg = str(BASE_METRICS_JSON) if BASE_METRICS_JSON.exists() else None
    summary_df = evaluate_mode_model_extended(
        metrics_json_path=str(EXT_METRICS_JSON),
        baseline_metrics_json_path=baseline_arg,
        report_path="outputs/reports/mode_model_extended_summary.csv",
    )
    display(summary_df)

    # Chart whichever of the headline metric columns the summary contains.
    wanted = ("extended_accuracy", "extended_f1_macro", "extended_auroc")
    cols = [name for name in wanted if name in summary_df.columns]
    if cols:
        fig, ax = plt.subplots(figsize=(6, 4))
        # Reshape to long form: one (metric, value) row per selected column.
        long_df = summary_df.melt(
            value_vars=cols, var_name="metric", value_name="value"
        )
        sns.barplot(data=long_df, x="metric", y="value", color="#E45756", ax=ax)
        ax.set_title("Mode Model (Extended) Metrics")
        fig.tight_layout()
        out_path = FIG_DIR / "mode_model_extended_metrics.png"
        fig.savefig(out_path, dpi=150)
        # Close explicitly: the figure is only needed on disk.
        plt.close(fig)
        print(f"[nb] Figure saved to {out_path}")