# 1.0 - Data Exploration

Lightweight, reproducible scaffold to load processed artifacts, run EDA summaries, write CSV reports, and generate publication-ready figures.

Outputs:
- Figures: `outputs/figures/`
- Reports (CSV): `outputs/reports/`

In [None]:
# Imports
import os
from pathlib import Path
import pandas as pd

from src.eda import (
    load_artifacts,
    summarize_points,
    summarize_users,
    write_overview_report,
    write_users_report,
    generate_all_figures,
)

# Output locations, relative to the current working directory.
FIG_DIR = Path("outputs/figures")
REP_DIR = Path("outputs/reports")

# Create both output directories up front; existing directories are left as-is.
for _out_dir in (FIG_DIR, REP_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the absolute locations so the run log shows where outputs land.
print(f"[Notebook] Figures dir: {FIG_DIR.resolve()}")
print(f"[Notebook] Reports dir: {REP_DIR.resolve()}")

In [None]:
# Load the processed point-level and trip-level artifacts.
artifact_paths = {
    "points_path": "data/processed/01_trajectories_cleaned.parquet",
    "trips_path": "data/processed/02_trips.parquet",
}
df_points, df_trips = load_artifacts(**artifact_paths)

# Report shapes; getattr with a default tolerates objects that lack `.shape`.
points_shape = getattr(df_points, "shape", None)
trips_shape = getattr(df_trips, "shape", None)
print("[Notebook] Points shape:", points_shape)
print("[Notebook] Trips shape:", trips_shape)

## Overview Summary and Per-user Summary
Writes CSVs to:
- `outputs/reports/summary_overview.csv`
- `outputs/reports/summary_users.csv`

In [None]:
# Both reports are written into the same reports directory.
report_dir = str(REP_DIR)

# Overview summary (computed from the points table only).
overview_path = write_overview_report(df_points, out_dir=report_dir)
print("[Notebook] Wrote:", overview_path)

# Per-user summary (computed from both points and trips).
users_path = write_users_report(df_points, df_trips, out_dir=report_dir)
print("[Notebook] Wrote:", users_path)

# Best-effort inline preview: the full overview table, then the head of the
# per-user table. Any failure here is reported but does not stop the notebook.
try:
    overview_preview = pd.read_csv(overview_path)
    display(overview_preview)
    users_preview = pd.read_csv(users_path).head()
    display(users_preview)
except Exception as e:
    print("[Notebook] Preview read failed:", e)

## Generate Figures
Saves plots to `outputs/figures/`:
- `speed_distribution.png`, `speed_boxplot.png`
- `acceleration_distribution.png`, `acceleration_boxplot.png` (if acceleration exists)
- `trip_distance_distribution.png`, `trip_distance_boxplot.png`
- `trip_duration_distribution.png`, `trip_duration_boxplot.png`
- `tod_activity.png`, `dow_activity.png`, `monthly_activity.png`
- `trajectory_map_user_sample.png`
- `kde_heatmap.png`

In [None]:
# Generate every available figure into the figures directory.
figures_dir = str(FIG_DIR)
saved = generate_all_figures(df_points, df_trips, out_dir=figures_dir)

# List what was produced, one "key: value" entry per line of the returned mapping.
print("[Notebook] Saved figures:")
for label, target in saved.items():
    print(f"  - {label}: {target}")

## Notes
- The EDA utilities are robust to missing columns (e.g., acceleration); plots whose required columns are missing are skipped.
- The notebook is designed to run top-to-bottom without manual intervention once artifacts exist.
- If your processed artifacts are in a different location, adjust the `points_path` and `trips_path` arguments passed to `load_artifacts`.