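# User Segmentation, Frequent Paths, and Mode Heuristics

This notebook builds per-user mobility features, runs KMeans segmentation on them, and optionally computes frequent routes and heuristic transport-mode labels.

Artifacts saved under `data/processed/`:
- 04_user_features.parquet
- 05_frequent_paths.parquet (optional)
- 06_trip_modes_heuristic.parquet (optional)
- 07_user_segments.parquet

Reports saved under `outputs/reports/`:
- 04_user_features_summary.csv
- 05_frequent_paths.csv (optional)
- 06_trip_modes_heuristic.csv (optional)

Figures saved under `outputs/figures/`:
- user_segments_pairplot.png

These are the default locations; `paths.processed_dir` and `paths.output_dir` in `configs/config.yaml` override them.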

In [None]:
# Setup cell: imports, configuration, output directories, and input loading.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_config
from src.segmentation import build_user_feature_table, run_user_segmentation
from src.routes import frequent_paths
from src.mode_inference import label_mode_heuristic


def _read_parquet_or_empty(path: str) -> pd.DataFrame:
    """Return the parquet file at *path*, or an empty DataFrame if it does not exist.

    A missing input is tolerated (the notebook continues with an empty frame),
    but it is reported explicitly instead of only showing up as a zero count.
    """
    if os.path.exists(path):
        return pd.read_parquet(path)
    print(f"Warning: input not found, using empty DataFrame: {path}")
    return pd.DataFrame()


cfg = load_config("configs/config.yaml")

# `or {}` also covers a `paths` key that is present but empty (None),
# which a plain `.get('paths', {})` default would not.
paths_cfg = cfg.get('paths') or {}
processed_dir = paths_cfg.get('processed_dir', 'data/processed')
output_dir = paths_cfg.get('output_dir', 'outputs')
fig_dir = os.path.join(output_dir, 'figures')
report_dir = os.path.join(output_dir, 'reports')

# Create every directory this notebook writes into.
for _directory in (processed_dir, fig_dir, report_dir):
    os.makedirs(_directory, exist_ok=True)

# Inputs expected in the processed directory.
points_path = os.path.join(processed_dir, '01_trajectories_cleaned.parquet')
trips_path = os.path.join(processed_dir, '02_trips.parquet')
stays_path = os.path.join(processed_dir, '03_stay_points.parquet')

print(f"Loading: {points_path}, {trips_path}, {stays_path}")
df_points = _read_parquet_or_empty(points_path)
df_trips = _read_parquet_or_empty(trips_path)
df_stays = _read_parquet_or_empty(stays_path)
print(f"Loaded points={len(df_points)}, trips={len(df_trips)}, stays={len(df_stays)}")

## Build User Feature Table

In [None]:
# Build the per-user feature table and persist it (parquet + CSV summary).

# If you have an assignments table from the previous notebook (home/work),
# you can load it here; for now we proceed without it (None).
df_assignments = None
df_user_features = build_user_feature_table(df_points, df_trips, df_stays, df_assignments)
print(df_user_features.head())

# Pick a parquet engine: prefer pyarrow, fall back to fastparquet.
# The broad `except Exception` is kept from the original so that any failure
# while importing a candidate falls through to the next option.
engine = 'pyarrow'
try:
    import pyarrow  # noqa: F401
except Exception:
    try:
        import fastparquet  # noqa: F401
    except Exception as e:
        # Chain the import failure so the traceback shows why no engine loaded.
        raise RuntimeError('No parquet engine available. Install pyarrow or fastparquet.') from e
    engine = 'fastparquet'

user_feat_path = os.path.join(processed_dir, '04_user_features.parquet')
df_user_features.to_parquet(user_feat_path, index=False, engine=engine)

# describe(include='all') summarises every column of the feature table.
csv_summary_path = os.path.join(report_dir, '04_user_features_summary.csv')
df_user_features.describe(include='all').to_csv(csv_summary_path)
print(f"Saved user features: {user_feat_path} and CSV summary: {csv_summary_path}")

## Run KMeans Segmentation and Save Figure + Artifact

In [None]:
# Segment users with a fixed cluster count and seed, then persist the result.
# The call returns three values, unpacked here as the segment table, the path
# of the saved figure, and `centers` (presumably the cluster centers — confirm
# against src.segmentation).
segmentation_options = {
    'k': 4,
    'random_state': 42,
    'figures_dir': fig_dir,
    'figure_name': 'user_segments_pairplot.png',
}
df_segments, fig_path, centers = run_user_segmentation(df_user_features, **segmentation_options)

# Write the segment table with the parquet engine selected earlier in the notebook.
seg_path = os.path.join(processed_dir, '07_user_segments.parquet')
df_segments.to_parquet(seg_path, index=False, engine=engine)
print(f"Saved segments: {seg_path}. Figure: {fig_path}")

# Last expression of the cell: displayed as the cell output.
df_segments.head()

## Optional: Frequent Paths and Mode Heuristic Labeling

In [None]:
# Optional outputs: frequent paths and heuristic trip modes.
# Each step is skipped when its inputs are empty; in that case the result
# variable is still defined (as an empty DataFrame) so later cells can use it.


def _save_parquet_and_csv(df, stem, parquet_dir, csv_dir, parquet_engine):
    """Write *df* as `<stem>.parquet` in *parquet_dir* and `<stem>.csv` in *csv_dir*.

    Returns the pair ``(parquet_path, csv_path)`` of the files written.
    """
    parquet_path = os.path.join(parquet_dir, f'{stem}.parquet')
    df.to_parquet(parquet_path, index=False, engine=parquet_engine)
    csv_path = os.path.join(csv_dir, f'{stem}.csv')
    df.to_csv(csv_path, index=False)
    return parquet_path, csv_path


# Frequent paths
if not df_points.empty and not df_trips.empty:
    df_freq = frequent_paths(df_points, df_trips, min_occurrences=3, rounding=5, downsample_n=5)
    freq_path, freq_csv = _save_parquet_and_csv(
        df_freq, '05_frequent_paths', processed_dir, report_dir, engine
    )
    print(f"Saved frequent paths: {freq_path} and CSV: {freq_csv}")
else:
    df_freq = pd.DataFrame()
    print("Skipping frequent paths: missing points or trips.")

# Mode heuristic
if not df_trips.empty:
    df_modes = label_mode_heuristic(df_trips)
    modes_path, modes_csv = _save_parquet_and_csv(
        df_modes, '06_trip_modes_heuristic', processed_dir, report_dir, engine
    )
    print(f"Saved heuristic modes: {modes_path} and CSV: {modes_csv}")
else:
    # Mirror the frequent-paths branch: keep df_modes defined when skipping.
    df_modes = pd.DataFrame()
    print("Skipping mode labeling: no trips available.")