In [9]:
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from shapely.geometry import Point
import matplotlib.pyplot as plt
import os
from glob import glob

In [5]:
# Load every prepared OD (origin-destination) CSV into a single DataFrame.

# Sort the paths: glob() returns them in arbitrary, OS-dependent order, and the
# row order of `all_od` carries through to every later step.
all_od_files = sorted(glob('../output/OD_Data_Prep/*.csv'))

# Read each file once and concatenate in a single call; growing a DataFrame
# with pd.concat inside a loop re-copies all accumulated rows on every pass.
od_frames = [pd.read_csv(od_file) for od_file in all_od_files]

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no files match (the same result the incremental loop produced).
all_od = pd.concat(od_frames, ignore_index=True) if od_frames else pd.DataFrame()

all_od.head()

Unnamed: 0,tripid,transportmode,p_ageclass,p_gender,origin_lon,origin_lat,start_datetime,Id_x,destination_lon,destination_lat,end_datetime,Id_y,trip_type
0,00003b26-f4ff-52d0-85f5-13b72a98637c,in_vehicle,,,139.758117,35.645158,2023-05-27 17:30:19.816000+00:00,,139.887644,35.638197,2023-05-27 17:46:34+00:00,,
1,00074cdb-0307-5fc3-a923-562221ae8fbb,in_vehicle,30代,男性,139.793669,35.652479,2023-05-27 14:14:37+00:00,0.0,139.795648,35.65407,2023-05-27 14:15:26+00:00,0.0,Trip within Toyosu
2,0009aa1f-6fed-5030-ac23-c5f948ea449c,on_foot,30代,女性,139.795612,35.655163,2023-05-27 09:58:07.348000+00:00,0.0,139.797147,35.654223,2023-05-27 10:03:53.593000+00:00,0.0,Trip within Toyosu
3,000c2f9b-ece9-58fe-a8a5-16ef0b817cae,on_foot,40代,女性,139.795845,35.654947,2023-05-27 18:28:58.003000+00:00,0.0,139.794256,35.659544,2023-05-27 18:37:39+00:00,0.0,Trip within Toyosu
4,000c3645-5930-5b1a-b22a-2cb624d4092b,in_vehicle,60歳以上,女性,139.728874,35.579196,2023-05-27 15:57:35.640000+00:00,,139.889226,35.63901,2023-05-27 16:20:35.950000+00:00,,


In [6]:
# Keep only the trips labelled as starting and ending inside Toyosu.
is_within_toyosu = all_od["trip_type"].eq("Trip within Toyosu")
df = all_od.loc[is_within_toyosu]

In [7]:
# Build one point layer per trip end. Both layers keep every column of `df`
# and are tagged as EPSG:4326 (WGS 84 longitude/latitude).
origin_points = gpd.points_from_xy(x=df["origin_lon"], y=df["origin_lat"])
gdf_origins = gpd.GeoDataFrame(df, geometry=origin_points, crs="EPSG:4326")

destination_points = gpd.points_from_xy(x=df["destination_lon"],
                                        y=df["destination_lat"])
gdf_dests = gpd.GeoDataFrame(df, geometry=destination_points, crs="EPSG:4326")

In [8]:
# Stack trip origins and destinations into a single point layer.
# ignore_index=True gives every point a unique row label; without it each
# label from `df` appears twice (once for the origin, once for the destination).
gdf_all = pd.concat(
    [gdf_origins[["tripid", "geometry"]], gdf_dests[["tripid", "geometry"]]],
    ignore_index=True,
)

In [10]:
def cluster_points_kmeans(gdf, n_clusters=8, random_state=42):
    """
    Cluster point geometries with K-means and summarise each cluster.

    Parameters
    - gdf: GeoDataFrame of Point geometries. It is not modified; a labelled
      copy is returned instead.
    - n_clusters: Number of clusters to form (e.g., 6-10 for 10 stops).
    - random_state: Seed passed to KMeans so repeated runs give the same labels.

    Returns
    - (labelled, clusters): `labelled` is a copy of `gdf` with an added
      'cluster' column (labels 0 to n_clusters-1). `clusters` is a
      GeoDataFrame indexed by 'cluster' with the mean-coordinate point of each
      cluster ('geometry') and its number of points ('point_count').

    Raises
    - ValueError: if `gdf` has fewer points than `n_clusters`.

    NOTE(review): K-means measures Euclidean distance on the raw x/y values.
    For longitude/latitude input these are degrees, not metres - consider
    clustering in a projected CRS if metric distances matter.
    """
    if len(gdf) < n_clusters:
        raise ValueError(
            f"Cannot form {n_clusters} clusters from {len(gdf)} point(s)"
        )

    # Work on a copy so the caller's GeoDataFrame does not silently gain a
    # 'cluster' column as a side effect of calling this function.
    labelled = gdf.copy()

    # Vectorised coordinate extraction: one (n_points, 2) array of [x, y]
    coords = np.column_stack([labelled.geometry.x, labelled.geometry.y])

    # Run K-means
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(coords)

    # Add cluster labels (0 to n_clusters-1); assignment is positional
    labelled['cluster'] = kmeans.labels_

    # Per-cluster mean coordinate and point count, computed from the plain
    # coordinate array rather than by aggregating geometry objects
    by_cluster = pd.DataFrame({
        'cluster': kmeans.labels_,
        'x': coords[:, 0],
        'y': coords[:, 1],
    }).groupby('cluster')
    mean_xy = by_cluster[['x', 'y']].mean()
    point_count = by_cluster.size()

    # Centroids live in the same CRS as the input points; fall back to the
    # previously hard-coded EPSG:4326 only when the input carries no CRS.
    centroid_crs = labelled.crs if labelled.crs is not None else "EPSG:4326"

    clusters = gpd.GeoDataFrame(
        {'point_count': point_count},
        geometry=[Point(x, y) for x, y in zip(mean_xy['x'], mean_xy['y'])],
        crs=centroid_crs,
    )

    # Keep the original column order: geometry first, then point_count
    return labelled, clusters[['geometry', 'point_count']]

In [11]:
# Partition all trip end points into 25 K-means clusters.
gdf_clustered, centroids = cluster_points_kmeans(gdf=gdf_all, n_clusters=25)

In [12]:
# Expose the cluster label as a regular column before export.
# The guard keeps this cell idempotent: an unconditional in-place reset_index
# would insert a spurious "index" column if the cell were run a second time.
if "cluster" not in centroids.columns:
    centroids = centroids.reset_index(drop=False)

centroids.to_file("../output/cluster_25.geojson", driver="GeoJSON")