In [1]:
import json
import os
import random

import folium
import h3
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import pandas as pd
from shapely import MultiPolygon, Point, Polygon
from shapely.ops import unary_union
from sklearn.cluster import KMeans

In [2]:
# Load the main stop table and its `_test` counterpart from local parquet files.
df, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('df_la.parquet', 'df_la_test.parquet')
)

In [3]:
# Keep only the columns this notebook works with. The explicit .copy() makes
# `df` an independent frame: without it pandas can treat `df` as a slice of the
# loaded table and emit SettingWithCopyWarning when later cells add columns.
df = df[["route_id", "stop_id", "zone_id", "lat", "lng", "order"]].copy()

In [12]:
# Tag every row of both frames with the H3 cell that contains its (lat, lng),
# first at resolution 6 and then at resolution 5.
for column, resolution in (("h3_index_6", 6), ("h3_index_5", 5)):
    for frame in (df, df_test):
        frame.loc[:, column] = frame.apply(
            lambda row: h3.latlng_to_cell(row["lat"], row["lng"], res=resolution),
            axis=1,
        )

In [13]:
# Unique resolution-6 H3 cells that contain at least one row, per frame.
indices = df["h3_index_6"].unique().tolist()
# Fix: the test cells come from df_test. They were previously read from df, so
# the blue overlay below simply redrew the cells already outlined in red.
indices_test = df_test["h3_index_6"].unique().tolist()

# Centre the map on the mean coordinate of df.
mean_lat = df["lat"].mean()
mean_lng = df["lng"].mean()

m = folium.Map(location=[mean_lat, mean_lng], zoom_start=9)

# Outline the df cells in red, then the df_test cells in blue on top of them.
for cells, outline_color in ((indices, "red"), (indices_test, "blue")):
    for index in cells:
        hex_boundary = h3.cell_to_boundary(index)

        folium.Polygon(
            locations=hex_boundary,
            color=outline_color,
            weight=1,
        ).add_to(m)

m

In [14]:
# Report how many distinct resolution-6 cells were collected in `indices`.
cell_total = len(indices)
print(cell_total)

229


In [15]:
# For each route, count the distinct resolution-6 cells among its rows whose
# `order` is not 0, then tabulate how many routes have each count.
rows_with_nonzero_order = df[df["order"] != 0]
cells_per_route = rows_with_nonzero_order.groupby("route_id")["h3_index_6"].nunique()

count_list = cells_per_route.tolist()
counts = pd.Series(count_list).value_counts()
counts


2    1108
1     926
3     749
4      43
5       5
Name: count, dtype: int64

In [16]:
# Represent each resolution-6 cell by the coordinate h3 reports for it and
# group the cells with KMeans, using one cluster per four cells (integer division).
points = [h3.cell_to_latlng(cell) for cell in indices]

n_zone_clusters = len(indices) // 4
db = KMeans(n_clusters=n_zone_clusters, random_state=16).fit(points)

# labels[i] is the cluster assigned to indices[i] (points follows the order of indices).
labels = db.labels_

In [17]:
# Build one colour per KMeans cluster label.
unique_labels = set(labels)

# plt.get_cmap replaces plt.cm.get_cmap, which Matplotlib deprecated (3.7) and
# removed in later releases; the call takes the same (name, lut) arguments.
cmap = plt.get_cmap('jet', len(unique_labels))

# Iterate the labels in sorted order so the label -> colour assignment does not
# depend on set iteration order.
color_map = {
    label: mcolors.to_hex(cmap(i)) for i, label in enumerate(sorted(unique_labels))
}

# Pick one route at random to overlay on the clusters (unseeded, so the chosen
# route differs between runs).
routes = df["route_id"].unique()
route_id = random.choice(routes)

route = df[df["route_id"] == route_id]

m = folium.Map(location=[df["lat"].mean(), df["lng"].mean()], zoom_start=10)

# Fill every hexagon with the colour of its cluster; the tooltip carries the label.
for index, label in zip(indices, labels):
    hex_boundary = h3.cell_to_boundary(index)
    color = color_map[label]
    folium.Polygon(
        locations=hex_boundary,
        color=color,
        weight=1,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        tooltip=[label.item()]
    ).add_to(m)

# Mark each row of the sampled route as a red dot.
for lat, lng in zip(route["lat"], route["lng"]):
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        color="red",
        fill=True,
        fill_color="red",
        fill_opacity=1,
    ).add_to(m)

m

  cmap = plt.cm.get_cmap('jet', len(unique_labels))


In [20]:
# Map every resolution-6 cell to its cluster label. `.item()` converts the
# numpy integer to a plain Python int.
labels_dict = {index: label.item() for index, label in zip(indices, labels)}

# Attach the cluster label to every row; a cell missing from labels_dict
# raises KeyError here rather than being silently skipped.
df["zone_cluster"] = df["h3_index_6"].apply(lambda x: labels_dict[x])

label_df = pd.DataFrame(list(labels_dict.items()), columns=['hex', 'label'])

# One MultiPolygon per cluster, holding that cluster's hexagons. The boundary
# pairs are unpacked as (lat, lon) and swapped to (lon, lat) for shapely's
# (x, y) ordering.
multipolygon_list = []

for _, group in label_df.groupby(by="label"):
    polygon_list = [
        Polygon([(lon, lat) for lat, lon in h3.cell_to_boundary(cell)])
        for cell in group["hex"]
    ]
    multipolygon_list.append(MultiPolygon(polygon_list))


m = folium.Map(location=[df["lat"].mean(), df["lng"].mean()], zoom_start=10)

# Outline every cluster in red: dissolve its hexagons into one shape and trace
# the outer ring.
for p in multipolygon_list:
    merged = unary_union(p)
    # Fix: when a cluster's hexagons are not all connected, the union is a
    # MultiPolygon, which has no `.exterior`. Draw each connected part on its
    # own instead of failing with AttributeError.
    parts = merged.geoms if merged.geom_type == "MultiPolygon" else [merged]
    for part in parts:
        # Swap back to (lat, lon), the order folium expects.
        coords = [(lat, lon) for lon, lat in part.exterior.coords]
        folium.PolyLine(
            locations=coords,
            fill=True,
            fill_opacity=0.2,
            weight=2,
            color="red",
        ).add_to(m)


# Overlay the resolution-5 cells in blue for comparison with the cluster outlines.
res5_hex = df["h3_index_5"].unique()

for index in res5_hex:
    hex_boundary = h3.cell_to_boundary(index)

    folium.Polygon(
        locations=hex_boundary,
        color="blue",
        weight=3,
    ).add_to(m)

m

In [52]:
# Write one parquet file per zone cluster, named after the cluster label.
# Create the output directory up front so the export also works on a fresh
# checkout where zone_dfs/ does not exist yet.
os.makedirs("zone_dfs", exist_ok=True)

# The groupby key is the cluster label, so there is no need to read it back
# from the first row of each group.
for zone_label, cluster in df.groupby("zone_cluster"):
    cluster.to_parquet(f"zone_dfs/cluster_{zone_label}.parquet")

In [53]:
# Read the exported file for cluster 0 back in and display it as a sanity check.
# NOTE(review): the saved output under this cell lists columns (e.g. `h3_index`,
# `station_code`) that the earlier column selection on `df` does not keep, so
# it looks stale; re-run from a fresh kernel to confirm.
cluster_0_path = "zone_dfs/cluster_0.parquet"
test = pd.read_parquet(cluster_0_path)
test

Unnamed: 0,route_id,station_code,date,departure_time_utc,executor_capacity_cm3,route_score,stop_id,lat,lng,type,zone_id,cluster,order,is_start,planned_time,h3_index,zone_cluster
3932,RouteID_01c9f706-0d46-427b-998c-f1559dda4ad3,DLA7,2018-07-22,14:36:46,3313071.0,Medium,BP,33.965832,-117.982821,Dropoff,D-6.3A,0,117,False,33.5,8629a0a4fffffff,0
3952,RouteID_01c9f706-0d46-427b-998c-f1559dda4ad3,DLA7,2018-07-22,14:36:46,3313071.0,Medium,EI,33.964745,-117.985162,Dropoff,D-6.1A,0,131,False,35.3,8629a0a4fffffff,0
3982,RouteID_01c9f706-0d46-427b-998c-f1559dda4ad3,DLA7,2018-07-22,14:36:46,3313071.0,Medium,HY,33.964967,-117.986359,Dropoff,D-6.3A,0,119,False,70.5,8629a0a4fffffff,0
4004,RouteID_01c9f706-0d46-427b-998c-f1559dda4ad3,DLA7,2018-07-22,14:36:46,3313071.0,Medium,NT,33.965821,-117.985039,Dropoff,D-6.3A,0,118,False,82.0,8629a0a4fffffff,0
4006,RouteID_01c9f706-0d46-427b-998c-f1559dda4ad3,DLA7,2018-07-22,14:36:46,3313071.0,Medium,NY,33.967667,-117.984246,Dropoff,D-6.3A,0,115,False,130.0,8629a0a4fffffff,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396763,RouteID_f91fa739-9986-4dbf-8a14-99d3ca80d724,DLA7,2018-08-12,16:28:44,3313071.0,Medium,WP,34.009675,-117.868067,Dropoff,E-25.2H,0,2,False,85.0,8629a0a5fffffff,0
396764,RouteID_f91fa739-9986-4dbf-8a14-99d3ca80d724,DLA7,2018-08-12,16:28:44,3313071.0,Medium,WS,34.011607,-117.872223,Dropoff,E-25.1J,0,11,False,55.0,8629a0a5fffffff,0
396775,RouteID_f91fa739-9986-4dbf-8a14-99d3ca80d724,DLA7,2018-08-12,16:28:44,3313071.0,Medium,YX,34.020337,-117.873034,Dropoff,E-26.3J,0,40,False,25.0,8629a0a5fffffff,0
396778,RouteID_f91fa739-9986-4dbf-8a14-99d3ca80d724,DLA7,2018-08-12,16:28:44,3313071.0,Medium,ZK,34.013536,-117.878602,Dropoff,E-26.1J,0,19,False,45.0,8629a0a5fffffff,0


In [56]:
# Persist the cell -> cluster label mapping as pretty-printed JSON.
with open("zone_dfs/labels_dict.json", "w") as labels_file:
    labels_file.write(json.dumps(labels_dict, indent=2))

In [69]:
# Largest number of distinct zone clusters covered by a single route,
# counting only rows whose `order` is not 0.
clusters_per_route = (
    df[df["order"] != 0]
    .groupby("route_id")["zone_cluster"]
    .nunique()
)
clusters_per_route.max()

np.int64(3)