In [40]:
import os

from typing import List

import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from folium import Map, CircleMarker, PolyLine, FeatureGroup
from shapely import MultiPoint, centroid

from formation.helpers.map import calcul_distances, calcul_distance

import plotly.io as pio

pio.get_chrome()

PosixPath('/home/jaden/Sources/followchon_back/.venv/lib/python3.12/site-packages/choreographer/cli/browser_exe/chrome-linux64/chrome')

In [41]:
COLORS = px.colors.qualitative.Vivid + px.colors.qualitative.Alphabet

In [52]:
def cross_distance(row):
    return calcul_distance(
        row['lat_current'],
        row['lng_current'],
        row['lat_next'],
        row['lng_next']
    )

def calcul_near_point(index: str, points_others: List[str], df_all_distances: pd.DataFrame)->str:
    distances = df_all_distances.loc[index, points_others].to_dict()

    return str(min(distances, key=lambda k: distances[k]))

def calcul_path(
        indexes_started: List[str],
        indexes_ended: List[str],
        indexes_between: List[str],
        df_all_distances: pd.DataFrame,
):
    index_start = indexes_started[-1]
    index_end = indexes_ended[-1]

    if len(indexes_between) > 0:
        indexes_started.append(
            calcul_near_point(index_start, indexes_between, df_all_distances)
        )

        indexes_between = list(
            filter(
                lambda val: val != indexes_started[-1],
                indexes_between
            )
        )

        if len(indexes_between) > 0:
            indexes_ended.append(
                calcul_near_point(index_end, indexes_between, df_all_distances)
            )

            indexes_between = list(
                filter(
                    lambda val: val != indexes_ended[-1],
                    indexes_between
                )
            )

        return calcul_path(
            indexes_started,
            indexes_ended,
            indexes_between,
            df_all_distances,
        )

    return indexes_started + list(reversed(indexes_ended))

def calcul_centroids(df_cluster_points: pd.DataFrame):
    clusters = df_cluster_points['cluster'].unique()

    df_centroids = pd.DataFrame()

    for i in clusters:
        cluster_points = df_cluster_points[df_cluster_points['cluster'] == str(i)].loc[:, ['lat', 'lng']].to_numpy()
        c = centroid(MultiPoint(cluster_points))

        df_centroids = pd.concat([
            df_centroids,
            pd.DataFrame({
                'index': [i],
                'lat': [c.x],
                'lng': [c.y]
            })
        ], ignore_index=True)

    df_centroids['index'] = df_centroids['index'].astype(str)
    df_centroids['lat'] = df_centroids['lat'].astype(float)
    df_centroids['lng'] = df_centroids['lng'].astype(float)

    return calcul_distances(df_centroids.set_index('index'))

def calc_clusters(df_centroids: pd.DataFrame, df_clusters_points: pd.DataFrame)->pd.DataFrame:
    df_crossed = pd.DataFrame()

    # On determine les ponts entre clusters
    for index, row_cluster in df_centroids.iterrows():
        df_current = df_clusters_points[df_clusters_points['cluster'] == row_cluster['index']][['cluster', 'lat', 'lng']]
        df_next = df_clusters_points[df_clusters_points['cluster'] == row_cluster['next']][['cluster', 'lat', 'lng']]

        df_cross = df_current.reset_index() \
            .merge(df_next.reset_index(), how='cross', suffixes=("_current", "_next"))

        df_cross['distance'] = df_cross.apply(cross_distance, axis=1)

        df_bridge = df_cross.sort_values(by='distance', ascending=True)[[
            'index_current', 'cluster_current', 'index_next', 'cluster_next'
        ]].head(1)

        df_crossed = pd.concat([
            df_crossed,
            df_bridge,
        ])

    df_crossed['pair'] = df_crossed['cluster_current'] + '_' + df_crossed['cluster_next']

    df_centroids = pd.merge(
        df_centroids,
        df_crossed,
        left_on='pair',
        right_on='pair',
    )[['index', 'next', 'prev', 'lat', 'lng', 'pair', 'index_current', 'index_next']]

    df_centroids = pd.merge(
        df_centroids,
        df_clusters_points[['lat', 'lng']],
        left_on='index_current',
        right_index=True,
        suffixes=('', '_current')
    )

    df_centroids = pd.merge(
        df_centroids,
        df_clusters_points[['lat', 'lng']],
        left_on='index_next',
        right_index=True,
        suffixes=('', '_next')
    )

    df_centroids = df_centroids.drop(columns=['pair']) \
        .rename(columns={
        'index': 'cluster_current',
        'next': 'cluster_next',
        'prev': 'cluster_prev',
        'lat': 'center_lat',
        'lng': 'center_lng',
        'index_current': 'bridge_out_current_code',
        'lat_current': 'bridge_out_current_lat',
        'lng_current': 'bridge_out_current_lng',
        'index_next': 'bridge_out_next_code',
        'lat_next': 'bridge_out_next_lat',
        'lng_next': 'bridge_out_next_lng',
    })

    df_centroids['bridge_in_current_code'] = df_centroids['bridge_out_next_code'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_current_code'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_next_code']

    df_centroids['bridge_in_current_lat'] = df_centroids['bridge_out_next_lat'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_current_lat'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_next_lat']

    df_centroids['bridge_in_current_lng'] = df_centroids['bridge_out_next_lng'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_current_lng'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_next_lng']

    df_centroids['bridge_in_next_code'] = df_centroids['bridge_out_current_code'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_next_code'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_current_code']

    df_centroids['bridge_in_next_lat'] = df_centroids['bridge_out_current_lat'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_next_lat'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_current_lat']

    df_centroids['bridge_in_next_lng'] = df_centroids['bridge_out_current_lng'].shift(periods=1, fill_value='')
    df_centroids.loc[df_centroids.index[0], 'bridge_in_next_lng'] = df_centroids.loc[df_centroids.index[-1], 'bridge_out_current_lng']

    return df_centroids

def calc_layer_tour(df_centroids: pd.DataFrame, df_cluster_points: pd.DataFrame, df_all_distances: pd.DataFrame) -> pd.DataFrame:
    df_tour = pd.DataFrame()

    for index, row_cluster in df_centroids.iterrows():
        df_current = df_cluster_points[df_cluster_points['cluster'] == row_cluster['cluster_current']][[]].reset_index()

        start = df_current.loc[df_current['index'] == row_cluster['bridge_in_current_code'], ['index']].iloc[0]['index']
        end = df_current.loc[df_current['index'] == row_cluster['bridge_out_current_code'], ['index']].iloc[0]['index']
        between = df_current[
            (df_current['index'] != row_cluster['bridge_in_current_code']) &
            (df_current['index'] != row_cluster['bridge_out_current_code'])
            ]['index'].values
        between = [str(b) for b in between]

        indexes = calcul_path(
            [str(start)],
            [str(end)],
            between,
            df_all_distances
        )

        df_current['order'] = df_current['index'].map({
            index: i
            for i, index in enumerate(indexes)
        })

        df_tour = pd.concat([
            df_tour,
            df_current.sort_values('order').drop(columns=['order'])
        ], ignore_index=True)

    df_tour = df_tour.merge(
        df_cluster_points,
        left_on='index',
        right_index=True,
        how='left',
    )

    df_tour['next_lat'] = df_tour['lat'].shift(periods=-1, fill_value='')
    df_tour['next_lng'] = df_tour['lng'].shift(periods=-1, fill_value='')

    df_tour.loc[df_tour.index[-1], 'next_lat'] = df_tour.loc[df_tour.index[0], 'lat']
    df_tour.loc[df_tour.index[-1], 'next_lng'] = df_tour.loc[df_tour.index[0], 'lng']

    return df_tour

def draw_map(df_cities, end=False):
    m = Map(
        location=(46.227638, 2.213749),
        zoom_start=6,
    )

    fg = FeatureGroup(name="Markers")

    for i, point in df_cities.iterrows():
        if 'lat' in point and 'lng' in point:
            fg.add_child(
                CircleMarker(
                    location=[
                        point['lat'],
                        point['lng'],
                    ],
                    tooltip=f"{point['index']} ({point['cluster']})",
                    fill=True,
                    color=COLORS[int(point['cluster'])],
                    weight=0,
                    fill_opacity=0.6,
                    radius=15,
                )
            )

        if 'center_lat' in point and 'center_lng' in point:
            fg.add_child(
                CircleMarker(
                    location=[
                        point['center_lat'],
                        point['center_lng'],
                    ],
                    #tooltip=f"{point['index']} ({point['cluster_current']})",
                    fill=True,
                    color=COLORS[int(point['cluster_current'])],
                    weight=0,
                    fill_opacity=0.6,
                    radius=15,
                )
            )

        if 'next_lat' in point and 'next_lng' in point:
            fg.add_child(
                PolyLine(
                    locations=[
                        [point['lat'], point['lng']],
                        [point['next_lat'], point['next_lng']],
                    ],
                    color="black",
                    weight=4,
                )
            )

    m.add_child(fg)

    return m

def calcul_min_path(df_points: pd.DataFrame, df_distances: pd.DataFrame):
    indexes = df_points.index.unique()

    index_started = []
    points_tours = {}

    while True:
        index_not_started = list(
            filter(
                lambda v: v not in index_started,
                indexes
            )
        )

        if len(index_not_started) > 0:
            point_start = index_not_started[0]
            index_started.append(point_start)

            points_added = [point_start]
            distance_total = 0
            while True:
                city_code = points_added[-1]
                distances = df_distances[city_code].to_dict()

                index_not_added = {
                    k: v
                    for k, v in distances.items()
                    if k not in points_added
                }

                if len(index_not_added) > 0:
                    index_next = min(index_not_added, key=lambda k: index_not_added[k])
                    distance_total += min(index_not_added.values())

                    points_added.append(index_next)
                else:
                    break

            points_tours[point_start] = {
                'distance': distance_total,
                'indexes': points_added
            }

        else:
            break

    best_tours = min(points_tours.values(), key=lambda obj: obj["distance"])['indexes']

    df_points['order'] = df_points.index.map({
        code: i
        for i, code in enumerate(best_tours)
    })

    df_points = df_points.sort_values(by='order')
    df_points['index'] = df_points.index

    df_points['next'] = df_points['index'].shift(periods=-1, fill_value='')
    df_points['next'] = df_points['next'].astype(str)
    df_points.loc[df_points.index[-1], 'next'] = df_points.iloc[0]['index']

    df_points['prev'] = df_points['index'].shift(periods=1, fill_value='')
    df_points['prev'] = df_points['prev'].astype(str)
    df_points.loc[df_points.index[0], 'prev'] = df_points.iloc[-1]['index']

    df_points["distance"] = df_points.apply(
        lambda r: df_distances.loc[r["index"], r["next"]] if r["next"] else 0,
        axis=1
    )
    #df_points["distance_cum"] = df_points["distance"].cumsum()

    return df_points.drop(columns=['index'])

def calcul_min_path_last_cluster(df_current_tour: pd.DataFrame, df_curent_distances: pd.DataFrame)->pd.DataFrame:
    df_current_tour = calcul_min_path(df_current_tour.set_index('cluster_current'), df_curent_distances) \
        .reset_index().drop(columns=['order', 'next', 'prev', 'distance'])

    df_current_tour['index'] = df_current_tour['cluster_current']
    df_current_tour['cluster'] = df_current_tour['cluster_current']
    df_current_tour['lat'] = df_current_tour['center_lat']
    df_current_tour['lng'] = df_current_tour['center_lng']

    df_current_tour['next_lat'] = df_current_tour['lat'].shift(periods=-1, fill_value='')
    df_current_tour['next_lng'] = df_current_tour['lng'].shift(periods=-1, fill_value='')

    df_current_tour.loc[df_current_tour.index[-1], 'next_lat'] = df_current_tour.loc[df_current_tour.index[0], 'lat']
    df_current_tour.loc[df_current_tour.index[-1], 'next_lng'] = df_current_tour.loc[df_current_tour.index[0], 'lng']

    return df_current_tour

In [43]:
df_cities = pd.read_csv(
    '../data/formated/cities-france.csv',
    dtype={'code_insee': str, 'dep_code': str, 'nom_standard': str, 'latitude_mairie': float, 'longitude_mairie': float},
).rename(columns={'code_insee': 'index', 'nom_standard': 'name', 'latitude_mairie': 'lat', 'longitude_mairie': 'lng'}) \
    .set_index('index') \
    .loc[:, ['lat', 'lng']]

In [44]:
city_count = 70
cluster_max_length = 3
cluster_min_length = 3
df_points_coords, df_points_distances = calcul_distances(df_cities.sample(city_count))

if not os.path.exists('data'):
    os.makedirs('data')

if not os.path.exists('export'):
    os.makedirs('export')

steps_count = 1

while True:
    print(f'Step {steps_count} with {df_points_coords.shape[0]} points')

    points = df_points_coords.loc[:,['x', 'y']].to_numpy()

    k_kmeans = df_points_coords.shape[0] if df_points_coords.shape[0] < cluster_min_length else df_points_coords.shape[0] // cluster_max_length
    if cluster_max_length ** 2 > df_points_coords.shape[0] > cluster_max_length:
        k_kmeans = cluster_min_length

    print(f'with {k_kmeans} clusters')

    model_kmeans = KMeans(n_clusters=k_kmeans)
    model_kmeans.fit(points)
    df_points_coords['cluster'] = model_kmeans.predict(points)
    df_points_coords['cluster'] = df_points_coords['cluster'].astype(str)

    draw_map(df_points_coords.reset_index()).save(f'export/step_{steps_count}_map.html')

    df_points_coords.to_csv(f'data/step_{steps_count}_points.csv', index=False)
    df_points_distances.to_csv(f'data/step_{steps_count}_distances.csv')

    fig = px.scatter(
        df_points_coords.reset_index(),
        x='x',
        y='y',
        color='cluster',
        hover_name='index',
        height=700,
        size=[1] * df_points_coords.shape[0],
        color_discrete_sequence=COLORS,
    )
    fig.write_image(f"export/step_{steps_count}_points.jpg")

    if df_points_coords.shape[0] > cluster_min_length:
        df_centroids_coords, df_centroids_distances = calcul_centroids(df_points_coords)

        # df_centroids_min_path = calcul_min_path(
        #     df_centroids_coords,
        #     df_centroids_distances
        # )
        # df_centroids_min_path['pair'] = df_centroids_min_path.index + '_' + df_centroids_min_path['next']
        # df_centroids_min_path.sort_values('order', ascending=True)

        # df_clusters = calc_clusters(df_centroids_coords, df_points_coords)
        # df_centroids_coords.to_csv(f'data/step_{step}_centroids.csv', index=False)

        df_points_coords, df_points_distances = df_centroids_coords, df_centroids_distances

        steps_count += 1
    else:
        break




The default value of `n_init` will change from 4 to 1 in 1.9.



Step 1 with 70 points
with 23 clusters



The default value of `n_init` will change from 4 to 1 in 1.9.



Step 2 with 23 points
with 7 clusters



The default value of `n_init` will change from 4 to 1 in 1.9.



Step 3 with 7 points
with 3 clusters



The default value of `n_init` will change from 4 to 1 in 1.9.



Step 4 with 3 points
with 1 clusters


In [56]:
steps_list = range(1, steps_count + 1)[::-1]
print("steps :", list(steps_list))
# layers = []

# df_tour = pd.read_csv('data/step_' + str(steps_count) + '_clusters.csv', dtype={
#     'cluster_current': str,
#     'cluster_next': str,
#     'cluster_prev': str,
#     'bridge_out_current_code': str,
#     'bridge_out_next_code': str,
#     'bridge_in_current_code': str,
#     'bridge_in_next_code': str,
# })

df_tour = pd.DataFrame()

for n in steps_list:
    df_centroids = pd.read_csv('data/step_' + str(n) + '_points.csv', dtype={'index': str, 'cluster': str})

    df_centroids_distances = pd.read_csv('data/step_' + str(n) + '_distances.csv', dtype={'index': str})
    df_centroids_distances = df_centroids_distances.set_index('index')

    print(f'Begin step {n} with {df_centroids.shape[0]} points')

    if n > 1:
        df_points = pd.read_csv('data/step_' + str(n - 1) + '_points.csv', dtype={'index': str, 'cluster': str})
        df_points = df_points.set_index('index')

        df_points_distances = pd.read_csv('data/step_' + str(n - 1) + '_distances.csv', dtype={'index': str})
        df_points_distances = df_points_distances.set_index('index')

        df_centroids_min_path = calcul_min_path(df_centroids.set_index('index'), df_centroids_distances)
        df_centroids_min_path['pair'] = df_centroids_min_path.index + '_' + df_centroids_min_path['next']
        df_centroids_min_path = df_centroids_min_path.reset_index().sort_values('order', ascending=True)

        df_cluster = calc_clusters(df_centroids_min_path, df_points)
        df_tour = calc_layer_tour(df_cluster, df_points, df_points_distances)

        draw_map(df_tour, True).save(f'export/step_{n - 1}_map.html')

        #

        # df_tour['order'] = df_tour.index
        #
        # df_tour_next = pd.read_csv('data/step_' + str(n - 1) + '_clusters.csv', dtype={
        #     'cluster_current': str,
        #     'cluster_next': str,
        #     'cluster_prev': str,
        #     'bridge_out_current_code': str,
        #     'bridge_out_next_code': str,
        #     'bridge_in_current_code': str,
        #     'bridge_in_next_code': str,
        # })
        #
        # df_tour = pd.merge(
        #     df_tour_next,
        #     df_tour[['index', 'order']],
        #     left_on='cluster_current',
        #     right_on='index',
        #     how='left',
        # ).sort_values('order').drop(columns=['index', 'order'])




    #df_tour = calc_layer_tour(df_step_min_path, df_points, df_distances)

    # if n == steps_list[-1]:
    #     df_tour = calcul_min_path_last_cluster(df_tour, df_distances)
    #     draw_map(df_tour, True).save(f'export/step_{n + 1}_map.html')
    #
    # df_tour = calc_layer_tour(df_tour, df_points, df_distances)
    # draw_map(df_tour, True).save(f'export/step_{n}_map.html')

    # if n > 1:
    #     df_tour['order'] = df_tour.index

        # df_tour_next = pd.read_csv('data/step_' + str(n - 1) + '_clusters.csv', dtype={
        #     'cluster_current': str,
        #     'cluster_next': str,
        #     'cluster_prev': str,
        #     'bridge_out_current_code': str,
        #     'bridge_out_next_code': str,
        #     'bridge_in_current_code': str,
        #     'bridge_in_next_code': str,
        # })
        #
        # df_tour = pd.merge(
        #     df_tour_next,
        #     df_tour[['index', 'order']],
        #     left_on='cluster_current',
        #     right_on='index',
        #     how='left',
        # ).sort_values('order').drop(columns=['index', 'order'])

    print(f'End step {n} with {df_tour.shape[0]} points')
    break

df_tour

steps : [4, 3, 2, 1]
Begin step 4 with 3 points
  index_current cluster_current  lat_current  lng_current index_next  \
1             2               0    49.053417     1.648367          3   
3             2               0    49.053417     1.648367          6   
7             4               0    49.021250    -2.438750          6   
5             4               0    49.021250    -2.438750          3   
0             2               0    49.053417     1.648367          1   
2             2               0    49.053417     1.648367          5   
4             4               0    49.021250    -2.438750          1   
6             4               0    49.021250    -2.438750          5   

  cluster_next   lat_next  lng_next  distance  
1            1  46.457167  3.111367       309  
3            1  45.605833  0.041000       402  
7            1  45.605833  0.041000       423  
5            1  46.457167  3.111367       503  
0            1  43.855100  0.906833       581  
2            1 

In [22]:
df_cluster_first = pd.read_csv('data/step_' + str(steps_count) + '_clusters.csv', dtype={
    'cluster_current': str,
    'cluster_next': str,
    'cluster_prev': str,
    'bridge_out_current_code': str,
    'bridge_out_next_code': str,
    'bridge_in_current_code': str,
    'bridge_in_next_code': str,
})

df_cluster_first

Unnamed: 0,cluster_current,cluster_next,cluster_prev,center_lat,center_lng,bridge_out_current_code,bridge_out_next_code,bridge_out_current_lat,bridge_out_current_lng,bridge_out_next_lat,bridge_out_next_lng,bridge_in_current_code,bridge_in_current_lat,bridge_in_current_lng,bridge_in_next_code,bridge_in_next_lat,bridge_in_next_lng
0,0,2,1,44.585472,3.397352,5,2,45.608444,3.063389,47.411833,5.403028,5,45.608444,3.063389,3,48.901076,1.302307
1,2,1,0,48.017979,6.085114,2,3,47.411833,5.403028,48.901076,1.302307,2,47.411833,5.403028,5,45.608444,3.063389
2,1,0,2,48.356955,-0.589763,3,5,48.901076,1.302307,45.608444,3.063389,3,48.901076,1.302307,2,47.411833,5.403028


In [11]:
df_distances_first = pd.read_csv('data/step_' + str(steps_count) + '_distances.csv', dtype={'index': str})
df_distances_first = df_distances_first.set_index('index')

df_distances_first

Unnamed: 0_level_0,0,1,2,3,4,5,6
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,685.0,517.0,564.0,388.0,248.0,524.0
1,685.0,0.0,169.0,402.0,483.0,437.0,691.0
2,517.0,169.0,0.0,346.0,347.0,269.0,592.0
3,564.0,402.0,346.0,0.0,622.0,389.0,305.0
4,388.0,483.0,347.0,622.0,0.0,270.0,759.0
5,248.0,437.0,269.0,389.0,270.0,0.0,489.0
6,524.0,691.0,592.0,305.0,759.0,489.0,0.0


In [17]:
df_tour_first = calcul_min_path(df_cluster_first.set_index('cluster_current'), df_distances_first)\
    .reset_index().drop(column=['order', 'next', 'prev', 'distance'])

df_tour_first

Unnamed: 0,cluster_current,cluster_next,cluster_prev,center_lat,center_lng,bridge_out_current_code,bridge_out_next_code,bridge_out_current_lat,bridge_out_current_lng,bridge_out_next_lat,...,bridge_in_current_code,bridge_in_current_lat,bridge_in_current_lng,bridge_in_next_code,bridge_in_next_lat,bridge_in_next_lng,order,next,prev,distance
0,2,1,0,48.017979,6.085114,2,3,47.411833,5.403028,48.901076,...,2,47.411833,5.403028,5,45.608444,3.063389,0,1,0,169.0
1,1,0,2,48.356955,-0.589763,3,5,48.901076,1.302307,45.608444,...,3,48.901076,1.302307,2,47.411833,5.403028,1,0,2,685.0
2,0,2,1,44.585472,3.397352,5,2,45.608444,3.063389,47.411833,...,5,45.608444,3.063389,3,48.901076,1.302307,5,2,1,517.0
