In [280]:
from typing import List

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from folium import Map, CircleMarker, PolyLine, FeatureGroup

from formation.helpers.map import calcul_distances, calcul_distance, calcul_centroids, calcul_min_path

In [281]:
# Qualitative Plotly palette shared by the charts and the folium map markers.
COLORS = px.colors.qualitative.Vivid

# Number of cities drawn at random from the full cities dataset.
city_count = 70

In [282]:
# Load the cities file, give the columns short names, index the rows by
# INSEE code and keep only the name and the town-hall coordinates.
df_cities = (
    pd.read_csv(
        '../data/formated/cities-france.csv',
        dtype={
            'code_insee': str,
            'dep_code': str,
            'nom_standard': str,
            'latitude_mairie': float,
            'longitude_mairie': float,
        },
    )
    .rename(
        columns={
            'code_insee': 'index',
            'nom_standard': 'name',
            'latitude_mairie': 'lat',
            'longitude_mairie': 'lng',
        }
    )
    .set_index('index')
    [['name', 'lat', 'lng']]
)

In [283]:
# Seeded draw: without a fixed random_state every run picks different cities,
# so none of the clusters, bridges or tours below could be reproduced.
df_cities_random = df_cities.sample(city_count, random_state=42)

df_cities_random

Unnamed: 0_level_0,name,lat,lng
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
47225,Roquefort,44.174,0.561
71399,Saint-Christophe-en-Brionnais,46.288,4.178
42062,Chevrières,45.589,4.400
90104,Vézelois,47.607,6.918
35001,Acigné,48.134,-1.534
...,...,...,...
01309,Pouillat,46.327,5.428
43027,Berbezit,45.285,3.596
67315,Neewiller-près-Lauterbourg,48.954,8.125
80293,Ételfay,49.662,2.620


In [284]:
# calcul_distances yields the points frame (it carries the 'x' / 'y' columns
# used below) together with the point-to-point distance frame.
df_points, df_points_distances = calcul_distances(df_cities_random)

# Planar coordinates as a plain two-column array for the scikit-learn models.
points = df_points[['x', 'y']].to_numpy()

df_points


The default value of `n_init` will change from 4 to 1 in 1.9.



Unnamed: 0_level_0,name,lat,lng,x,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01309,Pouillat,46.327,5.428,-115.881765,-187.902254
01384,Saint-Rambert-en-Bugey,45.949,5.440,-97.540557,-225.797432
02546,La Neuville-en-Beine,49.676,3.151,-139.988077,220.537334
03077,Chirat-l'Église,46.240,3.033,51.033435,-110.463983
03097,Deneuille-les-Mines,46.378,2.782,60.584222,-87.710448
...,...,...,...,...,...
90104,Vézelois,47.607,6.918,-280.557226,-110.736563
94047,Mandres-les-Roses,48.702,2.544,-49.152980,146.988407
94070,Santeny,48.726,2.570,-52.009398,148.461769
95436,Mours,49.134,2.271,-55.089620,198.617786


In [285]:
# Elbow method: K-Means inertia and Gaussian-mixture BIC (one curve per
# covariance type) are recorded for each candidate number of clusters.
clustering_type = ['kmeans', 'gaussian']
gaussian_type = ['full', 'tied', 'diag', 'spherical']

measures = {'kmeans': []}
for t in gaussian_type:
    measures['gaussian_' + t] = []

# Candidate cluster counts: from 1 up to the number of points, capped at
# len(COLORS) - 1.
n_clusters_list = range(
    1,
    min(df_points.shape[0], len(COLORS) - 1) + 1,
)

for k in n_clusters_list:
    # Fixed seed so the inertia curve is reproducible from one run to the
    # next, like the seeded Gaussian mixtures below.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(points)
    measures['kmeans'].append(kmeans.inertia_)

    for t in gaussian_type:
        gaussian = GaussianMixture(n_components=k, covariance_type=t, random_state=42)
        gaussian.fit(points)
        measures['gaussian_' + t].append(gaussian.bic(points))

# Inertia and BIC are on different scales: standardise every curve
# independently so that they can share a single y-axis.
measures = {
    name: StandardScaler().fit_transform(np.array(values).reshape(-1, 1)).reshape(-1)
    for name, values in measures.items()
}

fig = px.line(
    pd.DataFrame(measures, index=n_clusters_list),
    title='Méthode du Coude',
    labels={
        'index': 'Nombre de Clusters',
        'value': "Score",
    },
    color_discrete_map={
        "kmeans": COLORS[0],
        "gaussian_full": COLORS[1],
        "gaussian_tied": COLORS[2],
        "gaussian_diag": COLORS[3],
        "gaussian_spherical": COLORS[4],
    }
)

fig.update_layout(showlegend=True)
fig.show()
fig.write_html("coude.html")

In [297]:
# Number of clusters retained for each model after reading the elbow chart.
k_kmeans = 8
k_gaussian = 8
# Deliberately NOT named `gaussian_type`: that name holds the list of
# covariance types iterated by the elbow cell and must not be rebound to a str.
gaussian_covariance_type = 'diag'

# Fixed seed so the K-Means labels are reproducible, like the Gaussian mixture.
model_kmeans = KMeans(n_clusters=k_kmeans, random_state=42)
model_kmeans.fit(points)
df_points['cluster_kmeans'] = model_kmeans.predict(points)
# Labels are stored as strings so Plotly treats them as discrete categories.
df_points['cluster_kmeans'] = df_points['cluster_kmeans'].astype(str)

model_gaussian = GaussianMixture(
    n_components=k_gaussian,
    covariance_type=gaussian_covariance_type,
    random_state=42,
)
model_gaussian.fit(points)
df_points['cluster_gaussian'] = model_gaussian.predict(points)
df_points['cluster_gaussian'] = df_points['cluster_gaussian'].astype(str)

# One scatter plot per clustering, with equal x / y scale.
for c in ['cluster_kmeans', 'cluster_gaussian']:
    fig = px.scatter(
        df_points.reset_index(),
        x='x',
        y='y',
        color=c,
        hover_name='index',
        height=700,
        size=[1] * df_points.shape[0],
        color_discrete_sequence=COLORS,
        title=f"{c}"
    )

    fig.update_yaxes(scaleanchor="x", scaleratio=1)

    fig.show()

df_points

Unnamed: 0_level_0,name,lat,lng,x,y,cluster_kmeans,cluster_gaussian
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
01309,Pouillat,46.327,5.428,-115.881765,-187.902254,18,17
01384,Saint-Rambert-en-Bugey,45.949,5.440,-97.540557,-225.797432,18,17
02546,La Neuville-en-Beine,49.676,3.151,-139.988077,220.537334,0,6
03077,Chirat-l'Église,46.240,3.033,51.033435,-110.463983,1,12
03097,Deneuille-les-Mines,46.378,2.782,60.584222,-87.710448,1,12
...,...,...,...,...,...,...,...
90104,Vézelois,47.607,6.918,-280.557226,-110.736563,7,8
94047,Mandres-les-Roses,48.702,2.544,-49.152980,146.988407,10,11
94070,Santeny,48.726,2.570,-52.009398,148.461769,10,11
95436,Mours,49.134,2.271,-55.089620,198.617786,10,11


In [287]:
# Keep a single clustering: its label column becomes 'cluster', the other
# clustering's label column is discarded.
clustering_selected = 'gaussian'
clustering_dropped = 'kmeans'

df_points_selected = (
    df_points
    .drop(columns=['cluster_' + clustering_dropped])
    .rename(columns={'cluster_' + clustering_selected: 'cluster'})
)

df_points_selected

Unnamed: 0_level_0,name,lat,lng,x,y,cluster
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01309,Pouillat,46.327,5.428,-115.881765,-187.902254,2
01384,Saint-Rambert-en-Bugey,45.949,5.440,-97.540557,-225.797432,2
02546,La Neuville-en-Beine,49.676,3.151,-139.988077,220.537334,5
03077,Chirat-l'Église,46.240,3.033,51.033435,-110.463983,2
03097,Deneuille-les-Mines,46.378,2.782,60.584222,-87.710448,2
...,...,...,...,...,...,...
90104,Vézelois,47.607,6.918,-280.557226,-110.736563,0
94047,Mandres-les-Roses,48.702,2.544,-49.152980,146.988407,1
94070,Santeny,48.726,2.570,-52.009398,148.461769,1
95436,Mours,49.134,2.271,-55.089620,198.617786,1


In [288]:
# Centroid frame and centroid-to-centroid distance frame, both produced by
# calcul_centroids from the selected clustering.
df_centroids, df_centroids_distances = calcul_centroids(df_points_selected)

# Scatter of the centroids with constant marker size and equal x / y scale.
fig = px.scatter(
    df_centroids.reset_index(),
    x='x',
    y='y',
    hover_name='index',
    height=700,
    size=[1] * len(df_centroids),
)
fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.show()

df_centroids


The default value of `n_init` will change from 4 to 1 in 1.9.



Unnamed: 0,lat,lng,x,y
0,47.341667,6.4855,-278.152534,67.111282
1,48.9515,1.9755,92.521756,146.864399
2,45.802167,4.558722,-187.490084,-139.934353
3,46.607333,-0.871667,239.327382,-155.918528
4,44.0041,0.9625,34.891883,-406.006867
5,50.016929,3.0585,47.091739,281.323655
6,48.079857,-1.109286,291.501456,0.694252
7,48.641,6.553,-239.691598,205.866159


In [289]:
# Determine the visiting order of the clusters by computing the shortest path
# through their centroids.
df_centroids_min_path = calcul_min_path(df_centroids, df_centroids_distances)

# 'pair' identifies the hop "<current cluster>_<next cluster>"; it is the join
# key used later to attach the bridge cities to each hop.
df_centroids_min_path['pair'] = df_centroids_min_path.index + '_' + df_centroids_min_path['next']

# sort_values returns a new frame: the result must be kept, otherwise the rows
# are not guaranteed to be in tour order, and later cells rely on row order.
df_centroids_min_path = (
    df_centroids_min_path
    .sort_values('order', ascending=True)
    .reset_index()
)

df_centroids_min_path

Unnamed: 0,index,lat,lng,x,y,order,next,prev,distance,pair
0,2,45.802167,4.558722,-187.490084,-139.934353,0,0,4,226.0,2_0
1,0,47.341667,6.4855,-278.152534,67.111282,1,7,2,145.0,0_7
2,7,48.641,6.553,-239.691598,205.866159,2,5,0,296.0,7_5
3,5,50.016929,3.0585,47.091739,281.323655,3,1,7,142.0,5_1
4,1,48.9515,1.9755,92.521756,146.864399,4,6,5,247.0,1_6
5,6,48.079857,-1.109286,291.501456,0.694252,5,3,1,165.0,6_3
6,3,46.607333,-0.871667,239.327382,-155.918528,6,4,6,323.0,3_4
7,4,44.0041,0.9625,34.891883,-406.006867,7,2,3,347.0,4_2


In [290]:
def cross_distance(row):
    """Return calcul_distance between the '_current' and '_next' points of a row.

    `row` must expose 'lat_current', 'lng_current', 'lat_next' and 'lng_next'
    (the columns produced by the cross join with those suffixes).
    """
    origin = (row['lat_current'], row['lng_current'])
    target = (row['lat_next'], row['lng_next'])

    return calcul_distance(*origin, *target)

# Determine the bridges between clusters: for every hop of the centroid tour,
# the closest pair of cities (one in the current cluster, one in the next).
bridges = []

for index, row_cluster in df_centroids_min_path.iterrows():
    df_current = df_points_selected[df_points_selected['cluster'] == row_cluster['index']][['cluster', 'lat', 'lng']]
    df_next = df_points_selected[df_points_selected['cluster'] == row_cluster['next']][['cluster', 'lat', 'lng']]

    # Every (current city, next city) combination with its distance.
    df_cross = df_current.reset_index()\
        .merge(df_next.reset_index(), how='cross', suffixes=("_current", "_next"))

    df_cross['distance'] = df_cross.apply(
        cross_distance,
        axis=1
    )

    df_bridge = df_cross.sort_values(by='distance', ascending=True)[[
        'index_current', 'cluster_current', 'index_next', 'cluster_next'
    ]].head(1)

    # Collected in a list and concatenated once, instead of re-copying the
    # accumulated frame on every iteration.
    bridges.append(df_bridge)

df_crossed = pd.concat(bridges)

df_crossed['pair'] = df_crossed['cluster_current'] + '_' + df_crossed['cluster_next']

# Attach the bridge city codes to each hop of the centroid tour.
df_clusters = pd.merge(
    df_centroids_min_path,
    df_crossed,
    left_on='pair',
    right_on='pair',
)[['index', 'next', 'prev', 'lat', 'lng', 'pair', 'index_current', 'index_next']]

# Coordinates of the outgoing bridge city located in the current cluster.
df_clusters = pd.merge(
    df_clusters,
    df_points_selected[['lat', 'lng']],
    left_on='index_current',
    right_index=True,
    suffixes=('', '_current')
)

# Coordinates of the outgoing bridge city located in the next cluster.
df_clusters = pd.merge(
    df_clusters,
    df_points_selected[['lat', 'lng']],
    left_on='index_next',
    right_index=True,
    suffixes=('', '_next')
)

df_clusters = df_clusters.drop(columns=['pair'])\
    .rename(columns={
        'index': 'cluster_current',
        'next': 'cluster_next',
        'prev': 'cluster_prev',
        'lat': 'center_lat',
        'lng': 'center_lng',
        'index_current': 'bridge_out_current_code',
        'lat_current': 'bridge_out_current_lat',
        'lng_current': 'bridge_out_current_lng',
        'index_next': 'bridge_out_next_code',
        'lat_next': 'bridge_out_next_lat',
        'lng_next': 'bridge_out_next_lng',
    })

# The incoming bridge of a row is the outgoing bridge of the previous row, and
# the first row wraps around to the last one (closed tour). np.roll performs
# exactly this circular shift and, unlike shift(fill_value=''), it does not
# turn the float lat / lng columns into object columns.
bridge_in_sources = {
    'bridge_in_current_code': 'bridge_out_next_code',
    'bridge_in_current_lat': 'bridge_out_next_lat',
    'bridge_in_current_lng': 'bridge_out_next_lng',
    'bridge_in_next_code': 'bridge_out_current_code',
    'bridge_in_next_lat': 'bridge_out_current_lat',
    'bridge_in_next_lng': 'bridge_out_current_lng',
}

for bridge_in_column, bridge_out_column in bridge_in_sources.items():
    df_clusters[bridge_in_column] = np.roll(df_clusters[bridge_out_column].to_numpy(), 1)

df_clusters.T

Unnamed: 0,0,1,2,3,4,5,6,7
cluster_current,2.0,0.0,7.0,5.0,1.0,6.0,3.0,4.0
cluster_next,0.0,7.0,5.0,1.0,6.0,3.0,4.0,2.0
cluster_prev,4.0,2.0,0.0,7.0,5.0,1.0,6.0,3.0
center_lat,45.802167,47.341667,48.641,50.016929,48.9515,48.079857,46.607333,44.0041
center_lng,4.558722,6.4855,6.553,3.0585,1.9755,-1.109286,-0.871667,0.9625
bridge_out_current_code,71353.0,68300.0,52047.0,80293.0,76706.0,49064.0,17221.0,81028.0
bridge_out_next_code,39182.0,54039.0,8315.0,95604.0,53211.0,79047.0,24586.0,43132.0
bridge_out_current_lat,46.53,47.79,48.318,49.662,49.714,47.69,46.124,43.696
bridge_out_current_lng,4.872,7.37,4.833,2.62,0.401,-0.683,-0.608,2.565
bridge_out_next_lat,47.069,48.449,49.761,49.099,48.342,46.829,45.263,45.12


In [291]:
def calcul_near_point(index: str, points: List[str], df_distances: pd.DataFrame)->str:
    """Return the label in `points` with the smallest distance from `index`.

    Distances are read from row `index` of `df_distances`, restricted to the
    columns listed in `points`.
    """
    candidates = df_distances.loc[index, points].to_dict()

    return min(candidates, key=candidates.get)

In [292]:
def calcul_path(
        indexes_started: List[str],
        indexes_ended: List[str],
        indexes_between: List[str],
        df_distances: pd.DataFrame,
) -> List[str]:
    """Greedily order the points lying between a start path and an end path.

    The path is grown from both extremities in turn: the remaining point
    nearest to the last point of the start side is appended to that side, then
    the remaining point nearest to the last point of the end side is appended
    to the end side, until no point is left.

    Args:
        indexes_started: labels already placed at the beginning of the path.
        indexes_ended: labels already placed at the end of the path, listed
            from the final point inwards.
        indexes_between: labels still to be placed.
        df_distances: distance frame queried through calcul_near_point.

    Returns:
        The start side followed by the end side in reverse order.

    The arguments are copied, so the caller's lists are left untouched, and
    the work is done in a loop, so the number of points is not limited by the
    interpreter's recursion depth.
    """
    head = list(indexes_started)
    tail = list(indexes_ended)
    remaining = list(indexes_between)

    while len(remaining) > 0:
        # Extend the start side with the point nearest to its current tip.
        nearest = calcul_near_point(head[-1], remaining, df_distances)
        head.append(nearest)
        remaining = [label for label in remaining if label != nearest]

        if len(remaining) > 0:
            # Then extend the end side in the same way.
            nearest = calcul_near_point(tail[-1], remaining, df_distances)
            tail.append(nearest)
            remaining = [label for label in remaining if label != nearest]

    return head + tail[::-1]


In [295]:
# Build the full tour: inside each cluster (taken in centroid-tour order) the
# cities are ordered from the incoming bridge city to the outgoing bridge city.
tour_parts = []

df_points_selected_with_index = df_points_selected.reset_index()

for index, row_cluster in df_clusters.iterrows():
    # Only the city codes of the current cluster are needed here.
    df_current = df_points_selected[df_points_selected['cluster'] == row_cluster['cluster_current']][[]].reset_index()

    code_in = row_cluster['bridge_in_current_code']
    code_out = row_cluster['bridge_out_current_code']

    # .iloc[0] raises if a bridge city is not part of the current cluster.
    start = df_current.loc[df_current['index'] == code_in, 'index'].iloc[0]
    end = df_current.loc[df_current['index'] == code_out, 'index'].iloc[0]
    between = df_current.loc[
        (df_current['index'] != code_in) & (df_current['index'] != code_out),
        'index'
    ].tolist()

    indexes = calcul_path(
        indexes_started=[start],
        indexes_ended=[end],
        indexes_between=between,
        df_distances=df_points_distances
    )

    df_current['order'] = df_current['index'].map({
        code: position
        for position, code in enumerate(indexes)
    })

    # Collected in a list and concatenated once after the loop.
    tour_parts.append(df_current.sort_values('order').drop(columns=['order']))

df_tour = pd.concat(tour_parts, ignore_index=True).merge(
    df_points_selected,
    left_on='index',
    right_index=True,
    how='left',
)

# Coordinates of the following stop; the last stop wraps around to the first
# one (closed tour). np.roll performs this circular shift while keeping the
# columns numeric, whereas shift(fill_value='') turned them into object columns.
df_tour['next_lat'] = np.roll(df_tour['lat'].to_numpy(), -1)
df_tour['next_lng'] = np.roll(df_tour['lng'].to_numpy(), -1)

df_tour

Unnamed: 0,index,name,lat,lng,x,y,cluster,next_lat,next_lng
0,43132,Mazeyrat-d'Allier,45.120,3.528,77.175614,-238.077251,2,45.285,3.596
1,43027,Berbezit,45.285,3.596,63.623081,-224.580045,2,45.182,3.747
2,43043,Céaux-d'Allègre,45.182,3.747,58.698779,-240.137106,2,45.589,4.4
3,42062,Chevrières,45.589,4.400,-7.537447,-224.199015,2,45.219,5.131
4,26210,Valherbasse,45.219,5.131,-38.984201,-286.957842,2,45.139,5.793
...,...,...,...,...,...,...,...,...,...
65,31505,Saint-Michel,43.168,1.084,354.370269,-330.812305,4,43.35,0.978
66,31543,Sénarens,43.350,0.978,351.485892,-309.089554,4,43.911,1.593
67,81178,Montgaillard,43.911,1.593,277.069360,-280.379752,4,43.307,2.127
68,11070,Carlipa,43.307,2.127,273.324509,-359.675679,4,43.696,2.565


In [296]:
# Interactive map of the tour: one filled marker per city, a black segment to
# the following stop, and one hollow ring per cluster centre.
m = Map(
    location=(46.227638, 2.213749),
    zoom_start=6,
)

fg = FeatureGroup(name="Markers")

for i, point in df_tour.iterrows():
    fg.add_child(
        CircleMarker(
            location=[
                point['lat'],
                point['lng'],
            ],
            tooltip=f"{point['index']} ({point['cluster']})",
            fill=True,
            # Wrap around the palette so that a cluster count larger than
            # len(COLORS) reuses colours instead of raising IndexError.
            color=COLORS[int(point['cluster']) % len(COLORS)],
            weight=0,
            fill_opacity=0.6,
            radius=15,
        )
    )

    # Segment from this stop to the next stop of the tour.
    fg.add_child(
        PolyLine(
            locations=[
                [point['lat'], point['lng']],
                [point['next_lat'], point['next_lng']],
            ],
            color="black",
            weight=4,
        )
    )

for i, row_cluster in df_clusters.iterrows():
    fg.add_child(
        CircleMarker(
            location=[
                row_cluster['center_lat'],
                row_cluster['center_lng'],
            ],
            tooltip=str(row_cluster['cluster_current']),
            fill=False,
            color=COLORS[int(row_cluster['cluster_current']) % len(COLORS)],
            weight=5,
            radius=11,
        )
    )

m.add_child(fg)

m.save('map.html')

m