In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import logging as log

from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from scipy.spatial.distance import cdist

%matplotlib inline

In [2]:
# Input parquet file with the listings to cluster
RENT_DATA_FILE = 'rent_clean_2019_09_09T19_30_03.parquet'

# Clustering tunables, read by get_repr_points:
# max distance (in km) between coordinates for them to get "clustered"
EPSILON = 1
# min samples per cluster
MIN_SAMPLES = 1

df = pd.read_parquet(RENT_DATA_FILE)

In [4]:
def get_repr_points(lon_lat_df):
    """
    Reduce a set of coordinates to one representative point per spatial cluster.

    The coordinates are clustered with DBSCAN using the haversine metric
    (radius EPSILON km, at least MIN_SAMPLES points per cluster), and every
    cluster is then represented by its member closest to the cluster centroid.
    For details see: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

    Parameters
    ----------
    lon_lat_df : pandas.DataFrame
        Frame with 'lat' and 'lon' columns, in degrees.

    Returns
    -------
    list of tuple
        One (lat, lon) tuple per cluster, ordered by DBSCAN cluster label.
        Points that DBSCAN marks as noise (label -1, only possible when
        MIN_SAMPLES > 1) do not form a cluster and contribute no point.
        An empty input yields an empty list.
    """
    # Read from the argument, not from the notebook-global `df`.
    coords = lon_lat_df[['lat', 'lon']].to_numpy()
    if len(coords) == 0:
        # Nothing to cluster; skip the DBSCAN fit entirely.
        num_clusters = 0
        log.info(f'Found {num_clusters} clusters')
        return []

    # Mean Earth radius in km: converts the km radius into radians, which is
    # the unit the haversine metric works in.
    kms_per_radian = 6371.0088
    epsilon = EPSILON / kms_per_radian
    db = DBSCAN(eps=epsilon,
                min_samples=MIN_SAMPLES,
                algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    # Iterate over the labels that actually occur. DBSCAN uses -1 for noise,
    # which must neither be counted as a cluster nor shift the label range.
    cluster_ids = [label for label in np.unique(cluster_labels) if label != -1]
    num_clusters = len(cluster_ids)
    centermost_points = [
        get_centermost_point(coords[cluster_labels == cluster_id])
        for cluster_id in cluster_ids
    ]
    log.info(f'Found {num_clusters} clusters')
    return centermost_points
                                
def get_centermost_point(cluster):
    """
    Pick the member of a cluster that lies nearest to the cluster's centroid.

    `cluster` is a collection of coordinate pairs (the caller passes rows of
    (lat, lon)). The centroid is taken from a shapely MultiPoint built over
    the cluster, and nearness is measured as great-circle distance in metres.
    The winning point is returned as a tuple.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def metres_from_centroid(point):
        return great_circle(point, centroid).m

    return tuple(min(cluster, key=metres_from_centroid))

In [6]:
repr_coords = get_repr_points(df)

In [12]:
def closest_point(point, points):
    """
    Find the entry of `points` geographically closest to `point`.

    Coordinates are (lat, lon) pairs in degrees. Candidates are ranked by
    great-circle (haversine) distance rather than Euclidean distance on raw
    degrees: a degree of longitude is shorter than a degree of latitude away
    from the equator, so a planar metric can pick the wrong candidate.

    Parameters
    ----------
    point : sequence of two floats
        The (lat, lon) query coordinate.
    points : sequence of (lat, lon) pairs
        Candidate coordinates; must not be empty.

    Returns
    -------
    The element of `points` nearest to `point` (the first one on ties).

    Raises
    ------
    ValueError
        If `points` is empty or is not a sequence of coordinate pairs.
    """
    candidates = np.asarray(points, dtype=float)
    if candidates.ndim != 2 or candidates.shape[0] == 0:
        raise ValueError('points must be a non-empty sequence of (lat, lon) pairs')

    lat0, lon0 = np.radians(np.asarray(point, dtype=float))
    lats = np.radians(candidates[:, 0])
    lons = np.radians(candidates[:, 1])

    # Haversine term; the distance 2*R*arcsin(sqrt(a)) grows monotonically
    # with `a`, so the smallest `a` marks the nearest candidate.
    a = (np.sin((lats - lat0) / 2.0) ** 2
         + np.cos(lat0) * np.cos(lats) * np.sin((lons - lon0) / 2.0) ** 2)
    return points[int(a.argmin())]

# Pair each row's latitude and longitude into a single (lat, lon) tuple
df['coords_tuple'] = list(zip(df['lat'], df['lon']))

# Attach to every row the representative point nearest to its own coordinates
df['closest_coords_center'] = [
    closest_point(row_coords, repr_coords) for row_coords in df['coords_tuple']
]

In [15]:
# Show each row's own coordinates next to the representative point assigned to it
df[['coords_tuple', 'closest_coords_center']]

Unnamed: 0,coords_tuple,closest_coords_center
0,"(54.5623296, 18.4882319)","(54.555969999999995, 18.5081)"
1,"(52.4454, 16.8784)","(52.442506900000005, 16.893565600000002)"
2,"(51.2592, 22.4842)","(51.277452000000004, 22.503736800000002)"
3,"(53.48924375, 18.7625187)","(53.492510200000005, 18.7519414)"
4,"(52.3904, 16.8995)","(52.4085, 16.9198)"
5,"(51.7880708, 19.469314800000003)","(51.7623451, 19.4578512)"
6,"(51.404202500000004, 16.1938917)","(51.404202500000004, 16.1938917)"
7,"(53.1398539, 18.0257721)","(53.12424, 18.00303)"
8,"(53.0174, 18.5712)","(53.026121700000004, 18.5496221)"
9,"(50.8122, 19.1124)","(50.81439493, 19.11586761)"
