In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import logging as log

from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

%matplotlib inline

In [None]:
# --- clustering configuration -------------------------------------------
# Minimum number of samples DBSCAN needs to form a cluster.
MIN_SAMPLES = 1
# Maximum distance (in km) between two coordinates for them to be
# "clustered" together.
EPSILON = 1

# --- input data ----------------------------------------------------------
# Rent data set snapshot, loaded from a parquet file in the working directory.
df = pd.read_parquet('rent_clean_2019_09_09T19_30_03.parquet')

In [None]:
def get_repr_points(lon_lat_df):
    """
    Get one representative (lat, lon) point per spatial cluster of a dataset.

    The coordinates are clustered with DBSCAN using the haversine metric, and
    each cluster is reduced to its centermost member.
    For details see: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

    Parameters
    ----------
    lon_lat_df : pandas.DataFrame
        Frame with a 'lat' and a 'lon' column (converted to radians before
        clustering, so they are expected to be in decimal degrees).

    Returns
    -------
    list of tuple
        One (lat, lon) tuple per cluster. Points DBSCAN labels as noise (-1)
        are not represented; with MIN_SAMPLES == 1 there is no noise.
    """
    # Use the frame that was passed in, not the notebook-global `df`.
    coords = lon_lat_df[['lat', 'lon']].to_numpy()
    if len(coords) == 0:
        # DBSCAN.fit rejects empty input; nothing to cluster anyway.
        log.info('Found 0 clusters')
        return []

    # EPSILON is given in km, but the haversine metric works on radians.
    kms_per_radian = 6371.0088
    epsilon = EPSILON / kms_per_radian
    db = DBSCAN(eps=epsilon,
                min_samples=MIN_SAMPLES,
                algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_

    # Label -1 marks noise, not a cluster; counting it would make the loop
    # below select an empty "cluster" and break the centroid computation.
    cluster_ids = [label for label in np.unique(cluster_labels) if label != -1]
    num_clusters = len(cluster_ids)

    centermost_points = [
        get_centermost_point(coords[cluster_labels == label])
        for label in cluster_ids
    ]
    log.info(f'Found {num_clusters} clusters')
    return centermost_points
                                
def get_centermost_point(cluster):
    """
    Get the most "central" member of a cluster found by DBSCAN.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        The points of one cluster. get_repr_points passes rows of
        (lat, lon); the pair order must be the one great_circle expects.

    Returns
    -------
    tuple
        The member of `cluster` with the smallest great-circle distance to
        the cluster's centroid.
    """
    # Build the geometry once; x/y follow the column order of `cluster`.
    centroid_geom = MultiPoint(cluster).centroid
    centroid = (centroid_geom.x, centroid_geom.y)
    # Return an actual data point rather than the synthetic centroid.
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

In [None]:
# Reduce the data set to one representative coordinate per cluster.
# `coords` only exists as a local inside get_repr_points, so passing it here
# raises NameError on a fresh kernel; the function expects the data frame.
repr_coords = get_repr_points(df)
centers = pd.DataFrame(repr_coords, columns=['lat', 'lon'])