In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import logging as log

from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
from scipy.spatial.distance import cdist

%matplotlib inline

In [7]:
# Clustering parameters consumed by get_repr_points.
# EPSILON: maximum distance (in km) between coordinates for them to be "clustered" together
EPSILON = 1
# MIN_SAMPLES: minimum number of samples per cluster
MIN_SAMPLES = 1

# Input frame; the cells below read its 'lat', 'lon' and 'price_m2' columns.
df = pd.read_parquet('sale_clean_2019_09_09T19_29_51.parquet')

In [8]:
# number of rows loaded
df.shape[0]

147178

In [3]:
def get_repr_points(lon_lat_df):
    """
    Get representative (lat, lon) points for a given lat/lon dataset.

    The coordinates are clustered with DBSCAN (haversine metric, EPSILON km
    neighbourhood radius, MIN_SAMPLES minimum cluster size) and every cluster
    is reduced to the single member chosen by get_centermost_point.
    For details see: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

    Parameters
    ----------
    lon_lat_df : pandas.DataFrame
        Frame with 'lat' and 'lon' columns. Values are converted with
        np.radians before clustering, so they are expected in degrees.

    Returns
    -------
    list of tuple
        One (lat, lon) tuple per cluster, ordered by ascending cluster label.
        Points that DBSCAN labels as noise (-1) do not form a cluster and
        therefore contribute no representative point.
    """
    # Cluster the frame that was passed in, not the notebook-level `df`.
    coords = lon_lat_df[['lat', 'lon']].to_numpy()
    KMS_PER_RADIAN = 6371.0088
    # EPSILON is given in km; dividing by km-per-radian expresses it in radians,
    # matching the np.radians(coords) input below.
    epsilon = EPSILON / KMS_PER_RADIAN
    db = DBSCAN(eps=epsilon,
                min_samples=MIN_SAMPLES,
                algorithm='ball_tree',
                metric='haversine').fit(np.radians(coords))
    cluster_labels = db.labels_
    # Label -1 marks noise rather than a cluster; counting it would produce an
    # off-by-one label range and hand an empty array to get_centermost_point.
    cluster_ids = [label for label in np.unique(cluster_labels) if label != -1]
    num_clusters = len(cluster_ids)
    centermost_points = [get_centermost_point(coords[cluster_labels == label])
                         for label in cluster_ids]
    log.info(f'Found {num_clusters} clusters')
    return centermost_points
                                
def get_centermost_point(cluster):
    """
    Return the member of a cluster that lies closest to the cluster's centroid.

    `cluster` is a sequence of coordinate pairs (get_repr_points passes the
    (lat, lon) rows belonging to one DBSCAN cluster). The centroid is taken
    from a shapely MultiPoint built from those pairs, and closeness is the
    geopy great-circle distance in metres. The winning member is returned as
    a tuple.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def metres_from_centroid(point):
        return great_circle(point, centroid).m

    nearest_member = min(cluster, key=metres_from_centroid)
    return tuple(nearest_member)

In [4]:
# one representative coordinate tuple per cluster found in df
repr_coords = get_repr_points(lon_lat_df=df)

In [5]:
def closest_point(point, points):
    """
    Return the entry of `points` that is nearest to `point`.

    Distances come from scipy's cdist called with its default metric on the
    raw coordinate values. The result is the original element of `points`
    at the position of the smallest distance.
    """
    distances = cdist([point], points)
    nearest_idx = distances.argmin()
    return points[nearest_idx]

# Pair each row's latitude and longitude into a single (lat, lon) tuple.
df['coords_tuple'] = list(zip(df['lat'], df['lon']))

# Replace every row's coordinates with the nearest representative coordinate.
df['coords_closest_tuple'] = [
    closest_point(row_coords, repr_coords)
    for row_coords in df['coords_tuple']
]

In [6]:
# Mean price_m2 per representative coordinate, sorted ascending by that mean;
# the 1-based position in this ordering becomes the coordinate's category.
center_coords = (
    df[['coords_closest_tuple', 'price_m2']]
    .groupby('coords_closest_tuple', as_index=False)
    .mean()
    .sort_values(by='price_m2')
    # fresh 0..n-1 index so that index + 1 encodes the price rank
    .reset_index(drop=True)
    .rename(columns={'price_m2': 'coords_mean_price_m2'})
    .assign(coords_category=lambda ranked: ranked.index + 1)
)

In [7]:
# Show the lookup table: representative coordinate, its mean price_m2, and the
# 1-based category assigned in ascending order of that mean.
center_coords

Unnamed: 0,coords_closest_tuple,coords_mean_price_m2,coords_category
0,"(50.06232241, 18.66123551)",0.080000,1
1,"(50.5316, 18.7144)",1.400000,2
2,"(51.9533, 18.3914)",1.740000,3
3,"(51.802229700000005, 15.7172237)",2.050000,4
4,"(50.4124, 18.2642)",2.400000,5
5,"(49.7818658, 18.8030232)",2.650000,6
6,"(52.2377975, 17.2747117)",2.700000,7
7,"(50.525833, 18.289167000000003)",3.000000,8
8,"(49.3625, 20.8887)",3.130000,9
9,"(52.931909999999995, 17.961)",4.000000,10
