In [None]:
!pip3 install geopy
!pip3 install shapely
!pip3 install folium

In [27]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
from scipy.spatial.distance import cdist
import folium
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

%matplotlib inline

INPUT_FILE = 'sale_data_10_03_2020.parquet'


def get_coords_map(df):
    """
    Build the cluster "encoding map": one row per cluster centre together with
    the mean prices of all offers assigned to that centre.

    Steps:
      1. drop offers that share identical coordinates,
      2. find representative ("centre") points with ``get_repr_points``,
      3. assign every offer in ``df`` to its closest centre,
      4. average ``price_m2__offer`` / ``price__offer`` per centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``lat__offer``, ``lon__offer``, ``price_m2__offer``
        and ``price__offer``. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``cluster_mean_price_m2``, ``cluster_mean_price``,
        ``lat__offer``, ``lon__offer`` and ``cluster_id``. Rows are sorted by
        ascending ``cluster_mean_price_m2`` and ``cluster_id`` is the 1-based
        position in that order.
    """
    # remove "artificial" duplicates (several offers at exactly the same spot)
    df_unduped = df.drop_duplicates(subset=['lon__offer', 'lat__offer'], keep="last")

    center_coords_df = get_repr_points(df_unduped)
    center_coords_df = add_zipped_coords_column(center_coords_df, 'coords_tuple')
    # Materialise the candidate centres once instead of once per offer.
    center_points = list(center_coords_df['coords_tuple'])

    # Work on a private copy of just the needed columns so the caller's frame
    # never gains helper columns (and no SettingWithCopyWarning is triggered
    # when the caller hands in a slice).
    offers = df.loc[
        :, ['lat__offer', 'lon__offer', 'price_m2__offer', 'price__offer']
    ].copy()
    offers = add_zipped_coords_column(offers, 'coords_tuple')

    # assign a closest point
    offers["coords_closest_tuple"] = [
        closest_point(offer_point, center_points)
        for offer_point in offers['coords_tuple']
    ]

    # coords encoding map is just center coords df with "mean" values assigned
    coords_encoding_map = (
        offers.loc[:, ["coords_closest_tuple", 'price_m2__offer', 'price__offer']]
        .groupby("coords_closest_tuple", as_index=False)
        .mean()
        .sort_values(by='price_m2__offer')
        .reset_index(drop=True)
        .rename(columns={
            'price_m2__offer': 'cluster_mean_price_m2',
            'price__offer': 'cluster_mean_price',
        })
        .pipe(unzip_coord_series_to_lon_and_lat, "coords_closest_tuple")
    )
    # ids follow the price ordering established by sort_values above
    coords_encoding_map['cluster_id'] = coords_encoding_map.index + 1
    return coords_encoding_map


def get_repr_points(lon_lat_df, epsilon_km=3, min_samples=1):
    """
    Returns a dataframe with "center" points - only lat and lon columns. Based on:
    https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

    Parameters
    ----------
    lon_lat_df : pandas.DataFrame
        Needs the columns ``lat__offer`` and ``lon__offer`` (degrees).
    epsilon_km : float, default 3
        Max distance (in km) between coordinates to get "clustered".
    min_samples : int, default 1
        Min samples per cluster. With values above 1 DBSCAN may mark points
        as noise; such points do not produce a centre.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with the columns ``lat__offer`` and
        ``lon__offer`` of the cluster's centermost member.
    """
    KMS_PER_RADIAN = 6371.0088
    result_columns = ['lat__offer', 'lon__offer']

    coords = lon_lat_df[result_columns].to_numpy()
    if len(coords) == 0:
        # DBSCAN cannot be fitted on zero samples; nothing to cluster.
        return pd.DataFrame(columns=result_columns)

    # the haversine metric works on radians, so the radius is converted too
    epsilon = epsilon_km / KMS_PER_RADIAN

    print("Starting DBScan alghorithm ...")
    db = DBSCAN(
        eps=epsilon, min_samples=min_samples, algorithm="ball_tree", metric="haversine"
    ).fit(np.radians(coords))

    cluster_labels = db.labels_
    # DBSCAN uses the label -1 for noise; only real clusters get a centre.
    cluster_ids = np.unique(cluster_labels)
    cluster_ids = cluster_ids[cluster_ids >= 0]
    num_clusters = len(cluster_ids)

    centermost_points = [
        get_centermost_point(coords[cluster_labels == cluster_id])
        for cluster_id in cluster_ids
    ]
    print(f"DBScan algoritm found {num_clusters} clusters.")

    return pd.DataFrame(centermost_points, columns=result_columns)


def get_centermost_point(cluster):
    """
    Get the most "center" point for a cluster according to DBscan.

    Parameters
    ----------
    cluster : sequence of (lat, lon) pairs
        Members of a single cluster; must not be empty.

    Returns
    -------
    tuple
        The cluster member with the smallest great-circle distance to the
        cluster's geometric centroid.
    """
    # Build the geometry once; both centroid coordinates come from it.
    centroid_geom = MultiPoint(cluster).centroid
    centroid = (centroid_geom.x, centroid_geom.y)
    # get an actual point not a centroid
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

def closest_point(point, points):
    """ Return the entry of `points` (a list of coordinate tuples) that lies nearest to `point`. """
    # one row of distances: from `point` to every candidate
    distances = cdist([point], points)
    nearest_idx = distances.argmin()
    return points[nearest_idx]


def unzip_coord_series_to_lon_and_lat(df, zipped_colname):
    """
    Split a column of coordinate tuples into ``lat__offer`` and ``lon__offer``.

    Element 0 of every tuple becomes ``lat__offer`` and element 1 becomes
    ``lon__offer``; the tuple column itself is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tuple column. It is not modified.
    zipped_colname : str
        Name of the column with the coordinate tuples.

    Returns
    -------
    pandas.DataFrame
        New frame with the two coordinate columns appended and
        ``zipped_colname`` removed.
    """
    zipped = df[zipped_colname]
    # assign() builds a new frame, so the caller's `df` keeps its columns.
    return df.assign(
        lat__offer=zipped.apply(lambda coords: coords[0]),
        lon__offer=zipped.apply(lambda coords: coords[1]),
    ).drop(columns=[zipped_colname])


def add_zipped_coords_column(df, new_col_name):
    """
    Zips lat and lon columns to create a column of ``(lat, lon)`` tuples.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``lat__offer`` and ``lon__offer``. It is not
        modified.
    new_col_name : str
        Name of the tuple column to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional ``new_col_name`` column.
    """
    # Copy first so the caller's frame (possibly a slice of a bigger one)
    # does not silently gain a column.
    result = df.copy()
    result[new_col_name] = list(zip(df['lat__offer'], df['lon__offer']))
    return result

def plot_points_on_map(df, seed=42, tiles='Stamen Toner'):
    """
    Draw every row of ``df`` as a coloured circle on a folium map and display it.

    Assumes lat__offer, lon__offer and cluster_id columns are present.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point to plot.
    seed : int or None, default 42
        Seed for the palette shuffle, so the same colours come back on every
        run. Pass ``None`` for a different shuffle each time.
    tiles : str, default 'Stamen Toner'
        Tile set handed to ``folium.Map``.
        NOTE(review): recent folium releases may no longer ship the Stamen
        tile sets - pass another value here if map creation fails; confirm
        against the installed folium version.

    Returns
    -------
    None
        The map is rendered through IPython's ``display``.
    """
    cluster_id_max = df['cluster_id'].max()

    cmap = plt.get_cmap('viridis')
    n_colors = cmap.N
    # Palette in randomised order; a local generator keeps the global
    # numpy RNG state untouched.
    rng = np.random.default_rng(seed)
    colors = cmap(rng.permutation(n_colors))

    m = folium.Map(location=[51.5, 19.3], zoom_start=6, prefer_canvas=True, tiles=tiles)
    for lat, lon, cluster_id in zip(df['lat__offer'], df['lon__offer'], df['cluster_id']):
        # Scale the id onto the palette and clamp: without the clamp small
        # ids produce index -1 and wrap around to the last colour.
        color_idx = int(cluster_id / cluster_id_max * n_colors) - 1
        color_idx = min(max(color_idx, 0), n_colors - 1)
        # Channels are floats in [0, 1]; scale by 255 so 1.0 maps to the
        # valid maximum 255 (scaling by 256 would yield 256).
        red, green, blue = (
            int(round(float(channel) * 255)) for channel in colors[color_idx][:3]
        )
        folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            color=f'rgb({red}, {green}, {blue})',
            fill=True,
            fill_opacity=1,
        ).add_to(m)
    display(m)

In [28]:
# Load the offers snapshot named by INPUT_FILE and preview its first rows.
newest_df = pd.read_parquet(INPUT_FILE)
newest_df.head()

Unnamed: 0,balcony__offer,basement__clean,building_height__offer,building_material__offer,building_type__offer,building_year__offer,date_added__offer,date_refreshed__offer,desc_len__offer,direct__offer,...,price__offer,price_m2__offer,promotion_counter__offer,room_n__offer,sewers__clean,size__offer,taras__offer,telecom__clean,view_count__offer,water__offer
0,1.0,0.0,2.0,3,3,2019.0,2020-03-09,2020-03-09,962.0,0.0,...,31000.0,534.39,0.0,3.0,0,58.01,0.0,0.0,18.0,0
1,1.0,0.0,2.0,3,3,2019.0,2020-03-09,2019-11-19,999.0,0.0,...,60000.0,1500.0,0.0,2.0,0,40.0,0.0,0.0,9.0,0
2,1.0,1.0,3.0,3,3,2019.0,2020-03-09,2019-10-30,507.0,0.0,...,69000.0,1864.86,0.0,1.0,0,37.0,0.0,0.0,12.0,0
3,1.0,0.0,3.0,2,2,1995.0,2020-03-09,2020-03-09,2210.0,0.0,...,75000.0,2245.51,0.0,1.0,1,33.4,0.0,0.0,22.0,1
4,1.0,0.0,4.0,3,2,2019.0,2020-03-09,2020-03-09,1383.0,0.0,...,80000.0,4545.45,0.0,3.0,0,17.6,0.0,0.0,28.0,0


In [29]:
# Cluster only the first SAMPLE_SIZE offers to keep DBSCAN and the
# nearest-centre search quick. The explicit copy detaches the sample from the
# full frame, so columns assigned downstream never hit a slice of it.
SAMPLE_SIZE = 5000
newest_df = newest_df.iloc[:SAMPLE_SIZE].copy()
coords_map = get_coords_map(newest_df)

Starting DBScan alghorithm ...
DBScan algoritm found 274 clusters.


In [30]:
# Sanity check: one row per cluster; rows are ordered by ascending mean
# price per m2, so head() shows the cheapest clusters.
print(coords_map.shape)
coords_map.head()

(274, 5)


Unnamed: 0,cluster_mean_price_m2,cluster_mean_price,lat__offer,lon__offer,cluster_id
0,534.39,31000.0,51.0322,17.9815,1
1,546.36,330000.0,52.4373,17.108,2
2,1285.71,99000.0,51.18739,16.49725,3
3,1348.85,80000.0,53.5317,17.6031,4
4,1423.84,129000.0,50.88076,15.92359,5


In [31]:
# Render the cluster centres on an interactive map, coloured by cluster_id.
plot_points_on_map(coords_map)