In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from shapely.geometry import MultiPoint
from sklearn.metrics.pairwise import haversine_distances

In [70]:
def find_centroid(cluster):
    """Return the centroid of a group of points as a two-element list.

    Parameters
    ----------
    cluster : sequence of coordinate pairs
        Points handed to ``shapely.geometry.MultiPoint``. In this notebook the
        caller passes rows ordered (latitude, longitude), so the first
        returned value corresponds to latitude and the second to longitude.

    Returns
    -------
    list
        ``[x, y]`` taken from the ``x`` and ``y`` attributes of the
        MultiPoint's centroid.
    """
    center = MultiPoint(cluster).centroid
    first, second = center.x, center.y
    return [first, second]

In [71]:
def dbscan(csv_path='trimmed.csv', eps_m=50, min_samples=6):
    """Cluster latitude/longitude points from a CSV file with DBSCAN.

    Parameters
    ----------
    csv_path : str, optional
        CSV file to read; it must contain ``latitude`` and ``longitude``
        columns (passed to ``np.radians``, so expected in degrees).
    eps_m : float, optional
        DBSCAN neighbourhood radius in metres.
    min_samples : int, optional
        Minimum number of neighbours for a point to be a core point.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_clusters, 2)`` holding one
        ``[latitude, longitude]`` centroid per cluster, ordered by cluster
        label. Shape ``(0, 2)`` when DBSCAN finds no cluster.

    Side effects
    ------------
    Prints the silhouette coefficient of the labelling when it is defined.
    """
    df = pd.read_csv(csv_path)
    coords = df.loc[:, ['latitude', 'longitude']].to_numpy()

    # The haversine metric works on radians and returns angular distances, so
    # the radius in metres is divided by Earth's mean radius in metres.
    earth_radius_m = 6371000
    eps = eps_m / earth_radius_m
    db = DBSCAN(eps=eps, min_samples=min_samples, algorithm="ball_tree",
                metric="haversine", n_jobs=-1)
    labels = db.fit_predict(np.radians(coords))

    # np.unique returns sorted labels, which fixes the order of the returned
    # centres; DBSCAN uses label -1 for noise, which is not a cluster.
    all_labels = np.unique(labels)
    cluster_labels = [label for label in all_labels if label != -1]

    if cluster_labels:
        cluster_center = np.array(
            [find_centroid(coords[labels == label]) for label in cluster_labels]
        )
    else:
        cluster_center = np.empty((0, 2))

    # silhouette_score needs between 2 and n_samples - 1 distinct labels.
    # NOTE(review): the score uses silhouette_score's default metric on the raw
    # degree coordinates and counts the noise label as a group of its own,
    # which differs from the haversine metric used for clustering - confirm
    # this is the intended measure.
    n_groups = len(all_labels)
    if 1 < n_groups < len(coords):
        print("Silhouette coefficient is {}".format(metrics.silhouette_score(coords, labels)))
    else:
        print("Silhouette coefficient is undefined for {} label group(s)".format(n_groups))

    return cluster_center
    

In [75]:
def find_Error(cluster_center, true_data):
    """Print the distance from each reference point to its nearest centre.

    Parameters
    ----------
    cluster_center : array-like of shape (n_centers, 2)
        Cluster centres as ``[latitude, longitude]`` rows in degrees.
    true_data : array-like of shape (n_points, 2)
        Reference locations as ``[latitude, longitude]`` rows in degrees.

    Returns
    -------
    None
        The per-point minimum distances, their mean and their maximum (all in
        metres) are printed.

    Raises
    ------
    ValueError
        If either input contains no points.
    """
    centers = np.asarray(cluster_center, dtype=float)
    references = np.asarray(true_data, dtype=float)
    if centers.size == 0 or references.size == 0:
        raise ValueError("cluster_center and true_data must each contain at least one point")

    # haversine_distances returns angles in radians; scaling by Earth's mean
    # radius in metres converts them to metres.
    earth_radius_m = 6371000
    # Only the arguments are used here, so the result does not depend on any
    # notebook-level variable.
    distances_m = haversine_distances(np.radians(references), np.radians(centers)) * earth_radius_m
    min_distances = distances_m.min(axis=1)

    print(min_distances)
    print("Mean error is {}m".format(min_distances.mean()))
    print("Maximum error is {}m".format(min_distances.max()))
    
# Reference locations, one [latitude, longitude] pair per row (degrees); the
# cluster centres found below are compared against these points.
true_data = np.array([
    [29.70154390339952, -95.39209601022654],
    [29.70428338419638, -95.39100648039813],
    [29.703335743447496, -95.39130824050142],
    [29.702188761458533, -95.3936918414558],
    [29.702691697389692, -95.39530032710667],
    [29.703085520490866, -95.39629883360872],
    [29.722233773075338, -95.36705220231357],
    [29.72526628586194, -95.37240452291601],
    [29.72734309820575, -95.375255487577],
    [29.728189612898646, -95.37701142524726],
    [29.729725295675568, -95.38003134004325],
    [29.73117267948174, -95.38260255498517],
    [29.731857687219605, -95.38404510951365],
    [29.756970123575087, -95.37525775033865],
    [29.7552538566023, -95.37571750526642],
    [29.755200480497276, -95.3794874504529],
    [29.75464650875399, -95.38024579642178],
    [29.754603020003177, -95.38300456681712],
    [29.752629019819906, -95.38306070220811],
    [29.752095895517666, -95.38491481879633],
])

# Cluster the recorded points, then report how far each reference location is
# from the closest cluster centre.
cluster_centers = dbscan()
find_Error(cluster_centers, true_data)

Silhouette coefficient is 0.7784287056418805
[ 4.28607097 12.83899117  8.79966194 11.94213393 13.99092313  3.67928401
  3.45069202  4.05967764 21.85065967 18.40810344  9.53658977 19.5946282
  9.73549026  4.99267114  6.8192461   3.46820934 10.78692971 11.66175354
  2.75807896  3.2524791 ]
Mean error is 9.29561370221224m
Maximum error is 21.850659671841143m
