In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score as ss
import itertools
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
import random

In [None]:
# Load the accident records from the CSV export into a DataFrame.
accidents_csv = '../../US_Accidents_May19_Migrated Data.csv'
df = pd.read_csv(accidents_csv)

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

In [None]:

# Keep only the accident start coordinates and give them generic names.
US_loca_df = df[['Start_Lat', 'Start_Lng']].rename(
    columns={'Start_Lat': "latitude", 'Start_Lng': "longitude"}
)
# Same two columns, in (latitude, longitude) order.
coords = US_loca_df.loc[:, ["latitude", "longitude"]]
# Plain NumPy matrix of the coordinates, one row per accident.
X = coords.to_numpy()

In [None]:
# Preview the first five coordinate rows.
US_loca_df.head(n=5)

In [None]:
# random select 100000 samples
# The population size is read from the frame instead of being hard-coded, so
# the positions stay valid for .iloc if the CSV has a different number of rows.
# Sampling from range() avoids materialising a multi-million element list;
# random.sample only needs len() and indexing, so the drawn positions should
# match those drawn from the equivalent list for the same seed.
random.seed(10)
n_rows = len(US_loca_df)
n_samples = min(100000, n_rows)  # never request more rows than exist
pick = sorted(random.sample(range(n_rows), n_samples))


In [None]:
# Take the sampled rows by position. .copy() makes the sample an independent
# frame, so adding columns to it later (e.g. the cluster labels) does not
# trigger pandas' SettingWithCopyWarning.
US_picked_df = US_loca_df.iloc[pick].copy()
US_picked_df

In [None]:
# Coordinate matrix for the sampled rows, columns ordered (latitude, longitude).
# The columns are selected explicitly so re-running this cell after a
# 'cluster' column has been added to US_picked_df still yields a 2-column array.
X = US_picked_df[["latitude", "longitude"]].to_numpy()

In [None]:
# Scatter of every accident location: longitude on x, latitude on y.
all_lng = US_loca_df["longitude"]
all_lat = US_loca_df["latitude"]
plt.scatter(all_lng, all_lat, s=2)

In [None]:
# Scatter of the sampled accident locations: longitude on x, latitude on y.
picked_lng = US_picked_df["longitude"]
picked_lat = US_picked_df["latitude"]
plt.scatter(picked_lng, picked_lat, s=2)

In [None]:
# Elbow (k-distance) method to choose a range for eps.
# The neighbour search uses the same haversine metric (on radians) as the
# DBSCAN calls in this notebook, so the plotted distances are in the same
# units as eps.
n_neighbors = 300
X_rad = np.radians(X)
neigh = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree', metric='haversine')
nbrs = neigh.fit(X_rad)
distances, indices = nbrs.kneighbors(X_rad)
# Use the distance to the k-th neighbour (last column). Column 1 would only be
# the nearest neighbour, which ignores the other neighbours that were computed.
# NOTE(review): querying the training data returns each point as its own first
# neighbour, so the last column is the (k-1)-th *other* point - confirm this
# matches the intended min_samples convention.
distances = distances[:, -1]
distances = np.sort(distances, axis=0)
fig = plt.figure()
plt.plot(distances)
plt.xlabel("points sorted by k-distance")
plt.ylabel(f"haversine distance to neighbour {n_neighbors} (radians)")

In [None]:
# Re-plot the sorted k-distance curve, zoomed in on its right-hand tail.
distances = np.sort(distances, axis=0)
fig = plt.figure()
plt.plot(distances)
plt.xlim(left=99500, right=100300)

In [None]:
# test
# Trial DBSCAN run with the haversine metric on coordinates converted to radians.
dbscan_cluster_model = DBSCAN(
    metric='haversine',
    algorithm='ball_tree',
    min_samples=300,
    eps=0.014,
)
dbscan_cluster_model.fit(np.radians(X))
# Per-point cluster labels of the fitted model (displayed as the cell output).
dbscan_cluster_model.labels_

In [None]:
# Attach the trial run's label to every sampled point.
# assign() returns a new frame instead of writing into a frame that was
# produced by slicing, which avoids SettingWithCopyWarning and keeps this
# cell safe to re-run.
US_picked_df = US_picked_df.assign(cluster=dbscan_cluster_model.labels_)

US_picked_df

In [None]:
# visualise test
# Centre the map on the mean coordinate of the sampled points.
location = US_picked_df['latitude'].mean(), US_picked_df['longitude'].mean()

m = folium.Map(location=location, zoom_start=5, control_scale=True)

folium.TileLayer('cartodbpositron').add_to(m)

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

# Points labelled -1 are not drawn. They are filtered out once up front and the
# remaining columns are iterated together, instead of doing two positional
# .iloc lookups per row.
clustered = US_picked_df[US_picked_df['cluster'] != -1]
for lat, lng, cluster_id in zip(clustered['latitude'], clustered['longitude'], clustered['cluster']):
    # The palette is reused cyclically when there are more clusters than colours.
    col = clust_colours[cluster_id % len(clust_colours)]
    # fill is a boolean switch; the fill colour is passed separately.
    folium.CircleMarker([lat, lng], radius=10, color=col, fill=True, fill_color=col).add_to(m)

m


In [None]:
# score of test
# The silhouette is computed with the same haversine metric on radians that the
# DBSCAN fit used; the default Euclidean metric on raw degrees would measure a
# different geometry from the one the clusters were formed in.
# Points labelled -1 are still passed in and are scored as their own group.
ss(np.radians(X), US_picked_df['cluster'], metric='haversine')

In [None]:
# Parameter grid for the search below: candidate eps values.
# With num=1 and equal endpoints this is a single-element array.
epsilons = np.linspace(start=0.014, stop=0.014, num=1)
epsilons

In [None]:
# Candidate min_samples values: 930 up to (but excluding) 990 in steps of 20.
min_samples = np.arange(start=930, stop=990, step=20)
min_samples

In [None]:
# Every (eps, min_samples) pairing, with eps varying slowest.
combinations = [
    (eps_value, min_pts)
    for eps_value in epsilons
    for min_pts in min_samples
]
combinations

In [None]:
# Number of (eps, min_samples) pairs in the search grid.
N = len(combinations)
N

In [None]:
# find best model
def get_scores_and_labels(combinations, X, min_clusters=2, max_clusters=100):
    """Fit DBSCAN for every (eps, min_samples) pair and return the best-scoring run.

    Each pair is clustered with the haversine metric on ``np.radians(X)`` and
    scored with the silhouette coefficient under that same metric. A run whose
    cluster count (not counting the -1 label) falls outside
    ``[min_clusters, max_clusters]`` is not scored: it is recorded with the
    sentinel score ``-10`` and the placeholder labels ``'bad'``.

    Parameters
    ----------
    combinations : iterable of (eps, min_samples) pairs
        Parameter pairs to try, in order.
    X : array-like
        Coordinates in degrees; converted with ``np.radians`` before fitting.
        Assumed to be two columns ordered (latitude, longitude), as the
        haversine metric requires - TODO confirm against the caller.
    min_clusters, max_clusters : int, optional
        Inclusive bounds on an acceptable cluster count (defaults 2 and 100).

    Returns
    -------
    dict
        Keys ``'best_epsilon'``, ``'best_min_samples'``, ``'best_labels'`` and
        ``'best_score'`` for the highest-scoring pair. If every pair was
        rejected, ``'best_labels'`` is ``'bad'`` and ``'best_score'`` is ``-10``.

    Raises
    ------
    ValueError
        If ``combinations`` is empty.
    """
    combinations = list(combinations)
    if not combinations:
        raise ValueError("combinations must contain at least one (eps, min_samples) pair")

    # Taken from the argument rather than a notebook-level global, so the
    # function works regardless of which cells have been run.
    total = len(combinations)
    # The conversion does not depend on the parameters; do it once.
    X_rad = np.radians(X)

    scores = []
    all_labels_list = []

    for i, (eps, num_samples) in enumerate(combinations):
        dbscan_cluster_model = DBSCAN(eps=eps, min_samples=num_samples, algorithm='ball_tree', metric='haversine').fit(X_rad)
        labels = dbscan_cluster_model.labels_
        labels_set = set(labels)
        # The -1 label is not counted as a cluster.
        num_clusters = len(labels_set) - (1 if -1 in labels_set else 0)

        if num_clusters < min_clusters or num_clusters > max_clusters:
            scores.append(-10)
            all_labels_list.append('bad')
            c = (eps, num_samples)
            print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Moving on")
            continue

        # Score in the same geometry the clusters were formed in. Points
        # labelled -1 are still included and scored as their own group.
        scores.append(ss(X_rad, labels, metric='haversine'))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]

    return {'best_epsilon': best_parameters[0],
            'best_min_samples': best_parameters[1],
            'best_labels': all_labels_list[best_index],
            'best_score': scores[best_index]}

# Run the grid search over the sampled coordinates.
best_dict = get_scores_and_labels(combinations=combinations, X=X)

In [None]:
# Show the winning parameters, labels and score returned by the grid search.
best_dict

In [None]:
# Replace the cluster column with the labels of the best grid-search run.
# assign() returns a new frame instead of writing into a frame that was
# produced by slicing, which avoids SettingWithCopyWarning.
US_picked_df = US_picked_df.assign(cluster=best_dict['best_labels'])

In [None]:
#visualisation
# Centre the map on the mean coordinate of the sampled points.
location = US_picked_df['latitude'].mean(), US_picked_df['longitude'].mean()

m = folium.Map(location=location, zoom_start=5, control_scale=True)

folium.TileLayer('cartodbpositron').add_to(m)

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

# Points labelled -1 are not drawn. They are filtered out once up front and the
# remaining columns are iterated together, instead of doing two positional
# .iloc lookups per row.
clustered = US_picked_df[US_picked_df['cluster'] != -1]
for lat, lng, cluster_id in zip(clustered['latitude'], clustered['longitude'], clustered['cluster']):
    # The palette is reused cyclically when there are more clusters than colours.
    col = clust_colours[cluster_id % len(clust_colours)]
    # fill is a boolean switch; the fill colour is passed separately.
    folium.CircleMarker([lat, lng], radius=10, color=col, fill=True, fill_color=col).add_to(m)

m