# Import packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
import dataframe_image as dfi
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score as ss
import itertools
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
import random
from shapely.geometry import MultiPoint
from geopy.distance import great_circle
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

---------------------------------------------------------------------------

# Data processing 

In [None]:
# Read the accidents CSV export into a DataFrame
DATA_PATH = '../../US_Accidents_May19_Migrated Data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# First look at the data: dimensions, leading rows, missing values per column.
# In a notebook only the value of the last expression is displayed.
df.shape
df.head()
df.isna().sum()

In [None]:
# selecting attribute
df.drop = df[['ID','City','State','Severity','Visibility(mi)','Start_Lat','Start_Lng', 
            'count Traffic Signal','Count of Crossing','count of Bump','Description','Count of accidents',
             'Weather_Condition','Humidity(%)','Precipitation(in)','Wind_Chill(F)','Wind_Speed(mph)']]
df.drop = pd.get_dummies(df.drop, columns=['Amenity', 
    'Bump', 
    'Crossing',
    'Give_Way', 
    'Junction', 
    'No_Exit',
    'Railway', 
    'Roundabout', 
    'Station',
    'Stop', 
    'Traffic_Calming',
    'Traffic_Signal', 
    'Turning_Loop'])

In [None]:
# print description table as a plot
table = df.drop.describe()

dfi.export(table, 'dataframe.png')
table

In [None]:
# Visualize the number of accidents in each state.
# A single grouped count replaces one full-DataFrame filter per state;
# sort=False keeps the states in order of first appearance.
accidents_per_state = df.groupby('State', sort=False)['ID'].count()
states = accidents_per_state.index.to_numpy()
count_by_state = accidents_per_state.tolist()

fig, ax = plt.subplots(figsize=(16, 10))
# seaborn >= 0.12 accepts x and y as keyword arguments only
sns.barplot(x=states, y=count_by_state, ax=ax)

In [None]:
# Find the 10 cities with the most accidents.
TOP_N_CITIES = 10  # matches the plot title below (the original slice took 20)
top_cities = (
    df["City"]
    .value_counts()
    .nlargest(TOP_N_CITIES)
    .sort_values()  # ascending, so the busiest city is drawn last
    .rename_axis("city")
    .reset_index(name="number_of_accidents")
)

plt.figure(figsize=(10, 7))
# y must name an existing column of top_cities
sns.barplot(x="city", y="number_of_accidents", data=top_cities)
plt.title("TOP 10 CITIES WITH HIGHEST NUMBER OF ACCIDENTS", fontsize=20)
plt.xticks(rotation=40)
plt.show()

In [None]:
# Scatter every accident at its coordinates, coloured by severity level,
# and save the picture to 'Severity.png'.

severity_cols = dict(zip(
    range(5),
    ['green', 'palegreen', 'papayawhip', 'lightsalmon', 'tomato'],
))

# one colour name per accident, looked up from its severity
vcol = []
for level in df['Severity']:
    vcol.append(severity_cols[level])

# plt.scatter returns the collection of drawn points; its figure is saved below
ax = plt.scatter(x=df['Start_Lng'], y=df['Start_Lat'], s=2, c=vcol)
plt.title('Accidents representating map by severity level')
fig = ax.get_figure()
fig.savefig('Severity.png')

In [None]:
# Percentage of accidents that have each road element nearby, saved as a plot.
road_params = [
    'Amenity',
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop']

# The axis is labelled in percent, so scale the fraction of rows by 100.
road_param_percent = df.loc[:, road_params].sum() / len(df) * 100
ax = road_param_percent.sort_values().plot(kind='barh')
ax.set_title('Presence of road element near accidents')
ax.set_xlabel('% of total of accidents')

fig = ax.get_figure()
fig.savefig('road.png')

In [None]:
# Percentage of accidents recorded under each weather condition.
weather_share = df.groupby('Weather_Condition').size() / len(df)
# keep only conditions present in more than 0.5 % of all accidents
weather_share = weather_share[weather_share > 0.005]
# the axis is labelled in percent, so convert the fractions before plotting
acc_by_weather_condition = weather_share * 100
ax = acc_by_weather_condition.sort_values().plot(kind='barh')
ax.set_title('Presence of weather condition during accidents')
ax.set_xlabel('% of total of accidents')

---------------------------------------------------------------------------

# Clustering

## kmean

## DBSCAN
### New York

In [None]:
# Filter the New York accidents and keep only their latitude/longitude columns.
NY = df['City'] == 'New York'
NY_df = df[NY]
# An explicit copy makes NY_loca_df independent of df, so the later
# `NY_loca_df['cluster'] = ...` assignments do not write into a slice
# (pandas SettingWithCopyWarning).
NY_loca_df = (
    NY_df[['Start_Lat', 'Start_Lng']]
    .rename(columns={'Start_Lat': 'latitude', 'Start_Lng': 'longitude'})
    .copy()
)
coords = NY_loca_df[["latitude", "longitude"]]
X = NY_loca_df.to_numpy()

In [None]:
# Quick scatter of the New York accident coordinates (longitude on x, latitude on y)
plt.scatter(x=NY_loca_df["longitude"], y=NY_loca_df["latitude"], s=2)

In [None]:
# k-distance plot: the knee of the sorted curve suggests a value for DBSCAN eps.
K_NEIGHBORS = 5  # same value as min_samples in the first DBSCAN run
X_rad = np.radians(X)
# Use the same metric as DBSCAN (haversine on radians); the default Euclidean
# metric would measure distances on a different scale than eps.
neigh = NearestNeighbors(n_neighbors=K_NEIGHBORS, algorithm='ball_tree', metric='haversine')
nbrs = neigh.fit(X_rad)
distances, indices = nbrs.kneighbors(X_rad)
# Column 0 is each point itself (distance 0); the last column holds the
# distance to the k-th neighbour, which is what the k-distance plot needs.
distances = np.sort(distances[:, -1], axis=0)
fig = plt.figure()
plt.plot(distances)
plt.xlim(4000, 5570)

In [None]:
# First attempt: cluster with hand-picked parameters and draw the clustered
# points (noise label -1 is skipped) on a folium map.
dbscan_cluster_model = DBSCAN(eps=0.000035, min_samples=5, algorithm='ball_tree', metric='haversine')
dbscan_cluster_model.fit(np.radians(X))
NY_loca_df['cluster'] = dbscan_cluster_model.labels_
location = (NY_loca_df['latitude'].mean(), NY_loca_df['longitude'].mean())

m = folium.Map(location=location, zoom_start=11, control_scale=True)

folium.TileLayer('cartodbpositron').add_to(m)

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

n_colours = len(clust_colours)
clustered = NY_loca_df[NY_loca_df['cluster'] != -1]
point_stream = zip(
    clustered['latitude'].to_numpy(),
    clustered['longitude'].to_numpy(),
    clustered['cluster'].to_numpy(),
)
for lat, lng, label in point_stream:
    # colours repeat once there are more clusters than palette entries
    col = clust_colours[label % n_colours]
    folium.CircleMarker([lat, lng], radius=10, color=col, fill=col).add_to(m)

m

In [None]:
# Build the (eps, min_samples) grid for the search; eps is held at one value.

# silhouette score of the current labelling (the value is not stored)
ss(X, NY_loca_df['cluster'])
epsilons = np.array([6.75e-05])
print(epsilons)
min_samples = np.arange(2, 100, 5)
print(min_samples)
combinations = [(eps, n_min) for eps in epsilons for n_min in min_samples]
print(combinations)
N = len(combinations)

#define a function to run through all combinations
def get_scores_and_labels(combinations, X):
    """Fit DBSCAN for every parameter pair and return the best-scoring one.

    Parameters
    ----------
    combinations : sequence of (eps, min_samples) pairs to try.
    X : 2-D array of coordinates; it is converted with ``np.radians`` before
        clustering with the haversine metric (presumably [lat, lon] in
        degrees -- confirm against the caller).

    Returns
    -------
    dict with keys 'best_epsilon', 'best_min_samples', 'best_labels' and
    'best_score'.  A combination that yields fewer than 2 or more than 50
    clusters (noise excluded) gets the score -10 and the label placeholder
    'bad'; if every combination is rejected, that placeholder is returned.

    Raises
    ------
    ValueError if ``combinations`` is empty.
    """
    total = len(combinations)  # previously read from a global `N`
    if total == 0:
        raise ValueError("combinations must contain at least one (eps, min_samples) pair")

    X_rad = np.radians(X)  # identical for every combination, convert once
    scores = []
    all_labels_list = []

    for i, (eps, num_samples) in enumerate(combinations):
        model = DBSCAN(eps=eps, min_samples=num_samples, algorithm='ball_tree', metric='haversine')
        labels = model.fit(X_rad).labels_
        # the noise label -1 is not a cluster
        num_clusters = len(set(labels) - {-1})

        if (num_clusters < 2) or (num_clusters > 50):
            scores.append(-10)
            all_labels_list.append('bad')
            c = (eps, num_samples)
            print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Moving on")
            continue

        # silhouette score is computed on the raw X with all labels, noise included
        scores.append(ss(X, labels))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]

    return {'best_epsilon': best_parameters[0],
            'best_min_samples': best_parameters[1],
            'best_labels': all_labels_list[best_index],
            'best_score': scores[best_index]}

# Run the grid search and store the winning labelling on the New York frame
best_dict = get_scores_and_labels(combinations=combinations, X=X)
winning_labels = best_dict['best_labels']
NY_loca_df['cluster'] = winning_labels
best_dict

In [None]:
# Keep only the rows that belong to a cluster (label -1 is excluded)
involved = ~(NY_loca_df['cluster'] == -1)
NY_involved = NY_loca_df.loc[involved]
NY_involved

# coordinates and labels of the clustered rows
lablel = NY_involved[['cluster']]
Xx = NY_involved[['latitude', 'longitude']].to_numpy()

In [None]:
# Group the coordinates by cluster label (one array of points per cluster)
cluster_labels = best_dict['best_labels']
num_clusters = len(set(cluster_labels) - {-1})
members_per_cluster = []
for n in range(num_clusters):
    members_per_cluster.append(X[cluster_labels == n])
clusters = pd.Series(members_per_cluster)
clusters

#clusters centroid
def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to its centroid.

    ``cluster`` is an iterable of 2-element points.  The centroid is taken
    from a shapely ``MultiPoint`` and each point's distance to it is measured
    with geopy's ``great_circle``, so the points are presumably (lat, lon)
    pairs -- confirm against the caller.  The result is a tuple.
    """
    # build the geometry once instead of once per coordinate
    centre = MultiPoint(cluster).centroid
    centroid = (centre.x, centre.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

# Representative (centre-most) point of every cluster, collected in a lon/lat table
centermost_points = clusters.map(get_centermost_point)
coordinate_columns = list(zip(*centermost_points))
lats, lons = coordinate_columns
rep_points = pd.DataFrame(dict(lon=lons, lat=lats))
rep_points

In [None]:
# Draw every New York point (noise in white, clusters in palette colours),
# overlay the representative points in black, and save the map as HTML.
location = (NY_loca_df['latitude'].mean(), NY_loca_df['longitude'].mean())

m = folium.Map(location=location, zoom_start=11, control_scale=True)
folium.TileLayer('cartodbpositron').add_to(m)

n_colours = len(clust_colours)
ny_points = zip(
    NY_loca_df['latitude'].to_numpy(),
    NY_loca_df['longitude'].to_numpy(),
    NY_loca_df['cluster'].to_numpy(),
)
for lat, lng, label in ny_points:
    # -1 gets the fixed white marker, everything else a palette colour
    col = "white" if label == -1 else clust_colours[label % n_colours]
    folium.CircleMarker([lat, lng], radius=5, color=col, fill=col).add_to(m)

for rep_lat, rep_lon in zip(rep_points['lat'].to_numpy(), rep_points['lon'].to_numpy()):
    folium.CircleMarker([rep_lat, rep_lon], radius=2, color="black", fill_opacity=0.7, fill="black").add_to(m)

m.save("ny.html")
m

In [None]:
# Davies-Bouldin index of the best labelling found by the grid search
final_labels = best_dict['best_labels']
db_index = davies_bouldin_score(X, final_labels)
db_index

In [None]:
# Calinski-Harabasz index of the best labelling found by the grid search
final_labels = best_dict['best_labels']
ch_index = calinski_harabasz_score(X, final_labels)
print(ch_index)

### US
The code is essentially the same as what we did for the NY data.

In [None]:
# Take the latitude and longitude columns of the whole data set
US_loca_df = df[['Start_Lat', 'Start_Lng']].rename(
    columns={'Start_Lat': 'latitude', 'Start_Lng': 'longitude'}
)
coords = US_loca_df[["latitude", "longitude"]]

# Randomly select up to 100000 rows (fixed seed for reproducibility).
# The row count comes from the data instead of a hard-coded 2243939.
SAMPLE_SIZE = 100000
n_rows = len(US_loca_df)
random.seed(10)
pick = sorted(random.sample(range(n_rows), min(SAMPLE_SIZE, n_rows)))

# The following cells read US_picked_df and X, which must both describe the
# sample; previously the sample indices were computed but never applied.
US_picked_df = US_loca_df.iloc[pick].copy()
X = US_picked_df.to_numpy()

In [None]:
# Quick scatter of the sampled US accident coordinates (longitude on x, latitude on y)
plt.scatter(x=US_picked_df["longitude"], y=US_picked_df["latitude"], s=2)

In [None]:
# Elbow method: the sorted k-distance curve defines the range of eps to test.
K_NEIGHBORS = 300
X_rad = np.radians(X)
# Use the same metric as DBSCAN (haversine on radians); the default Euclidean
# metric would measure distances on a different scale than eps.
neigh = NearestNeighbors(n_neighbors=K_NEIGHBORS, algorithm='ball_tree', metric='haversine')
nbrs = neigh.fit(X_rad)
distances, indices = nbrs.kneighbors(X_rad)
# Column 0 is each point itself (distance 0); the last column holds the
# distance to the k-th neighbour, which is what the k-distance plot needs.
distances = np.sort(distances[:, -1], axis=0)
fig = plt.figure()
plt.plot(distances)
plt.xlim(99000, 100300)

In [None]:
# Trial run with hand-picked parameters; the labels are stored on the sample frame
dbscan_cluster_model = DBSCAN(eps=0.009, min_samples=605, algorithm='ball_tree', metric='haversine')
dbscan_cluster_model.fit(np.radians(X))
US_picked_df['cluster'] = dbscan_cluster_model.labels_

In [None]:
# Silhouette score of the trial run (the value is not stored)
ss(X, US_picked_df['cluster'])

# Parameter grid to test: every eps value paired with every min_samples value
epsilons = np.linspace(start=0.009, stop=0.011, num=3)
min_samples = np.arange(100, 700, 55)
combinations = [(eps, n_min) for eps in epsilons for n_min in min_samples]
combinations
N = len(combinations)
# find best model
def get_scores_and_labels(combinations, X):
    """Fit DBSCAN for every parameter pair and return the best-scoring one.

    Parameters
    ----------
    combinations : sequence of (eps, min_samples) pairs to try.
    X : 2-D array of coordinates; it is converted with ``np.radians`` before
        clustering with the haversine metric (presumably [lat, lon] in
        degrees -- confirm against the caller).

    Returns
    -------
    dict with keys 'best_epsilon', 'best_min_samples', 'best_labels' and
    'best_score'.  A combination that yields fewer than 2 or more than 50
    clusters (noise excluded) gets the score -10 and the label placeholder
    'bad'; if every combination is rejected, that placeholder is returned.

    Raises
    ------
    ValueError if ``combinations`` is empty.
    """
    total = len(combinations)  # previously read from a global `N`
    if total == 0:
        raise ValueError("combinations must contain at least one (eps, min_samples) pair")

    X_rad = np.radians(X)  # identical for every combination, convert once
    scores = []
    all_labels_list = []

    for i, (eps, num_samples) in enumerate(combinations):
        model = DBSCAN(eps=eps, min_samples=num_samples, algorithm='ball_tree', metric='haversine')
        labels = model.fit(X_rad).labels_
        # the noise label -1 is not a cluster
        num_clusters = len(set(labels) - {-1})

        if (num_clusters < 2) or (num_clusters > 50):
            scores.append(-10)
            all_labels_list.append('bad')
            c = (eps, num_samples)
            print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Moving on")
            continue

        # silhouette score is computed on the raw X with all labels, noise included
        scores.append(ss(X, labels))
        all_labels_list.append(labels)
        print(f"Index: {i}, Score: {scores[-1]}, Labels: {all_labels_list[-1]}, NumClusters: {num_clusters}")

    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]

    return {'best_epsilon': best_parameters[0],
            'best_min_samples': best_parameters[1],
            'best_labels': all_labels_list[best_index],
            'best_score': scores[best_index]}

# Run the grid search and store the winning labelling on the sample frame
best_dict = get_scores_and_labels(combinations=combinations, X=X)
winning_labels = best_dict['best_labels']
US_picked_df['cluster'] = winning_labels
best_dict

In [None]:
# Keep only the rows that belong to a cluster (label -1 is excluded)
involved = ~(US_picked_df['cluster'] == -1)
US_involved = US_picked_df.loc[involved]
US_involved


# coordinates and labels of the clustered rows
lablel = US_involved[['cluster']]
Xx = US_involved[['latitude', 'longitude']].to_numpy()

# group the coordinates by cluster label (one array of points per cluster)
cluster_labels = best_dict['best_labels']
num_clusters = len(set(cluster_labels) - {-1})
members_per_cluster = []
for n in range(num_clusters):
    members_per_cluster.append(X[cluster_labels == n])
clusters = pd.Series(members_per_cluster)
clusters

def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to its centroid.

    ``cluster`` is an iterable of 2-element points.  The centroid is taken
    from a shapely ``MultiPoint`` and each point's distance to it is measured
    with geopy's ``great_circle``, so the points are presumably (lat, lon)
    pairs -- confirm against the caller.  The result is a tuple.
    """
    # build the geometry once instead of once per coordinate
    centre = MultiPoint(cluster).centroid
    centroid = (centre.x, centre.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

# Representative (centre-most) point of every cluster, collected in a lon/lat table
centermost_points = clusters.map(get_centermost_point)
coordinate_columns = list(zip(*centermost_points))
lats, lons = coordinate_columns
rep_points = pd.DataFrame(dict(lon=lons, lat=lats))
rep_points

In [None]:
# Visualisation: draw the sampled US points on a folium map, overlay the
# representative points in black, and save the map as HTML.
location = (US_picked_df['latitude'].mean(), US_picked_df['longitude'].mean())

m = folium.Map(location=location, zoom_start=4, control_scale=True)

folium.TileLayer('cartodbpositron').add_to(m)

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

n_colours = len(clust_colours)
us_points = zip(
    US_picked_df['latitude'].to_numpy(),
    US_picked_df['longitude'].to_numpy(),
    US_picked_df['cluster'].to_numpy(),
)
for lat, lng, label in us_points:
    # every point gets a white marker first ...
    folium.CircleMarker([lat, lng], radius=4, color="white", fill="white").add_to(m)
    if label == -1:
        continue
    # ... and clustered points get a palette-coloured marker on top of it
    col = clust_colours[label % n_colours]
    folium.CircleMarker([lat, lng], radius=4, color=col, fill=col).add_to(m)

for rep_lat, rep_lon in zip(rep_points['lat'].to_numpy(), rep_points['lon'].to_numpy()):
    folium.CircleMarker([rep_lat, rep_lon], radius=3, color="black", fill="black").add_to(m)

m
m.save("US.html")

In [None]:
# Davies-Bouldin and Calinski-Harabasz indices of the best labelling
final_labels = best_dict['best_labels']
db_index = davies_bouldin_score(X, final_labels)
print(db_index)
ch_index = calinski_harabasz_score(X, final_labels)
print(ch_index)

---------------------------------------------------------------------------

# Text Similarity

---------------------------------------------------------------------------

# Neural Network