# Import packages

In [None]:
import random

import dataframe_image as dfi
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score as ss
from sklearn.neighbors import NearestNeighbors

# Data processing 

In [None]:
# Read the raw accident records from the CSV export into `df`.
ACCIDENTS_CSV = '../../US_Accidents_May19_Migrated Data.csv'
df = pd.read_csv(ACCIDENTS_CSV)

In [None]:
# Quick look at the dataset: dimensions, first rows, missing values per column.
# A notebook cell only renders its LAST expression, so the earlier results are
# passed to display() (available by default in Jupyter/IPython kernels);
# otherwise they would be computed and silently discarded.
display(df.shape)
display(df.head())
df.isnull().sum()

In [None]:
# selecting attribute
df.drop = df[['ID','City','State','Severity','Visibility(mi)','Start_Lat','Start_Lng', 
            'count Traffic Signal','Count of Crossing','count of Bump','Description','Count of accidents',
             'Weather_Condition','Humidity(%)','Precipitation(in)','Wind_Chill(F)','Wind_Speed(mph)']]
df.drop = pd.get_dummies(df.drop, columns=['Amenity', 
    'Bump', 
    'Crossing',
    'Give_Way', 
    'Junction', 
    'No_Exit',
    'Railway', 
    'Roundabout', 
    'Station',
    'Stop', 
    'Traffic_Calming',
    'Traffic_Signal', 
    'Turning_Loop'])

In [None]:
# print description table as a plot
table = df.drop.describe()

dfi.export(table, 'dataframe.png')
table

In [None]:
# Visualize the number of accidents recorded in each state.
# A single groupby counts the non-null IDs per state instead of re-filtering
# the whole frame once per state; sort=False keeps the states in order of
# first appearance.
accidents_per_state = df.groupby('State', sort=False)['ID'].count()
states = accidents_per_state.index.to_numpy()
count_by_state = accidents_per_state.tolist()

fig, ax = plt.subplots(figsize=(16, 10))
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
sns.barplot(x=states, y=count_by_state, ax=ax)
ax.set_xlabel('State')
ax.set_ylabel('Number of accidents')
ax.set_title('Number of accidents per state');

In [None]:
# Find the 10 cities with the most accidents and plot them.
TOP_N_CITIES = 10  # keep in sync with the plot title below

# value_counts() is already sorted in descending order, so head() yields the
# busiest cities first.
top_cities = df["City"].value_counts().head(TOP_N_CITIES).reset_index()
top_cities.columns = ["city", "number_of_accidents"]

fig, ax = plt.subplots(figsize=(10, 7))
# `y` must name the column exactly as it was renamed above.
sns.barplot(x="city", y="number_of_accidents", data=top_cities, ax=ax)
ax.set_title("TOP 10 CITIES WITH HIGHEST NUMBER OF ACCIDENTS", fontsize=20)
plt.xticks(rotation=40)
plt.show()

In [None]:
# Scatter "map" of accident locations coloured by severity level, saved to
# 'Severity.png'.

severity_cols = {
    0: 'green',
    1: 'palegreen',
    2: 'papayawhip',
    3: 'lightsalmon',
    4: 'tomato'
}

# Vectorised colour lookup instead of a Python-level loop over every row.
# Severities that are missing or not in the palette are drawn in light grey
# rather than aborting the cell with a KeyError.
vcol = df['Severity'].map(severity_cols).fillna('lightgrey').tolist()

# Explicit figure/axes so that `ax` really is an Axes and `fig` the figure
# that gets saved.
fig, ax = plt.subplots()
ax.scatter(df['Start_Lng'], df['Start_Lat'], c=vcol, s=2)
ax.set_title('Accidents representating map by severity level')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
fig.savefig('Severity.png')

In [None]:
# Share of accidents that have each road element nearby, saved to 'road.png'.
road_params = [
    'Amenity',
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop']

# Column sums divided by the number of rows, i.e. a fraction per road element
# (assumes these columns are boolean/0-1 flags - TODO confirm).
road_param_percent = df.loc[:, road_params].sum() / len(df)

# The x axis is labelled in percent, so the fractions are scaled by 100 for
# plotting; `road_param_percent` itself is left unscaled.
fig, ax = plt.subplots()
(road_param_percent.sort_values() * 100).plot(kind='barh', ax=ax)
ax.set_title('Presence of road element near accidents')
ax.set_xlabel('% of total of accidents')

fig.savefig('road.png')

In [None]:
# Share of accidents per weather condition, keeping only conditions that
# account for more than 0.5 % of all rows.
MIN_WEATHER_SHARE = 0.005  # fraction of all accidents

acc_by_weather_condition = df.groupby('Weather_Condition').size() / len(df)
acc_by_weather_condition = acc_by_weather_condition[acc_by_weather_condition > MIN_WEATHER_SHARE]

# The x axis is labelled in percent, so the fractions are scaled by 100 for
# plotting; `acc_by_weather_condition` itself is left unscaled.
fig, ax = plt.subplots()
(acc_by_weather_condition.sort_values() * 100).plot(kind='barh', ax=ax)
ax.set_title('Presence of weather condition during accidents')
ax.set_xlabel('% of total of accidents');

# Clustering

## K-means

## DBSCAN

In [None]:
# Keep only the New York accidents and extract their coordinates.
NY = df['City'] == 'New York'
NY_df = df[NY]
# rename() returns a new frame, so columns added to NY_loca_df later do not
# trigger SettingWithCopyWarning the way assigning `.columns` on a slice does.
NY_loca_df = NY_df[['Start_Lat', 'Start_Lng']].rename(
    columns={'Start_Lat': 'latitude', 'Start_Lng': 'longitude'})
coords = NY_loca_df[["latitude", "longitude"]]
# Build X from the two coordinate columns only, so it stays an (n, 2)
# [latitude, longitude] array even if extra columns are added to NY_loca_df.
X = coords.to_numpy()

In [None]:
# Quick-look scatter of the New York accident coordinates
# (longitude on the x axis, latitude on the y axis).
ny_longitudes = NY_loca_df["longitude"]
ny_latitudes = NY_loca_df["latitude"]
plt.scatter(ny_longitudes, ny_latitudes, s=2)

In [None]:
# k-distance plot: sorted nearest-neighbour distances, used to read off the
# "knee" as a candidate eps for DBSCAN.
# The haversine metric (on coordinates in radians) matches the metric DBSCAN
# is run with, so the distances on this plot are on the same scale as eps.
coords_rad = np.radians(X)
neigh = NearestNeighbors(n_neighbors=5, algorithm='ball_tree', metric='haversine')
nbrs = neigh.fit(coords_rad)
distances, indices = nbrs.kneighbors(coords_rad)
# Column 0 is each point's distance to itself; column 1 is its nearest other
# point. NOTE(review): the usual heuristic uses the k-th neighbour
# (column -1) for min_samples=k - confirm which one is intended.
distances = distances[:, 1]
distances = np.sort(distances, axis=0)

fig, ax = plt.subplots()
ax.plot(distances)
# Zoom on the tail of the curve; the limits are hard-coded
# (presumably for the size of the New York subset - TODO confirm).
ax.set_xlim(4000, 5570)
ax.set_xlabel('Points sorted by distance')
ax.set_ylabel('Nearest-neighbour distance (radians)');

In [None]:
# First DBSCAN attempt on the New York coordinates, shown on a folium map.
# With metric='haversine' the input and eps are both in radians.
dbscan_cluster_model = DBSCAN(eps=0.000035, min_samples=5, algorithm='ball_tree', metric='haversine').fit(np.radians(X))

# assign() returns a new frame, which avoids SettingWithCopyWarning when
# NY_loca_df was created as a slice of another frame.
NY_loca_df = NY_loca_df.assign(cluster=dbscan_cluster_model.labels_)
location = NY_loca_df['latitude'].mean(), NY_loca_df['longitude'].mean()

m = folium.Map(location=location, zoom_start=11, control_scale=True)

folium.TileLayer('cartodbpositron').add_to(m)

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']

# Label -1 marks DBSCAN noise; only points that belong to a cluster are drawn.
clustered_points = NY_loca_df[NY_loca_df['cluster'] != -1]
for point in clustered_points.itertuples(index=False):
    # Cycle through the palette when there are more clusters than colours.
    col = clust_colours[point.cluster % len(clust_colours)]
    folium.CircleMarker(
        [point.latitude, point.longitude],
        radius=10,
        color=col,
        fill=True,  # `fill` is a boolean switch; the colour goes in fill_color
        fill_color=col,
    ).add_to(m)

m

In [None]:
# Score the DBSCAN clustering so different eps / min_samples settings can be
# compared. NOTE(review): `ss` is presumed to be sklearn's silhouette_score -
# confirm against the import cell.
cluster_labels = NY_loca_df['cluster'].to_numpy()
# Noise points (label -1) are not a cluster, so they are left out of the score.
is_clustered = cluster_labels != -1
if np.unique(cluster_labels[is_clustered]).size < 2:
    raise ValueError('Silhouette score needs at least two clusters; adjust eps/min_samples.')

# Same distance as the clustering itself: haversine on coordinates in radians.
ss(np.radians(X[is_clustered]), cluster_labels[is_clustered], metric='haversine')


In [None]:
# Extract the coordinates of every accident in the dataset.
# rename() returns a new frame instead of relabelling a slice of `df`.
US_loca_df = df[['Start_Lat', 'Start_Lng']].rename(
    columns={'Start_Lat': 'latitude', 'Start_Lng': 'longitude'})
coords = US_loca_df[["latitude", "longitude"]]
X = coords.to_numpy()

# Randomly select up to 100000 rows from the entire US data (seeded for
# reproducibility). The population size comes from the frame itself rather
# than a hard-coded row count, and range() avoids building a list of every
# row index.
US_SAMPLE_SIZE = 100000
n_rows = len(US_loca_df)
random.seed(10)
pick = sorted(random.sample(range(n_rows), min(US_SAMPLE_SIZE, n_rows)))

# The sampled coordinates, used by the US scatter plot.
US_picked_df = US_loca_df.iloc[pick]

In [None]:
# Quick-look scatter of the sampled US accident coordinates
# (longitude on the x axis, latitude on the y axis).
us_longitudes = US_picked_df["longitude"]
us_latitudes = US_picked_df["latitude"]
plt.scatter(us_longitudes, us_latitudes, s=2)

# Text Similarity

# Neural Network