### Clustering

In [22]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from shapely.geometry import Polygon,MultiPoint,Point,LineString
from makeclustermap import MarkerClusterScript
import shapely.geometry
import sys, getopt
import folium
import random
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

%matplotlib inline

sns.set(style="whitegrid")

In [2]:
path = os.getcwd()
kms_per_radian = 6371.0088

# 100 metros
epsilon = 0.1 / kms_per_radian

In [3]:
def load_dataset(dataset_name,index_column_name,sep):
    try:
        dataframe = pd.read_csv(dataset_name, sep=sep, parse_dates=True, infer_datetime_format=True)
        #dataframe.index = pd.to_datetime(dataframe[index_column_name])
        return dataframe
    except Exception:
        print("Ocorreu um erro ao tentar carregar o arquivo.") 

In [4]:
def dbscan_fit(coords_dataset, min_samples=3):
    model = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine', n_jobs=1).fit(np.radians(coords_dataset))
    num_clusters = len(set(model.labels_))
    print('Number of clusters: {:,}'.format(num_clusters))
    return model

In [71]:
df = load_dataset(path+'/data/regioes/accidents_brasil_cleansed.csv','data_inversa',';')    
df[['latitude','longitude']].head()

Unnamed: 0,latitude,longitude
0,-27.599717,-48.575657
1,-27.953636,-52.916374
2,-31.395214,-53.783912
3,-29.728375,-51.633419
4,-26.858473,-48.690788


In [72]:
coords = df.drop_duplicates(['longitude','latitude'])[['latitude','longitude']]#.sort_values(by=['latitude','longitude'])

coords = coords.sort_values(by=['latitude', 'longitude'], ascending=False)

In [73]:
coords.as_matrix(columns=['longitude', 'latitude'])

array([[-61.13547206,   4.4588279 ],
       [-61.133815  ,   4.457483  ],
       [-61.132     ,   4.451     ],
       ...,
       [-53.44852483, -33.68033394],
       [-53.44855607, -33.68050581],
       [-53.4472847 , -33.68727231]])

In [74]:
model = dbscan_fit(coords)

Number of clusters: 6,663


In [75]:
model.labels_[0:100] # Firt 100 points

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1,  1,  1,  1, -1,  2,  2,  2, -1, -1, -1, -1, -1, -1, -1, -1,
        3,  3,  3,  3, -1, -1, -1,  4,  4,  4,  4, -1,  5,  5,  5])

In [76]:
coords[model.labels_ == 100] # pontos pentencentes ao 200º cluster

Unnamed: 0,latitude,longitude
90003,-2.438445,-54.732798
94586,-2.438454,-54.732823
94356,-2.438507,-54.732849
92209,-2.438518,-54.732822
94623,-2.439193,-54.732749
88605,-2.439279,-54.732567
89895,-2.439772,-54.732857


In [80]:
clusters = pd.Series([coords[model.labels_ == n] for n in range(len(set(model.labels_)))]) # cria grupos com os pontos 

In [81]:
len(clusters)

6663

In [82]:
c = [[i,len(clusters[i])] for i in range(0,len(clusters))]
clusters_len = pd.DataFrame(c, columns=('index','tamanho')) 

In [83]:
clusters_len.sort_values(by=['tamanho'],ascending=False).head(10)

Unnamed: 0,index,tamanho
5957,5957,387
5968,5968,292
2729,2729,197
4388,4388,164
73,73,141
4288,4288,139
4378,4378,107
4367,4367,106
5967,5967,105
5965,5965,93


In [84]:
clusters_len = clusters_len[(clusters_len.tamanho > 10 )].sort_values(by=['tamanho'],ascending=False)

In [85]:
clusters_len.head()

Unnamed: 0,index,tamanho
5957,5957,387
5968,5968,292
2729,2729,197
4388,4388,164
73,73,141


In [86]:
clusters_len[clusters_len['tamanho'] == clusters_len['tamanho'].max()]# cluster com maior numero de pontos
idx = clusters_len[clusters_len['tamanho'] == clusters_len['tamanho'].max()].index[0]

In [87]:
c = pd.DataFrame(clusters[73], columns=('longitude','latitude'))
#c = c.append(pd.DataFrame(clusters[107], columns=('longitude','latitude')))

In [88]:
def create_marker_clustered(row, popup=None):
    """Returns a L.marker object"""
    icon = L.AwesomeMarkers.icon({markerColor: row.color})    
    marker = L.marker(L.LatLng(row.latitude, row.longitude))
    marker.setIcon(icon)
    marker.bindPopup(row.latitude + ' ' + row.longitude + ' ocorrencias:'+row.nr_ocurrences)
    return marker


width, height = '100%', 600

#coords = clustered_data[['longitude','latitude','nr_ocurrences']]

coords = c                     

fig = folium.Figure(width=width, height=height)
map_orgs = folium.Map(width=width,height=height,location=[-12.032378, -51.574553], zoom_start=4)
MarkerClusterScript(coords.to_json(orient="records"),callback=create_marker_clustered).add_to(map_orgs)
map_orgs.add_to(fig)

In [90]:
cluster0 = pd.DataFrame(clusters[73],columns=['longitude','latitude'])
cluster0 = cluster0.set_index(['longitude','latitude'])

#df = df.set_index(['longitude','latitude'])

cluster0.head()

longitude,latitude
-48.359891,-1.365351
-48.356972,-1.365378
-48.357482,-1.365378
-48.35716,-1.365405
-48.357788,-1.365416


In [101]:
clusters[73][:10]#[0][1]

Unnamed: 0,latitude,longitude
93199,-1.365351,-48.359891
90754,-1.365378,-48.356972
90614,-1.365378,-48.357482
90892,-1.365405,-48.35716
93941,-1.365416,-48.357788
91996,-1.365448,-48.359048
92717,-1.365473,-48.357228
92004,-1.365475,-48.360561
93612,-1.365481,-48.360869
89697,-1.365496,-48.363141


In [174]:
import urllib.request
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from multiprocessing import Pool
import pandas as pd
import numpy as np
import json
import glob
import os

tqdm.pandas()

def concatenate_data(df):
    
    try:
        call_url_ws = "http://10.3.0.105:5000/route/v1/driving/"
        call_url_maps = "http://10.3.0.105:9966/?"
        for row in df.itertuples():
            call_url_ws = call_url_ws + str(row.longitude)+","+str(row.latitude)+';'
            call_url_maps = call_url_maps + '&loc='+str(row.latitude)+","+str(row.longitude)
        
        print(call_url_ws)
        print(call_url_maps)

        contents = urllib.request.urlopen(call_url_ws[:-1]).read()
        wjdata = json.loads(contents)    
        call_url_maps = "http://10.3.0.105:9966/?"
        for el in wjdata['waypoints']:
            call_url_maps = call_url_maps + '&loc='+str(el['location'][1])+","+str(el['location'][0])
        print(call_url_maps)
        print('-------------------------------------------------')
    except:
        pass
    
#clusters.apply(lambda row: concatenate_data(row)) 

In [166]:
import urllib.request

url = "http://127.0.0.1:5000/route/v1/driving/-48.35989058,-1.36535118;-48.35697234,-1.365378;-48.35748196,-1.365378;-48.35716009,-1.36540481"

contents = urllib.request.urlopen(url).read()
wjdata = json.loads(contents)    

call_url_maps = "http://localhost:9966/?"
for el in wjdata['waypoints']:
    call_url_maps = call_url_maps + '&loc='+str(el['location'][1])+","+str(el['location'][0])
    
print(call_url_maps)

http://localhost:9966/?&loc=-1.36546,-48.359886&loc=-1.365362,-48.356972&loc=-1.365371,-48.357482&loc=-1.365365,-48.357161


In [175]:
clusters[73]

Unnamed: 0,latitude,longitude,osrm_snap_url
93199,-1.365351,-48.359891,-1.36535118;-48.35989058
90754,-1.365378,-48.356972,-1.365378;-48.35697234
90614,-1.365378,-48.357482,-1.365378;-48.35748196
90892,-1.365405,-48.357160,-1.36540481;-48.35716009
93941,-1.365416,-48.357788,-1.36541554;-48.35778773
91996,-1.365448,-48.359048,-1.36544771;-48.35904837
92717,-1.365473,-48.357228,-1.36547346;-48.35722833
92004,-1.365475,-48.360561,-1.36547453;-48.36056113
93612,-1.365481,-48.360869,-1.36548133;-48.36086942
89697,-1.365496,-48.363141,-1.36549565;-48.36314108


In [178]:
t = pd.DataFrame([])
matrix = clusters[100].as_matrix(columns=['longitude', 'latitude'])
for i in range(0,len(matrix)):
    t = t.append(df.loc[(df['longitude'] == matrix[i][0]) & (df['latitude'] == matrix[i][1])])

In [189]:
t.drop(['id','nome_feriado','estacao','year','month','day','data_inversa','uf','br','km','feriado','velocidade_via','regiao','uop','delegacia','regional','sentido_via','horario','dia_semana','tracado_via','uso_solo'],axis=1)

Unnamed: 0,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,condicao_metereologica,tipo_pista,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude
90003,SANTAREM ...,Não guardar distância de segurança ...,Colisão traseira ...,Com Vítimas Feridas,Pleno dia,Nublado,Dupla,2,0,1,0,1,0,1,2,-2.438445,-54.732798
94586,SANTAREM ...,Falta de Atenção à Condução ...,Colisão transversal ...,Com Vítimas Feridas,Pleno dia,Céu Claro,Dupla,2,0,1,0,1,0,1,2,-2.438454,-54.732823
94356,SANTAREM ...,Falta de Atenção à Condução ...,Colisão traseira ...,Com Vítimas Feridas,Plena Noite,Céu Claro,Dupla,2,0,1,0,0,1,1,2,-2.438507,-54.732849
92209,SANTAREM ...,Falta de Atenção à Condução ...,Colisão transversal ...,Com Vítimas Feridas,Pleno dia,Céu Claro,Dupla,3,0,1,0,2,0,1,2,-2.438518,-54.732822
94623,SANTAREM ...,Falta de Atenção à Condução ...,Colisão traseira ...,Com Vítimas Feridas,Pleno dia,Céu Claro,Dupla,4,0,1,1,2,0,2,3,-2.439193,-54.732749
88605,SANTAREM ...,Falta de Atenção à Condução ...,Colisão traseira ...,Com Vítimas Feridas,Pleno dia,Nublado,Dupla,2,0,1,0,1,0,1,2,-2.439279,-54.732567
89895,SANTAREM ...,Desobediência às normas de trânsito pelo pedes...,Atropelamento de Pedestre ...,Com Vítimas Feridas,Pleno dia,Céu Claro,Simples,2,0,0,1,1,0,1,1,-2.439772,-54.732857


In [None]:
#coords['radius'] = (coords['nr_ocurrences'] - coords.nr_ocurrences.min()) / (coords.nr_ocurrences.max() - coords.nr_ocurrences.min()) * 200

class Cluster(object):

    def __init__(self,latitude_centroid,longitude_centroid,cluster_dataframe):
        self.latitude_centroid = latitude_centroid
        self.longitude_centroid = longitude_centroid
        self.cluster_dataframe = cluster_dataframe
        self.nr_points = len(cluster_dataframe)
        self.polygon = Polygon(cluster_dataframe.as_matrix(columns=['longitude', 'latitude']))

## ----------------------------------------------------------------------------------

# def get_centermost_point(cluster):
#     coords = cluster.sort_values(by=['latitude', 'longitude'], ascending=False)
#     coords = cluster.as_matrix(columns=['longitude', 'latitude'])
#     centroid = (MultiPoint(coords).centroid.x, MultiPoint(coords).centroid.y)
#     longitude_centroid = MultiPoint(coords).centroid.x
#     latitude_centroid = MultiPoint(coords).centroid.y
#     item_cluster = Cluster(latitude_centroid,longitude_centroid, cluster)
#     return item_cluster


# def generate_data(cluster_areas):
#     d = []
#     for i in range(len(cluster_areas)):
#         try:
#             polygon = Polygon(cluster_areas[i].points)
#             centroid = Point(cluster_areas[i].centroid)
#             x = polygon.buffer(buffer)
#             s = x.simplify(simplify_const, preserve_topology=True)
#             p = LineString(s.exterior.coords)
#             d.append((cluster_areas[i].latitude,cluster_areas[i].longitude,centroid, cluster_areas[i].nr_points(), polygon,  json.dumps(shapely.geometry.mapping(p))))
            
#         except Exception as inst:
#             pass
#     return d

x = get_centermost_point(coords)

In [None]:
x.cluster_dataframe#.polygon.coords

In [None]:
buffer = 0.008
simplify_const = 0.5


# y = x.polygon.buffer(buffer)
# s = y.simplify(simplify_const, preserve_topology=True)
# p = LineString(s.exterior.coords)
# print(p.wkt)
# json.dumps(shapely.geometry.mapping(p))

In [None]:
x.polygon.bounds

In [None]:
x.polygon.wkt

In [None]:
s.exterior.coords.xy

In [None]:
coords.head()

In [None]:
width, height = '100%', 600

colors = {'A' : 'red', 'B' : 'blue'}

map_osm = folium.Map(width=width,height=height,location=[-12.032378, -51.574553], zoom_start=4)

coords.head(1).apply(lambda row: folium.CircleMarker(location=[row['latitude'], row['longitude']],fill=True,fill_opacity=0.3,color = 'red', popup=str(row.nr_ocurrences),radius=row['radius'])
                                             .add_to(map_osm), axis=1)

map_osm

In [None]:
dataframe.to_csv(filename, index=False)

In [None]:
coords_ocurrences = sudeste.as_matrix(columns=['longitude', 'latitude'])
outputfile = 'accidents_clustered.csv'
generate_cluster(outputfile, coords_ocurrences)

In [None]:
#https://github.com/qingkaikong/blog/tree/master/28_DBSCAN
#https://www.aaai.org/Papers/KDD/1996/KDD96-037.pdf
#ftp://public.dhe.ibm.com/software/analytics/spss/documentation/modeler/17.1/br_po/ModelerCRISPDM.pdf
#https://exame.abril.com.br/brasil/as-10-rodovias-federais-mais-perigosas-do-brasil/
#http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf
#https://arthur-e.github.io/Wicket/sandbox-gmaps3.html
# https://www.jamesrcroft.com/2015/06/snapping-gps-tracks-to-roads/