## Prepare Data

In [37]:
import urllib.request
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from multiprocessing import Pool
import pandas as pd
import numpy as np
import json
import glob
import os

# Register `progress_apply` on pandas objects; the helper functions in this
# notebook rely on it for their progress bars.
# NOTE(review): the two `tqdm` imports above bind the same name, so the plain
# `from tqdm import tqdm` wins and the notebook-widget variant is never used.
tqdm.pandas()

In [30]:
# Directory holding the accident CSV files that get combined into one frame.
path = '/home/altieris/docker/jupyter/notebooks/ceabs/accidents/data/input/'
# Sort the matches: glob returns files in arbitrary order, and the row order of
# the concatenated frame follows the order of this list.
data_files = sorted(glob.glob(os.path.join(path, "*.csv")))
num_partitions = 15  # number of partitions to split dataframe
num_cores = 7  # number of cores on your machine

In [31]:
# Read every input CSV (';'-separated, Latin-1 encoded) and stack them into one frame.
df = pd.concat(
    pd.read_csv(csv_file, sep=';', encoding='latin-1') for csv_file in data_files
)

In [32]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(119605, 30)

In [10]:
#df.drop(['id','uop','delegacia','regional','sentido_via','uso_solo','ignorados'],inplace=True,axis=1)

In [33]:
# Preview the first rows of the combined data.
df.head()

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,...,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
0,99973.0,2018-01-01,segunda-feira,00:20:00,RJ,116.0,3035,RESENDE ...,Condutor Dormindo ...,Saída de leito carroçável ...,...,4,1,0,4,1,-22.46937,-44.44705,SR-RJ,DEL5/7,UOP03/RJ
1,99976.0,2018-01-01,segunda-feira,00:40:00,SC,282.0,4,FLORIANOPOLIS ...,Não guardar distância de segurança ...,Colisão traseira ...,...,2,1,0,2,2,-27.599717,-48.575657,SR-SC,DEL8/1,UOP01/SC
2,99977.0,2018-01-01,segunda-feira,00:30:00,RJ,493.0,1,ITABORAI ...,Ultrapassagem Indevida ...,Colisão frontal ...,...,0,3,1,1,3,-22.763901,-42.927532,SR-RJ,DEL5/4,UOP02/RJ
3,99981.0,2018-01-01,segunda-feira,01:15:00,RS,386.0,134,SARANDI ...,Ingestão de Álcool ...,Colisão transversal ...,...,0,2,0,0,2,-27.953636,-52.916374,SR-RS,DEL9/14,UOP01/RS
4,99982.0,2018-01-01,segunda-feira,00:20:00,RS,293.0,1517,CANDIOTA ...,Falta de Atenção à Condução ...,Saída de leito carroçável ...,...,1,0,0,1,1,-31.395214,-53.783912,SR-RS,DEL9/11,UOP03/RS


In [43]:
def concatenate_data(latitude, longitude):
    """Build the reverse-geocoding request URL for one coordinate pair.

    Both values are converted with ``str`` and appended to the service
    endpoint as ``<latitude>;<longitude>``.
    """
    endpoint = "http://revgeo1.ceabsservicos.com:5000/search/json/"
    return f"{endpoint}{latitude!s};{longitude!s}"

def work(x):
    """Add a ``revgeo_url`` column to the frame ``x`` and return it.

    Each row's ``latitude`` and ``longitude`` are passed to
    ``concatenate_data``. The column is assigned on ``x`` itself, so the
    frame handed in is modified. ``progress_apply`` displays a tqdm
    progress bar while the rows are processed.
    """
    def _row_to_url(record):
        return concatenate_data(record['latitude'], record['longitude'])

    x['revgeo_url'] = x.progress_apply(_row_to_url, axis=1)
    return x

In [42]:
def parallelize_dataframe(df, func):
    """Apply ``func`` to chunks of ``df`` in worker processes and recombine.

    ``df`` is split into ``num_partitions`` pieces with ``np.array_split``.
    The pieces are mapped over a pool of ``num_cores`` processes and the
    returned frames are concatenated in the order the pieces were produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Receives one piece and returns a DataFrame. It is shipped to the
        worker processes, so it has to be picklable.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the frames returned by ``func``.
    """
    chunks = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        processed = pool.map(func, chunks)
    finally:
        # Always release the workers, even when ``func`` raises; otherwise
        # the processes are left behind in the notebook kernel.
        pool.close()
        pool.join()
    return pd.concat(processed)

In [41]:
def call_url(url, timeout=30):
    """Return the ``velocidadeMaximaVia_`` field of the JSON document at ``url``.

    Parameters
    ----------
    url : str
        Address to request; the response body is parsed as JSON.
    timeout : float, optional
        Seconds to wait on a blocking network operation before giving up
        (default 30), so one stalled connection cannot hang a worker forever.

    Returns
    -------
    The value stored under ``'velocidadeMaximaVia_'``, or ``0`` when the
    request, the JSON parsing or the key lookup fails.
    """
    try:
        # The context manager closes the response on every path.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            contents = response.read()
        wjdata = json.loads(contents)
        return wjdata['velocidadeMaximaVia_']
    except Exception:
        # Best-effort lookup: any ordinary failure maps to 0. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running batch can still be interrupted.
        # NOTE(review): 0 is indistinguishable from a genuine 0 in the
        # response; confirm downstream code treats it as "unknown".
        return 0

    
def call_revgeo(data):
    """Look up every row's ``revgeo_url`` and store the result on ``data``.

    The value returned by ``call_url`` for each row is written to a new
    ``velocidade_via`` column, assigned on ``data`` itself, and the frame is
    returned. ``progress_apply`` displays a tqdm progress bar for the
    row-wise apply.
    """
    def _lookup(record):
        return call_url(record['revgeo_url'])

    data['velocidade_via'] = data.progress_apply(_lookup, axis=1)
    return data

In [46]:
# Build the per-row lookup URL (`revgeo_url`) across the worker pool.
df = parallelize_dataframe(df, work)

100%|██████████| 7974/7974 [00:00<00:00, 19812.71it/s]
100%|██████████| 7974/7974 [00:00<00:00, 21318.00it/s]
100%|██████████| 7974/7974 [00:00<00:00, 22224.57it/s]
100%|██████████| 7974/7974 [00:00<00:00, 25829.74it/s]
100%|██████████| 7974/7974 [00:00<00:00, 18052.32it/s]

100%|██████████| 7974/7974 [00:00<00:00, 19650.97it/s]
100%|██████████| 7974/7974 [00:00<00:00, 24883.88it/s]
100%|██████████| 7974/7974 [00:00<00:00, 21276.08it/s]
100%|██████████| 7974/7974 [00:00<00:00, 19286.26it/s]
100%|██████████| 7973/7973 [00:00<00:00, 19993.34it/s]
100%|██████████| 7973/7973 [00:00<00:00, 18510.35it/s]
100%|██████████| 7973/7973 [00:00<00:00, 19713.97it/s]
100%|██████████| 7973/7973 [00:00<00:00, 22481.20it/s]
100%|██████████| 7973/7973 [00:00<00:00, 35354.54it/s]


In [47]:
# One HTTP request per row, spread over the worker pool. In the recorded run
# below each partition took roughly six to seven and a half minutes.
df = parallelize_dataframe(df, call_revgeo)

100%|██████████| 7974/7974 [06:56<00:00, 19.16it/s]
 99%|█████████▊| 7863/7974 [06:59<00:08, 13.64it/s]
 94%|█████████▎| 7461/7974 [07:04<00:20, 24.90it/s]
  2%|▏         | 194/7974 [00:09<05:32, 23.39it/s]
  0%|          | 0/7973 [00:00<?, ?it/s]
100%|██████████| 7974/7974 [07:28<00:00, 17.78it/s]
 29%|██▊       | 2284/7973 [01:49<04:57, 19.13it/s]
 98%|█████████▊| 7851/7973 [06:23<00:04, 24.68it/s]
100%|██████████| 7974/7974 [06:30<00:00, 20.44it/s]
100%|██████████| 7974/7974 [06:26<00:00, 20.61it/s]
 94%|█████████▍| 7476/7973 [06:04<00:26, 18.90it/s]
100%|██████████| 7973/7973 [06:28<00:00, 20.50it/s]
100%|██████████| 7973/7973 [06:29<00:00, 20.47it/s]
100%|██████████| 7973/7973 [06:29<00:00, 20.45it/s]
100%|██████████| 7973/7973 [06:09<00:00, 21.60it/s]


In [48]:
# The URL column was only needed for the lookup. `errors='ignore'` keeps this
# cell safe to re-run once the column is already gone.
df = df.drop(columns=['revgeo_url'], errors='ignore')

In [49]:
# Preview the result; `velocidade_via` should now be the last column.
df.head()

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,...,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,velocidade_via
0,99973.0,2018-01-01,segunda-feira,00:20:00,RJ,116.0,3035,RESENDE ...,Condutor Dormindo ...,Saída de leito carroçável ...,...,1,0,4,1,-22.46937,-44.44705,SR-RJ,DEL5/7,UOP03/RJ,50
1,99976.0,2018-01-01,segunda-feira,00:40:00,SC,282.0,4,FLORIANOPOLIS ...,Não guardar distância de segurança ...,Colisão traseira ...,...,1,0,2,2,-27.599717,-48.575657,SR-SC,DEL8/1,UOP01/SC,100
2,99977.0,2018-01-01,segunda-feira,00:30:00,RJ,493.0,1,ITABORAI ...,Ultrapassagem Indevida ...,Colisão frontal ...,...,3,1,1,3,-22.763901,-42.927532,SR-RJ,DEL5/4,UOP02/RJ,50
3,99981.0,2018-01-01,segunda-feira,01:15:00,RS,386.0,134,SARANDI ...,Ingestão de Álcool ...,Colisão transversal ...,...,2,0,0,2,-27.953636,-52.916374,SR-RS,DEL9/14,UOP01/RS,60
4,99982.0,2018-01-01,segunda-feira,00:20:00,RS,293.0,1517,CANDIOTA ...,Falta de Atenção à Condução ...,Saída de leito carroçável ...,...,0,0,1,1,-31.395214,-53.783912,SR-RS,DEL9/11,UOP03/RS,80


In [50]:
# Sanity check: same row count as after loading, with one extra column
# (`velocidade_via`).
df.shape

(119605, 31)

In [51]:
# List the column names of the final frame.
list(df)

['id',
 'data_inversa',
 'dia_semana',
 'horario',
 'uf',
 'br',
 'km',
 'municipio',
 'causa_acidente',
 'tipo_acidente',
 'classificacao_acidente',
 'fase_dia',
 'sentido_via',
 'condicao_metereologica',
 'tipo_pista',
 'tracado_via',
 'uso_solo',
 'pessoas',
 'mortos',
 'feridos_leves',
 'feridos_graves',
 'ilesos',
 'ignorados',
 'feridos',
 'veiculos',
 'latitude',
 'longitude',
 'regional',
 'delegacia',
 'uop',
 'velocidade_via']

In [55]:
# NOTE: this rebinds `path`, which earlier pointed at the input directory.
path = '/home/altieris/docker/jupyter/notebooks/ceabs/accidents/data/input/raw/'
# os.path.join avoids the doubled separator that `path + '/...'` produced,
# since `path` already ends with '/'.
df.to_csv(os.path.join(path, 'accidents_brasil.csv'), sep=';', encoding='utf-8', index=False)