In [166]:
import pandas as pd
import numpy as np
import geopandas as gpd

# Notebook created to generate synthetic "non-accident" (negative) samples, since non-accidents are never recorded.

In [167]:
# Load the processed accident dataset (comma-separated, latin1-encoded).
# low_memory=False makes pandas read the file in one pass instead of chunked
# dtype inference.
df = pd.read_csv(
    '../data_processed/data_processed.csv',
    sep=',',
    encoding='latin1',
    low_memory=False,
)

In [168]:
# Inspect which columns the loaded dataset provides.
df.columns

Index(['logradouro', 'municipio', 'latitude', 'longitude', 'tipo_registro',
       'turno', 'tp_sinistro_primario', 'qtd_gravidade_fatal',
       'qtd_gravidade_grave', 'qtd_gravidade_leve', 'qtd_caminhao',
       'qtd_motocicleta', 'qtd_automovel', 'qtd_pedestre', 'qtd_bicicleta',
       'qtd_onibus', 'qtd_veic_outros', 'qtd_veic_nao_disponivel',
       'concessionaria', 'numero_logradouro', 'tp_sinistro_atropelamento',
       'tp_sinistro_colisao_frontal', 'tp_sinistro_colisao_lateral',
       'tp_sinistro_colisao_transversal', 'tp_sinistro_colisao_outros',
       'tp_sinistro_choque', 'tp_sinistro_capotamento',
       'tp_sinistro_engavetamento', 'tp_sinistro_tombamento',
       'tp_sinistro_outros', 'data_hora', 'hora', 'mes', 'dia_semana',
       'indice_severidade', 'feriado', 'geometry', 'id', 'motorway', 'oneway',
       'lanes', 'maxspeed', 'bridge', 'dist_imprecisao'],
      dtype='object')

In [169]:
# Label every loaded row as a positive example (target = 1).
df = df.assign(target=1)

In [170]:
# Columns removed before modelling: record metadata, casualty / vehicle
# counts, accident-type flags, and raw timestamp / geometry / id fields.
columns_drop = [
    'municipio', 'tipo_registro', 'turno', 'tp_sinistro_primario',
    'qtd_gravidade_fatal', 'qtd_gravidade_grave', 'qtd_gravidade_leve',
    'qtd_caminhao', 'qtd_motocicleta', 'qtd_automovel', 'qtd_pedestre',
    'qtd_bicicleta', 'qtd_onibus', 'qtd_veic_outros',
    'qtd_veic_nao_disponivel', 'numero_logradouro',
    'tp_sinistro_atropelamento', 'tp_sinistro_colisao_frontal',
    'tp_sinistro_colisao_lateral', 'tp_sinistro_colisao_transversal',
    'tp_sinistro_colisao_outros', 'tp_sinistro_choque',
    'tp_sinistro_capotamento', 'tp_sinistro_engavetamento',
    'tp_sinistro_tombamento', 'tp_sinistro_outros',
    'data_hora', 'geometry', 'id', 'dist_imprecisao',
]

df = df.drop(columns=columns_drop)

In [171]:
# Confirm which columns remain after the drop.
df.columns

Index(['logradouro', 'latitude', 'longitude', 'concessionaria', 'hora', 'mes',
       'dia_semana', 'indice_severidade', 'feriado', 'motorway', 'oneway',
       'lanes', 'maxspeed', 'bridge', 'target'],
      dtype='object')

In [172]:
# Build point geometries from the accident coordinates (declared as
# EPSG:4326 lon/lat) and reproject them to EPSG:31983 so that the distances
# used in later cells are expressed in that projected CRS.
geometry = gpd.points_from_xy(x=df['longitude'], y=df['latitude'])

gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326').to_crs(epsg=31983)

In [173]:
# Replace every accident point by a disc of radius 2000 (in the units of the
# frame's CRS); the discs delimit the area where synthetic points may fall.
buffer_radius = 2000
gdf['geometry'] = gdf.geometry.buffer(buffer_radius)

In [174]:
# Dissolve all accident buffers into one (multi)polygon used as clipping mask.
# `union_all()` replaces the deprecated `unary_union` attribute, which emits a
# DeprecationWarning on current geopandas releases.
area_mask = gdf['geometry'].union_all()

# Simplify the outline (tolerance of 20 CRS units) to reduce the vertex count
# of the mask before it is used for clipping.
area_mask = area_mask.simplify(tolerance=20)

  area_mask = gdf['geometry'].unary_union


In [175]:
# Read only the road attributes needed downstream and reproject the road
# network to the same CRS as the accident buffers (EPSG:31983).
columns_parquet = ['geometry', 'highway', 'oneway', 'lanes', 'maxspeed', 'bridge']

geodf = gpd.read_parquet(
    '../data_processed/final_map_sp.parquet',
    columns=columns_parquet,
).to_crs(epsg=31983)

In [176]:
# Keep only the parts of the road network that fall inside the dissolved
# accident buffers (area_mask).
geodf = gpd.clip(geodf, mask=area_mask)

In [177]:
# Discard primary / secondary / tertiary roads; every other highway class
# (including missing values) is kept.
excluded_classes = ['tertiary', 'secondary', 'primary']
mask = geodf['highway'].isin(excluded_classes)

geodf = geodf.loc[~mask]

In [178]:
# Inspect the length of each remaining road segment (in CRS units).
geodf['geometry'].length

48629      372.725814
48628      370.977005
63361     2067.424263
48627     2063.065474
141890    1204.413273
             ...     
183296     140.665015
183298     167.433652
183297     141.558665
151695     433.161071
165947     123.835585
Length: 8491, dtype: float64

In [179]:
# Sampling weight of each road segment: its share of the total network length,
# so that longer segments receive proportionally more synthetic points.
geometry_lenghts = geodf['geometry'].length

probability = geometry_lenghts.div(geometry_lenghts.sum())
# Second normalisation pass: absorbs floating-point rounding so the weights
# sum to 1 as tightly as possible before being handed to np.random.choice.
probability = probability.div(probability.sum())

In [180]:
# Seed NumPy's global RNG before the first random draw so that this and every
# later np.random.* call in the notebook yields the same synthetic dataset on
# a fresh "Restart & Run All".
np.random.seed(42)

# Draw 73000 road-segment labels with replacement, weighted by segment length.
index_list = np.random.choice(
    a=geodf.index.values,
    size=73000,
    replace=True,
    p=probability,
)


In [181]:
# Materialise one row per sampled label. Sampling was done with replacement,
# so the same road segment (and therefore the same index label) can appear
# several times in newdf.
newdf = geodf.loc[index_list]

In [182]:
# One uniform fraction in [0, 1) per sampled row. The array is sized from
# newdf itself so it cannot drift out of sync with the sample size chosen
# when the segments were drawn.
points = np.random.rand(len(newdf))

# normalized=True makes each value a fraction of the segment's length, turning
# every sampled line into a single point located somewhere along it.
newdf['geometry'] = newdf['geometry'].interpolate(points, normalized=True)

In [183]:
# Synthetic points are the negative class (no accident).
newdf['target'] = 0

In [184]:
# Synthetic (non-accident) rows get a severity index of 0.
# NOTE(review): if real accident rows always carry a non-zero
# indice_severidade, this column would reveal the target by itself — confirm
# it is excluded from the model features.
newdf['indice_severidade'] = 0

In [185]:
# Weights for the seven day-of-week codes 0..6: near-uniform, with code 4
# slightly over-weighted and code 5 slightly under-weighted.
days_probability = [0.14, 0.14, 0.14, 0.14, 0.17, 0.13, 0.14]

# Draw one weekday code per synthetic row.
days_list = np.random.choice(
    np.arange(7), size=73000, replace=True, p=days_probability
)

In [186]:
# Attach the sampled weekday codes to the synthetic rows (assigned by position).
newdf['dia_semana'] = days_list

In [187]:
# Raw month weights: months 1, 7 and 12 are weighted 0.100, the others 0.077.
# They do not sum exactly to 1, so they are normalised before sampling.
month_weights = np.array([
    0.100, 0.077, 0.077, 0.077, 0.077, 0.077,
    0.100, 0.077, 0.077, 0.077, 0.077, 0.100,
])
months_probability = month_weights / month_weights.sum()

# Draw one month number (1..12) per synthetic row.
months_list = np.random.choice(
    np.arange(1, 13), size=73000, replace=True, p=months_probability
)

In [188]:
# Attach the sampled month numbers to the synthetic rows (assigned by position).
newdf['mes'] = months_list

In [189]:
# Holiday flag weights: 96% of the draws are 0, 4% are 1.
holiday_probability = [0.96, 0.04]

# Draw one holiday flag per synthetic row.
holiday_list = np.random.choice(
    np.arange(2), size=73000, replace=True, p=holiday_probability
)

In [190]:
# Attach the sampled holiday flags to the synthetic rows (assigned by position).
newdf['feriado'] = holiday_list

In [191]:
# Weight of each hour of the day (position i is the weight of hour i); the
# weights sum to 1. Late-night hours are weighted lowest and the 7-9h and
# 17-19h ranges highest.
hours_probability = [
    0.01, 0.02, 0.02, 0.02, 0.02, 0.02,  # hours 0-5
    0.02, 0.06, 0.06, 0.06, 0.05, 0.05,  # hours 6-11
    0.05, 0.05, 0.05, 0.05, 0.05, 0.06,  # hours 12-17
    0.06, 0.06, 0.05, 0.05, 0.05, 0.01,  # hours 18-23
]

# Draw one hour (0..23) per synthetic row.
hours_list = np.random.choice(
    np.arange(24), size=73000, replace=True, p=hours_probability
)

In [192]:
# Attach the sampled hours to the synthetic rows (assigned by position).
newdf['hora'] = hours_list

In [193]:
# Distribution of road classes among the synthetic points (missing values included).
newdf['highway'].value_counts(dropna=False)

highway
motorway    62034
trunk       10966
Name: count, dtype: int64

In [194]:
# Distribution of the one-way tag (missing values included).
newdf['oneway'].value_counts(dropna=False)

oneway
yes     70531
no       2000
None      469
Name: count, dtype: int64

In [195]:
# Distribution of the lane count (missing values included).
newdf['lanes'].value_counts(dropna=False)

lanes
2       51890
3       12755
4        3504
None     2258
5        1428
1         610
6         180
8          84
7          80
10         78
9          34
18         25
14         25
11         18
12         13
21         10
15          4
20          2
16          1
13          1
Name: count, dtype: int64

In [196]:
# Distribution of the speed limit (missing values included).
newdf['maxspeed'].value_counts(dropna=False)

maxspeed
110     25011
120     14464
100     13375
80       6410
90       4813
60       2851
None     2471
50       1839
70       1079
40        673
30          8
20          2
45          2
35          2
Name: count, dtype: int64

In [197]:
# Road attributes that must be present: a synthetic row missing any of them
# is discarded.
columns_dropna = ['oneway', 'lanes', 'maxspeed']

newdf = newdf.dropna(axis=0, how='any', subset=columns_dropna)

In [198]:
# Encode the one-way tag as an integer: 'yes' -> 1, 'no' -> 0.
# The tags are first rewritten as digit strings and then cast explicitly.
# Replacing strings by ints directly relies on pandas silently downcasting the
# object column, which is deprecated and emits a FutureWarning. Values that are
# already numeric pass through unchanged, so re-running the cell is harmless;
# any other non-numeric tag makes the cast fail loudly.
oneway_codes = {'yes': '1', 'no': '0'}

newdf['oneway'] = newdf['oneway'].replace(oneway_codes).astype(int)

  newdf['oneway'] = newdf['oneway'].replace('no', 0)


In [199]:
# Cast the road attributes to integers in a single pass.
newdf = newdf.astype({'oneway': 'int', 'lanes': 'int', 'maxspeed': 'int'})

In [200]:
# Discard rows whose lane count is above 8.
mask = newdf['lanes'].gt(8)

newdf = newdf.loc[~mask]

In [201]:
# Fold the rare speed limits 35 and 45 into 30 and 40 respectively.
to_replace = [35, 45]
value = [30, 40]
speed_recoding = dict(zip(to_replace, value))

newdf['maxspeed'] = newdf['maxspeed'].replace(speed_recoding)

In [202]:
# Distribution of the bridge tag (missing values included).
newdf['bridge'].value_counts(dropna=False)

bridge
None       66743
yes         1225
viaduct      557
Name: count, dtype: int64

In [203]:
# Encode the bridge tag as an integer: missing -> 0, 'yes' / 'viaduct' -> 1.
# The tags are first rewritten as digit strings and then cast explicitly.
# Replacing strings by ints directly relies on pandas silently downcasting the
# object column, which is deprecated and emits a FutureWarning. Values that are
# already numeric pass through unchanged, so re-running the cell is harmless;
# any other non-numeric tag makes the cast fail loudly.
bridge_codes = {'yes': '1', 'viaduct': '1'}

newdf['bridge'] = newdf['bridge'].fillna('0').replace(bridge_codes).astype(int)

  newdf['bridge'] = newdf['bridge'].replace('viaduct', 1)


In [204]:
# Ensure the bridge flag is stored with an integer dtype.
newdf = newdf.astype({'bridge': 'int'})

In [205]:
# Encode the road class as a binary flag: 'motorway' -> 1, 'trunk' -> 0.
# The classes are first rewritten as digit strings and then cast explicitly.
# Replacing strings by ints directly relies on pandas silently downcasting the
# object column, which is deprecated and emits a FutureWarning. Any other road
# class makes the cast fail loudly instead of leaving a stray string in the
# column.
highway_codes = {'motorway': '1', 'trunk': '0'}

newdf['highway'] = newdf['highway'].replace(highway_codes).astype(int)

  newdf['highway'] = newdf['highway'].replace('trunk', 0)


In [206]:
# The encoded road-class flag is named 'motorway', matching the column of the
# accident dataset.
newdf = newdf.rename(columns={'highway': 'motorway'})

In [None]:
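# NOTE: disabled experiment (imputing missing 'maxspeed' values per highway class).
# It cannot run as-is at this point: rows with a null 'maxspeed' were already dropped
# and the 'highway' column has been renamed to 'motorway'.
#highways = newdf['highway'].unique()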

#for highway in highways:

#    counts = newdf.loc[newdf['highway'] == highway, 'maxspeed'].value_counts(normalize=True)
#    maxspeeds = counts.index
#    prob = counts.values

#    maxspeed_list = np.random.choice(
#    a=maxspeeds,
#    size=len(newdf[(newdf['highway'] == highway) & (newdf['maxspeed'].isnull())]),
#    replace=True,
#    p=prob
#    )
#
#    newdf.loc[(newdf['highway'] == highway) & (newdf['maxspeed'].isnull()), 'maxspeed'] = maxspeed_list

In [207]:
# Rebuild the accident GeoDataFrame with the original point geometries (the
# previous gdf had its points replaced by buffers).
accident_points = gpd.points_from_xy(x=df['longitude'], y=df['latitude'])

gdf = gpd.GeoDataFrame(df, geometry=accident_points, crs="EPSG:4326")

In [208]:
# Bring the accident points into the CRS of the synthetic points so the
# nearest-neighbour search and its distances use one coordinate system.
gdf = gdf.to_crs(newdf.crs)

# Each synthetic point inherits the road name and concessionaire of its
# nearest accident; the distance to that accident is kept in
# 'dist_imprecision'.
nearest_accident_attrs = gdf[['geometry', 'logradouro', 'concessionaria']]
datagendf = gpd.sjoin_nearest(
    newdf,
    nearest_accident_attrs,
    how='left',
    distance_col='dist_imprecision',
)

# Keep a single row per synthetic point geometry (first occurrence wins),
# presumably to collapse points matched to several equidistant accidents.
datagendf = datagendf.drop_duplicates(subset=['geometry'])

In [209]:
# Summary of the distance between each synthetic point and the accident it
# borrowed its attributes from.
datagendf['dist_imprecision'].describe()

count    68525.000000
mean       371.576135
std        467.226136
min          0.078350
25%         63.127453
50%        172.234736
75%        452.550016
max       2010.226541
Name: dist_imprecision, dtype: float64

In [210]:
# Number of synthetic points assigned to each road.
datagendf['logradouro'].value_counts()

logradouro
SP 330    15900
SP 270    12820
SP 310    11000
SP 280    10526
SP 070     7566
SP 348     6397
SP 150     3771
SP 123      545
Name: count, dtype: int64

In [211]:
# Number of synthetic points assigned to each concessionaire.
datagendf['concessionaria'].value_counts()

concessionaria
DER                       20782
AUTOBAN                    9472
ECOPISTAS                  7566
NAO DISPONIVEL             7462
ECONOROESTE                5543
CART                       4599
SPVIAS                     2572
EIXOSP - PIPA              1864
ENTREVIAS                  1833
ECOVIAS                    1514
VIAOESTE                   1244
INTERVIAS                  1102
ROTA SOROCABANA            1012
ECOVIAS RAPOSO-CASTELO      962
COLINAS                     595
VIAPAULISTA                 403
Name: count, dtype: int64

In [212]:
# Columns of the accident frame, for comparison with the synthetic frame below.
df.columns

Index(['logradouro', 'latitude', 'longitude', 'concessionaria', 'hora', 'mes',
       'dia_semana', 'indice_severidade', 'feriado', 'motorway', 'oneway',
       'lanes', 'maxspeed', 'bridge', 'target'],
      dtype='object')

In [213]:
# Columns of the synthetic frame, for comparison with the accident frame.
datagendf.columns

Index(['geometry', 'motorway', 'oneway', 'lanes', 'maxspeed', 'bridge',
       'target', 'indice_severidade', 'dia_semana', 'mes', 'feriado', 'hora',
       'index_right', 'logradouro', 'concessionaria', 'dist_imprecision'],
      dtype='object')

In [214]:
# Remove the columns that exist on only one side so both frames end up with
# the same set of columns before they are stacked.
df_drop_col = ['latitude', 'longitude']
datagendf_drop_col = ['index_right', 'geometry', 'dist_imprecision']

df = df.drop(columns=df_drop_col)
datagendf = datagendf.drop(columns=datagendf_drop_col)

In [215]:
# Stack real accidents (target = 1) and synthetic non-accidents (target = 0).
# ignore_index=True gives the result a fresh unique index: the synthetic frame
# carries repeated road-segment labels (it was sampled with replacement) that
# would otherwise also collide with the accident frame's labels.
df = pd.concat([df, datagendf], ignore_index=True)

In [216]:
# One-hot encode the two categorical columns as 0/1 integer indicators.
columns_encode = ['logradouro', 'concessionaria']

df = pd.get_dummies(df, columns=columns_encode, dtype=int)

In [217]:
# Final feature set after one-hot encoding.
df.columns

Index(['hora', 'mes', 'dia_semana', 'indice_severidade', 'feriado', 'motorway',
       'oneway', 'lanes', 'maxspeed', 'bridge', 'target', 'logradouro_SP 070',
       'logradouro_SP 123', 'logradouro_SP 150', 'logradouro_SP 270',
       'logradouro_SP 280', 'logradouro_SP 310', 'logradouro_SP 330',
       'logradouro_SP 348', 'concessionaria_AUTOBAN', 'concessionaria_CART',
       'concessionaria_COLINAS', 'concessionaria_DER',
       'concessionaria_ECONOROESTE', 'concessionaria_ECOPISTAS',
       'concessionaria_ECOVIAS', 'concessionaria_ECOVIAS RAPOSO-CASTELO',
       'concessionaria_EIXOSP - PIPA', 'concessionaria_ENTREVIAS',
       'concessionaria_INTERVIAS', 'concessionaria_NAO DISPONIVEL',
       'concessionaria_ROTA SOROCABANA', 'concessionaria_SPVIAS',
       'concessionaria_VIAOESTE', 'concessionaria_VIAPAULISTA'],
      dtype='object')

In [218]:
# Final size of the modelling dataset (rows, columns).
df.shape

(91099, 35)

In [219]:
# Persist the modelling dataset (without the index) in the current working
# directory.
output_csv = 'data_machine_learning.csv'
df.to_csv(output_csv, index=False)