In [1]:
import pandas as pd
import geopandas as gpd
import vaex
import glob
import gc
import metadata
import requests
import datetime
# import concurrent.futures
from multiprocessing import Pool

In [2]:
# CNAE activity codes treated as food-related establishments in this notebook.
# Taken from the study "Estudo_Técnico_Mapeamento_Desertos_Alimentares":
# https://aplicacoes.mds.gov.br/sagirmps/noticias/arquivos/files/Estudo_T%C3%A9cnico_Mapeamento_Desertos_Alimentares_Final_30_10_2019(1).pdf
# The codes are kept as 7-character strings, one list entry per code.
CNAES_alimentacao = """
    4711301 4711302 4712100
    4721102 4721103 4721104
    4722901 4722902 4724500
    4729602 4729699
    5611201 5611202 5611203
    5612100
    5620103 5620104
""".split()

In [3]:
df_estabelecimentos = vaex.open('data/brasil_cnpj.hdf5')

In [4]:
df_estabelecimentos.shape

(55813191, 30)

In [5]:
df_estabelecimentos.logradouro_tipo.value_counts()

missing                   916949
RUA                     36640622
AVENIDA                 10430240
ESTRADA                  1121829
RODOVIA                   906033
                          ...   
ESCADA DE PEDRA                1
5 PARALELA                     1
T�NEL                          1
5  ALTO                        1
PASSAGEM SUBTERR�NEA           1
Length: 398, dtype: int64

In [6]:
# Municipality lookup table: a ';'-separated CSV stored inside a zip archive.
# Column names are supplied here, and bytes that cannot be decoded are
# replaced instead of raising.
df_municipios = vaex.read_csv(
    'data/Municipios.zip',
    names=['cd_municipio', 'nome_municipio'],
    sep=';',
    compression='zip',
    encoding_errors='replace',
)

In [7]:
# TODO: fill in the municipality.
# String version of the municipality code, left-padded with '0' to 4 characters.
df_municipios['cd_municipio_str'] = (
    df_municipios['cd_municipio']
    .to_string()
    .str.pad(4, side='left', fillchar='0')
)

In [8]:
df_estabelecimentos.shape

(55813191, 30)

In [9]:
# Bring the municipality columns into the establishments frame by matching the
# establishment's municipality code against the zero-padded lookup code.
df_estabelecimentos = df_estabelecimentos.join(
    df_municipios,
    left_on='logradouro_cod_municipio',
    right_on='cd_municipio_str',
)

In [10]:
# Keep only establishments whose primary CNAE is in CNAES_alimentacao.
df_alimentacao_br = df_estabelecimentos[
    df_estabelecimentos['cnae_principal'].isin(CNAES_alimentacao)
]

In [14]:
# Registration status codes (situacao_cadastral):
#   01 - NULA      (null)
#   02 - ATIVA     (active)
#   03 - SUSPENSA  (suspended)
#   04 - INAPTA    (unfit)
#   08 - BAIXADA   (closed)
df_alimentacao_br['situacao_cadastral'].value_counts()

08    3657328
02    2440648
04    1273923
03      28410
01      11452
dtype: int64

In [15]:
# Boolean expression: registration status equals '02' (ATIVA / active).
ativos = df_alimentacao_br['situacao_cadastral'] == '02'

In [17]:
df_alimentacao_br = df_alimentacao_br[ativos]

In [18]:
df_alimentacao_br.shape

(2440648, 33)

In [19]:
# Single-line address used as geocoder input:
# "<street type> <street name>, <number>, <municipality name>, <state>".
df_alimentacao_br['endereco'] = (
    df_alimentacao_br['logradouro_tipo']
    + ' ' + df_alimentacao_br['logradouro_nome']
    + ', ' + df_alimentacao_br['logradouro_numero']
    + ', ' + df_alimentacao_br['nome_municipio']
    + ', ' + df_alimentacao_br['logradouro_uf']
)

In [20]:
# CNPJ identifier assembled from its three parts as "<basico>/<ordem>-<dv>".
df_alimentacao_br['cnpj'] = (
    df_alimentacao_br['cnpj_basico']
    + '/' + df_alimentacao_br['cnpj_ordem']
    + '-' + df_alimentacao_br['cnpj_dv']
)

In [21]:
# Keep only rows whose assembled address is longer than 5 characters.
df_alimentacao_br = df_alimentacao_br[
    df_alimentacao_br['endereco'].str.len() > 5
]

In [22]:
# df_alimentacao_br_sp = df_alimentacao_br[df_alimentacao_br.logradouro_cod_municipio == '7107']

In [23]:
# df_alimentacao_br.shape

In [24]:
def locate(cnpj, endereco, cnae_principal, cnaes_segundarios,
           url='http://localhost:4000/v1/search', timeout=30):
    """Geocode one address and return its best match with the caller's identifiers.

    Sends ``endereco`` as the ``text`` query parameter (together with
    ``size=1``) to the search endpoint at ``url`` and reads the first entry of
    the ``features`` list in the JSON response.

    Args:
        cnpj: Identifier returned unchanged as the first element of the result.
        endereco: Address text to geocode.
        cnae_principal: Returned unchanged in the result.
        cnaes_segundarios: Returned unchanged in the result.
        url: Search endpoint to query. Defaults to the local service the
            notebook was written against.
        timeout: Seconds passed to ``requests.get`` so that a stalled service
            raises instead of blocking the calling worker indefinitely.

    Returns:
        ``(cnpj, long, lat, confidence, cnae_principal, cnaes_segundarios)``
        for the first feature, or ``None`` when ``features`` is empty.

    Raises:
        Exceptions from ``requests.get`` (including timeouts) and from
        decoding or indexing the response are not caught here.
    """
    params = {
        'size': '1',
        'text': endereco,
    }

    response = requests.get(url=url, params=params, timeout=timeout)

    # Decode the body once; every field below is read from the same payload.
    features = response.json()['features']
    if not features:
        return None

    best = features[0]
    coordinates = best['geometry']['coordinates']
    long = coordinates[0]
    lat = coordinates[1]
    confidence = best['properties']['confidence']
    return cnpj, long, lat, confidence, cnae_principal, cnaes_segundarios
    

In [25]:
# Preview, for the first five rows, the tuples that will be passed to locate().
list(zip(*(
    df_alimentacao_br[:5][coluna].tolist()
    for coluna in ('cnpj', 'endereco', 'cnae_principal', 'cnae_secundario')
)))

[('61901229/0001-91', 'RUA PANTOJO, 1026, SAO PAULO, SP', '4729699', None),
 ('41273749/0001-90', 'RUA 54, 2270, TERESINA, PI', '4712100', None),
 ('82225053/0001-75', 'RUA SAMUEL CEZAR, 1100, CURITIBA, PR', '4711301', None),
 ('82237199/0001-30',
  'AVENIDA LUCILIO DE HELD, 1045, MARINGA, PR',
  '5611203',
  None),
 ('62240486/0001-92',
  'RUA CEL BENTO BICUDO, 1191, SAO PAULO, SP',
  '4722901',
  None)]

In [26]:
cnpj, long, lat, confidence, cnae_principal, cnae_secundario = locate('61745626/0001-11', 'RUA SAO POMPONIO, 298, SAO PAULO, SP', '4721102', None)

In [27]:
cnpj, long, lat, confidence

('61745626/0001-11', -46.576816, -23.570642, 1)

In [28]:
# Geocode every row with 12 worker processes. Each row's
# (cnpj, endereco, cnae_principal, cnae_secundario) is unpacked into locate(),
# and `results` holds one entry per row in input order.
with Pool(12) as pool:
    results = pool.starmap(
        locate,
        zip(*(
            df_alimentacao_br[coluna].tolist()
            for coluna in ('cnpj', 'endereco', 'cnae_principal', 'cnae_secundario')
        )),
    )

In [29]:
# Split the geocoding results into one list per output field. The accumulators
# are declared in the same order as the fields of the tuple returned by
# locate(): (cnpj, long, lat, confidence, cnae_principal, cnae_secundario).
cnpjs, longs, lats, confidences, cnaes_principal, cnaes_secundario = [], [], [], [], [], []

for result in results:
    # locate() returns None when the geocoder found no match; only tuples
    # carry data. isinstance is the idiomatic type check.
    if isinstance(result, tuple):
        cnpjs.append(result[0])
        longs.append(result[1])
        lats.append(result[2])
        confidences.append(result[3])
        cnaes_principal.append(result[4])
        cnaes_secundario.append(result[5])

In [30]:
len(cnpjs), len(longs), len(lats), len(confidences), len(cnaes_principal), len(cnaes_secundario)

(2425121, 2425121, 2425121, 2425121, 2425121, 2425121)

In [31]:
# Assemble the per-field lists into a vaex frame, one column per field.
df_geocoded = vaex.from_dict(dict(
    cnpj=cnpjs,
    long=longs,
    lat=lats,
    confidence=confidences,
    cnae_principal=cnaes_principal,
    cnae_secundario=cnaes_secundario,
))

In [32]:
df_geocoded.dtypes

cnpj                string
long               float64
lat                float64
confidence         float64
cnae_principal      string
cnae_secundario     string
dtype: object

In [33]:
df_geocoded.shape

(2425121, 6)

In [34]:
df_geocoded.confidence.value_counts()

0.8    909265
0.6    884897
1.0    628978
0.4      1926
0.3        55
dtype: int64

In [35]:
# Build a point GeoDataFrame (EPSG:4326) from the geocoded coordinates.
# The vaex frame is converted to pandas once and reused for both the attribute
# table and the geometry, instead of being materialised three times.
# NOTE(review): the `_sampa` suffix looks like a leftover, since the output
# file written from this frame is named "..._BR_all". The name is kept because
# later cells reference it.
gdf_estabelecimentos_alimentacao_sampa = df_geocoded.to_pandas_df().pipe(
    lambda pdf: gpd.GeoDataFrame(
        pdf,
        geometry=gpd.points_from_xy(pdf['long'], pdf['lat']),
        crs="EPSG:4326",
    )
)

In [36]:
gdf_estabelecimentos_alimentacao_sampa.to_file('results/estabelecimentos_alimentares_BR_all.gpkg', driver='GPKG')

In [37]:
gdf_estabelecimentos_alimentacao_sampa.shape

(2425121, 7)