### CVM Administradores de Carteira: Informação Cadastral

Limpeza realizada por Ricardo Dahis  
Dados originais obtidos em [Dados Abertos CVM](http://dados.cvm.gov.br/dataset/adm_cart-cad)   
E os dados processados se encontram em [Base dos Dados](https://basedosdados.org/dataset/br-cvm-administradores-carteira)

### Prefácio

In [15]:
import os
import sys
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import log

In [16]:
import shutil
from pathlib import Path
from typing import Union

import requests
import pandas as pd
import basedosdados as bd

In [17]:
# Local working directory: receives the downloaded archive, its extracted
# files and the cleaned CSVs written by the cleaning functions.
root = "/tmp/basedosdados"
# Address of the zip archive that crawl() downloads.
url = "http://dados.cvm.gov.br/dados/ADM_CART/CAD/DADOS/cad_adm_cart.zip"

### Download

In [20]:
def crawl(root: str, url: str) -> None:
    """Download and unzip dataset br_cvm_administradores_carteira.

    Args:
        root: Directory that receives ``data.zip`` and the extracted files.
            It is created when it does not exist yet.
        url: Address of the zip archive to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filepath = f"{root}/data.zip"
    os.makedirs(root, exist_ok=True)

    # Check the status before writing anything, so an HTTP error page is
    # never saved and unpacked as if it were the archive.
    response = requests.get(url)
    response.raise_for_status()

    with open(filepath, "wb") as file:
        file.write(response.content)

    # Unpack only after the file has been closed: while the handle is open
    # the last bytes may still sit in the write buffer, and the zip index
    # lives at the end of the file.
    shutil.unpack_archive(filepath, extract_dir=root)

# Download the archive and extract its CSV files into `root`.
crawl(root, url)

### Limpeza

In [21]:
def clean_table_responsavel(root: str) -> str:
    """Clean table responsavel.

    Reads ``cad_adm_cart_resp.csv`` from ``root``, renames its three columns
    and strips the punctuation (``.``, ``/``, ``-``) from the ``cnpj`` column.

    Args:
        root: Directory holding the extracted source CSV; the cleaned CSV is
            written to the same directory.

    Returns:
        Path of the cleaned file, ``{root}/bd_responsavel.csv``.

    Raises:
        ValueError: If the source file does not have exactly three columns.
    """
    in_filepath = f"{root}/cad_adm_cart_resp.csv"
    ou_filepath = f"{root}/bd_responsavel.csv"

    # Everything is read as text and empty cells stay empty strings, so
    # identifiers keep their leading zeros and no "NaN" leaks into the output.
    df = pd.read_csv(
        in_filepath,
        sep=";",
        keep_default_na=False,
        encoding="latin1",
        dtype=object,
    )

    df.columns = [
        "cnpj",
        "nome",
        "tipo",
    ]

    # The default of `regex` in Series.str.replace changed between pandas
    # versions; stating it explicitly keeps "." from ever being read as the
    # "any character" wildcard. One pass removes all three separators.
    df["cnpj"] = df["cnpj"].str.replace(r"[./-]", "", regex=True)

    df.to_csv(ou_filepath, index=False)

    return ou_filepath

# Clean the "responsavel" table and keep the output path for the upload step.
re_filepath = clean_table_responsavel(root)

In [22]:
def clean_table_pessoa_fisica(root: str) -> str:
    """Clean table pessoa_fisica.

    Reads ``cad_adm_cart_pf.csv`` from ``root``, replaces its header with the
    standardized column names and writes the result next to it.

    Args:
        root: Directory holding the extracted source CSV; the cleaned CSV is
            written to the same directory.

    Returns:
        Path of the cleaned file, ``{root}/bd_pessoa_fisica.csv``.
    """
    standardized_names = [
        "nome",
        "data_registro",
        "data_cancelamento",
        "motivo_cancelamento",
        "situacao",
        "data_inicio_situacao",
        "categoria_registro",
    ]
    source_csv = f"{root}/cad_adm_cart_pf.csv"
    target_csv = f"{root}/bd_pessoa_fisica.csv"

    # Read every cell as text and keep empty cells as empty strings.
    reader_options = {
        "sep": ";",
        "keep_default_na": False,
        "encoding": "latin1",
        "dtype": object,
    }
    frame = pd.read_csv(source_csv, **reader_options)

    frame.columns = standardized_names
    frame.to_csv(target_csv, index=False)
    return target_csv

# Clean the "pessoa_fisica" table and keep the output path for the upload step.
pf_filepath = clean_table_pessoa_fisica(root)

In [23]:
def clean_table_pessoa_juridica(root: str) -> str:
    """Clean table pessoa_juridica.

    Reads ``cad_adm_cart_pj.csv`` from ``root``, replaces its header with the
    standardized column names and strips the punctuation (``.``, ``/``,
    ``-``) from the ``cnpj`` column.

    Args:
        root: Directory holding the extracted source CSV; the cleaned CSV is
            written to the same directory.

    Returns:
        Path of the cleaned file, ``{root}/bd_pessoa_juridica.csv``.

    Raises:
        ValueError: If the source file does not have exactly 24 columns.
    """
    in_filepath = f"{root}/cad_adm_cart_pj.csv"
    ou_filepath = f"{root}/bd_pessoa_juridica.csv"

    # Everything is read as text and empty cells stay empty strings, so
    # identifiers keep their leading zeros and no "NaN" leaks into the output.
    df = pd.read_csv(
        in_filepath,
        sep=";",
        keep_default_na=False,
        encoding="latin1",
        dtype=object,
    )

    df.columns = [
        "cnpj",
        "denominacao_social",
        "denominacao_comercial",
        "data_registro",
        "data_cancelamento",
        "motivo_cancelamento",
        "situacao",
        "data_inicio_situacao",
        "categoria_registro",
        "subcategoria_registro",
        "controle_acionario",
        "tipo_endereco",
        "logradouro",
        "complemento",
        "bairro",
        "municipio",
        "sigla_uf",
        "cep",
        "ddd",
        "telefone",
        "valor_patrimonial_liquido",
        "data_patrimonio_liquido",
        "email",
        "website",
    ]

    # The default of `regex` in Series.str.replace changed between pandas
    # versions; stating it explicitly keeps "." from ever being read as the
    # "any character" wildcard. One pass removes all three separators.
    df["cnpj"] = df["cnpj"].str.replace(r"[./-]", "", regex=True)

    df.to_csv(ou_filepath, index=False)

    return ou_filepath

# Clean the "pessoa_juridica" table and keep the output path for the upload step.
pj_filepath = clean_table_pessoa_juridica(root)

### Upload
Adaptado do código em `basedosdados_template` por Diego Oliveira

In [None]:
def upload_to_gcs(dataset_id: str, table_id: str, path: Union[str, Path]) -> None:
    """Upload a CSV to an existing staging table through the basedosdados library.

    When the table does not exist in staging, nothing is uploaded and a
    message explaining how to proceed is logged instead.

    Args:
        dataset_id: Dataset identifier handed to ``bd.Table``.
        table_id: Table identifier handed to ``bd.Table``.
        path: File to upload, forwarded as ``filepath`` to ``Table.append``.
    """
    table = bd.Table(dataset_id=dataset_id, table_id=table_id)

    # Guard clause: without a staging table there is nothing to append to.
    if not table.table_exists(mode="staging"):
        log(("Table does not exist in STAGING, need to create it in local first.\n"
             "Create and publish the table in BigQuery first."))
        return

    table.append(filepath=path, if_exists="replace")

    destination = f"{table.bucket_name}.staging.{dataset_id}.{table_id}"
    log(f"Successfully uploaded {path} to {destination}")

In [None]:
# Upload each cleaned CSV to its table in the br_cvm_administradores_carteira dataset.
upload_to_gcs("br_cvm_administradores_carteira", "responsavel", re_filepath)
upload_to_gcs("br_cvm_administradores_carteira", "pessoa_fisica", pf_filepath)
upload_to_gcs("br_cvm_administradores_carteira", "pessoa_juridica", pj_filepath)

### Epílogo

In [None]:
# Delete the working directory together with the downloaded and cleaned files.
shutil.rmtree(root)