| Etapa                      | Tecnologia recomendada                              | Justificativa                                           |
| -------------------------- | --------------------------------------------------- | ------------------------------------------------------- |
| Orquestração mensal        | `cron` no App Service ou Azure Functions            | Automação simples                                       |
| Ambiente isolado           | Docker                                              | Facilita dependências (`duckdb`, `pandas`, `azure-sdk`) |
| Extração + Upload Bronze   | Python puro (`requests`, `zipfile`, `DataLake SDK`) | Você já faz isso bem                                    |
| Transformações Silver/Gold | `Pandas` ou `DuckDB`                                | Leves, simples, ideais para CSV                         |
| Deploy                     | Azure App Service + GitHub Actions                  | CI/CD automatizado                                      |


cnes-data-pipeline/
│
├── Dockerfile
├── requirements.txt
├── app.py                  ← ponto de entrada principal
├── extract/
│   └── extract_cnes.py     ← download e bronze upload
├── transform/
│   ├── silver.py           ← limpeza e joins
│   └── gold.py             ← agregações e finais
├── utils/
│   └── azure_datalake.py   ← funções para leitura/escrita
└── cron/
    └── crontab.txt         ← para rodar mensalmente no container


In [1]:
from azure.storage.filedatalake import DataLakeServiceClient
import os

# Conexão com a conta
account_name = "cnesstorage"
account_key = "/ae47eZuE0NGPopxVHEkxOKsQwtEm3qQM0vBRPBRbB5nAW1zO6FPkEO9gwNQwkqExaVhOyHWgb68+AStIau+Uw=="#os.environ["AZURE_STORAGE_KEY"]
file_system_name = "bronze"

# Criar cliente
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key
)

file_system_client = service_client.get_file_system_client(file_system_name)

# Exibir apenas os arquivos de um mês

for path in file_system_client.get_paths(path="202406"):
    if path.name.endswith(".csv"):
        print(path.name)



202406/rlAdmGerenciaCnes202406.csv
202406/rlAtividadeObrigatoria202406.csv
202406/rlCooperativa202406.csv
202406/rlEquipeAldeia202406.csv
202406/rlEquipeNasfEsf202406.csv
202406/rlEstabAtenPsico202406.csv
202406/rlEstabAtendPrestConv202406.csv
202406/rlEstabAvaliacao202406.csv
202406/rlEstabCentralReg202406.csv
202406/rlEstabColetaSelRejeito202406.csv
202406/rlEstabComissaoOutro202406.csv
202406/rlEstabComplementar202406.csv
202406/rlEstabEndCompl202406.csv
202406/rlEstabEqpEmbarcacao202406.csv
202406/rlEstabEqpUnidApoio202406.csv
202406/rlEstabEquipamento202406.csv
202406/rlEstabEquipeMun202406.csv
202406/rlEstabEquipeProf202406.csv
202406/rlEstabInstFisiAssist202406.csv
202406/rlEstabOrgParc202406.csv
202406/rlEstabPoloAldeia202406.csv
202406/rlEstabProfComissao202406.csv
202406/rlEstabProgFundo202406.csv
202406/rlEstabRegimeRes202406.csv
202406/rlEstabRepresentante202406.csv
202406/rlEstabSamu202406.csv
202406/rlEstabServClass202406.csv
202406/rlEstabServicoApoio202406.csv
202406/rl

In [None]:
# ==== UPLOAD DE CSVs PARA O DATA LAKE ====
for root, _, files in os.walk(local_folder):
    for file_name in files:
        if file_name.lower().endswith(".csv"):
            local_path = os.path.join(root, file_name)
            blob_path = f"{target_dir}/{file_name}"

            print(f"Enviando {file_name} para abfss://{file_system_name}@{account_name}.dfs.core.windows.net/{blob_path}")

            file_client = file_system_client.get_file_client(blob_path)
            with open(local_path, "rb") as data:
                file_client.upload_data(
    data,
    overwrite=True,
    max_concurrency=4,            # número de uploads paralelos
    chunk_size=4 * 1024 * 1024    # 4 MB por chunk (ajustável)
)

print("Upload finalizado com DataLakeServiceClient.")

In [None]:
import pandas as pd
from io import BytesIO

# Nome do arquivo (coloque o que você viu acima)
remote_path = "202406/rlEstabAtenPsico202406.csv"

# Cliente do arquivo
file_client = file_system_client.get_file_client(remote_path)

# Download em memória
download = file_client.download_file()
downloaded_bytes = download.readall()

# Ler com pandas
df = pd.read_csv(BytesIO(downloaded_bytes), sep=';', encoding='latin1')  # ou ajuste para ',' e 'utf-8' se necessário
df.head()


Unnamed: 0,CO_UNIDADE,TP_ESTRUTURA,ST_PARCERIA_ONG,NU_CNPJ_ONG,NU_VAGAS_ACOL_NOTUR,CO_PROFISSIONAL_SUS,CO_CBO,TP_SUS_NAO_SUS,IND_VINCULACAO,CO_CNES_REFERENCIA,ST_UNIDADE_REGIONAL,"TO_CHAR(DT_ATUALIZACAO,'DD/MM/YYYY')",CO_USUARIO,"TO_CHAR(DT_ATUALIZACAO_ORIGEM,'DD/MM/YYYY')"
0,1702107566700,1,N,,1,51652EB48CFA6A6C,131210,S,10301,2600536,N,01/07/2024,SCNES,
1,4304603731278,0,S,7836454000000.0,10,8C22FD5A31AEBF04,142105,S,80400,3508528,N,23/02/2024,NOVOSTEMPOS,
2,4304606691757,0,S,7836454000000.0,10,CCBD98DBBCF6F484,131210,S,80400,3508528,N,01/07/2024,MARGANE,
3,1100115054141,1,N,,0,14A00EB01AF8421A,131210,S,10405,2808609,N,11/06/2024,SEMSAU,
4,4311202243970,0,N,,0,55E8F61DDC47D816,251510,S,10101,2244306,N,02/01/2024,SAUDE,


In [None]:
import pandas as pd
from io import BytesIO

def build_nulls_summary_df(file_system_client, base_path, max_files=None):
    """
    Constrói um DataFrame com o nome do arquivo, colunas, tipos e % de nulos.

    Args:
        file_system_client: cliente do container (DataLake).
        base_path: diretório dentro do container (ex: "202406").
        max_files: máximo de arquivos para processar (opcional).
        
    Returns:
        DataFrame com colunas: ['arquivo', 'coluna', 'tipo', 'porcentagem_nulos']
    """
    summary_rows = []
    paths = file_system_client.get_paths(path=base_path)
    count = 0

    for path in paths:
        print(f"Checking {path}")
        if not path.name.endswith(".csv"):
            continue

        try:
            file_client = file_system_client.get_file_client(path.name)
            content = file_client.download_file().readall()
            df = pd.read_csv(BytesIO(content), sep=';', encoding='latin1')

            if df.empty:
                continue

            nulls_pct = df.isnull().mean() * 100

            for col in df.columns:
                summary_rows.append({
                    "arquivo": path.name,
                    "coluna": col,
                    "tipo": str(df[col].dtype),
                    "porcentagem_nulos": round(nulls_pct[col], 2)
                })

        except Exception as e:
            print(f"❌ Erro ao processar {path.name}: {e}")

        count += 1
        if max_files and count >= max_files:
            break

    return pd.DataFrame(summary_rows)


In [None]:
df_sanity = build_nulls_summary_df(file_system_client, "202406")
df_sanity.to_csv("./local_storage/exploring/sanity_cnes.csv", index=False)

KeyboardInterrupt: 

## Métricas
- Relação profissionais / população
- Leitos hospitalares por habitante
- Cobertura da Atenção Básica -> Proporção da população vinculada a equipes de saúde da família

OBS: quebrar escopo para SP, olhar 

In [None]:
import pandas as pd
import pandasql as ps

df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/exploration/output/cnes_202503_sanity_summary.csv", sep=',') 
query = "SELECT * from df"

resultado = ps.sqldf(query, locals())
print(resultado)

ModuleNotFoundError: No module named 'pandasql'

In [6]:
df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/exploration/output/cnes_202503_missingness_by_column.csv", sep=',') 
df.sort_values(by='column_name', ascending=False).head(15)

Unnamed: 0,file_name,column_name,missing_pct,sep,encoding
417,tbSubTpModVinculo202503.csv,TP_VINCULO,0.0,;,latin-1
264,tbEstabelecimento202503.csv,TP_UNIDADE,0.0,;,latin-1
182,tbCargaHorariaSus202503.csv,TP_TERCEIRO_SIH,31.6,;,latin-1
390,tbResidenciaMed202503.csv,TP_SUS_NAO_SUS,0.0,;,latin-1
180,tbCargaHorariaSus202503.csv,TP_SUS_NAO_SUS,0.0,;,latin-1
418,tbSubTpModVinculo202503.csv,TP_SUBVINCULO,0.0,;,latin-1
384,tbResidenciaMed202503.csv,TP_SRT,7.59,;,latin-1
188,tbCargaHorariaSus202503.csv,TP_RESIDENTE,0.0,;,latin-1
187,tbCargaHorariaSus202503.csv,TP_PRECEPTOR,0.0,;,latin-1
239,tbEstabelecimento202503.csv,TP_PFPJ,0.0,;,latin-1


In [7]:
import pandas as pd

df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv", sep=',') 
df.head()

  df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv", sep=',')


Unnamed: 0,CO_UNIDADE,CO_PROFISSIONAL_SUS,NO_PROFISSIONAL,CO_CBO,TP_SUS_NAO_SUS,DS_ATIVIDADE_PROFISSIONAL,NO_FANTASIA,NO_BAIRRO,NO_MUNICIPIO,CO_MUNICIPIO,CO_SIGLA_ESTADO,CO_CEP,ds_localidade,SK_REGISTRO,DATA_INGESTAO
0,SP00003509205000001329730000101,9D27061F6644A854,MARTHA TIDORI KIOTA KOTSUBO,225320,S,MEDICO EM RADIOLOGIA E DIAGNOSTICO POR IMAGEM,CLINICA MEDICA SULLA PELLE S C LTDA,ACLIMACAO,SAO PAULO,355030,SP,1530000,"01530000,SAO PAULO,SP,Brasil",SP00003509205000001329730000101_9D27061F6644A8...,2025-08-20
1,SP00003509205000001329730000101,F3575C9617F8998A,WLADMIR GUBEISSI PINTO FILHO,225250,S,MEDICO GINECOLOGISTA E OBSTETRA,CLINICA MEDICA SULLA PELLE S C LTDA,ACLIMACAO,SAO PAULO,355030,SP,1530000,"01530000,SAO PAULO,SP,Brasil",SP00003509205000001329730000101_F3575C9617F899...,2025-08-20
2,3500100047406,F6965A7E959C6A39,GRAZIELE DAVID,251510,N,PSICOLOGO CLINICO,GRAZIELE DAVID PSICOLOGA EIRELI,CENTRO,ADAMANTINA,350010,SP,17800037,"17800037,ADAMANTINA,SP,Brasil",3500100047406_F6965A7E959C6A39_251510,2025-08-20
3,3500100081655,1FC46A4EA7312E5B,KELLY PRESTES RUFINO,223605,N,FISIOTERAPEUTA GERAL,RUFINO PRESTES LTDA,CENTRO,ADAMANTINA,350010,SP,17800057,"17800057,ADAMANTINA,SP,Brasil",3500100081655_1FC46A4EA7312E5B_223605,2025-08-20
4,3500100109789,EDEB3090A41EB3A5,JOANA DARC BORRO,251510,N,PSICOLOGO CLINICO,JOANA DARC BORRO,VILA CICMA,ADAMANTINA,350010,SP,17803116,"17803116,ADAMANTINA,SP,Brasil",3500100109789_EDEB3090A41EB3A5_251510,2025-08-20


In [8]:
df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/servicos_202504.csv", sep=',') 
df.head()

  df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/servicos_202504.csv", sep=',')


Unnamed: 0,CO_UNIDADE,NO_MUNICIPIO,CO_MUNICIPIO,CO_SERVICO,CO_CLASSIFICACAO,DS_CLASSIFICACAO_SERVICO,SK_REGISTRO,DATA_INGESTAO
0,3500100251402,ADAMANTINA,350010,112,1,ACOMPANHAMENTO DO PRE-NATAL DE RISCO HABITUAL,3500100251402_112_001,2025-08-20
1,3500100251402,ADAMANTINA,350010,122,3,EXAME ELETROCARDIOGRAFICO,3500100251402_122_003,2025-08-20
2,3500100251402,ADAMANTINA,350010,159,4,ESTRATEGIA DE SAUDE DA FAMILIA,3500100251402_159_004,2025-08-20
3,3500100251402,ADAMANTINA,350010,174,1,INDIVIDUOS EM GERAL,3500100251402_174_001,2025-08-20
4,3500100853437,ADAMANTINA,350010,121,1,RADIOLOGIA,3500100853437_121_001,2025-08-20
