| Etapa                      | Tecnologia recomendada                              | Justificativa                                           |
| -------------------------- | --------------------------------------------------- | ------------------------------------------------------- |
| Orquestração mensal        | `cron` no App Service ou Azure Functions            | Automação simples                                       |
| Ambiente isolado           | Docker                                              | Facilita dependências (`duckdb`, `pandas`, `azure-sdk`) |
| Extração + Upload Bronze   | Python puro (`requests`, `zipfile`, `DataLake SDK`) | Você já faz isso bem                                    |
| Transformações Silver/Gold | `Pandas` ou `DuckDB`                                | Leves, simples, ideais para CSV                         |
| Deploy                     | Azure App Service + GitHub Actions                  | CI/CD automatizado                                      |


cnes-data-pipeline/
│
├── Dockerfile
├── requirements.txt
├── app.py                  ← ponto de entrada principal
├── extract/
│   └── extract_cnes.py     ← download e bronze upload
├── transform/
│   ├── silver.py           ← limpeza e joins
│   └── gold.py             ← agregações e finais
├── utils/
│   └── azure_datalake.py   ← funções para leitura/escrita
└── cron/
    └── crontab.txt         ← para rodar mensalmente no container


In [1]:
from azure.storage.filedatalake import DataLakeServiceClient
import os

# Conexão com a conta
account_name = "cnesstorage"
account_key = "/ae47eZuE0NGPopxVHEkxOKsQwtEm3qQM0vBRPBRbB5nAW1zO6FPkEO9gwNQwkqExaVhOyHWgb68+AStIau+Uw=="#os.environ["AZURE_STORAGE_KEY"]
file_system_name = "bronze"

# Criar cliente
service_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key
)

file_system_client = service_client.get_file_system_client(file_system_name)

# Exibir apenas os arquivos de um mês

for path in file_system_client.get_paths(path="202406"):
    if path.name.endswith(".csv"):
        print(path.name)



202406/rlAdmGerenciaCnes202406.csv
202406/rlAtividadeObrigatoria202406.csv
202406/rlCooperativa202406.csv
202406/rlEquipeAldeia202406.csv
202406/rlEquipeNasfEsf202406.csv
202406/rlEstabAtenPsico202406.csv
202406/rlEstabAtendPrestConv202406.csv
202406/rlEstabAvaliacao202406.csv
202406/rlEstabCentralReg202406.csv
202406/rlEstabColetaSelRejeito202406.csv
202406/rlEstabComissaoOutro202406.csv
202406/rlEstabComplementar202406.csv
202406/rlEstabEndCompl202406.csv
202406/rlEstabEqpEmbarcacao202406.csv
202406/rlEstabEqpUnidApoio202406.csv
202406/rlEstabEquipamento202406.csv
202406/rlEstabEquipeMun202406.csv
202406/rlEstabEquipeProf202406.csv
202406/rlEstabInstFisiAssist202406.csv
202406/rlEstabOrgParc202406.csv
202406/rlEstabPoloAldeia202406.csv
202406/rlEstabProfComissao202406.csv
202406/rlEstabProgFundo202406.csv
202406/rlEstabRegimeRes202406.csv
202406/rlEstabRepresentante202406.csv
202406/rlEstabSamu202406.csv
202406/rlEstabServClass202406.csv
202406/rlEstabServicoApoio202406.csv
202406/rl

In [None]:
# ==== UPLOAD DE CSVs PARA O DATA LAKE ====
for root, _, files in os.walk(local_folder):
    for file_name in files:
        if file_name.lower().endswith(".csv"):
            local_path = os.path.join(root, file_name)
            blob_path = f"{target_dir}/{file_name}"

            print(f"Enviando {file_name} para abfss://{file_system_name}@{account_name}.dfs.core.windows.net/{blob_path}")

            file_client = file_system_client.get_file_client(blob_path)
            with open(local_path, "rb") as data:
                file_client.upload_data(
    data,
    overwrite=True,
    max_concurrency=4,            # número de uploads paralelos
    chunk_size=4 * 1024 * 1024    # 4 MB por chunk (ajustável)
)

print("Upload finalizado com DataLakeServiceClient.")

In [None]:
import pandas as pd
from io import BytesIO

# Nome do arquivo (coloque o que você viu acima)
remote_path = "202406/rlEstabAtenPsico202406.csv"

# Cliente do arquivo
file_client = file_system_client.get_file_client(remote_path)

# Download em memória
download = file_client.download_file()
downloaded_bytes = download.readall()

# Ler com pandas
df = pd.read_csv(BytesIO(downloaded_bytes), sep=';', encoding='latin1')  # ou ajuste para ',' e 'utf-8' se necessário
df.head()


Unnamed: 0,CO_UNIDADE,TP_ESTRUTURA,ST_PARCERIA_ONG,NU_CNPJ_ONG,NU_VAGAS_ACOL_NOTUR,CO_PROFISSIONAL_SUS,CO_CBO,TP_SUS_NAO_SUS,IND_VINCULACAO,CO_CNES_REFERENCIA,ST_UNIDADE_REGIONAL,"TO_CHAR(DT_ATUALIZACAO,'DD/MM/YYYY')",CO_USUARIO,"TO_CHAR(DT_ATUALIZACAO_ORIGEM,'DD/MM/YYYY')"
0,1702107566700,1,N,,1,51652EB48CFA6A6C,131210,S,10301,2600536,N,01/07/2024,SCNES,
1,4304603731278,0,S,7836454000000.0,10,8C22FD5A31AEBF04,142105,S,80400,3508528,N,23/02/2024,NOVOSTEMPOS,
2,4304606691757,0,S,7836454000000.0,10,CCBD98DBBCF6F484,131210,S,80400,3508528,N,01/07/2024,MARGANE,
3,1100115054141,1,N,,0,14A00EB01AF8421A,131210,S,10405,2808609,N,11/06/2024,SEMSAU,
4,4311202243970,0,N,,0,55E8F61DDC47D816,251510,S,10101,2244306,N,02/01/2024,SAUDE,


# População

In [None]:
import re

year_cols = [c for c in df_pop.columns if re.fullmatch(r"\d{4}", str(c))]

df_pop_long = df_pop.melt(
    id_vars=["CO_MUNICIPIO", "NO_MUNICIPIO"],
    value_vars=year_cols,
    var_name="YYYY",
    value_name="POPULACAO"
)
print(df_pop_long.query("NO_MUNICIPIO == 'Sorocaba'"))



import pandas as pd
import numpy as np

# garante tipos
df = df_pop_long.copy()
df["YYYY"] = df["YYYY"].astype(int)
df["POPULACAO"] = pd.to_numeric(df["POPULACAO"], errors="coerce")
df["IMPUTED"] = False

# reindexa anos ausentes por município
def impute_geom(g):
    years = range(g["YYYY"].min(), g["YYYY"].max() + 1)
    g = g.set_index("YYYY").reindex(years)
    g["CO_MUNICIPIO"] = g["CO_MUNICIPIO"].ffill().bfill()
    g["NO_MUNICIPIO"] = g["NO_MUNICIPIO"].ffill().bfill()

    # interpolação geométrica: interpola log e depois exponencia
    mask_missing = g["POPULACAO"].isna()
    if mask_missing.any():
        y = np.log(g["POPULACAO"])
        y_interp = y.interpolate(method="linear", limit_direction="both")
        g.loc[mask_missing, "POPULACAO"] = np.exp(y_interp[mask_missing])
        g.loc[mask_missing, "IMPUTED"] = True
    else:
        g["IMPUTED"] = False

    g = g.reset_index().rename(columns={"index": "YYYY"})
    return g

df_populacao = (
    df.sort_values(["CO_MUNICIPIO", "YYYY"])
      .groupby(["CO_MUNICIPIO", "NO_MUNICIPIO"], group_keys=False)
      .apply(impute_geom)
      .astype({"YYYY": int})
)

df_populacao.to_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/populacao_imputada.csv", index=False)

In [None]:
import pandas as pd
from io import BytesIO

def build_nulls_summary_df(file_system_client, base_path, max_files=None):
    """
    Constrói um DataFrame com o nome do arquivo, colunas, tipos e % de nulos.

    Args:
        file_system_client: cliente do container (DataLake).
        base_path: diretório dentro do container (ex: "202406").
        max_files: máximo de arquivos para processar (opcional).
        
    Returns:
        DataFrame com colunas: ['arquivo', 'coluna', 'tipo', 'porcentagem_nulos']
    """
    summary_rows = []
    paths = file_system_client.get_paths(path=base_path)
    count = 0

    for path in paths:
        print(f"Checking {path}")
        if not path.name.endswith(".csv"):
            continue

        try:
            file_client = file_system_client.get_file_client(path.name)
            content = file_client.download_file().readall()
            df = pd.read_csv(BytesIO(content), sep=';', encoding='latin1')

            if df.empty:
                continue

            nulls_pct = df.isnull().mean() * 100

            for col in df.columns:
                summary_rows.append({
                    "arquivo": path.name,
                    "coluna": col,
                    "tipo": str(df[col].dtype),
                    "porcentagem_nulos": round(nulls_pct[col], 2)
                })

        except Exception as e:
            print(f"❌ Erro ao processar {path.name}: {e}")

        count += 1
        if max_files and count >= max_files:
            break

    return pd.DataFrame(summary_rows)


In [None]:
df_sanity = build_nulls_summary_df(file_system_client, "202406")
df_sanity.to_csv("./local_storage/exploring/sanity_cnes.csv", index=False)

KeyboardInterrupt: 

## Métricas
- Relação profissionais / população
- Leitos hospitalares por habitante
- Cobertura da Atenção Básica -> Proporção da população vinculada a equipes de saúde da família

OBS: quebrar escopo para SP, olhar 

In [2]:
!pip install pandasql


Collecting pandasql
  Using cached pandasql-0.7.3.tar.gz (26 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting sqlalchemy (from pandasql)
  Downloading sqlalchemy-2.0.43-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.6 kB)
Downloading sqlalchemy-2.0.43-cp312-cp312-macosx_11_0_arm64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: pandasql
  Building wheel for pandasql (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26867 sha256=9e80752a09d4b8b3e1cb72574cbfee8ea189abf638492d048db7ae3e74dfc3f1
  Stored in directory: /Users/caio.maximiano/Library/Caches/pip/wheels/15/a1/e7/6f92f295b5272ae5c02365e6b8fa19cb93f16a537090a1cf27
Successfully built pandasql
Inst

In [7]:
import pandas as pd
import pandasql as ps

df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/exploration/output/cnes_202503_sanity_summary.csv", sep=',') 
query = "SELECT * from df"

resultado = ps.sqldf(query, locals())
resultado.head(10)

Unnamed: 0,file_name,sep,encoding,load_time_sec,num_rows,num_columns,memory_mb,columns
0,rlAdmGerenciaCnes202503.csv,",",latin-1,0.0121,4594,5,0.712,"NU_CNPJ_ADM;""CO_UNIDADE"";""TO_CHAR(DT_VIGENCIA_..."
1,rlAtividadeObrigatoria202503.csv,",",utf-8,0.0007,14,1,0.001,"CO_TIPO_ESTABELECIMENTO;""CO_ATIVIDADE_OBRIGATO..."
2,rlCooperativa202503.csv,",",utf-8,0.0029,2308,3,0.27,"CO_UNIDADE;""CO_COOPERATIVA"";""CO_CBO"";""CO_USUAR..."
3,rlEquipeAldeia202503.csv,",",utf-8,0.0007,300,3,0.039,"CO_MUNICIPIO;""CO_AREA"";""CO_SEQ_EQUIPE"";""CO_ALD..."
4,rlEquipeNasfEsf202503.csv,;,latin-1,0.0909,86467,17,34.202,"CO_MUNICIPIO, CO_AREA, SEQ_EQUIPE, CO_MUNICIPI..."
5,rlEstabAtenPsico202503.csv,",",latin-1,0.0013,1373,3,0.229,"CO_UNIDADE;""TP_ESTRUTURA"";""ST_PARCERIA_ONG"";""N..."
6,rlEstabAtendPrestConv202503.csv,",",latin-1,0.5452,1004536,3,108.831,"CO_UNIDADE;""CO_ATENDIMENTO_PRESTADO"";""CO_CONVE..."
7,rlEstabAvaliacao202503.csv,",",latin-1,0.0014,1737,4,0.23,"CO_UNIDADE;""CO_AVALIACAO"";""CO_CLASSIFICACAO"";""..."
8,rlEstabCentralReg202503.csv,",",latin-1,0.0049,3210,3,0.943,"CO_UNIDADE;""CO_SEQ_CENTRAL"";""NO_CENTRAL"";""CO_S..."
9,rlEstabColetaSelRejeito202503.csv,",",latin-1,0.464,851518,3,88.224,"CO_UNIDADE;""CO_COLETA_REJEITO"";""TO_CHAR(DT_ATU..."


In [6]:
df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/exploration/output/cnes_202503_missingness_by_column.csv", sep=',') 
df.sort_values(by='column_name', ascending=False).head(15)

Unnamed: 0,file_name,column_name,missing_pct,sep,encoding
417,tbSubTpModVinculo202503.csv,TP_VINCULO,0.0,;,latin-1
264,tbEstabelecimento202503.csv,TP_UNIDADE,0.0,;,latin-1
182,tbCargaHorariaSus202503.csv,TP_TERCEIRO_SIH,31.6,;,latin-1
390,tbResidenciaMed202503.csv,TP_SUS_NAO_SUS,0.0,;,latin-1
180,tbCargaHorariaSus202503.csv,TP_SUS_NAO_SUS,0.0,;,latin-1
418,tbSubTpModVinculo202503.csv,TP_SUBVINCULO,0.0,;,latin-1
384,tbResidenciaMed202503.csv,TP_SRT,7.59,;,latin-1
188,tbCargaHorariaSus202503.csv,TP_RESIDENTE,0.0,;,latin-1
187,tbCargaHorariaSus202503.csv,TP_PRECEPTOR,0.0,;,latin-1
239,tbEstabelecimento202503.csv,TP_PFPJ,0.0,;,latin-1


# Especialidades
MEDICO CARDIOLOGISTA -> 225120
MEDICO CLINICO -> 225125
MEDICO GERIATRA -> 225180
MEDICO PEDIATRA -> 225124
MEDICO DA ESTRATEGIA DE SAUDE DA FAMILIA -> 225142
MEDICO GINECOLOGISTA E OBSTETRA -> 225250
MEDICO NEUROLOGISTA -> 225112

# População

In [30]:
!pip install thefuzz[speedup]

Collecting thefuzz[speedup]
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz[speedup])
  Downloading rapidfuzz-3.14.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-macosx_11_0_arm64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [thefuzz]
[1A[2KSuccessfully installed rapidfuzz-3.14.0 thefuzz-0.22.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [31]:
import pandas as pd
from thefuzz import process

# Load data
df_pop = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/csv/populacao.csv")
df_estab = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv")

# Filter estabelecimentos
df_estab = df_estab.query(
    "CO_CBO in ('225120', '225125', '225180', '225124', '225142', '225250', '225112') & NO_MUNICIPIO in ('INDAIATUBA', 'SAO CARLOS')"
)

# Fuzzy match NO_MUNICIPIO
def fuzzy_match(municipio, choices, threshold=90):
    match, score = process.extractOne(municipio, choices)
    return match if score >= threshold else None

df_estab["NO_MUNICIPIO_MATCH"] = df_estab["NO_MUNICIPIO"].apply(
    lambda x: fuzzy_match(x, df_pop["NO_MUNICIPIO"].unique())
)

# Merge on matched municipio
df_merged = pd.merge(
    df_estab,
    df_pop,
    left_on="NO_MUNICIPIO_MATCH",
    right_on="NO_MUNICIPIO",
    suffixes=("_estab", "_pop")
)

df_merged.head()

  df_estab = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv")


Unnamed: 0,CO_UNIDADE,CO_PROFISSIONAL_SUS,NO_PROFISSIONAL,CO_CBO,TP_SUS_NAO_SUS,DS_ATIVIDADE_PROFISSIONAL,NO_FANTASIA,NO_BAIRRO,NO_MUNICIPIO_estab,CO_MUNICIPIO_estab,CO_SIGLA_ESTADO,CO_CEP,ds_localidade,SK_REGISTRO,DATA_INGESTAO,NO_MUNICIPIO_MATCH,CO_MUNICIPIO_pop,NO_MUNICIPIO_pop,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2024,2025
0,3520500247480,A529AE6169C5B873,SERGIO AUGUSTO MARCELINO FILHO,225120,N,MEDICO CARDIOLOGISTA,VITTACOR,JARDIM POMPEIA,INDAIATUBA,352050,SP,13345040,"13345040,INDAIATUBA,SP,Brasil",3520500247480_A529AE6169C5B873_225120,2025-08-20,Indaiatuba,20509,Indaiatuba,205.808,209.859,222.042,226.602,231.033,235.367,239.602,246.908,251.627,256.223,260.69,267.796,269.657
1,3520500535176,61290B8EC9B61D93,FERNANDA JOSLIN OLIVEIRA REZENDE,225120,N,MEDICO CARDIOLOGISTA,DIMEN MEDICINA NUCLEAR INDAIATUBA,CIDADE NOVA,INDAIATUBA,352050,SP,13334070,"13334070,INDAIATUBA,SP,Brasil",3520500535176_61290B8EC9B61D93_225120,2025-08-20,Indaiatuba,20509,Indaiatuba,205.808,209.859,222.042,226.602,231.033,235.367,239.602,246.908,251.627,256.223,260.69,267.796,269.657
2,3520500737852,98ED76DA3EE021AA,CRISTIAN RICARDO GONZALEZ,225125,N,MEDICO CLINICO,FOCO SAUDE OCUPACIONAL,JARDIM SANTA CRUZ,INDAIATUBA,352050,SP,13344000,"13344000,INDAIATUBA,SP,Brasil",3520500737852_98ED76DA3EE021AA_225125,2025-08-20,Indaiatuba,20509,Indaiatuba,205.808,209.859,222.042,226.602,231.033,235.367,239.602,246.908,251.627,256.223,260.69,267.796,269.657
3,3520500737852,697156A3F88459E4,BRUNA GONZALEZ PORTO,225125,N,MEDICO CLINICO,FOCO SAUDE OCUPACIONAL,JARDIM SANTA CRUZ,INDAIATUBA,352050,SP,13344000,"13344000,INDAIATUBA,SP,Brasil",3520500737852_697156A3F88459E4_225125,2025-08-20,Indaiatuba,20509,Indaiatuba,205.808,209.859,222.042,226.602,231.033,235.367,239.602,246.908,251.627,256.223,260.69,267.796,269.657
4,3520500737852,BC7E25DDD69368BE,MARIANA DE OLIVEIRA SAMPAIO MATTOS,225125,N,MEDICO CLINICO,FOCO SAUDE OCUPACIONAL,JARDIM SANTA CRUZ,INDAIATUBA,352050,SP,13344000,"13344000,INDAIATUBA,SP,Brasil",3520500737852_BC7E25DDD69368BE_225125,2025-08-20,Indaiatuba,20509,Indaiatuba,205.808,209.859,222.042,226.602,231.033,235.367,239.602,246.908,251.627,256.223,260.69,267.796,269.657


In [33]:
import pandas as pd
import pandasql as ps

df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv")
df = df.query("CO_CBO in ('225120', '225125', '225180', '225124', '225142', '225250', '225112') & NO_MUNICIPIO in ('INDAIATUBA', 'SAO CARLOS')")

query = "SELECT NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL, count( distinct CO_PROFISSIONAL_SUS) from df group by NO_MUNICIPIO, DS_ATIVIDADE_PROFISSIONAL"

resultado = ps.sqldf(query, locals())
resultado.head(10)

  df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/estabelecimentos_202504.csv")


Unnamed: 0,NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL,count( distinct CO_PROFISSIONAL_SUS)
0,INDAIATUBA,MEDICO CARDIOLOGISTA,47
1,INDAIATUBA,MEDICO CLINICO,879
2,INDAIATUBA,MEDICO DA ESTRATEGIA DE SAUDE DA FAMILIA,50
3,INDAIATUBA,MEDICO GERIATRA,3
4,INDAIATUBA,MEDICO GINECOLOGISTA E OBSTETRA,181
5,INDAIATUBA,MEDICO NEUROLOGISTA,45
6,INDAIATUBA,MEDICO PEDIATRA,209
7,SAO CARLOS,MEDICO CARDIOLOGISTA,62
8,SAO CARLOS,MEDICO CLINICO,450
9,SAO CARLOS,MEDICO DA ESTRATEGIA DE SAUDE DA FAMILIA,20


In [4]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl (31.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.2/31.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-21.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import pandas as pd
from io import BytesIO
from azure.storage.filedatalake import DataLakeServiceClient

# Connection info
account_name = "cnesstorage"
account_key = "/ae47eZuE0NGPopxVHEkxOKsQwtEm3qQM0vBRPBRbB5nAW1zO6FPkEO9gwNQwkqExaVhOyHWgb68+AStIau+Uw=="

datalake_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
silver_fs_client = datalake_client.get_file_system_client("silver")
gold_fs_client = datalake_client.get_file_system_client("gold")

def read_all_curated_periods(fs_client, prefix, sep=",", encoding="utf-8"):
    """
    Reads all CSVs with the given prefix from the silver container, concatenates, and adds yyyymm column.
    """
    all_dfs = []
    for path in fs_client.get_paths():
        if path.name.endswith('.csv') and f"/{prefix}_" in path.name:
            # Extract yyyymm from filename, e.g., estabelecimentos_202201.csv
            yyyymm = path.name.split("_")[-1].replace(".csv", "")
            try:
                file_client = fs_client.get_file_client(path.name)
                content = file_client.download_file().readall()
                df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
                df["yyyymm"] = yyyymm
                all_dfs.append(df)
            except Exception as e:
                print(f"Erro ao ler {path.name}: {e}")
    if all_dfs:
        return pd.concat(all_dfs, ignore_index=True)
    else:
        return pd.DataFrame()

def save_parquet_to_gold(fs_client, df, filename):
    """
    Save DataFrame as Parquet in the 'gold' container of the Data Lake.
    """
    import io
    buffer = io.BytesIO()
    df.to_parquet(buffer, index=False, engine="pyarrow")
    buffer.seek(0)
    file_client = fs_client.get_file_client(filename)
    file_client.upload_data(buffer.read(), overwrite=True)
    print(f"Arquivo Parquet salvo em gold/{filename}")

# Example for estabelecimentos:
df_estab_all = read_all_curated_periods(silver_fs_client, "estabelecimentos")
save_parquet_to_gold(gold_fs_client, df_estab_all, "estabelecimentos_all_periods.parquet")

# Example for servicos:
# df_serv_all = read_all_curated_periods(silver_fs_client, "servicos")
# save_parquet_to_gold(gold_fs_client, df_serv_all, "servicos_all_periods.parquet")

  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)
  df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding)


KeyboardInterrupt: 

In [10]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting fsspec (from fastparquet)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-macosx_11_0_arm64.whl (685 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m685.4/685.4 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.11.0-cp312-cp312-macosx_11_0_arm64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
Installing collected packages: fsspec, cramjam, fastparquet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [fastparquet]
[1A[2KSuccessfully installed cramjam-2.11.0 fastpar

In [15]:
import pandas as pd
from io import BytesIO
from azure.storage.filedatalake import DataLakeServiceClient

# Connection info
account_name = "cnesstorage"
account_key = "/ae47eZuE0NGPopxVHEkxOKsQwtEm3qQM0vBRPBRbB5nAW1zO6FPkEO9gwNQwkqExaVhOyHWgb68+AStIau+Uw=="

datalake_client = DataLakeServiceClient(
    account_url=f"https://{account_name}.dfs.core.windows.net",
    credential=account_key,
)
silver_fs_client = datalake_client.get_file_system_client("silver")
gold_fs_client = datalake_client.get_file_system_client("gold")

def read_all_curated_periods(fs_client, prefix, sep=",", encoding="utf-8", periods=None):
    """
    Reads all CSVs with the given prefix from the silver container, concatenates, and adds yyyymm column.
    If periods is provided, only loads files for those periods (list of strings, e.g. ['202201','202202']).
    """
    all_dfs = []
    for path in fs_client.get_paths():
        if path.name.endswith('.csv') and f"/{prefix}_" in path.name:
            yyyymm = path.name.split("_")[-1].replace(".csv", "")
            if periods is not None and yyyymm not in periods:
                continue
            try:
                print(f"Lendo {path.name}")
                file_client = fs_client.get_file_client(path.name)
                content = file_client.download_file().readall()
                df = pd.read_csv(BytesIO(content), sep=sep, encoding=encoding,dtype=str)
                df["yyyymm"] = yyyymm
                all_dfs.append(df)
                print(f"DataFrame para {yyyymm} adicionado.")
            except Exception as e:
                print(f"Erro ao ler {path.name}: {e}")
    if all_dfs:
        return pd.concat(all_dfs, ignore_index=True)
    else:
        return pd.DataFrame()

def save_parquet_to_gold(fs_client, df, filename):
    """
    Save DataFrame as Parquet in the 'gold' container of the Data Lake.
    """
    print(f"Salvando {filename} no formato Parquet.")
    import io
    buffer = io.BytesIO()
    df.to_parquet(buffer, index=False, engine="fastparquet")
    buffer.seek(0)
    file_client = fs_client.get_file_client(filename)
    file_client.upload_data(buffer.read(), overwrite=True)
    print(f"Arquivo Parquet salvo em gold/{filename}")

# range from Jan 2022 to Jul 2025 inclusive
periods = pd.period_range("202201", "202507", freq="M").strftime("%Y%m").tolist()

df_estab_range = read_all_curated_periods(
    silver_fs_client,
    "estabelecimentos",
    periods=periods
)

save_parquet_to_gold(
    gold_fs_client,
    df_estab_range,
    "estabelecimentos_all.parquet"
)

# Example for all periods (default):
df_estab_all = read_all_curated_periods(
    silver_fs_client,
    "estabelecimentos",
    periods=periods
)

save_parquet_to_gold(
    gold_fs_client,
    df_estab_all,
    "estabelecimentos_all.parquet"
)

Lendo 202201/estabelecimentos_202201.csv
DataFrame para 202201 adicionado.
Lendo 202202/estabelecimentos_202202.csv
DataFrame para 202202 adicionado.
Lendo 202203/estabelecimentos_202203.csv
DataFrame para 202203 adicionado.
Lendo 202204/estabelecimentos_202204.csv
DataFrame para 202204 adicionado.
Lendo 202205/estabelecimentos_202205.csv
DataFrame para 202205 adicionado.
Lendo 202206/estabelecimentos_202206.csv
DataFrame para 202206 adicionado.
Lendo 202207/estabelecimentos_202207.csv
DataFrame para 202207 adicionado.
Lendo 202208/estabelecimentos_202208.csv
DataFrame para 202208 adicionado.
Lendo 202209/estabelecimentos_202209.csv
DataFrame para 202209 adicionado.
Lendo 202210/estabelecimentos_202210.csv
DataFrame para 202210 adicionado.
Lendo 202212/estabelecimentos_202212.csv
DataFrame para 202212 adicionado.
Lendo 202301/estabelecimentos_202301.csv
DataFrame para 202301 adicionado.
Lendo 202302/estabelecimentos_202302.csv
DataFrame para 202302 adicionado.
Lendo 202303/estabelecime

Unable to stream download.


Erro ao ler 202403/estabelecimentos_202403.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202404/estabelecimentos_202404.csv


Unable to stream download.


Erro ao ler 202404/estabelecimentos_202404.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202405/estabelecimentos_202405.csv


Unable to stream download.


Erro ao ler 202405/estabelecimentos_202405.csv: [SSL] record layer failure (_ssl.c:2580)
Lendo 202406/estabelecimentos_202406.csv
Erro ao ler 202406/estabelecimentos_202406.csv: No columns to parse from file
Lendo 202407/estabelecimentos_202407.csv


Unable to stream download.


DataFrame para 202407 adicionado.
Lendo 202408/estabelecimentos_202408.csv


Unable to stream download.


Erro ao ler 202408/estabelecimentos_202408.csv: [SSL] record layer failure (_ssl.c:2580)
Lendo 202409/estabelecimentos_202409.csv


Unable to stream download.


Erro ao ler 202409/estabelecimentos_202409.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202410/estabelecimentos_202410.csv


Unable to stream download.
Unable to stream download.
Unable to stream download.


Erro ao ler 202410/estabelecimentos_202410.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202411/estabelecimentos_202411.csv


Unable to stream download.


Erro ao ler 202411/estabelecimentos_202411.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202501/estabelecimentos_202501.csv


Unable to stream download.
Unable to stream download.
Unable to stream download.


Erro ao ler 202501/estabelecimentos_202501.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202502/estabelecimentos_202502.csv


Unable to stream download.


Erro ao ler 202502/estabelecimentos_202502.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202504/estabelecimentos_202504.csv


Unable to stream download.


Erro ao ler 202504/estabelecimentos_202504.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202506/estabelecimentos_202506.csv


Unable to stream download.
Unable to stream download.


Erro ao ler 202506/estabelecimentos_202506.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Lendo 202507/estabelecimentos_202507.csv


Unable to stream download.


Erro ao ler 202507/estabelecimentos_202507.csv: HTTPSConnectionPool(host='cnesstorage.blob.core.windows.net', port=443): Read timed out.
Salvando estabelecimentos_all.parquet no formato Parquet.
Arquivo Parquet salvo em gold/estabelecimentos_all.parquet
Lendo 202201/estabelecimentos_202201.csv
DataFrame para 202201 adicionado.
Lendo 202202/estabelecimentos_202202.csv
DataFrame para 202202 adicionado.
Lendo 202203/estabelecimentos_202203.csv
DataFrame para 202203 adicionado.
Lendo 202204/estabelecimentos_202204.csv
DataFrame para 202204 adicionado.
Lendo 202205/estabelecimentos_202205.csv
DataFrame para 202205 adicionado.
Lendo 202206/estabelecimentos_202206.csv
DataFrame para 202206 adicionado.
Lendo 202207/estabelecimentos_202207.csv
DataFrame para 202207 adicionado.
Lendo 202208/estabelecimentos_202208.csv
DataFrame para 202208 adicionado.
Lendo 202209/estabelecimentos_202209.csv
DataFrame para 202209 adicionado.
Lendo 202210/estabelecimentos_202210.csv
DataFrame para 202210 adiciona

In [3]:
df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/servicos_202504.csv", sep=',') 
df.columns


  df = pd.read_csv("/Users/caio.maximiano/pessoal/cnes-project-analysis/local_storage/curated/servicos_202504.csv", sep=',')


Index(['CO_UNIDADE', 'NO_MUNICIPIO', 'CO_MUNICIPIO', 'CO_SERVICO',
       'CO_CLASSIFICACAO', 'DS_CLASSIFICACAO_SERVICO', 'SK_REGISTRO',
       'DATA_INGESTAO'],
      dtype='object')