| Etapa                      | Tecnologia recomendada                              | Justificativa                                           |
| -------------------------- | --------------------------------------------------- | ------------------------------------------------------- |
| Orquestração mensal        | `cron` no App Service ou Azure Functions            | Automação simples                                       |
| Ambiente isolado           | Docker                                              | Facilita dependências (`duckdb`, `pandas`, `azure-sdk`) |
| Extração + Upload Bronze   | Python puro (`requests`, `zipfile`, `DataLake SDK`) | Você já faz isso bem                                    |
| Transformações Silver/Gold | `Pandas` ou `DuckDB`                                | Leves, simples, ideais para CSV                         |
| Deploy                     | Azure App Service + GitHub Actions                  | CI/CD automatizado                                      |


cnes-data-pipeline/
│
├── Dockerfile
├── requirements.txt
├── app.py                  ← ponto de entrada principal
├── extract/
│   └── extract_cnes.py     ← download e bronze upload
├── transform/
│   ├── silver.py           ← limpeza e joins
│   └── gold.py             ← agregações e finais
├── utils/
│   └── azure_datalake.py   ← funções para leitura/escrita
└── cron/
    └── crontab.txt         ← para rodar mensalmente no container


# Conexão com o Datalake

In [2]:
import re
from core.storage import Storage

silver = Storage(file_system="silver")
gold = Storage(file_system="gold")
pat = re.compile(r"^estabelecimentos/year_month=(\d{6})/data\.parquet$")

zero = []
ok = []
for p in silver.list_paths(prefix="estabelecimentos"):
    m = pat.match(p)
    if not m:
        continue
    ym = m.group(1)
    # usa metadados (não baixa o arquivo)
    props = silver.fs.get_file_client(p).get_file_properties()
    size = props.size or 0
    if size == 0:
        zero.append((ym, p))
    else:
        ok.append((ym, size))

print(f"⚠️ zero-byte files: {len(zero)}")
for ym, p in zero[:20]:
    print(f"  {ym} -> {p}")
if len(zero) > 20:
    print("  ...")

print(f"\n✅ ok files: {len(ok)} (exibe 10)")
for ym, size in sorted(ok)[:10]:
    print(f"  {ym}: {size} bytes")


⚠️ zero-byte files: 0

✅ ok files: 45 (exibe 10)
  202101: 65342233 bytes
  202102: 65974806 bytes
  202103: 66756388 bytes
  202104: 67571819 bytes
  202105: 68090258 bytes
  202106: 68389629 bytes
  202107: 68840794 bytes
  202108: 69229590 bytes
  202109: 69401417 bytes
  202110: 69598211 bytes


In [22]:
import re, io, pandas as pd
from core.storage import Storage

silver = Storage(file_system="silver")

def list_partitions(storage: Storage, prefix="estabelecimentos",
                    partition_key="year_month", filename="data.parquet") -> list[str]:
    pat = re.compile(rf"^{prefix}/{partition_key}=(\d{{6}})/{filename}$")
    paths = storage.list_paths(prefix=prefix)
    yms = sorted({m.group(1) for p in paths if (m := pat.match(p))})
    return yms

def check_zero_byte(storage: Storage, prefix="estabelecimentos",
                    partition_key="year_month", filename="data.parquet") -> list[tuple[str,str]]:
    """retorna lista [(YYYYMM, path)] de arquivos com size==0 (sem baixar)"""
    pat = re.compile(rf"^{prefix}/{partition_key}=(\d{{6}})/{filename}$")
    zeros = []
    for p in storage.list_paths(prefix=prefix):
        m = pat.match(p)
        if not m: 
            continue
        size = storage.fs.get_file_client(p).get_file_properties().size or 0
        if size == 0:
            zeros.append((m.group(1), p))
    return zeros

def read_partition(storage: Storage, ym: str, prefix="estabelecimentos",
                   partition_key="year_month", filename="data.parquet",
                   columns=None) -> pd.DataFrame:
    print(f"lendo {ym} ...", end=" ")
    path = f"{prefix}/{partition_key}={ym}/{filename}"
    raw = storage.download_file(path)
    print("ok")
    df = pd.read_parquet(io.BytesIO(raw), engine="pyarrow")
    df = df[
        (df["TP_SUS_NAO_SUS"] == "S") &
        (df["DS_ATIVIDADE_PROFISSIONAL"].str.startswith("MEDICO", na=False))
    ]
    return df.loc[:, columns]

# --- uso ---
yms = list_partitions(silver, prefix="estabelecimentos")
print(f"partições encontradas: {len(yms)}")
print("exemplo:", yms[:10])

zeros = check_zero_byte(silver, prefix="estabelecimentos")
print(f"zero-byte: {len(zeros)}")
if zeros:
    for ym, p in zeros[:10]:
        print("  ", ym, "->", p)

# ler rapidamente 2 partições para teste (só algumas colunas)
test_yms = yms  # pegue outras se preferir
dfs = []
for ym in test_yms:
    df = read_partition(
        silver, ym, prefix="estabelecimentos",
        columns=["CO_PROFISSIONAL_SUS","NO_MUNICIPIO", "DS_ATIVIDADE_PROFISSIONAL", "TP_SUS_NAO_SUS", "CO_MUNICIPIO","YYYYMM"]
    )
    df["year_month"] = ym
    dfs.append(df)

partições encontradas: 45
exemplo: ['202101', '202102', '202103', '202104', '202105', '202106', '202107', '202108', '202109', '202110']
zero-byte: 0
lendo 202101 ... ok
lendo 202102 ... ok
lendo 202103 ... ok
lendo 202104 ... ok
lendo 202105 ... ok
lendo 202106 ... ok
lendo 202107 ... ok
lendo 202108 ... ok
lendo 202109 ... ok
lendo 202110 ... ok
lendo 202111 ... ok
lendo 202112 ... ok
lendo 202201 ... ok
lendo 202202 ... ok
lendo 202203 ... ok
lendo 202204 ... ok
lendo 202205 ... ok
lendo 202206 ... ok
lendo 202207 ... ok
lendo 202208 ... ok
lendo 202209 ... ok
lendo 202210 ... ok
lendo 202211 ... ok
lendo 202212 ... ok
lendo 202301 ... ok
lendo 202302 ... ok
lendo 202303 ... ok
lendo 202304 ... ok
lendo 202305 ... ok
lendo 202306 ... ok
lendo 202307 ... ok
lendo 202309 ... ok
lendo 202310 ... ok
lendo 202311 ... ok
lendo 202401 ... ok
lendo 202402 ... ok
lendo 202403 ... ok
lendo 202404 ... ok
lendo 202405 ... ok
lendo 202406 ... ok
lendo 202407 ... ok
lendo 202409 ... ok
lendo 20241

Unnamed: 0,CO_PROFISSIONAL_SUS,NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL,TP_SUS_NAO_SUS,CO_MUNICIPIO,YYYYMM,year_month
0,F3575C9617F8998A,SAO PAULO,MEDICO GINECOLOGISTA E OBSTETRA,S,355030,202101,202101
1,9D27061F6644A854,SAO PAULO,MEDICO EM RADIOLOGIA E DIAGNOSTICO POR IMAGEM,S,355030,202101,202101
2,9573AEE972CC2D8E,ADAMANTINA,MEDICO DA ESTRATEGIA DE SAUDE DA FAMILIA,S,350010,202101,202101
3,798945E3EA7F16C2,ADAMANTINA,MEDICO CARDIOLOGISTA,S,350010,202101,202101
4,798945E3EA7F16C2,ADAMANTINA,MEDICO CARDIOLOGISTA,S,350010,202101,202101


In [23]:
df_sample.shape

(11794436, 7)

In [28]:
import unicodedata

def cast_estab_types(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # --- textual ids kept as string to avoid losing leading zeros
    text_as_string = [
        "CO_PROFISSIONAL_SUS",   # professional id (id-like)
        "NO_MUNICIPIO",
        "NO_MUNICIPIO",
        "DS_ATIVIDADE_PROFISSIONAL",

    ]
    for c in text_as_string:
        if c in out.columns:
            out[c] = out[c].astype("string")

    # --- codes that should be numeric (nullable Int64)
    numeric_codes = [
        "CO_MUNICIPIO",
        "CO_MUNICIPIO_COMPLETO",
        "yyyymm"
    ]
    for c in numeric_codes:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce").astype("Int64")

    # --- SUS flag (S/N) as ordered categorical
    if "TP_SUS_NAO_SUS" in out.columns:
        out["TP_SUS_NAO_SUS"] = pd.Categorical(out["TP_SUS_NAO_SUS"], categories=["N", "S"], ordered=True)

    # If you already created YYYY and MM, cast them too:
    if "YYYY" in out.columns:
        out["YYYY"] = pd.to_numeric(out["YYYY"], errors="coerce").astype("Int16")
    if "MM" in out.columns:
        out["MM"] = out["MM"].astype("string").str.zfill(2)
        out["MM"] = pd.Categorical(out["MM"], categories=[f"{m:02d}" for m in range(1,13)], ordered=True)

    # --- Parse ingestion date if present
    if "DATA_INGESTAO" in out.columns:
        out["DATA_INGESTAO"] = pd.to_datetime(out["DATA_INGESTAO"], errors="coerce")

    return out

def norm_city(x: str) -> str:
    if pd.isna(x):
        return x
    s = str(x)
    s = unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("ASCII")
    return s.strip().upper()

In [29]:
import pandas as pd


df_estab_normalized = df_sample.copy()

# Normalizando nome das cidades
df_estab_normalized["NO_MUNICIPIO"] = df_estab_normalized["NO_MUNICIPIO"].map(norm_city)
df_estab_normalized = df_estab_normalized.rename(columns={"CO_MUNICIPIO": "CO_MUNICIPIO_SEM_DIGITO"}).astype(str)#.astype(int)

df_estab_normalized = df_estab_normalized.assign(
    YYYY=df_estab_normalized["YYYYMM"].astype(str).str[:4],
    MM=df_estab_normalized["YYYYMM"].astype(str).str[-2:].str.zfill(2),
)

# Casting columns
df_estab_adjusted = cast_estab_types(df_estab_normalized)
df_estab_adjusted.head(3) # write to gold

Unnamed: 0,CO_PROFISSIONAL_SUS,NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL,TP_SUS_NAO_SUS,CO_MUNICIPIO_SEM_DIGITO,YYYYMM,year_month,YYYY,MM
0,F3575C9617F8998A,SAO PAULO,MEDICO GINECOLOGISTA E OBSTETRA,S,355030,202101,202101,2021,1
1,9D27061F6644A854,SAO PAULO,MEDICO EM RADIOLOGIA E DIAGNOSTICO POR IMAGEM,S,355030,202101,202101,2021,1
2,9573AEE972CC2D8E,ADAMANTINA,MEDICO DA ESTRATEGIA DE SAUDE DA FAMILIA,S,350010,202101,202101,2021,1


# Médicos a cada 1000 habitantes

In [16]:
# df_estab_adjusted = df_estab_adjusted[
#     df_estab_adjusted["DS_ATIVIDADE_PROFISSIONAL"].str.startswith("MEDICO", na=False)
# ].copy()
# df_estab_adjusted = df_estab_adjusted.query("TP_SUS_NAO_SUS == 'S'")

# df_filtered_medicos_sus = df_estab_adjusted.query("DS_ATIVIDADE_PROFISSIONAL.str.startswith('MEDICO') and TP_SUS_NAO_SUS == 'S'")

In [18]:
gold = Storage(file_system="gold")
raw = gold.download_file("populacao/data.parquet")
populacao_df = pd.read_parquet(io.BytesIO(raw), engine="pyarrow")

In [31]:
group_cols = [
    "CO_MUNICIPIO_SEM_DIGITO","NO_MUNICIPIO","DS_ATIVIDADE_PROFISSIONAL",
    "TP_SUS_NAO_SUS","YYYY","MM"
]

df_estab_grouped = (
    df_estab_adjusted
      .groupby(group_cols, as_index=False)["CO_PROFISSIONAL_SUS"]
      .nunique()
      .rename(columns={"CO_PROFISSIONAL_SUS":"TOTAL_PROFISSIONAIS"})
)
# import pandasql as ps
# query = """
# SELECT
#     CO_MUNICIPIO_SEM_DIGITO,
#     NO_MUNICIPIO,
#     DS_ATIVIDADE_PROFISSIONAL,
#     TP_SUS_NAO_SUS,
#     YYYY,
#     MM,
#     COUNT(DISTINCT CO_PROFISSIONAL_SUS) AS TOTAL_PROFISSIONAIS
# FROM df_estab_adjusted
# WHERE TP_SUS_NAO_SUS = 'S'
# GROUP BY
#     CO_MUNICIPIO_SEM_DIGITO,
#     NO_MUNICIPIO,
#     DS_ATIVIDADE_PROFISSIONAL,
#     TP_SUS_NAO_SUS,
#     YYYY,
#     MM
# """
df_estab_grouped = ps.sqldf(query, locals())
df_estab_grouped["CO_MUNICIPIO_SEM_DIGITO"] = pd.to_numeric(df_estab_grouped["CO_MUNICIPIO_SEM_DIGITO"], errors="coerce").astype("Int64")
df_estab_grouped["YYYY"] = pd.to_numeric(df_estab_grouped["YYYY"], errors="coerce").astype("Int16")
df_estab_grouped["MM"] = pd.to_numeric(df_estab_grouped["MM"], errors="coerce").astype("Int16")
# Join com df de populacao
join_keys = ["CO_MUNICIPIO_SEM_DIGITO", "YYYY", "MM"]

df_estab_pop = df_estab_grouped.merge(
    populacao_df[join_keys + ["CO_UF", "NO_UF", "NO_REGIAO","NO_MUNICIPIO_IBGE","POPULACAO_MENSAL", "POPULACAO", "GROWTH_ABS", "GROWTH_PCT"]],
    on=join_keys,
    how="left",
)

# Criando a métrica final
df_final = df_estab_pop.copy()
df_final["PROFISSIONAIS_POR_1000"] = (df_final["TOTAL_PROFISSIONAIS"] / df_final["POPULACAO_MENSAL"]) * 1000

# df_final.head(3)

  .groupby(group_cols, as_index=False)["CO_PROFISSIONAL_SUS"]


: 

In [30]:
df_final

Unnamed: 0,CO_MUNICIPIO_SEM_DIGITO,NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL,TP_SUS_NAO_SUS,YYYY,MM,TOTAL_PROFISSIONAIS,CO_UF,NO_UF,NO_REGIAO,NO_MUNICIPIO_IBGE,POPULACAO_MENSAL,POPULACAO,GROWTH_ABS,GROWTH_PCT,PROFISSIONAIS_POR_1000
0,350010,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2021,1,1,,,,,,,,,
1,350010,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2021,2,1,,,,,,,,,
2,350010,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2021,3,1,,,,,,,,,
3,350010,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2021,4,1,,,,,,,,,
4,350010,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2021,5,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478967,355730,ESTIVA GERBI,MEDICO UROLOGISTA,S,2021,11,1,35.0,São Paulo,Campinas,ESTIVA GERBI,11524,11507.0,2,0.000174,0.086775
478968,355730,ESTIVA GERBI,MEDICO UROLOGISTA,S,2021,12,1,35.0,São Paulo,Campinas,ESTIVA GERBI,11525,11507.0,1,0.000087,0.086768
478969,355730,ESTIVA GERBI,MEDICO UROLOGISTA,S,2022,1,1,35.0,São Paulo,Campinas,ESTIVA GERBI,11527,11527.0,2,0.000174,0.086753
478970,355730,ESTIVA GERBI,MEDICO UROLOGISTA,S,2022,2,1,35.0,São Paulo,Campinas,ESTIVA GERBI,11529,11527.0,2,0.000174,0.086738


In [7]:
import re, io, pandas as pd


gold = Storage(file_system="gold")
raw = gold.download_file("metricas/estabelecimentos_sp/year_month=202212/data.parquet")
df = pd.read_parquet(io.BytesIO(raw), engine="pyarrow")
df

Unnamed: 0,CO_MUNICIPIO_SEM_DIGITO,NO_MUNICIPIO,DS_ATIVIDADE_PROFISSIONAL,TP_SUS_NAO_SUS,YYYY,MM,TOTAL_PROFISSIONAIS,CO_UF,NO_UF,NO_REGIAO,NO_MUNICIPIO_IBGE,POPULACAO_MENSAL,POPULACAO,GROWTH_ABS,GROWTH_PCT,PROFISSIONAIS_POR_1000,year_month
0,35001,ADAMANTINA,MEDICO ANATOMOPATOLOGISTA,S,2022,12,1,,,,,,,,,,202212
1,35001,ADAMANTINA,MEDICO ANESTESIOLOGISTA,S,2022,12,2,,,,,,,,,,202212
2,35001,ADAMANTINA,MEDICO ANGIOLOGISTA,S,2022,12,1,,,,,,,,,,202212
3,35001,ADAMANTINA,MEDICO CARDIOLOGISTA,S,2022,12,5,,,,,,,,,,202212
4,35001,ADAMANTINA,MEDICO CIRURGIAO GERAL,S,2022,12,5,,,,,,,,,,202212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10610,35573,ESTIVA GERBI,MEDICO NEUROLOGISTA,S,2022,12,2,,,,,,,,,,202212
10611,35573,ESTIVA GERBI,MEDICO OFTALMOLOGISTA,S,2022,12,1,,,,,,,,,,202212
10612,35573,ESTIVA GERBI,MEDICO ORTOPEDISTA E TRAUMATOLOGISTA,S,2022,12,1,,,,,,,,,,202212
10613,35573,ESTIVA GERBI,MEDICO PEDIATRA,S,2022,12,3,,,,,,,,,,202212


In [10]:
#view type of each column
df.columns.to_series().map(type)

CO_MUNICIPIO_SEM_DIGITO      <class 'str'>
NO_MUNICIPIO                 <class 'str'>
DS_ATIVIDADE_PROFISSIONAL    <class 'str'>
TP_SUS_NAO_SUS               <class 'str'>
YYYY                         <class 'str'>
MM                           <class 'str'>
TOTAL_PROFISSIONAIS          <class 'str'>
CO_UF                        <class 'str'>
NO_UF                        <class 'str'>
NO_REGIAO                    <class 'str'>
NO_MUNICIPIO_IBGE            <class 'str'>
POPULACAO_MENSAL             <class 'str'>
POPULACAO                    <class 'str'>
GROWTH_ABS                   <class 'str'>
GROWTH_PCT                   <class 'str'>
PROFISSIONAIS_POR_1000       <class 'str'>
year_month                   <class 'str'>
dtype: object