In [2]:
from __future__ import annotations
import os
from pathlib import Path
from typing import Iterable
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core.exceptions import ResourceExistsError
import io
import pandas as pd
class Storage:
    """
    Thin layer for operations on an Azure Data Lake Gen2 (DFS) file system.

    Attributes:
        account_name: Storage account name used to build the DFS endpoint URL.
        file_system: Name of the file system this instance targets.
        client: ``DataLakeServiceClient`` bound to the account endpoint.
        fs: File-system client for ``file_system``, obtained from ``client``.
    """

    # Environment variable consulted when no explicit key is passed.
    KEY_ENV_VAR = "AZURE_STORAGE_KEY"

    def __init__(
        self,
        account_name: str = "cnesstorageaccount",
        file_system: str = "bronze",
        account_key: "str | None" = None,
    ):
        """
        Build the service and file-system clients.

        Args:
            account_name: Storage account name.
            file_system: File system to bind ``self.fs`` to.
            account_key: Credential passed to ``DataLakeServiceClient``. When
                omitted, the value of the ``AZURE_STORAGE_KEY`` environment
                variable is used.

        Raises:
            RuntimeError: If no key is passed and the environment variable is
                unset or empty.
        """
        self.account_name = account_name
        self.file_system = file_system

        # SECURITY: the account key must never live in the notebook. Any key that
        # was previously committed here should be considered leaked and rotated.
        key = account_key or os.environ.get(self.KEY_ENV_VAR)
        if not key:
            raise RuntimeError(
                f"Azure storage key not provided: set the {self.KEY_ENV_VAR} "
                "environment variable or pass account_key=..."
            )

        self.client = DataLakeServiceClient(
            account_url=f"https://{self.account_name}.dfs.core.windows.net",
            credential=key,
        )
        self.fs = self.client.get_file_system_client(self.file_system)


# Ready-to-use instances, one per file system (created in this order:
# bronze, silver, gold).
bronze, silver, gold = (
    Storage(file_system=layer_name) for layer_name in ("bronze", "silver", "gold")
)


#read table from silver
def read_silver_table(table_name: str, year_month: str) -> pd.DataFrame:
    """
    Load one Parquet file from the silver file system into a DataFrame.

    The file is looked up at ``{table_name}/{year_month}.parquet`` using the
    module-level ``silver`` storage instance.

    Args:
        table_name: First path segment of the file.
        year_month: File stem, placed before the ``.parquet`` extension.

    Returns:
        The DataFrame parsed from the downloaded bytes.
    """
    parquet_path = f"{table_name}/{year_month}.parquet"

    # The whole file is downloaded into memory before parsing.
    payload = silver.fs.get_file_client(parquet_path).download_file().readall()

    return pd.read_parquet(io.BytesIO(payload))

# Smoke check: load one silver file and display it as the cell output.
test = read_silver_table(table_name="cnes_estabelecimentos", year_month="202201")
test

Unnamed: 0,CO_UNIDADE,CO_PROFISSIONAL_SUS,NO_PROFISSIONAL,CO_CBO,TP_SUS_NAO_SUS,DS_ATIVIDADE_PROFISSIONAL,NO_FANTASIA,NO_BAIRRO,NO_MUNICIPIO,CO_MUNICIPIO,CO_SIGLA_ESTADO,CO_CEP,ds_localidade,SK_REGISTRO,DATA_INGESTAO,YYYYMM
0,SP00003509205000001329730000101,F3575C9617F8998A,WLADMIR GUBEISSI PINTO FILHO,225250,S,MEDICO GINECOLOGISTA E OBSTETRA,CLINICA MEDICA SULLA PELLE S C LTDA,ACLIMACAO,SAO PAULO,355030,SP,01530000,"01530000,SAO PAULO,SP,Brasil",SP00003509205000001329730000101_F3575C9617F899...,2025-11-01,202201
1,SP00003509205000001329730000101,9D27061F6644A854,MARTHA TIDORI KIOTA KOTSUBO,225320,S,MEDICO EM RADIOLOGIA E DIAGNOSTICO POR IMAGEM,CLINICA MEDICA SULLA PELLE S C LTDA,ACLIMACAO,SAO PAULO,355030,SP,01530000,"01530000,SAO PAULO,SP,Brasil",SP00003509205000001329730000101_9D27061F6644A8...,2025-11-01,202201
2,3500100047406,F6965A7E959C6A39,GRAZIELE DAVID,251510,N,PSICOLOGO CLINICO,GRAZIELE DAVID PSICOLOGA EIRELI,CENTRO,ADAMANTINA,350010,SP,17800000,"17800000,ADAMANTINA,SP,Brasil",3500100047406_F6965A7E959C6A39_251510,2025-11-01,202201
3,3500100081655,1FC46A4EA7312E5B,KELLY PRESTES RUFINO,223605,N,FISIOTERAPEUTA GERAL,RUFINO PRESTES LTDA,CENTRO,ADAMANTINA,350010,SP,17800000,"17800000,ADAMANTINA,SP,Brasil",3500100081655_1FC46A4EA7312E5B_223605,2025-11-01,202201
4,3500100109789,EDEB3090A41EB3A5,JOANA DARC BORRO,251510,S,PSICOLOGO CLINICO,JOANA DARC BORRO,VILA CICMA,ADAMANTINA,350010,SP,17800000,"17800000,ADAMANTINA,SP,Brasil",3500100109789_EDEB3090A41EB3A5_251510,2025-11-01,202201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208984,3557307758561,33D124E57E11072D,FLAVIA DOMINGOS BRAGANHOLI ALBORGHETTE,251605,N,ASSISTENTE SOCIAL,CLINICA VILA FENIX,RANCHO NOVO,ESTIVA GERBI,355730,SP,13857000,"13857000,ESTIVA GERBI,SP,Brasil",3557307758561_33D124E57E11072D_251605,2025-11-01,202201
1208985,3557307758561,CD421979A9CDD2F2,VERA LUCIA BARBOSA,322205,N,TECNICO DE ENFERMAGEM,CLINICA VILA FENIX,RANCHO NOVO,ESTIVA GERBI,355730,SP,13857000,"13857000,ESTIVA GERBI,SP,Brasil",3557307758561_CD421979A9CDD2F2_322205,2025-11-01,202201
1208986,3557307758561,89AAC893DE5A15C7,PRISCILA DE VASCONCELLOS GALI,251510,N,PSICOLOGO CLINICO,CLINICA VILA FENIX,RANCHO NOVO,ESTIVA GERBI,355730,SP,13857000,"13857000,ESTIVA GERBI,SP,Brasil",3557307758561_89AAC893DE5A15C7_251510,2025-11-01,202201
1208987,3557307758561,69CD39B556C177AE,ELISEU MARCEL DOMINGOS,251505,N,PSICOLOGO EDUCACIONAL,CLINICA VILA FENIX,RANCHO NOVO,ESTIVA GERBI,355730,SP,13857000,"13857000,ESTIVA GERBI,SP,Brasil",3557307758561_69CD39B556C177AE_251505,2025-11-01,202201


In [8]:
import pandas as pd
from typing import Iterable, FrozenSet


class SingletonMeta(type):
    """
    Metaclass that keeps a single instance per concrete class.

    The first call to a class builds the instance and caches it in a registry
    shared by every class using this metaclass, keyed by the class itself.
    Later calls return the cached object; their arguments are ignored and
    ``__init__`` is not run again.
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]

        # First instantiation of this class: build it, then remember it.
        created = super().__call__(*args, **kwargs)
        registry[cls] = created
        return created


class Table(metaclass=SingletonMeta):
    """
    Base for tables (data layers).

    Each concrete subclass represents one table. Because the class is built
    with ``SingletonMeta``, each subclass yields a single shared instance.
    """
    name: str
    layer: str  # "silver" | "gold"

    def __init__(self, inputs: Iterable[pd.DataFrame] = ()):
        """
        Args:
            inputs: DataFrames this table is derived from. Defaults to none.
        """
        # DataFrames are mutable and unhashable, so they cannot be members of a
        # frozenset (that raised TypeError for any non-empty input). A tuple
        # keeps the collection immutable while accepting DataFrames.
        self._inputs: "tuple[pd.DataFrame, ...]" = tuple(inputs)

    @property
    def inputs(self) -> "tuple[pd.DataFrame, ...]":
        """Read-only, ordered view of the input DataFrames."""
        return self._inputs

    def definition(self) -> pd.DataFrame:
        """Transformation logic; every concrete table must override this."""
        raise NotImplementedError

    def describe(self) -> str:
        """Return a one-line summary with the table name, layer and input count."""
        return f"Tabela '{self.name}' (layer={self.layer}, inputs={len(self.inputs)})"


class Silver(Table):
    """Base class for silver-layer tables; only sets ``layer`` to "silver"."""
    layer: str = "silver"


class Gold(Table):
    """Base class for gold-layer tables; only sets ``layer`` to "gold"."""
    layer: str = "gold"


In [None]:
import io
import pandas as pd
from azure.storage.filedatalake import DataLakeServiceClient
import pyarrow as pa, pyarrow.parquet as pq

# ---------- Storage básico ----------
class Storage:
    """
    Minimal handle on one Azure Data Lake Gen2 (DFS) file system.

    Attributes:
        account_name: Storage account name used to build the DFS endpoint URL.
        file_system: Name of the file system this instance targets.
        client: ``DataLakeServiceClient`` bound to the account endpoint.
        fs: File-system client for ``file_system``, obtained from ``client``.
    """

    # Environment variable consulted when no explicit key is passed.
    KEY_ENV_VAR = "AZURE_STORAGE_KEY"

    def __init__(
        self,
        account_name: str = "cnesstorageaccount",
        file_system: str = "bronze",
        account_key: "str | None" = None,
    ):
        """
        Build the service and file-system clients.

        Args:
            account_name: Storage account name.
            file_system: File system to bind ``self.fs`` to.
            account_key: Credential passed to ``DataLakeServiceClient``. When
                omitted, the value of the ``AZURE_STORAGE_KEY`` environment
                variable is used.

        Raises:
            RuntimeError: If no key is passed and the environment variable is
                unset or empty.
        """
        # Imported here because this cell does not import ``os`` itself.
        import os

        self.account_name = account_name
        self.file_system = file_system

        # SECURITY: the account key must never live in the notebook. Any key that
        # was previously committed here should be considered leaked and rotated.
        key = account_key or os.environ.get(self.KEY_ENV_VAR)
        if not key:
            raise RuntimeError(
                f"Azure storage key not provided: set the {self.KEY_ENV_VAR} "
                "environment variable or pass account_key=..."
            )

        self.client = DataLakeServiceClient(
            account_url=f"https://{self.account_name}.dfs.core.windows.net",
            credential=key,
        )
        self.fs = self.client.get_file_system_client(self.file_system)

# Ready-to-use instances, one per file system. They must exist before the
# Silver class is defined, because its __init__ uses `bronze` and `silver` as
# default argument values.
bronze, silver, gold = (
    Storage(file_system=layer_name) for layer_name in ("bronze", "silver", "gold")
)

class SingletonMeta(type):
    """
    Metaclass that keeps a single instance per concrete class.

    The first call to a class builds the instance and caches it in a registry
    shared by every class using this metaclass, keyed by the class itself.
    Later calls return the cached object; their arguments are ignored and
    ``__init__`` is not run again.
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]

        # First instantiation of this class: build it, then remember it.
        created = super().__call__(*args, **kwargs)
        registry[cls] = created
        return created

# ---------- Camadas ----------
class Table(metaclass=SingletonMeta):
    """
    Common base for layer tables.

    Built with ``SingletonMeta``, so each concrete subclass has one cached
    instance; constructor arguments passed after the first instantiation are
    ignored by that metaclass.
    """
    # Layer label; assigned by each layer subclass.
    layer: str
    # Presumably the layers this kind of table may read from; it is not read or
    # enforced anywhere in the visible code. TODO confirm.
    allowed_layers: list[str]
    def __init__(self, name: str):
        """Store the table name and start with an empty mapping of input frames."""
        self.name = name
        # Input DataFrames keyed by a string label; filled in by subclasses.
        self.inputs: dict[str, pd.DataFrame] = {}

class Bronze(Table):
    """Base class for bronze-layer tables; declares only the layer metadata."""
    layer = "bronze"
    allowed_layers = ["bronze"]

# --- Silver com helpers genéricos da camada ---
class Silver(Table):
    """
    Base class for silver-layer tables, with helpers shared by the layer.

    Provides CSV readers for the bronze and silver file systems, a Parquet
    writer targeting silver, and ``run()``, which executes the subclass
    ``definition()`` and uploads its result.

    Subclasses are expected to set ``self.year_month`` and implement
    ``definition()`` before calling ``run()``.
    """
    layer = "silver"
    allowed_layers = ["bronze", "silver"]

    # Candidate CSV encodings, strictest first. "latin-1" maps every possible
    # byte and therefore never fails to decode, so it must come last: placed
    # first it would make the other candidates unreachable and would silently
    # turn UTF-8 input into mojibake.
    _CSV_ENCODINGS = ("utf-8-sig", "cp1252", "latin-1")

    def __init__(self, name: str, bronze_store: Storage = bronze, silver_store: Storage = silver):
        """
        Args:
            name: Table name; also the folder used when writing to silver.
            bronze_store: Storage whose ``fs`` is used for bronze reads.
            silver_store: Storage whose ``fs`` is used for silver reads/writes.
        """
        super().__init__(name)
        self._bronze_fs = bronze_store.fs
        self._silver_fs = silver_store.fs

    @classmethod
    def _detect_encoding(cls, data: bytes) -> str:
        """Return the first encoding in ``_CSV_ENCODINGS`` that decodes ``data`` strictly."""
        last_error = None
        for enc in cls._CSV_ENCODINGS:
            try:
                data.decode(enc)
            except UnicodeDecodeError as exc:
                last_error = exc
            else:
                return enc
        # Only reachable if a subclass overrides _CSV_ENCODINGS without a
        # catch-all such as "latin-1".
        if last_error is None:
            raise ValueError("_CSV_ENCODINGS is empty")
        raise last_error

    def _read_csv_from_fs(self, fs_client, path: str) -> pd.DataFrame:
        """
        Download ``path`` from ``fs_client`` and parse it as a ';'-separated CSV.

        Every column is read as ``str``. Malformed lines are reported as
        warnings rather than raising (``on_bad_lines="warn"``).

        Raises:
            UnicodeDecodeError: If no candidate encoding can decode the file.
        """
        data = fs_client.get_file_client(path).download_file().readall()
        # Pick the encoding with a cheap strict decode, then parse only once.
        encoding = self._detect_encoding(data)
        return pd.read_csv(io.BytesIO(data), sep=";", quotechar='"', dtype=str,
                           encoding=encoding, engine="python", on_bad_lines="warn")

    def read_csv_from_bronze(self, path: str) -> pd.DataFrame:
        """Read a CSV located at ``path`` in the bronze file system."""
        return self._read_csv_from_fs(self._bronze_fs, path)

    def read_csv_from_silver(self, path: str) -> pd.DataFrame:
        """Read a CSV located at ``path`` in the silver file system."""
        return self._read_csv_from_fs(self._silver_fs, path)

    def _write_parquet_to_silver(self, df: pd.DataFrame, year_month: str) -> None:
        """
        Write ``df`` to silver at ``{name}/{year_month}.parquet``.

        The frame is serialized in memory (pyarrow engine, snappy compression,
        index dropped) and uploaded with ``overwrite=True``.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("definition() deve retornar um pandas.DataFrame")
        dest_path = f"{self.name}/{year_month}.parquet"

        buf = io.BytesIO()
        # Requires pyarrow to be installed.
        df.to_parquet(buf, index=False, engine="pyarrow", compression="snappy")

        # getvalue() returns the full buffer regardless of the stream position.
        file_client = self._silver_fs.get_file_client(dest_path)
        file_client.upload_data(buf.getvalue(), overwrite=True)

    def run(self) -> None:
        """
        Execute the transformation (``definition``) and write the result to silver.

        Raises:
            AttributeError: If ``self.year_month`` is missing or not a ``str``,
                or if the instance has no ``definition`` attribute.
        """
        if not isinstance(getattr(self, "year_month", None), str):
            raise AttributeError("Defina self.year_month (ex.: '202401') antes de chamar .run().")
        if not hasattr(self, "definition"):
            raise AttributeError("Implemente .definition(self) na subclasse.")

        df = self.definition()  # the subclass returns the final DataFrame
        self._write_parquet_to_silver(df, self.year_month)


class Gold(Table):
    """Base class for gold-layer tables; declares only the layer metadata."""
    layer = "gold"
    allowed_layers = ["silver", "gold"]


In [31]:
from datetime import date
import pandas as pd

class CnesProfissionaisSilver(Silver):
    """
    Silver table registered under the name "cnes_profissionais".

    On construction, the bronze CSVs for ``year_month`` are downloaded into
    ``self.inputs``. ``definition()`` joins establishments, municipalities and
    service classifications into one row per establishment/service/
    classification.

    Caveat: ``Table`` is built with ``SingletonMeta``, so a second
    ``CnesProfissionaisSilver(...)`` call in the same kernel returns the first
    instance and ignores the new ``year_month``.

    NOTE(review): the table name says "profissionais" but ``definition()``
    produces a services table, and three of the loaded inputs
    (tbCargaHorariaSus, tbAtividadeProfissional, tbDadosProfissionalSus) are
    not used by it. Confirm whether this is work in progress.
    """

    # Bronze tables loaded on construction, in download order.
    _BRONZE_TABLES = (
        "tbEstabelecimento",
        "tbMunicipio",
        "rlEstabServClass",
        "tbClassificacaoServico",
        "tbCargaHorariaSus",
        "tbAtividadeProfissional",
        "tbDadosProfissionalSus",
    )

    def __init__(self, year_month: str):
        """
        Args:
            year_month: Period label (e.g. "202401") used both as the bronze
                folder and as the suffix of each CSV file name.
        """
        super().__init__(name="cnes_profissionais")
        self.year_month = year_month

        # Build each path as "{year_month}/{table}{year_month}.csv" and read it
        # with the helper inherited from Silver.
        self.inputs = {
            table: self.read_csv_from_bronze(f"{year_month}/{table}{year_month}.csv")
            for table in self._BRONZE_TABLES
        }

    def definition(self) -> pd.DataFrame:
        """
        Run the curation and return the final services DataFrame.

        Returns:
            One row per distinct ``SK_REGISTRO`` with the columns CO_UNIDADE,
            NO_MUNICIPIO, CO_MUNICIPIO, CO_SERVICO, CO_CLASSIFICACAO,
            DS_CLASSIFICACAO_SERVICO, SK_REGISTRO, DATA_INGESTAO and YYYYMM.

        Raises:
            KeyError: If an expected column is missing from the inputs.
        """
        # --- inputs ---
        # Only the establishments frame is modified below, so only it is copied;
        # merge() never mutates its operands.
        tbEstabelecimento = self.inputs["tbEstabelecimento"].copy()
        tbMunicipio = self.inputs["tbMunicipio"]
        rlEstabServClass = self.inputs["rlEstabServClass"]
        tbClassificacaoServico = self.inputs["tbClassificacaoServico"]

        # --- pre-process establishments + municipality (SP only) ---
        # Index directly rather than using .get(): with .get(), a missing column
        # became None -> NaN, the filter below matched nothing, and an empty
        # table could be written over existing silver data without any error.
        tbEstabelecimento["CO_ESTADO_GESTOR"] = pd.to_numeric(
            tbEstabelecimento["CO_ESTADO_GESTOR"], errors="coerce"
        )
        estab_sp = tbEstabelecimento[tbEstabelecimento["CO_ESTADO_GESTOR"] == 35]

        estab_munic = estab_sp.merge(
            tbMunicipio,
            left_on="CO_MUNICIPIO_GESTOR",
            right_on="CO_MUNICIPIO",
            how="inner",
            suffixes=("", "_mun"),
        )

        # --- service curation ---
        serv_join = (
            rlEstabServClass
            .merge(
                tbClassificacaoServico,
                left_on=["CO_SERVICO", "CO_CLASSIFICACAO"],
                right_on=["CO_SERVICO_ESPECIALIZADO", "CO_CLASSIFICACAO_SERVICO"],
                how="inner",
            )
            .merge(estab_munic, on="CO_UNIDADE", how="inner")
        )

        servicos = serv_join[
            [
                "CO_UNIDADE",
                "NO_MUNICIPIO",
                "CO_MUNICIPIO",
                "CO_SERVICO",
                "CO_CLASSIFICACAO",
                "DS_CLASSIFICACAO_SERVICO",
            ]
        ].copy()

        # --- metadata ---
        # Surrogate key: unit, service and classification joined with "_".
        servicos["SK_REGISTRO"] = (
            servicos["CO_UNIDADE"].astype(str)
            + "_"
            + servicos["CO_SERVICO"].astype(str)
            + "_"
            + servicos["CO_CLASSIFICACAO"].astype(str)
        )
        servicos["DATA_INGESTAO"] = date.today().isoformat()
        servicos["YYYYMM"] = self.year_month

        servicos = servicos.drop_duplicates(subset=["SK_REGISTRO"])

        # Text normalization: force the descriptive columns to str.
        for col in ["NO_MUNICIPIO", "DS_CLASSIFICACAO_SERVICO"]:
            servicos[col] = servicos[col].astype(str)

        return servicos


# Display the final result.
# Constructing the table downloads the bronze CSVs for the given period.
cnes_profissionais_silver = CnesProfissionaisSilver(year_month="202401")
# definition() only builds the DataFrame; nothing is written to silver here
# because run() is not called.
df_resultado = cnes_profissionais_silver.definition()
df_resultado.head()

Unnamed: 0,CO_UNIDADE,NO_MUNICIPIO,CO_MUNICIPIO,CO_SERVICO,CO_CLASSIFICACAO,DS_CLASSIFICACAO_SERVICO,SK_REGISTRO,DATA_INGESTAO,YYYYMM
0,3500100251402,ADAMANTINA,350010,112,1,ACOMPANHAMENTO DO PRE-NATAL DE RISCO HABITUAL,3500100251402_112_001,2025-11-01,202401
1,3500100251402,ADAMANTINA,350010,122,3,EXAME ELETROCARDIOGRAFICO,3500100251402_122_003,2025-11-01,202401
2,3500100251402,ADAMANTINA,350010,159,4,ESTRATEGIA DE SAUDE DA FAMILIA,3500100251402_159_004,2025-11-01,202401
3,3500100251402,ADAMANTINA,350010,174,1,INDIVIDUOS EM GERAL,3500100251402_174_001,2025-11-01,202401
4,3500100853437,ADAMANTINA,350010,121,1,RADIOLOGIA,3500100853437_121_001,2025-11-01,202401
