# Bronze to Silver - Fato Consultas

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    col, when, coalesce, lit, current_timestamp, row_number
)
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime

## Configurações

In [0]:
from datetime import datetime

# Storage locations. Every abfss:// URI below is derived from STORAGE_ACCOUNT
# so the account name only has to be changed in one place.
STORAGE_ACCOUNT = "mystoacc"
BRONZE_PATH = f"abfss://bronze@{STORAGE_ACCOUNT}.dfs.core.windows.net/facts"
SILVER_PATH = f"abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/facts"

# Select the catalog, create the Silver database if it is missing, and make it
# the current database for the rest of the notebook.
spark.sql("USE CATALOG hive_metastore")
# The database location previously hard-coded the account name; it now uses
# STORAGE_ACCOUNT like the Bronze/Silver paths (same SQL text for the current value).
spark.sql(f"""
CREATE DATABASE IF NOT EXISTS hive_metastore.healthcare_silver
LOCATION 'abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/'
""")
spark.sql("USE healthcare_silver")

# Captured once so every row written by this run carries the same timestamp/date.
# NOTE: datetime.now() is called without a timezone, so this is a naive value
# in the local time of the machine running the notebook.
PROCESSING_TIMESTAMP = datetime.now()
PROCESSING_DATE = PROCESSING_TIMESTAMP.strftime("%Y-%m-%d")

print(f"Processamento Facts iniciado: {PROCESSING_TIMESTAMP}")

## Funções de Qualidade

In [0]:
def add_silver_metadata(df: DataFrame) -> DataFrame:
    """Append the Silver-layer audit columns to ``df``.

    Two literal columns are added, both taken from the module-level values
    captured when the notebook's configuration ran:

    - ``silver_processed_at``: ``PROCESSING_TIMESTAMP``
    - ``silver_processing_date``: ``PROCESSING_DATE`` (``YYYY-MM-DD`` string)

    Returns a new DataFrame; the input is not modified.
    """
    processed_at = lit(PROCESSING_TIMESTAMP)
    processing_date = lit(PROCESSING_DATE)

    with_timestamp = df.withColumn("silver_processed_at", processed_at)
    return with_timestamp.withColumn("silver_processing_date", processing_date)

def remove_duplicates(df: DataFrame, key_columns: list) -> DataFrame:
    """Keep a single row per key, preferring the latest ``ingestion_timestamp``.

    Rows are grouped by ``key_columns`` and numbered within each group in
    descending ``ingestion_timestamp`` order; only the first row of each group
    is kept. Rows that share the same timestamp are not ordered any further
    by this function.

    Args:
        df: Input DataFrame; must contain ``ingestion_timestamp`` and every
            column named in ``key_columns``.
        key_columns: Column names that identify a logical record.

    Returns:
        A DataFrame with the same columns as ``df`` (the temporary
        ``row_num`` helper column is dropped before returning).
    """
    newest_first = col("ingestion_timestamp").desc()
    per_key = Window.partitionBy(key_columns).orderBy(newest_first)

    ranked = df.withColumn("row_num", row_number().over(per_key))
    latest_only = ranked.filter(col("row_num") == 1)
    return latest_only.drop("row_num")

## Transformações Fato

In [0]:
def transform_fato_consultas(df: DataFrame) -> DataFrame:
    """Project, type and validate the Bronze ``fato_consultas`` rows.

    Steps, in order:

    1. Select the primary key, the foreign keys, the measures and
       ``ingestion_timestamp``, casting keys and ``plano_cobriu`` to ``int``
       and ``valor_total`` to ``decimal(10,2)``.
    2. Clamp negative ``valor_total`` to zero (null stays null).
    3. Normalise ``plano_cobriu`` to 0/1: any other value, including null,
       becomes 0.
    4. Drop rows where ``id_consulta`` or any ``sk_*`` key is null after the
       cast.

    Args:
        df: Bronze DataFrame containing the columns selected below.

    Returns:
        The cleaned DataFrame with ``valor_total`` typed as ``decimal(10,2)``.
    """

    df_transformed = df.select(
        # Primary key
        col("id_consulta").cast("int"),

        # Foreign keys
        col("sk_paciente").cast("int"),
        col("sk_medico").cast("int"),
        col("sk_clinica").cast("int"),
        col("sk_diagnostico").cast("int"),
        col("sk_exame").cast("int"),
        col("sk_data").cast("int"),

        # Measures
        col("valor_total").cast("decimal(10,2)"),
        col("plano_cobriu").cast("int"),

        # Kept so duplicates can later be resolved by recency
        col("ingestion_timestamp")
    )

    # The replacement value must have the same decimal type as the column:
    # mixing a double literal (0.0) with a decimal column makes Spark coerce
    # the whole CASE WHEN result to double, silently undoing the cast above.
    zero_amount = lit(0).cast("decimal(10,2)")

    df_transformed = df_transformed.withColumn(
        "valor_total",
        when(col("valor_total") < 0, zero_amount).otherwise(col("valor_total"))
    ).withColumn(
        "plano_cobriu",
        when(col("plano_cobriu").isin([0, 1]), col("plano_cobriu")).otherwise(0)
    )

    # Every key is mandatory: discard rows missing any of them.
    df_transformed = df_transformed.filter(
        col("id_consulta").isNotNull() &
        col("sk_paciente").isNotNull() &
        col("sk_medico").isNotNull() &
        col("sk_clinica").isNotNull() &
        col("sk_diagnostico").isNotNull() &
        col("sk_exame").isNotNull() &
        col("sk_data").isNotNull()
    )

    return df_transformed

## Pipeline de Processamento

In [0]:
def process_fact_to_silver(table_name: str) -> dict:
    """Run the full Bronze -> Silver pipeline for a fact table.

    Reads the Bronze Delta table at ``BRONZE_PATH/<table_name>``, applies
    ``transform_fato_consultas``, de-duplicates on ``id_consulta``, adds the
    Silver metadata columns, and then either MERGEs into the existing Silver
    Delta table at ``SILVER_PATH/<table_name>`` or creates it.

    Args:
        table_name: Name of the fact table; used for the Bronze/Silver
            sub-paths and for the ``healthcare_silver.<table_name>`` table.

    Returns:
        ``{"table": ..., "status": "SUCCESS"}`` on success, or
        ``{"table": ..., "status": "FAILED", "error": <message>}`` if any
        step raises. Exceptions are reported in the result instead of being
        propagated, so the caller can summarise the run.
    """
    try:
        print(f"\n{'='*60}")
        print(f"Processando FATO: {table_name}")
        print('='*60)

        bronze_path = f"{BRONZE_PATH}/{table_name}"
        df_bronze = spark.read.format("delta").load(bronze_path)

        print("Lidos da Bronze")

        df_transformed = transform_fato_consultas(df_bronze)

        key_columns = ["id_consulta"]
        df_transformed = remove_duplicates(df_transformed, key_columns)

        df_transformed = add_silver_metadata(df_transformed)

        silver_path = f"{SILVER_PATH}/{table_name}"

        if DeltaTable.isDeltaTable(spark, silver_path):
            # Incremental run: upsert by primary key into the existing table.
            delta_table = DeltaTable.forPath(spark, silver_path)

            merge_condition = "target.id_consulta = source.id_consulta"

            delta_table.alias("target").merge(
                df_transformed.alias("source"),
                merge_condition
            ).whenMatchedUpdateAll() \
            .whenNotMatchedInsertAll() \
            .execute()

            print("MERGE executado com sucesso")

        else:
            # First run: create the table AT silver_path. Without an explicit
            # path, saveAsTable stores the data under the database location,
            # which is not the path checked by isDeltaTable above -- the check
            # would then never succeed and every run would overwrite the
            # table instead of merging into it.
            # NOTE(review): if an earlier run already registered this table
            # without a path, confirm how the existing table should be
            # migrated before relying on this branch.
            df_transformed.write.format("delta") \
                .mode("overwrite") \
                .option("mergeSchema", "true") \
                .option("path", silver_path) \
                .saveAsTable(f"healthcare_silver.{table_name}")

            print("Tabela criada com sucesso")

        return {
            "table": table_name,
            "status": "SUCCESS"
        }

    except Exception as e:
        # Deliberate boundary: report the failure in the result dict so the
        # execution summary can display it.
        print(f"ERRO: {str(e)}")
        return {
            "table": table_name,
            "status": "FAILED",
            "error": str(e)
        }

## Execução

In [0]:
# Run the Bronze -> Silver pipeline for the consultations fact table. The
# returned status dict is kept in `result` for the execution summary.
result = process_fact_to_silver("fato_consultas")

## Sumário de Execução

In [0]:
import pandas as pd

# Show the single pipeline result as a one-row table framed by banner lines.
results_df = pd.DataFrame([result])
print(f"\n{'=' * 80}")
print("SUMÁRIO DE EXECUÇÃO - FACTS BRONZE TO SILVER")
print("=" * 80)
display(results_df)

# Report the outcome; a failed run carries its message under the "error" key.
if result["status"] != "SUCCESS":
    print(f"\nERRO ao processar FATO: {result.get('error', 'Unknown')}")
else:
    print("\nTabela FATO processada com sucesso!")

print(f"\n{'=' * 80}")