In [0]:
# Fully qualified name of the Silver 2 table used as the source for df_gold.
silver_2_table = "saas_project.core.silver_2_data"

# Read the already-treated Silver 2 table directly; no transformation is applied here.
df_gold = spark.read.table(silver_2_table)



In [0]:
import os
import shutil

# Export df_gold as a single CSV file named gold_data.csv.
#
# Spark always writes a *directory* of part-* files, so the data is first written
# to a temporary folder, then the single part file is moved to its final name and
# the temporary folder is removed.

# Folder where the CSV will be saved
output_dir = "/Volumes/saas_project/core/download/gold_data_csv/"
final_csv_path = os.path.join(output_dir, "gold_data.csv")

# 1 Drop the "source_file" column (per the original note, it is VOID-typed and
#   makes the CSV writer fail)
df_gold_clean = df_gold.drop("source_file")

# 2 Write the DataFrame as CSV; coalesce(1) yields a single part-00000... file
temp_path = os.path.join(output_dir, "temp_csv")
df_gold_clean.coalesce(1).write.mode("overwrite").option("header", "true").csv(temp_path)

try:
    # 3 Find the part file produced by Spark and rename it to gold_data.csv.
    #   The listing is sorted so the choice is deterministic.
    part_file_name = next(
        (name for name in sorted(os.listdir(temp_path)) if name.endswith(".csv")),
        None,
    )
    if part_file_name is None:
        # Fail loudly instead of reporting success for a file that was never created.
        raise FileNotFoundError(
            f"Nenhum arquivo .csv foi gerado pelo Spark em: {temp_path}"
        )
    temp_csv_file = os.path.join(temp_path, part_file_name)
    shutil.move(temp_csv_file, final_csv_path)
finally:
    # 4 Always remove the temporary folder, even when the move above failed
    shutil.rmtree(temp_path)

print(f"Arquivo CSV final salvo em: {final_csv_path}")


Arquivo CSV final salvo em: /Volumes/saas_project/core/download/gold_data_csv/gold_data.csv


In [0]:
# Preview the first 10 rows of the cleaned Gold DataFrame (the one without the VOID column)
gold_preview_df = df_gold_clean.limit(10)
display(gold_preview_df)


id,nome,idade,email,cidade,salario,ingestion_time
1,Pessoa 0,26.0,user0@exemplo.com,,2036.32,2026/02/19-18:05:36
2,Pessoa 1,68.0,user1@exemplo.com,Rio,3705.09,2026/02/19-18:05:36
3,Pessoa 2,67.0,user2@exemplo.com,São Paulo,5693.16,2026/02/19-18:05:36
4,Pessoa 3,38.0,user3@exemplo.com,Belo Horizonte,9376.51,2026/02/19-18:05:36
5,Pessoa 4,53.0,user4@exemplo.com,,6392.52,2026/02/19-18:05:36
6,Pessoa 5,49.0,user5@exemplo.com,Rio,7172.05,2026/02/19-18:05:36
7,Pessoa 6,,user6@exemplo.com,,3802.53,2026/02/19-18:05:36
8,Pessoa 7,47.0,user7@exemplo.com,,4748.2,2026/02/19-18:05:36
9,Pessoa 8,27.0,user8@exemplo.com,Belo Horizonte,4476.1,2026/02/19-18:05:36
10,Pessoa 9,61.0,user9@exemplo.com,,9242.02,2026/02/19-18:05:36


In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import current_timestamp
import uuid

# Reuse the active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# 1 Unique identifier for this metrics batch
batch_id = str(uuid.uuid4())

# 2 Metrics: row counts of the Silver and Silver 2 tables; the difference is
#   recorded as the number of rejected records.
total_records = spark.read.table("saas_project.core.silver_data").count()
valid_records = spark.read.table("saas_project.core.silver_2_data").count()
rejected_records = total_records - valid_records

# 3 Single-row metrics DataFrame, kept in the shape the original note describes
#   as compatible with the orchestrator. valid_records is not persisted, and the
#   column is named "rejected_record" (singular).
metrics_df = spark.createDataFrame(
    [Row(batch_id=batch_id, total_records=total_records, rejected_record=rejected_records)]
).withColumn("processing_ts", current_timestamp())

# 4 Append to the Delta metrics table, allowing schema evolution (mergeSchema)
(
    metrics_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable("saas_project.core.silver2_metrics")
)

print("Métricas gravadas com sucesso!")
# NOTE(review): metrics_df is lazy, so current_timestamp() is presumably evaluated
# again for this show(); the processing_ts printed here may differ slightly from
# the value that was written to the table. Confirm if that matters.
metrics_df.show(5)


Métricas gravadas com sucesso!
+--------------------+-------------+---------------+--------------------+
|            batch_id|total_records|rejected_record|       processing_ts|
+--------------------+-------------+---------------+--------------------+
|b501ecb4-17b5-47b...|      1000000|              0|2026-02-19 21:13:...|
+--------------------+-------------+---------------+--------------------+

