### Setup e Variáveis

In [1]:
from core.bemol_validator import BemolValidator
from core.bemol_lakehouse import BemolLakeHouse
from core.bemol_controller import BemolController
from core.bemol_logger import BemolLogger
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws

In [None]:
# Logger dedicated to this notebook's pipeline ("silver_users").
logger = BemolLogger("silver_users")

# Spark settings required for Delta Lake: the Delta package itself, the SQL
# extension, and the Delta-aware session catalog.
delta_conf = {
  "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
  "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
  "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Build (or reuse) the session, applying every setting in declaration order.
session_builder = SparkSession.builder.appName("TransformacaoSilverUsuarios")
for conf_key, conf_value in delta_conf.items():
  session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Data read/write helper, wired to the Spark session and the logger above.
lakehouse = BemolLakeHouse(spark, logger)

In [3]:
# Where the monitoring metrics are saved.
destination_path_monitor = "../data/monitoring/"

# Source (bronze) and target (silver) locations for the users dataset.
origin_path, destination_path = (
  "../data/bronze/users/",
  "../data/silver/users/",
)

### Leitura

In [4]:
# Read the users data from the bronze layer (origin_path) through the
# BemolLakeHouse helper; the result is the raw input for the transformations.
df_users = lakehouse.read_bronze(origin_path)

2025-10-16 20:35:54,544 - INFO - Iniciando operação: read_bronze
2025-10-16 20:35:54,549 - INFO - Lendo dados da camada Bronze: ../data/bronze/users/
25/10/16 20:36:03 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2025-10-16 20:36:10,321 - INFO - Dados lidos com sucesso da camada Bronze: 10 linhas, 13 colunas.
2025-10-16 20:36:10,328 - INFO - Operação read_bronze finalizada em 15.78 segundos.


### Transformações

In [12]:
# De-duplicate on the user id; the bronze frame name is intentionally reused.
df_users = df_users.dropDuplicates(["id"])

# full_name = first_name and last_name joined by a single space.
full_name_expr = concat_ws(" ", col("first_name"), col("last_name"))

# Only these columns are carried into the silver layer.
silver_columns = ["id", "username", "full_name", "email", "phone", "city"]

# Derive full_name, project the silver columns, then run the e-mail
# validation from BemolValidator on the "email" column.
df_users_silver = BemolValidator.validate_email(
  df_users.withColumn("full_name", full_name_expr).select(*silver_columns),
  "email",
)

### Escrita

In [13]:
# Add the ingestion-timestamp control column for the silver layer
# (it appears as `insertion_silver` in the preview output of this notebook).
df_users_silver = BemolController.control_field(df_users_silver, layer="silver")

In [14]:
# Preview the first 5 rows with full (untruncated) column values.
df_users_silver.show(5, truncate=False)

+---+---------+--------------+------------------+--------------+-----------+--------------+--------------------------+
|id |username |full_name     |email             |phone         |city       |is_valid_email|insertion_silver          |
+---+---------+--------------+------------------+--------------+-----------+--------------+--------------------------+
|1  |johnd    |john doe      |john@gmail.com    |1-570-236-7033|kilcoole   |true          |2025-10-16 21:10:07.129131|
|2  |mor_2314 |david morrison|morrison@gmail.com|1-570-236-7033|kilcoole   |true          |2025-10-16 21:10:07.129131|
|3  |kevinryan|kevin ryan    |kevin@gmail.com   |1-567-094-1345|Cullman    |true          |2025-10-16 21:10:07.129131|
|4  |donero   |don romer     |don@gmail.com     |1-765-789-6734|San Antonio|true          |2025-10-16 21:10:07.129131|
|5  |derek    |derek powell  |derek@gmail.com   |1-956-001-1945|san Antonio|true          |2025-10-16 21:10:07.129131|
+---+---------+--------------+------------------+--------------+-----------+--------------+--------------------------+

In [15]:
# Write the silver users DataFrame to destination_path, passing
# table_name="silver_users" (the name that shows up in the metrics log line).
# NOTE(review): the original comment states the write uses Delta Lake format
# with overwrite as the default mode; that is decided inside
# BemolLakeHouse.write_silver and is not visible here — confirm there.
lakehouse.write_silver(df_users_silver, destination_path, table_name="silver_users")

2025-10-16 21:10:31,782 - INFO - Iniciando operação: write_silver
2025-10-16 21:10:31,790 - INFO - Escrevendo dados na camada silver em ../data/silver/users/
2025-10-16 21:10:36,883 - INFO - Dados escritos com sucesso na camada silver.
2025-10-16 21:10:37,267 - INFO - Métricas silver_users: 10 linhas, 8 colunas.
2025-10-16 21:10:37,268 - INFO - Operação write_silver finalizada em 5.49 segundos.


In [16]:
# Export the collected monitoring metrics to destination_path_monitor.
# NOTE(review): the original comment states Delta Lake format in overwrite
# mode by default, but the monitoring table read back later in this notebook
# still holds rows from earlier runs (bronze_* tables) — verify the actual
# write/merge behavior inside export_delta.
# NOTE(review): the call's return value is the cell's last expression, so its
# DataFrame repr is echoed as output; end the line with ';' if that is noise.
lakehouse.monitor.export_delta(spark, destination_path_monitor)

2025-10-16 21:10:58,603 - INFO - Métricas exportadas com sucesso para ../data/monitoring/


DataFrame[table_name: string, row_count: bigint, col_count: bigint, timestamp: string]

In [17]:
# Read the monitoring metrics back from the Delta location to inspect them.
df_monitor = spark.read.load(destination_path_monitor, format="delta")

# Show up to 5 metric rows without truncating column values.
df_monitor.show(n=5, truncate=False)

+---------------+---------+---------+-------------------+
|table_name     |row_count|col_count|timestamp          |
+---------------+---------+---------+-------------------+
|bronze_products|20       |9        |2025-10-16 20:27:04|
|bronze_carts   |14       |6        |2025-10-16 20:27:10|
|silver_users   |10       |8        |2025-10-16 21:10:37|
|bronze_users   |10       |13       |2025-10-16 19:46:39|
+---------------+---------+---------+-------------------+

