### Setup e Variáveis

In [1]:
from core.bemol_lakehouse import BemolLakeHouse
from core.bemol_controller import BemolController
from core.bemol_landing_reader import BemolLandingReader
from core.bemol_logger import BemolLogger
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

In [None]:
# Logger used by every helper in this notebook
logger = BemolLogger("bronze_products_carts")

# Spark session with Delta Lake enabled: the Delta package, its SQL extension
# and the Delta catalog are applied, in this order, to a single builder.
_delta_options = {
    "spark.jars.packages": "io.delta:delta-core_2.12:2.4.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}
_builder = SparkSession.builder.appName("IngestaoBronzeProdutos")
for _option, _value in _delta_options.items():
    _builder = _builder.config(_option, _value)
spark = _builder.getOrCreate()

# Helper class for reading/writing lakehouse data
lakehouse = BemolLakeHouse(spark, logger)

# Helper class for reading from the landing layer
landing = BemolLandingReader(logger)

In [3]:
# API endpoints that feed this ingestion
url_products, url_carts = (
    "https://fakestoreapi.com/products",
    "https://fakestoreapi.com/carts",
)

# Bronze-layer destinations, one per dataset
destination_path_products, destination_path_carts = (
    "../data/bronze/products",
    "../data/bronze/carts",
)

# Destination for the monitoring data
destination_path_monitor = "../data/monitoring/"

### Leitura

In [4]:
# Fetch both API endpoints into DataFrames; the tuple is evaluated left to
# right, so carts are still requested before products.
df_carts, df_products = (
    landing.read_api(spark, url_carts),
    landing.read_api(spark, url_products),
)

2025-10-16 19:54:27,890 - INFO - Iniciando operação: read_api
2025-10-16 19:54:27,912 - INFO - Lendo dados da API: https://fakestoreapi.com/carts
2025-10-16 19:54:37,403 - INFO - Dados lidos com sucesso da API: 7 linhas, 5 colunas.
2025-10-16 19:54:37,913 - INFO - Operação read_api finalizada em 10.02 segundos.
2025-10-16 19:54:37,940 - INFO - Iniciando operação: read_api
2025-10-16 19:54:37,944 - INFO - Lendo dados da API: https://fakestoreapi.com/products
2025-10-16 19:54:39,308 - INFO - Dados lidos com sucesso da API: 20 linhas, 7 colunas.
2025-10-16 19:54:39,309 - INFO - Operação read_api finalizada em 1.37 segundos.


In [6]:
# Preview the first 5 product rows as read from the API, without truncating column values
df_products.show(5, truncate=False)

+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+------------------------------------------------------------------------+------+----------+---------------------------------------------------------------------------+
|category      |description                                                                                                                                                                                                                                                                                                                                       |id |image                                                                   |price |rating    |title                  

In [5]:
# Preview the first 5 cart rows as read from the API, without truncating column values
df_carts.show(5, truncate=False)

+---+------------------------+---+-------------------------+------+
|__v|date                    |id |products                 |userId|
+---+------------------------+---+-------------------------+------+
|0  |2020-03-02T00:00:00.000Z|1  |[{1, 4}, {2, 1}, {3, 6}] |1     |
|0  |2020-01-02T00:00:00.000Z|2  |[{2, 4}, {1, 10}, {5, 2}]|1     |
|0  |2020-03-01T00:00:00.000Z|3  |[{1, 2}, {9, 1}]         |2     |
|0  |2020-01-01T00:00:00.000Z|4  |[{1, 4}]                 |3     |
|0  |2020-03-01T00:00:00.000Z|5  |[{7, 1}, {8, 1}]         |3     |
+---+------------------------+---+-------------------------+------+
only showing top 5 rows



### Transformações Produto

In [9]:
# Product bronze transformations, expressed as a single chain:
#   1. drop rows whose "id" is null
#   2. rename "title" to "product_title" to avoid ambiguity
#   3. flatten the nested "rating" field
# "rating_count" is extracted first because the next step replaces the nested
# "rating" column with its "rate" value.
df_products_bronze = (
    df_products
    .dropna(subset=["id"])
    .withColumnRenamed("title", "product_title")
    .withColumn("rating_count", col("rating.count"))
    .withColumn("rating", col("rating.rate"))
)

In [8]:
# Preview only the first 5 transformed product rows. Without a row limit,
# show() prints its default of 20 rows, and with truncate=False on the long
# "description" column that floods the cell output. Every other preview cell
# in this notebook already limits itself to 5 rows.
df_products_bronze.show(5, truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---+------------------------------------------------------------------------+------+------+-------------------------------------------------------------------------------------------------+------------+
|categ

### Transformações Carrinho

In [10]:
# Cart bronze transformations, expressed as a single chain:
#   1. drop rows where "id" or "products" is null
#   2. explode "products" so each product in a cart becomes its own row
#   3. pull out the nested fields and rename columns to avoid ambiguity
df_carts_bronze = (
    df_carts
    .dropna(subset=["id", "products"])
    .withColumn("products", explode("products"))
    .select(
        col("id").alias("cart_id"),
        col("userId").alias("user_id"),
        col("date").alias("cart_date"),
        col("products.productId").alias("product_id"),
        col("products.quantity").alias("product_quantity"),
    )
)

In [12]:
# Preview the first 5 exploded cart rows (one row per product per cart)
df_carts_bronze.show(5, truncate=False)

+-------+-------+------------------------+----------+----------------+
|cart_id|user_id|cart_date               |product_id|product_quantity|
+-------+-------+------------------------+----------+----------------+
|1      |1      |2020-03-02T00:00:00.000Z|1         |4               |
|1      |1      |2020-03-02T00:00:00.000Z|2         |1               |
|1      |1      |2020-03-02T00:00:00.000Z|3         |6               |
|2      |1      |2020-01-02T00:00:00.000Z|2         |4               |
|2      |1      |2020-01-02T00:00:00.000Z|1         |10              |
+-------+-------+------------------------+----------+----------------+
only showing top 5 rows



### Escrita

In [13]:
# Add the ingestion-timestamp control column for the bronze layer to both
# DataFrames (products first, then carts).
df_products_bronze, df_carts_bronze = (
    BemolController.control_field(df_products_bronze, layer="bronze"),
    BemolController.control_field(df_carts_bronze, layer="bronze"),
)

In [14]:
# Preview the first 5 cart rows after the control column was added
df_carts_bronze.show(5, truncate=False)

+-------+-------+------------------------+----------+----------------+--------------------------+
|cart_id|user_id|cart_date               |product_id|product_quantity|insertion_bronze          |
+-------+-------+------------------------+----------+----------------+--------------------------+
|1      |1      |2020-03-02T00:00:00.000Z|1         |4               |2025-10-16 20:26:06.221442|
|1      |1      |2020-03-02T00:00:00.000Z|2         |1               |2025-10-16 20:26:06.221442|
|1      |1      |2020-03-02T00:00:00.000Z|3         |6               |2025-10-16 20:26:06.221442|
|2      |1      |2020-01-02T00:00:00.000Z|2         |4               |2025-10-16 20:26:06.221442|
|2      |1      |2020-01-02T00:00:00.000Z|1         |10              |2025-10-16 20:26:06.221442|
+-------+-------+------------------------+----------+----------------+--------------------------+
only showing top 5 rows



In [None]:
# Write each bronze DataFrame to its destination in Delta Lake format
# (overwrite mode by default). Products are written before carts.
_bronze_outputs = (
    (df_products_bronze, destination_path_products, "bronze_products"),
    (df_carts_bronze, destination_path_carts, "bronze_carts"),
)
for _bronze_df, _bronze_path, _bronze_table in _bronze_outputs:
    lakehouse.write_bronze(_bronze_df, _bronze_path, table_name=_bronze_table)

In [16]:
# Write the monitoring data in Delta Lake format, in overwrite mode by default.
# The call's return value is the cell's last expression, so its DataFrame repr
# is displayed as output.
# NOTE(review): the monitoring table read back afterwards still contains a
# bronze_users row from an earlier run — confirm whether export_delta really
# overwrites or appends/merges.
lakehouse.monitor.export_delta(spark, destination_path_monitor)

2025-10-16 20:28:41,370 - INFO - Métricas exportadas com sucesso para ../data/monitoring/


DataFrame[table_name: string, row_count: bigint, col_count: bigint, timestamp: string]

In [17]:
# Read the monitoring Delta table back and preview up to 5 rows to confirm the export
df_monitor = spark.read.load(destination_path_monitor, format="delta")
df_monitor.show(n=5, truncate=False)

                                                                                

+---------------+---------+---------+-------------------+
|table_name     |row_count|col_count|timestamp          |
+---------------+---------+---------+-------------------+
|bronze_products|20       |9        |2025-10-16 20:27:04|
|bronze_carts   |14       |6        |2025-10-16 20:27:10|
|bronze_users   |10       |13       |2025-10-16 19:46:39|
+---------------+---------+---------+-------------------+

