### Setup e Variáveis

In [1]:
from core.bemol_lakehouse import BemolLakeHouse
from core.bemol_controller import BemolController
from core.bemol_landing_reader import BemolLandingReader
from core.bemol_logger import BemolLogger
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Instanciando classe de logger
logger = BemolLogger("bronze_users")

# Configurando Spark com Delta Lake
spark = (
  SparkSession.builder
  .appName("IngestaoBronzeUsuarios")
  .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate()
)

# Intanciando classe de leitura da camada landing
landing = BemolLandingReader(logger)

# Intanciando classe de leitura/escrita de dados relacionados ao LakeHouse
lakehouse = BemolLakeHouse(spark, logger)

In [3]:
# Definindo URL da API e destino dos dados
url_users = "https://fakestoreapi.com/users"
destination_products = "../data/bronze/users"

# Path para salvar os dados de monitoramento
destination_path_monitor = "../data/monitoring/"

### Leitura

In [4]:
# Lendo dados da API e criando DataFrame
df_users = landing.read_api(spark, url_users)

2025-10-16 20:14:07,077 - INFO - Iniciando operação: read_api
2025-10-16 20:14:07,093 - INFO - Lendo dados da API: https://fakestoreapi.com/users
2025-10-16 20:14:24,470 - INFO - Dados lidos com sucesso da API: 10 linhas, 8 colunas.
2025-10-16 20:14:24,472 - INFO - Operação read_api finalizada em 17.4 segundos.


In [5]:
df_users.show(5, truncate=False)

                                                                                

+---+----------------------------------------------------------------------+------------------+---+-----------------+---------+--------------+---------+
|__v|address                                                               |email             |id |name             |password |phone         |username |
+---+----------------------------------------------------------------------+------------------+---+-----------------+---------+--------------+---------+
|0  |{kilcoole, {-37.3159, 81.1496}, 7682, new road, 12926-3874}           |john@gmail.com    |1  |{john, doe}      |m38rmF$  |1-570-236-7033|johnd    |
|0  |{kilcoole, {-37.3159, 81.1496}, 7267, Lovers Ln, 12926-3874}          |morrison@gmail.com|2  |{david, morrison}|83r5^_   |1-570-236-7033|mor_2314 |
|0  |{Cullman, {40.3467, -30.1310}, 86, Frances Ct, 29567-1452}            |kevin@gmail.com   |3  |{kevin, ryan}    |kev02937@|1-567-094-1345|kevinryan|
|0  |{San Antonio, {50.3467, -20.1310}, 6454, Hunters Creek Dr, 98234-1734}|don@gm

### Transformações

In [None]:
# Remove registros com valores nulos em colunas críticas
df_users_bronze = df_users.dropna(subset=["id"])

# Remove coluna password
df_users_bronze = df_users_bronze.drop("password")

# Extrai campos aninhados
df_users_bronze = (
    df_users_bronze
    .withColumn("first_name", col("name.firstname"))
    .withColumn("last_name", col("name.lastname"))
    .withColumn("city", col("address.city"))
    .withColumn("number", col("address.number"))
    .withColumn("street", col("address.street"))
    .withColumn("zip_code", col("address.zipcode"))
    .withColumn("lat", col("address.geolocation.lat"))
    .withColumn("long", col("address.geolocation.long"))
    .drop("name", "address", "__v")  # remove as colunas aninhadas originais
)

In [6]:
df_users_bronze.show(5, truncate=False)

+------------------+---+--------------+---------+----------+---------+-----------+------+----------------+----------+--------+--------+
|email             |id |phone         |username |first_name|last_name|city       |number|street          |zip_code  |lat     |long    |
+------------------+---+--------------+---------+----------+---------+-----------+------+----------------+----------+--------+--------+
|john@gmail.com    |1  |1-570-236-7033|johnd    |john      |doe      |kilcoole   |7682  |new road        |12926-3874|-37.3159|81.1496 |
|morrison@gmail.com|2  |1-570-236-7033|mor_2314 |david     |morrison |kilcoole   |7267  |Lovers Ln       |12926-3874|-37.3159|81.1496 |
|kevin@gmail.com   |3  |1-567-094-1345|kevinryan|kevin     |ryan     |Cullman    |86    |Frances Ct      |29567-1452|40.3467 |-30.1310|
|don@gmail.com     |4  |1-765-789-6734|donero   |don       |romer    |San Antonio|6454  |Hunters Creek Dr|98234-1734|50.3467 |-20.1310|
|derek@gmail.com   |5  |1-956-001-1945|derek    

### Escrita

In [9]:
# Adiciona coluna de controle de ingestão
df_users_bronze = BemolController.control_field(df_users_bronze, layer="bronze")

In [10]:
df_users_bronze.show(truncate=False)

+---+---------+------------------+--------------+----------+---------+-----------+------+----------------+----------+--------+----------+--------------------------+
|id |username |email             |phone         |first_name|last_name|city       |number|street          |zip_code  |lat     |long      |insertion_bronze          |
+---+---------+------------------+--------------+----------+---------+-----------+------+----------------+----------+--------+----------+--------------------------+
|1  |johnd    |john@gmail.com    |1-570-236-7033|john      |doe      |kilcoole   |7682  |new road        |12926-3874|-37.3159|81.1496   |2025-10-16 19:45:38.313673|
|2  |mor_2314 |morrison@gmail.com|1-570-236-7033|david     |morrison |kilcoole   |7267  |Lovers Ln       |12926-3874|-37.3159|81.1496   |2025-10-16 19:45:38.313673|
|3  |kevinryan|kevin@gmail.com   |1-567-094-1345|kevin     |ryan     |Cullman    |86    |Frances Ct      |29567-1452|40.3467 |-30.1310  |2025-10-16 19:45:38.313673|
|4  |doner

In [None]:
# Escreve os dados na camada bronze no formato Delta Lake e no modo overwrite como padrão
lakehouse.write_bronze(df_users_bronze, destination_products, table_name="bronze_users")

In [12]:
# Escreve os dados de monitoramento no formato Delta Lake e no modo overwrite como padrão
lakehouse.monitor.export_delta(spark, destination_path_monitor)

2025-10-16 19:47:10,086 - INFO - Métricas exportadas com sucesso para ../data/monitoring/


DataFrame[table_name: string, row_count: bigint, col_count: bigint, timestamp: string]

In [13]:
df_monitor = spark.read.format("delta").load(destination_path_monitor)
df_monitor.show(5, truncate=False)

+------------+---------+---------+-------------------+
|table_name  |row_count|col_count|timestamp          |
+------------+---------+---------+-------------------+
|bronze_users|10       |13       |2025-10-16 19:46:39|
+------------+---------+---------+-------------------+

