In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone

# UTC timestamp suffix (YYYYmmdd_HHMMSS) so each run gets its own application name
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# Reuse an already-running session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .appName(f"Bronze-Product-Item-{ts}")
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import current_timestamp, to_utc_timestamp, col
from pyspark.sql.types import DecimalType

In [3]:
# Sample rows for the bronze product-item table, built directly in memory
product_item_rows = [
    ("2023-01-20 22:02:00", "2023-01-20", 55, 696969, 10, 50.00),
    ("2023-01-25 23:59:59", "2023-01-25", 56, 808080, 120, 2400.00),
    ("2023-02-26 03:00:00", "2023-02-26", 69, 373737, 2, 2000.00),
    ("2023-07-12 09:00:00", "2023-07-12", 55, 696969, 10, 55.00),
]

# (column name, target type) pairs, in output column order
product_item_columns = [
    ("transaction_datetime", "timestamp"),
    ("transaction_date", "date"),
    ("purchase_id", "bigint"),
    ("product_id", "bigint"),
    ("item_quantity", "int"),
    ("purchase_value", DecimalType(18, 2)),
]

# Only column names are supplied here, so Spark infers the initial types
# from the Python values; the explicit casts below set the final schema.
df_product_item_raw = spark.createDataFrame(
    product_item_rows,
    schema=[name for name, _dtype in product_item_columns],
)

# Cast every column and add ingestion_date in a single projection instead of
# a chain of withColumn calls (each withColumn adds its own projection).
# current_timestamp() is used as-is: the previous
# to_utc_timestamp(current_timestamp(), "UTC") wrapper shifted the value by a
# zero offset, i.e. it did nothing, while suggesting a conversion took place.
# NOTE(review): how this timestamp is rendered by .show() depends on the
# session time zone setting, which this notebook does not configure.
df_product_item_bronze = df_product_item_raw.select(
    *[col(name).cast(dtype).alias(name) for name, dtype in product_item_columns],
    current_timestamp().alias("ingestion_date"),
)

In [4]:
# Register the DataFrame as a temporary view so it can be queried with SQL
df_product_item_bronze.createOrReplaceTempView("product_item_bronze")

# Query the view and print its rows as a quick sanity check
bronze_preview = spark.sql("SELECT * FROM product_item_bronze")
bronze_preview.show()

+--------------------+----------------+-----------+----------+-------------+--------------+--------------------+
|transaction_datetime|transaction_date|purchase_id|product_id|item_quantity|purchase_value|      ingestion_date|
+--------------------+----------------+-----------+----------+-------------+--------------+--------------------+
| 2023-01-20 22:02:00|      2023-01-20|         55|    696969|           10|         50.00|2026-01-14 23:10:...|
| 2023-01-25 23:59:59|      2023-01-25|         56|    808080|          120|       2400.00|2026-01-14 23:10:...|
| 2023-02-26 03:00:00|      2023-02-26|         69|    373737|            2|       2000.00|2026-01-14 23:10:...|
| 2023-07-12 09:00:00|      2023-07-12|         55|    696969|           10|         55.00|2026-01-14 23:10:...|
+--------------------+----------------+-----------+----------+-------------+--------------+--------------------+



In [5]:
# Persist the bronze table as Parquet, one sub-directory per transaction_date.
# NOTE(review): "append" adds to whatever is already stored at this path, so
# re-running the notebook writes the same sample rows again — confirm that
# this is intended.
(
    df_product_item_bronze.write
    .format("parquet")
    .mode("append")
    .partitionBy("transaction_date")
    .save("data_lake/bronze/product_item")
)