In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone

# UTC start time (YYYYmmdd_HHMMSS) used to make the Spark application name
# unique per notebook run.
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-active session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName(f"Bronze-Purchase-Extra-Info-{ts}")
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import current_timestamp, to_utc_timestamp, col
from pyspark.sql.types import DecimalType

In [3]:
# Build the Bronze sample DataFrame in a single pipeline:
#   1. create it from in-memory rows (date/time fields arrive as strings),
#   2. cast every column to its intended type,
#   3. stamp each row with the ingestion time.
df_purchase_extra_info_bronze = (
    spark.createDataFrame(
        [
            ("2023-01-23 00:05:00", "2023-01-23", 55, "nacional"),
            ("2023-01-25 23:59:59", "2023-01-25", 56, "internacional"),
            ("2023-02-28 01:10:00", "2023-02-28", 69, "nacional"),
            ("2023-03-12 07:00:00", "2023-03-12", 69, "internacional"),
        ],
        schema=[
            "transaction_datetime",
            "transaction_date",
            "purchase_id",
            "subsidiary",
        ],
    )
    # Enforce the target schema explicitly rather than relying on inference.
    .withColumn("transaction_datetime", col("transaction_datetime").cast("timestamp"))
    .withColumn("transaction_date", col("transaction_date").cast("date"))
    .withColumn("purchase_id", col("purchase_id").cast("bigint"))
    .withColumn("subsidiary", col("subsidiary").cast("string"))
    # Ingestion time. current_timestamp() is used as-is: wrapping it in
    # to_utc_timestamp(..., "UTC") would interpret the value as UTC and
    # convert it to UTC, i.e. return the same value, while wrongly suggesting
    # a timezone conversion takes place.
    .withColumn("ingestion_date", current_timestamp())
)

In [4]:
# Register the Bronze DataFrame as a temporary view so it can be queried with
# Spark SQL (replaces any view already registered under the same name).
df_purchase_extra_info_bronze.createOrReplaceTempView("purchase_extra_info_bronze")

# Preview the view's contents through SQL.
(
    spark
    .sql("SELECT * FROM purchase_extra_info_bronze")
    .show()
)

+--------------------+----------------+-----------+-------------+--------------------+
|transaction_datetime|transaction_date|purchase_id|   subsidiary|      ingestion_date|
+--------------------+----------------+-----------+-------------+--------------------+
| 2023-01-23 00:05:00|      2023-01-23|         55|     nacional|2026-01-14 10:30:...|
| 2023-01-25 23:59:59|      2023-01-25|         56|internacional|2026-01-14 10:30:...|
| 2023-02-28 01:10:00|      2023-02-28|         69|     nacional|2026-01-14 10:30:...|
| 2023-03-12 07:00:00|      2023-03-12|         69|internacional|2026-01-14 10:30:...|
+--------------------+----------------+-----------+-------------+--------------------+



In [5]:
# Persist the Bronze data as Parquet, partitioned by transaction_date.
# NOTE(review): "append" mode adds to whatever is already stored at the
# target path, so re-running this cell writes the same rows again.
(
    df_purchase_extra_info_bronze.write
    .format("parquet")
    .mode("append")
    .partitionBy("transaction_date")
    .save("data_lake/bronze/purchase_extra_info")
)