In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone

# UTC run stamp (YYYYMMDD_HHMMSS) embedded in the application name so that
# separate runs of this notebook can be told apart.
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# Obtain the Spark session for this notebook (an already-active session is
# reused by getOrCreate, otherwise a new one is started).
spark = (
    SparkSession.builder
    .appName(f"Bronze-Purchase-{ts}")
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import current_timestamp, to_utc_timestamp, col
from pyspark.sql.types import DecimalType

In [3]:
# Sample purchase rows built in memory. Date and timestamp values are supplied
# as strings here and converted to their proper Spark types further down.
purchase_rows = [
    ("2023-01-20 22:00:00", "2023-01-20", 55, 15947, 5, "2023-01-20", "2023-01-20", 852852),
    ("2023-01-26 00:01:00", "2023-01-26", 56, 369798, 746520, "2023-01-25", None, 963963),
    ("2023-02-05 10:00:00", "2023-02-05", 55, 160001, 5, "2023-01-20", "2023-01-20", 852852),
    ("2023-02-26 03:00:00", "2023-02-26", 69, 160001, 18, "2023-01-26", "2023-02-28", 96967),
    ("2023-07-15 09:00:00", "2023-07-15", 55, 160001, 5, "2023-01-20", "2023-03-01", 852852),
]

# Single source of truth for the bronze layout: column name -> Spark SQL type.
# The key order matches the field order of the tuples in purchase_rows and is
# also the column order of the resulting DataFrame.
purchase_column_types = {
    "transaction_datetime": "timestamp",
    "transaction_date": "date",
    "purchase_id": "bigint",
    "buyer_id": "bigint",
    "prod_item_id": "bigint",
    "order_date": "date",
    "release_date": "date",
    "producer_id": "bigint",
}

df_purchase_bronze = (
    spark.createDataFrame(purchase_rows, schema=list(purchase_column_types))
    # Cast every column to its target type in one projection, keeping the
    # column order unchanged.
    .select(*[
        col(name).cast(sql_type).alias(name)
        for name, sql_type in purchase_column_types.items()
    ])
    # Ingestion timestamp. No to_utc_timestamp(..., "UTC") wrapper is needed:
    # converting from UTC to UTC is an identity, so current_timestamp() alone
    # yields the same value. How it is rendered by show() depends on the
    # Spark session time zone setting.
    .withColumn("ingestion_date", current_timestamp())
)


In [4]:
# Register the DataFrame as a temporary view so it can be queried with SQL.
df_purchase_bronze.createOrReplaceTempView("purchase_bronze")

# Print the rows of the view as a quick sanity check.
purchase_preview = spark.sql("SELECT * FROM purchase_bronze")
purchase_preview.show()

+--------------------+----------------+-----------+--------+------------+----------+------------+-----------+--------------------+
|transaction_datetime|transaction_date|purchase_id|buyer_id|prod_item_id|order_date|release_date|producer_id|      ingestion_date|
+--------------------+----------------+-----------+--------+------------+----------+------------+-----------+--------------------+
| 2023-01-20 22:00:00|      2023-01-20|         55|   15947|           5|2023-01-20|  2023-01-20|     852852|2026-01-14 23:08:...|
| 2023-01-26 00:01:00|      2023-01-26|         56|  369798|      746520|2023-01-25|        null|     963963|2026-01-14 23:08:...|
| 2023-02-05 10:00:00|      2023-02-05|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|2026-01-14 23:08:...|
| 2023-02-26 03:00:00|      2023-02-26|         69|  160001|          18|2023-01-26|  2023-02-28|      96967|2026-01-14 23:08:...|
| 2023-07-15 09:00:00|      2023-07-15|         55|  160001|           5|2023-01-20

In [5]:
# Persist the bronze data as Parquet, with one sub-directory per
# transaction_date value under the target path.
# coalesce(1) collapses the data to a single partition before writing, which
# typically yields one file per transaction_date directory.
# NOTE(review): "append" adds new files on every execution, so re-running this
# cell writes the same rows again — confirm that is intended.
purchase_writer = (
    df_purchase_bronze
    .coalesce(1)
    .write
    .format("parquet")
    .partitionBy("transaction_date")
    .mode("append")
)
purchase_writer.save("data_lake/bronze/purchase")