# Fake Data Pipeline (Bronze → Silver → Gold)

Generates fake data, writes bronze/silver/gold tables to Unity Catalog, and stores run metrics.

In [None]:
# Widgets (passed via job base_parameters)
# Each entry is (widget name, default value, display label).
_WIDGET_SPECS = (
    ("catalog", "andrea_tardif", "Catalog"),
    ("job_id", "", "Job ID"),
    ("run_id", "", "Run ID"),
    ("start_time", "", "Start time (ms)"),
)
for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, _widget_label)

In [None]:
import time
from datetime import datetime

from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, FloatType, TimestampType
)

# Shared handles used by the cells below.
spark = SparkSession.builder.getOrCreate()
fake = Faker()

# Number of synthetic orders generated for the bronze layer.
NUM_RECORDS = 10000

# Job parameters, read back from the notebook widgets.
CATALOG, JOB_ID, RUN_ID = (
    dbutils.widgets.get(widget_key)
    for widget_key in ("catalog", "job_id", "run_id")
)

# Start of this run as a naive UTC datetime.
RUN_START = datetime.utcnow()

In [None]:
# Make sure the bronze, silver and gold schemas exist in the target catalog.
for layer_name in ("bronze", "silver", "gold"):
    qualified_schema = f"{CATALOG}.{layer_name}"
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {qualified_schema}")
    print(f"[INFO] Schema {qualified_schema} is ready.")

In [None]:
# BRONZE — raw fake customer orders
t0 = time.time()

# Value pools sampled for the two categorical columns.
_ORDER_CATEGORIES = ["electronics", "clothing", "food", "books", "sports"]
_ORDER_STATUSES = ["completed", "pending", "cancelled", "refunded"]


def _fake_order():
    """Return one synthetic order as a tuple in bronze column order."""
    return (
        fake.uuid4(),
        fake.name(),
        fake.email(),
        fake.state(),
        float(round(fake.pyfloat(min_value=5, max_value=500, right_digits=2), 2)),
        fake.random_element(_ORDER_CATEGORIES),
        fake.date_time_between(start_date="-90d", end_date="now"),
        fake.random_element(_ORDER_STATUSES),
    )


records = [_fake_order() for _ in range(NUM_RECORDS)]

# (column name, Spark type) pairs; every bronze column is declared non-nullable.
_bronze_columns = [
    ("order_id", StringType()),
    ("customer_name", StringType()),
    ("email", StringType()),
    ("state", StringType()),
    ("order_amount", FloatType()),
    ("category", StringType()),
    ("order_ts", TimestampType()),
    ("status", StringType()),
]
schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _bronze_columns]
)

# Stamp each row with the ingestion time, then overwrite the bronze table.
bronze_table = f"{CATALOG}.bronze.orders"
bronze_df = spark.createDataFrame(records, schema=schema)
bronze_df = bronze_df.withColumn("_ingested_at", F.current_timestamp())

bronze_writer = bronze_df.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable(bronze_table)

bronze_rows = bronze_df.count()
bronze_time = time.time() - t0
print(f"[BRONZE] {bronze_rows:,} rows written in {bronze_time:.1f}s")

In [None]:
# SILVER — cleaned, deduplicated
t0 = time.time()

silver_table = f"{CATALOG}.silver.orders_clean"

# Lazy transformation over the bronze table; nothing executes until the write.
silver_transform = (
    spark.table(f"{CATALOG}.bronze.orders")
    # Keep a single row per order_id.
    .dropDuplicates(["order_id"])
    # Cancelled orders are excluded from the silver layer.
    .filter(F.col("status") != "cancelled")
    # Amount rounded to two decimals, plus the calendar date of the order.
    .withColumn("order_amount_usd", F.round(F.col("order_amount"), 2))
    .withColumn("order_date", F.to_date("order_ts"))
    # Replace the bronze ingestion stamp with a silver processing stamp.
    .drop("_ingested_at")
    .withColumn("_processed_at", F.current_timestamp())
)
silver_transform.write.format("delta").mode("overwrite").saveAsTable(silver_table)

# Point silver_df at the persisted table instead of the lazy plan. Counting the
# lazy plan would re-run the bronze read, dedupe and filter a second time, and
# the number reported would not be read from what was actually written.
silver_df = spark.table(silver_table)
silver_rows = silver_df.count()
silver_time = time.time() - t0
print(f"[SILVER] {silver_rows:,} rows written in {silver_time:.1f}s")

In [None]:
# GOLD — aggregated revenue
t0 = time.time()

gold_table = f"{CATALOG}.gold.revenue_summary"

# Lazy aggregation: one row per (category, state, order_date) with the order
# count, summed revenue and average order value.
gold_transform = (
    silver_df
    .groupBy("category", "state", "order_date")
    .agg(
        F.count("order_id").alias("num_orders"),
        F.sum("order_amount_usd").alias("total_revenue"),
        F.avg("order_amount_usd").alias("avg_order_value"),
    )
    .withColumn("_aggregated_at", F.current_timestamp())
)
gold_transform.write.format("delta").mode("overwrite").saveAsTable(gold_table)

# Point gold_df at the persisted table instead of the lazy plan. Counting the
# lazy plan would re-run the whole aggregation (and its upstream plan) a second
# time, and the number reported would not be read from what was actually written.
gold_df = spark.table(gold_table)
gold_rows = gold_df.count()
gold_time = time.time() - t0
print(f"[GOLD] {gold_rows:,} rows written in {gold_time:.1f}s")

In [None]:
# Persist run metrics
total_duration = (datetime.utcnow() - RUN_START).total_seconds()

# (column name, Spark type) pairs; every metric column is declared non-nullable.
_metric_columns = [
    ("run_id", StringType()),
    ("job_id", StringType()),
    ("run_ts", TimestampType()),
    ("bronze_rows", IntegerType()),
    ("silver_rows", IntegerType()),
    ("gold_rows", IntegerType()),
    ("total_duration_sec", FloatType()),
    ("status", StringType()),
]
metrics_schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in _metric_columns]
)

# A single row describing this run.
metrics_data = [
    dict(
        run_id=RUN_ID,
        job_id=JOB_ID,
        run_ts=RUN_START,
        bronze_rows=bronze_rows,
        silver_rows=silver_rows,
        gold_rows=gold_rows,
        total_duration_sec=total_duration,
        status="success",
    )
]

# Append the row to the metrics table, allowing schema merging on write.
metrics_table = f"{CATALOG}.gold.pipeline_run_metrics"
metrics_df = spark.createDataFrame(metrics_data, schema=metrics_schema)
(
    metrics_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(metrics_table)
)

print(f"[METRICS] Run {RUN_ID} recorded. Total duration: {total_duration:.1f}s")

# Task value is set in the next cell for job condition_task (success/failure branching).
# Do not exit here so that cell runs.

In [None]:
# Status string published as a job task value; this cell always reports "success".
message = "success"

# Set the message to be used by other tasks
# The value is stored under the task-value key "job_status".
dbutils.jobs.taskValues.set(key="job_status", value=message)