ProcessSilverFilesToGoldLayer: moves data from gold/processing to gold in a timestamped, partitioned way.

In [18]:
# Imports
from datetime import datetime, timezone

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

# Obtain the Spark session used by every later cell; getOrCreate() returns the
# already-active session when there is one instead of building a second one.
spark = SparkSession.builder.getOrCreate()

# Data lake locations. Both folders live in the same ADLS container, so they
# are derived from one shared root to keep the account/container in one place.
_lake_root = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/"

# Destination root: each run writes into its own sub-folder beneath this.
gold_output_base = _lake_root + "gold/"
# Staging folder whose parquet files are read as this notebook's input.
gold_processing_base = gold_output_base + "processing/"


StatementMeta(openfda, 1, 19, Finished, Available, Finished)

In [19]:
# Schema for the silver/gold data: every column is read as a nullable string.
# Column order below is the field order of the resulting StructType.
_column_names = [
    "safetyreportid",
    "receivedate",
    "serious",
    "medicinalproduct",
    "drugcharacterization",
    "reactionmeddrapt",
    "reactionoutcome",
]

schema = StructType(
    [StructField(column, StringType(), True) for column in _column_names]
)

StatementMeta(openfda, 1, 20, Finished, Available, Finished)

In [20]:
# Load every parquet file found anywhere beneath the processing folder,
# applying the explicit schema rather than inferring one from the files.
processing_reader = spark.read.schema(schema).option("recursiveFileLookup", "true")
df = processing_reader.parquet(gold_processing_base)

# count() is a Spark action: it triggers the actual read of the input files.
rows_read = df.count()
print(f"✅ Read {rows_read} rows from gold/processing")


StatementMeta(openfda, 1, 21, Finished, Available, Finished)

✅ Read 224 rows from gold/processing


In [21]:
# Generate timestamped output folder (UTC, second resolution).
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive
# datetime; datetime.now(timezone.utc) yields the same wall-clock value as a
# timezone-aware object, so the formatted folder name is unchanged.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")

# Each run writes to its own folder: <gold_output_base><YYYY-MM-DD-HH-MM-SS>/
gold_output_path = f"{gold_output_base}{timestamp}/"

StatementMeta(openfda, 1, 22, Finished, Available, Finished)

In [22]:
# Write to Gold layer in parquet. The target folder is unique per run
# (timestamped), so "overwrite" only ever replaces a same-second rerun.
df.write.mode("overwrite").parquet(gold_output_path)

# Report the number of rows that actually landed in the output folder.
# Calling df.count() here would re-scan gold/processing after the write, which
# reports the source's current row count rather than what was written.
rows_written = spark.read.parquet(gold_output_path).count()
print(f"✅ Wrote {rows_written} rows to {gold_output_path}")

StatementMeta(openfda, 1, 23, Finished, Available, Finished)

✅ Wrote 224 rows to abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/2025-08-28-11-21-40/


# Release Spark pool

In [None]:
# Stop the current notebook session through the platform-provided
# mssparkutils helper (not imported in this notebook; supplied by the runtime).
# NOTE(review): presumably this is what frees the Spark pool once the run is
# done — confirm against the workspace's session settings.
mssparkutils.session.stop()