In [26]:
from datetime import datetime, timezone

from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import StructType, StructField, StringType

# Reuse the Spark session already active for this notebook; one is created only if none exists.
spark = SparkSession.builder.getOrCreate()

# Paths
# The storage account / container / layer prefix is spelled out exactly once;
# every location used by the later cells is derived from it, so pointing the
# notebook at another account or container is a one-line change.
gold_root = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/"
gold_processing_base = f"{gold_root}processing/"   # input: parquet files to curate
gold_curated_top     = f"{gold_root}curated/"      # output: top-level curated snapshots
gold_curated_rwe     = f"{gold_curated_top}rwe/"   # output: RWE curated snapshots


StatementMeta(openfda, 3, 2, Finished, Available, Finished)

In [27]:
# Define schema
# Every column is declared as a nullable string, so the field list is generated
# from the column names (order here is the column order of the schema).
string_columns = [
    "safetyreportid",
    "receivedate",
    "serious",
    "medicinalproduct",
    "drugcharacterization",
    "reactionmeddrapt",
    "reactionoutcome",
]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in string_columns]
)

StatementMeta(openfda, 3, 3, Finished, Available, Finished)

In [28]:
# Load every parquet file found at any depth under gold/processing,
# applying the explicit schema instead of the one stored in the files.
df = (
    spark.read
    .schema(schema)
    .option("recursiveFileLookup", "true")
    .parquet(gold_processing_base)
)
# count() is a Spark action: it scans the input just to report the row total.
print(f"✅ Read {df.count()} rows from gold/processing")


StatementMeta(openfda, 3, 4, Finished, Available, Finished)

✅ Read 224 rows from gold/processing


In [29]:
# Transform for OMOP-like table
# Pin a single UTC instant for the whole run.
#  - current_timestamp() is lazy and re-evaluated by every Spark action, so the
#    RWE write below and the later top-level write of df_curated would record
#    different load_timestamp values for the same load; a literal does not drift.
#  - datetime.now(timezone.utc) replaces the deprecated, naive datetime.utcnow();
#    the formatted folder name is unchanged.
run_started_at = datetime.now(timezone.utc)

# Add metadata columns
df_curated = df.withColumn("load_timestamp", lit(run_started_at))

# RWE-specific curated dataset: same rows as df_curated with three columns renamed
rwe_df = (
    df_curated
    .withColumnRenamed("medicinalproduct", "drug_name")
    .withColumnRenamed("reactionmeddrapt", "reaction")
    .withColumnRenamed("reactionoutcome", "outcome")
)

# `timestamp` names this run's output folder and is reused by the top-level write cell.
timestamp = run_started_at.strftime("%Y-%m-%d-%H-%M-%S")
rwe_output_path = f"{gold_curated_rwe}{timestamp}/"
# The folder is unique per second, so "overwrite" only matters on a same-second re-run.
rwe_df.write.mode("overwrite").parquet(rwe_output_path)
print(f"✅ Wrote RWE curated Gold data to {rwe_output_path}")

StatementMeta(openfda, 3, 5, Finished, Available, Finished)

✅ Wrote RWE curated Gold data to abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/curated/rwe/2025-08-28-11-44-06/


In [30]:
# Top-level curated snapshot: df_curated is stored with its original column names,
# under the same timestamp folder name as the RWE output.
# (This dataset could later aggregate or combine multiple sources.)
top_output_path = f"{gold_curated_top}{timestamp}/"
top_writer = df_curated.write.mode("overwrite")
top_writer.parquet(top_output_path)
print(f"✅ Wrote top-level curated Gold data to {top_output_path}")

StatementMeta(openfda, 3, 6, Finished, Available, Finished)

✅ Wrote top-level curated Gold data to abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/curated/2025-08-28-11-44-06/


# Release Spark pool

In [None]:
# Stop the notebook session once both writes are done.
# NOTE(review): mssparkutils is never imported in this notebook — presumably it is
# injected by the Synapse runtime; confirm before running elsewhere.
mssparkutils.session.stop()