✅ Notebook Highlights

1. Loads curated RWE Gold data from gold/curated/rwe/.

2. Aggregates per (drug_name, reaction): ae_count, serious_count, serious_ratio

3. Saves ML-ready Parquet dataset to gold/ml_ready/.

4. Performs categorical encoding (drug_name + reaction).

5. Trains a RandomForest classifier to predict serious AE.

6. Evaluates with AUC.

7. Saves trained model to gold/models/.

In [53]:
# -------------------------------
# Synapse ML Notebook: Train AE Seriousness Model
# -------------------------------

# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count as _count, sum as _sum
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from datetime import datetime

spark = SparkSession.builder.getOrCreate()

StatementMeta(openfda, 3, 29, Finished, Available, Finished)

In [54]:
# -------------------------------
# Paths
# -------------------------------
gold_curated_rwe = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/curated/rwe/"
gold_ml_ready_base = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/ml_ready/"
gold_models_base = "abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/models/"


StatementMeta(openfda, 3, 30, Finished, Available, Finished)

In [55]:
# -------------------------------
# Load curated RWE dataset
# -------------------------------
df_curated = spark.read.option("recursiveFileLookup", "true").parquet(gold_curated_rwe)


StatementMeta(openfda, 3, 31, Finished, Available, Finished)

In [56]:
# -------------------------------
# Aggregate for ML
# -------------------------------
df_ml_ready = df_curated.groupBy("drug_name", "reaction") \
    .agg(
        _count("*").alias("ae_count"),
        _sum("serious").alias("serious_count")
    ) \
    .withColumn("serious_ratio", col("serious_count") / col("ae_count"))

# Save ML-ready dataset
timestamp = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
ml_ready_path = f"{gold_ml_ready_base}{timestamp}/"
df_ml_ready.write.mode("overwrite").parquet(ml_ready_path)
print(f"✅ ML-ready dataset saved to {ml_ready_path}")


StatementMeta(openfda, 3, 32, Finished, Available, Finished)

✅ ML-ready dataset saved to abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/ml_ready/2025-08-28-12-12-31/


In [57]:
# -------------------------------
# Feature encoding
# -------------------------------
# Binarize label
df_ml_ready = df_ml_ready.withColumn("label", when(col("serious_count") > 0, 1).otherwise(0))

# StringIndexers
drug_indexer = StringIndexer(inputCol="drug_name", outputCol="drug_idx")
reaction_indexer = StringIndexer(inputCol="reaction", outputCol="reaction_idx")
df_indexed = drug_indexer.fit(df_ml_ready).transform(df_ml_ready)
df_indexed = reaction_indexer.fit(df_indexed).transform(df_indexed)

# One-hot encoding
drug_encoder = OneHotEncoder(inputCol="drug_idx", outputCol="drug_vec")
reaction_encoder = OneHotEncoder(inputCol="reaction_idx", outputCol="reaction_vec")
df_encoded = drug_encoder.fit(df_indexed).transform(df_indexed)
df_encoded = reaction_encoder.fit(df_encoded).transform(df_encoded)

# Assemble features
assembler = VectorAssembler(
    inputCols=["drug_vec", "reaction_vec", "ae_count"], 
    outputCol="features"
)
df_final = assembler.transform(df_encoded).select("features", "label")
df_final.show(5)

StatementMeta(openfda, 3, 33, Finished, Available, Finished)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(75,[4,66,74],[1....|    1|
|(75,[15,54,74],[1...|    1|
|(75,[6,50,74],[1....|    1|
|(75,[15,50,74],[1...|    1|
|(75,[9,51,74],[1....|    1|
+--------------------+-----+
only showing top 5 rows



In [58]:
# -------------------------------
# Train/Test split
# -------------------------------
train_df, test_df = df_final.randomSplit([0.8, 0.2], seed=42)


StatementMeta(openfda, 3, 34, Finished, Available, Finished)

In [59]:
# -------------------------------
# Train RandomForest
# -------------------------------
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
pipeline = Pipeline(stages=[rf])
model = pipeline.fit(train_df)

StatementMeta(openfda, 3, 35, Finished, Available, Finished)

In [60]:
# -------------------------------
# Evaluate model
# -------------------------------
preds = model.transform(test_df)
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
auc = evaluator.evaluate(preds)
print(f"✅ Test AUC: {auc:.4f}")

StatementMeta(openfda, 3, 36, Finished, Available, Finished)

✅ Test AUC: 1.0000


In [61]:
# -------------------------------
# Save trained model to Gold
# -------------------------------
model_path = f"{gold_models_base}rf_ae_seriousness_{timestamp}/"
model.write().overwrite().save(model_path)
print(f"✅ Trained RandomForest model saved to {model_path}")

StatementMeta(openfda, 3, 37, Finished, Available, Finished)

✅ Trained RandomForest model saved to abfss://rwedatalakestorage@datalakerwe.dfs.core.windows.net/gold/models/rf_ae_seriousness_2025-08-28-12-12-31/


# Release Spark pool

In [None]:
mssparkutils.session.stop()