# 1. Prepare Datasets

Download the `ADE-Corpus-V2` dataset, build a small labelled evaluation sample from it, and store that sample as a JSON file in the `evals/testdata` directory.



In [1]:
import json
import os

import pandas as pd
from dotenv import load_dotenv

load_dotenv()


True

Read the three dataset configurations (classification, drug–ADE relation, and drug–dosage relation) from the ADE-Corpus-V2 Hugging Face repository: https://huggingface.co/datasets/ade-benchmark-corpus/ade_corpus_v2/tree/main

In [2]:
# Classification split: label 1 = adverse drug event (ADE), label 0 = no ADE.
ade_classification_dataset = pd.read_parquet(
    path="hf://datasets/ade-benchmark-corpus/ade_corpus_v2/Ade_corpus_v2_classification/train-00000-of-00001.parquet"
)

# Relation split linking each drug to the adverse effect it caused.
ade_relation_drug_ade_dataset = pd.read_parquet(
    path="hf://datasets/ade-benchmark-corpus/ade_corpus_v2/Ade_corpus_v2_drug_ade_relation/train-00000-of-00001.parquet"
)

# Relation split linking each drug to its dosage.
ade_relation_drug_dosage_dataset = pd.read_parquet(
    path="hf://datasets/ade-benchmark-corpus/ade_corpus_v2/Ade_corpus_v2_drug_dosage_relation/train-00000-of-00001.parquet"
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Preview the classification dataset (`text` and `label` columns).
ade_classification_dataset

Unnamed: 0,text,label
0,Intravenous azithromycin-induced ototoxicity.,1
1,"Immobilization, while Paget's bone disease was...",1
2,Unaccountable severe hypercalcemia in a patien...,1
3,METHODS: We report two cases of pseudoporphyri...,1
4,METHODS: We report two cases of pseudoporphyri...,1
...,...,...
23511,"At autopsy, the liver was found to be small, s...",0
23512,"Physical exam revealed a patient with aphasia,...",0
23513,At the time when the leukemia appeared seven o...,0
23514,The American Society for Regional Anesthesia a...,0


In [4]:
# Preview the drug-ADE relation dataset (`text`, `drug`, `effect` and their character-offset index columns).
ade_relation_drug_ade_dataset

Unnamed: 0,text,drug,effect,indexes.drug.start_char,indexes.drug.end_char,indexes.effect.start_char,indexes.effect.end_char
0,Intravenous azithromycin-induced ototoxicity.,azithromycin,ototoxicity,[12],[24],[33],[44]
1,"Immobilization, while Paget's bone disease was...",dihydrotachysterol,increased calcium-release,[91],[109],[143],[168]
2,Unaccountable severe hypercalcemia in a patien...,dihydrotachysterol,hypercalcemia,[84],[102],[21],[34]
3,METHODS: We report two cases of pseudoporphyri...,naproxen,pseudoporphyria,[58],[66],[32],[47]
4,METHODS: We report two cases of pseudoporphyri...,oxaprozin,pseudoporphyria,[71],[80],[32],[47]
...,...,...,...,...,...,...,...
6816,Lithium treatment was terminated in 1975 becau...,Lithium,lithium intoxication,[0],[7],[52],[72]
6817,Lithium treatment was terminated in 1975 becau...,lithium,lithium intoxication,[52],[59],[52],[72]
6818,Eosinophilia caused by clozapine was observed ...,clozapine,Eosinophilia,[23],[32],[0],[12]
6819,Eosinophilia has been encountered from 0.2 to ...,clozapine,Eosinophilia,[55],[64],[0],[12]


In [5]:
# Preview the drug-dosage relation dataset (`text`, `drug`, `dosage` and their character-offset index columns).
ade_relation_drug_dosage_dataset

Unnamed: 0,text,drug,dosage,indexes.drug.start_char,indexes.drug.end_char,indexes.dosage.start_char,indexes.dosage.end_char
0,An episode of subacute encephalopathy after th...,methotrexate,1500 mg/m2,[79],[91],[93],[103]
1,She continued to receive regular insulin 4 tim...,insulin,4 times per day,[33],[40],[41],[56]
2,A 5-month-old infant became lethargic and poor...,brimonidine,1 drop,[86],[97],[76],[82]
3,The presented patient was treated with 200 mg ...,TCA,200 mg,"[46, 163]","[49, 166]",[39],[45]
4,Central nervous system manifestations of an ib...,ibuprofen,overdose,[44],[53],[54],[62]
...,...,...,...,...,...,...,...
274,1. Changes in the plasma cortisol level were r...,alprazolam,low,[122],[132],[113],[116]
275,We report on three patients with acute schizop...,olanzapine,20-25 mg/d,[107],[117],[119],[129]
276,A 65-year-old woman with angina pectoris prese...,isosorbide dinitrate,5 mg,[94],[114],[116],[120]
277,In a postural challenge test after administrat...,isosorbide dinitrate,5 mg,[53],[73],[75],[79]


In [6]:
# Build `ade_combined_dataset`: one row per unique `text` with the columns
# `text`, `label`, `drug`, `effect`, `dosage`.
#   - `text`, `label`  come from `ade_classification_dataset`
#   - `drug`, `effect` come from `ade_relation_drug_ade_dataset`
#   - `dosage`         comes from `ade_relation_drug_dosage_dataset`
#
# Every source is reduced to its first row per `text` *before* joining. The result is the
# same as joining first and de-duplicating afterwards (first match wins), but it avoids the
# many-to-many row multiplication caused by repeated `text` values, and `validate` makes the
# join fail loudly if the one-row-per-text invariant is ever broken.
#
# NOTE(review): `drug`/`effect` and `dosage` are matched on `text` only, so for a text that
# mentions several drugs the kept `dosage` may belong to a different drug than the kept
# `drug` - confirm this is acceptable for the golden truth.
ade_classification_unique = ade_classification_dataset[["text", "label"]].drop_duplicates(
    subset=["text"]
)
ade_drug_effect_unique = ade_relation_drug_ade_dataset[["text", "drug", "effect"]].drop_duplicates(
    subset=["text"]
)
ade_dosage_unique = ade_relation_drug_dosage_dataset[["text", "dosage"]].drop_duplicates(
    subset=["text"]
)

# Left joins keep every classified text; texts without a relation get NaN in the joined columns.
ade_combined_dataset = (
    ade_classification_unique
    .merge(ade_drug_effect_unique, on="text", how="left", validate="one_to_one")
    .merge(ade_dosage_unique, on="text", how="left", validate="one_to_one")
    .reset_index(drop=True)
)

# preview the combined dataset
ade_combined_dataset

Unnamed: 0,text,label,drug,effect,dosage
0,Intravenous azithromycin-induced ototoxicity.,1,azithromycin,ototoxicity,
1,"Immobilization, while Paget's bone disease was...",1,dihydrotachysterol,increased calcium-release,
2,Unaccountable severe hypercalcemia in a patien...,1,dihydrotachysterol,hypercalcemia,
3,METHODS: We report two cases of pseudoporphyri...,1,naproxen,pseudoporphyria,
4,"Naproxen, the most common offender, has been a...",1,Naproxen,erythropoietic protoporphyria,
...,...,...,...,...,...
20891,"At autopsy, the liver was found to be small, s...",0,,,
20892,"Physical exam revealed a patient with aphasia,...",0,,,
20893,At the time when the leukemia appeared seven o...,0,,,
20894,The American Society for Regional Anesthesia a...,0,,,


In [7]:
# Sampling parameters, named once instead of being repeated as magic numbers.
# NOTE(review): the output file is called `ade-v2-300.json`, yet only
# 2 * SAMPLES_PER_CLASS = 20 rows are drawn here - confirm the intended sample size.
SAMPLES_PER_CLASS = 10
RANDOM_SEED = 42

# ADE-present subset: `label` == 1 and a non-null `dosage`, so every positive example
# carries a dosage value for the golden truth.
ade_combined_dataset_ade_present = ade_combined_dataset[
    (ade_combined_dataset["label"] == 1) & ade_combined_dataset["dosage"].notnull()
].reset_index(drop=True)

# ADE-absent subset: `label` == 0.
ade_combined_dataset_ade_absent = ade_combined_dataset[
    ade_combined_dataset["label"] == 0
].reset_index(drop=True)

# Draw SAMPLES_PER_CLASS rows from each subset, concatenate them, then shuffle the result.
# `sample` raises ValueError if a subset holds fewer than SAMPLES_PER_CLASS rows.
ade_combined_dataset_final = (
    pd.concat(
        [
            ade_combined_dataset_ade_present.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED),
            ade_combined_dataset_ade_absent.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED),
        ],
        ignore_index=True,
    )
    .sample(frac=1, random_state=RANDOM_SEED)  # frac=1 returns every row in shuffled order
    .reset_index(drop=True)
)

# preview the final combined dataset
ade_combined_dataset_final

Unnamed: 0,text,label,drug,effect,dosage
0,The intramuscular challenge test with 25 UI of...,1,Miacalcic,anaphylactic reaction,25 UI
1,Vitamin K2 was infused along with discontinuat...,0,,,
2,"Signs of ischemia, including mottling of skin ...",0,,,
3,A 17-year-old female patient who had been taki...,1,minocycline,eruption,50 mg twice daily
4,Acute myocardial infarction during high-dose m...,1,methylprednisolone,Acute myocardial infarction,high-dose
5,Life-threatening acute hyponatraemia induced b...,1,cyclophosphamide,acute hyponatraemia,low dose
6,Full recovery occurred because of early pick u...,0,,,
7,Growth and adrenal suppression in asthmatic ch...,1,fluticasone propionate,Growth and adrenal suppression,high
8,"To our knowledge, this is the first clear repo...",0,,,
9,Several hypotheses are advanced to explain the...,0,,,


### Save the datasets

Save the final sampled dataset to the file [../evals/testdata/ade-v2-300.json](../evals/testdata/ade-v2-300.json).

In [14]:
# Convert every row of `ade_combined_dataset_final` into a JSON record of the form:
# {
#     "text": "<text>",
#     "golden-truth": "<POSITIVE|NEGATIVE>|<DRUG>|<DOSAGE>|<EFFECT>"
# }
# The label is POSITIVE when `label` == 1 and NEGATIVE otherwise. Missing `drug`, `dosage`
# or `effect` values become empty strings. The whole golden-truth string is upper-cased,
# so the drug, dosage and effect values are upper-cased as well.
ade_combined_dataset_final_json = []

for _, row in ade_combined_dataset_final.iterrows():
    label = "positive" if row["label"] == 1 else "negative"
    # NaN marks a value that was absent in the source relation datasets.
    drug, dosage, effect = (
        row[column] if pd.notna(row[column]) else ""
        for column in ("drug", "dosage", "effect")
    )

    golden_truth = f"{label}|{drug}|{dosage}|{effect}"

    ade_combined_dataset_final_json.append(
        {"text": row["text"], "golden-truth": golden_truth.upper()}
    )

# save the JSON file in <project_root>/evals/testdata/ade-v2-300.json
# The path is resolved relative to the current working directory.
path_to_data = os.path.join(os.getcwd(), "../evals/testdata")
os.makedirs(path_to_data, exist_ok=True)
# Explicit encoding keeps the file identical regardless of the platform's default locale.
with open(os.path.join(path_to_data, "ade-v2-300.json"), "w", encoding="utf-8") as f:
    json.dump(ade_combined_dataset_final_json, f, indent=4)
print("JSON file created: evals/testdata/ade-v2-300.json")

JSON file created: evals/testdata/ade-v2-300.json
