In [None]:
from pytrials.client import ClinicalTrials
import pymongo
from pymongo import UpdateOne, InsertOne
import json
import pickle
import re
from openai import OpenAI
from config import api_key

In [None]:
#### Load the variables for reproducibility
with open("retrieved_studies.pkl", "rb") as f:
    retrieved_studies = pickle.load(f)

with open("mapped_data_all.pkl", "rb") as f:
    mapped_data_all = pickle.load(f)

## Connect to local MongoDB
create database and collection

In [25]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

In [26]:
db = client["ClinicalTrialsDB"]

In [27]:
final_collection = db["clinical_trial_collection"]

In [None]:
# specify the json schema for database validation.

schema = {
    "type": "object",
    "additionalProperties": True,
    "properties": {
        "trialId": {
            "type": "string",
            "description": "A unique identifier for the clinical trial.",
        },
        "title": {
            "type": "string",
            "description": "The official title of the clinical trial.",
        },
        "startDate": {
            "type": "string",
            "pattern": "^^\\d{4}-\\d{2}-\\d{2}|NA$",
            "description": "The start date of the clinical trial.",
        },
        "endDate": {
            "type": "string",
            "pattern": "^^\\d{4}-\\d{2}-\\d{2}|NA$",
            "description": "The end date of the clinical trial, if applicable.",
        },
        "phase": {
            "type": "string",
            "enum": ["Phase 1", "Phase 2", "Phase 3", "Phase 4", "Other"],
            "description": "The phase of the clinical trial.",
        },
        "principalInvestigator": {
            "type": "array",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "The name of the principal investigator.",
                },
                "affiliation": {
                    "type": "string",
                    "description": "The affiliation of the principal investigator.",
                },
            },
            "required": ["name"],
        },
        "locations": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "facility": {
                        "type": "string",
                        "description": "Name of the facility where the trial is conducted.",
                    },
                    "city": {
                        "type": "string",
                        "description": "The city where the facility is located.",
                    },
                    "country": {
                        "type": "string",
                        "description": "The country where the facility is located.",
                    },
                },
            },
        },
        "eligibilityCriteria": {
            "type": "string",
            "description": "A description of the eligibility criteria for the trial.",
        },
    },
    "required": ["trialId", "title", "phase"],
}

In [None]:
# Database validation.
db.create_collection(
    "clinical_trial_collection",
    validator={"$jsonSchema": schema},
    validationAction="warn",
)

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'ClinicalTrialsDB'), 'clinical_trial_collection')

In [6]:
client.list_database_names()

['ClinicalTrialsDB', 'TestDB', 'admin', 'config', 'local']

In [7]:
db.list_collection_names()

['clinical_trial_collection_updated', 'clinical_trial_collection']

## Retrive metadata via clinicaltrials.gov API

In [16]:
ct = ClinicalTrials()

In [None]:
retrieved_studies = ct.get_full_studies(
    search_expr="AREA[LastUpdatePostDate]RANGE[2024-10-20, 2024-10-21]",
    max_studies=1000,  # more than 1000?
    fmt="json",
)

In [6]:
# Check how many studies are retrieved
len(retrieved_studies["studies"])

619

In [None]:
def flatten(arg):
    if not isinstance(arg, list):  # if not list
        return [arg]
    return [x for sub in arg for x in flatten(sub)]

In [None]:
def get_nested_value(data, keys2):
    keys = keys2.split(".")
    for key in keys:
        if isinstance(data, dict):
            data = data.get(key)
        elif isinstance(data, list):
            valuelist = []
            for ele in data:
                valuelist.append(ele.get(key))
            data = flatten(valuelist)
    return data

##### Functions for data mapping

In [None]:
def get_date(study):
    start = get_nested_value(study, "protocolSection.statusModule.startDateStruct.date")

    end = get_nested_value(
        study, "protocolSection.statusModule.completionDateStruct.date"
    )

    def modify_date(date):
        if date is None:
            return "NA"
        if len(date) == 7:
            return date + "-01"
        else:
            return date

    return modify_date(start), modify_date(end)

In [None]:
def get_phase(study):
    phase = get_nested_value(study, "protocolSection.designModule.phases")

    if phase == None:
        return "Other"
    return next(
        (
            f"Phase {p[-1]}"
            for p in phase
            if p.upper() in ["PHASE1", "PHASE2", "PHASE3", "PHASE4"]
        ),
        "Other",
    )

In [None]:
def get_principal_investigator(study):
    overall_officials = get_nested_value(
        study, "protocolSection.contactsLocationsModule.overallOfficials"
    )
    if not overall_officials:  # Check if overall_officials is None or empty
        return []

    return [
        {
            "name": official.get("name", "NA"),
            "affiliation": official.get("affiliation", "NA"),
        }
        for official in overall_officials
        if official.get("role") == "PRINCIPAL_INVESTIGATOR"
    ]

In [None]:
def get_locations(study):
    locations = get_nested_value(
        study, "protocolSection.contactsLocationsModule.locations"
    )

    if not locations:  # Check if overall_officials is None or empty
        return []
    return [
        {
            "facility": loc.get("facility", ""),
            "city": loc.get("city", ""),
            "country": loc.get("country", ""),
        }
        for loc in locations
    ]

In [None]:
def transform_study(study):

    return {
        "trialId": get_nested_value(
            study, "protocolSection.identificationModule.nctId"
        ),
        "title": get_nested_value(
            study, "protocolSection.identificationModule.officialTitle"
        ),
        "startDate": get_date(study)[0],
        "endDate": get_date(study)[1],
        "phase": get_phase(study),
        "principalInvestigator": get_principal_investigator(study),
        "locations": get_locations(study),
        "eligibilityCriteria": get_nested_value(
            study, "protocolSection.eligibilityModule.eligibilityCriteria"
        ),
    }

In [None]:
def map_data(input_data):
    transformed_data = []
    for study in input_data["studies"]:
        transformed_data.append(transform_study(study))
    return transformed_data

In [9]:
# Transform the input data
mapped_data_all = map_data(retrieved_studies)

In [19]:
# Sanity check: if all studies are mapped
len(retrieved_studies["studies"]) == len(mapped_data_all)

True

In [None]:
#### Save the variables for reproducibility

# with open("retrieved_studies.pkl", "wb") as f:


#     pickle.dump(retrieved_studies, f)


# with open("mapped_data_all.pkl", "wb") as f:


#     pickle.dump(mapped_data_all, f)

In [8]:
# Function to insert or update data in MongoDB
def upsert_data_to_db(data, collection):
    requests = []

    # Fetch original documents before updating
    for entry in data:
        trial_id = entry["trialId"]
        original_document = collection.find_one({"trialId": trial_id})
        if original_document:
            requests.append(
                UpdateOne({"trialId": trial_id}, {"$set": entry})
            )  # upsert=False
            for key, value in entry.items():
                old_value = original_document.get(key)
                if old_value != value:  # Identify modified fields
                    print(
                        f"Trial ID: {trial_id}, Field: {key}, From: {old_value} To: {value}"
                    )
        else:
            requests.append(InsertOne(entry))

    if requests:
        result = collection.bulk_write(requests)  # Use bulk_write for batch processing
        if result.inserted_count > 0:
            print(f"Number of documents inserted: {result.inserted_count}")

        if result.modified_count > 0:
            print(f"Number of documents updated: {result.modified_count}")

    collection.create_index("trialId")

In [38]:
upsert_data_to_db(mapped_data_all, final_collection)

Number of documents inserted: 619


In [12]:
# Sanity check: if all studies are inserted
len(retrieved_studies["studies"]) == len(
    mapped_data_all
) == final_collection.count_documents({}) == 619

True

In [None]:
# Sanity check
mapped_data_all[10]

{'trialId': 'NCT06649591',
 'title': 'A Survey Based Study Assessing the Feasibility of Using Standardized Clinical Vignettes to Aid in Medical Decision in Patients with Malignant Brain Tumors',
 'startDate': '2017-07-01',
 'endDate': '2025-12-01',
 'phase': 'Other',
 'principalInvestigator': [{'name': 'Marie Roguski, MD MPH',
   'affiliation': 'Tufts Medical Center'}],
 'locations': [{'facility': 'Tufts Medical Center',
   'city': 'Boston',
   'country': 'United States'}],
 'eligibilityCriteria': 'Inclusion Criteria:\r\n\r\n  -  consecutive patients treated at a single center between April 2018 and July 2023 for\r\n     malignant brain tumors, including glioma, metastasis, and lymphoma.\r\n\r\nExclusion Criteria:\r\n\r\n  -  Patients without available MRI dicom images\r\n\r\n  -  Patients with other CNS malignancies\r\n\r\n  -  Patients with multiply recurrent gliomas undergoing treatment for primarily\r\n     palliative purposes\r\n\r\n  -  Patients younger than 18 years old',
 '_id'

In [None]:
# Sanity check
mapped_data_all[15]

{'trialId': 'NCT04431960',
 'title': 'Blackcurrant Modifies Gut Microbiota and Reduces the Risk of Postmenopausal Osteoporosis and Cardiovascular Disease: A Pilot Randomized Clinical Trial',
 'startDate': '2021-07-20',
 'endDate': '2022-10-03',
 'phase': 'Phase 1',
 'principalInvestigator': [{'name': 'Ock K Chun, PhD',
   'affiliation': 'University of Connecticut'}],
 'locations': [{'facility': 'University of Connecticut Department of Nutritional Sciences and Kinesiology Human Performance Laboratory',
   'city': 'Storrs',
   'country': 'United States'}],
 'eligibilityCriteria': 'Inclusion Criteria:\r\n\r\n  -  perimenopausal or early postmenopausal women aged 45-60 years old\r\n\r\n  -  not on HRT for at least one year before the initiation of the study\r\n\r\n  -  maintaining normal exercise level (<7 h/wk) and willing to avoid exercise 24-h prior\r\n     to blood and stool sampling and 12-h prior to bone measurements\r\n\r\n  -  willing to ingest a dietary BC supplement or placebo (u

Test if upsert_data_to_db suports data update:

In [25]:
updated_collection = db["clinical_trial_collection_updated"]
db.create_collection(
    "clinical_trial_collection_updated", validator={"$jsonSchema": schema}
)
# mapped_data_all = map_data(retrieved_studies)
upsert_data_to_db(mapped_data_all, updated_collection)

Number of documents inserted: 619


In [26]:
updated_studies = {"studies": []}
updated_trial = ["NCT06649721", "NCT05254002"]  # Randomly select trials to be updated


# Update trials with specified nctId values and store them in updated_studies
for study in retrieved_studies["studies"]:
    updated_ids = study["protocolSection"]["identificationModule"].get("nctId")
    if updated_ids in updated_trial:
        study_copy = copy.deepcopy(study)
        study_copy["protocolSection"]["statusModule"]["completionDateStruct"][
            "date"
        ] = "9999-99-99"  # update date
        study_copy["protocolSection"]["designModule"]["phases"] = [
            "PHASE9"
        ]  # update phase
        updated_studies["studies"].append(study_copy)


mapped_data_updated = map_data(updated_studies)
upsert_data_to_db(mapped_data_updated, updated_collection)

Number of documents updated: 2
Trial ID: NCT05254002, Field: endDate, From: 2025-11-30 To: 9999-99-99
Trial ID: NCT05254002, Field: phase, From: Phase 2 To: Other
Trial ID: NCT06649721, Field: endDate, From: 2027-06-30 To: 9999-99-99
Trial ID: NCT06649721, Field: phase, From: Phase 3 To: Other


#### Sanity checks for database

In [13]:
# Sanity check
p1 = final_collection.count_documents({"phase": "Phase 1"})


p2 = final_collection.count_documents({"phase": "Phase 2"})


p3 = final_collection.count_documents({"phase": "Phase 3"})


p4 = final_collection.count_documents({"phase": "Phase 4"})


o = final_collection.count_documents({"phase": "Other"})


print(p1, p2, p3, p4, o)


print(p1 + p2 + p3 + p4 + o)

90 96 61 23 349
619


In [None]:
for trial in final_collection.find({"title": {"$regex": "Diabete", "$options": "i"}}):
    pprint.pprint(trial.get("trialId"))


final_collection.count_documents({"title": {"$regex": "Diabete", "$options": "i"}})

'NCT06649773'
'NCT04745572'
'NCT06650969'
'NCT05910840'
'NCT05743244'
'NCT05963022'
'NCT04413357'
'NCT05026424'
'NCT03835312'
'NCT02806700'
'NCT04914559'
'NCT05254002'
'NCT06650007'
'NCT06579105'


14

In [None]:
for trial in final_collection.find({"title": {"$regex": "HER2", "$options": "i"}}):
    pprint.pprint(trial.get("trialId"))


final_collection.count_documents({"title": {"$regex": "HER2", "$options": "i"}})

'NCT05945927'
'NCT04513665'
'NCT06065748'
'NCT03901339'
'NCT03280563'
'NCT03587740'
'NCT05425550'
'NCT02007512'
'NCT06364410'
'NCT06650332'
'NCT02297438'


11

In [None]:
final_collection.count_documents(
    {"startDate": {"$gte": "2020-01-01", "$lte": "2024-12-31"}}
)

455

In [None]:
final_collection.count_documents(
    {
        "locations": {
            "$elemMatch": {
                "facility": {"$regex": "Orange County Research Center", "$options": "i"}
            }
        }
    }
)

3

In [None]:
final_collection.count_documents(
    {"principalInvestigator": {"$elemMatch": {"name": "Marie Roguski, MD MPH"}}}
)

1

In [None]:
# Get a distinct list of all principal investigator names
distinct_principal_investigators = final_collection.distinct(
    "principalInvestigator.name"
)

print("Distinct Principal Investigators:")
for name in distinct_principal_investigators:
    print(name)

Distinct Principal Investigators:
Aaron Folsom, MD, MPH
Aaron Shafer, MD
Abdusalom Abdurakhmanov, MD
Abraham Wu, MD
Adeyemi Ogunleye, MD
Afton Hassett
Ajay Premkumar, MD
Alain SAAD
Alain SAAD, MD
Alaine E Reschke-Hernandez, PhD
Alaine E. Reschke-Hernandez, PhD
Alethea Desrosiers
Alexis MOSCA, MD
Alice L Yu
Allison Gibson, PhD
Amanda Rao, PhD
Ana Oaknin, MD
Anders Thorell, Professor
Andreas M Fritzen, Associate Professor
Andreas Peyrl, MD
Andreas Trojan, Prof. Dr. med.
Andrew Kneebone
Ann Smeets, MD,PhD
Anna De Simoni, PhD
Anna GEMAHLING, MD
Anna H. Grummon, PhD
Anna Maria Hibbs, MD, MSCE
Anne-Sophie Brazeau, PhD
Annette DeVito Dabbs, PhD
Anxo Fernandez-Ferreiro, PhD
Audrey Harkness, PhD
Aung Naing, Md
Boris Bogov, Prof, PhD
Bouthaina S Dabaja
Brian Helfand, M.D.
Brian Vickery, MD
CORALIE NOEL
Carla S Stover, Ph.D.
Carlos A Zarate, M.D.
Carlos Chaccour
Carolyn Bramante, MD
Carrie L Pistenmaa, MD, MS
Cavan Reilly, PhD
Channing Paller, MD
Chao-Chi Ho, MD, PhD
Chaohui Yu, Doctor
Chaosu Hu


In [None]:
# Aggregation pipeline to count occurrences of each principal investigator name
pipeline = [
    {"$unwind": "$principalInvestigator"},
    {"$group": {"_id": "$principalInvestigator.name", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},  # Sort by count in descending order
]

# Run the aggregation pipeline
results = final_collection.aggregate(pipeline)

# Print the results
print("Number of documents for each Principal Investigator:")
for result in results:
    name = result["_id"]
    count = result["count"]
    print(f"{name}: {count}")

Number of documents for each Principal Investigator:
Hrishikesh Chakraborty: 3
Chaosu Hu: 2
Dongmei Ji: 2
Maryam Fouladi, MD: 2
William Gleason, OD: 2
Xiaojing Hu, PhD: 2
Shounak Majumder, M.D.: 2
Brian Helfand, M.D.: 1
Lily A Brown, PhD: 1
Daniel Jones, MBBS, PhD: 1
Prof Andrew Prendergast, DPhil MRCPCH: 1
Danijela Trifunovic-Zamklar, MD, PhD: 1
Deepa G Manwani, M.D: 1
Channing Paller, MD: 1
Imran K Niazi, MD: 1
Hamiyet Kızıl, Phd RN: 1
Vikky Makker, MD: 1
Vladimir Hodzhev, Prof, PhD: 1
Shivaani Kummar, MD: 1
Thorsten Kahnt, Ph.D.: 1
Ann Smeets, MD,PhD: 1
Kenneth Offit, MD, MPH: 1
Marleen Kok, MD: 1
Frank A Bucci, Jr., MD: 1
Kevin Knight, PhD: 1
Lars O Cardell, Professor: 1
Georgi Momekov, Prof PhD: 1
Terence T. Sio, MD, MS: 1
Hirva Mamdani, MD: 1
Drew Sayer, PhD: 1
Lakshmi Koyyalagunta: 1
Craig Crandall, PhD: 1
James Yu: 1
Paola Azucena Alvarado Pelayo, Bachelor: 1
Li Huo, MD: 1
Kristie A Blum: 1
Ruth Blanco Rojo, PhD: 1
Megan Gunnar, PhD: 1
Mohamed Hamdy, Lecturer: 1
Ricardo Espinoz

In [None]:
final_collection.count_documents(
    {
        "locations": {
            "$elemMatch": {
                "facility": {"$regex": "Orange County Research Center", "$options": "i"}
            }
        }
    }
)

3

In [None]:
final_collection.count_documents(
    {"locations": {"$elemMatch": {"facility": {"$regex": "Orange", "$options": "i"}}}}
)

13

In [None]:
for trial in final_collection.find(
    {"locations": {"$elemMatch": {"facility": {"$regex": "Orange", "$options": "i"}}}}
):
    for location in trial.get("locations", []):
        pprint.pprint(location["facility"])

"Children's Hospital of Alabama"
'University of Alabama at Birmingham Cancer Center'
'USA Health Strada Patient Care Center'
'Phoenix Childrens Hospital'
'Banner University Medical Center - Tucson'
"Arkansas Children's Hospital"
'University of Arkansas for Medical Sciences'
'Kaiser Permanente Downey Medical Center'
'Loma Linda University Medical Center'
"Miller Children's and Women's Hospital Long Beach"
"Children's Hospital Los Angeles"
'Cedars Sinai Medical Center'
"Valley Children's Hospital"
"UCSF Benioff Children's Hospital Oakland"
'Kaiser Permanente-Oakland'
"Children's Hospital of Orange County"
"Lucile Packard Children's Hospital Stanford University"
'Sutter Medical Center Sacramento'
'University of California Davis Comprehensive Cancer Center'
"Rady Children's Hospital - San Diego"
'Naval Medical Center -San Diego'
'UCSF Medical Center-Parnassus'
'UCSF Medical Center-Mission Bay'
'Santa Barbara Cottage Hospital'
"Children's Hospital Colorado"
"Rocky Mountain Hospital for Chil

In [None]:
# Find documents where a principal investigator name contains "wu" (case-insensitive)
for trial in final_collection.find(
    {
        "principalInvestigator": {
            "$elemMatch": {"name": {"$regex": "wu", "$options": "i"}}
        }
    }
):
    # Loop through each principal investigator in the trial
    for investigator in trial.get("principalInvestigator", []):
        # Loop through each location in the trial and print investigator name with facility
        for location in trial.get("locations", []):
            pprint.pprint(
                (investigator["name"], location.get("facility", "No Facility"))
            )

('Ivan Wu', 'University of Minnesota')
('Abraham Wu, MD', 'Memorial Sloan Kettering Cancer Center')


In [57]:
final_collection.count_documents({"endDate": {"$lt": "2024-11-11"}})

179

In [56]:
final_collection.count_documents({"endDate": {"$gte": "2024-11-11"}})

440

In [None]:
number = final_collection.count_documents({"endDate": "NA"})

print(number)

for trial in final_collection.find({"endDate": "NA"}):
    pprint.pprint(trial["trialId"])

4
'NCT00233272'
'NCT06649669'
'NCT00342888'
'NCT06597747'


In [55]:
final_collection.find_one({"trialId": "NCT06649669"})

{'_id': ObjectId('673285464dd83486d5a4d3f5'),
 'trialId': 'NCT06649669',
 'title': '[Trial of device that is not approved or cleared by the U.S. FDA]',
 'startDate': 'NA',
 'endDate': 'NA',
 'phase': 'Other',
 'principalInvestigator': [],
 'locations': [],
 'eligibilityCriteria': ''}

In [51]:
for trial in final_collection.find(
    {"title": "[Trial of device that is not approved or cleared by the U.S. FDA]"}
):
    # Print the trialId directly
    pprint.pprint(trial.get("trialId"))

'NCT06649669'
'NCT06597747'


In [None]:
for trial in final_collection.find(
    {"trialId": {"$in": ["NCT06561048", "NCT06399289", "NCT06415344"]}}
):

    print("trialId:", trial.get("trialId"))

    print("eligibilityCriteria:", trial.get("eligibilityCriteria"))

    print("\n")

trialId: NCT06561048
eligibilityCriteria: Inclusion Criteria:

  1. Adult participants ≥18 years of age on the day of signing the informed consent form.

  2. Eastern Cooperative Oncology Group (ECOG) Performance Status of 0 to 2.

  3. Histologically confirmed PTCL-NOS, FHTCLs or sALCL per The International Consensus
     Classification of Mature Lymphoid Neoplasms.

  4. Progressed on, be refractory to, relapsed, or intolerant to standard therapy for
     their cancer. At least 1 but not more than 3 prior systemic therapies.

  5. Fluorodeoxyglucose-avid disease by positron emission tomography and measurable
     disease of at least 1.5 cm by computed tomography, as assessed by the site
     radiologist.

  6. Life expectancy >12 weeks.

  7. Adequate organ function as determined by:

       -  Absolute neutrophil count ≥ 1.0×10^9/L (1000/mm3) (without receiving
          granulocyte-colony stimulating factor)

       -  Platelet count ≥ 50×10^9/L (without transfusion)

       -  Hem

## Extract info from the eligibilityCriteria field and store them in MongoDB

In [9]:
# import openai
from openai import OpenAI

In [10]:
from config import api_key

In [11]:
client = OpenAI(api_key=api_key)

In [None]:
# Minimize API Calls by Using Local Processing First
def preprocess_text(eligibility_text):
    # Extract the "Inclusion Criteria" section using regex or keyword-based filtering
    match = re.search(
        r"Inclusion Criteria:.*?(Exclusion Criteria:|$)",
        eligibility_text,
        re.DOTALL | re.IGNORECASE,
    )
    if match:
        return match.group(0)
    return eligibility_text

In [13]:
def LLM_model(eligibility_text):
    # Construct a prompt to extract diseases/conditions only from inclusion criteria
    prompt = f"""
    Extract all diseases, conditions, or criteria related to medical treatments or exposures.

    Important:
    - Include diseases, conditions, demographic criteria (e.g., gender, age, ethnicity), and medical treatments or exposures (e.g., "FIX products exposure") while excluding numeric thresholds or detailed qualifiers.
    - Combine demographic criteria into a single entry
    - Retain contextual information or negations when they are part of the condition (e.g., "no FIX inhibitor formation").
    - Ensure the output is concise and excludes extra text or formatting.
    
    Eligibility Criteria: {eligibility_text}
    
    Please list only the diseases or conditions without any extra text.
    """

    # Call the OpenAI API with the prompt
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You're a helpful clinical trial assistant"},
            {"role": "user", "content": prompt},
        ],
        max_tokens=100,
        temperature=0,
    )

    # Extract the response text containing the diseases/conditions
    extracted_conditions = response.choices[0].message.content
    return extracted_conditions

In [14]:
def flatten(arg):
    if not isinstance(arg, list):  # if not list
        return [arg]
    return [x for sub in arg for x in flatten(sub)]

In [15]:
def get_nested_value(data, keys2):
    keys = keys2.split(".")
    for key in keys:
        if isinstance(data, dict):
            data = data.get(key)
        elif isinstance(data, list):
            valuelist = []
            for ele in data:
                valuelist.append(ele.get(key))
            data = flatten(valuelist)
    return data

In [16]:
def add_dict_pair(opkey, opvalue, results_dict):
    if opkey not in results_dict:
        results_dict[opkey] = []
    results_dict[opkey].append(opvalue)

In [None]:
# Function to process trials and extract conditions
def extract_info(collection, field, content):
    results_dict = {}

    for trial in collection.find({}):
        opkey = get_nested_value(trial, field)
        opvalue = get_nested_value(trial, content)
        if content == "eligibilityCriteria":
            inclusion_text = preprocess_text(opvalue)
            opvalue = LLM_model(inclusion_text)
        if isinstance(opkey, list):
            for opkeysingle in opkey:
                add_dict_pair(opkeysingle, opvalue, results_dict)

        else:
            add_dict_pair(opkey, opvalue, results_dict)

    return results_dict

In [18]:
def store_in_collection(field, content, results_dict, collection, batch_size=100):
    operations = []

    for key, value in results_dict.items():
        operations.append(
            UpdateOne(
                {field: key},
                {"$set": {content: value}},
            )
        )

        if len(operations) == batch_size:
            collection.bulk_write(operations)
            operations = []

    if operations:
        collection.bulk_write(operations)

In [None]:
results_dict = extract_info(final_collection, "trialId", "eligibilityCriteria")

# Could also load the results_dict for reproducibility
# with open("results_dict.pkl", "rb") as f:
#     results_dict = pickle.load(f)


store_in_collection("trialId", "extractedDiseases", results_dict, final_collection)

To add other transformation steps (link_principal_investigator):

In [208]:
extract_info(final_collection, "principalInvestigator.name", "trialId")

{'Leonard A Bradshaw, PhD': ['NCT03176927'],
 'John Heinzerling, MD': ['NCT04060927'],
 'Keith Choate, MD, PhD': ['NCT06651489'],
 'Marie Roguski, MD MPH': ['NCT06649591'],
 'Christoper Barker, MD': ['NCT04792073'],
 'Nathan Congdon, MD': ['NCT06065631'],
 'Ock K Chun, PhD': ['NCT04431960'],
 'Howard M Katzenstein': ['NCT00980460'],
 'Claus Cursiefen, Prof. Dr.': ['NCT05870566'],
 'Erica C. Kaye, MD, MPH': ['NCT05116566'],
 'Anna De Simoni, PhD': ['NCT05829265'],
 'William Gleason, OD': ['NCT05285527', 'NCT05285553'],
 'Natalie Lockney, MD': ['NCT05156060'],
 'Kent Nilsson, MD': ['NCT05883631'],
 'Lucas Boersma, Prof. MD.': ['NCT05883631'],
 'Anxo Fernandez-Ferreiro, PhD': ['NCT04756466'],
 'Olcay Aycicek, Asist Prof': ['NCT06650774'],
 'Pianosi Paolo, MD': ['NCT05025774'],
 'John B. Liao': ['NCT04387227'],
 'Federico Linassi, MD': ['NCT03774420'],
 'Xiaojing Hu, PhD': ['NCT06650020', 'NCT06550440'],
 'Laurie E Cutting, PhD': ['NCT03713125'],
 'Audrey Harkness, PhD': ['NCT06432725'],
 

In [None]:
results_dict = extract_info(final_collection, "principalInvestigator.name", "trialId")
store_in_collection(
    "principalInvestigator.name", "conducted_trial", results_dict, final_collection
)