In [1]:
import pandas as pd
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from datetime import datetime

# Load data
file_path = 'synthea_100/patients.csv'
patients_df = pd.read_csv(file_path)

medication_file_path = 'synthea_100/medications.csv'
medications_df = pd.read_csv(medication_file_path)

conditions_file_path = 'synthea_100/conditions.csv'
conditions_df = pd.read_csv(conditions_file_path)

# 1. Age Calculation
patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
current_date = datetime.now()
patients_df['AGE'] = patients_df['BIRTHDATE'].apply(lambda x: (current_date - x).days // 365)

# 2. Convert START and STOP dates in both conditions and medications to datetime
conditions_df['START'] = pd.to_datetime(conditions_df['START'])
conditions_df['STOP'] = pd.to_datetime(conditions_df['STOP'])
medications_df['START'] = pd.to_datetime(medications_df['START'])
medications_df['STOP'] = pd.to_datetime(medications_df['STOP'])

In [2]:
# 3. Extract UMLS codes from DESCRIPTION columns using SciSpaCy

# Load SciSpaCy models
nlp_umls = spacy.load("en_core_sci_lg")
nlp_umls.add_pipe("abbreviation_detector")
nlp_umls.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,
    "threshold": 0.9,
    "linker_name": "umls"
})

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<scispacy.linking.EntityLinker at 0x279c2088a50>

In [3]:
nlp_rxnorm = spacy.load("en_core_sci_lg")
nlp_rxnorm.add_pipe("abbreviation_detector")
nlp_rxnorm.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,
    "linker_name": "rxnorm"
})

<scispacy.linking.EntityLinker at 0x27c48263990>

In [4]:
# Helper function to extract UMLS codes
def extract_umls_codes(texts, nlp):
    umls_codes = {}
    for text in texts:
        if pd.isna(text):
            umls_codes[text] = []
            continue
        doc = nlp(text)
        codes = set()
        for entity in doc.ents:
            for kb_ent in entity._.kb_ents:
                concept_id, score = kb_ent
                codes.add(concept_id)
        umls_codes[text] = list(codes)
    return umls_codes

In [5]:
# Helper function to extract UMLS codes
def extract_umls_codes_verbose(texts, nlp):
    linker = nlp.get_pipe("scispacy_linker")
    umls_codes = {}
    for text in texts:
        if pd.isna(text):
            umls_codes[text] = []
            continue
        doc = nlp(text)
        codes = set()
        for entity in doc.ents:
            # print(f"\nEntity Full: {entity}")
            print(f"\nEntity: {entity.text}, Type: {entity.label_}")#", Entity ID: {entity._.kb_ents}")

            # for umls_ent in entity._.kb_ents:
            #     print(linker.kb.cui_to_entity[umls_ent[0]], end="\n##################\n")
            # Get the UMLS or RxNorm entities and their confidence scores
            for kb_ent in entity._.kb_ents:
                concept_id, score = kb_ent
                concept = linker.kb.cui_to_entity[concept_id]
                # for field in concept._fields:
                #     print(f"{field}: {getattr(concept, field)}")
                print(f"  Concept ID (CUI/RxNorm ID): {concept_id}")
                print(f"  Score: {score}")
                print(f"  Preferred name: {concept.canonical_name}")
                print(f"  Definition: {concept.definition}")
                print(f"  Semantic types: {concept.types}")
                print(end="\n##################\n")  # For RxNorm/SNOMED-CT

In [6]:
extract_umls_codes_verbose(['Medication review due (situation)'], nlp_umls)


Entity: Medication review, Type: ENTITY
  Concept ID (CUI/RxNorm ID): C0560023
  Score: 0.9528256058692932
  Preferred name: Medication Review
  Definition: A structured evaluation of a patient's medicines with the aim of optimizing medicines use and improving health outcomes, detecting drug related problems and recommending interventions.
  Semantic types: ['T058']

##################

Entity: situation, Type: ENTITY


  global_matches = self.global_matcher(doc)


In [7]:
# Extract UMLS codes for conditions
unique_condition_descriptions = conditions_df['DESCRIPTION'].unique()
condition_description_to_umls_codes = extract_umls_codes(unique_condition_descriptions, nlp_umls)
conditions_df['Condition_UMLS_CODES'] = conditions_df['DESCRIPTION'].map(condition_description_to_umls_codes)

# Extract UMLS codes for medications
unique_medication_descriptions = medications_df['DESCRIPTION'].unique()
medication_description_to_umls_codes = extract_umls_codes(unique_medication_descriptions, nlp_rxnorm)
medications_df['Medication_UMLS_CODES'] = medications_df['DESCRIPTION'].map(medication_description_to_umls_codes)

  global_matches = self.global_matcher(doc)
  global_matches = self.global_matcher(doc)


In [8]:
# 4. Merge DataFrames
patients_conditions = pd.merge(patients_df, conditions_df, left_on='Id', right_on='PATIENT', how='left')
conditions_medications = pd.merge(patients_conditions, medications_df, left_on='ENCOUNTER', right_on='ENCOUNTER', how='left')

# 5. Create Patient Profiles
patient_profiles = conditions_medications[['Id', 'AGE', 'GENDER', 
                                           'DESCRIPTION_x', 'CODE_x', 'START_x', 'STOP_x', 'Condition_UMLS_CODES',
                                           'DESCRIPTION_y', 'CODE_y', 'START_y', 'STOP_y', 'Medication_UMLS_CODES',
                                           'ENCOUNTER']]

# Rename columns
patient_profiles.columns = [
    'PatientID', 'Age', 'Gender', 
    'Condition_Description', 'Condition_Code', 'Condition_Start', 'Condition_End', 'Condition_UMLS_CODES',
    'Medication_Description', 'Medication_Code', 'Medication_Start', 'Medication_End', 'Medication_UMLS_CODES',
    'Encounter'
]

# Combine UMLS codes
def combine_codes(row):
    codes = []
    if isinstance(row['Condition_UMLS_CODES'], list):
        codes.extend(row['Condition_UMLS_CODES'])
    if isinstance(row['Medication_UMLS_CODES'], list):
        codes.extend(row['Medication_UMLS_CODES'])
    return list(set(codes)) if codes else []

patient_profiles['UMLS_CODES'] = patient_profiles.apply(combine_codes, axis=1)



# Display the final DataFrames
print("Patient Profiles:")
patient_profiles.head()

Patient Profiles:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_profiles['UMLS_CODES'] = patient_profiles.apply(combine_codes, axis=1)


Unnamed: 0,PatientID,Age,Gender,Condition_Description,Condition_Code,Condition_Start,Condition_End,Condition_UMLS_CODES,Medication_Description,Medication_Code,Medication_Start,Medication_End,Medication_UMLS_CODES,Encounter,UMLS_CODES
0,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Seasonal allergic rhinitis,367498001,2010-02-13,NaT,"[C1334103, C0439601, C1708423, C2607914]",Fexofenadine hydrochloride 30 MG Oral Tablet,997488.0,2010-02-13 20:39:01+00:00,NaT,"[C0771752, C0993159]",54610a6c-47b4-79f4-e75d-80f3ed4c8437,"[C0993159, C0771752, C1708423, C1334103, C2607..."
1,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),314529007,2012-09-05,2016-09-21,[C0560023],,,NaT,NaT,,54f02fad-1ad7-7139-ae76-1fc8db2ac30c,[C0560023]
2,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Acute bronchitis (disorder),10509002,2016-09-10,2016-09-17,"[C0004936, C0012634, C0149514]",Acetaminophen 325 MG Oral Tablet,313782.0,2016-09-10 04:00:34+00:00,2016-09-17 04:00:34+00:00,"[C0982952, C0993159, C0000970]",0dcd99ad-9b21-681c-8cba-2321c6cc3ec2,"[C0982952, C0993159, C0149514, C0004936, C0000..."
3,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),314529007,2016-09-28,2020-10-21,[C0560023],,,NaT,NaT,,39bc4565-99ba-afb8-4298-92165d168052,[C0560023]
4,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Concussion injury of brain (disorder),110030002,2018-08-09,2018-10-18,"[C0006104, C0012634, C0004936, C4266577]",,,NaT,NaT,,e6d0e44a-a960-970d-d526-1f21f39be969,"[C0006104, C0012634, C0004936, C4266577]"


In [22]:
# Assuming you have already run the previous code to create patient_profiles
# We'll proceed to create the patient_profiles_grouped DataFrame with additional mappings

# Function to aggregate descriptions and codes into lists
def aggregate_list(series):
    return list(series.dropna().unique())

# Function to aggregate codes into a set to avoid duplicates
def aggregate_codes(series):
    codes = set()
    for items in series.dropna():
        codes.update(items)
    return list(codes)

# Initialize an empty list to store patient data
patient_data_list = []

# Group patient_profiles by 'PatientID'
patient_groups = patient_profiles.groupby('PatientID')

# Iterate over each patient group to create the aggregated data
for patient_id, group in patient_groups:
    age = group['Age'].iloc[0]
    gender = group['Gender'].iloc[0]
    condition_descriptions = aggregate_list(group['Condition_Description'])
    condition_codes = aggregate_list(group['Condition_Code'])
    condition_umls_codes = aggregate_codes(group['Condition_UMLS_CODES'])
    medication_descriptions = aggregate_list(group['Medication_Description'])
    medication_codes = aggregate_list(group['Medication_Code'])
    medication_rxnorm_codes = aggregate_codes(group['Medication_UMLS_CODES'])
    
    # Create code-description maps
    condition_code_desc_map = {}
    for codes_list, desc in zip(group['Condition_UMLS_CODES'], group['Condition_Description']):
        if isinstance(codes_list, list):
            for code in codes_list:
                if code in condition_code_desc_map:
                    condition_code_desc_map[code].add(desc)
                else:
                    condition_code_desc_map[code] = {desc}
    
    medication_code_desc_map = {}
    for codes_list, desc in zip(group['Medication_UMLS_CODES'], group['Medication_Description']):
        if isinstance(codes_list, list):
            for code in codes_list:
                if code in medication_code_desc_map:
                    medication_code_desc_map[code].add(desc)
                else:
                    medication_code_desc_map[code] = {desc}
    
    patient_data_list.append({
        'PatientID': patient_id,
        'Age': age,
        'Gender': gender,
        'Condition_Description': condition_descriptions,
        'Condition_Code': condition_codes,
        'Condition_UMLS_CODES': condition_umls_codes,
        'Condition_Code_Description_Map': condition_code_desc_map,
        'Medication_Description': medication_descriptions,
        'Medication_Code': medication_codes,
        'Medication_UMLS_CODES': medication_rxnorm_codes,
        'Medication_Code_Description_Map': medication_code_desc_map
    })

# Create the patient_profiles_grouped DataFrame
patient_profiles_grouped = pd.DataFrame(patient_data_list)
patient_profiles_grouped.head()

Unnamed: 0,PatientID,Age,Gender,Condition_Description,Condition_Code,Condition_UMLS_CODES,Condition_Code_Description_Map,Medication_Description,Medication_Code,Medication_UMLS_CODES,Medication_Code_Description_Map
0,03f12f9e-fd3e-b845-502a-3a12511d9e48,0,M,[Medication review due (situation)],[314529007],[C0560023],{'C0560023': {'Medication review due (situatio...,[],[],[],{}
1,068d2ed4-b12c-e380-fe78-3d99eab488c2,66,F,"[Chronic sinusitis (disorder), Received higher...","[40055000, 224299000, 266934004, 59621000, 713...","[C0684224, C0022646, C0680681, C4071907, C0439...","{'C0012634': {'Sepsis (disorder)', 'Proteinuri...",[insulin isophane human 70 UNT/ML / insulin ...,"[106892.0, 310798.0, 314076.0, 308136.0, 15353...","[C0021658, C0028333, C1882383, C0020261, C0987...",{'C0021658': {'insulin isophane human 70 UNT/...
2,06a2cf9c-d766-72ef-8869-449dd2b18534,56,F,"[Recurrent urinary tract infection (disorder),...","[197927001, 224299000, 40055000, 59621000, 162...","[C0243095, C0227665, C0578022, C0234421, C0262...",{'C2945760': {'Recurrent urinary tract infecti...,"[Hydrochlorothiazide 25 MG Oral Tablet, lisino...","[310798.0, 314076.0, 308136.0, 562251.0, 31378...","[C0982952, C0993159, C0020261, C0051696, C0110...",{'C0993159': {'Amoxicillin 250 MG / Clavulanat...
3,08123544-5a97-8592-a7b7-f66e0aba6d5f,102,M,"[Essential hypertension (disorder), Served in ...","[59621000, 224355006, 105531004, 224295006, 73...","[C0243095, C0014406, C0227665, C0700287, C0578...","{'C0012634': {'Anemia (disorder)', 'Proteinuri...",[insulin isophane human 70 UNT/ML / insulin ...,"[106892.0, 310798.0, 314076.0, 1535362.0, 1085...","[C0993159, C3538423, C0020261, C0085149, C0991...",{'C0021658': {'insulin isophane human 70 UNT/...
4,08230260-e919-5227-1ee9-cccc39648a34,25,M,"[Medication review due (situation), Gingival d...","[314529007, 18718003, 10509002, 224299000, 160...","[C0243095, C0037199, C0396000, C0022745, C0149...",{'C0560023': {'Medication review due (situatio...,"[sodium fluoride 0.0272 MG/MG Oral Gel, Acetam...","[1535362.0, 313782.0, 310965.0]","[C0020740, C0982952, C0000970, C0993159]",{'C0982952': {'Acetaminophen 325 MG Oral Table...


In [12]:
import requests, re

# Define base URL for ClinicalTrials.gov API v2
base_url = 'https://clinicaltrials.gov/api/v2/studies'

def parse_eligibility_criteria(eligibility_string):
    """
    Parses the eligibility string to separate Inclusion and Exclusion criteria, handling both bullet points and numbered lists.

    Parameters:
        eligibility_string (str): The raw markdown string containing both inclusion and exclusion criteria.

    Returns:
        tuple: A tuple containing two lists: (inclusion_criteria, exclusion_criteria).
    """
    inclusion_criteria = []
    exclusion_criteria = []
    
    if "Inclusion Criteria:" in eligibility_string:
        # Split into inclusion and exclusion sections
        sections = eligibility_string.split("Exclusion Criteria:")
        inclusion_section = sections[0].split("Inclusion Criteria:")[-1]
        exclusion_section = sections[1] if len(sections) > 1 else ""

        # Use regex to split by bullet points, numbers (1., 2.), and other common list markers
        inclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-|\u2022|\u00B7)', inclusion_section)
        exclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-|\u2022|\u00B7)', exclusion_section)

        # Clean up the criteria (remove empty strings, strip extra whitespace)
        inclusion_criteria = [item.strip() for item in inclusion_criteria if item.strip()]
        exclusion_criteria = [item.strip() for item in exclusion_criteria if item.strip()]

    return inclusion_criteria, exclusion_criteria

def fetch_n_trials(n):
    """
    Fetches 'n' clinical trials from ClinicalTrials.gov that are actively recruiting and includes their eligibility criteria.

    Parameters:
        n (int): The number of trials to fetch.

    Returns:
        DataFrame: A pandas DataFrame containing the NCT ID, trial title, and eligibility criteria.
    """
    # Set query parameters for the API call
    query_params = {
        'format': 'json',  # Request the response in JSON format
        'filter.overallStatus': 'RECRUITING',  # Only fetch trials that are recruiting
        'pageSize': n  # Limit the number of trials to 'n'
    }

    trial_data = []
    try:
        # Make the initial API request to fetch a list of studies
        response = requests.get(base_url, params=query_params)

        # Check if the request was successful
        response.raise_for_status()

        # Parse the response JSON and extract the studies
        trials = response.json().get('studies', [])

        # Loop through each trial in the fetched data
        for trial in trials:
            try:
                # Extract the NCT ID and trial title from the trial
                nct_id = trial['protocolSection']['identificationModule']['nctId']
                trial_title = trial['protocolSection']['identificationModule']['briefTitle']

                # Fetch detailed information for each trial using the NCT ID
                trial_details_url = f"{base_url}/{nct_id}?format=json"
                trial_response = requests.get(trial_details_url)

                # Check if the trial details request was successful
                trial_response.raise_for_status()

                # Parse the detailed trial response
                trial_details = trial_response.json()

                # Extract eligibility criteria and process markdown into structured lists
                eligibility_string = trial_details['protocolSection'].get('eligibilityModule', {}).get('eligibilityCriteria', 'Not Available')
                inclusion_criteria, exclusion_criteria = parse_eligibility_criteria(eligibility_string)
                
                # Example: Additional fields like Age, Sex, and Healthy Volunteers
                minimum_age = trial_details['protocolSection']['eligibilityModule'].get('minimumAge', 'Not Specified')
                maximum_age = trial_details['protocolSection']['eligibilityModule'].get('maximumAge', 'Not Specified')
                sex = trial_details['protocolSection']['eligibilityModule'].get('sex', 'Not Specified')
                healthy_volunteers = trial_details['protocolSection']['eligibilityModule'].get('healthyVolunteers', 'Not Specified')

                # Append all this data to the trial_data list
                trial_data.append({
                    'NCTId': nct_id,
                    'Title': trial_title,
                    'Inclusion_Criteria': inclusion_criteria,
                    'Exclusion_Criteria': exclusion_criteria,
                    'Minimum_Age': minimum_age,
                    'Maximum_Age': maximum_age,
                    'Sex': sex,
                    'Healthy_Volunteers': healthy_volunteers
                })

            except KeyError as ke:
                print(f"KeyError: Missing key {ke} in trial data for NCT ID {nct_id}. Skipping this trial.")
            except requests.RequestException as e:
                print(f"Error fetching details for trial {nct_id}: {e}")

        # Convert the collected trial data to a pandas DataFrame
        df = pd.DataFrame(trial_data)
        return df

    except requests.RequestException as e:
        # Handle any general API request errors
        print(f"Error fetching trials: {e}")
        return None

In [13]:
# Fetch trials
df_trials = fetch_n_trials(10)

# Combine all criteria into sets for unique processing
all_inclusion_criteria = set()
all_exclusion_criteria = set()

# Flatten the lists of criteria
for criteria_list in df_trials['Inclusion_Criteria']:
    all_inclusion_criteria.update(criteria_list)

for criteria_list in df_trials['Exclusion_Criteria']:
    all_exclusion_criteria.update(criteria_list)

In [14]:
# Extract UMLS codes from inclusion criteria (conditions)
inclusion_umls_codes_mapping = extract_umls_codes(all_inclusion_criteria, nlp_umls)
# Extract RxNorm codes from inclusion criteria (medications)
inclusion_rxnorm_codes_mapping = extract_umls_codes(all_inclusion_criteria, nlp_rxnorm)

# Extract UMLS codes from exclusion criteria (conditions)
exclusion_umls_codes_mapping = extract_umls_codes(all_exclusion_criteria, nlp_umls)
# Extract RxNorm codes from exclusion criteria (medications)
exclusion_rxnorm_codes_mapping = extract_umls_codes(all_exclusion_criteria, nlp_rxnorm)

# Map back the codes to the DataFrame
def map_codes(criteria_list, code_mapping):
    codes = set()
    for criterion in criteria_list:
        codes.update(code_mapping.get(criterion, []))
    return list(codes)

df_trials['Inclusion_UMLS_Codes'] = df_trials['Inclusion_Criteria'].apply(lambda x: map_codes(x, inclusion_umls_codes_mapping))
df_trials['Inclusion_RxNorm_Codes'] = df_trials['Inclusion_Criteria'].apply(lambda x: map_codes(x, inclusion_rxnorm_codes_mapping))
df_trials['Exclusion_UMLS_Codes'] = df_trials['Exclusion_Criteria'].apply(lambda x: map_codes(x, exclusion_umls_codes_mapping))
df_trials['Exclusion_RxNorm_Codes'] = df_trials['Exclusion_Criteria'].apply(lambda x: map_codes(x, exclusion_rxnorm_codes_mapping))

# Display the updated DataFrame
df_trials[['NCTId', 'Title', 'Inclusion_UMLS_Codes', 'Inclusion_RxNorm_Codes', 'Exclusion_UMLS_Codes', 'Exclusion_RxNorm_Codes']]

  global_matches = self.global_matcher(doc)


Unnamed: 0,NCTId,Title,Inclusion_UMLS_Codes,Inclusion_RxNorm_Codes,Exclusion_UMLS_Codes,Exclusion_RxNorm_Codes
0,NCT04687176,Frontline Oral Arsenic Trioxide for APL,"[C0023487, C0599813, C1705627, C2987521, C0040...",[],"[C0043210, C0030705, C0013516, C1739039, C1705...","[C0010286, C0010294]"
1,NCT06400576,Smartphone & Headphone Effects on Baropodometr...,"[C2987476, C0162574]",[],"[C0205494, C0266498, C3887460, C2707261, C0042...",[]
2,NCT06530576,Thalidomide for the Symptomatic Large Granular...,"[C0220913, C0439508, C1880840, C1366500, C0886...",[],"[C0449432, C0679217, C0043210, C4037974, C0201...","[C0376325, C0220847, C0019169]"
3,NCT04900376,Evaluation of Covid-19 Vaccination in Elderly ...,"[C2349001, C2697811, C1705923, C0557651, C1706...",[],"[C2349001, C2697811, C1706203, C0681850, C1550...",[]
4,NCT05179876,A Study Providing Treatment Access in Particip...,"[C0043210, C1096775, C0439508, C0814225, C1880...","[C4074663, C2606556, C1176316, C2000145]","[C0043210, C1609436, C0019016, C0205309, C1382...","[C0085845, C1176316, C0010286, C0010294, C4074..."
5,NCT04858776,Perivenous Dexamethasone Therapy: Examining Re...,"[C0038257, C0043210, C1880641, C0470187, C0087...",[C0004057],"[C0007203, C0237881, C0087086, C0231220, C1096...","[C0019134, C2930043, C0700603, C0037513, C0054..."
6,NCT05666076,Peng Block or Suprascapular Nerve Block for Po...,"[C0030705, C0002915, C2930406, C0162574]",[],"[C0242402, C2598155, C0302142, C0205469, C0030...","[C0055152, C0005100, C0030346, C0001963, C0089..."
7,NCT04756076,Study Roles of Heavy Metals and Essential Meta...,"[C1114365, C1704338, C0079399, C0030705, C0945...",[],"[C0439508, C0439234]",[]
8,NCT05900427,Effects on Postoperative Pain of Liposomal Bup...,"[C0944911, C1556083, C0043210, C2923685, C1705...",[],"[C0944911, C1444657, C1556083, C3541403, C2923...",[C0006400]
9,NCT05442827,A Phase II/III Study of Efficacy and Safety of...,"[C0237881, C0043210, C0021430, C0444706, C1561...",[C0021344],"[C0237417, C0029939, C0011065, C2708137, C1096...","[C0031507, C0011777, C0278766, C0237417, C0031..."


In [15]:
# df_trials where condition code has C0013227
df_trials[df_trials['Inclusion_UMLS_Codes'].apply(lambda x: 'C0013227' in x)].Inclusion_Criteria.to_dict()

{}

In [21]:
import pandas as pd
import numpy as np

# Helper function to convert age fields from df_trials to integers
def convert_age(age_str):
    if pd.isna(age_str) or age_str in ['Not Specified', 'N/A']:
        return np.nan  # Return NaN for missing or unspecified ages
    try:
        return int(age_str.split()[0])  # Convert '18 Years' -> 18
    except:
        return np.nan  # Handle any errors

# Helper function to check if patient age matches the trial's age criteria
def match_age(patient_age, min_age, max_age):
    if pd.isna(min_age):  # If no minimum age is specified, assume 0
        min_age = 0
    if pd.isna(max_age):  # If no maximum age is specified, assume no upper limit
        max_age = 120
    return min_age <= patient_age <= max_age

# Helper function to check if the patient’s sex matches the trial’s sex requirement
def match_sex(patient_gender, trial_gender):
    if trial_gender == "ALL":
        return True
    return patient_gender == trial_gender

# Helper function to determine if a patient is a healthy volunteer
def is_healthy_volunteer(patient_conditions):
    # If the patient has no conditions, they are considered a healthy volunteer
    # If all conditions have ended (STOP is not NaT), they are also healthy
    active_conditions = patient_conditions[(patient_conditions['Condition_End'].isna()) | 
                                           (patient_conditions['Condition_End'] > pd.Timestamp.now())]
    return active_conditions.empty

# Main function to match patients to trials in chunks
def match_patients_to_trials(patient_profiles, df_trials, chunk_size=100):
    # Convert age fields in df_trials to integers
    df_trials['Minimum_Age'] = df_trials['Minimum_Age'].apply(convert_age)
    df_trials['Maximum_Age'] = df_trials['Maximum_Age'].apply(convert_age)

    # List to store all matches
    matches = []

    # Chunk processing of patients
    for i in range(0, len(patient_profiles), chunk_size):
        patient_chunk = patient_profiles.iloc[i:i + chunk_size]

        # Iterate over each patient in the chunk
        for patient_id, patient_data in patient_chunk.groupby('PatientID'):
            patient_age = patient_data['Age'].values[0]
            patient_gender = patient_data['Gender'].values[0]
            is_healthy = is_healthy_volunteer(patient_data)

            # Iterate over each trial and check for matching criteria
            for _, trial in df_trials.iterrows():
                # Age match
                if not match_age(patient_age, trial['Minimum_Age'], trial['Maximum_Age']):
                    continue

                # Sex match
                if not match_sex(patient_gender, trial['Sex']):
                    continue

                # Healthy volunteers check
                if trial['Healthy_Volunteers'] and not is_healthy:
                    continue

                # If all criteria are matched, append the result
                matches.append({
                    'PatientID': patient_id,
                    'TrialID': trial['NCTId'],
                    'TrialTitle': trial['Title'],
                    'EligibilityMet': 'Age, Sex, Healthy Volunteer Matched'
                })

    # Convert the matches list to a DataFrame
    matches_df = pd.DataFrame(matches)

    return matches_df

# Example usage:
# Assuming patient_profiles and df_trials are already loaded

# Set chunk size to 100
chunk_size = 100

# Call the matching function
matched_trials = match_patients_to_trials(patient_profiles, df_trials, chunk_size=chunk_size)
matched_trials.head()

Unnamed: 0,PatientID,TrialID,TrialTitle,EligibilityMet
0,3cd46ba4-0a11-1c66-8525-3f4230e65ba3,NCT04687176,Frontline Oral Arsenic Trioxide for APL,"Age, Sex, Healthy Volunteer Matched"
1,3cd46ba4-0a11-1c66-8525-3f4230e65ba3,NCT06530576,Thalidomide for the Symptomatic Large Granular...,"Age, Sex, Healthy Volunteer Matched"
2,3cd46ba4-0a11-1c66-8525-3f4230e65ba3,NCT05179876,A Study Providing Treatment Access in Particip...,"Age, Sex, Healthy Volunteer Matched"
3,6b0b0021-df03-cbb6-305a-11ec40da7af4,NCT04687176,Frontline Oral Arsenic Trioxide for APL,"Age, Sex, Healthy Volunteer Matched"
4,6b0b0021-df03-cbb6-305a-11ec40da7af4,NCT06530576,Thalidomide for the Symptomatic Large Granular...,"Age, Sex, Healthy Volunteer Matched"


In [26]:
import os
from datetime import datetime

# Helper function to convert age strings to integers
def convert_age(age_str):
    if pd.isna(age_str) or age_str in ['Not Specified', 'N/A']:
        return np.nan
    try:
        return int(age_str.split()[0])
    except:
        return np.nan

# Helper function to check age eligibility
def match_age(patient_age, min_age, max_age):
    if pd.isna(min_age):
        min_age = 0
    if pd.isna(max_age):
        max_age = 120
    return min_age <= patient_age <= max_age

# Helper function to check sex eligibility
def match_sex(patient_gender, trial_gender):
    if trial_gender == "ALL":
        return True
    return patient_gender == trial_gender

# Helper function to determine if a patient is a healthy volunteer
def is_healthy_volunteer(patient):
    # If the patient has no conditions, they are considered healthy
    return len(patient['Condition_Description']) == 0

def match_patients_to_trials(patient_profiles_grouped, df_trials, chunk_size=100):
    # Convert age fields in df_trials to integers
    df_trials['Minimum_Age'] = df_trials['Minimum_Age'].apply(convert_age)
    df_trials['Maximum_Age'] = df_trials['Maximum_Age'].apply(convert_age)

    # Iterate over patients in chunks
    for i in range(0, len(patient_profiles_grouped), chunk_size):
        patient_chunk = patient_profiles_grouped.iloc[i:i + chunk_size]

        # Iterate over each patient in the chunk
        for idx, patient in patient_chunk.iterrows():
            patient_id = patient['PatientID']
            patient_age = patient['Age']
            patient_gender = patient['Gender']
            is_healthy = is_healthy_volunteer(patient)

            # Prepare patient-specific log directory
            patient_log_dir = os.path.join('logs', patient_id)
            os.makedirs(patient_log_dir, exist_ok=True)

            # Iterate over each trial
            for _, trial in df_trials.iterrows():
                trial_id = trial['NCTId']
                trial_title = trial['Title']
                min_age = trial['Minimum_Age']
                max_age = trial['Maximum_Age']
                trial_sex = trial['Sex']
                healthy_volunteers = trial['Healthy_Volunteers'] == 'Yes'

                # Initialize match status and logs
                match_status = True
                log_lines = []

                # Log patient details
                log_lines.append(f"Patient ID: {patient_id}")
                log_lines.append(f"Age: {patient_age}")
                log_lines.append(f"Gender: {patient_gender}")
                log_lines.append(f"Conditions: {patient['Condition_Description']}")
                log_lines.append(f"Medications: {patient['Medication_Description']}")
                log_lines.append("########")

                # Log trial details
                log_lines.append("Trial Details")
                log_lines.append(f"Title: {trial_title}")
                log_lines.append("1. Inclusion Criteria:")
                for crit in trial['Inclusion_Criteria']:
                    log_lines.append(f"   - {crit}")
                log_lines.append("2. Exclusion Criteria:")
                for crit in trial['Exclusion_Criteria']:
                    log_lines.append(f"   - {crit}")
                log_lines.append("##########")

                # Match Check
                log_lines.append("Match Check:")

                # 1. Age check
                age_match = match_age(patient_age, min_age, max_age)
                log_lines.append(f"1. Age = {patient_age} vs [{min_age}, {max_age}] = {'MATCH' if age_match else 'NO MATCH'}")
                if not age_match:
                    match_status = False

                # 2. Sex check
                sex_match = match_sex(patient_gender, trial_sex)
                log_lines.append(f"2. Sex = {patient_gender} vs {trial_sex} = {'MATCH' if sex_match else 'NO MATCH'}")
                if not sex_match:
                    match_status = False

                # 3. Health check
                health_match = True
                if healthy_volunteers and not is_healthy:
                    health_match = False
                log_lines.append(f"3. Health = {'Healthy' if is_healthy else 'Not Healthy'} vs {'Healthy Volunteers' if healthy_volunteers else 'Patients'} = {'MATCH' if health_match else 'NO MATCH'}")
                if not health_match:
                    match_status = False

                # Proceed only if basic checks passed
                if match_status:
                    # 4. Exclusion Criteria check
                    exclusion_match = True  # Assume match until proven otherwise
                    log_lines.append("4. Exclusion Criteria:")

                    # Check UMLS codes for conditions
                    patient_condition_codes = set(patient['Condition_UMLS_CODES'])
                    trial_exclusion_condition_codes = set(trial['Exclusion_UMLS_Codes'])
                    overlapping_condition_codes = patient_condition_codes.intersection(trial_exclusion_condition_codes)

                    if overlapping_condition_codes:
                        exclusion_match = False
                        for code in overlapping_condition_codes:
                            # Get condition descriptions
                            condition_descs = patient['Condition_Code_Description_Map'].get(code, [])
                            condition_descs = ', '.join(condition_descs)
                            log_lines.append(f"   - Code {code} ({condition_descs}) matches Exclusion Criteria")
                    else:
                        log_lines.append("   - No condition codes overlap with Exclusion Criteria")

                    # Check RxNorm codes for medications
                    patient_medication_codes = set(patient['Medication_UMLS_CODES'])
                    trial_exclusion_medication_codes = set(trial['Exclusion_UMLS_Codes'])
                    overlapping_medication_codes = patient_medication_codes.intersection(trial_exclusion_medication_codes)

                    if overlapping_medication_codes:
                        exclusion_match = False
                        for code in overlapping_medication_codes:
                            # Get medication descriptions
                            medication_descs = patient['Medication_Code_Description_Map'].get(code, [])
                            medication_descs = ', '.join(medication_descs)
                            log_lines.append(f"   - Code {code} ({medication_descs}) matches Exclusion Criteria")
                    else:
                        log_lines.append("   - No medication codes overlap with Exclusion Criteria")

                    if exclusion_match:
                        log_lines.append("   - Exclusion Criteria Check: MATCH")
                    else:
                        log_lines.append("   - Exclusion Criteria Check: NO MATCH")
                        match_status = False

                # If match_status remains True after Exclusion Criteria check, proceed to Inclusion Criteria
                if match_status:
                    # (Optional) Inclusion Criteria check can be implemented here
                    log_lines.append("Patient matches all criteria for this trial.")

                # Prepare log file name
                datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                log_filename = f"{trial_id}_{datetime_str}.log"
                if match_status:
                    log_filename = f"[MATCH]_{log_filename}"

                # Write logs to file
                log_filepath = os.path.join(patient_log_dir, log_filename)
                with open(log_filepath, 'w', encoding='utf-8') as f:
                    for line in log_lines:
                        f.write(line + '\n')

    print("Matching process completed.")

# Example usage:
# Assuming patient_profiles_grouped and df_trials are already loaded
match_patients_to_trials(patient_profiles_grouped, df_trials, chunk_size=100)


Matching process completed.
