In [19]:
import os
import re
import pandas as pd
import numpy as np
import spacy
from datetime import datetime
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

# Load SciSpaCy models
def load_nlp_models():
    """
    Load and return two NLP models for UMLS linking:
    one for conditions and one for medications.
    """
    nlp_umls_link = spacy.load("en_core_sci_lg")
    nlp_umls_link.add_pipe("abbreviation_detector")
    nlp_umls_link.add_pipe("scispacy_linker", config={
        "resolve_abbreviations": True,
        "linker_name": "umls"
    })

    nlp_rxnorm_link = spacy.load("en_core_sci_lg")
    nlp_rxnorm_link.add_pipe("abbreviation_detector")
    nlp_rxnorm_link.add_pipe("scispacy_linker", config={
        "resolve_abbreviations": True,
        "linker_name": "rxnorm"
    })

    return nlp_umls_link, nlp_rxnorm_link

# Load NLP models (loading large spacy models for best performance, takes 2-3 minutes)
nlp_umls_link, nlp_rxnorm_link = load_nlp_models()

INFO:nmslib:Loading index from C:\Users\cx-admin\.scispacy\datasets\7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
INFO:nmslib:Loading regular index.
INFO:nmslib:Finished loading index
INFO:nmslib:Set HNSW query-time parameters:
INFO:nmslib:ef(Search)         =20
INFO:nmslib:algoType           =2
INFO:nmslib:Set HNSW query-time parameters:
INFO:nmslib:ef(Search)         =200
INFO:nmslib:algoType           =2
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
import requests
import re
import pandas as pd
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

def fetch_n_trials(n):
    """
    Fetch 'n' clinical trials from ClinicalTrials.gov that are actively recruiting.
    Extract UMLS codes from inclusion and exclusion criteria using provided NLP models.

    Parameters:
        n (int): Number of trials to fetch.

    Returns:
        pd.DataFrame: DataFrame containing trial details and extracted UMLS codes.
    """
    base_url = 'https://clinicaltrials.gov/api/v2/studies'
    query_params = {
        'format': 'json',
        'filter.overallStatus': 'RECRUITING',
        'pageSize': n
    }

    trial_data = []

    try:
        # Fetch the list of studies
        response = requests.get(base_url, params=query_params)
        response.raise_for_status()
        trials = response.json().get('studies', [])

        for trial in trials:
            try:
                nct_id = trial['protocolSection']['identificationModule']['nctId']
                trial_title = trial['protocolSection']['identificationModule']['briefTitle']

                # Fetch detailed information for each trial
                trial_details_url = f"{base_url}/{nct_id}?format=json"
                trial_response = requests.get(trial_details_url)
                trial_response.raise_for_status()
                trial_details = trial_response.json()

                # Extract eligibility criteria
                eligibility_string = trial_details['protocolSection'].get('eligibilityModule', {}).get('eligibilityCriteria', 'Not Available')
                inclusion_criteria, exclusion_criteria = parse_eligibility_criteria(eligibility_string)

                # Extract additional trial information
                minimum_age = trial_details['protocolSection']['eligibilityModule'].get('minimumAge', 'Not Specified')
                maximum_age = trial_details['protocolSection']['eligibilityModule'].get('maximumAge', 'Not Specified')
                sex = trial_details['protocolSection']['eligibilityModule'].get('sex', 'Not Specified')
                healthy_volunteers = trial_details['protocolSection']['eligibilityModule'].get('healthyVolunteers', 'Not Specified')

                # Extract UMLS codes from inclusion and exclusion criteria
                inclusion_umls_codes_mapping = extract_codes_from_criteria(inclusion_criteria)
                exclusion_umls_codes_mapping = extract_codes_from_criteria(exclusion_criteria)

                trial_data.append({
                    'NCTId': nct_id,
                    'Title': trial_title,
                    'Inclusion_Criteria': inclusion_criteria,
                    'Exclusion_Criteria': exclusion_criteria,
                    'Minimum_Age': minimum_age,
                    'Maximum_Age': maximum_age,
                    'Sex': sex,
                    'Healthy_Volunteers': healthy_volunteers,
                    'Inclusion_Criteria_UMLS_Codes': inclusion_umls_codes_mapping,
                    'Exclusion_Criteria_UMLS_Codes': exclusion_umls_codes_mapping
                })

            except KeyError as ke:
                print(f"KeyError: Missing key {ke} in trial data for NCT ID {nct_id}. Skipping this trial.")
            except requests.RequestException as e:
                print(f"Error fetching details for trial {nct_id}: {e}")

        # Convert the collected trial data to a pandas DataFrame
        df_trials = pd.DataFrame(trial_data)
        return df_trials

    except requests.RequestException as e:
        print(f"Error fetching trials: {e}")
        return None



In [5]:
import requests
import pandas as pd
import concurrent.futures
import logging
from tqdm import tqdm
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Compile regex patterns once at the module level
CRITERIA_SPLIT_PATTERN = re.compile(r'\n(?:\d+\.\s|\*\s|-|\u2022|\u00B7)')
INCLUSION_HEADER = "Inclusion Criteria:"
EXCLUSION_HEADER = "Exclusion Criteria:"

# Define STOP_CODES as a set for O(1) lookups
STOP_CODES = {'C3244316', 'C0013227'}

def fetch_n_trials_v2(n=None, max_workers=None, verbose=False, reload=True):
    """
    Fetch 'n' clinical trials from ClinicalTrials.gov that are actively recruiting.
    If 'n' is None, fetch all trials that are actively recruiting.
    Extract UMLS codes from inclusion and exclusion criteria using provided NLP models.

    Parameters:
        n (int, optional): Number of trials to fetch. If None, fetch all trials.
        max_workers (int, optional): Maximum number of worker threads to use for parallel processing.
        verbose (bool, optional): If True, set logging level to INFO. If False, set to WARNING.
        reload (bool, optional): If False and processed file exists, load and return it.

    Returns:
        pd.DataFrame: DataFrame containing trial details and extracted UMLS codes.
    """

    # Define the path to save the processed trials
    processed_file_path = os.path.join('data', 'trials', 'df_trials_processed.csv')

    # If reload is False and the processed file exists, load and return it
    if not reload and os.path.exists(processed_file_path):
        return pd.read_csv(processed_file_path)

    # Setup Logging
    logger = logging.getLogger('fetch_n_trials_v2')
    logger.setLevel(logging.INFO if verbose else logging.WARNING)
    handler = logging.StreamHandler()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    if not logger.handlers:
        logger.addHandler(handler)

    base_url = 'https://clinicaltrials.gov/api/v2/studies'
    page_size = 1000  # Adjusted as per your current function
    query_params = {
        'format': 'json',
        'filter.overallStatus': 'RECRUITING',
        'pageSize': page_size,
        'fields': ','.join([
            'protocolSection.identificationModule.nctId',
            'protocolSection.identificationModule.briefTitle',
            'protocolSection.eligibilityModule.eligibilityCriteria',
            'protocolSection.eligibilityModule.minimumAge',
            'protocolSection.eligibilityModule.maximumAge',
            'protocolSection.eligibilityModule.sex',
            'protocolSection.eligibilityModule.healthyVolunteers'
        ])
    }

    trial_data = []
    total_fetched = 0
    page_token = None
    total_trials_processed = 0
    save_every = 1000  # Save every 1000 trials
    first_save = not os.path.exists(processed_file_path)  # Determine if headers should be written

    try:
        # Ensure the output directory exists
        os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)

        logger.info("Starting to fetch trials...")
        with tqdm(desc="Fetching Trials", unit="page") as fetch_bar:
            while True:
                if page_token:
                    query_params['pageToken'] = page_token

                response = requests.get(base_url, params=query_params)
                response.raise_for_status()
                data = response.json()

                trials = data.get('studies', [])
                if not trials:
                    logger.info("No more trials found.")
                    break

                # Limit the number of trials if 'n' is specified
                if n is not None:
                    remaining = n - total_fetched
                    if remaining <= 0:
                        logger.info("Reached the desired number of trials.")
                        break
                    trials = trials[:remaining]

                total_fetched += len(trials)
                trial_data.extend(trials)

                fetch_bar.update(1)

                # Check if there is a next page
                page_token = data.get('nextPageToken')
                if not page_token:
                    logger.info("No nextPageToken found. All trials fetched.")
                    break

                # Stop fetching if we've reached the desired number of trials
                if n is not None and total_fetched >= n:
                    logger.info("Reached the desired number of trials.")
                    break

        logger.info(f"Total trials fetched: {total_fetched}")

        # Process trials in parallel with progress bar
        def process_trial(trial):
            try:
                nct_id = trial['protocolSection']['identificationModule']['nctId']
                trial_title = trial['protocolSection']['identificationModule']['briefTitle']

                # Extract eligibility criteria
                eligibility_string = trial['protocolSection'].get('eligibilityModule', {}).get('eligibilityCriteria', 'Not Available')
                inclusion_criteria, exclusion_criteria = parse_eligibility_criteria(eligibility_string)

                # Extract additional trial information
                eligibility_module = trial['protocolSection'].get('eligibilityModule', {})
                minimum_age = eligibility_module.get('minimumAge', 'Not Specified')
                maximum_age = eligibility_module.get('maximumAge', 'Not Specified')
                sex = eligibility_module.get('sex', 'Not Specified')
                healthy_volunteers = eligibility_module.get('healthyVolunteers', 'Not Specified')

                # Extract UMLS codes from inclusion and exclusion criteria
                inclusion_umls_codes_mapping = extract_codes_from_criteria(inclusion_criteria, nlp_umls_link, nlp_rxnorm_link)
                exclusion_umls_codes_mapping = extract_codes_from_criteria(exclusion_criteria, nlp_umls_link, nlp_rxnorm_link)

                return {
                    'NCTId': nct_id,
                    'Title': trial_title,
                    'Inclusion_Criteria': inclusion_criteria,
                    'Exclusion_Criteria': exclusion_criteria,
                    'Minimum_Age': minimum_age,
                    'Maximum_Age': maximum_age,
                    'Sex': sex,
                    'Healthy_Volunteers': healthy_volunteers,
                    'Inclusion_Criteria_UMLS_Codes': inclusion_umls_codes_mapping,
                    'Exclusion_Criteria_UMLS_Codes': exclusion_umls_codes_mapping
                }

            except KeyError as ke:
                logger.warning(f"Missing key {ke} in trial data. Skipping trial.")
                return None
            except Exception as e:
                logger.error(f"Error processing trial: {e}")
                return None

        logger.info("Starting parallel processing of trials...")
        save_buffer = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = list(executor.submit(process_trial, trial) for trial in trial_data)

            # Use tqdm to track progress
            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Trials", unit="trial"):
                result = future.result()
                if result:
                    save_buffer.append(result)
                    total_trials_processed += 1

                    # Save every 'save_every' trials
                    if total_trials_processed % save_every == 0:
                        df_save = pd.DataFrame(save_buffer)
                        if first_save:
                            df_save.to_csv(processed_file_path, mode='w', index=False, encoding='utf-8')
                            first_save = False
                            logger.info(f"Saved {save_every} trials to {processed_file_path}.")
                        else:
                            df_save.to_csv(processed_file_path, mode='a', index=False, header=False, encoding='utf-8')
                            logger.info(f"Appended {save_every} trials to {processed_file_path}.")
                        save_buffer = []  # Reset buffer

        logger.info(f"Total trials processed successfully: {total_trials_processed}")

        # Save any remaining trials in the buffer
        if save_buffer:
            df_save = pd.DataFrame(save_buffer)
            if first_save:
                df_save.to_csv(processed_file_path, mode='w', index=False, encoding='utf-8')
                logger.info(f"Saved remaining {len(save_buffer)} trials to {processed_file_path}.")
            else:
                df_save.to_csv(processed_file_path, mode='a', index=False, header=False, encoding='utf-8')
                logger.info(f"Appended remaining {len(save_buffer)} trials to {processed_file_path}.")

        logger.info("DataFrame creation and saving successful.")
        return pd.read_csv(processed_file_path)

    except requests.RequestException as e:
        logger.error(f"Error fetching trials: {e}")
        return None

In [6]:

def extract_umls_codes(descriptions, nlp):
    """
    Extract UMLS codes from descriptions using the provided NLP model.

    Parameters:
        descriptions (list): List of text descriptions.
        nlp (spacy.lang): NLP model for UMLS linking.

    Returns:
        dict: Mapping from description to list of UMLS codes.
    """
    code_mapping = {}
    for desc in descriptions:
        if pd.isna(desc) or desc == '':
            code_mapping[desc] = []
            continue
        doc = nlp(desc)
        codes = set()
        for entity in doc.ents:
            for kb_ent in entity._.kb_ents:
                concept_id, score = kb_ent
                codes.add(concept_id)
        code_mapping[desc] = list(codes)
    return code_mapping

def extract_umls_code_definitions(unique_codes, nlp_umls_conditions=None, nlp_umls_medications=None, reload=False):
    """
    Extract definitions for UMLS codes using SciSpaCy models.

    Parameters:
        unique_codes (set): Set of unique UMLS codes.
        nlp_umls_conditions (spacy.lang): NLP model for UMLS linking (conditions).
        nlp_umls_medications (spacy.lang): NLP model for UMLS linking (medications).

    Returns:
        pd.DataFrame: DataFrame containing code details.
    """

    # check if present in data/umls_codes.csv
    if not reload and os.path.exists('data/umls_codes.csv'):
        umls_codes_df = pd.read_csv('data/umls_codes.csv')
        umls_codes_df = umls_codes_df[umls_codes_df['code'].isin(unique_codes)]
        if len(umls_codes_df) == len(unique_codes):
            return umls_codes_df

    # Initialize a list to store code information
    code_data = []

    # Get the linkers from the models
    linker_conditions = nlp_umls_conditions.get_pipe('scispacy_linker')
    linker_medications = nlp_umls_medications.get_pipe('scispacy_linker')

    # Combine the two linkers' knowledge bases
    all_linkers = [linker_conditions, linker_medications]
    processed_codes = set()

    for code in unique_codes:
        if code in processed_codes:
            continue  # Skip if already processed
        entity = None
        for linker in all_linkers:
            entity = linker.kb.cui_to_entity.get(code)
            if entity:
                break
        if entity:
            code_data.append({
                'code': code,
                'canonical_name': entity.canonical_name,
                'definition': entity.definition,
                'aliases': entity.aliases,
                'types': entity.types
            })
        else:
            # Code not found in either linker
            code_data.append({
                'code': code,
                'canonical_name': None,
                'definition': None,
                'aliases': None,
                'types': None
            })
        processed_codes.add(code)
    
    # Create a DataFrame from the code data
    spacy_umls_codes_df = pd.DataFrame(code_data)

    # Save the DataFrame to a CSV file
    spacy_umls_codes_df.to_csv('data/umls_codes.csv', index=False)
    
    return spacy_umls_codes_df


In [13]:
from dotenv import load_dotenv
load_dotenv()
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    raise ValueError("MISTRAL_API_KEY environment variable is not set.")

In [16]:
import functools
import json
import os
from typing import Dict, List

from mistralai import Mistral
from mistralai.models.function import Function
from mistralai.models.systemmessage import SystemMessage
from mistralai.models.usermessage import UserMessage
from mistralai.models.toolmessage import ToolMessage

# dotenv setup
from dotenv import load_dotenv
load_dotenv()

def call_mistral_api(log_text: str) -> Dict:
    """
    Call the Mistral AI API with the given log text and retrieve eligibility information.

    Parameters:
        log_text (str): The log text generated during patient-trial matching.

    Returns:
        Dict: Parsed JSON response containing 'patientIsEligibleForTrial' (bool),
              'inclusionCriteriaMet' (list), and 'exclusionCriteriaMet' (list).
    """
    # Define the function that the model can call
    tools = [
        {
            "type": "function",
            "function": Function(
                name="evaluate_patient_eligibility",
                description="Determine if the patient is eligible for the trial based on the log provided.",
                parameters={
                    "type": "object",
                    "properties": {
                        "inclusionCriteriaMet": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "description": "Description of the inclusion criterion and why/why not it was met. This explanation should be concise and accurate."
                            },
                            "description": "List of inclusion criteria met by the patient."
                        },
                        "exclusionCriteriaMet": {
                            "type": "array",
                            "items": {
                                "type": "string",
                                "description": "Description of the exclusion criterion and why/why not it was met. This explanation should be concise and accurate."
                            },
                            "description": "List of exclusion criteria met by the patient."
                        },
                        "patientIsEligibleForTrial": {
                            "type": "boolean",
                            "description": "Indicates if the patient is eligible for the trial. This should be an accurate assessment based on the provided log."
                        }
                    },
                    "required": ["inclusionCriteriaMet", "exclusionCriteriaMet", "patientIsEligibleForTrial"]
                }
            )
        }
    ]

    # Initialize Mistral client
    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY environment variable is not set.")

    model = "mistral-small-latest"  # Using the cheapest available model

    client = Mistral(api_key=api_key)

    # Prepare initial messages
    messages = [SystemMessage(content="""Analyze the following patient-trial matching log and determine eligibility. 
    **Patient-Trial Matching Log Format:**
Patient ID: string
Trial ID: string
Match Check:
1. Age = {{patient_age}} vs {{trial_age_required}} = MATCH (trial age can be nan if not specified)
2. Sex = {{patient_sex}} vs {{trial_sex_required}} = MATCH
3. Health = {{patient_health_status}} vs {{trial_health_status_required}} = MATCH
4. Exclusion Criteria:
   - Criterion 1: {{trial_exclusion_criterion_1}} = MATCH (match if patient doesn't have the condition, i.e., MATCH implies the patient is still eligible)
    ...
   - Criterion n-1: {{trial_exclusion_criterion_n-1}} = NO MATCH (no match if patient has the condition, i.e., NO MATCH implies the patient is not eligible)
      + [UMLS_CODE_1] Condition 1 (finding) (UMLS_CODE_1 is the UMLS code for the condition) (these codes and the condition describe why the patient is not eligible)
      + [UMLS_CODE_2] Condition 2 (finding) ...
   - Criterion n: {{trial_exclusion_criterion_n}} = MATCH 

    The log contains a simple analysis conducted using SciSpaCy models and their NER capabilities. However, these results may be inconsistent due to false positives. As a Healthcare QC Specialist, you need to review the log and provide a final assessment of the patient's eligibility for the clinical trial.

    **Example Output JSON Format:**
    {
        "inclusionCriteriaMet": ["Patient meets the age criteria.", ...],
        "exclusionCriteriaMet": ["Patient does not have Summarize Condition 1 (UMLS_CODE_1) as required.", ...],
        "patientIsEligibleForTrial": true/false
    }
    """)]

    # Append the log text as user message
    messages.append(UserMessage(content=log_text))

    # Send the request to Mistral API with tools (which define the output format)
    response = client.chat.complete(model=model, messages=messages, tools=tools)

    # Handle response and ensure it returns a JSON format with patient eligibility
    try:
        if response.choices and response.choices[0].message.tool_calls:
            # Check if the model invoked the tool and produced the required format
            tool_call = response.choices[0].message.tool_calls[0]
            function_params = json.loads(tool_call.function.arguments)
            return function_params
        else:
            # Try to parse the response directly as JSON if no tool call
            eligibility_json = json.loads(response.choices[0].message.content)
            return eligibility_json
    except json.JSONDecodeError:
        print("Failed to parse JSON output from Mistral API.")
        return {}


In [7]:
from openai import OpenAI, AzureOpenAI
import openai
import os
import json
from typing import Dict
from retry import retry   
# dotenv setup to load environment variables
from dotenv import load_dotenv
load_dotenv()


def call_openai(log_text: str) -> Dict:
    """
    Call the OpenAI API with the given log text and retrieve eligibility information.

    Parameters:
        log_text (str): The log text generated during patient-trial matching.

    Returns:
        Dict: Parsed JSON response containing 'patientIsEligibleForTrial' (bool),
              'inclusionCriteriaMet' (list), and 'exclusionCriteriaMet' (list).
    """


    # Define the function that the model can call
    functions = [
        {
        "type": "function",
        "function":
            {
            "name": "evaluate_patient_eligibility",
            "description": "Determine if the patient is eligible for the trial based on the log provided.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inclusionCriteriaMet": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "description": "Description of the inclusion criterion and why/why not it was met."
                        },
                        "description": "List of inclusion criteria met by the patient."
                    },
                    "exclusionCriteriaMet": {
                        "type": "array",
                        "items": {
                            "type": "string",
                            "description": "Description of the exclusion criterion and why/why not it was met."
                        },
                        "description": "List of exclusion criteria met by the patient."
                    },
                    "patientIsEligibleForTrial": {
                        "type": "boolean",
                        "description": "Indicates if the patient is eligible for the trial."
                    }
                },
                "required": ["inclusionCriteriaMet", "exclusionCriteriaMet", "patientIsEligibleForTrial"]
            }
        }
        }
    ]

    # Prepare the messages for the conversation
    messages = [
        {"role": "system", "content": """
        Analyze the following patient-trial matching log and determine eligibility. 
    **Patient-Trial Matching Log Format:**
Patient ID: string
Trial ID: string
Match Check:
1. Age = {{patient_age}} vs {{trial_age_required}} = MATCH (trial age can be nan if not specified)
2. Sex = {{patient_sex}} vs {{trial_sex_required}} = MATCH
3. Health = {{patient_health_status}} vs {{trial_health_status_required}} = MATCH
4. Exclusion Criteria:
   - Criterion 1: {{trial_exclusion_criterion_1}} = MATCH (match if patient doesn't have the condition, i.e., MATCH implies the patient is still eligible)
    ...
   - Criterion n-1: {{trial_exclusion_criterion_n-1}} = NO MATCH (no match if patient has the condition, i.e., NO MATCH implies the patient is not eligible)
      + [UMLS_CODE_1] Condition 1 (finding) (UMLS_CODE_1 is the UMLS code for the condition) (these codes and the condition describe why the patient is not eligible)
      + [UMLS_CODE_2] Condition 2 (finding) ...
   - Criterion n: {{trial_exclusion_criterion_n}} = MATCH 

    The log contains a simple analysis conducted using SciSpaCy models and their NER capabilities. However, these results may be inconsistent due to false positives. As a Healthcare QC Specialist, you need to review the log and provide a final assessment of the patient's eligibility for the clinical trial.

    **Example Output JSON Format:**
    {
        "inclusionCriteriaMet": ["Patient meets the age criteria.", ...],
        "exclusionCriteriaMet": ["Patient does not have Summarize Condition 1 (UMLS_CODE_1) as required.", ...],
        "patientIsEligibleForTrial": true/false
    }
    Note: The "+ [UMLS_CODE] Condition" lines could be flawed due to false positives, that's when you need to correct the log. One way to find these false positives is to check if description of criteria n matches the conditions described by the codes. If not, it is a false positive.
    Finally, make modifications to the log (if necessary) to ensure the patient's eligibility is accurately assessed. Watch out for false positives and negatives in the criteria analysis. Use chain-of-thought reasoning to spot double negatives and other complex patterns.
        """},
        {"role": "user", "content": log_text}
    ]

    # Call the OpenAI API
    response_json = call_openai_api(
        model="gpt-4o",
        seed=None,
        bw_message=messages,
        tools=functions,
        tool_choice= {"type": "function", "function": {"name": "evaluate_patient_eligibility"}},
        temperature=0.5,
        max_tokens=500
    )

    return response_json

In [8]:
@retry(tries=3, delay=2)  
def call_openai_api(  
        model: str,
        seed: int,  
        bw_message: list,  
        tools: list = None,
        tool_choice: dict = None,  
        temperature: float = 0,  
        max_tokens: int = 500
    ):  
    try:
        # client = OpenAI()
        client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
            api_version="2024-07-01-preview",
            azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
        )

        response = client.chat.completions.create(  
            model=model,  
            seed=seed,  
            messages=bw_message,  
            temperature=temperature,  
            max_tokens=max_tokens,  
            top_p=1,  
            frequency_penalty=0,  
            presence_penalty=0,  
            tools=tools,  
            tool_choice=tool_choice,  
        )  

        if tool_choice is None:
            return response.choices[0].message.content
        else:
            output = response.choices[0].message.tool_calls[0].function.arguments  
  
        output_dict = json.loads(output)
        return output_dict        
          
    except openai.APIError as e:  
        print(f"\t- OpenAI API returned an API Error: {e}")  
        raise  
    except openai.APIConnectionError as e:  
        print(f"\t- OpenAI API connection error: {e}")  
        raise  
    except openai.RateLimitError as e:  
        print(f"\t- OpenAI API rate limit error: {e}")  
        raise  
    except Exception as e:  
        print(f"\t- An error occurred: {e}")  
        raise  

In [86]:
sample_log_text = """
Patient ID: f264901f-b031-8e53-26f0-92ca046b9472
Trial ID: NCT05666076
Match Check:
1. Age = 55 vs [nan, nan] = MATCH
2. Sex = M vs ALL = MATCH
3. Health = Not Healthy vs Patients = MATCH
4. Exclusion Criteria:
   - Criterion 1: Patients with deformity and pathology in the shoulder region = MATCH
   - Criterion 2: Patients with known local anesthetic allergy = MATCH
   - Criterion 3: Patients with BMI\>35 = MATCH
   - Criterion 4: Patients with alcohol and substance addiction = NO MATCH
      + [C0001948] Unhealthy alcohol drinking behavior (finding)
      + [C0001962] Unhealthy alcohol drinking behavior (finding)
   - Criterion 5: Patients with opioid addiction = MATCH
   - Criterion 6: Patients who cannot perceive and evaluate pain, such as psychiatric illness, mental retardation = NO MATCH
      + [C0004936] Ischemic heart disease (disorder), Acute bronchitis (disorder), Acute viral pharyngitis (disorder), Gingivitis (disorder), Viral sinusitis (disorder)
"""

# call_mistral_api(sample_log_text)
call_openai(sample_log_text)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'inclusionCriteriaMet': ['Patient meets the age criteria.',
  'Patient meets the sex criteria.'],
 'exclusionCriteriaMet': ['Patient does not have deformity and pathology in the shoulder region as required.',
  'Patient does not have known local anesthetic allergy as required.',
  'Patient does not have BMI>35 as required.',
  'Patient has alcohol and substance addiction, which does not meet the exclusion criteria.',
  'Patient cannot perceive and evaluate pain due to psychiatric illness, which does not meet the exclusion criteria.'],
 'patientIsEligibleForTrial': False}

In [16]:
# Helper functions for data loading and processing
def load_data(file_path, encoding='utf-8'):
    """
    Load data from a CSV file with the specified encoding.

    Parameters:
    file_path (str): The path to the CSV file to be loaded.
    encoding (str, optional): The encoding used to read the CSV file. Default is 'utf-8'.

    Returns:
    pd.DataFrame: A pandas DataFrame containing the loaded data.
    """
    return pd.read_csv(file_path, encoding=encoding)

# Function to calculate age in years based on birthdate.
def calculate_age(birthdate):
    """
    Calculate age in years based on the provided birthdate.

    Args:
        birthdate (datetime): The birthdate of the individual.

    Returns:
        int: The age in years.
    """
    current_date = datetime.now()
    return (current_date - birthdate).days // 365

def convert_to_datetime(df, date_columns):
    """
    Convert specified columns in DataFrame to datetime.
    """
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df

def extract_umls_codes(descriptions, nlp):
    """
    Extract UMLS codes from descriptions using the provided NLP model.
    Returns a mapping from description to list of UMLS codes.
    """
    code_mapping = {}
    for desc in descriptions:
        if pd.isna(desc) or desc == '':
            code_mapping[desc] = []
            continue
        doc = nlp(desc)
        codes = set()
        for entity in doc.ents:
            for kb_ent in entity._.kb_ents:
                concept_id, score = kb_ent
                codes.add(concept_id)
        code_mapping[desc] = list(codes)
    return code_mapping

def aggregate_list(series):
    """
    Aggregate a pandas Series into a list of unique, non-null values.
    """
    return list(series.dropna().unique())

def aggregate_codes(series):
    """
    Aggregate a pandas Series of lists into a list of unique codes.
    """
    codes = set()
    for items in series.dropna():
        codes.update(items)
    return list(codes)

def create_code_description_map(codes_list, descriptions_list):
    """
    Create a mapping from codes to descriptions.
    """
    code_desc_map = {}
    for codes, desc in zip(codes_list, descriptions_list):
        if isinstance(codes, list):
            for code in codes:
                if code in code_desc_map:
                    code_desc_map[code].add(desc)
                else:
                    code_desc_map[code] = {desc}
    return code_desc_map

# Data Loading and Preprocessing
def prepare_patient_data(reload=False):
    """
    Load and preprocess patient data, conditions, and medications.
    Returns the patient_profiles_grouped DataFrame.
    """
    STOP_CODES = {'C3244316', 'C0013227'}
    if not reload and os.path.exists('data/patient/patient_profiles_grouped.csv'):
        return pd.read_csv('data/patient/patient_profiles_grouped.csv')
        
    # Load data
    patients_df = load_data('synthea_100/patients.csv')
    medications_df = load_data('synthea_100/medications.csv')
    conditions_df = load_data('synthea_100/conditions.csv')

    # Calculate age
    patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'], errors='coerce')
    patients_df['AGE'] = patients_df['BIRTHDATE'].apply(calculate_age)

    # Convert date columns to datetime
    date_columns_conditions = ['START', 'STOP']
    date_columns_medications = ['START', 'STOP']
    conditions_df = convert_to_datetime(conditions_df, date_columns_conditions)
    medications_df = convert_to_datetime(medications_df, date_columns_medications)


    # Extract UMLS codes for conditions
    unique_condition_descriptions = conditions_df['DESCRIPTION'].unique()
    condition_description_to_umls_codes = extract_umls_codes(unique_condition_descriptions, nlp_umls_link)
    conditions_df['Condition_UMLS_CODES'] = conditions_df['DESCRIPTION'].map(condition_description_to_umls_codes)

    # Extract UMLS codes for medications
    unique_medication_descriptions = medications_df['DESCRIPTION'].unique()
    medication_description_to_umls_codes = extract_umls_codes(unique_medication_descriptions, nlp_rxnorm_link)
    medications_df['Medication_UMLS_CODES'] = medications_df['DESCRIPTION'].map(medication_description_to_umls_codes)

    conditions_df['Condition_UMLS_CODES'] = conditions_df['Condition_UMLS_CODES'].apply(
        lambda codes: [code for code in codes if code not in STOP_CODES]
    )

    medications_df['Medication_UMLS_CODES'] = medications_df['Medication_UMLS_CODES'].apply(
        lambda codes: [code for code in codes if code not in STOP_CODES]
    )

    # Merge DataFrames
    patients_conditions = pd.merge(patients_df, conditions_df, left_on='Id', right_on='PATIENT', how='left')
    conditions_medications = pd.merge(patients_conditions, medications_df, left_on='ENCOUNTER', right_on='ENCOUNTER', how='left')

    # Create Patient Profiles
    patient_profiles = conditions_medications[['Id', 'AGE', 'GENDER', 
                                               'DESCRIPTION_x', 'CODE_x', 'START_x', 'STOP_x', 'Condition_UMLS_CODES',
                                               'DESCRIPTION_y', 'CODE_y', 'START_y', 'STOP_y', 'Medication_UMLS_CODES',
                                               'ENCOUNTER']]

    # Rename columns
    patient_profiles.columns = [
        'PatientID', 'Age', 'Gender', 
        'Condition_Description', 'Condition_Code', 'Condition_Start', 'Condition_End', 'Condition_UMLS_CODES',
        'Medication_Description', 'Medication_Code', 'Medication_Start', 'Medication_End', 'Medication_UMLS_CODES',
        'Encounter'
    ]

    # Create Patient Profiles Grouped
    patient_data_list = []
    patient_groups = patient_profiles.groupby('PatientID')

    for patient_id, group in patient_groups:
        age = group['Age'].iloc[0]
        gender = group['Gender'].iloc[0]
        condition_descriptions = aggregate_list(group['Condition_Description'])
        condition_codes = aggregate_list(group['Condition_Code'])
        condition_umls_codes = aggregate_codes(group['Condition_UMLS_CODES'])
        medication_descriptions = aggregate_list(group['Medication_Description'])
        medication_codes = aggregate_list(group['Medication_Code'])
        medication_umls_codes = aggregate_codes(group['Medication_UMLS_CODES'])

        # Create code-description maps
        condition_code_desc_map = create_code_description_map(group['Condition_UMLS_CODES'], group['Condition_Description'])
        medication_code_desc_map = create_code_description_map(group['Medication_UMLS_CODES'], group['Medication_Description'])

        patient_data_list.append({
            'PatientID': patient_id,
            'Age': age,
            'Gender': gender,
            'Condition_Description': condition_descriptions,
            'Condition_Code': condition_codes,
            'Condition_UMLS_CODES': condition_umls_codes,
            'Condition_Code_Description_Map': condition_code_desc_map,
            'Medication_Description': medication_descriptions,
            'Medication_Code': medication_codes,
            'Medication_UMLS_CODES': medication_umls_codes,
            'Medication_Code_Description_Map': medication_code_desc_map
        })

    patient_profiles_grouped = pd.DataFrame(patient_data_list)

    # save both dataframes to csv
    patient_profiles_grouped.to_csv('data/patient/patient_profiles_grouped.csv', index=False)
    conditions_medications.to_csv('data/annotated_synthea/conditions_medications.csv', index=False)

    return patient_profiles_grouped

def is_healthy_volunteer(patient):
    """
    Determine if a patient is a healthy volunteer.
    """
    return len(patient['Condition_Description']) == 0

def perform_exclusion_criteria_check(patient, trial):
    """
    Check exclusion criteria and return match status and details.
    """
    exclusion_match = True  # Assume match until proven otherwise
    exclusion_logs = []

    # Prepare patient's condition and medication codes
    patient_condition_codes = set(patient['Condition_UMLS_CODES'])
    patient_medication_codes = set(patient['Medication_UMLS_CODES'])

    # Iterate over exclusion criteria
    for idx, crit in enumerate(trial['Exclusion_Criteria']):
        criterion_match = True  # Assume criterion matches
        criterion_logs = [f"   - Criterion {idx+1}: {crit}"]

        # Get codes associated with the criterion
        criterion_umls_codes = set(trial['Exclusion_Criteria_UMLS_Codes'].get(crit, []))

        # Check condition codes
        overlapping_condition_codes = patient_condition_codes.intersection(criterion_umls_codes)
        if overlapping_condition_codes:
            criterion_match = False
            for code in overlapping_condition_codes:
                descriptions = patient['Condition_Code_Description_Map'].get(code, [])
                descriptions_str = ', '.join(descriptions)
                criterion_logs.append(f"      + [{code}] {descriptions_str}")
        
        # Check medication codes
        overlapping_medication_codes = patient_medication_codes.intersection(criterion_umls_codes)
        if overlapping_medication_codes:
            criterion_match = False
            for code in overlapping_medication_codes:
                descriptions = patient['Medication_Code_Description_Map'].get(code, [])
                descriptions_str = ', '.join(descriptions)
                criterion_logs.append(f"      + [{code}] {descriptions_str}")

        # Append match status for the criterion
        criterion_logs[0] += f" = {'MATCH' if criterion_match else 'NO MATCH'}"
        if not criterion_match:
            exclusion_match = False

        exclusion_logs.extend(criterion_logs)

    return exclusion_match, exclusion_logs

def match_patients_to_trials(
        patient_profiles_grouped,
        df_trials,
        code_definitions_df,
        chunk_size=100,
        debug=False,
        verbose=True,
        max_api_workers=10,
        output_json_format='both',
        patient_start_chunk=0,
    ):
    """
    Match patients to clinical trials based on eligibility criteria, with API integration.

    Parameters:
        patient_profiles_grouped (pd.DataFrame): DataFrame containing patient profiles.
        df_trials (pd.DataFrame): DataFrame containing trial information.
        code_definitions_df (pd.DataFrame): DataFrame with code definitions.
        chunk_size (int): Number of patients and trials to process per chunk.
        debug (bool): If True, generate detailed logs for each patient-trial match.
        verbose (bool): If True, logging and progress bars are enabled.
        max_api_workers (int): Maximum number of concurrent API calls.
        output_json_format (str): Options to include in output ('match', 'no_match', 'both').
        patient_start_chunk (int): Index of the starting patient chunk.

    Returns:
        list: List of dictionaries containing patient IDs and their eligible trials.
    """

    # Set up logging to the specified log file
    datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filename = f"data/output/pt_matches_progress_{datetime_str}.log"

    # Set up the logger for use in Jupyter
    logger = setup_logger_for_jupyter(log_filename, debug=debug)

    if not verbose:
        logger.disabled = True

    # Initialize tqdm with pandas
    tqdm.pandas()

    # Prepare trial data
    df_trials = prepare_trial_data(df_trials)
    logger.info("Trial data prepared.")

    # Initialize the final results list
    final_results = []

    # Mapping to store match results for each patient-trial pair
    match_results_mapping = {}

    patient_chunk_size = 50

    # Process patients in chunks with progress bar
    for patient_start in tqdm(range(patient_start_chunk, len(patient_profiles_grouped), patient_chunk_size), desc="Patient Chunks"):
        patient_chunk = patient_profiles_grouped.iloc[patient_start:patient_start + patient_chunk_size]
        logger.info(f"Processing patient chunk starting at index {patient_start}")

        # Process trials in chunks with progress bar
        for trial_start in tqdm(range(0, len(df_trials), chunk_size), desc="Trial Chunks", leave=False):
            trial_chunk = df_trials.iloc[trial_start:trial_start + chunk_size]
            logger.info(f"Processing trial chunk starting at index {trial_start}")

            # Create cartesian product and apply matching function
            patient_trial_pairs = patient_chunk.assign(key=1).merge(
                trial_chunk.assign(key=1), on='key').drop('key', axis=1)

            # Apply matching function with progress bar
            match_results = patient_trial_pairs.progress_apply(
                lambda row: evaluate_patient_trial_match_w_api(
                    row, code_definitions_df=code_definitions_df, debug=debug), axis=1)

            patient_trial_pairs['Match_Result'] = match_results

            # Collect results and prepare API call tasks
            for idx, row in patient_trial_pairs.iterrows():
                patient_id = row['PatientID']
                trial_id = row['NCTId']
                match_result = row['Match_Result']

                # Store match result for later use
                match_results_mapping[(patient_id, trial_id)] = match_result

                if match_result['is_match']:
                    # Prepare initial eligible trial entry
                    trial_entry = {
                        'trialId': trial_id,
                        'trialName': row['Title'],
                        'eligibilityCriteriaMet': match_result['eligibilityCriteriaMet']
                    }

                    # Add to final results
                    existing_patient = next((item for item in final_results if item['patientId'] == patient_id), None)
                    if existing_patient:
                        existing_patient['eligibleTrials'].append(trial_entry)
                    else:
                        final_results.append({
                            'patientId': patient_id,
                            'eligibleTrials': [trial_entry]
                        })

        # Save results after processing each patient chunk
        output_file_path = f'data/output/patient_trial_matches_chunk_{patient_start}.json'
        save_results_to_json(final_results, output_file_path, logger)

        # Clear final_results to free memory
        final_results.clear()

    logger.info("All patient chunks processed. Returning final results.")
    return final_results

def save_results_to_json(results, output_file_path, logger):
    """
    Save the results to a JSON file, appending if the file already exists.

    Parameters:
        results (list): The results to save.
        output_file_path (str): The path to the output JSON file.
        logger (logging.Logger): The logger instance.
    """
    try:
        # Save the results to the JSON file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)

        logger.info(f"Results saved to {output_file_path}.")
    except Exception as e:
        logger.error(f"Failed to save results to {output_file_path}: {e}")

def update_logs_and_format_output(final_results, api_results, match_results_mapping, output_json_format='both', debug=False):
    """
    Update logs based on API results and format the final JSON output.

    Parameters:
        final_results (list): List of dictionaries containing patient IDs and their eligible trials.
        api_results (dict): Dictionary mapping (patient_id, trial_id) to API output.
        match_results_mapping (dict): Dictionary mapping (patient_id, trial_id) to initial match results.
        output_json_format (str): Options to include in output ('match', 'no_match', 'both').
        debug (bool): If True, additional debug information will be logged.

    Returns:
        list: Updated list of dictionaries containing patient IDs and their eligible trials.
    """
    import os
    import re
    from glob import glob

    updated_final_results = []

    for patient_entry in final_results:
        patient_id = patient_entry['patientId']
        updated_eligible_trials = []

        for trial_entry in patient_entry['eligibleTrials']:
            trial_id = trial_entry['trialId']
            key = (patient_id, trial_id)

            # Get initial match result
            initial_match_result = match_results_mapping.get(key)
            initial_is_match = initial_match_result.get('is_match') if initial_match_result else None

            if key in api_results:
                api_result = api_results[key]

                # Update 'eligibilityCriteriaMet' with concatenated lists from API output
                eligibility_criteria_met = (
                    api_result.get('inclusionCriteriaMet', []) +
                    api_result.get('exclusionCriteriaMet', [])
                )
                trial_entry['eligibilityCriteriaMet'] = eligibility_criteria_met

                # Determine if the patient is eligible based on API result
                patient_is_eligible = api_result.get('patientIsEligibleForTrial', False)

                # Update log file
                log_dir = os.path.join('logs', patient_id)
                # Find the original log file
                log_files_pattern = os.path.join(log_dir, f"*{trial_id}_*.log")
                log_files = glob(log_files_pattern)
                if log_files:
                    original_log_filepath = log_files[0]
                    # Read the original log content
                    with open(original_log_filepath, 'r', encoding='utf-8') as f:
                        log_content = f.read()

                    # Append the API response to the logs
                    api_check_lines = ["API Check:"]
                    # Pretty-print the API response
                    for key_resp, value_resp in api_result.items():
                        if isinstance(value_resp, list):
                            api_check_lines.append(f"{key_resp}:")
                            for item in value_resp:
                                api_check_lines.append(f"\t- {item}")
                        else:
                            api_check_lines.append(f"{key_resp}: {value_resp}")
                    api_check_text = '\n'.join(api_check_lines)
                    log_content += '\n' + api_check_text

                    # Determine if file name needs to be updated
                    initial_status = 'MATCH' if initial_is_match else 'NO_MATCH'
                    updated_status = 'MATCH' if patient_is_eligible else 'NO_MATCH'

                    if initial_status != updated_status:
                        # Remove the initial status prefix from the filename
                        original_log_filename = os.path.basename(original_log_filepath)
                        original_log_filename = re.sub(r'^\[.*?\]_','', original_log_filename)
                        # Construct new file name with updated status
                        new_log_filename = f'[{updated_status}]_{original_log_filename}'
                        new_log_filepath = os.path.join(log_dir, new_log_filename)
                        # Rename the file
                        os.rename(original_log_filepath, new_log_filepath)
                        # Write the updated log content to the renamed file
                        with open(new_log_filepath, 'w', encoding='utf-8') as f:
                            f.write(log_content)
                    else:
                        # Status didn't change, overwrite the original file
                        with open(original_log_filepath, 'w', encoding='utf-8') as f:
                            f.write(log_content)
                else:
                    # Log file not found; handle this case if necessary
                    if debug:
                        print(f"Log file not found for patient {patient_id} and trial {trial_id}.")

                # Decide whether to include this trial based on 'output_json_format'
                if output_json_format == 'both':
                    updated_eligible_trials.append(trial_entry)
                elif output_json_format == 'match' and patient_is_eligible:
                    updated_eligible_trials.append(trial_entry)
                elif output_json_format == 'no_match' and not patient_is_eligible:
                    updated_eligible_trials.append(trial_entry)
                # If not matching the criteria, do not include

            else:
                # No API result for this trial; include based on initial match
                # Get initial match status
                if initial_is_match is not None:
                    if output_json_format == 'both':
                        updated_eligible_trials.append(trial_entry)
                    elif output_json_format == 'match' and initial_is_match:
                        updated_eligible_trials.append(trial_entry)
                    elif output_json_format == 'no_match' and not initial_is_match:
                        updated_eligible_trials.append(trial_entry)

        if updated_eligible_trials:
            updated_final_results.append({
                'patientId': patient_id,
                'eligibleTrials': updated_eligible_trials
            })

    return updated_final_results

def setup_logger_for_jupyter(log_filename, debug=False):
    """
    Sets up the logger to log into a file in a Jupyter environment.
    Removes existing handlers to avoid conflicts.
    """
    # Create a logger
    logger = logging.getLogger('match_patients_to_trials_w_api')

    # Clear existing handlers to prevent multiple outputs in Jupyter
    if logger.hasHandlers():
        logger.handlers.clear()

    # Set logging level
    logger.setLevel(logging.DEBUG if debug else logging.INFO)

    # Create a file handler which logs even debug messages
    fh = logging.FileHandler(log_filename)
    fh.setLevel(logging.DEBUG if debug else logging.INFO)

    # Create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)

    # Add the file handler to the logger (logging only to the file now)
    logger.addHandler(fh)

    return logger

def match_patients_to_trials_w_api(
        patient_profiles_grouped,
        df_trials,
        code_definitions_df,
        chunk_size=100,
        debug=False,
        verbose=True,
        max_api_workers=10,
        output_json_format='both'
    ):
    """
    Match patients to clinical trials based on eligibility criteria, with API integration.

    Parameters:
        patient_profiles_grouped (pd.DataFrame): DataFrame containing patient profiles.
        df_trials (pd.DataFrame): DataFrame containing trial information.
        code_definitions_df (pd.DataFrame): DataFrame with code definitions.
        chunk_size (int): Number of patients and trials to process per chunk.
        debug (bool): If True, generate detailed logs for each patient-trial match.
        verbose (bool): If True, logging and progress bars are enabled.
        max_api_workers (int): Maximum number of concurrent API calls.
        output_json_format (str): Options to include in output ('match', 'no_match', 'both').

    Returns:
        list: List of dictionaries containing patient IDs and their eligible trials.
    """

    # Set up logging to the specified log file
    datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_filename = f"data/output/pt_matches_progress_{datetime_str}.log"

    # Set up the logger for use in Jupyter
    logger = setup_logger_for_jupyter(log_filename, debug=debug)

    if not verbose:
        logger.disabled = True

    # Initialize tqdm with pandas
    tqdm.pandas()

    # Prepare trial data
    df_trials = prepare_trial_data(df_trials)
    logger.info("Trial data prepared.")

    # Initialize the final results list
    final_results = []

    # Initialize list to collect API call tasks
    api_call_tasks = []

    # Mapping to store match results for each patient-trial pair
    match_results_mapping = {}

    patient_chunk_size = 50

    # Process patients in chunks with progress bar
    for patient_start in tqdm(range(0, len(patient_profiles_grouped), patient_chunk_size), desc="Patient Chunks"):
        patient_chunk = patient_profiles_grouped.iloc[patient_start:patient_start + patient_chunk_size]
        logger.info(f"Processing patient chunk starting at index {patient_start}")

        # Process trials in chunks with progress bar
        for trial_start in tqdm(range(0, len(df_trials), chunk_size), desc="Trial Chunks", leave=False):
            trial_chunk = df_trials.iloc[trial_start:trial_start + chunk_size]
            logger.info(f"Processing trial chunk starting at index {trial_start}")

            # Create cartesian product and apply matching function
            patient_trial_pairs = patient_chunk.assign(key=1).merge(
                trial_chunk.assign(key=1), on='key').drop('key', axis=1)

            # Apply matching function with progress bar
            match_results = patient_trial_pairs.progress_apply(
                lambda row: evaluate_patient_trial_match_w_api(
                    row, code_definitions_df=code_definitions_df, debug=debug), axis=1)

            patient_trial_pairs['Match_Result'] = match_results

            # Collect results and prepare API call tasks
            for idx, row in patient_trial_pairs.iterrows():
                patient_id = row['PatientID']
                trial_id = row['NCTId']
                match_result = row['Match_Result']

                # Store match result for later use
                match_results_mapping[(patient_id, trial_id)] = match_result

                if match_result['is_match']:
                    # Prepare initial eligible trial entry
                    trial_entry = {
                        'trialId': trial_id,
                        'trialName': row['Title'],
                        'eligibilityCriteriaMet': match_result['eligibilityCriteriaMet']
                    }

                    # Add to final results
                    existing_patient = next((item for item in final_results if item['patientId'] == patient_id), None)
                    if existing_patient:
                        existing_patient['eligibleTrials'].append(trial_entry)
                    else:
                        final_results.append({
                            'patientId': patient_id,
                            'eligibleTrials': [trial_entry]
                        })

                # Check if API call is needed
                if match_result['need_api_call']:
                    # Prepare arguments for the API call
                    log_text = match_result['log_text']
                    # Enqueue API call
                    api_call_tasks.append((patient_id, trial_id, log_text))

        output_file_path = 'data/output/patient_trial_matches.json'
        save_results_to_json(final_results, output_file_path, logger)

    # Log the number of API calls to be made
    total_api_calls = len(api_call_tasks)
    logger.info(f"Total API calls to be made: {total_api_calls}")

    # Process API calls in parallel with progress bar
    api_results = {}  # To store API results
    api_call_counter = 0  # To count the number of API calls made

    # Prepare output file path
    output_file_path = 'data/output/patient_trial_matches.json'

    # if api_call_tasks:
    #     def process_api_call(task):
    #         patient_id, trial_id, log_text = task
    #         # Call the OpenAI API
    #         api_result = call_openai(log_text)
    #         # Return the result along with identifiers
    #         return patient_id, trial_id, api_result

    #     with ThreadPoolExecutor(max_workers=max_api_workers) as executor:
    #         futures = [executor.submit(process_api_call, task) for task in api_call_tasks]
    #         for future in tqdm(as_completed(futures), total=len(futures), desc="API Calls"):
    #             try:
    #                 patient_id, trial_id, api_result = future.result()
    #                 # Store the API result
    #                 api_results[(patient_id, trial_id)] = api_result
    #                 logger.info(f"API call completed for patient {patient_id} and trial {trial_id}")

    #                 # Increment API call counter
    #                 api_call_counter += 1

    #                 # Every 1000 API calls, save the current results
    #                 if api_call_counter % 1000 == 0 or api_call_counter == total_api_calls:
    #                     logger.info(f"Saving results after {api_call_counter} API calls.")
    #                     # Update logs and format output based on API results
    #                     temp_results = update_logs_and_format_output(
    #                         final_results=final_results,
    #                         api_results=api_results,
    #                         match_results_mapping=match_results_mapping,
    #                         output_json_format=output_json_format,
    #                         debug=debug
    #                     )
    #                     # Save the updated results to the JSON file
    #                     save_results_to_json(temp_results, output_file_path, logger)

    #             except Exception as e:
    #                 logger.error(f"API call failed: {e}")

    #     # After all API calls are done, ensure all results are saved
    #     logger.info("All API calls completed. Saving final results.")
    #     final_results = update_logs_and_format_output(
    #         final_results=final_results,
    #         api_results=api_results,
    #         match_results_mapping=match_results_mapping,
    #         output_json_format=output_json_format,
    #         debug=debug
    #     )
        # save_results_to_json(final_results, output_file_path, logger)

    # else:
    #     logger.info("No API calls needed.")
    #     # Save the final results without API updates
    #     save_results_to_json(final_results, output_file_path, logger)
    
    
    return final_results

def evaluate_patient_trial_match_w_api(row, code_definitions_df, debug=False):
    """
    Evaluate if a patient matches a trial's eligibility criteria with detailed messages.

    Parameters:
        row (pd.Series): Series containing patient and trial data.
        code_definitions_df (pd.DataFrame): DataFrame with code definitions.
        debug (bool): If True, generate logs for the match evaluation.

    Returns:
        dict: Dictionary containing match status, detailed criteria messages, and log text.
    """
    import os
    from datetime import datetime
    import numpy as np

    # print('ROw:', row)

    # Extract patient and trial data from the row
    patient_id = row['PatientID']
    patient_age = row['Age']
    patient_gender = row['Gender']
    is_healthy = is_healthy_volunteer_row(row)
    trial_id = row['NCTId']
    trial_title = row['Title']
    min_age = row['Minimum_Age']
    max_age = row['Maximum_Age']
    trial_sex = row['Sex']
    healthy_volunteers = row['Healthy_Volunteers']
    inclusion_criteria = row['Inclusion_Criteria']
    exclusion_criteria = row['Exclusion_Criteria']
    inclusion_codes_mapping = row['Inclusion_Criteria_UMLS_Codes']
    exclusion_codes_mapping = row['Exclusion_Criteria_UMLS_Codes']
    condition_codes = row['Condition_UMLS_CODES']
    medication_codes = row['Medication_UMLS_CODES']
    condition_desc_map = row['Condition_Code_Description_Map']
    medication_desc_map = row['Medication_Code_Description_Map']

    # Initialize match status and eligibility criteria messages
    is_match = True
    eligibility_criteria_met = []
    eligibility_criteria_not_met = []

    # Initialize logs
    log_lines = []

    # 1. Age check
    # print('MIN AGE:', min_age, ', MAX AGE:', max_age)
    min_age_display = min_age if not np.isnan(min_age) else 'Not Specified'
    max_age_display = max_age if not np.isnan(max_age) else 'Not Specified'
    age_match = is_age_match(patient_age, min_age, max_age)
    if age_match:
        message = f"Patient Age ({patient_age}) matches Expected Range ({min_age_display}, {max_age_display})"
        eligibility_criteria_met.append(message)
    else:
        message = f"Patient Age ({patient_age}) does not match Expected Range ({min_age_display}, {max_age_display})"
        eligibility_criteria_not_met.append(message)
        is_match = False
    log_lines.append(f"1. {message}")

    # 2. Sex check
    trial_sex_display = trial_sex if trial_sex else 'Not Specified'
    sex_match = is_sex_match(patient_gender, trial_sex)
    if sex_match:
        message = f"Patient Sex ({patient_gender}) matches Trial Sex Requirement ({trial_sex_display})"
        eligibility_criteria_met.append(message)
    else:
        message = f"Patient Sex ({patient_gender}) does not match Trial Sex Requirement ({trial_sex_display})"
        eligibility_criteria_not_met.append(message)
        is_match = False
    log_lines.append(f"2. {message}")

    # 3. Health status check
    health_status_display = 'Healthy Volunteers' if healthy_volunteers else 'Patients'
    health_match = True
    if healthy_volunteers and not is_healthy:
        health_match = False
    if health_match:
        message = f"Patient Health Status ({'Healthy' if is_healthy else 'Not Healthy'}) matches Trial Requirement ({health_status_display})"
        eligibility_criteria_met.append(message)
    else:
        message = f"Patient Health Status ({'Healthy' if is_healthy else 'Not Healthy'}) does not match Trial Requirement ({health_status_display})"
        eligibility_criteria_not_met.append(message)
        is_match = False
    log_lines.append(f"3. {message}")

    # Initialize variable to indicate if we need to call API
    need_api_call = False

    # Proceed to Exclusion Criteria check
    exclusion_match = True
    exclusion_logs = []
    num_exclusion_reasons = 0
    overlapping_codes = set()

    # Patient codes as sets
    patient_condition_codes = set(condition_codes)
    patient_medication_codes = set(medication_codes)

    # Iterate over exclusion criteria
    for idx, crit in enumerate(exclusion_criteria):
        criterion_match = True  # Assume criterion matches
        criterion_logs = [f"   - Criterion {idx+1}: {crit}"]

        # Get codes associated with the criterion
        criterion_codes = set(exclusion_codes_mapping.get(crit, []))

        # Check condition codes
        overlapping_condition_codes = patient_condition_codes.intersection(criterion_codes)
        if overlapping_condition_codes:
            criterion_match = False
            num_exclusion_reasons += 1
            overlapping_codes.update(overlapping_condition_codes)
            for code in overlapping_condition_codes:
                descriptions = condition_desc_map.get(code, [])
                descriptions_str = ', '.join(descriptions) if descriptions else ''
                criterion_logs.append(f"      + Overlapping Condition Code [{code}]: {descriptions_str}")

        # Check medication codes
        overlapping_medication_codes = patient_medication_codes.intersection(criterion_codes)
        if overlapping_medication_codes:
            criterion_match = False
            num_exclusion_reasons += 1
            overlapping_codes.update(overlapping_medication_codes)
            for code in overlapping_medication_codes:
                descriptions = medication_desc_map.get(code, [])
                descriptions_str = ', '.join(descriptions) if descriptions else ''
                criterion_logs.append(f"      + Overlapping Medication Code [{code}]: {descriptions_str}")

        # Append match status for the criterion
        if not criterion_match:
            message = f"Patient has overlapping conditions/medications with exclusion criterion: {crit}"
            eligibility_criteria_not_met.append(message)
            exclusion_match = False
        else:
            message = f"Patient has no overlapping conditions/medications with exclusion criterion: {crit}"
            eligibility_criteria_met.append(message)

        criterion_logs[0] += f" = {'MATCH' if criterion_match else 'NO MATCH'}"
        exclusion_logs.extend(criterion_logs)

    log_lines.append("4. Exclusion Criteria:")
    log_lines.extend(exclusion_logs)

    if exclusion_match:
        log_lines.append("   - Exclusion Criteria Check: MATCH")
        # Since there is a match, we need to call the API to fact-check
        need_api_call = True
    else:
        log_lines.append("   - Exclusion Criteria Check: NO MATCH")
        if num_exclusion_reasons < 2:
            # Less than 2 reasons for exclusion, call the API to fact-check
            need_api_call = True
        else:
            need_api_call = False
        is_match = False  # Patient is not eligible due to exclusion criteria

    # Combine log lines into 'log_text'
    log_text = '\n'.join(log_lines)

    # If debug is True, write logs to file
    if debug:
        # Prepare patient-specific log directory
        patient_log_dir = os.path.join('logs', patient_id)
        os.makedirs(patient_log_dir, exist_ok=True)

        # Prepare log file name
        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_filename = f"{trial_id}_{datetime_str}.log"
        if is_match:
            log_filename = f"[MATCH]_{log_filename}"

        # Write logs to file
        log_filepath = os.path.join(patient_log_dir, log_filename)
        with open(log_filepath, 'w', encoding='utf-8') as f:
            f.write(log_text)

    # Return the updated match result dictionary
    return {
        'is_match': is_match,
        'eligibilityCriteriaMet': eligibility_criteria_met,
        'eligibilityCriteriaNotMet': eligibility_criteria_not_met,
        'need_api_call': need_api_call,
        'log_text': log_text
    }

def perform_exclusion_criteria_check_row_w_api(row):
    """
    Check exclusion criteria for a patient-trial pair.

    Parameters:
        row (pd.Series): Series containing patient and trial data.

    Returns:
        tuple: (exclusion_match (bool), exclusion_logs (list), num_exclusion_reasons (int), overlapping_codes (set))
    """
    # Initialize match status
    exclusion_match = True  # Assume match until proven otherwise
    exclusion_logs = []
    num_exclusion_reasons = 0
    overlapping_codes = set()  # To collect overlapping UMLS codes

    # Extract necessary data from row
    exclusion_criteria = row['Exclusion_Criteria']
    exclusion_codes_mapping = row['Exclusion_Criteria_UMLS_Codes']
    condition_codes = set(row['Condition_UMLS_CODES'])
    medication_codes = set(row['Medication_UMLS_CODES'])
    condition_desc_map = row['Condition_Code_Description_Map']
    medication_desc_map = row['Medication_Code_Description_Map']

    # Iterate over exclusion criteria
    for idx, crit in enumerate(exclusion_criteria):
        criterion_match = True  # Assume criterion matches
        criterion_logs = [f"   - Criterion {idx+1}: {crit}"]

        # Get codes associated with the criterion
        criterion_codes = set(exclusion_codes_mapping.get(crit, []))

        # Check for overlapping condition codes
        overlapping_condition_codes = condition_codes.intersection(criterion_codes)
        if overlapping_condition_codes:
            criterion_match = False
            num_exclusion_reasons += 1
            overlapping_codes.update(overlapping_condition_codes)
            for code in overlapping_condition_codes:
                descriptions = condition_desc_map.get(code, [])
                descriptions_str = ', '.join(descriptions) if descriptions else ''
                criterion_logs.append(f"      + [{code}] {descriptions_str}")

        # Check for overlapping medication codes
        overlapping_medication_codes = medication_codes.intersection(criterion_codes)
        if overlapping_medication_codes:
            criterion_match = False
            num_exclusion_reasons += 1
            overlapping_codes.update(overlapping_medication_codes)
            for code in overlapping_medication_codes:
                descriptions = medication_desc_map.get(code, [])
                descriptions_str = ', '.join(descriptions) if descriptions else ''
                criterion_logs.append(f"      + [{code}] {descriptions_str}")

        # Append match status for the criterion
        criterion_logs[0] += f" = {'MATCH' if criterion_match else 'NO MATCH'}"
        if not criterion_match:
            exclusion_match = False

        exclusion_logs.extend(criterion_logs)

    return exclusion_match, exclusion_logs, num_exclusion_reasons, overlapping_codes

def evaluate_patient_trial_match(row, debug=False):
    """
    Evaluate if a patient matches a trial's eligibility criteria.

    Parameters:
        row (pd.Series): Series containing patient and trial data.
        debug (bool): If True, generate logs for the match evaluation.

    Returns:
        dict: Dictionary containing match status and criteria met.
    """
    # Extract patient and trial data from the row
    patient_id = row['PatientID']
    patient_age = row['Age']
    patient_gender = row['Gender']
    is_healthy = is_healthy_volunteer_row(row)
    trial_id = row['NCTId']
    trial_title = row['Title']
    min_age = row['Minimum_Age']
    max_age = row['Maximum_Age']
    trial_sex = row['Sex']
    healthy_volunteers = row['Healthy_Volunteers']

    # Initialize match status and eligibility criteria met
    is_match = True
    eligibility_criteria_met = []

    # Initialize logs if debug is True
    log_lines = []
    if debug:
        log_lines.append(f"Patient ID: {patient_id}")
        log_lines.append(f"Trial ID: {trial_id}")
        log_lines.append("Match Check:")

    # 1. Age check
    age_match = is_age_match(patient_age, min_age, max_age)
    if age_match:
        eligibility_criteria_met.append('Age')
    else:
        is_match = False
    if debug:
        log_lines.append(f"1. Age = {patient_age} vs [{min_age}, {max_age}] = {'MATCH' if age_match else 'NO MATCH'}")

    # 2. Sex check
    sex_match = is_sex_match(patient_gender, trial_sex)
    if sex_match:
        eligibility_criteria_met.append('Sex')
    else:
        is_match = False
    if debug:
        log_lines.append(f"2. Sex = {patient_gender} vs {trial_sex} = {'MATCH' if sex_match else 'NO MATCH'}")

    # 3. Health check
    health_match = True
    if healthy_volunteers and not is_healthy:
        health_match = False
    if health_match:
        eligibility_criteria_met.append('Health Status')
    else:
        is_match = False
    if debug:
        log_lines.append(f"3. Health = {'Healthy' if is_healthy else 'Not Healthy'} vs {'Healthy Volunteers' if healthy_volunteers else 'Patients'} = {'MATCH' if health_match else 'NO MATCH'}")

    # Proceed only if basic checks passed
    if is_match:
        # 4. Exclusion Criteria check
        exclusion_match, exclusion_logs = perform_exclusion_criteria_check_row(row)
        if exclusion_match:
            eligibility_criteria_met.append('Exclusion Criteria')
        else:
            is_match = False
        if debug:
            log_lines.append("4. Exclusion Criteria:")
            log_lines.extend(exclusion_logs)

    # Generate logs if debug is True
    if debug:
        # Prepare patient-specific log directory
        patient_log_dir = os.path.join('logs', patient_id)
        os.makedirs(patient_log_dir, exist_ok=True)

        # Prepare log file name
        datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_filename = f"{trial_id}_{datetime_str}.log"
        if is_match:
            log_filename = f"[MATCH]_{log_filename}"

        # Write logs to file
        log_filepath = os.path.join(patient_log_dir, log_filename)
        with open(log_filepath, 'w', encoding='utf-8') as f:
            for line in log_lines:
                f.write(line + '\n')

    return {'is_match': is_match, 'eligibilityCriteriaMet': eligibility_criteria_met}

def is_healthy_volunteer_row(row):
    """
    Determine if a patient is a healthy volunteer based on row data.
    """
    return len(row['Condition_Description']) == 0

def perform_exclusion_criteria_check_row(row):
    """
    Check exclusion criteria for a patient-trial pair.

    Parameters:
        row (pd.Series): Series containing patient and trial data.

    Returns:
        tuple: (exclusion_match (bool), exclusion_logs (list))
    """
    exclusion_match = True  # Assume match until proven otherwise
    exclusion_logs = []

    # Patient codes
    patient_condition_codes = set(row['Condition_UMLS_CODES'])
    patient_medication_codes = set(row['Medication_UMLS_CODES'])

    # Iterate over exclusion criteria
    exclusion_criteria = row['Exclusion_Criteria']
    exclusion_codes_mapping = row['Exclusion_Criteria_UMLS_Codes']

    for idx, crit in enumerate(exclusion_criteria):
        criterion_match = True  # Assume criterion matches
        criterion_logs = [f"   - Criterion {idx+1}: {crit}"]

        # Get codes associated with the criterion
        criterion_codes = set(exclusion_codes_mapping.get(crit, []))

        # Check condition codes
        overlapping_condition_codes = patient_condition_codes.intersection(criterion_codes)
        if overlapping_condition_codes:
            criterion_match = False
            for code in overlapping_condition_codes:
                descriptions = row['Condition_Code_Description_Map'].get(code, [])
                descriptions_str = ', '.join(descriptions)
                criterion_logs.append(f"      + [{code}] {descriptions_str}")

        # Check medication codes
        overlapping_medication_codes = patient_medication_codes.intersection(criterion_codes)
        if overlapping_medication_codes:
            criterion_match = False
            for code in overlapping_medication_codes:
                descriptions = row['Medication_Code_Description_Map'].get(code, [])
                descriptions_str = ', '.join(descriptions)
                criterion_logs.append(f"      + [{code}] {descriptions_str}")

        # Append match status for the criterion
        criterion_logs[0] += f" = {'MATCH' if criterion_match else 'NO MATCH'}"
        if not criterion_match:
            exclusion_match = False

        exclusion_logs.extend(criterion_logs)

    return exclusion_match, exclusion_logs

def prepare_trial_data(df_trials):
    """
    Preprocess trial data to ensure proper data types.
    """
    # Convert age fields to integers
    df_trials['Minimum_Age'] = df_trials['Minimum_Age'].apply(convert_age)
    df_trials['Maximum_Age'] = df_trials['Maximum_Age'].apply(convert_age)
    # Ensure 'Healthy_Volunteers' is boolean
    df_trials['Healthy_Volunteers'] = df_trials['Healthy_Volunteers'].apply(lambda x: x.lower() == 'yes' if isinstance(x, str) else False)
    return df_trials

def convert_age(age_str):
    """
    Convert age strings to integers.
    """
    # if age is not str
    if not isinstance(age_str, str):
        return age_str
    if pd.isna(age_str) or age_str in ['Not Specified', 'N/A']:
        return np.nan
    try:
        return int(age_str.split()[0])
    except:
        return np.nan

def is_age_match(patient_age, min_age, max_age):
    """
    Check if patient's age matches trial's age criteria.
    """
    if pd.isna(min_age):
        min_age = 0
    if pd.isna(max_age):
        max_age = 120
    return min_age <= patient_age <= max_age

def is_sex_match(patient_gender, trial_gender):
    """
    Check if patient's gender matches trial's sex criteria.
    """
    if trial_gender.upper() == "ALL":
        return True
    return patient_gender.upper() == trial_gender.upper()


In [10]:
# df_trials = fetch_n_trials(1000)
df_trials = fetch_n_trials_v2(n=None, max_workers=8, verbose=True, reload=False)
# ran on 10/4/2024
# took 438mins to run which is 7.3 hours
# processing with lg model took most time. # can use md model or sm model 

In [11]:
# Prepare patient data 
from datetime import datetime
# using reload will take 2-3 minutes to reload data and annotate, use this only if the synthea data has changed due to scaling
patient_profiles_grouped = prepare_patient_data(reload=False)

# Fetch trials and prepare trial data
# df_trials = fetch_n_trials(1000)

In [12]:
# df_trials load the codes column as it curr string, convert to dict
# 1 min to run for 67k trials
df_trials['Inclusion_Criteria_UMLS_Codes'] = df_trials['Inclusion_Criteria_UMLS_Codes'].apply(lambda x: eval(x))
df_trials['Exclusion_Criteria_UMLS_Codes'] = df_trials['Exclusion_Criteria_UMLS_Codes'].apply(lambda x: eval(x))
df_trials['Inclusion_Criteria'] = df_trials['Inclusion_Criteria'].apply(lambda x: eval(x))
df_trials['Exclusion_Criteria'] = df_trials['Exclusion_Criteria'].apply(lambda x: eval(x))

In [13]:
import numpy as np
# Collect UMLS codes from patient data
patient_condition_codes = set(patient_profiles_grouped['Condition_UMLS_CODES'].explode())
patient_medication_codes = set(patient_profiles_grouped['Medication_UMLS_CODES'].explode())

# Collect UMLS codes from trial data
trial_inclusion_codes = set()
trial_exclusion_codes = set()

for codes_mapping in df_trials['Inclusion_Criteria_UMLS_Codes']:
    for codes in codes_mapping.values():
        trial_inclusion_codes.update(codes)

for codes_mapping in df_trials['Exclusion_Criteria_UMLS_Codes']:
    for codes in codes_mapping.values():
        trial_exclusion_codes.update(codes)

# Combine all unique codes
all_unique_codes = patient_condition_codes.union(
    patient_medication_codes,
    trial_inclusion_codes,
    trial_exclusion_codes
)

# Remove any NaN or None values
all_unique_codes.discard(np.nan)
all_unique_codes.discard(None)

code_definitions_df = extract_umls_code_definitions(
    unique_codes=all_unique_codes,
    # nlp_umls_conditions=nlp_umls_link,
    # nlp_umls_medications=nlp_rxnorm_link
    nlp_umls_conditions=None,
    nlp_umls_medications=None,
    reload=False
)

In [121]:
spacy_umls_codes_df

Unnamed: 0,code,canonical_name,definition,aliases,types
0,C0700087,Tegretol,a kind of epilepsy treatment drug,"[tegretol, Tegretol, TEGretol]","[T109, T121]"
1,C5555091,60 Months,A period of time of sixty months.,[],[T079]
2,C4085349,Medial collateral ligament,,"[Medial collateral ligament complex, Medial co...",[T023]
3,C0700201,Dyssomnias,A broad category of sleep disorders characteri...,"[Dyssomnia, NOS, Dyssomnia (disorder), sleep d...",[T048]
4,C1160964,Fracture care,Actions performed to control broken bones,"[Fracture Care, care fractures, care fracture,...",[T061]
...,...,...,...,...,...
1888,C0030705,Patients,Individuals participating in the health care s...,"[patients, patient, Patient, Patient (person),...",[T101]
1889,C2827257,lisinopril anhydrous,The anhydrous form of the long-acting angioten...,"[L-Proline, 1-(N(Sup 2)-(1-Carboxy-3-Phenylpro...","[T116, T121]"
1890,C0242477,Refugee Camps,"A temporary, often makeshift shelter for perso...","[Refugee Camp, refugee camps, Camps, Refugee, ...",[T082]
1891,C0439230,week,Any period of seven consecutive days.,"[Weeks, week, Week, week (qualifier value), we...",[T079]


In [23]:
# delete the models to free up memory
del nlp_umls_link
del nlp_rxnorm_link

In [11]:
import gc
gc.collect()

7

In [1]:
df_trials.Healthy_Volunteers.value_counts()

NameError: name 'df_trials' is not defined

In [17]:
# Call the function with desired parameters
# go to this function and uncomment the code to call the api which should be at the end of the function
results = match_patients_to_trials(
    patient_profiles_grouped,
    # send only 1 patient for testing
    # patient_profiles_grouped.iloc[0:2],
    df_trials,
    # df_trials.iloc[:10],
    code_definitions_df,
    chunk_size=1000,
    debug=False, # use this to save details patient-trial match logs containing
    verbose=True,
    max_api_workers=8,
    output_json_format='match',  # Options: 'match', 'no_match', 'both'
    patient_start_chunk=100
)

INFO:match_patients_to_trials_w_api:Trial data prepared.
Patient Chunks:   0%|          | 0/1 [00:00<?, ?it/s]INFO:match_patients_to_trials_w_api:Processing patient chunk starting at index 100
INFO:match_patients_to_trials_w_api:Processing trial chunk starting at index 0

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 11000/11000 [00:02<00:00, 5112.19it/s]
INFO:match_patients_to_trials_w_api:Processing trial chunk starting at index 1000

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 11000/11000 [00:01<00:00, 5586.57it/s]
INFO:match_patients_to_trials_w_api:Processing trial chunk starting at index 2000

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 11000/11000 [00:02<00:00, 5349.00it/s]
INFO:match_patients_to_trials_w_api:Processing trial chunk starting at index 3000

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|█████

In [41]:
# results convert to df 
results_df = pd.DataFrame(results)

In [43]:
# results_df len of eligible trials
results_df['num_eligible_trials'] = results_df['eligibleTrials'].apply(len)
results_df['num_eligible_trials'].value_counts()

num_eligible_trials
3812     1
45599    1
Name: count, dtype: int64

In [44]:
# results_df

Unnamed: 0,patientId,eligibleTrials,num_eligible_trials
0,03f12f9e-fd3e-b845-502a-3a12511d9e48,"[{'trialId': 'NCT03668327', 'trialName': 'Pret...",3812
1,068d2ed4-b12c-e380-fe78-3d99eab488c2,"[{'trialId': 'NCT06286527', 'trialName': 'Qual...",45599


In [45]:
# histogram of no of patients by % of eligible trials total trials = 67219
results_df['num_eligible_trials'].value_counts().sort_index().plot(kind='bar')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.