In [1]:
import pandas as pd
from datetime import datetime
from medcat.cat import CAT

# Load MedCAT model
cat = CAT.load_model_pack('C:/Users/cx-admin/Downloads/umls_sm_pt2ch_533bab5115c6c2d6.zip')

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK data files (run this once)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

  from tqdm.autonotebook import tqdm, trange
  meta_cat.model.load_state_dict(torch.load(model_save_path, map_location=device))


ModuleNotFoundError: No module named 'nltk'

### 1. Patient Data

In [2]:
file_path = 'synthea_100/patients.csv'
patients_df = pd.read_csv(file_path)
patients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   111 non-null    object 
 1   BIRTHDATE            111 non-null    object 
 2   DEATHDATE            11 non-null     object 
 3   SSN                  111 non-null    object 
 4   DRIVERS              96 non-null     object 
 5   PASSPORT             88 non-null     object 
 6   PREFIX               89 non-null     object 
 7   FIRST                111 non-null    object 
 8   MIDDLE               91 non-null     object 
 9   LAST                 111 non-null    object 
 10  SUFFIX               1 non-null      object 
 11  MAIDEN               34 non-null     object 
 12  MARITAL              78 non-null     object 
 13  RACE                 111 non-null    object 
 14  ETHNICITY            111 non-null    object 
 15  GENDER               111 non-null    obj

UPDATE (10/2): Medication code is RXCUI from RxNorm Drugs (NLMedicine) <br><br>
UPDATE (10/3): decided to convert from RxNorm to SNOMED CT to bring both conditions and medications to one standard form. Will not be checking for robustness of the opensource MedCat NER model (UMLS Small) due to time constraints. 

In [3]:
medication_file_path = 'synthea_100/medications.csv'
medications_df = pd.read_csv(medication_file_path)
medications_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5963 entries, 0 to 5962
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   START              5963 non-null   object 
 1   STOP               5689 non-null   object 
 2   PATIENT            5963 non-null   object 
 3   PAYER              5963 non-null   object 
 4   ENCOUNTER          5963 non-null   object 
 5   CODE               5963 non-null   int64  
 6   DESCRIPTION        5963 non-null   object 
 7   BASE_COST          5963 non-null   float64
 8   PAYER_COVERAGE     5963 non-null   float64
 9   DISPENSES          5963 non-null   int64  
 10  TOTALCOST          5963 non-null   float64
 11  REASONCODE         5132 non-null   float64
 12  REASONDESCRIPTION  5132 non-null   object 
dtypes: float64(4), int64(2), object(7)
memory usage: 605.7+ KB


UPDATE: Condition code is SNOMED CT code. Check if API.

In [4]:
conditions_file_path = 'synthea_100/conditions.csv'
conditions_df = pd.read_csv(conditions_file_path)
conditions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3754 entries, 0 to 3753
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   START        3754 non-null   object
 1   STOP         2702 non-null   object
 2   PATIENT      3754 non-null   object
 3   ENCOUNTER    3754 non-null   object
 4   SYSTEM       3754 non-null   object
 5   CODE         3754 non-null   int64 
 6   DESCRIPTION  3754 non-null   object
dtypes: int64(1), object(6)
memory usage: 205.4+ KB


##### Testing a purely text cleaning/processing based NLP approach

In [43]:
import pandas as pd
import re

# Example patient_profiles data (for context)
# patient_profiles = pd.read_csv('patient_profiles.csv')

# Function to clean and extract condition description and category
def clean_condition_description(condition):
    if pd.isna(condition):
        return "", "medical"
    
    # Use regex to separate description from category in brackets
    match = re.match(r"^(.*?)\s*\((.*?)\)$", condition)
    # only first match
    if match:
        return match.group(1).strip(), match.group(2).strip()
    else:
        return condition.strip(), "medical"

# Function to clean medication description (splitting by '/')
def clean_medication_description(medication):
    if pd.isna(medication):
        return "", "", ""
    
    # Split by '/' and process each part
    parts = medication.split('/')
    cleaned_medications = []
    codes = []
    quantities = []
    
    for part in parts:
        part_cleaned = part.strip()  # Clean up spaces
        # Extract the quantity, code, and name (if available)
        quantity_match = re.search(r"(\d+\s*MG|\d+\s*ML|\d+\s*mcg)", part_cleaned)
        code_match = re.search(r"\d{5,}", part_cleaned)  # Assuming medication code is a number
        quantity = quantity_match.group(0) if quantity_match else ""
        code = code_match.group(0) if code_match else ""
        medication_name = re.sub(r"(\d+\s*MG|\d+\s*ML|\d+\s*mcg|[\d+])", "", part_cleaned).strip()
        cleaned_medications.append(medication_name)
        quantities.append(quantity)
        codes.append(code)

    # If there's more than one medication, return them as separate lists
    # return joined strings
    cleaned_medications = ', '.join(cleaned_medications)
    quantities = ', '.join(quantities)
    codes = ', '.join(codes)
    return cleaned_medications, quantities, codes

In [44]:
# Clean and extract condition descriptions
conditions_df['DESCRIPTION_cleaned'], conditions_df['CATEGORY_cleaned'] = zip(*conditions_df['DESCRIPTION'].map(clean_condition_description))

# Clean and extract medication descriptions
medications_df['DESCRIPTION_cleaned'], medications_df['QUANTITY'], medications_df['CODE'] = zip(*medications_df['DESCRIPTION'].map(clean_medication_description))

In [45]:
from datetime import datetime

# 1. Age Calculation
# Convert BIRTHDATE to datetime format
patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])

# Calculate the current age
current_date = datetime.now()
patients_df['AGE'] = patients_df['BIRTHDATE'].apply(lambda x: (current_date - x).days // 365)

# 2. Convert START and STOP dates in both conditions and medications to datetime
conditions_df['START'] = pd.to_datetime(conditions_df['START'])
conditions_df['STOP'] = pd.to_datetime(conditions_df['STOP'])

medications_df['START'] = pd.to_datetime(medications_df['START'])
medications_df['STOP'] = pd.to_datetime(medications_df['STOP'])

# 3. Merge Patient Data with Conditions based on PATIENT field
patients_conditions = pd.merge(patients_df, conditions_df, left_on='Id', right_on='PATIENT', how='left')

# 4. Merge Conditions with Medications based on the ENCOUNTER field
conditions_medications = pd.merge(patients_conditions, medications_df, left_on='ENCOUNTER', right_on='ENCOUNTER', how='left')

# 5. Create a Clean Patient Profile with Linked Conditions, Medications, and Dates
patient_profiles = conditions_medications[['Id', 'AGE', 'GENDER', 'DESCRIPTION_x', 'DESCRIPTION_cleaned_x', 'CATEGORY_cleaned', 'CODE_x', 'START_x', 'STOP_x', 'DESCRIPTION_y', 'DESCRIPTION_cleaned_y', 'CODE_y', 'START_y', 'STOP_y', 'ENCOUNTER']]

# Rename columns for clarity
patient_profiles.columns = [
    'PatientID', 'Age', 'Gender', 
    'Condition_Description', 'Condition_Description_Cleaned', 'Condition_Category_Cleaned',
    'Condition_Code', 'Condition_Start', 'Condition_End',
    'Medication_Description', 'Medication_Description_Cleaned', 'Medication_Code', 'Medication_Start', 'Medication_End',
    'Encounter'
]

# Display the final patient profile DataFrame
patient_profiles.head()

Unnamed: 0,PatientID,Age,Gender,Condition_Description,Condition_Description_Cleaned,Condition_Category_Cleaned,Condition_Code,Condition_Start,Condition_End,Medication_Description,Medication_Description_Cleaned,Medication_Code,Medication_Start,Medication_End,Encounter
0,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Seasonal allergic rhinitis,Seasonal allergic rhinitis,medical,367498001,2010-02-13,NaT,Fexofenadine hydrochloride 30 MG Oral Tablet,Fexofenadine hydrochloride Oral Tablet,,2010-02-13 20:39:01+00:00,NaT,54610a6c-47b4-79f4-e75d-80f3ed4c8437
1,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),Medication review due,situation,314529007,2012-09-05,2016-09-21,,,,NaT,NaT,54f02fad-1ad7-7139-ae76-1fc8db2ac30c
2,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Acute bronchitis (disorder),Acute bronchitis,disorder,10509002,2016-09-10,2016-09-17,Acetaminophen 325 MG Oral Tablet,Acetaminophen Oral Tablet,,2016-09-10 04:00:34+00:00,2016-09-17 04:00:34+00:00,0dcd99ad-9b21-681c-8cba-2321c6cc3ec2
3,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),Medication review due,situation,314529007,2016-09-28,2020-10-21,,,,NaT,NaT,39bc4565-99ba-afb8-4298-92165d168052
4,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Concussion injury of brain (disorder),Concussion injury of brain,disorder,110030002,2018-08-09,2018-10-18,,,,NaT,NaT,e6d0e44a-a960-970d-d526-1f21f39be969


##### Testing an approach using SNOMED codes only to check for exclusion

In [5]:
# Function to extract SNOMED codes using MedCAT
def extract_snomed_codes(text):
    """
    Extracts SNOMED codes from the input text using MedCAT.

    Parameters:
        text (str): The text to process.

    Returns:
        List[str]: A list of SNOMED codes found in the text.
    """
    entities = cat.get_entities(text)
    snomed_codes = []
    for ent in entities['entities'].values():
        snomed_codes.extend(ent.get('snomed', []))
    return list(set(snomed_codes))  # Remove duplicates

In [6]:
# 1. Age Calculation
# Convert BIRTHDATE to datetime format
patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])

# Calculate the current age
current_date = datetime.now()
patients_df['AGE'] = patients_df['BIRTHDATE'].apply(lambda x: (current_date - x).days // 365)

# 2. Convert START and STOP dates in both conditions and medications to datetime
conditions_df['START'] = pd.to_datetime(conditions_df['START'])
conditions_df['STOP'] = pd.to_datetime(conditions_df['STOP'])

medications_df['START'] = pd.to_datetime(medications_df['START'])
medications_df['STOP'] = pd.to_datetime(medications_df['STOP'])

# 2a. Add SNOMED codes to medications_df
medications_df['Medication_SNOMED_CODES'] = medications_df['DESCRIPTION'].apply(extract_snomed_codes)

# 2b. Rename 'CODE' column in conditions_df to 'Condition_SNOMED_CODE'
conditions_df.rename(columns={'CODE': 'Condition_SNOMED_CODE'}, inplace=True)

# 3. Merge Patient Data with Conditions based on PATIENT field
patients_conditions = pd.merge(
    patients_df,
    conditions_df[['PATIENT', 'ENCOUNTER', 'DESCRIPTION', 'Condition_SNOMED_CODE', 'START', 'STOP']],
    left_on='Id',
    right_on='PATIENT',
    how='left'
)

# 4. Merge Conditions with Medications based on the ENCOUNTER field
conditions_medications = pd.merge(
    patients_conditions,
    medications_df[['ENCOUNTER', 'DESCRIPTION', 'Medication_SNOMED_CODES', 'START', 'STOP']],
    on='ENCOUNTER',
    how='left',
    suffixes=('_Condition', '_Medication')
)

# 5. Create a Clean Patient Profile with Linked Conditions, Medications, and Dates
patient_profiles = conditions_medications[[
    'Id', 'AGE', 'GENDER',
    'DESCRIPTION_Condition', 'Condition_SNOMED_CODE', 'START_Condition', 'STOP_Condition',
    'DESCRIPTION_Medication', 'Medication_SNOMED_CODES', 'START_Medication', 'STOP_Medication',
    'ENCOUNTER'
]]

# Rename columns for clarity
patient_profiles.columns = [
    'PatientID', 'Age', 'Gender',
    'Condition_Description', 'Condition_SNOMED_CODE', 'Condition_Start', 'Condition_End',
    'Medication_Description', 'Medication_SNOMED_CODES', 'Medication_Start', 'Medication_End',
    'Encounter'
]

# Create 'SNOMED_CODES' column by combining condition and medication SNOMED codes
def combine_snomed_codes(row):
    codes = []
    if pd.notna(row['Condition_SNOMED_CODE']):
        codes.append(str(row['Condition_SNOMED_CODE']))
    if pd.notna(row['Medication_SNOMED_CODES']):
        codes.extend(row['Medication_SNOMED_CODES'])
    return list(set(codes))  # Remove duplicates

patient_profiles['SNOMED_CODES'] = patient_profiles.apply(combine_snomed_codes, axis=1)

# Display the patient_profiles DataFrame
print(patient_profiles.head())

# 6. Create grouped patient profiles
patient_profile_grouped = patient_profiles.groupby('PatientID').agg({
    'Age': 'first',
    'Gender': 'first',
    'Condition_Description': lambda x: list(x.dropna()),
    'Medication_Description': lambda x: list(x.dropna()),
    'Condition_SNOMED_CODE': lambda x: list(x.dropna().astype(str)),
    'Medication_SNOMED_CODES': lambda x: [code for codes in x.dropna() for code in codes],
    'SNOMED_CODES': lambda x: [code for codes in x.dropna() for code in codes],
    'Condition_Start': lambda x: list(x.dropna()),
    'Condition_End': lambda x: list(x.dropna()),
    'Medication_Start': lambda x: list(x.dropna()),
    'Medication_End': lambda x: list(x.dropna()),
})

# Reset index to turn 'PatientID' back into a column
patient_profile_grouped.reset_index(inplace=True)

# Display the grouped patient profiles
print(patient_profile_grouped.head())

  if pd.notna(row['Medication_SNOMED_CODES']):


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [7]:
patient_profiles

Unnamed: 0,PatientID,Age,Gender,Condition_Description,Condition_SNOMED_CODE,Condition_Start,Condition_End,Medication_Description,Medication_SNOMED_CODES,Medication_Start,Medication_End,Encounter
0,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Seasonal allergic rhinitis,367498001,2010-02-13,NaT,Fexofenadine hydrochloride 30 MG Oral Tablet,[S-371748006],2010-02-13 20:39:01+00:00,NaT,54610a6c-47b4-79f4-e75d-80f3ed4c8437
1,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),314529007,2012-09-05,2016-09-21,,,NaT,NaT,54f02fad-1ad7-7139-ae76-1fc8db2ac30c
2,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Acute bronchitis (disorder),10509002,2016-09-10,2016-09-17,Acetaminophen 325 MG Oral Tablet,[S-370150001],2016-09-10 04:00:34+00:00,2016-09-17 04:00:34+00:00,0dcd99ad-9b21-681c-8cba-2321c6cc3ec2
3,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Medication review due (situation),314529007,2016-09-28,2020-10-21,,,NaT,NaT,39bc4565-99ba-afb8-4298-92165d168052
4,e93300bf-3a53-55c0-bd38-2ede59462f21,17,F,Concussion injury of brain (disorder),110030002,2018-08-09,2018-10-18,,,NaT,NaT,e6d0e44a-a960-970d-d526-1f21f39be969
...,...,...,...,...,...,...,...,...,...,...,...,...
7152,17815732-cb25-a735-8c1c-5a417466856b,66,F,Part-time employment (finding),160904001,2020-11-08,2022-11-20,Ibuprofen 400 MG Oral Tablet [Ibu],[],2020-11-08 08:17:01+00:00,2021-11-14 08:17:01+00:00,7efb6e97-6cfc-c458-2725-3c3e621022b6
7153,17815732-cb25-a735-8c1c-5a417466856b,66,F,Full-time employment (finding),160903007,2022-11-20,NaT,Acetaminophen 325 MG / Oxycodone Hydrochloride...,[],2022-11-20 08:17:01+00:00,2023-11-26 08:17:01+00:00,57c192f2-1b5f-27e2-64d9-b114325f17b9
7154,17815732-cb25-a735-8c1c-5a417466856b,66,F,Full-time employment (finding),160903007,2022-11-20,NaT,Ibuprofen 400 MG Oral Tablet [Ibu],[],2022-11-20 08:17:01+00:00,2023-11-26 08:17:01+00:00,57c192f2-1b5f-27e2-64d9-b114325f17b9
7155,17815732-cb25-a735-8c1c-5a417466856b,66,F,Medication review due (situation),314529007,2023-11-26,NaT,Acetaminophen 325 MG / Oxycodone Hydrochloride...,[],2023-11-26 08:17:01+00:00,NaT,15470932-b9d7-4e18-16cf-19a323329f43


In [42]:
patient_profiles.Condition_Category_Cleaned.value_counts()

Condition_Category_Cleaned
finding                    3113
disorder                   2178
situation                  1454
medical                     356
panic) (finding              27
morphologic abnormality      20
person                        9
Name: count, dtype: int64

In [48]:
patient_profiles[patient_profiles.Condition_Category_Cleaned == 'disorder'].Condition_Description_Cleaned.value_counts()

Condition_Description_Cleaned
Gingivitis                                          693
Viral sinusitis                                     116
Hypoxemia                                           105
Pneumonia                                           105
Gingival disease                                     79
                                                   ... 
Non-small cell lung cancer                            1
Non-small cell carcinoma of lung  TNM stage 1         1
Neuropathy due to type 2 diabetes mellitus            1
Acute ST segment elevation myocardial infarction      1
Tongue tie                                            1
Name: count, Length: 69, dtype: int64

In [12]:
patient_profiles.iloc[7153].Medication_Description

'Acetaminophen 325 MG / Oxycodone Hydrochloride 10 MG Oral Tablet [Percocet]'

### 2. Clinical Trials Data

##### a. Attempt 1 (Failed)

In [16]:
import pandas as pd
from pytrials.client import ClinicalTrials

def fetch_n_trials(n):
    # Initialize API client
    ct = ClinicalTrials()
    
    # Fetch n trials from ClinicalTrials.gov that are actively recruiting
    data  = ct.get_full_studies(search_expr="Recruiting", max_studies=n)
    
    headers = data[0]
    
    # Remaining lists are the rows
    rows = data[1:]
    
    # Create DataFrame using headers and rows
    df = pd.DataFrame(rows, columns=headers)
    
    return df

# Fetch and display 5 trials
df_trials = fetch_n_trials(5)
df_trials

Failed to fetch data: 400
None


In [14]:
df_trials.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   NCT Number                  5 non-null      object
 1   Study Title                 5 non-null      object
 2   Study URL                   5 non-null      object
 3   Acronym                     5 non-null      object
 4   Study Status                5 non-null      object
 5   Brief Summary               5 non-null      object
 6   Study Results               5 non-null      object
 7   Conditions                  5 non-null      object
 8   Interventions               5 non-null      object
 9   Primary Outcome Measures    5 non-null      object
 10  Secondary Outcome Measures  5 non-null      object
 11  Other Outcome Measures      5 non-null      object
 12  Sponsor                     5 non-null      object
 13  Collaborators               5 non-null      object
 14

##### b. Using REST API

In [1]:
import requests
import re
import pandas as pd

# Define base URL for ClinicalTrials.gov API v2
base_url = 'https://clinicaltrials.gov/api/v2/studies'


def parse_eligibility_criteria(eligibility_string):
    """
    Parses the eligibility string to separate Inclusion and Exclusion criteria, handling both bullet points and numbered lists.

    Parameters:
        eligibility_string (str): The raw markdown string containing both inclusion and exclusion criteria.

    Returns:
        tuple: A tuple containing two lists: (inclusion_criteria, exclusion_criteria).
    """
    inclusion_criteria = []
    exclusion_criteria = []
    
    if "Inclusion Criteria:" in eligibility_string:
        # Split into inclusion and exclusion sections
        sections = eligibility_string.split("Exclusion Criteria:")
        inclusion_section = sections[0].split("Inclusion Criteria:")[-1]
        exclusion_section = sections[1] if len(sections) > 1 else ""

        # Use regex to split by bullet points, numbers (1., 2.), and other common list markers
        inclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-)', inclusion_section)
        exclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-)', exclusion_section)

        # Clean up the criteria (remove empty strings, strip extra whitespace)
        inclusion_criteria = [item.strip() for item in inclusion_criteria if item.strip()]
        exclusion_criteria = [item.strip() for item in exclusion_criteria if item.strip()]

    return inclusion_criteria, exclusion_criteria

def fetch_n_trials(n):
    """
    Fetches 'n' clinical trials from ClinicalTrials.gov that are actively recruiting and includes their eligibility criteria.

    Parameters:
        n (int): The number of trials to fetch.

    Returns:
        DataFrame: A pandas DataFrame containing the NCT ID, trial title, and eligibility criteria.
    """
    # Set query parameters for the API call
    query_params = {
        'format': 'json',  # Request the response in JSON format
        'filter.overallStatus': 'RECRUITING',  # Only fetch trials that are recruiting
        'pageSize': n  # Limit the number of trials to 'n'
    }

    try:
        # Make the initial API request to fetch a list of studies
        response = requests.get(base_url, params=query_params)

        # Check if the request was successful
        response.raise_for_status()

        # Parse the response JSON and extract the studies
        trials = response.json().get('studies', [])
        trial_data = []

        # Loop through each trial in the fetched data
        for trial in trials:
            try:
                # Extract the NCT ID and trial title from the trial
                nct_id = trial['protocolSection']['identificationModule']['nctId']
                trial_title = trial['protocolSection']['identificationModule']['briefTitle']

                # Fetch detailed information for each trial using the NCT ID
                trial_details_url = f"{base_url}/{nct_id}?format=json"
                trial_response = requests.get(trial_details_url)

                # Check if the trial details request was successful
                trial_response.raise_for_status()

                # Parse the detailed trial response
                trial_details = trial_response.json()

                # Extract eligibility criteria and process markdown into structured lists
                eligibility_string = trial_details['protocolSection'].get('eligibilityModule', {}).get('eligibilityCriteria', 'Not Available')
                inclusion_criteria, exclusion_criteria = parse_eligibility_criteria(eligibility_string)
                
                # Example: Additional fields like Age, Sex, and Healthy Volunteers
                minimum_age = trial_details['protocolSection']['eligibilityModule'].get('minimumAge', 'Not Specified')
                maximum_age = trial_details['protocolSection']['eligibilityModule'].get('maximumAge', 'Not Specified')
                sex = trial_details['protocolSection']['eligibilityModule'].get('sex', 'Not Specified')
                healthy_volunteers = trial_details['protocolSection']['eligibilityModule'].get('healthyVolunteers', 'Not Specified')

                # Append all this data to the trial_data list
                trial_data.append({
                    'NCTId': nct_id,
                    'Title': trial_title,
                    'Inclusion_Criteria': inclusion_criteria,
                    'Exclusion_Criteria': exclusion_criteria,
                    'Minimum_Age': minimum_age,
                    'Maximum_Age': maximum_age,
                    'Sex': sex,
                    'Healthy_Volunteers': healthy_volunteers
                })

            except KeyError as ke:
                print(f"KeyError: Missing key {ke} in trial data for NCT ID {nct_id}. Skipping this trial.")
            except requests.RequestException as e:
                print(f"Error fetching details for trial {nct_id}: {e}")

        # Convert the collected trial data to a pandas DataFrame
        df = pd.DataFrame(trial_data)
        return df

    except requests.RequestException as e:
        # Handle any general API request errors
        print(f"Error fetching trials: {e}")
        return None

# Example: Fetch and display 5 trials with their eligibility criteria
df_trials = fetch_n_trials(10)
df_trials.head()  # Display the DataFrame if it was successfully fetched

Unnamed: 0,NCTId,Title,Inclusion_Criteria,Exclusion_Criteria,Minimum_Age,Maximum_Age,Sex,Healthy_Volunteers
0,NCT04687176,Frontline Oral Arsenic Trioxide for APL,[Newly diagnosed APL with t(15;17)(q24;q21) or...,"[ECOG performance score \>2, Decompensated hea...",Not Specified,Not Specified,ALL,False
1,NCT06400576,Smartphone & Headphone Effects on Baropodometr...,[Agreeing to participate in the study voluntar...,[Not agreeing to participate in the study volu...,18 Years,35 Years,ALL,True
2,NCT06530576,Thalidomide for the Symptomatic Large Granular...,"[The patient fully understands the study, volu...",[Unable to understand or follow the study proc...,18 Years,Not Specified,ALL,False
3,NCT04900376,Evaluation of Covid-19 Vaccination in Elderly ...,[Subject in Retirement Home (EHPAD) or Long-te...,[Subject not affiliated to a social security s...,Not Specified,Not Specified,ALL,True
4,NCT05179876,A Study Providing Treatment Access in Particip...,[Participant must sign an informed consent for...,"[General:, Participants prematurely discontinu...",2 Years,Not Specified,ALL,False


In [1]:
df_trials.info()

NameError: name 'df_trials' is not defined

In [52]:
print(df_trials.to_json(orient='records'))

[{"NCTId":"NCT03063892","Title":"Effect of Tranexamic Acid (TXA) on Reduction of Postoperative Blood Transfusion","Inclusion_Criteria":["Over the age of 60 years","Hip fracture requiring surgical intervention","Signs consent and agrees to participate"],"Exclusion_Criteria":["Under the age of 60","Does not sign consent or refuses participation","Known hypersensitivity to tranexamic acid","Multiple acute fractures","Creatinine clearance \\<30","History of seizures","Active hormone therapy","History of coagulation abnormality","History of deep vein thrombosis (DVT) or pulmonary embolism (PE) within the last year or history or recurrent DVT\/PE","Myocardial infarction (MI) and\/or stents within the past year","History of intracranial hemorrhage","Acquired defective color vision","Patients admitted directly to nursing units or surgery without stay in the Emergency Center","Patients who sustain fracture while hospitalized at ProMedica Toledo Hospital"],"Minimum_Age":"60 Years","Maximum_Age":

In [53]:
df_trials.iloc[1].Inclusion_Criteria

['Over the age of 60 years',
 'Hip fracture requiring surgical intervention',
 'Signs consent and agrees to participate']

In [61]:
df_trials.iloc[6].Exclusion_Criteria

['Positive hepatitis B surface antigen and hepatitis B virus quantification \\> 1 × 1000 copies/ml, or positive anti-hepatitis C virus antibody;',
 'Positive anti-HIV antibody or diagnosis of acquired immunodeficiency syndrome (i.e., AIDS);',
 'Conditions such as dysphagia, chronic diarrhea, or bowel obstruction that would interfere with oral medication.',
 'Patients with severe chronic or active infection that must be treated with systemic antibacterial, antifungal or antiviral therapy before randomization, including but not limited to tuberculosis infection',
 'Active, known or suspected autoimmune disease (including but not limited to uveitis, enteritis, hepatitis, pituitary disease, nephritis, vasculitis, hyperthyroidism, hypothyroidism, and asthma requiring bronchiectasis). Except for type I diabetes, hypothyroidism requiring hormone replacement therapy and skin diseases not requiring systemic treatment (such as vitiligo, psoriasis or alopecia); clinicians should perform necessary

In [63]:
print(df_trials.Exclusion_Criteria.to_json())

{"0":["Lefthandedness","Smoking (past 5 years)","Known changes of the vocal folds (e.g., vocal fold nodules), vocal fold paralysis, surgeries on the larynx or thryoid or current organic or neurological changes of the vocal folds or vocal function per videoendoscopy and -stroboscopy of the larynx","History of voice therapy","Hoarseness","Respiratory illness, allergies (respiratory, silver), reflux or asthma at time of participation","Hearing disorder or hearing aids","Psychological, neurological or endocrinological disorders","Psychotropic or steorid medications","Body mass index \\> 30","Indications against MRI such as metail parts in or on the body (e.g., permanent dental prostheses or braces, screws, prostheses, piercings or large tattoos","Nearsightedness \\> -5 diopters, if only glasses are worn","Claustrophobia","Pregnancy"],"1":[],"2":["Patients with diagnosis of diabetes, or taking anti-diabetic medication;","Patients with cardiovascular or cerebrovascular diseases, cancer, rena

In [46]:
import requests
import re
import pandas as pd
import spacy

# Load spaCy model (medical models can also be used for domain-specific use cases)
nlp = spacy.load("en_core_web_sm")

# Define base URL for ClinicalTrials.gov API v2
base_url = 'https://clinicaltrials.gov/api/v2/studies'


def extract_entities(text):
    """
    Extracts relevant entities like age ranges, conditions, and medications using spaCy.
    
    Parameters:
        text (str): The eligibility criteria text.
        
    Returns:
        dict: A dictionary with extracted entities like age ranges, conditions, etc.
    """
    doc = nlp(text)
    entities = {"AGE": [], "CONDITION": [], "MEDICATION": []}

    for ent in doc.ents:
        # Extract ages (e.g., "between 18 and 59 years old")
        if ent.label_ == "DATE" or "year" in ent.text.lower():
            entities["AGE"].append(ent.text)
        
        # Extract medical conditions
        elif ent.label_ == "DISEASE" or ent.label_ == "CONDITION":
            entities["CONDITION"].append(ent.text)
        
        # Extract medications or contraindications
        elif ent.label_ == "MEDICATION" or "contraindication" in ent.text.lower():
            entities["MEDICATION"].append(ent.text)
    
    return entities


def parse_eligibility_criteria(eligibility_string):
    """
    Parses the eligibility string to separate Inclusion and Exclusion criteria and applies NLP entity extraction.
    
    Parameters:
        eligibility_string (str): The raw markdown string containing eligibility criteria.
        
    Returns:
        tuple: A tuple containing two lists (inclusion_criteria, exclusion_criteria) and extracted entities.
    """
    inclusion_criteria = []
    exclusion_criteria = []
    entities = {"AGE": [], "CONDITION": [], "MEDICATION": []}

    if "Inclusion Criteria:" in eligibility_string:
        # Split into inclusion and exclusion sections
        sections = eligibility_string.split("Exclusion Criteria:")
        inclusion_section = sections[0].split("Inclusion Criteria:")[-1]
        exclusion_section = sections[1] if len(sections) > 1 else ""

        # Use regex to split by bullet points, numbers (1., 2.), and other common list markers
        inclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-)', inclusion_section)
        exclusion_criteria = re.split(r'\n(?:\d+\.\s|\*\s|-)', exclusion_section)

        # Clean up the criteria (remove empty strings, strip extra whitespace)
        inclusion_criteria = [item.strip() for item in inclusion_criteria if item.strip()]
        exclusion_criteria = [item.strip() for item in exclusion_criteria if item.strip()]
        
        # Extract entities using NLP from both inclusion and exclusion criteria
        entities_inclusion = extract_entities(inclusion_section)
        entities_exclusion = extract_entities(exclusion_section)

        # Combine extracted entities from both sections
        entities["AGE"].extend(entities_inclusion["AGE"] + entities_exclusion["AGE"])
        entities["CONDITION"].extend(entities_inclusion["CONDITION"] + entities_exclusion["CONDITION"])
        entities["MEDICATION"].extend(entities_inclusion["MEDICATION"] + entities_exclusion["MEDICATION"])

    return inclusion_criteria, exclusion_criteria, entities


def fetch_n_trials(n):
    """
    Fetches 'n' clinical trials from ClinicalTrials.gov that are actively recruiting and includes their eligibility criteria.
    
    Parameters:
        n (int): The number of trials to fetch.
        
    Returns:
        DataFrame: A pandas DataFrame containing the NCT ID, trial title, and eligibility criteria.
    """
    # Set query parameters for the API call
    query_params = {
        'format': 'json',  # Request the response in JSON format
        'filter.overallStatus': 'RECRUITING',  # Only fetch trials that are recruiting
        'pageSize': n  # Limit the number of trials to 'n'
    }

    try:
        # Make the initial API request to fetch a list of studies
        response = requests.get(base_url, params=query_params)

        # Check if the request was successful
        response.raise_for_status()

        # Parse the response JSON and extract the studies
        trials = response.json().get('studies', [])
        trial_data = []

        # Loop through each trial in the fetched data
        for trial in trials:
            try:
                # Extract the NCT ID and trial title from the trial
                nct_id = trial['protocolSection']['identificationModule']['nctId']
                trial_title = trial['protocolSection']['identificationModule']['briefTitle']

                # Fetch detailed information for each trial using the NCT ID
                trial_details_url = f"{base_url}/{nct_id}?format=json"
                trial_response = requests.get(trial_details_url)

                # Check if the trial details request was successful
                trial_response.raise_for_status()

                # Parse the detailed trial response
                trial_details = trial_response.json()

                # Extract eligibility criteria and process markdown into structured lists
                eligibility_string = trial_details['protocolSection'].get('eligibilityModule', {}).get('eligibilityCriteria', 'Not Available')
                inclusion_criteria, exclusion_criteria, entities = parse_eligibility_criteria(eligibility_string)
                
                # Example: Additional fields like Age, Sex, and Healthy Volunteers
                minimum_age = trial_details['protocolSection']['eligibilityModule'].get('minimumAge', 'Not Specified')
                maximum_age = trial_details['protocolSection']['eligibilityModule'].get('maximumAge', 'Not Specified')
                sex = trial_details['protocolSection']['eligibilityModule'].get('sex', 'Not Specified')
                healthy_volunteers = trial_details['protocolSection']['eligibilityModule'].get('healthyVolunteers', 'Not Specified')

                # Append all this data to the trial_data list
                trial_data.append({
                    'NCTId': nct_id,
                    'Title': trial_title,
                    'Inclusion_Criteria': inclusion_criteria,
                    'Exclusion_Criteria': exclusion_criteria,
                    'Minimum_Age': minimum_age,
                    'Maximum_Age': maximum_age,
                    'Sex': sex,
                    'Healthy_Volunteers': healthy_volunteers,
                    'Extracted_Entities': entities
                })

            except KeyError as ke:
                print(f"KeyError: Missing key {ke} in trial data for NCT ID {nct_id}. Skipping this trial.")
            except requests.RequestException as e:
                print(f"Error fetching details for trial {nct_id}: {e}")

        # Convert the collected trial data to a pandas DataFrame
        df = pd.DataFrame(trial_data)
        return df

    except requests.RequestException as e:
        # Handle any general API request errors
        print(f"Error fetching trials: {e}")
        return None


# Example: Fetch and display 5 trials with their eligibility criteria
df_trials = fetch_n_trials(5)
df_trials.head()  # Display the DataFrame if it was successfully fetched

Unnamed: 0,NCTId,Title,Inclusion_Criteria,Exclusion_Criteria,Minimum_Age,Maximum_Age,Sex,Healthy_Volunteers,Extracted_Entities
0,NCT06126627,Brain and Voice Signatures in Teachers,[Teachers (max. 10 years full-time) or student...,"[Lefthandedness, Smoking (past 5 years), Known...",21 Years,39 Years,FEMALE,True,"{'AGE': ['10 years', 'between 21 and 39 years'..."
1,NCT04807127,A Single-cell Approach to Identify Biomarkers ...,[],[],18 Years,120 Years,ALL,False,"{'AGE': [], 'CONDITION': [], 'MEDICATION': []}"
2,NCT06330727,Effects of Coffee Consumption on Metabolic Mar...,"[Age between 18 and 59 years old;, Diagnosis o...","[Patients with diagnosis of diabetes, or takin...",18 Years,59 Years,ALL,True,"{'AGE': ['between 18 and 59 years old', 'the p..."
3,NCT06293027,Optical Imaging as a Tool for Monitoring Brain...,"[FXS participants:, Age criteria: Between ages...",[For FXS and Typically developing participants...,2 Years,Not Specified,MALE,True,{'AGE': ['Between ages 2 years to 50 years old...
4,NCT04362527,Milrinone Infusion for VAsospam Treatment in S...,[Adult patients hospitalized for aneurysmal SA...,[Initial Glasgow score at 3 with a bilateral m...,18 Years,Not Specified,ALL,False,"{'AGE': ['3 months'], 'CONDITION': [], 'MEDICA..."


In [51]:
df_trials.iloc[4].Extracted_Entities

{'AGE': ['3 months'], 'CONDITION': [], 'MEDICATION': []}

### 3. Matching Algorithms (Patient Profile x Trials)

In [94]:
import pandas as pd
import numpy as np

# Helper function to convert age fields from df_trials to integers
def convert_age(age_str):
    if pd.isna(age_str) or age_str in ['Not Specified', 'N/A']:
        return np.nan  # Return NaN for missing or unspecified ages
    try:
        return int(age_str.split()[0])  # Convert '18 Years' -> 18
    except:
        return np.nan  # Handle any errors

# Helper function to check if patient age matches the trial's age criteria
def match_age(patient_age, min_age, max_age):
    if pd.isna(min_age):  # If no minimum age is specified, assume 0
        min_age = 0
    if pd.isna(max_age):  # If no maximum age is specified, assume no upper limit
        max_age = 120
    return min_age <= patient_age <= max_age

# Helper function to check if the patient’s sex matches the trial’s sex requirement
def match_sex(patient_gender, trial_gender):
    if trial_gender == "ALL":
        return True
    return patient_gender == trial_gender

# Helper function to determine if a patient is a healthy volunteer
def is_healthy_volunteer(patient_conditions):
    # If the patient has no conditions, they are considered a healthy volunteer
    # If all conditions have ended (STOP is not NaT), they are also healthy
    active_conditions = patient_conditions[(patient_conditions['Condition_End'].isna()) | 
                                           (patient_conditions['Condition_End'] > pd.Timestamp.now())]
    return active_conditions.empty

# Main function to match patients to trials in chunks
def match_patients_to_trials(patient_profiles, df_trials, chunk_size=100):
    # Convert age fields in df_trials to integers
    df_trials['Minimum_Age'] = df_trials['Minimum_Age'].apply(convert_age)
    df_trials['Maximum_Age'] = df_trials['Maximum_Age'].apply(convert_age)

    # List to store all matches
    matches = []

    # Chunk processing of patients
    for i in range(0, len(patient_profiles), chunk_size):
        patient_chunk = patient_profiles.iloc[i:i + chunk_size]

        # Iterate over each patient in the chunk
        for patient_id, patient_data in patient_chunk.groupby('PatientID'):
            patient_age = patient_data['Age'].values[0]
            patient_gender = patient_data['Gender'].values[0]
            is_healthy = is_healthy_volunteer(patient_data)

            # Iterate over each trial and check for matching criteria
            for _, trial in df_trials.iterrows():
                # Age match
                if not match_age(patient_age, trial['Minimum_Age'], trial['Maximum_Age']):
                    continue

                # Sex match
                if not match_sex(patient_gender, trial['Sex']):
                    continue

                # Healthy volunteers check
                if trial['Healthy_Volunteers'] and not is_healthy:
                    continue

                # If all criteria are matched, append the result
                matches.append({
                    'PatientID': patient_id,
                    'TrialID': trial['NCTId'],
                    'TrialTitle': trial['Title'],
                    'EligibilityMet': 'Age, Sex, Healthy Volunteer Matched'
                })

    # Convert the matches list to a DataFrame
    matches_df = pd.DataFrame(matches)

    return matches_df

# Example usage:
# Assuming patient_profiles and df_trials are already loaded

# Set chunk size to 100
chunk_size = 100

# Call the matching function
matched_trials = match_patients_to_trials(patient_profiles, df_trials, chunk_size=chunk_size)
matched_trials.head()

                              PatientID      TrialID  \
0  3cd46ba4-0a11-1c66-8525-3f4230e65ba3  NCT04807127   
1  3cd46ba4-0a11-1c66-8525-3f4230e65ba3  NCT04362527   
2  3cd46ba4-0a11-1c66-8525-3f4230e65ba3  NCT05034627   
3  3cd46ba4-0a11-1c66-8525-3f4230e65ba3  NCT04080427   
4  6b0b0021-df03-cbb6-305a-11ec40da7af4  NCT04807127   

                                          TrialTitle  \
0  A Single-cell Approach to Identify Biomarkers ...   
1  Milrinone Infusion for VAsospam Treatment in S...   
2  Calaspargase Pegol-Mnkl and Cobimetinib for th...   
3  Effects of Delta9-tetrahydrocannabinol (THC) o...   
4  A Single-cell Approach to Identify Biomarkers ...   

                        EligibilityMet  
0  Age, Sex, Healthy Volunteer Matched  
1  Age, Sex, Healthy Volunteer Matched  
2  Age, Sex, Healthy Volunteer Matched  
3  Age, Sex, Healthy Volunteer Matched  
4  Age, Sex, Healthy Volunteer Matched  


In [91]:
# matches_df print where Inclusion_Criteria_Met is True and Exclusion_Criteria_Met is True
matches_df[(matches_df['Inclusion_Criteria_Met'] == True) & (matches_df['Exclusion_Criteria_Met'] == True)]

Unnamed: 0,PatientID,NCTId,Title,Inclusion_Criteria_Met,Exclusion_Criteria_Met,Age,Sex


In [97]:
matched_trials.groupby('TrialID').count()

Unnamed: 0_level_0,PatientID,TrialTitle,EligibilityMet
TrialID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT04080427,181,181,181
NCT04362527,181,181,181
NCT04807127,181,181,181
NCT05034627,181,181,181
NCT05444127,18,18,18
NCT05943327,18,18,18
NCT06330727,18,18,18


In [1]:
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker

# Load the large scientific language model
nlp = spacy.load("en_core_sci_lg")

# Add the abbreviation detector to the pipeline
nlp.add_pipe("abbreviation_detector")



<scispacy.abbreviation.AbbreviationDetector at 0x26dc25596d0>

In [2]:
# Add the UMLS Entity Linker (links to UMLS which includes SNOMED-CT and RxNorm)
nlp.add_pipe("scispacy_linker", config={
    "resolve_abbreviations": True,  # To link long forms of abbreviations
    # "linker_name": "umls" 
    "linker_name": "rxnorm"
})

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


<scispacy.linking.EntityLinker at 0x26dc2d4f990>

In [8]:
# Example clinical text with abbreviations and conditions
text = "10 ML Furosemide 10 MG/ML Injection"

# Process the text to extract entities and link to UMLS concepts (including SNOMED-CT)
doc = nlp(text)

linker = nlp.get_pipe("scispacy_linker")
print(doc)
# Display abbreviations and their long forms
print("Abbreviations and their long forms:")
for abrv in doc._.abbreviations:
    print(f"{abrv} ({abrv.start}, {abrv.end}) -> {abrv._.long_form}")

# Iterate over the entities in the text and link to UMLS or RxNorm concepts
print("\nEntities and their UMLS/RxNorm mappings:")
for entity in doc.ents:
    # print(f"\nEntity Full: {entity}")
    print(f"\nEntity: {entity.text}, Type: {entity.label_}")#", Entity ID: {entity._.kb_ents}")

    # for umls_ent in entity._.kb_ents:
    #     print(linker.kb.cui_to_entity[umls_ent[0]], end="\n##################\n")
    # Get the UMLS or RxNorm entities and their confidence scores
    for kb_ent in entity._.kb_ents:
        concept_id, score = kb_ent
        concept = linker.kb.cui_to_entity[concept_id]
        # for field in concept._fields:
        #     print(f"{field}: {getattr(concept, field)}")
        print(f"  Concept ID (CUI/RxNorm ID): {concept_id}")
        print(f"  Score: {score}")
        print(f"  Preferred name: {concept.canonical_name}")
        print(f"  Definition: {concept.definition}")
        print(f"  Semantic types: {concept.types}")
        print(end="\n##################\n")  # For RxNorm/SNOMED-CT

Full-time employment (finding)
Abbreviations and their long forms:

Entities and their UMLS/RxNorm mappings:

Entity: Full-time employment, Type: ENTITY

Entity: finding, Type: ENTITY


In [27]:
# Example clinical text with abbreviations and conditions
text = "Did not recover from SAM in the past month"

# Process the text to extract entities and link to UMLS concepts (including SNOMED-CT)
doc = nlp(text)

linker = nlp.get_pipe("scispacy_linker")
print(doc)
# Display abbreviations and their long forms
print("Abbreviations and their long forms:")
for abrv in doc._.abbreviations:
    print(f"{abrv} ({abrv.start}, {abrv.end}) -> {abrv._.long_form}")

# Iterate over the entities in the text and link to UMLS or RxNorm concepts
print("\nEntities and their UMLS/RxNorm mappings:")
for entity in doc.ents:
    # print(f"\nEntity Full: {entity}")
    print(f"\nEntity: {entity.text}, Type: {entity.label_}")#", Entity ID: {entity._.kb_ents}")

    # for umls_ent in entity._.kb_ents:
    #     print(linker.kb.cui_to_entity[umls_ent[0]], end="\n##################\n")
    # Get the UMLS or RxNorm entities and their confidence scores
    for kb_ent in entity._.kb_ents:
        concept_id, score = kb_ent
        concept = linker.kb.cui_to_entity[concept_id]
        # for field in concept._fields:
        #     print(f"{field}: {getattr(concept, field)}")
        print(f"  Concept ID (CUI/RxNorm ID): {concept_id}")
        print(f"  Score: {score}")
        print(f"  Preferred name: {concept.canonical_name}")
        print(f"  Definition: {concept.definition}")
        print(f"  Semantic types: {concept.types}")
        print(end="\n##################\n")  # For RxNorm/SNOMED-CT

Did not recover from SAM in the past month
Abbreviations and their long forms:

Entities and their UMLS/RxNorm mappings:

Entity: SAM, Type: ENTITY
  Concept ID (CUI/RxNorm ID): C0036002
  Score: 0.990989625453949
  Preferred name: S-adenosylmethionine
  Definition: Physiologic methyl radical donor involved in enzymatic transmethylation reactions and present in all living organisms. It possesses anti-inflammatory activity and has been used in treatment of chronic liver disease. (From Merck, 11th ed)
  Semantic types: ['T116', 'T121', 'T123']

##################
  Concept ID (CUI/RxNorm ID): C0279268
  Score: 0.990989625453949
  Preferred name: doxorubicin/semustine/streptozocin protocol
  Definition: A chemotherapy regimen consisting of doxorubicin, semustine, and streptozocin that may be used in the treatment of pancreatic cancer.
  Semantic types: ['T061']

##################
  Concept ID (CUI/RxNorm ID): C1563296
  Score: 0.990989625453949
  Preferred name: Systolic anterior movemen

In [1]:
from medcat.cat import CAT

# Download the model_pack from the models section in the github repo.
cat = CAT.load_model_pack('C:/Users/cx-admin/Downloads/umls_sm_pt2ch_533bab5115c6c2d6.zip')



  from tqdm.autonotebook import tqdm, trange
  meta_cat.model.load_state_dict(torch.load(model_save_path, map_location=device))


{'entities': {1: {'pretty_name': 'Kidney Failure', 'cui': 'C0035078', 'type_ids': ['T047'], 'types': ['Disease or Syndrome'], 'source_value': 'kidney failure', 'detected_name': 'kidney~failure', 'acc': 1.0, 'context_similarity': 1.0, 'start': 24, 'end': 38, 'icd10': [{'chapter': 'N17-N19.9', 'name': 'Renal failure'}, {'chapter': 'N19', 'name': 'Unspecified renal failure'}], 'ontologies': [], 'snomed': ['S-156092003', 'S-197656003', 'S-198524000', 'S-266553002', 'S-266616000', 'S-42399005', 'S-723188008'], 'id': 1, 'meta_anns': {'Status': {'value': 'Affirmed', 'confidence': 0.9684659242630005, 'name': 'Status'}}}}, 'tokens': []}


In [7]:
# Test it
import json
text = "Positive hepatitis B surface antigen and hepatitis B virus quantification \\> 1 × 1000 copies/ml, or positive anti-hepatitis C virus antibody;"
entities = cat.get_entities(text)
print(json.dumps(entities, indent=2))

{
  "entities": {
    "0": {
      "pretty_name": "Positive",
      "cui": "C1446409",
      "type_ids": [
        "T033"
      ],
      "types": [
        "Finding"
      ],
      "source_value": "Positive",
      "detected_name": "positive",
      "acc": 0.7033594941327916,
      "context_similarity": 0.7033594941327916,
      "start": 0,
      "end": 8,
      "icd10": [],
      "ontologies": [],
      "snomed": [
        "S-10828004",
        "S-393474000",
        "S-394424008"
      ],
      "id": 0,
      "meta_anns": {
        "Status": {
          "value": "Other",
          "confidence": 0.9988605976104736,
          "name": "Status"
        }
      }
    },
    "3": {
      "pretty_name": "Hepatitis B Surface Antigens",
      "cui": "C0019168",
      "type_ids": [
        "T129"
      ],
      "types": [
        "Immunologic Factor"
      ],
      "source_value": "hepatitis B surface antigen",
      "detected_name": "hepatitis~b~surface~antigen",
      "acc": 0.90401701992062