## Load SciSpaCy and the Model
# This code loads the SciSpaCy medical NLP model
# Processess a sample eligibility statement
# Shows the entities it recognized (like conditions, scores, medications)

In [1]:
# Load the clinical model from SciSpaCy
import spacy
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [8]:
# Sample sentence from the trial
text = "PHQ-9 score >= 10 at screening"

In [2]:
# Process the text
text = "PHQ-0 score >=10 at screening"
doc = nlp(text)

In [3]:
# Display named entities
for ent in doc.ents:
    print(ent.text, ent.label_)

PHQ-0 ENTITY
screening ENTITY


## Add your full criteria list to the notebook

In [4]:
criteria = {
    "inclusion": [
        "18 Years to 70 Years (Adult, Older Adult)"
        "Veteran of OEF/OIF/OND deployments",
        "Depressive severity of >= 10 on the PHQ-9 at screening",
        "Patient may also have a DSM-5 56 diagnosis of unipolar Major Depression; PTSD; any anxiety disorder; substance/alcohol abuse; or adjustment disorder",
        "Ability to read = eighth grade level",
        "Can provide informed consent",
        "Patients may be taking antidepressants/antianxiety medications where dose has been stable >= 4 weeks prior to screening evaluation",
        "Must have an Android smartphone",
        "Must be willing to be audio-taped for fidelity ratings"
    ],
    "exclusion": [
        "Diagnoses of schizophrenia, schizo-affective, bipolar, or other psychotic disorder",
        "Serious suicidal risk (Patient responds positively to PHQ-9 question #9)",
        "Severe PTSD (Score > 51 on PTSD Checklist for DSM-5)",
        "Severe substance or alcohol dependence (meets DSM-5 criteria of severe)"
    ]
}

## Process and display entities with labels
# A list of each inlcusion/exlusion statement
# Any recognized entities (like conditions, scores, time periods)

In [5]:
for label, statements in criteria.items():
    print(f"\n== {label.upper()} CRITERIA ===")
    for stmt in statements:
        doc = nlp(stmt)
        print(f"\nüîπ{stmt}")
        for ent in doc.ents:
            print(f"  - {ent.text} -> {ent.label_}")


== INCLUSION CRITERIA ===

üîπ18 Years to 70 Years (Adult, Older Adult)Veteran of OEF/OIF/OND deployments
  - Years -> ENTITY
  - Adult -> ENTITY
  - Older Adult)Veteran -> ENTITY
  - OEF/OIF/OND deployments -> ENTITY

üîπDepressive severity of >= 10 on the PHQ-9 at screening
  - Depressive severity -> ENTITY
  - PHQ-9 -> ENTITY
  - screening -> ENTITY

üîπPatient may also have a DSM-5 56 diagnosis of unipolar Major Depression; PTSD; any anxiety disorder; substance/alcohol abuse; or adjustment disorder
  - Patient -> ENTITY
  - DSM-5 -> ENTITY
  - diagnosis -> ENTITY
  - unipolar Major Depression -> ENTITY
  - PTSD -> ENTITY
  - anxiety disorder -> ENTITY
  - substance/alcohol abuse -> ENTITY
  - adjustment disorder -> ENTITY

üîπAbility to read = eighth grade level
  - level -> ENTITY

üîπCan provide informed consent

üîπPatients may be taking antidepressants/antianxiety medications where dose has been stable >= 4 weeks prior to screening evaluation
  - Patients -> ENTITY
  - a

## Structuring eligibility criteria as concepts (diagnosis, symptoms, medications) - JSON format for future downstream matching with Synthea patients
# JSON because it gives a clean, machine-readable way to store inclusion/exclusion rules
# A structure you can use to match against patient data fields
# Easy way to extend rules later (eg. to add thresholds, severity, conditions, meds)

In [7]:
structured_criteria = {
    "inclusion": [
        {"id": 1,
            "concepts": ["veteran", "OEF/OIF/OND"],
            "field": "military_history",
            "priority": "high",
            "weight": 10
        },
        {"id": 2,
            "concepts": ["PHQ-9", "phq9", "depressive severity"],
            "field": "mental_health_assessment",
            "priority": "high",
            "weight": 9 
        },
        {"id": 3,
            "concepts": ["major depression", "PTSD", "anxiety", "substance abuse", "adjustment disorder"],
            "field": "conditions",
            "priority": "high",
            "weight": 8    
        },
        {"id": 4,
            "concepts": ["age 18", "age 70", "adult"],
            "field": "demographics",
            "priority": "high",
            "weight": 10
        },
        {"id": 5,
            "concepts": ["eighth grade reading level"],
            "field": "literacy",
            "priority": "medium",
            "weight": 5
        },
        {"id": 6,
            "concepts": ["informed consent"],
            "field": "legal_consent",
            "priority": "medium",
            "weight": 6
        },
        {"id": 7,
            "concepts": ["antidepressant", "antianxiety", "stable dose", "4 weeks"],
            "field": "medications",
            "priority": "medium",
            "weight": 6
        },
        {"id": 8,
            "concepts": ["android smartphone"],
            "field": "device",
            "priority": "low",
            "weight": 10
        },
        {"id": 9,
            "concepts": ["audio-taped", "recorded", "fidelity"],
            "field": "consent_media",
            "priority": "low",
            "weight": 2
        },
    ],
    "exclusion": [
        {"id": 1, "concepts": ["schizophrenia", "schizo-affective", "bipolar", "psychotic disorder"], "field": "conditions"},
        {"id": 2, "concepts": ["suicidal risk", "PHQ-9 question 9"], "field": "mental_health"},
        {"id": 3, "concepts": ["severe PTSD", "PTSD Checkkist", "score >51"], "field": "conditions"},
        {"id": 4, "concepts": ["severe substance or alcohol dependence", "DSM-5 criteria severe"], "field": "conditions"},
    ]
}

In [13]:
# Adjusting to the Ubuntu QuickUMLS data path
from quickumls import QuickUMLS

matcher = QuickUMLS(
    '/mnt/c/Users/Gauri/QuickUMLS/quickumls_data',
    threshold=0.85,
    similarity_name='jaccard',
    window=5
)

def map_concepts_with_quickumls(concepts):
    mapped = set()
    for concept in concepts:
        matches = matcher.match(concept, best_match=True, ignore_syntax=False)
        print(f"Matching '{concept}' => {matches}")  # DEBUG
        for m in matches:
            for candidate in m:
                if candidate['source'] == 'SNOMEDCT_US':
                    mapped.add(candidate['term'].lower())
    return list(mapped) if mapped else concepts

# Fields to map
mapped_fields = {'conditions', 'mental_health_assessment', 'mental_health', 'medications'}

def map_structured_criteria(criteria):
    updated = {"inclusion": [], "exclusion": []}
    for section in ['inclusion', 'exclusion']:
        for rule in criteria[section]:
            updated_rule = rule.copy()
            if rule['field'] in mapped_fields:
                updated_rule['concepts'] = map_concepts_with_quickumls(rule['concepts'])
            updated[section].append(updated_rule)
    return updated

mapped_criteria = map_structured_criteria(structured_criteria)

print(mapped_criteria)

Matching 'PHQ-9' => []
Matching 'phq9' => []
Matching 'depressive severity' => []
Matching 'major depression' => []
Matching 'PTSD' => []
Matching 'anxiety' => []
Matching 'substance abuse' => []
Matching 'adjustment disorder' => []
Matching 'antidepressant' => []
Matching 'antianxiety' => []
Matching 'stable dose' => []
Matching '4 weeks' => []
Matching 'schizophrenia' => []
Matching 'schizo-affective' => []
Matching 'bipolar' => []
Matching 'psychotic disorder' => []
Matching 'suicidal risk' => []
Matching 'PHQ-9 question 9' => []
Matching 'severe PTSD' => []
Matching 'PTSD Checkkist' => []
Matching 'score >51' => []
Matching 'severe substance or alcohol dependence' => []
Matching 'DSM-5 criteria severe' => []
{'inclusion': [{'id': 1, 'concepts': ['veteran', 'OEF/OIF/OND'], 'field': 'military_history', 'priority': 'high', 'weight': 10}, {'id': 2, 'concepts': ['PHQ-9', 'phq9', 'depressive severity'], 'field': 'mental_health_assessment', 'priority': 'high', 'weight': 9}, {'id': 3, 'con

In [14]:
test_concepts = [
    "major depressive disorder",
    "post-traumatic stress disorder",
    "schizophrenia",
    "suicidal ideation",
    "bipolar disorder",
    "antidepressant",
    "anxiety disorder"
]

def map_concepts_with_quickumls(concepts):
    mapped = set()
    for concept in concepts:
        matches = matcher.match(concept, best_match=True, ignore_syntax=False)
        print(f"\n=== Matches for '{concept}' ===")
        for m in matches:
            for candidate in m:
                print(candidate)  # Print all candidates
                if candidate['source'] == 'SNOMEDCT_US':
                    mapped.add(candidate['term'].lower())
    return list(mapped) if mapped else concepts

mapped = map_concepts_with_quickumls(test_concepts)
print("\n‚úÖ Mapped terms:", mapped)


=== Matches for 'major depressive disorder' ===

=== Matches for 'post-traumatic stress disorder' ===

=== Matches for 'schizophrenia' ===

=== Matches for 'suicidal ideation' ===

=== Matches for 'bipolar disorder' ===

=== Matches for 'antidepressant' ===

=== Matches for 'anxiety disorder' ===

‚úÖ Mapped terms: ['major depressive disorder', 'post-traumatic stress disorder', 'schizophrenia', 'suicidal ideation', 'bipolar disorder', 'antidepressant', 'anxiety disorder']


In [10]:
import json
import os
from docx import Document

# Load patient JSON
def load_patient_conditions(json_path):
    with open(json_path, 'r') as file:
        data = json.load(file)

    conditions = []
    for entry in data.get('entry', []):
        if entry['resource']['resourceType'] == 'Condition':
            coding = entry['resource'].get('code', {}).get('coding', [])
            if coding:
                display = coding[0].get('display')
                if display:
                    conditions.append(display.lower())
    return conditions

# Match patient against structured criteria
def match_patient_to_criteria(patient_conditions, criteria):
    matched_inclusion = []
    failed_exclusion = []

    for rule in criteria['inclusion']:
        if rule['field'] != 'conditions':
            continue  # For now, evaluating only condition-based criteria

        for term in rule['concepts']:
            if any(term.lower() in cond for cond in patient_conditions):
                matched_inclusion.append(rule['id'])
                break

    for rule in criteria['exclusion']:
        if rule['field'] != 'conditions':
            continue
        for term in rule['concepts']:
            if any(term.lower() in cond for cond in patient_conditions):
                failed_exclusion.append(rule['id'])
                break

    return matched_inclusion, failed_exclusion

# Base input/output paths
input_base = "E:/Gauri/Project/Synthea dataset/output_2/fhir"
output_base = "E:/Gauri/Project/Results/output_2"

# Loop over immediate subfolders in /fhir (e.g., 00, 01)
for top_folder in os.listdir(input_base):
    top_folder_path = os.path.join(input_base, top_folder)
    if not os.path.isdir(top_folder_path):
        continue

    # Create a Word document for each top-level subfolder
    doc = Document()
    doc.add_heading(f'Trial Match Results for Folder: {top_folder}', level=1)
    eligible_patients_found = False  # flag
    
    # Recursively process all JSONs inside this subfolder
    for root, _, files in os.walk(top_folder_path):
        for filename in files:
            if filename.endswith(".json"):
                file_path = os.path.join(root, filename)
                try:
                    conditions = load_patient_conditions(file_path)
                    matched, failed = match_patient_to_criteria(conditions, structured_criteria)

                    if matched and not failed:
                        eligible_patients_found = True
                        doc.add_heading(f'File: {filename}', level=2)
                        doc.add_paragraph(f"Extracted Conditions: {', '.join(conditions)}")
                        doc.add_paragraph(f"Matched Inclusion Criteria IDs: {matched}")
                        doc.add_paragraph(f"Failed Exclusion Criteria IDs: {failed}")
                        doc.add_paragraph("‚úÖ ELIGIBLE")

                except Exception as e:
                    doc.add_heading(f"‚ö†Ô∏è Error reading {filename}", level=2)
                    doc.add_paragraph(str(e))

                    if failed:
                        doc.add_paragraph("‚ùå NOT ELIGIBLE (Exclusion criteria met)")
                    elif matched:
                        doc.add_paragraph("‚úÖ ELIGIBLE")
                    else:
                        doc.add_paragraph("‚ö†Ô∏è No matching inclusion criteria")

                except Exception as e:
                    doc.add_heading(f"‚ö†Ô∏è Error reading {filename}", level=2)
                    doc.add_paragraph(str(e))

    if eligible_patients_found:
        output_path = os.path.join(output_base, f"Trial_Match_Results_All_Subfolders_{top_folder}.docx")
        doc.save(output_path)
        print(f"‚úÖ Saved: {output_path}")
    else:
        print(f"‚ö†Ô∏è No eligible patients found in folder: {top_folder}")

‚ö†Ô∏è No eligible patients found in folder: 00
‚ö†Ô∏è No eligible patients found in folder: 01
‚ö†Ô∏è No eligible patients found in folder: 02
‚ö†Ô∏è No eligible patients found in folder: 03
‚ö†Ô∏è No eligible patients found in folder: 04
‚ö†Ô∏è No eligible patients found in folder: 05
‚ö†Ô∏è No eligible patients found in folder: 06
‚ö†Ô∏è No eligible patients found in folder: 07
‚ö†Ô∏è No eligible patients found in folder: 08
‚ö†Ô∏è No eligible patients found in folder: 09
‚ö†Ô∏è No eligible patients found in folder: 0a
‚ö†Ô∏è No eligible patients found in folder: 0b
‚ö†Ô∏è No eligible patients found in folder: 0c
‚ö†Ô∏è No eligible patients found in folder: 0d
‚ö†Ô∏è No eligible patients found in folder: 0e
‚ö†Ô∏è No eligible patients found in folder: 0f
‚ö†Ô∏è No eligible patients found in folder: 10
‚ö†Ô∏è No eligible patients found in folder: 11
‚ö†Ô∏è No eligible patients found in folder: 12
‚ö†Ô∏è No eligible patients found in folder: 13
‚ö†Ô∏è No eligible patients found in fol

KeyboardInterrupt: 