<a href="https://colab.research.google.com/github/cartiktq/AcademicAnalytics/blob/main/KnowledgeGraphGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
S = """from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Any
from agents import (
    generate_notes,             # Node 1
    anonymize_notes,           # Node 2
    evaluate_anonymization,    # Node 3
    extract_entities,          # Node 4
    map_to_umls_concepts,      # Node 5
    map_to_local_aui,          # Node 6
    build_patient_knowledge_graph,  # Node 7
    visualize_knowledge_graph       # Node 8
)

class WorkflowState(TypedDict):
    raw_notes_folder: str
    anonymized_folder: str
    comparison_metric: float
    extracted_csv: str
    umls_csv: str
    aui_csv: str
    graph_objects: List[Dict[str, Any]]
    visualizations: List[Any]

graph = StateGraph(WorkflowState)

# Register nodes
graph.add_node("generate_notes", generate_notes)
graph.add_node("anonymize_notes", anonymize_notes)
graph.add_node("evaluate_anonymization", evaluate_anonymization)
graph.add_node("extract_entities", extract_entities)
graph.add_node("map_to_umls_concepts", map_to_umls_concepts)
graph.add_node("map_to_local_aui", map_to_local_aui)
graph.add_node("build_knowledge_graph", build_patient_knowledge_graph)
graph.add_node("visualize_knowledge_graph", visualize_knowledge_graph)

# Edges (linear)
graph.set_entry_point("generate_notes")
graph.add_edge("generate_notes", "anonymize_notes")
graph.add_edge("anonymize_notes", "evaluate_anonymization")
graph.add_edge("evaluate_anonymization", "extract_entities")
graph.add_edge("extract_entities", "map_to_umls_concepts")
graph.add_edge("map_to_umls_concepts", "map_to_local_aui")
graph.add_edge("map_to_local_aui", "build_knowledge_graph")
graph.add_edge("build_knowledge_graph", "visualize_knowledge_graph")
graph.add_edge("visualize_knowledge_graph", END)

app = graph.compile()
"""

In [2]:
! pip install faker transformers torch sacremoses langgraph



In [3]:
#def generate_notes(state: WorkflowState) -> WorkflowState:
def generate_notes():
    from faker import Faker
    import os
    import random
    from datetime import datetime, timedelta

    fake = Faker()
    output_dir = "raw_clinical_notes"
    os.makedirs(output_dir, exist_ok=True)

    physical_symptoms = [
    "Developmental Delay", "Speech and Language Impairment", "Hypotonia", "Seizures",
    "Gastrointestinal Issues", "Sleep Disturbances", "Dysmorphia", "Lymphedema",
    "Renal Abnormalities", "Thermoregulation Issues", "Abnormalities of Nails"
]
    behavioral_symptoms = [
    "Autism Spectrum Disorder (ASD) Features", "Intellectual Disability", "Anxiety",
    "Aggression/Self-Injurious Behaviors", "Hyperactivity/Impulsivity",
    "Sensory Processing Differences", "Compulsive Behaviors", "Mood Dysregulation"
]
    prescriptions = [
    "Levetiracetam", "Valproic Acid", "Lamotrigine", "Fluoxetine", "Sertraline",
    "Risperidone", "Aripiprazole", "Melatonin", "Polyethylene glycol", "Omeprazole",
    "Ranitidine", "Methylphenidate"
]
    lab_tests = [
    "Chromosomal Microarray Analysis (CMA)", "FISH", "Karyotype", "EEG",
    "Metabolic Screens", "Renal Ultrasound", "GI Motility Studies"
]
    therapies = [
    "Early Intervention Programs", "Speech and Language Therapy (SLT)",
    "Occupational Therapy (OT)", "Physical Therapy (PT)", "Applied Behavior Analysis (ABA) Therapy",
    "Feeding Therapy", "Behavioral Therapy/Parent Training", "Surgical Interventions"
]
    comorbidities = [
    "Autism Spectrum Disorder (ASD)", "Epilepsy/Seizure Disorder", "Gastrointestinal Disorders",
    "Sleep Disorders", "Anxiety Disorders", "ADHD", "Obesity", "Lymphedema",
    "Renal Anomalies", "Immunodeficiency"
]

    for patient_id in range(1, 101):
        num_visits = random.randint(8, 10)
        base_date = datetime.today()
        for visit in range(num_visits):
            visit_date = base_date - timedelta(days=random.randint(0, 365*2))
            note = f"""
            Patient: {fake.name()}
            DOB: {fake.date_of_birth()}
            Visit Date: {visit_date.strftime('%Y-%m-%d')}
            Diagnoses: Phelan-McDermid Syndrome
            Symptoms: {random.sample(physical_symptoms, 4)}
            Behavioral Symptoms: {random.sample(behavioral_symptoms, 2)}
            Labs: {random.sample(lab_tests, 2)}
            Prescriptions: {random.sample(prescriptions, 2)}
            Therapies: {random.sample(therapies, 2)}
            Comorbidities: {random.sample(comorbidities, 2)}
            """
            fname = f"clinical_note_{visit}_for_patient_{patient_id}_{visit_date.strftime('%Y-%m-%d')}.txt"
            with open(os.path.join(output_dir, fname), "w") as f:
                f.write(note)

#    state["raw_notes_folder"] = output_dir
#    return state


In [4]:
generate_notes()

In [5]:
! pip install presidio-analyzer presidio-anonymizer

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.358-py3-none-any.whl.metadata (3.2 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.358-py3-none-any.whl.metadata (8.1 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.358-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading presidio_anonymizer-2.2.358-py3-none-any.whl (31 kB)
Downloading phonenumbers-8.13.55-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m8

In [6]:
import os
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

#def anonymize_notes(state: dict) -> dict:
def anonymize_notes():
    #raw_dir = state["raw_notes_folder"]
    raw_dir = os.path.join(os.getcwd(), "raw_clinical_notes")
    anon_dir = "anonymized_clinical_notes"
    os.makedirs(anon_dir, exist_ok=True)

    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    for fname in os.listdir(raw_dir):
        if not fname.endswith(".txt"):
            continue
        with open(os.path.join(raw_dir, fname), "r") as f:
            text = f.read()

        # Run PHI detection
        results = analyzer.analyze(text=text, entities=None, language="en")

        # Anonymize detected PHI entities
        anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results).text

        # Write to new anonymized file
        new_fname = f"anonymized_{fname}"
        with open(os.path.join(anon_dir, new_fname), "w") as out_f:
            out_f.write(anonymized_text)

#    state["anonymized_folder"] = anon_dir
#    return state


In [7]:
anonymize_notes()



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




In [37]:
import os
import shutil
from collections import defaultdict
from typing import List
from presidio_analyzer import AnalyzerEngine, RecognizerResult

def extract_phi(file_path: str, analyzer: AnalyzerEngine) -> List[RecognizerResult]:
    with open(file_path, "r") as f:
        text = f.read()
    return analyzer.analyze(text=text, entities=None, language="en")

#def evaluate_anonymization(state: dict) -> dict:
def evaluate_anonymization():
#    raw_dir = state["raw_notes_folder"]
#    anon_dir = state["anonymized_folder"]
    raw_dir = os.path.join(os.getcwd(), "raw_clinical_notes")
    anon_dir = os.path.join(os.getcwd(), "anonymized_clinical_notes")
    analyzer = AnalyzerEngine()

    phi_counts_raw = 0
    phi_counts_anon = 0

    false_negatives = defaultdict(list)  # {file: [(entity_type, matched_text)]}
    fileCount = len(os.listdir(raw_dir))

    print(f"Evaluating the effectiveness of the anonymizer in removing PHI across {fileCount} files")

    for fname in os.listdir(raw_dir):
        if not fname.endswith(".txt"):
            continue

        raw_path = os.path.join(raw_dir, fname)
        anon_path = os.path.join(anon_dir, f"anonymized_{fname}")

        raw_entities = extract_phi(raw_path, analyzer)
        anon_entities = extract_phi(anon_path, analyzer)

        phi_counts_raw += len(raw_entities)
        phi_counts_anon += len(anon_entities)

        for ent in anon_entities:
            # Changed ent.text to ent.original_text to access the matched text
            false_negatives[fname].append((ent.entity_type, ent.contains, ent.contained_in))


    # Compute effectiveness metric
    if phi_counts_raw == 0:
        effectiveness = 100.0 if phi_counts_anon == 0 else 0.0
    else:
        effectiveness = (1 - (phi_counts_anon / phi_counts_raw)) * 100

    # Print metric summary
    print(f"\n🔍 Anonymization Effectiveness: {effectiveness:.2f}%")
    print(f"Total PHI elements (raw): {phi_counts_raw}")
    print(f"Remaining PHI elements (anonymized): {phi_counts_anon}\n")

    # Print false negatives summary
    if false_negatives:
        quarantine_dir = os.path.join(os.getcwd(), "quarantined_clinical_notes")
        os.makedirs(quarantine_dir, exist_ok=True)
        print("🚨 False Negatives (Missed PHI Elements):")
        for fname, entities in false_negatives.items():
            print(f"  File: {fname}")
            for matched_text in entities:
                anon_fname = f"anonymized_{fname}"
                source_file = os.path.join(anon_dir, "" + anon_fname)
                destination_file = os.path.join(quarantine_dir, anon_fname)
                print(f"    -  {matched_text[1]}")
                print(f"Removing file: {anon_fname} from anonymized directory as it contains PHI elements")
                try:
                  shutil.move(source_file, quarantine_dir)
                  #os.rename(source_file, destination_file)
                except FileNotFoundError:
                  print(f"Source file '{source_file}' not found.")
                except FileExistsError:
                  print(f"Destination file '{destination_file}' already exists.")
                except OSError as e:
                  print(f"Error moving file: {e}")
        print("All files containing PHI elements have been moved to quarantine")
        print(f"There are now {len(os.listdir(anon_dir))} files in the anonymized folder")

    else:
        print("✅ No false negatives detected. All PHI was successfully removed.")

#    state["comparison_metric"] = effectiveness
#    return state


In [38]:
evaluate_anonymization()



Evaluating the effectiveness of the anonymizer in removing PHI across 900 files

🔍 Anonymization Effectiveness: 99.71%
Total PHI elements (raw): 5927
Remaining PHI elements (anonymized): 17

🚨 False Negatives (Missed PHI Elements):
  File: clinical_note_6_for_patient_15_2025-01-29.txt
    -  <bound method RecognizerResult.contains of type: PERSON, start: 433, end: 444, score: 0.85>
Removing file: anonymized_clinical_note_6_for_patient_15_2025-01-29.txt from anonymized directory as it contains PHI elements
  File: clinical_note_0_for_patient_77_2024-09-19.txt
    -  <bound method RecognizerResult.contains of type: PERSON, start: 431, end: 442, score: 0.85>
Removing file: anonymized_clinical_note_0_for_patient_77_2024-09-19.txt from anonymized directory as it contains PHI elements
  File: clinical_note_7_for_patient_52_2024-11-14.txt
    -  <bound method RecognizerResult.contains of type: LOCATION, start: 35, end: 53, score: 0.85>
Removing file: anonymized_clinical_note_7_for_patient_52_