In [None]:
import llm_utils
import importlib
from llm_utils import analyze_clinical_note, summarize_clinical_note
sample_test = "A 15-day-old neonate presented with 3 days of irritability, fever (38.5°C), poor sucking, and left preauricular swelling. Examination showed a 5 cm × 5 cm fluctuant left parotid gland swelling with pus from the Stensen's duct. Laboratory results indicated elevated white blood cells (17.6 × 10^9/L). Ultrasound suggested acute suppurative parotitis. Initial treatment involved intravenous cefotaxime and rehydration, followed by surgical drainage. Pus culture identified methicillin-resistant S. aureus, prompting a switch to intravenous vancomycin for 10 days, leading to full recovery without residual parotid issues."
llm_response = analyze_clinical_note(sample_test)
llm_response 

In [None]:
print(llm_response.identified_codes)
summary = summarize_clinical_note(example1)
summary

In [None]:
import llm_prompts
sample_test = "A 15-day-old neonate presented with 3 days of irritability, fever (38.5°C), poor sucking, and left preauricular swelling. Examination showed a 5 cm × 5 cm fluctuant left parotid gland swelling with pus from the Stensen's duct. Laboratory results indicated elevated white blood cells (17.6 × 10^9/L). Ultrasound suggested acute suppurative parotitis. Initial treatment involved intravenous cefotaxime and rehydration, followed by surgical drainage. Pus culture identified methicillin-resistant S. aureus, prompting a switch to intravenous vancomycin for 10 days, leading to full recovery without residual parotid issues."
prompt = llm_prompts.ICD10_DETECTION_PROMPT_v2.format(clinical_note=sample_test)


In [9]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import scispacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
# Download stopwords (run this once)
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("punkt_tab")
# Load SciSpacy medical NLP model (choose 'en_core_sci_md' or 'en_core_sci_lg' for better results)
nlp = spacy.load("en_core_sci_md")
# Add the abbreviation pipe to the spacy pipeline
nlp.add_pipe("abbreviation_detector")
# linker = EntityLinker()
# nlp.add_pipe('entityLinker')

<scispacy.abbreviation.AbbreviationDetector at 0x6521e8110>

In [10]:
# Add the entity linking pipe to the spacy pipeline
# nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "filter_for_definitions": False})
# Medical abbreviation dictionary (expandable)
medical_abbreviations = {
    "zes": "zotarolimus-eluting stent",
    "bp": "blood pressure",
    "cad": "coronary artery disease",
    "hba1c": "glycated hemoglobin"
}

# Stopwords (keep important medical terms)
stop_words = set(stopwords.words("english"))

In [14]:

def preprocess_clinical_text(text):
    """
    Preprocesses clinical notes by applying:
    - Lowercasing
    - Removing fancy characters
    - Expanding abbreviations
    - Tokenization
    - Removing non-medical stopwords
    - Lemmatization
    - Keeping numbers
    - Named Entity Recognition (NER) for medical terms
    """
      # 1. Lowercasing (Preserve case for known medical terms)
    text = text.lower()

    # 2. Expand medical abbreviations
    for abbr, full_form in medical_abbreviations.items():
        text = re.sub(r"\b" + re.escape(abbr) + r"\b", full_form, text)

    # 3. Remove unnecessary characters, keeping medical symbols
    text = re.sub(r"[^a-zA-Z0-9\s.%/-]", "", text)  # Keep %, /, and -

    # 4. Tokenization
    tokens = word_tokenize(text)

    # 5. Stopword Removal (except medical words)
    tokens = [word for word in tokens if word not in stop_words]

    # 6. Lemmatization
    doc = nlp(" ".join(tokens))
    lemmatized_tokens = [token.lemma_ for token in doc]

    # 7. Reconstruct cleaned text
    cleaned_text = " ".join(lemmatized_tokens)

    
    # 7. Named Entity Recognition (NER) - Extract Diagnoses, Symptoms, Procedures
    """Fix it later 
    extracted_entities = {
        "diagnoses": [],
        "symptoms": [],
        "procedures": [],
        "medications": []
    }

    for ent in doc.ents:
        # Link entity to UMLS (Unified Medical Language System)
        if len(ent._.umls_ents) > 0:
            concept_id = ent._.umls_ents[0][0]  # Get UMLS concept ID
            extracted_entities["diagnoses"].append(ent.text) if "Disease" in concept_id else None
            extracted_entities["symptoms"].append(ent.text) if "Sign_or_Symptom" in concept_id else None
            extracted_entities["procedures"].append(ent.text) if "Procedure" in concept_id else None
            extracted_entities["medications"].append(ent.text) if "Drug_or_Chemical" in concept_id else None
    """
    return cleaned_text


# Example Patient Note
clinical_note = """
A 72-year-old male underwent coronary artery bypass surgery 10 years ago.
He suffered from exertional chest pain and underwent a follow-up coronary angiogram.
Coronary angiography showed 80% luminal narrowing. He was treated with ZESs.
BP was 140/90. HbA1c: 6.5%.
"""

# Apply Preprocessing
cleaned_text = preprocess_clinical_text(clinical_note)

# Print Output
print("🔹 Cleaned Text:\n", cleaned_text)
# print("\n🔹 Extracted Medical Entities:\n", extracted_medical_entities)


🔹 Cleaned Text:
 72-year-old male undergo coronary artery bypass surgery 10 year ago . suffer exertional chest pain undergo follow-up coronary angiogram . coronary angiography show 80 % luminal narrowing . treat zess . blood pressure 140/90 . glycated hemoglobin 6.5 % .


  global_matches = self.global_matcher(doc)
