# scispaCy Medical Entity Recognition

This notebook tests scispaCy's capabilities for recognizing medical entities in interpreter transcripts.

## Objectives
1. Test medical entity recognition (diseases, medications, procedures)
2. Benchmark accuracy on medical terminology
3. Test negation detection with negspacy (NegEx)
4. Identify medical concept omissions
5. Link entities to UMLS concepts

## Setup
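Requires scispaCy models (install commands are in the setup cell below):
- `en_core_sci_lg` - Large biomedical model (the pipeline this notebook loads)
- `en_ner_bc5cdr_md` - Disease & Chemical recognition (listed for reference; the setup cell below does not load it)

The setup cell also imports `negspacy` (negation detection) and `pandas`.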

In [None]:
import spacy
import scispacy
from scispacy.linking import EntityLinker
from spacy import displacy
import pandas as pd
from negspacy.negation import Negex

# Install scispaCy models (run once; uncomment when executing inside a notebook):
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz

# Load scispaCy model. Every function in this notebook reads this
# module-level `nlp` object, so this cell must run first.
nlp = spacy.load("en_core_sci_lg")

# Add entity linker (links to UMLS). Later cells read its output from
# `ent._.kb_ents` and look concepts up through `linker.kb.cui_to_entity`.
# NOTE(review): `resolve_abbreviations` presumably only has an effect when an
# abbreviation-detector component is also in the pipeline; none is added
# here -- confirm against the scispaCy docs.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

# Add negation detection. Later cells read the result from `ent._.negex`.
nlp.add_pipe("negex")

# Report the final component order so the cell output shows what was built.
print("✅ scispaCy models loaded")
print(f"Pipeline: {nlp.pipe_names}")

## Test 1: Medical Entity Recognition

In [None]:
def extract_medical_entities(text: str) -> pd.DataFrame:
    """
    Extract medical entities from text

    Runs ``text`` through the shared module-level ``nlp`` pipeline and
    tabulates every span in ``doc.ents``.

    Args:
        text: Raw text to analyse.

    Returns:
        A DataFrame with one row per entity and the columns ``text``,
        ``label``, ``start`` and ``end`` (character offsets into ``text``).
        The columns are present even when no entity is found.
    """
    doc = nlp(text)

    records = [
        {
            'text': ent.text,
            'label': ent.label_,
            'start': ent.start_char,
            'end': ent.end_char,
        }
        for ent in doc.ents
    ]

    # Pin the column list: without it an entity-free input produces a
    # DataFrame with no columns, and callers indexing e.g. df['label'] fail.
    return pd.DataFrame(records, columns=['text', 'label', 'start', 'end'])

# Example: a short clinical note mentioning a diagnosis, three drugs and
# two chronic conditions.
medical_text = """
The patient presented with acute myocardial infarction. 
We initiated treatment with aspirin, clopidogrel, and atorvastatin. 
The patient has a history of type 2 diabetes mellitus and hypertension.
"""

# Runs the pipeline when the cell executes and prints the entity table.
entities_df = extract_medical_entities(medical_text)
print(entities_df)

## Test 2: Entity Visualization

In [None]:
def visualize_medical_entities(text: str):
    """
    Render the entities found in ``text`` with displaCy.

    The text is processed by the shared module-level ``nlp`` pipeline and
    drawn with the ``ent`` style in Jupyter mode. Nothing is returned.
    """
    # Parse and render in one step; the parsed document is not needed afterwards.
    displacy.render(nlp(text), style="ent", jupyter=True)

# Example
# visualize_medical_entities(medical_text)

## Test 3: UMLS Concept Linking

In [None]:
def link_to_umls(text: str, top_k: int = 3) -> pd.DataFrame:
    """
    Link medical entities to UMLS concepts

    Runs ``text`` through the shared module-level ``nlp`` pipeline and, for
    every entity with linker candidates in ``ent._.kb_ents``, emits one row
    per candidate (at most ``top_k`` per entity, in the order the linker
    stored them).

    Args:
        text: Raw text to analyse.
        top_k: Maximum number of candidate concepts kept per entity.

    Returns:
        A DataFrame with the columns ``entity_text``, ``entity_label``,
        ``umls_cui``, ``canonical_name``, ``score`` and ``definition``.
        ``definition`` is ``'N/A'`` when the concept has none. The columns
        are present even when nothing was linked.
    """
    doc = nlp(text)
    linker = nlp.get_pipe("scispacy_linker")
    # Loop-invariant lookup table: CUI -> knowledge-base concept record.
    cui_to_entity = linker.kb.cui_to_entity

    linked_entities = []
    for ent in doc.ents:
        candidates = ent._.kb_ents
        if not candidates:
            # Entity had no knowledge-base match; it contributes no rows.
            continue

        # Get top K linked concepts
        for candidate in candidates[:top_k]:
            cui = candidate[0]
            score = candidate[1]

            # Get concept details
            concept = cui_to_entity[cui]

            linked_entities.append({
                'entity_text': ent.text,
                'entity_label': ent.label_,
                'umls_cui': cui,
                'canonical_name': concept.canonical_name,
                'score': score,
                # A concept can carry the attribute with a None/empty value,
                # which a bare hasattr() check lets through; fall back to
                # 'N/A' in that case as well as when the attribute is absent.
                'definition': getattr(concept, 'definition', None) or 'N/A',
            })

    # Pin the schema so an input with no linked entities still yields the
    # expected columns.
    return pd.DataFrame(
        linked_entities,
        columns=[
            'entity_text',
            'entity_label',
            'umls_cui',
            'canonical_name',
            'score',
            'definition',
        ],
    )

# Example
# umls_links = link_to_umls(medical_text)
# print(umls_links)

## Test 4: Negation Detection

In [None]:
def detect_negations(text: str):
    """
    Report, for each entity in ``text``, whether it is flagged as negated.

    The text is processed by the shared module-level ``nlp`` pipeline. Each
    entity becomes one row holding its text, its label, the value of
    ``ent._.negex`` and a context snippet spanning up to five tokens on
    either side of the entity (clamped to the document bounds).

    Returns:
        A DataFrame built from those rows, with the keys ``text``,
        ``label``, ``is_negated`` and ``context``.
    """
    parsed = nlp(text)
    token_count = len(parsed)

    rows = [
        {
            'text': span.text,
            'label': span.label_,
            'is_negated': span._.negex,
            # Token-index window around the entity, kept inside the document.
            'context': parsed[max(0, span.start - 5):min(token_count, span.end + 5)].text,
        }
        for span in parsed.ents
    ]

    return pd.DataFrame(rows)

# Example with negations: mixes explicitly negated findings ("denies",
# "No history of", "no signs of") with one affirmed condition.
negation_text = """
The patient denies chest pain or shortness of breath.
No history of diabetes.
The patient has hypertension but no signs of heart failure.
"""

# Runs the pipeline when the cell executes and prints the per-entity flags.
negations_df = detect_negations(negation_text)
print(negations_df)

## Test 5: Medical Terminology Accuracy Assessment

In [None]:
def compare_medical_accuracy(source: str, interpretation: str) -> dict:
    """
    Compare medical terminology accuracy between source and interpretation

    Both texts are processed by the shared module-level ``nlp`` pipeline and
    their entities are compared by exact, lower-cased text. A paraphrase
    (different wording for the same concept) therefore counts as one
    omission plus one addition.

    Args:
        source: The original utterance.
        interpretation: The interpreter's rendition of ``source``.

    Returns:
        A dict with:
          - ``accuracy``: share of distinct source entity texts that also
            appear in the interpretation (``0.0`` when the source has none);
          - ``omissions``: sorted source entity texts missing from the
            interpretation;
          - ``additions``: sorted interpretation entity texts absent from
            the source;
          - ``correct_count``: number of shared entity texts;
          - ``total_count``: number of distinct source entity texts.

    A summary of the same figures is also printed.
    """
    doc_source = nlp(source)
    doc_interp = nlp(interpretation)

    # Extract medical entities. Dicts de-duplicate by lower-cased text while
    # keeping first-seen order for the printed lists below.
    source_entities = {ent.text.lower(): ent.label_ for ent in doc_source.ents}
    interp_entities = {ent.text.lower(): ent.label_ for ent in doc_interp.ents}

    source_terms = set(source_entities)
    interp_terms = set(interp_entities)

    # Sort the differences so printed and returned lists are deterministic
    # instead of following arbitrary set iteration order.
    omissions = sorted(source_terms - interp_terms)
    additions = sorted(interp_terms - source_terms)

    # Calculate accuracy
    correct = len(source_terms & interp_terms)
    total_source = len(source_entities)

    # Keep the result a float on the empty-source path too.
    accuracy = correct / total_source if total_source > 0 else 0.0

    print(f"Medical Terminology Accuracy: {accuracy:.2%}")
    print(f"\nSource Entities ({len(source_entities)}): {list(source_entities.keys())}")
    print(f"Interpretation Entities ({len(interp_entities)}): {list(interp_entities.keys())}")
    print(f"\nOmitted Terms: {omissions}")
    print(f"Added Terms: {additions}")

    return {
        'accuracy': accuracy,
        'omissions': omissions,
        'additions': additions,
        'correct_count': correct,
        'total_count': total_source
    }

# Example: the source uses clinical terminology ...
source = """
The patient presented with acute myocardial infarction. 
We prescribed aspirin and atorvastatin.
"""

# ... while the interpretation paraphrases it in lay terms.
interpretation = """
The patient had a heart attack. 
We gave them aspirin and cholesterol medication.
"""

# Left disabled; uncomment to run the comparison on the two texts above.
# result = compare_medical_accuracy(source, interpretation)

## Test 6: Medication Name Extraction (Generic vs Brand)

In [None]:
def extract_medications(text: str, labels=('CHEMICAL', 'DRUG')) -> pd.DataFrame:
    """
    Extract medication entities and link to UMLS for generic/brand name mapping

    Runs ``text`` through the shared module-level ``nlp`` pipeline, keeps the
    entities whose label is in ``labels`` and, where the linker produced
    candidates, attaches the first candidate's CUI and canonical name.

    Args:
        text: Raw text to analyse.
        labels: Entity labels treated as medications. Defaults to the
            previously hard-coded ``CHEMICAL`` / ``DRUG`` pair. A single
            label may be passed as a plain string.

    Returns:
        A DataFrame with the columns ``text``, ``label``, ``is_negated``,
        ``canonical_name`` and ``umls_cui``. The last two are missing values
        for entities without a linker candidate. The columns are present
        even when no medication is found.
    """
    # NOTE(review): the setup cell loads `en_core_sci_lg`; confirm that this
    # pipeline actually emits CHEMICAL/DRUG labels (they may only come from
    # an NER-specific model such as `en_ner_bc5cdr_md`), otherwise this
    # filter matches nothing and `labels` must be overridden.
    if isinstance(labels, str):
        # Avoid substring matching if a bare string is passed.
        labels = (labels,)

    doc = nlp(text)
    linker = nlp.get_pipe("scispacy_linker")

    medications = []
    for ent in doc.ents:
        if ent.label_ not in labels:
            continue

        med_info = {
            'text': ent.text,
            'label': ent.label_,
            'is_negated': ent._.negex
        }

        # Get UMLS canonical name (often generic name) from the first
        # candidate the linker stored.
        if ent._.kb_ents:
            cui = ent._.kb_ents[0][0]
            concept = linker.kb.cui_to_entity[cui]
            med_info['canonical_name'] = concept.canonical_name
            med_info['umls_cui'] = cui

        medications.append(med_info)

    # Pin the schema: otherwise the linking columns only exist when at least
    # one entity was linked, and an empty result has no columns at all.
    return pd.DataFrame(
        medications,
        columns=['text', 'label', 'is_negated', 'canonical_name', 'umls_cui'],
    )

# Example: two brand names (Lipitor, Plavix), one generic name (metformin)
# and one negated medication (aspirin).
med_text = """
The patient is taking Lipitor 20mg daily. 
We also prescribed metformin and Plavix.
The patient is not taking aspirin.
"""

# Left disabled; uncomment to run the extraction on the text above.
# meds_df = extract_medications(med_text)
# print(meds_df)

## Next Steps

1. Create medical interpretation test dataset
2. Benchmark entity recognition accuracy
3. Build medication name mapping (generic ↔ brand)
4. Test multilingual medical entity recognition (Spanish)
5. Integrate with Claude for feedback generation (next notebook)
6. Move successful patterns to `app/nlp/medical_ner.py`