# spaCy Grammar & Linguistic Analysis

This notebook tests spaCy's capabilities for deep linguistic analysis of interpreter transcripts.

## Objectives
1. Test spaCy's grammar analysis (POS tagging, dependency parsing)
2. Identify common grammatical errors in interpretations
3. Test tense accuracy
4. Analyze sentence structure and complexity
5. Detect omissions and additions

## Setup
Requires spaCy models:
- `en_core_web_lg` (English)
- `es_core_news_lg` (Spanish)

In [None]:
import spacy
from spacy import displacy
import pandas as pd
from collections import Counter

# Download models if not already installed:
# !python -m spacy download en_core_web_lg
# !python -m spacy download es_core_news_lg

# Load the English and Spanish pipelines once at module level; the analysis
# helpers defined in later cells look up `nlp_en` / `nlp_es` by name.
nlp_en = spacy.load("en_core_web_lg")
nlp_es = spacy.load("es_core_news_lg")

# Report which pipeline builds were loaded (name and version come from each
# pipeline's own metadata). The check-mark literal was previously stored as
# mis-decoded UTF-8 bytes and is restored here.
print("✅ spaCy models loaded")
print(f"English model: {nlp_en.meta['name']} v{nlp_en.meta['version']}")
print(f"Spanish model: {nlp_es.meta['name']} v{nlp_es.meta['version']}")

## Test 1: Part-of-Speech (POS) Tagging

In [None]:
def analyze_pos(text: str, lang: str = 'en'):
    """
    Tabulate token-level annotations for a piece of text.

    Args:
        text: The text to run through the pipeline.
        lang: ``'en'`` selects the English pipeline; any other value selects
            the Spanish pipeline.

    Returns:
        A DataFrame with one row per token and the columns ``text``,
        ``lemma``, ``pos``, ``tag``, ``dep`` and ``head`` (the text of the
        token's syntactic head).
    """
    pipeline = nlp_es if lang != 'en' else nlp_en

    rows = [
        {
            'text': tok.text,
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'tag': tok.tag_,
            'dep': tok.dep_,
            'head': tok.head.text,
        }
        for tok in pipeline(text)
    ]
    return pd.DataFrame(rows)

# Example: tag one English sentence (analyze_pos defaults to lang='en') and
# print the resulting per-token table.
sample_text = "The patient has been experiencing severe headaches for the past three weeks."
pos_df = analyze_pos(sample_text)
print(pos_df)

## Test 2: Dependency Parsing & Visualization

In [None]:
def visualize_dependencies(text: str, lang: str = 'en'):
    """
    Draw the dependency parse of a piece of text in the notebook.

    Args:
        text: The text to parse.
        lang: ``'en'`` selects the English pipeline; any other value selects
            the Spanish pipeline.

    Returns:
        None. The diagram is produced as a side effect of the render call.
    """
    if lang == 'en':
        parsed = nlp_en(text)
    else:
        parsed = nlp_es(text)

    # Dependency-arc style, rendered in Jupyter mode
    displacy.render(parsed, style="dep", jupyter=True)

# Example
# visualize_dependencies("The doctor prescribed antibiotics for the infection.")

## Test 3: Tense Detection & Accuracy

In [None]:
def detect_verb_tenses(text: str, lang: str = 'en') -> pd.DataFrame:
    """
    Collect the morphological tense features of every verb in a text.

    Args:
        text: The text to analyze.
        lang: ``'en'`` selects the English pipeline; any other value selects
            the Spanish pipeline.

    Returns:
        A DataFrame with one row per token whose coarse POS is ``VERB`` or
        ``AUX`` and the columns ``text``, ``lemma``, ``tag``, ``tense``,
        ``aspect`` and ``mood``. The last three hold whatever
        ``token.morph.get`` returns for ``Tense``, ``Aspect`` and ``Mood``.
        The columns are present even when the text contains no verbs, so
        callers can index ``df['tense']`` unconditionally.
    """
    columns = ['text', 'lemma', 'tag', 'tense', 'aspect', 'mood']
    nlp = nlp_en if lang == 'en' else nlp_es

    verbs = [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'tag': token.tag_,
            'tense': token.morph.get('Tense'),
            'aspect': token.morph.get('Aspect'),
            'mood': token.morph.get('Mood'),
        }
        for token in nlp(text)
        if token.pos_ in ('VERB', 'AUX')
    ]

    # Passing the columns explicitly keeps the schema stable for an empty
    # result; pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(verbs, columns=columns)

def compare_tense_accuracy(source_text: str, interpreted_text: str, lang: str = 'en'):
    """
    Compare verb tenses between source and interpretation.

    Verbs are aligned purely by position: the first verb of the source is
    compared with the first verb of the interpretation, and so on. Verbs
    beyond the length of the shorter sequence are ignored. Both verb tables
    and the resulting score are printed.

    A pair counts towards the score only when at least one of its two verbs
    carries a ``Tense`` feature. Pairs where neither verb has one contain
    nothing to compare, so they no longer drag the score down (previously
    they were counted in the denominator but could never match).

    Args:
        source_text: The original utterance.
        interpreted_text: The interpreter's rendition.
        lang: Language code forwarded to ``detect_verb_tenses`` for BOTH
            texts.

    Returns:
        The fraction of scored pairs whose first ``Tense`` value is equal,
        or ``0`` when there is no pair to score.
    """
    source_verbs = detect_verb_tenses(source_text, lang)
    interp_verbs = detect_verb_tenses(interpreted_text, lang)

    print("Source Tenses:")
    print(source_verbs)
    print("\nInterpretation Tenses:")
    print(interp_verbs)

    def first_tenses(verbs):
        # A table built from a text without verbs may have no 'tense' column;
        # treat that as "no verbs" instead of raising KeyError.
        if 'tense' not in verbs.columns:
            return []
        # Each cell is a (possibly empty) sequence of tense values; keep the
        # first one, or None when the verb has no tense feature.
        return [t[0] if t else None for t in verbs['tense']]

    aligned = zip(first_tenses(source_verbs), first_tenses(interp_verbs))

    # Score only pairs where at least one side actually has a tense.
    scored = [(s, i) for s, i in aligned if s is not None or i is not None]
    matches = sum(1 for s, i in scored if s == i)
    total = len(scored)

    accuracy = matches / total if total > 0 else 0
    print(f"\nTense Accuracy: {accuracy:.2%}")
    return accuracy

# Example
# source = "I have been taking this medication for six months."
# interpretation = "I took this medication for six months."
# compare_tense_accuracy(source, interpretation)

## Test 4: Grammatical Error Detection

In [None]:
def detect_common_errors(text: str, lang: str = 'en'):
    """
    Detect common grammatical errors in medical interpretations

    Only subject-verb number agreement is implemented so far. The other
    checks listed below are planned but not performed by this function:
    - Article usage (a/an/the)
    - Preposition errors
    - Pronoun reference clarity

    For every subject token the function looks for the word that should
    agree with it: first an auxiliary or copula attached to the subject's
    head, then the head itself if it is a verb. The first of these that
    carries a ``Number`` feature is compared with the subject's ``Number``.
    Pairs where either side has no ``Number`` feature are skipped, so forms
    the model does not mark for number are not flagged.

    Args:
        text: The text to check.
        lang: ``'en'`` selects the English pipeline; any other value selects
            the Spanish pipeline.

    Returns:
        A DataFrame with the columns ``type``, ``subject``, ``verb`` and
        ``message``, one row per detected disagreement. It is empty (but
        keeps those columns) when nothing is found.
    """
    nlp = nlp_en if lang == 'en' else nlp_es
    doc = nlp(text)

    # NOTE(review): these label names are meant to cover both the English
    # pipeline's labels and UD-style labels; confirm against the label sets
    # of the loaded models.
    subject_deps = ('nsubj', 'nsubjpass', 'nsubj:pass')
    helper_deps = ('aux', 'auxpass', 'aux:pass', 'cop')

    def agreeing_verb(head):
        # Auxiliaries/copulas come first: in compound forms they, not the
        # participle or predicate they attach to, are expected to agree
        # with the subject.
        candidates = [child for child in head.children if child.dep_ in helper_deps]
        if head.pos_ in ('VERB', 'AUX'):
            candidates.append(head)
        for candidate in candidates:
            if candidate.morph.get('Number'):
                return candidate
        return None

    errors = []
    for token in doc:
        if token.dep_ not in subject_deps:
            continue

        subj_number = token.morph.get('Number')
        if not subj_number:
            continue

        verb = agreeing_verb(token.head)
        if verb is None:
            continue

        verb_number = verb.morph.get('Number')
        if subj_number != verb_number:
            errors.append({
                'type': 'subject_verb_agreement',
                'subject': token.text,
                'verb': verb.text,
                'message': f"Subject '{token.text}' ({subj_number}) doesn't agree with verb '{verb.text}' ({verb_number})"
            })

    # Explicit columns keep the schema stable when no errors were found.
    return pd.DataFrame(errors, columns=['type', 'subject', 'verb', 'message'])

# Example
# text_with_errors = "The patient have high blood pressure and they is taking medication."
# errors = detect_common_errors(text_with_errors)
# print(errors)

## Test 5: Semantic Similarity (Omissions & Additions)

In [None]:
def analyze_content_alignment(source: str, interpretation: str, lang: str = 'en'):
    """
    Analyze semantic alignment between source and interpretation
    Identifies potential omissions or additions

    Both texts are processed with the same pipeline. Omissions and additions
    are found by comparing the exact surface text of the named entities the
    pipeline recognises in each document; entity labels are not considered.
    The similarity score and both lists are also printed.

    Args:
        source: The original utterance.
        interpretation: The interpreter's rendition.
        lang: ``'en'`` selects the English pipeline; any other value selects
            the Spanish pipeline.

    Returns:
        A dict with:
        - ``'similarity'``: the value of ``Doc.similarity`` between the two
          documents.
        - ``'omissions'``: entity texts found in the source but not in the
          interpretation, in order of first appearance in the source.
        - ``'additions'``: entity texts found in the interpretation but not
          in the source, in order of first appearance in the interpretation.
    """
    nlp = nlp_en if lang == 'en' else nlp_es

    doc_source = nlp(source)
    doc_interp = nlp(interpretation)

    # Overall semantic similarity
    similarity = doc_source.similarity(doc_interp)

    # dict.fromkeys removes duplicates while preserving document order, so
    # the reported lists are the same on every run (plain set differences
    # have no guaranteed order).
    source_entities = list(dict.fromkeys(ent.text for ent in doc_source.ents))
    interp_entities = list(dict.fromkeys(ent.text for ent in doc_interp.ents))

    source_seen = set(source_entities)
    interp_seen = set(interp_entities)

    # Omissions: in source but not interpretation
    omissions = [text for text in source_entities if text not in interp_seen]

    # Additions: in interpretation but not source
    additions = [text for text in interp_entities if text not in source_seen]

    print(f"Semantic Similarity: {similarity:.2%}")
    print(f"\nPotential Omissions: {omissions}")
    print(f"Potential Additions: {additions}")

    return {
        'similarity': similarity,
        'omissions': omissions,
        'additions': additions
    }

# Example
# source = "The patient has diabetes and hypertension. He takes metformin daily."
# interp = "The patient has diabetes. He takes medication."
# analyze_content_alignment(source, interp)

## Next Steps

1. Create test dataset of medical interpretation pairs (source + interpretation)
2. Benchmark accuracy across different error types
3. Fine-tune error detection rules for medical terminology
4. Integrate with scispaCy for medical entity recognition (next notebook)
5. Move successful patterns to `app/nlp/grammar_analyzer.py`