# Claude 3.5 Sonnet: Interpreter Feedback Generation

This notebook tests Claude 3.5 Sonnet's capabilities for generating detailed, actionable feedback for medical interpreters.

## Objectives
1. Test Claude's linguistic analysis capabilities
2. Experiment with different prompt structures
3. Generate detailed performance feedback
4. Categorize feedback by skill area (grammar, terminology, completeness)
5. Test few-shot learning for medical interpretation scenarios

## Setup
Requires an Anthropic API key, stored as `ANTHROPIC_API_KEY` in `.env`.

In [None]:
import os
from dotenv import load_dotenv
from anthropic import Anthropic
import json
import pandas as pd

# Load environment variables (via python-dotenv)
load_dotenv()

# Initialize the Anthropic client with the key read from ANTHROPIC_API_KEY
client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

# The check-mark was previously stored as mis-decoded UTF-8 bytes ("âœ…")
print("✅ Claude client initialized")

## Test 1: Basic Feedback Generation

In [None]:
def generate_feedback(source_text: str, interpretation: str, spacy_analysis: dict = None) -> dict:
    """
    Generate detailed interpreter feedback using Claude 3.5 Sonnet.

    Sends the source text and the interpreter's rendition to the model with a
    prompt that requests a fixed JSON structure, then parses the reply.

    Args:
        source_text: Original text to be interpreted
        interpretation: Interpreter's rendition
        spacy_analysis: Optional pre-computed spaCy analysis results.
            NOTE: accepted for interface compatibility but not currently
            included in the prompt.

    Returns:
        dict: Structured feedback with scores and recommendations

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered from
            the model's reply.
    """

    system_prompt = """
You are an expert medical interpreter trainer and assessor with deep knowledge of:
- Medical terminology in English and Spanish
- Linguistic accuracy and grammar
- Interpretation ethics and standards
- NBCMI and CCHI certification requirements

Your task is to provide detailed, constructive feedback on interpreter performance.
Focus on: accuracy, completeness, grammar, terminology, and professional standards.
"""

    user_prompt = f"""
Please analyze this interpretation and provide detailed feedback.

SOURCE TEXT:
{source_text}

INTERPRETATION:
{interpretation}

Provide feedback in this JSON format:
{{
  "overall_score": <0-100>,
  "category_scores": {{
    "accuracy": <0-100>,
    "completeness": <0-100>,
    "grammar": <0-100>,
    "terminology": <0-100>,
    "fluency": <0-100>
  }},
  "strengths": ["strength 1", "strength 2", ...],
  "areas_for_improvement": [
    {{
      "category": "grammar|terminology|accuracy|completeness",
      "issue": "description of the issue",
      "example": "specific example from the interpretation",
      "suggestion": "how to improve",
      "severity": "critical|moderate|minor"
    }}
  ],
  "key_omissions": ["omitted concept 1", ...],
  "incorrect_additions": ["added concept 1", ...],
  "summary": "overall performance summary",
  "next_steps": ["action 1", "action 2", ...]
}}
"""

    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=4096,
        temperature=0.3,  # Lower temperature for more consistent analysis
        system=system_prompt,
        messages=[
            {"role": "user", "content": user_prompt}
        ]
    )

    feedback_text = response.content[0].text

    try:
        return json.loads(_extract_fenced_json(feedback_text))
    except json.JSONDecodeError:
        # The reply may wrap the JSON in prose or use an unexpected fence tag
        # (e.g. ```JSON). Fall back to the outermost {...} span of the reply.
        first_brace = feedback_text.find("{")
        last_brace = feedback_text.rfind("}")
        if first_brace == -1 or last_brace < first_brace:
            raise
        return json.loads(feedback_text[first_brace:last_brace + 1])


def _extract_fenced_json(reply_text: str) -> str:
    """Return the content of the first markdown code fence, or the whole reply if unfenced."""
    if "```json" in reply_text:
        return reply_text.split("```json", 1)[1].split("```", 1)[0].strip()
    if "```" in reply_text:
        return reply_text.split("```", 1)[1].split("```", 1)[0].strip()
    return reply_text.strip()

# Example
# source = "The patient has been experiencing severe headaches for the past three weeks."
# interp = "The patient have headaches for three weeks."
# feedback = generate_feedback(source, interp)
# print(json.dumps(feedback, indent=2))

## Test 2: Enhanced Feedback with NLP Analysis

In [None]:
def generate_enhanced_feedback(source: str, interpretation: str, nlp_data: dict) -> dict:
    """
    Generate feedback enhanced with spaCy/scispaCy analysis results.

    Args:
        source: Source text
        interpretation: Interpretation text
        nlp_data: Dictionary containing pre-computed NLP analysis:
            - tense_accuracy: float
            - medical_terminology_accuracy: float
            - omitted_entities: list
            - added_entities: list
            - grammatical_errors: list
            Missing keys are reported to the model as 'N/A' or an empty list.

    Returns:
        dict: Feedback parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered from
            the model's reply.
    """
    # Treat a missing analysis the same as an empty one instead of failing on .get
    nlp_data = nlp_data or {}

    system_prompt = """
You are an expert medical interpreter trainer. You will receive:
1. Source text and interpretation
2. Automated NLP analysis results

Your task is to synthesize this information into actionable, educational feedback.
"""

    # Each request is a fresh single-message conversation, so the expected JSON
    # structure must be spelled out here; the model has no earlier turn to refer to.
    feedback_schema = """{
  "overall_score": <0-100>,
  "category_scores": {
    "accuracy": <0-100>,
    "completeness": <0-100>,
    "grammar": <0-100>,
    "terminology": <0-100>,
    "fluency": <0-100>
  },
  "strengths": ["strength 1", "strength 2", ...],
  "areas_for_improvement": [
    {
      "category": "grammar|terminology|accuracy|completeness",
      "issue": "description of the issue",
      "example": "specific example from the interpretation",
      "suggestion": "how to improve",
      "severity": "critical|moderate|minor"
    }
  ],
  "key_omissions": ["omitted concept 1", ...],
  "incorrect_additions": ["added concept 1", ...],
  "summary": "overall performance summary",
  "next_steps": ["action 1", "action 2", ...]
}"""

    user_prompt = f"""
SOURCE: {source}

INTERPRETATION: {interpretation}

AUTOMATED ANALYSIS:
- Tense Accuracy: {nlp_data.get('tense_accuracy', 'N/A')}
- Medical Terminology Accuracy: {nlp_data.get('medical_terminology_accuracy', 'N/A')}
- Omitted Medical Entities: {nlp_data.get('omitted_entities', [])}
- Added Entities: {nlp_data.get('added_entities', [])}
- Grammatical Errors: {nlp_data.get('grammatical_errors', [])}

Using this analysis, provide comprehensive feedback in this JSON format:
{feedback_schema}
Explain WHY each issue matters in medical interpretation context.
"""

    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=4096,
        temperature=0.3,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}]
    )

    feedback_text = response.content[0].text

    # Prefer the content of a markdown code fence when the reply has one
    if "```json" in feedback_text:
        candidate = feedback_text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in feedback_text:
        candidate = feedback_text.split("```", 1)[1].split("```", 1)[0]
    else:
        candidate = feedback_text

    try:
        return json.loads(candidate.strip())
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, which tolerates prose around
        # the JSON and fences carrying an unexpected language tag.
        first_brace = feedback_text.find("{")
        last_brace = feedback_text.rfind("}")
        if first_brace == -1 or last_brace < first_brace:
            raise
        return json.loads(feedback_text[first_brace:last_brace + 1])

# Example usage combining with previous notebooks:
# nlp_results = {
#     'tense_accuracy': 0.67,
#     'medical_terminology_accuracy': 0.80,
#     'omitted_entities': ['severe', 'three weeks'],
#     'added_entities': [],
#     'grammatical_errors': ['subject-verb agreement: patient have']
# }
# feedback = generate_enhanced_feedback(source, interp, nlp_results)

## Test 3: Prompt Engineering Experiments

In [None]:
# Prompt variants to compare, keyed by a short name for the prompting technique
prompts = dict(
    direct="Analyze this interpretation and provide feedback.",
    role_play="You are an NBCMI-certified medical interpreter trainer with 20 years of experience...",
    # Step-by-step instructions, one step per line
    structured="\n".join([
        "Follow these steps:",
        "1. Compare source and interpretation for accuracy",
        "2. Identify omissions and additions",
        "3. Analyze grammar and syntax",
        "4. Assess medical terminology usage",
        "5. Provide actionable recommendations",
    ]),
    # One worked example of feedback, followed by the request
    few_shot="""Here are examples of good feedback:

Example 1:
Source: "The patient has diabetes."
Interpretation: "The patient have sugar."
Feedback: 
- Grammar: Subject-verb agreement error ("have" should be "has")
- Terminology: "Sugar" is colloquial; use "diabetes" for medical accuracy
- Impact: Could cause confusion in medical records

Now analyze this interpretation...""",
)

def test_prompt_variations(source: str, interpretation: str):
    """
    Placeholder for comparing the prompt variants defined in `prompts`.

    Walks every variant without doing any work yet, so the returned
    mapping is always empty.
    """
    outcomes = {}

    for _variant_name, _variant_text in prompts.items():
        # TODO: generate feedback with this variant, then compare
        # results, response time, and token usage
        continue

    return outcomes

## Test 4: Learning Path Generation

In [None]:
def generate_learning_path(feedback_history: list) -> dict:
    """
    Generate personalized learning path based on multiple assessments.

    Counts how often each improvement category appears across the history,
    sends those counts plus the three most recent feedback objects to the
    model, and parses the JSON learning path it returns.

    Args:
        feedback_history: List of previous feedback objects (dicts). Each may
            carry an 'areas_for_improvement' list whose items have a
            'category' key; items without one are left out of the counts.

    Returns:
        dict: Customized learning path with modules and exercises

    Raises:
        json.JSONDecodeError: If no parseable JSON can be recovered from
            the model's reply.
    """
    from collections import Counter

    system_prompt = """
You are an AI Mentor for medical interpreter training. 
Based on a student's assessment history, create a personalized learning path.
"""

    # Aggregate weak areas from feedback history. The feedback objects are
    # model-generated, so tolerate a null list or an entry with no category
    # rather than failing with TypeError/KeyError.
    area_counts = Counter()
    for feedback in feedback_history:
        for improvement in feedback.get('areas_for_improvement') or []:
            category = improvement.get('category')
            if category:
                area_counts[category] += 1

    user_prompt = f"""
The student has completed {len(feedback_history)} assessments.

Recurring weak areas:
{json.dumps(dict(area_counts), indent=2)}

Recent feedback summary:
{json.dumps(feedback_history[-3:], indent=2)}

Create a personalized 4-week learning path in JSON format:
{{
  "student_profile": {{
    "current_level": "beginner|intermediate|advanced",
    "priority_areas": ["area1", "area2", ...]
  }},
  "learning_path": [
    {{
      "week": 1,
      "focus_area": "grammar|terminology|accuracy",
      "modules": [
        {{
          "title": "module name",
          "description": "what student will learn",
          "estimated_time": "2 hours",
          "exercises": ["exercise 1", "exercise 2"]
        }}
      ]
    }}
  ]
}}
"""

    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=4096,
        temperature=0.5,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}]
    )

    learning_path_text = response.content[0].text

    # Prefer the content of a markdown code fence when the reply has one
    if "```json" in learning_path_text:
        candidate = learning_path_text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in learning_path_text:
        candidate = learning_path_text.split("```", 1)[1].split("```", 1)[0]
    else:
        candidate = learning_path_text

    try:
        return json.loads(candidate.strip())
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, which tolerates prose around
        # the JSON and fences carrying an unexpected language tag.
        first_brace = learning_path_text.find("{")
        last_brace = learning_path_text.rfind("}")
        if first_brace == -1 or last_brace < first_brace:
            raise
        return json.loads(learning_path_text[first_brace:last_brace + 1])

## Test 5: Cost & Performance Benchmarking

In [None]:
def benchmark_claude_performance(
    test_cases: list,
    *,
    input_cost_per_1k: float = 0.003,
    output_cost_per_1k: float = 0.015,
):
    """
    Benchmark Claude API performance and costs.

    Sends one request per test case, records token usage and wall-clock
    latency, prints a summary, and returns the per-request measurements.

    Args:
        test_cases: List of (source, interpretation) tuples
        input_cost_per_1k: Price in USD per 1,000 input tokens used for the
            cost estimate. Defaults to the rate previously hard-coded here;
            verify against current pricing for the model.
        output_cost_per_1k: Price in USD per 1,000 output tokens, same caveat.

    Returns:
        pandas.DataFrame: One row per test case with columns
        'input_tokens', 'output_tokens', 'response_time' (seconds) and
        'estimated_cost' (USD). Empty, but with those columns, when
        test_cases is empty.
    """
    import time

    columns = ['input_tokens', 'output_tokens', 'response_time', 'estimated_cost']
    results = []

    for source, interp in test_cases:
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can
        start_time = time.perf_counter()

        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            messages=[{
                "role": "user",
                "content": f"Analyze this interpretation:\nSource: {source}\nInterpretation: {interp}"
            }]
        )

        elapsed_time = time.perf_counter() - start_time

        # Extract usage statistics
        usage = response.usage

        results.append({
            'input_tokens': usage.input_tokens,
            'output_tokens': usage.output_tokens,
            'response_time': elapsed_time,
            'estimated_cost': (
                (usage.input_tokens * input_cost_per_1k / 1000)
                + (usage.output_tokens * output_cost_per_1k / 1000)
            ),
        })

    # Passing the columns explicitly keeps the frame's shape stable even when
    # there are no rows
    df = pd.DataFrame(results, columns=columns)

    print("Performance Summary:")
    if df.empty:
        # describe() and the cost column lookup both fail on a frame built
        # from zero results, so report and return early instead
        print("No test cases provided; nothing to benchmark.")
        return df

    print(df.describe())
    print(f"\nTotal Estimated Cost: ${df['estimated_cost'].sum():.4f}")

    return df

## Next Steps

1. Create comprehensive test dataset of medical interpretations
2. Benchmark feedback quality across different prompt structures
3. Optimize for cost vs quality trade-off
4. Test Claude vs GPT-4 for feedback generation
5. Implement caching strategy for similar interpretations
6. Move successful patterns to `app/llm/claude_feedback.py`
7. Begin integration with FastAPI endpoints