# Target Word Evaluation Pipeline Testing

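This notebook tests the target word evaluation pipeline step by step.
Use it to debug target word (target code) detection and evaluation issues.
Section 2 calls the live generation endpoint when a configuration is loaded; the evaluation calls in Sections 4–7 pass `use_mock=True`, so they score built-in mock responses instead of live generations.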

## Overview
- Test text generation and target code detection
- Debug tokenization and decoding processes
- Validate target word evaluation metrics
- Test batch processing and performance

In [None]:
# Import required modules
import sys
import os
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
from typing import List, Dict, Set, Tuple
import requests
from collections import defaultdict

# Add project root to path
sys.path.append('/home/kosaraju/mgpt-serve/mgpt_eval')

# Import actual pipeline modules
from evaluation.target_word_evaluator import TargetWordEvaluator
from models.config_models import PipelineConfig, EvaluationConfig, TargetWordConfig
from models.data_models import DataSample, DataBatch
from pipelines.evaluation_pipeline import EvaluationPipeline

## 1. Configuration Setup
Load and validate target word evaluation configuration

In [None]:
# Load configuration for target word evaluation.
# NOTE: the path below is an absolute, machine-specific location; point it at
# your own checkout before running this cell.
config_path = "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_evaluation_only.yaml"

# Any failure inside the try block (loading the YAML or reading one of the
# printed fields) leaves `config` set to None. Later cells test `config` for
# truthiness and fall back to mock data / a mock configuration.
try:
    config = PipelineConfig.from_yaml(config_path)
    print("✓ Configuration loaded successfully")
    print(f"  - Target codes: {config.evaluation.target_word.target_codes}")
    print(f"  - Max tokens: {config.evaluation.target_word.max_tokens}")
    print(f"  - Generation attempts: {config.evaluation.target_word.generation_attempts}")
except Exception as e:
    # Deliberately broad: this is a debugging notebook and the cell should
    # report the problem instead of aborting the run.
    print(f"✗ Configuration loading failed: {e}")
    config = None

In [None]:
# Test target codes validation
def test_target_codes_validation():
    """Print the sample code lists used to eyeball target-code handling.

    Nothing is validated here: the function only echoes a list of
    well-formed codes, a list of malformed candidates and a list of
    mixed-case variants so they can be compared by eye.
    """
    print("Testing target codes validation...")

    # Well-formed codes
    well_formed = ["E119", "Z03818", "N6320", "M1710"]
    print(f"Valid codes: {well_formed}")

    # Malformed candidates, one line each
    malformed = ("", "invalid", "123", "toolong123")
    print(*(f"Testing invalid code: '{candidate}'" for candidate in malformed), sep="\n")

    # Lower/upper-case variants of the same codes
    mixed_case = ["e119", "E119", "z03818", "Z03818"]
    print(f"Case sensitivity test: {mixed_case}")


test_target_codes_validation()

## 2. API Connectivity Test
Test generation endpoint for target word evaluation

In [None]:
# Test generation API endpoint
def test_generation_api(api_url: str, test_prompt: str, max_tokens: int = 50):
    """POST a prompt to ``{api_url}/generate`` and report the outcome.

    Args:
        api_url: Base URL of the generation service (no trailing ``/generate``).
        test_prompt: Prompt text sent as the ``prompt`` field.
        max_tokens: Value sent as the ``max_tokens`` field.

    Returns:
        The ``generated_text`` field of the JSON response (``''`` when the
        field is absent) on HTTP 200; ``None`` on any other status code or
        when the request/decoding raises.
    """
    print(f"Testing generation API: {api_url}")
    print(f"Input prompt: '{test_prompt}'")

    try:
        request_body = dict(prompt=test_prompt, max_tokens=max_tokens, temperature=0.7)

        started = time.time()
        response = requests.post(f"{api_url}/generate", json=request_body, timeout=30)
        elapsed = time.time() - started

        # Guard clause: anything other than 200 is reported and abandoned.
        if response.status_code != 200:
            print(f"✗ Generation failed: {response.status_code} - {response.text}")
            return None

        generated_text = response.json().get('generated_text', '')
        print(f"✓ Generation successful ({elapsed:.2f}s)")
        print(f"  Generated: '{generated_text}'")
        return generated_text

    except Exception as exc:
        # Connection errors, timeouts and malformed JSON all end up here.
        print(f"✗ API error: {exc}")
        return None

# Exercise the endpoint with sample medical codes when an API is configured;
# otherwise fall back to a canned generation so later cells still have input.
if not (config and config.api):
    print("Using mock generation for testing")
    generated = "O0903 K9289 |eoc| N6322 76642 Z09 76642 |eoc| Z1239 O9989"
else:
    api_url = f"http://{config.api.host}:{config.api.port}"
    test_prompt = "N6320 G0378 |eoc| Z91048 M1710"
    generated = test_generation_api(api_url, test_prompt, max_tokens=100)

## 3. Target Word Detection Logic
Test the core target word detection functionality

In [None]:
# Test target word detection function
def test_target_detection(text: str, target_codes: List[str]) -> Dict:
    """Find target codes among the whitespace-separated tokens of ``text``.

    Matching is case-insensitive: both the tokens and ``target_codes`` are
    upper-cased before comparison. The ``|eoc|`` separator is removed as a
    whole substring, so a standalone separator never matches and a separator
    glued to a code (``"E119|eoc|"``) does not hide the code.

    Args:
        text: Generated text; tokens are obtained with ``str.split()``.
        target_codes: Codes to look for.

    Returns:
        A dict with the keys ``text``, ``tokens``, ``target_codes``,
        ``found_targets`` (one ``{'code', 'position', 'original_token'}``
        entry per hit, in token order), ``has_target`` and
        ``num_targets_found``.
    """
    print(f"Testing target detection in: '{text}'")
    print(f"Target codes: {target_codes}")

    # Split text into tokens (codes)
    tokens = text.split()
    print(f"Tokens found: {tokens}")

    separator = '|eoc|'
    target_set = {code.upper() for code in target_codes}

    found_targets = []
    for i, token in enumerate(tokens):
        # str.strip('|eoc|') would treat its argument as a *set of characters*
        # and eat leading/trailing 'e', 'o' and 'c' from real codes
        # ('e119' -> '119'). Remove the separator as a substring instead, then
        # drop any stray pipe characters.
        clean_token = token.replace(separator, '').strip('|').upper()
        # An empty token (a bare separator) must never count as a hit, even if
        # target_codes happens to contain an empty string.
        if clean_token and clean_token in target_set:
            found_targets.append({
                'code': clean_token,
                'position': i,
                'original_token': token
            })

    result = {
        'text': text,
        'tokens': tokens,
        'target_codes': target_codes,
        'found_targets': found_targets,
        'has_target': len(found_targets) > 0,
        'num_targets_found': len(found_targets)
    }

    print(f"Found targets: {found_targets}")
    print(f"Has target: {result['has_target']}")

    return result


# This helper takes arguments; keep pytest from collecting it as a test case
# if the notebook is ever exported to a test_*.py module.
test_target_detection.__test__ = False

# Test with sample data
sample_text = "O0903 K9289 |eoc| N6322 76642 Z09 76642 |eoc| Z1239 O9989 |eoc| Z03818 U0003"
target_codes = ["E119", "Z03818", "N6320", "M1710"]

detection_result = test_target_detection(sample_text, target_codes)

In [None]:
# Test edge cases for target detection
def test_detection_edge_cases():
    """Feed a fixed list of boundary inputs to ``test_target_detection``.

    The scenarios cover empty text, text without targets, repeated targets,
    mixed-case codes and text containing ``|eoc|`` separators. For each one
    the scenario name and the detection summary are printed; nothing is
    asserted or returned.
    """
    print("Testing edge cases for target detection...")

    # Scenario table: (name, text, targets)
    fields = ('name', 'text', 'targets')
    rows = (
        ('Empty text', '', ['E119']),
        ('No targets in text', 'A1234 B5678 C9012', ['E119', 'Z03818']),
        ('Multiple same targets', 'E119 A1234 E119 B5678 E119', ['E119']),
        ('Case sensitivity', 'e119 E119 z03818 Z03818', ['E119', 'Z03818']),
        ('With |eoc| separators', 'E119 |eoc| Z03818 |eoc| A1234', ['E119', 'Z03818']),
    )
    test_cases = [dict(zip(fields, row)) for row in rows]

    for scenario in test_cases:
        print(f"\n--- {scenario['name']} ---")
        outcome = test_target_detection(scenario['text'], scenario['targets'])
        print(f"Result: {outcome['has_target']} (found {outcome['num_targets_found']} targets)")


test_detection_edge_cases()

## 4. Target Word Evaluator Testing
Test the actual TargetWordEvaluator class

In [None]:
# Initialize TargetWordEvaluator
def test_target_word_evaluator():
    """Build a ``TargetWordEvaluator`` and print its main settings.

    The module-level ``config`` is used when it is truthy. Otherwise a
    stand-in configuration object exposing ``evaluation`` and ``api``
    attributes is assembled from ``TargetWordConfig``, ``EvaluationConfig``
    and ``APIConfig``.

    Returns:
        The evaluator instance, or ``None`` if construction (or reading the
        printed attributes) raised.
    """
    print("Testing TargetWordEvaluator...")

    try:
        if config:
            active_config = config
        else:
            # Imported lazily: only the fallback path needs these models.
            from models.config_models import TargetWordConfig, EvaluationConfig, APIConfig

            eval_config = EvaluationConfig(
                target_word=TargetWordConfig(
                    target_codes=["E119", "Z03818", "N6320"],
                    max_tokens=200,
                    generation_attempts=5
                )
            )
            api_config = APIConfig(host="localhost", port=8000)

            # Minimal stand-in object with just the two attributes set here.
            active_config = type('Config', (), {
                'evaluation': eval_config,
                'api': api_config
            })()

        built = TargetWordEvaluator(active_config)

        print("✓ TargetWordEvaluator initialized successfully")
        print(f"  Target codes: {built.target_codes}")
        print(f"  Max tokens: {built.max_tokens}")
        print(f"  Generation attempts: {built.generation_attempts}")

        return built

    except Exception as exc:
        print(f"✗ TargetWordEvaluator initialization failed: {exc}")
        return None


evaluator = test_target_word_evaluator()

In [None]:
# Test single sample evaluation
def test_single_evaluation(evaluator, sample_prompt: str, use_mock: bool = True, mock_responses=None):
    """Evaluate one prompt, either against canned generations or the evaluator.

    In mock mode the prompt is only echoed; the target codes are searched for
    in ``mock_responses`` (case-insensitively, with the ``|eoc|`` separator
    removed as a whole substring). Otherwise the call is delegated to
    ``evaluator.evaluate_sample(sample_prompt)``.

    Args:
        evaluator: Object exposing ``target_codes`` (and ``evaluate_sample``
            when ``use_mock`` is False). A falsy value aborts the test.
        sample_prompt: Prompt being evaluated.
        use_mock: Score canned responses instead of calling the evaluator.
        mock_responses: Optional list of generated texts to score in mock
            mode, one per simulated attempt. Defaults to five built-in
            responses.

    Returns:
        The result dict (mock mode: ``prompt``, ``target_codes``,
        ``total_attempts``, ``found_targets``, ``has_target``,
        ``prediction``), or ``None`` when no evaluator is available or an
        exception occurred.
    """
    print(f"Testing single sample evaluation: '{sample_prompt}'")

    if not evaluator:
        print("✗ No evaluator available")
        return None

    try:
        if use_mock:
            if mock_responses is None:
                # Mock generation responses for testing
                mock_responses = [
                    "O0903 K9289 |eoc| N6322 76642",
                    "Z09 76642 |eoc| Z1239 O9989",
                    "Z03818 U0003 |eoc| E119 A1234",
                    "B5678 C9012 |eoc| N6320 D3456",
                    "F7890 G1234 |eoc| H5678 I9012"
                ]

            # Built once; the lookup is then O(1) per token.
            target_set = {code.upper() for code in evaluator.target_codes}

            # Simulate evaluation process
            found_targets = []
            for attempt, response in enumerate(mock_responses, start=1):
                print(f"  Attempt {attempt}: '{response}'")

                for token in response.split():
                    # str.strip('|eoc|') strips the *characters* |, e, o, c and
                    # would mangle codes such as 'e119'; remove the separator
                    # as a substring instead, then drop stray pipes.
                    clean_token = token.replace('|eoc|', '').strip('|').upper()
                    # A bare separator cleans to '' and must never match.
                    if clean_token and clean_token in target_set:
                        found_targets.append({
                            'attempt': attempt,
                            'code': clean_token,
                            'response': response
                        })
                        print(f"    → Found target: {clean_token}")

            result = {
                'prompt': sample_prompt,
                'target_codes': evaluator.target_codes,
                'total_attempts': len(mock_responses),
                'found_targets': found_targets,
                'has_target': len(found_targets) > 0,
                'prediction': 1 if found_targets else 0
            }

        else:
            # Use actual API (if available)
            result = evaluator.evaluate_sample(sample_prompt)

        print(f"\nEvaluation result:")
        print(f"  Has target: {result['has_target']}")
        print(f"  Prediction: {result['prediction']}")
        print(f"  Found targets: {len(result['found_targets'])}")

        return result

    except Exception as e:
        print(f"✗ Single evaluation failed: {e}")
        return None


# This helper takes arguments; keep pytest from collecting it as a test case
# if the notebook is ever exported to a test_*.py module.
test_single_evaluation.__test__ = False

# Smoke-test the single-sample path on one sample prompt (mock mode)
if evaluator:
    sample = "N6320 G0378 |eoc| Z91048 M1710"
    eval_result = test_single_evaluation(
        evaluator,
        sample_prompt=sample,
        use_mock=True,
    )

## 5. Batch Evaluation Testing
Test batch processing for multiple samples

In [None]:
# Create test dataset for batch evaluation
def create_test_dataset() -> pd.DataFrame:
    """Return a five-row DataFrame used by the batch-evaluation cells.

    Columns, in order: ``mcid`` (string identifiers), ``claims``
    (space-separated code sequences with ``|eoc|`` separators) and ``label``
    (ground-truth 0/1 labels). The row count and the head of the frame are
    printed.
    """
    print("Creating test dataset...")

    mcids = ['123456', '123457', '123458', '123459', '123460']
    claim_sequences = [
        'N6320 G0378 |eoc| Z91048 M1710',
        'E119 A1234 |eoc| B5678 C9012',
        'Z03818 D3456 |eoc| F7890 G1234',
        'H5678 I9012 |eoc| J1234 K5678',
        'L9012 M3456 |eoc| N6320 O7890',
    ]
    ground_truth = [1, 1, 1, 0, 1]  # Ground truth labels

    df = pd.DataFrame({
        'mcid': mcids,
        'claims': claim_sequences,
        'label': ground_truth,
    })

    print(f"Created dataset with {len(df)} samples")
    print(df.head())

    return df


test_df = create_test_dataset()

In [None]:
# Test batch evaluation
def test_batch_evaluation(evaluator, test_df: pd.DataFrame, use_mock: bool = True):
    """Evaluate every row of ``test_df`` and compute classification metrics.

    Each row's ``claims`` value is passed to ``test_single_evaluation``; the
    returned prediction is compared with the row's ``label``.

    Args:
        evaluator: Evaluator forwarded to ``test_single_evaluation``. A falsy
            value aborts the test.
        test_df: Frame with ``mcid``, ``claims`` and ``label`` columns.
        use_mock: Forwarded to ``test_single_evaluation``.

    Returns:
        ``{'results': [...], 'metrics': {...}}`` where ``metrics`` holds
        ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``total_samples`` and ``correct_predictions``; ``None`` when there is
        no evaluator, no sample could be evaluated, or an exception occurred.
    """
    print("Testing batch evaluation...")

    if not evaluator:
        print("✗ No evaluator available")
        return None

    try:
        results = []
        total = len(test_df)

        # Count positions with enumerate: the DataFrame index label is not
        # guaranteed to be a 0-based integer (filtered or re-indexed frames).
        for position, (_, row) in enumerate(test_df.iterrows(), start=1):
            print(f"\nEvaluating sample {position}/{total}: {row['mcid']}")
            print(f"  Claims: '{row['claims']}'")
            print(f"  True label: {row['label']}")

            # Get evaluation result
            result = test_single_evaluation(evaluator, row['claims'], use_mock=use_mock)

            if result:
                result['mcid'] = row['mcid']
                result['true_label'] = row['label']
                # bool() keeps a plain Python bool instead of a NumPy scalar.
                result['correct'] = bool(result['prediction'] == row['label'])
                results.append(result)

                print(f"  Predicted: {result['prediction']} | Correct: {result['correct']}")

        if not results:
            # Empty frame, or every single evaluation failed.
            print("✗ No samples were evaluated successfully")
            return None

        # Calculate metrics
        predictions = [r['prediction'] for r in results]
        true_labels = [r['true_label'] for r in results]
        correct = [r['correct'] for r in results]

        true_positives = sum(1 for p, t in zip(predictions, true_labels) if p == 1 and t == 1)

        accuracy = sum(correct) / len(correct)
        # max(..., 1) avoids dividing by zero when nothing was predicted /
        # labelled positive; the metric is then reported as 0.
        precision = true_positives / max(sum(predictions), 1)
        recall = true_positives / max(sum(true_labels), 1)
        f1 = 2 * precision * recall / max(precision + recall, 1e-10)

        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'total_samples': len(results),
            'correct_predictions': sum(correct)
        }

        print(f"\n=== Batch Evaluation Metrics ===")
        print(f"Accuracy:  {accuracy:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall:    {recall:.3f}")
        print(f"F1-Score:  {f1:.3f}")
        print(f"Samples:   {len(results)}")

        return {
            'results': results,
            'metrics': metrics
        }

    except Exception as e:
        print(f"✗ Batch evaluation failed: {e}")
        return None

# Run batch evaluation
if evaluator and test_df is not None:
    batch_results = test_batch_evaluation(evaluator, test_df, use_mock=True)

## 6. Performance and Memory Testing
Test performance characteristics and memory usage

In [None]:
# Test performance metrics
def test_performance_metrics(evaluator, num_samples: int = 10):
    """Time ``test_single_evaluation`` (mock mode) over synthetic prompts.

    When ``psutil`` can be imported, the change in resident memory around
    each evaluation is recorded as well; otherwise only timing is reported.

    Args:
        evaluator: Evaluator forwarded to ``test_single_evaluation``. A falsy
            value aborts the test.
        num_samples: Number of synthetic prompts to evaluate.

    Returns:
        With ``psutil``: a dict with ``num_samples``, ``avg_time_per_sample``,
        ``std_time``, ``total_time``, ``avg_memory_per_sample``,
        ``max_memory_usage`` and ``samples_per_second``.
        Without ``psutil``: ``{'avg_time_per_sample', 'total_time'}``.
        ``None`` when there is no evaluator, nothing to measure, or an
        exception occurred.
    """
    print(f"Testing performance with {num_samples} samples...")

    if not evaluator:
        print("✗ No evaluator available")
        return None

    # Generate test prompts
    test_prompts = [
        f"TEST{i:03d} CODE{i:03d} |eoc| SAMPLE{i:03d} DATA{i:03d}"
        for i in range(num_samples)
    ]
    if not test_prompts:
        # Statistics over an empty list are undefined (np.max would raise).
        print("✗ No samples to measure (num_samples must be >= 1)")
        return None

    # psutil is optional: without it only timing is collected.
    try:
        import psutil
    except ImportError:
        psutil = None
        print("psutil not available for memory monitoring")

    try:
        import gc

        process = psutil.Process() if psutil is not None else None
        times = []
        memory_usage = []

        for i, prompt in enumerate(test_prompts):
            print(f"  Processing sample {i+1}/{num_samples}...")

            if process is not None:
                # Collect first so garbage from earlier iterations does not
                # distort the "before" reading.
                gc.collect()
                memory_before = process.memory_info().rss / 1024 / 1024  # MB

            # perf_counter is monotonic and high-resolution; time.time() can
            # report a zero interval for fast calls on coarse clocks.
            start_time = time.perf_counter()
            test_single_evaluation(evaluator, prompt, use_mock=True)
            times.append(time.perf_counter() - start_time)

            if process is not None:
                memory_after = process.memory_info().rss / 1024 / 1024  # MB
                memory_usage.append(memory_after - memory_before)

        # Calculate statistics
        total_time = sum(times)
        avg_time = np.mean(times)
        # Guard the division: a zero total would otherwise raise.
        samples_per_second = len(times) / total_time if total_time > 0 else float('inf')

        if process is None:
            # Basic timing without memory monitoring
            print(f"Average time per sample: {avg_time:.3f}s")
            print(f"Samples per second: {samples_per_second:.2f}")
            return {'avg_time_per_sample': avg_time, 'total_time': total_time}

        std_time = np.std(times)
        avg_memory = np.mean(memory_usage)
        max_memory = np.max(memory_usage)

        performance_stats = {
            'num_samples': num_samples,
            'avg_time_per_sample': avg_time,
            'std_time': std_time,
            'total_time': total_time,
            'avg_memory_per_sample': avg_memory,
            'max_memory_usage': max_memory,
            'samples_per_second': samples_per_second
        }

        print(f"\n=== Performance Statistics ===")
        print(f"Average time per sample: {avg_time:.3f}s (±{std_time:.3f}s)")
        print(f"Total time: {total_time:.3f}s")
        print(f"Samples per second: {samples_per_second:.2f}")
        print(f"Average memory per sample: {avg_memory:.2f} MB")
        print(f"Max memory usage: {max_memory:.2f} MB")

        return performance_stats

    except Exception as e:
        print(f"✗ Performance testing failed: {e}")
        return None

# Run performance test
if evaluator:
    perf_stats = test_performance_metrics(evaluator, num_samples=5)

## 7. Error Handling and Edge Cases
Test error conditions and edge cases

In [None]:
# Test error handling scenarios
def test_error_scenarios(evaluator):
    """Run ``test_single_evaluation`` (mock mode) on awkward prompts.

    The prompts are: empty, very long, containing punctuation, containing
    non-ASCII characters, and consisting only of ``|eoc|`` separators. For
    each one the name, a preview of at most 100 characters, the expected
    handling and the outcome are printed. Nothing is returned.

    Args:
        evaluator: Evaluator forwarded to ``test_single_evaluation``. A falsy
            value aborts the test.
    """
    print("Testing error scenarios and edge cases...")

    if not evaluator:
        print("✗ No evaluator available")
        return

    error_cases = [
        dict(name='Empty prompt',
             prompt='',
             expected='Handle gracefully'),
        dict(name='Very long prompt',
             prompt=' '.join(['LONGCODE123'] * 1000),
             expected='Truncate or handle gracefully'),
        dict(name='Special characters',
             prompt='N6320@#$ G0378!@# |eoc| Z91048%^&',
             expected='Clean and process'),
        dict(name='Unicode characters',
             prompt='N6320 G0378 |eoc| Z91048 测试',
             expected='Handle encoding'),
        dict(name='Only separators',
             prompt='|eoc| |eoc| |eoc|',
             expected='No codes found'),
    ]

    preview_limit = 100
    for scenario in error_cases:
        prompt = scenario['prompt']
        # Long prompts are cut to the preview limit and marked with an ellipsis.
        suffix = '...' if len(prompt) > preview_limit else ''

        print(f"\n--- {scenario['name']} ---")
        print(f"Input: '{prompt[:preview_limit]}{suffix}'")
        print(f"Expected: {scenario['expected']}")

        try:
            outcome = test_single_evaluation(evaluator, prompt, use_mock=True)
            if not outcome:
                print(f"✗ Failed to handle case")
            else:
                print(f"✓ Handled successfully: prediction={outcome['prediction']}")
        except Exception as exc:
            print(f"✗ Exception occurred: {exc}")


# Run error scenario tests
if evaluator:
    test_error_scenarios(evaluator)

## 8. Integration with Classification Results
Test integration with classification pipeline results

In [None]:
# Test comparison with classification results
def test_classification_comparison(target_results, classification_results=None):
    """Compare target word predictions with classification predictions.

    Predictions are paired by position. If the two lists differ in length,
    only the overlapping prefix is compared, a warning is printed, and the
    agreement rate is computed over the pairs actually compared.

    Args:
        target_results: Dict with a ``results`` list; each entry must provide
            ``prediction`` and ``mcid``.
        classification_results: Dict with a ``predictions`` sequence. When
            ``None``, a built-in mock with five predictions is used.

    Returns:
        A dict with ``agreement_rate``, ``total_samples`` (number of target
        word predictions), ``compared_samples`` (number of pairs compared),
        ``agreements`` and ``disagreements``; ``None`` when there is nothing
        to compare.
    """
    print("Testing comparison with classification results...")

    # An empty results list is treated like missing results; it would
    # otherwise end in a division by zero below.
    if (
        not target_results
        or 'results' not in target_results
        or not target_results['results']
    ):
        print("✗ No target word results available")
        return

    # Create mock classification results if not provided
    if classification_results is None:
        print("Creating mock classification results...")
        classification_results = {
            'predictions': [1, 0, 1, 0, 1],  # Mock predictions
            'probabilities': [0.8, 0.3, 0.9, 0.2, 0.7],
            'model_type': 'LogisticRegression'
        }

    samples = target_results['results']
    target_predictions = [r['prediction'] for r in samples]
    class_predictions = classification_results['predictions']

    print(f"Target word predictions: {target_predictions}")
    print(f"Classification predictions: {class_predictions}")

    compared = min(len(target_predictions), len(class_predictions))
    if compared == 0:
        print("✗ No classification predictions available for comparison")
        return
    if len(target_predictions) != len(class_predictions):
        print(
            f"⚠ Length mismatch: {len(target_predictions)} target word vs "
            f"{len(class_predictions)} classification predictions; "
            f"comparing the first {compared}"
        )

    # Analyze disagreements (zip stops at the shorter list, i.e. `compared` pairs)
    disagreements = [
        {
            'sample_index': i,
            'target_prediction': t,
            'classification_prediction': c,
            'mcid': samples[i]['mcid']
        }
        for i, (t, c) in enumerate(zip(target_predictions, class_predictions))
        if t != c
    ]

    # Calculate agreement over the pairs that were actually compared
    agreement = compared - len(disagreements)
    agreement_rate = agreement / compared

    print(f"\n=== Method Comparison ===")
    print(f"Agreement: {agreement}/{compared} ({agreement_rate:.3f})")

    if disagreements:
        print(f"\nDisagreements ({len(disagreements)} samples):")
        for d in disagreements:
            print(f"  Sample {d['sample_index']} ({d['mcid']}): Target={d['target_prediction']}, Class={d['classification_prediction']}")
    else:
        print("\nNo disagreements - methods agree on all samples")

    return {
        'agreement_rate': agreement_rate,
        'total_samples': len(target_predictions),
        'compared_samples': compared,
        'agreements': agreement,
        'disagreements': disagreements
    }


# This helper takes arguments; keep pytest from collecting it as a test case
# if the notebook is ever exported to a test_*.py module.
test_classification_comparison.__test__ = False

# Run comparison if batch results are available
if 'batch_results' in locals() and batch_results:
    comparison = test_classification_comparison(batch_results)
else:
    print("No batch results available for comparison")

## 9. Custom Debugging Section
Use this section for custom debugging and testing

In [None]:
# Custom debugging - modify as needed
# Scenario inputs: edit these two values to try your own case.
custom_target_codes = ["E119", "Z03818"]  # Modify as needed
custom_prompt = "E119 A1234 |eoc| B5678 Z03818"  # Modify as needed

print(
    "=== Custom Debugging Section ===",
    "Use this cell for your own debugging and testing",
    sep="\n",
)

# Echo the scenario before running the detection on it.
print(
    "\nTesting custom scenario:",
    f"Prompt: '{custom_prompt}'",
    f"Target codes: {custom_target_codes}",
    sep="\n",
)

custom_result = test_target_detection(custom_prompt, custom_target_codes)
print(f"Result: {custom_result}")

In [None]:
# Placeholder cell for ad-hoc debugging: it only prints a banner and a
# reminder. Insert your own code between the two print statements.
print("=== Debug Specific Issues ===")

# Add your debugging code here.
# Ideas:
# - Test specific API responses
# - Debug tokenization issues
# - Test memory leaks
# - Validate configuration edge cases

print("Add your custom debugging code in this cell")

## Summary

This notebook provides comprehensive testing for the target word evaluation pipeline:

1. **Configuration Testing**: Loads the target word configuration and prints sample target codes (valid, invalid and mixed-case) for inspection
2. **API Testing**: Tests text generation endpoints
3. **Detection Logic**: Tests core target code detection functionality
4. **Evaluator Testing**: Tests TargetWordEvaluator class functionality
5. **Batch Processing**: Tests batch evaluation capabilities
6. **Performance**: Measures timing and memory usage
7. **Error Handling**: Tests edge cases and error conditions
8. **Integration**: Compares with classification results
9. **Custom Debugging**: Space for your own testing

Use this notebook to:
- Debug target word detection issues
- Test API connectivity
- Validate evaluation metrics
- Compare evaluation methods
- Optimize performance

Modify the test cases and parameters as needed for your specific debugging requirements.