# Target Word Evaluator Debug

This notebook debugs the target word evaluator using the actual `TargetWordEvaluator` class.
It tests the target word evaluation methods step by step.

In [None]:
# Import pipeline modules
import sys
# Extend the module search path before the project-local imports below;
# presumably the `models` and `evaluation` packages live under this directory
# (machine-specific absolute path — verify it exists on your setup).
sys.path.append('/home/kosaraju/mgpt-serve/mgpt_eval')

from models.config_models import PipelineConfig
from evaluation.target_word_evaluator import TargetWordEvaluator
import pandas as pd

In [None]:
# Load the pipeline configuration from YAML and echo the parts that matter
# for target word evaluation.
config_path = "/home/kosaraju/mgpt-serve/mgpt_eval/configs/examples/config_evaluation_only.yaml"
config = PipelineConfig.from_yaml(config_path)
print(f"Config loaded: {config.job.job_name}")
print(f"Evaluation config: {config.evaluation}")

# hasattr() is also true for an attribute that exists but holds None, in which
# case reading `.target_codes` would raise AttributeError. Fetch the value once
# and test it directly instead.
target_word_config = getattr(config.evaluation, 'target_word', None)
if target_word_config is not None:
    print(f"Target word config: {target_word_config}")
    print(f"Target codes: {target_word_config.target_codes}")
else:
    print("Target word config: not set")

In [None]:
# Build the evaluator from the loaded config, then list its public callables
target_evaluator = TargetWordEvaluator(config)
print("Target word evaluator initialized")

# Collect every non-underscore attribute that can be called
public_methods = []
for attr_name in dir(target_evaluator):
    if attr_name.startswith('_'):
        continue
    if callable(getattr(target_evaluator, attr_name)):
        public_methods.append(attr_name)
print(f"Evaluator methods: {public_methods}")

In [None]:
# Run evaluate_sample() on a few hand-written claim strings and dump the result
print("Testing single sample evaluation...")

# Sample claim sequences; the full list is reused by the batch evaluation cell
test_claims = [
    "N6320 G0378 |eoc| Z91048 M1710",
    "E119 A1234 |eoc| B5678 C9012",
    "Z03818 D3456 |eoc| F7890 G1234",
    "H5678 I9012 |eoc| J1234 K5678",
    "L9012 M3456 |eoc| N6320 O7890",
]

# Only the first three samples are evaluated one at a time
for sample_no, claims in enumerate(test_claims[:3], start=1):
    print(f"\nSample {sample_no}: {claims}")

    try:
        # Call the real evaluator method rather than a stand-in
        result = target_evaluator.evaluate_sample(claims)
        print("  ✓ Evaluation completed")
        print(f"    Result type: {type(result)}")

        if isinstance(result, dict):
            print(f"    Result keys: {list(result.keys())}")
            for key, value in result.items():
                print(f"      {key}: {value}")
        else:
            print("    Result keys: Not a dict")

    except Exception as exc:
        # Keep going with the next sample, but show the full stack for debugging
        print(f"  ✗ Evaluation failed: {exc}")
        import traceback
        traceback.print_exc()

In [None]:
# Evaluate a small DataFrame through whichever batch entry point the evaluator
# exposes, falling back to per-sample calls when it has none.
print("Testing batch evaluation...")

test_data = pd.DataFrame({
    'mcid': ['TW_001', 'TW_002', 'TW_003', 'TW_004', 'TW_005'],
    'claims': test_claims,
    'label': [1, 1, 1, 0, 1],
})

print(f"Test data: {len(test_data)} samples")

try:
    # Preference order: evaluate_batch, then evaluate, then a manual loop
    if hasattr(target_evaluator, 'evaluate_batch'):
        results = target_evaluator.evaluate_batch(test_data)
        print("✓ Batch evaluation completed")
        print(f"  Results type: {type(results)}")
        if hasattr(results, '__len__'):
            length_text = len(results)
        else:
            length_text = 'No length'
        print(f"  Results length: {length_text}")

    elif hasattr(target_evaluator, 'evaluate'):
        results = target_evaluator.evaluate(test_data)
        print("✓ Evaluation completed")
        print(f"  Results type: {type(results)}")

    else:
        print("No batch evaluation method found")
        # Fall back to evaluating each claims string on its own
        results = []
        for claims_text in test_data['claims']:
            results.append(target_evaluator.evaluate_sample(claims_text))
        print(f"✓ Individual evaluations completed: {len(results)} results")

except Exception as exc:
    print(f"✗ Batch evaluation failed: {exc}")
    import traceback
    traceback.print_exc()

In [None]:
# Probe the evaluator for target-code detection helpers and try each one found
print("Testing target code detection...")

# Candidate method names; a name is only called if the evaluator exposes it
detection_methods = ['detect_target_codes', 'check_target_presence', 'find_targets']

test_text = "Generated text: E119 A1234 |eoc| Z03818 B5678 |eoc| C9012 D3456"
print(f"Test text: {test_text}")

for method_name in detection_methods:
    if not hasattr(target_evaluator, method_name):
        print(f"Method not found: {method_name}")
        continue

    print(f"Found method: {method_name}")
    try:
        detected = getattr(target_evaluator, method_name)(test_text)
        print(f"  ✓ {method_name} completed: {detected}")
    except Exception as exc:
        print(f"  ✗ {method_name} failed: {exc}")

In [None]:
# Test text generation (if available): call each generation-style method the
# evaluator exposes with a single prompt and show a preview of what comes back.
print("Testing text generation...")

generation_methods = ['generate_text', 'call_api', 'generate']
test_prompt = "N6320 G0378 |eoc| Z91048 M1710"

for method_name in generation_methods:
    if not hasattr(target_evaluator, method_name):
        print(f"Method not found: {method_name}")
        continue

    print(f"Found method: {method_name}")
    try:
        # Every candidate is invoked the same way (prompt as the only
        # argument); the former per-method branches were identical.
        result = getattr(target_evaluator, method_name)(test_prompt)
        # Truncate long string output so the cell stays readable
        preview = result[:100] if isinstance(result, str) else result
        print(f"  ✓ {method_name} completed: {preview}")
    except Exception as e:
        print(f"  ✗ {method_name} failed: {e}")

In [None]:
# Debug evaluator internals: report which attributes exist, where the target
# codes come from, and the evaluator's public methods and properties.
print("=== Target Word Evaluator Debug ===")

# Check evaluator attributes
print(f"Evaluator config: {hasattr(target_evaluator, 'config')}")
print(f"Evaluator logger: {hasattr(target_evaluator, 'logger')}")
print(f"Evaluator target codes: {hasattr(target_evaluator, 'target_codes')}")
print(f"Evaluator API client: {hasattr(target_evaluator, 'api_client')}")

# Resolve the nested target-word config once. A distinct name is used so the
# notebook-level `config` loaded earlier is not rebound by this cell, and the
# getattr(..., None) chain also tolerates attributes that exist but are None.
evaluator_config = getattr(target_evaluator, 'config', None)
evaluation_config = getattr(evaluator_config, 'evaluation', None)
tw_config = getattr(evaluation_config, 'target_word', None)

# Check target codes: prefer the evaluator's own attribute, else the config
if hasattr(target_evaluator, 'target_codes'):
    print(f"\nTarget codes: {target_evaluator.target_codes}")
elif tw_config is not None:
    print(f"\nTarget codes from config: {getattr(tw_config, 'target_codes', 'Not set')}")

# Check configuration parameters
if tw_config is not None:
    print("\nTarget word configuration:")
    print(f"  Max tokens: {getattr(tw_config, 'max_tokens', 'Not set')}")
    print(f"  Generation attempts: {getattr(tw_config, 'generation_attempts', 'Not set')}")

# Check all methods and attributes: split public names into callables and
# plain values, fetching each attribute only once.
all_attrs = [attr for attr in dir(target_evaluator) if not attr.startswith('_')]
methods = []
properties = []
for attr in all_attrs:
    if callable(getattr(target_evaluator, attr)):
        methods.append(attr)
    else:
        properties.append(attr)

print(f"\nMethods: {methods}")
print(f"Properties: {properties}")

In [None]:
# Try the evaluator's metrics helpers on a tiny hand-made prediction set
print("Testing evaluation metrics calculation...")

# Parallel lists: predicted vs. expected label for five samples
test_predictions = [1, 0, 1, 0, 1]
test_labels = [1, 1, 1, 0, 1]

metrics_methods = ['calculate_metrics', 'compute_metrics', 'get_metrics']

for method_name in metrics_methods:
    if not hasattr(target_evaluator, method_name):
        print(f"Method not found: {method_name}")
        continue

    print(f"Found method: {method_name}")
    try:
        metrics = getattr(target_evaluator, method_name)(test_predictions, test_labels)
        print(f"  ✓ {method_name} completed: {metrics}")
    except Exception as exc:
        print(f"  ✗ {method_name} failed: {exc}")

# If the evaluator can post-process results, feed it a mock results payload
if hasattr(target_evaluator, 'process_results'):
    mock_results = {
        'predictions': test_predictions,
        'labels': test_labels,
        'mcids': ['TW_001', 'TW_002', 'TW_003', 'TW_004', 'TW_005'],
    }
    try:
        processed = target_evaluator.process_results(mock_results)
        print(f"✓ Results processing completed: {processed}")
    except Exception as exc:
        print(f"✗ Results processing failed: {exc}")