# Evaluation: Joint DeBERTa-v3 Model vs Claude Haiku 4.5 Baseline

This notebook evaluates our fine-tuned joint DeBERTa-v3 model against Claude Haiku 4.5 (the current LLM-based approach in our RAG system).

## Models Compared:
1. **Claude Haiku 4.5 (Baseline)** - Current LLM-based tool/intent selection via API
2. **Our Joint DeBERTa-v3 Model** - Fine-tuned transformer for all three tasks

## Metrics:
- **Tool Selection**: F1, Precision, Recall, Exact Match
- **Intent Classification**: F1, Precision, Recall, Exact Match
- **NER**: Entity-level F1 (DeBERTa only; the baseline's extracted entities are not scored)
- **Latency**: Average inference time per query
- **Cost**: API cost vs local inference

In [None]:
# Install dependencies
!pip install -q transformers torch scikit-learn seqeval matplotlib pandas
!pip install -q pytorch-crf anthropic

In [None]:
# Colab-only helpers: Drive mounting and access to notebook secrets
from google.colab import drive, userdata
import os
from pathlib import Path

# Mount Google Drive
drive.mount('/content/drive')

# Setup paths for Google Colab (everything lives under one Drive folder)
PROJECT_ROOT = Path('/content/drive/MyDrive/574-assignment')
DATA_PATH = PROJECT_ROOT.joinpath('data', 'generated')
MODEL_PATH = PROJECT_ROOT.joinpath('models', 'joint_deberta')

# Create results directory
RESULTS_PATH = PROJECT_ROOT.joinpath('results')
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

# Get API key from Colab secrets; if that lookup raises, fall back to the
# process environment (empty string when the variable is not set either).
try:
    ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
except Exception:
    ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')

key_status = 'Yes' if ANTHROPIC_API_KEY else 'No - add ANTHROPIC_API_KEY to Colab Secrets'
print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Model path: {MODEL_PATH}")
print(f"Results path: {RESULTS_PATH}")
print(f"API key configured: {key_status}")

# Verify data files exist
print("\nChecking data files:")
for filename in ('train.json', 'val.json', 'test.json', 'label_mappings.json'):
    if not (DATA_PATH / filename).exists():
        print(f"  ✗ Missing {filename}")
        continue
    print(f"  ✓ Found {filename}")

# Check if trained model exists (config.json is used as the marker file)
print("\nChecking trained model:")
model_found = (MODEL_PATH / 'config.json').exists()
if not model_found:
    print("  ✗ Model not found - run notebook 02 first to train the model")
else:
    print("  ✓ Found trained model")

In [None]:
import json
import time
import asyncio
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import (
    f1_score, precision_score, recall_score, 
    classification_report, confusion_matrix
)
from seqeval.metrics import f1_score as seqeval_f1, classification_report as seqeval_report
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import anthropic
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Set device: use CUDA when PyTorch can see a GPU, otherwise fall back to CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 1. Load Data and Label Mappings

In [None]:
# Load data
def load_data(split, data_dir=None):
    """Load one dataset split from ``<data_dir>/<split>.json``.

    Args:
        split: Split name without the extension (e.g. 'train', 'val', 'test').
        data_dir: Directory that holds the split files. When omitted, the
            notebook-level DATA_PATH is used.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    base_dir = DATA_PATH if data_dir is None else Path(data_dir)
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, so non-ASCII query text loads the same way everywhere.
    with open(base_dir / f'{split}.json', 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the three dataset splits, in train / val / test order.
train_data, val_data, test_data = [
    load_data(split_name) for split_name in ('train', 'val', 'test')
]

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

In [None]:
# Load label mappings from trained model
with open(MODEL_PATH / 'label_mappings.json', 'r', encoding='utf-8') as f:
    mappings = json.load(f)

TOOL_TO_IDX = mappings['tool_to_idx']
INTENT_TO_IDX = mappings['intent_to_idx']
TAG_TO_IDX = mappings['tag_to_idx']
INTENT_TO_TOOL = mappings['intent_to_tool']

# Inverse lookups; indices are coerced to int for use as dict keys.
IDX_TO_TOOL = {int(v): k for k, v in TOOL_TO_IDX.items()}
IDX_TO_INTENT = {int(v): k for k, v in INTENT_TO_IDX.items()}
IDX_TO_TAG = {int(v): k for k, v in TAG_TO_IDX.items()}

# Label lists ordered by index, not by JSON key order: deberta_predict looks
# up TOOLS[j] using the position j of a model output, so position j must hold
# the label whose index is j.
TOOLS = [IDX_TO_TOOL[i] for i in sorted(IDX_TO_TOOL)]
ALL_INTENTS = [IDX_TO_INTENT[i] for i in sorted(IDX_TO_INTENT)]
NER_TAGS = [IDX_TO_TAG[i] for i in sorted(IDX_TO_TAG)]

# Group intents by tool for prompt
TOOL_INTENTS = defaultdict(list)
for intent, tool in INTENT_TO_TOOL.items():
    TOOL_INTENTS[tool].append(intent)

print(f"Tools: {TOOLS}")
print(f"Total intents: {len(ALL_INTENTS)}")
print(f"NER tags: {len(NER_TAGS)}")

In [None]:
# Data distribution analysis
def analyze_distribution(data, name):
    """Tally and print label statistics for one dataset split.

    Args:
        data: Iterable of samples; each sample is read via sample['tools']
            (iterated and measured with len) and sample['intents'].values().
        name: Label used in the printed heading.

    Returns:
        Tuple (tool_counts, intent_counts, num_tools) of Counters: occurrences
        per tool, occurrences per intent, and number of samples per
        tools-per-query count.
    """
    tool_counts, intent_counts, num_tools = Counter(), Counter(), Counter()

    for sample in data:
        sample_tools = sample['tools']
        # Generators keep Counter.update in plain "count each element" mode.
        tool_counts.update(tool for tool in sample_tools)
        intent_counts.update(intent for intent in sample['intents'].values())
        num_tools[len(sample_tools)] += 1

    top_intents = intent_counts.most_common(5)
    print(f"\n{name} Distribution:")
    print(f"  Tool counts: {dict(tool_counts)}")
    print(f"  Num tools per query: {dict(num_tools)}")
    print(f"  Top 5 intents: {top_intents}")

    return tool_counts, intent_counts, num_tools

# Each call prints a summary for the split and returns a
# (tool_counts, intent_counts, num_tools) tuple of Counters.
train_dist = analyze_distribution(train_data, "Train")
test_dist = analyze_distribution(test_data, "Test")

## 2. Claude Haiku 4.5 Baseline

This replicates the LLM-based tool/intent selection currently used in our RAG system.

In [None]:
class ClaudeHaikuBaseline:
    """
    Claude Haiku 4.5 baseline for tool and intent selection.
    Mirrors the current LLM-based approach in ShadowScribe's central_engine.py

    Every query is answered with two sequential API calls (tool/intent
    routing, then entity extraction). Token usage, wall-clock latency and the
    number of calls are accumulated on the instance and summarised by
    get_stats().
    """

    # Prediction substituted when the model returns no usable tool or a call fails.
    FALLBACK_TOOL = 'rulebook'
    FALLBACK_INTENT = 'spell_details'

    def __init__(self, api_key, model="claude-haiku-4-5-20251001"):
        """Create the API client and zero the usage counters.

        Args:
            api_key: Anthropic API key passed to the client.
            model: Model identifier sent with every request.
        """
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model = model
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_latency = 0  # seconds, summed over successful calls
        self.num_calls = 0

    def _get_tool_selector_prompt(self, query):
        """Generate the tool and intent selection prompt (mirrors central_engine.py)"""
        # Inside an f-string '{{' renders as one literal '{'. The JSON example
        # below therefore uses exactly two braces at every nesting level.
        return f"""You are a query router for a D&D 5e RAG system. Analyze the user's query and determine:
1. Which tools are needed to answer the query
2. The specific intention for each tool

Available tools and their intents:

**character_data** - For queries about the player's character sheet
Intents: {', '.join(TOOL_INTENTS['character_data'])}

**session_notes** - For queries about campaign history, past sessions, NPCs encountered
Intents: {', '.join(TOOL_INTENTS['session_notes'])}

**rulebook** - For D&D 5e rules, spells, classes, mechanics
Intents: {', '.join(TOOL_INTENTS['rulebook'])}

User Query: "{query}"

Respond with ONLY valid JSON in this exact format:
{{
  "tools_needed": [
    {{
      "tool": "<tool_name>",
      "intention": "<intent_name>",
      "confidence": <0.0-1.0>
    }}
  ]
}}

Rules:
- Select 1-3 tools based on query needs
- Each tool can only appear once
- Choose the most specific matching intention for each tool
- Multi-part queries may need multiple tools"""

    def _get_entity_extraction_prompt(self, query):
        """Generate the entity extraction prompt (mirrors central_engine.py)"""
        return f"""Extract all D&D-related named entities from this query.

Entity types to look for:
- SPELL: Spell names (e.g., "Fireball", "Cure Wounds")
- CLASS: Character classes (e.g., "Wizard", "Fighter")
- RACE: Character races (e.g., "Elf", "Dwarf")
- CREATURE: Monster/creature names (e.g., "Dragon", "Goblin")
- ITEM: Equipment/magic items (e.g., "Longsword", "Bag of Holding")
- LOCATION: Place names (e.g., "Waterdeep", "Undermountain")
- ABILITY: Ability scores (e.g., "Strength", "Dexterity")
- SKILL: Skills (e.g., "Perception", "Stealth")
- CONDITION: Conditions (e.g., "Poisoned", "Stunned")
- DAMAGE_TYPE: Damage types (e.g., "fire", "radiant")
- FEAT: Feats (e.g., "Great Weapon Master")
- BACKGROUND: Backgrounds (e.g., "Noble", "Outlander")

Query: "{query}"

Respond with ONLY valid JSON:
{{
  "entities": [
    {{"name": "<entity_text>", "type": "<ENTITY_TYPE>", "confidence": <0.0-1.0>}}
  ]
}}

If no entities found, return {{"entities": []}}"""

    def _parse_json_response(self, text, default):
        """Parse a JSON object from an LLM response, handling common issues.

        Tries, in order: the whole text, the contents of the first markdown
        code fence, and the span from the first '{' to the last '}'.

        Args:
            text: Raw response text.
            default: Value returned when no JSON object can be recovered.

        Returns:
            The first candidate that parses to a dict, otherwise `default`.
            Non-dict JSON (e.g. a bare list) is rejected because callers
            read the result with `.get`.
        """
        import re

        if not isinstance(text, str):
            return default

        candidates = [text]
        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if fenced:
            candidates.append(fenced.group(1))
        braced = re.search(r'\{[\s\S]*\}', text)
        if braced:
            candidates.append(braced.group(0))

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(parsed, dict):
                return parsed
        return default

    def _call_model(self, prompt):
        """Send one single-turn prompt and return the text of the first content block.

        Usage counters (tokens, latency, call count) are updated only after
        the request has returned, so they always describe completed calls.
        """
        start_time = time.time()
        response = self.client.messages.create(
            model=self.model,
            max_tokens=500,
            temperature=0,
            messages=[{
                "role": "user",
                "content": prompt
            }]
        )
        elapsed = time.time() - start_time

        self.total_input_tokens += response.usage.input_tokens
        self.total_output_tokens += response.usage.output_tokens
        self.total_latency += elapsed
        self.num_calls += 1

        return response.content[0].text

    def _extract_tools_and_intents(self, tool_data):
        """Turn the parsed routing response into validated (tools, intents).

        Unknown tools, malformed entries and repeated tools are skipped (the
        first occurrence of a tool wins). An intent that does not belong to
        its tool is replaced by that tool's first known intent. If nothing
        usable remains, the class-level fallback prediction is returned.
        """
        tools = []
        intents = {}

        tools_needed = tool_data.get("tools_needed", [])
        if not isinstance(tools_needed, list):
            tools_needed = []

        for tool_info in tools_needed:
            if not isinstance(tool_info, dict):
                continue
            tool_name = tool_info.get("tool", "")
            if tool_name not in TOOLS or tool_name in tools:
                continue
            tools.append(tool_name)

            # Validate intent belongs to tool
            valid_intents = TOOL_INTENTS.get(tool_name, [])
            intent_name = tool_info.get("intention", "")
            if intent_name in valid_intents:
                intents[tool_name] = intent_name
            elif valid_intents:
                # Fallback to first intent for tool
                intents[tool_name] = valid_intents[0]

        # Fallback if no tools predicted
        if not tools:
            tools = [self.FALLBACK_TOOL]
            intents = {self.FALLBACK_TOOL: self.FALLBACK_INTENT}

        return tools, intents

    def predict_single(self, query):
        """Make predictions for a single query using Claude Haiku 4.5.

        Args:
            query: The user query text.

        Returns:
            Tuple (tools, intents, entities): list of tool names, dict mapping
            tool name to intent name, and the raw "entities" value from the
            entity-extraction response (no BIO conversion is done here).

        Raises:
            Whatever the API client raises; predict_batch handles these.
        """
        # Tool and intent selection, then entity extraction (two API calls)
        tool_text = self._call_model(self._get_tool_selector_prompt(query))
        entity_text = self._call_model(self._get_entity_extraction_prompt(query))

        # Parse responses
        tool_data = self._parse_json_response(tool_text, {"tools_needed": []})
        entity_data = self._parse_json_response(entity_text, {"entities": []})

        tools, intents = self._extract_tools_and_intents(tool_data)
        entities = entity_data.get("entities", [])

        return tools, intents, entities

    def predict_batch(self, queries, batch_size=10, delay=0.5):
        """Predict for multiple queries with rate limiting.

        Queries are processed one at a time; `batch_size` only controls how
        often the progress bar advances.

        Args:
            queries: Sequence of query strings.
            batch_size: Number of queries per progress-bar step.
            delay: Seconds to sleep after each query.

        Returns:
            Tuple (all_tools, all_intents, all_entities), one entry per query.
            A query whose prediction raised gets the fallback prediction and
            an empty entity list.

        Raises:
            anthropic.AuthenticationError: Re-raised immediately, because a
                rejected API key would otherwise turn every prediction into
                the fallback and produce a meaningless baseline score.
        """
        all_tools = []
        all_intents = []
        all_entities = []
        num_failed = 0

        for i in tqdm(range(0, len(queries), batch_size), desc="Claude Haiku 4.5 inference"):
            batch = queries[i:i+batch_size]

            for query in batch:
                try:
                    tools, intents, entities = self.predict_single(query)
                    all_tools.append(tools)
                    all_intents.append(intents)
                    all_entities.append(entities)
                except anthropic.AuthenticationError:
                    raise
                except Exception as e:
                    num_failed += 1
                    print(f"Error on query: {query[:50]}... - {e}")
                    all_tools.append([self.FALLBACK_TOOL])
                    all_intents.append({self.FALLBACK_TOOL: self.FALLBACK_INTENT})
                    all_entities.append([])

                time.sleep(delay)  # Rate limiting

        if num_failed:
            # Make silent degradation of the baseline visible to the reader.
            print(f"Warning: {num_failed}/{len(queries)} queries failed and were given the fallback prediction")

        return all_tools, all_intents, all_entities

    def get_stats(self, input_price_per_mtok=1.0, output_price_per_mtok=5.0):
        """Get usage statistics.

        Args:
            input_price_per_mtok: USD per million input tokens.
            output_price_per_mtok: USD per million output tokens.

        Returns:
            Dict with token totals, estimated cost in USD, the number of API
            calls, and 'avg_latency_ms', which is the mean latency per API
            call (not per query; each query makes two calls).
        """
        # Pricing: $1/$5 per million input/output tokens
        input_cost = (self.total_input_tokens / 1_000_000) * input_price_per_mtok
        output_cost = (self.total_output_tokens / 1_000_000) * output_price_per_mtok
        total_cost = input_cost + output_cost
        avg_latency = self.total_latency / max(self.num_calls, 1) * 1000  # ms

        return {
            'total_input_tokens': self.total_input_tokens,
            'total_output_tokens': self.total_output_tokens,
            'total_cost_usd': total_cost,
            'avg_latency_ms': avg_latency,
            'num_api_calls': self.num_calls
        }

In [None]:
# Initialize Claude Haiku 4.5 baseline
# ANTHROPIC_API_KEY comes from the setup cell and may be an empty string when
# no key was configured there.
haiku_baseline = ClaudeHaikuBaseline(api_key=ANTHROPIC_API_KEY)
print(f"Initialized Claude Haiku 4.5 baseline with model: {haiku_baseline.model}")

## 3. Load Our DeBERTa Model

In [None]:
from transformers import DebertaV2Tokenizer, DebertaV2Model, DebertaV2PreTrainedModel, AutoConfig
from torchcrf import CRF

class JointDeBERTaModel(DebertaV2PreTrainedModel):
    """DeBERTa encoder with three heads: tool logits, intent logits and NER emissions.

    The tool and intent heads read the hidden state at sequence position 0;
    the NER head is applied to every position and paired with a CRF layer
    that is used for decoding.
    """

    def __init__(self, config, num_tools=3, num_intents=61, num_ner_tags=25):
        """Build the encoder, the three MLP heads and the CRF.

        Args:
            config: Model configuration; `hidden_size`, `hidden_dropout_prob`
                and (optionally) `classifier_dropout` are read from it.
            num_tools: Output size of the tool head.
            num_intents: Output size of the intent head.
            num_ner_tags: Output size of the NER head and number of CRF tags.
        """
        super().__init__(config)
        self.num_tools = num_tools
        self.num_intents = num_intents
        self.num_ner_tags = num_ner_tags

        self.deberta = DebertaV2Model(config)

        hidden = config.hidden_size
        classifier_dropout = getattr(config, 'classifier_dropout', config.hidden_dropout_prob)

        def build_head(inner_dim, out_dim):
            # Linear -> GELU -> Dropout -> Linear, shared shape of all three heads.
            return nn.Sequential(
                nn.Linear(hidden, inner_dim),
                nn.GELU(),
                nn.Dropout(classifier_dropout),
                nn.Linear(inner_dim, out_dim)
            )

        # Modules are created in the same order as their attribute names appear below.
        self.dropout = nn.Dropout(classifier_dropout)
        self.tool_classifier = build_head(hidden, num_tools)
        self.intent_classifier = build_head(hidden, num_intents)
        self.ner_classifier = build_head(hidden // 2, num_ner_tags)
        self.crf = CRF(num_ner_tags, batch_first=True)
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Run the encoder and all three heads.

        Returns:
            Dict with 'tool_logits', 'intent_logits' (both computed from the
            dropout-regularised position-0 hidden state) and 'ner_emissions'
            (computed from the full sequence of hidden states). Extra keyword
            arguments are accepted and ignored.
        """
        hidden_states = self.deberta(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        pooled = self.dropout(hidden_states[:, 0, :])

        return {
            'tool_logits': self.tool_classifier(pooled),
            'intent_logits': self.intent_classifier(pooled),
            'ner_emissions': self.ner_classifier(hidden_states)
        }

    def decode_ner(self, ner_emissions, attention_mask):
        """Decode tag-index sequences with the CRF, masking by `attention_mask`."""
        mask = attention_mask.bool()
        return self.crf.decode(ner_emissions, mask=mask)

# Load model
print("Loading trained DeBERTa model...")

# Training-time settings saved next to the weights (e.g. CONFIG['max_length']).
CONFIG = json.loads((MODEL_PATH / 'training_config.json').read_text())

# Head sizes come from the label mappings so they match the saved checkpoint.
head_sizes = {
    'num_tools': len(TOOLS),
    'num_intents': len(ALL_INTENTS),
    'num_ner_tags': len(NER_TAGS),
}

tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_PATH)
model = JointDeBERTaModel.from_pretrained(MODEL_PATH, **head_sizes).to(device)
model.eval()
print("Model loaded.")

In [None]:
def deberta_predict(model, tokenizer, queries, device, batch_size=32):
    """Run DeBERTa model prediction on queries with timing.

    Args:
        model: Joint model exposing forward() (returning 'tool_logits',
            'intent_logits', 'ner_emissions') and decode_ner().
        tokenizer: Tokenizer called on a list of query strings.
        queries: Sequence of query strings.
        device: Device the input tensors are moved to.
        batch_size: Number of queries per forward pass.

    Returns:
        Tuple (tool_preds, intent_preds, ner_preds, avg_latency_ms):
        - tool_preds: per query, the tools with sigmoid probability > 0.5,
          or the single most probable tool when none passes the threshold.
        - intent_preds: per query, dict mapping each predicted tool to its
          highest-scoring intent.
        - ner_preds: per query, the CRF-decoded tag names, one per position
          kept by the attention mask.
          NOTE(review): this presumably includes special/sub-word token
          positions; confirm how these align with the dataset's tag lists.
        - avg_latency_ms: mean wall-clock time per query covering
          tokenisation, the forward pass, CRF decoding and copying the
          probabilities back to the host. 0.0 for an empty query list.
    """
    model.eval()

    # Loop-invariant: the intents that belong to each tool, in mapping order.
    intents_by_tool = defaultdict(list)
    for intent, tool in INTENT_TO_TOOL.items():
        intents_by_tool[tool].append(intent)

    tool_preds = []
    intent_preds = []
    all_ner_preds = []
    total_time = 0.0

    for i in tqdm(range(0, len(queries), batch_size), desc="DeBERTa inference"):
        batch_queries = queries[i:i+batch_size]

        start_time = time.perf_counter()

        # Tokenize
        encoding = tokenizer(
            batch_queries,
            max_length=CONFIG['max_length'],
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            ner_preds_batch = model.decode_ner(outputs['ner_emissions'], attention_mask)

        # CUDA kernels run asynchronously; the .cpu() copies below wait for the
        # queued work, so the timer is stopped only after results really exist.
        tool_probs = torch.sigmoid(outputs['tool_logits']).cpu().numpy()
        intent_probs = torch.sigmoid(outputs['intent_logits']).cpu().numpy()

        total_time += time.perf_counter() - start_time

        # Tool predictions
        batch_tool_preds = []
        for probs in tool_probs:
            pred_tools = [TOOLS[j] for j, p in enumerate(probs) if p > 0.5]
            if not pred_tools:
                pred_tools = [TOOLS[np.argmax(probs)]]
            batch_tool_preds.append(pred_tools)
        tool_preds.extend(batch_tool_preds)

        # Intent predictions: best-scoring intent among those of each predicted tool
        for probs, tools in zip(intent_probs, batch_tool_preds):
            pred_intents = {}
            for tool in tools:
                candidates = intents_by_tool.get(tool, [])
                if not candidates:
                    continue  # tool has no intents in the mapping
                pred_intents[tool] = max(
                    candidates, key=lambda intent: probs[INTENT_TO_IDX[intent]]
                )
            intent_preds.append(pred_intents)

        # NER predictions
        for pred_seq in ner_preds_batch:
            all_ner_preds.append([IDX_TO_TAG[idx] for idx in pred_seq])

    # ms per query; guard against an empty query list
    avg_latency = (total_time / len(queries)) * 1000 if len(queries) else 0.0

    return tool_preds, intent_preds, all_ner_preds, avg_latency

## 4. Evaluation Metrics

In [None]:
def compute_metrics(true_data, tool_preds, intent_preds, ner_preds=None):
    """Compute all evaluation metrics.

    Args:
        true_data: Gold samples; reads s['tools'], s['intents'] (a dict whose
            values are intent names) and, when NER is scored, s['bio_tags'].
        tool_preds: Per-sample lists of predicted tool names.
        intent_preds: Per-sample dicts mapping tool name to intent name.
        ner_preds: Optional per-sample lists of predicted tag names.

    Returns:
        Dict with micro-averaged F1/precision/recall and set-level exact
        match for tools and intents, 'ner_f1' (0.0 when `ner_preds` is
        falsy) and 'combined_f1': the mean of tool, intent and NER F1 when
        NER predictions are given, otherwise the mean of tool and intent F1.
    """

    def micro_scores(classes, gold_labels, predicted_labels):
        # Binarise both sides over the same fixed class list, then micro-average.
        binarizer = MultiLabelBinarizer(classes=classes)
        y_true = binarizer.fit_transform(gold_labels)
        y_pred = binarizer.transform(predicted_labels)
        return (
            f1_score(y_true, y_pred, average='micro'),
            precision_score(y_true, y_pred, average='micro'),
            recall_score(y_true, y_pred, average='micro'),
        )

    def exact_match(gold_labels, predicted_labels):
        # A sample counts only if the predicted label set equals the gold set.
        return np.mean([
            set(gold) == set(pred)
            for gold, pred in zip(gold_labels, predicted_labels)
        ])

    # Tool metrics
    gold_tools = [sample['tools'] for sample in true_data]
    tool_f1, tool_precision, tool_recall = micro_scores(TOOLS, gold_tools, tool_preds)
    tool_exact_match = exact_match(gold_tools, tool_preds)

    # Intent metrics
    gold_intents = [list(sample['intents'].values()) for sample in true_data]
    predicted_intents = [list(per_tool.values()) for per_tool in intent_preds]
    intent_f1, intent_precision, intent_recall = micro_scores(
        ALL_INTENTS, gold_intents, predicted_intents
    )
    intent_exact_match = exact_match(gold_intents, predicted_intents)

    # NER metrics (if provided)
    has_ner = bool(ner_preds)
    ner_f1 = 0.0
    if has_ner:
        aligned_true, aligned_pred = [], []
        for gold_tags, pred_tags in zip((s['bio_tags'] for s in true_data), ner_preds):
            # Both sequences are cut to the shorter length before scoring.
            # NOTE(review): this assumes position k of the prediction refers
            # to the same token as position k of bio_tags - confirm.
            shared_len = min(len(gold_tags), len(pred_tags))
            aligned_true.append(gold_tags[:shared_len])
            aligned_pred.append(pred_tags[:shared_len])
        ner_f1 = seqeval_f1(aligned_true, aligned_pred, average='micro')

    if has_ner:
        combined_f1 = (tool_f1 + intent_f1 + ner_f1) / 3
    else:
        combined_f1 = (tool_f1 + intent_f1) / 2

    return {
        'tool_f1': tool_f1,
        'tool_precision': tool_precision,
        'tool_recall': tool_recall,
        'tool_exact_match': tool_exact_match,
        'intent_f1': intent_f1,
        'intent_precision': intent_precision,
        'intent_recall': intent_recall,
        'intent_exact_match': intent_exact_match,
        'ner_f1': ner_f1,
        'combined_f1': combined_f1
    }

## 5. Run Evaluations

**Note**: For cost efficiency, Claude Haiku 4.5 is evaluated on a random subset of the test data. DeBERTa is evaluated on the same subset so that the two models are compared on identical queries.

In [None]:
# Sample size for Claude evaluation (to manage API costs)
# Set to len(test_data) for full evaluation
HAIKU_SAMPLE_SIZE = 500  # ~$0.50 in API costs

# Fixed seed so both models are always scored on the same subset
np.random.seed(42)
num_eval_samples = min(HAIKU_SAMPLE_SIZE, len(test_data))
sample_indices = np.random.choice(len(test_data), size=num_eval_samples, replace=False)

sample_test_data = []
sample_queries = []
for idx in sample_indices:
    record = test_data[idx]
    sample_test_data.append(record)
    sample_queries.append(record['query'])

print(f"Evaluating on {len(sample_test_data)} samples")

In [None]:
# Evaluate Claude Haiku 4.5
print("\nEvaluating Claude Haiku 4.5 baseline...")
print("This may take a while due to API rate limits...")

# 100ms pause after every query; progress bar advances every 10 queries
haiku_tools, haiku_intents, haiku_entities = haiku_baseline.predict_batch(
    sample_queries, batch_size=10, delay=0.1
)

haiku_stats = haiku_baseline.get_stats()
# The baseline's entities are not passed on, so only tools and intents are scored.
haiku_metrics = compute_metrics(sample_test_data, haiku_tools, haiku_intents)

haiku_total_tokens = haiku_stats['total_input_tokens'] + haiku_stats['total_output_tokens']
print(
    f"\nClaude Haiku 4.5 Stats:",
    f"  API Calls: {haiku_stats['num_api_calls']}",
    f"  Total Tokens: {haiku_total_tokens}",
    f"  Est. Cost: ${haiku_stats['total_cost_usd']:.4f}",
    f"  Avg Latency: {haiku_stats['avg_latency_ms']:.1f}ms per call",
    sep="\n",
)

In [None]:
# Evaluate DeBERTa on same sample
print("\nEvaluating DeBERTa model on same sample...")

deberta_outputs = deberta_predict(model, tokenizer, sample_queries, device)
deberta_tools, deberta_intents, deberta_ner, deberta_latency = deberta_outputs

# NER predictions are included, so combined_f1 here averages three tasks.
deberta_metrics = compute_metrics(
    sample_test_data, deberta_tools, deberta_intents, deberta_ner
)

print(
    f"\nDeBERTa Stats:",
    f"  Avg Latency: {deberta_latency:.2f}ms per query",
    f"  Cost: $0.00 (local inference)",
    sep="\n",
)

## 6. Results Comparison

In [None]:
# Create comparison table
# Metric keys shown as percentages for both models, in row order.
quality_keys = [
    'tool_f1', 'tool_precision', 'tool_recall', 'tool_exact_match',
    'intent_f1', 'intent_precision', 'intent_recall', 'intent_exact_match',
]


def _as_percent(value):
    """Format a 0-1 score as a percentage with one decimal."""
    return f"{value*100:.1f}%"


num_eval_queries = max(len(sample_queries), 1)

# haiku_stats['avg_latency_ms'] is per API call, and every query needs more
# than one call. Convert to per-query latency so the row is comparable with
# DeBERTa's per-query figure. (Queries whose calls failed contribute no
# latency, so this is a slight underestimate when there were failures.)
haiku_latency_per_query_ms = (
    haiku_stats['avg_latency_ms'] * haiku_stats['num_api_calls'] / num_eval_queries
)
haiku_cost_per_1k = haiku_stats['total_cost_usd'] / num_eval_queries * 1000

# Note: 'Combined F1' averages tool+intent for Haiku (its NER is not scored)
# but tool+intent+NER for DeBERTa, as returned by compute_metrics.
results = pd.DataFrame({
    'Metric': [
        'Tool F1', 'Tool Precision', 'Tool Recall', 'Tool Exact Match',
        'Intent F1', 'Intent Precision', 'Intent Recall', 'Intent Exact Match',
        'NER F1', 'Combined F1',
        '---',
        'Avg Latency (ms)', 'Est. Cost per 1K queries'
    ],
    'Claude Haiku 4.5': (
        [_as_percent(haiku_metrics[key]) for key in quality_keys]
        + [
            'N/A',
            _as_percent(haiku_metrics['combined_f1']),
            '---',
            f"{haiku_latency_per_query_ms:.0f}",
            f"${haiku_cost_per_1k:.2f}",
        ]
    ),
    'DeBERTa-v3 (Ours)': (
        [_as_percent(deberta_metrics[key]) for key in quality_keys]
        + [
            _as_percent(deberta_metrics['ner_f1']),
            _as_percent(deberta_metrics['combined_f1']),
            '---',
            f"{deberta_latency:.1f}",
            '$0.00',
        ]
    )
})

print("\n" + "="*70)
print("EVALUATION RESULTS: Claude Haiku 4.5 vs DeBERTa-v3")
print("="*70)
print(results.to_string(index=False))

In [None]:
# Save results
results_csv_path = RESULTS_PATH / 'evaluation_results.csv'
RESULTS_PATH.mkdir(exist_ok=True)  # no-op when the directory already exists
results.to_csv(results_csv_path, index=False)
print(f"\nResults saved to {results_csv_path}")

## 7. Visualization

In [None]:
# Performance comparison chart
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

models = ['Claude Haiku 4.5', 'DeBERTa-v3']
colors = ['#FF6B6B', '#4ECDC4']

# Tool F1 and Intent F1 panels share the same layout.
f1_panels = [
    (axes[0], 'Tool Classification F1', 'tool_f1'),
    (axes[1], 'Intent Classification F1', 'intent_f1'),
]
for panel, panel_title, metric_key in f1_panels:
    scores = [haiku_metrics[metric_key], deberta_metrics[metric_key]]
    score_bars = panel.bar(models, scores, color=colors)
    panel.set_ylabel('F1 Score')
    panel.set_title(panel_title)
    panel.set_ylim(0, 1)
    for bar, score in zip(score_bars, scores):
        panel.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                   f'{score:.2f}', ha='center', fontsize=12)

# Latency comparison
# haiku_stats['avg_latency_ms'] is per API call and each query makes more than
# one call, so scale it to per-query latency to match this panel's title and
# DeBERTa's per-query measurement.
haiku_latency_per_query_ms = (
    haiku_stats['avg_latency_ms'] * haiku_stats['num_api_calls']
    / max(len(sample_queries), 1)
)
latencies = [haiku_latency_per_query_ms, deberta_latency]
latency_bars = axes[2].bar(models, latencies, color=colors)
axes[2].set_ylabel('Latency (ms)')
axes[2].set_title('Average Latency per Query')
for bar, lat in zip(latency_bars, latencies):
    axes[2].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                 f'{lat:.0f}ms', ha='center', fontsize=12)

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cost comparison visualization
fig, ax = plt.subplots(figsize=(10, 6))

# Project costs for different query volumes by scaling the measured
# per-query API cost linearly.
query_volumes = [100, 1000, 10000, 100000]
cost_per_query_haiku = haiku_stats['total_cost_usd'] / len(sample_queries)

haiku_costs = [volume * cost_per_query_haiku for volume in query_volumes]
deberta_costs = [0 for _ in query_volumes]  # Free after training

x = np.arange(len(query_volumes))
width = 0.35
half_width = width / 2

bars1 = ax.bar(x - half_width, haiku_costs, width, label='Claude Haiku 4.5', color='#FF6B6B')
bars2 = ax.bar(x + half_width, deberta_costs, width, label='DeBERTa-v3 (Local)', color='#4ECDC4')

ax.set_ylabel('Cost (USD)')
ax.set_xlabel('Number of Queries')
ax.set_title('Inference Cost Comparison')
ax.set_xticks(x)
ax.set_xticklabels([f'{volume:,}' for volume in query_volumes])
ax.legend()

# Add cost labels above the Haiku bars (zero-cost bars stay unlabelled)
for bar, cost in zip(bars1, haiku_costs):
    if not cost > 0:
        continue
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.5
    ax.text(label_x, label_y, f'${cost:.2f}', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig(RESULTS_PATH / 'cost_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Error Analysis

In [None]:
def compare_errors(true_data, haiku_preds, deberta_preds):
    """Compare where each model makes tool-selection errors.

    A prediction is correct when its set of tools equals the gold set.

    Args:
        true_data: Gold samples; reads sample['tools'] and sample['query'].
        haiku_preds: Sequence whose element 0 is the per-sample list of tools
            predicted by the Haiku baseline (further elements are ignored).
        deberta_preds: Same layout for the DeBERTa model.

    Returns:
        Dict with the counts 'both_correct', 'haiku_only_correct',
        'deberta_only_correct', 'both_wrong', and 'disagreements': one record
        per sample where the two models predicted different tool sets, with
        the tool lists sorted so the output is reproducible between runs.
    """
    outcome_counts = {
        'both_correct': 0,
        'haiku_only_correct': 0,
        'deberta_only_correct': 0,
        'both_wrong': 0,
    }
    disagreements = []

    for i, sample in enumerate(true_data):
        true_tools = set(sample['tools'])
        haiku_tools = set(haiku_preds[0][i])
        deberta_tools = set(deberta_preds[0][i])

        haiku_match = true_tools == haiku_tools
        deberta_match = true_tools == deberta_tools

        if haiku_match and deberta_match:
            outcome_counts['both_correct'] += 1
        elif haiku_match:
            outcome_counts['haiku_only_correct'] += 1
        elif deberta_match:
            outcome_counts['deberta_only_correct'] += 1
        else:
            outcome_counts['both_wrong'] += 1

        if haiku_tools != deberta_tools:
            # sorted(): set iteration order is not stable across runs
            disagreements.append({
                'query': sample['query'],
                'true': sorted(true_tools),
                'haiku': sorted(haiku_tools),
                'deberta': sorted(deberta_tools)
            })

    return {**outcome_counts, 'disagreements': disagreements}

# Only element 0 (the tool predictions) of each tuple is used by compare_errors.
haiku_prediction_pair = (haiku_tools, haiku_intents)
deberta_prediction_pair = (deberta_tools, deberta_intents)
error_analysis = compare_errors(sample_test_data, haiku_prediction_pair, deberta_prediction_pair)

print("\nTool Prediction Agreement Analysis:")
print("="*50)

num_compared = len(sample_test_data)
agreement_rows = (
    ("Both correct", 'both_correct'),
    ("Haiku correct, DeBERTa wrong", 'haiku_only_correct'),
    ("DeBERTa correct, Haiku wrong", 'deberta_only_correct'),
    ("Both wrong", 'both_wrong'),
)
for row_label, count_key in agreement_rows:
    count = error_analysis[count_key]
    print(f"{row_label}: {count} ({100*count/num_compared:.1f}%)")

print(f"\nTotal disagreements: {len(error_analysis['disagreements'])}")

In [None]:
# Show sample disagreements (first 10 records, query text cut at 80 characters)
print("\nSample Disagreements (where models predict different tools):")
print("="*70)

shown_disagreements = error_analysis['disagreements'][:10]
for case in shown_disagreements:
    print(
        f"\nQuery: {case['query'][:80]}...",
        f"  True:    {case['true']}",
        f"  Haiku:   {case['haiku']}",
        f"  DeBERTa: {case['deberta']}",
        sep="\n",
    )

## 9. Summary and Conclusions

In [None]:
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

print("\n** Performance Comparison **")
print("-"*50)
print(f"\nClaude Haiku 4.5 (LLM Baseline):")
print(f"  Tool F1:    {haiku_metrics['tool_f1']*100:.1f}%")
print(f"  Intent F1:  {haiku_metrics['intent_f1']*100:.1f}%")
print(f"  Combined:   {haiku_metrics['combined_f1']*100:.1f}%")

print(f"\nDeBERTa-v3 (Our Model):")
print(f"  Tool F1:    {deberta_metrics['tool_f1']*100:.1f}%")
print(f"  Intent F1:  {deberta_metrics['intent_f1']*100:.1f}%")
print(f"  NER F1:     {deberta_metrics['ner_f1']*100:.1f}%")
print(f"  Combined:   {deberta_metrics['combined_f1']*100:.1f}%")

# Calculate improvements
tool_improvement = (deberta_metrics['tool_f1'] - haiku_metrics['tool_f1']) * 100
intent_improvement = (deberta_metrics['intent_f1'] - haiku_metrics['intent_f1']) * 100

# The stored 'combined_f1' values are not comparable: DeBERTa's averages
# tool+intent+NER while Haiku's averages tool+intent only. Compare the
# tool+intent average of both models instead.
haiku_routing_f1 = (haiku_metrics['tool_f1'] + haiku_metrics['intent_f1']) / 2
deberta_routing_f1 = (deberta_metrics['tool_f1'] + deberta_metrics['intent_f1']) / 2
combined_improvement = (deberta_routing_f1 - haiku_routing_f1) * 100

print(f"\n** Improvement over Baseline **")
print("-"*50)
print(f"  Tool F1:    {tool_improvement:+.1f}%")
print(f"  Intent F1:  {intent_improvement:+.1f}%")
print(f"  Combined (tool+intent): {combined_improvement:+.1f}%")

print(f"\n** Efficiency Comparison **")
print("-"*50)
# haiku_stats['avg_latency_ms'] is per API call; scale by calls per query so
# the speedup compares per-query latency on both sides.
haiku_latency_per_query_ms = (
    haiku_stats['avg_latency_ms'] * haiku_stats['num_api_calls']
    / max(len(sample_queries), 1)
)
speedup = haiku_latency_per_query_ms / deberta_latency
print(f"  Latency per query: {deberta_latency:.1f}ms vs {haiku_latency_per_query_ms:.0f}ms ({speedup:.0f}x faster)")
print(f"  Cost per 1K queries: $0.00 vs ${haiku_stats['total_cost_usd']/len(sample_queries)*1000:.2f}")

print(f"\n** Additional Capability **")
print("-"*50)
# The baseline does run entity extraction; its output is simply not scored.
print(f"  DeBERTa NER F1: {deberta_metrics['ner_f1']*100:.1f}% - the LLM baseline's extracted entities are not scored in this notebook")

print("\n" + "="*70)

In [None]:
# Save complete evaluation report
baseline_entry = {
    'name': 'Claude Haiku 4.5',
    'model_id': haiku_baseline.model,
    'metrics': haiku_metrics,
    'stats': haiku_stats
}
our_model_entry = {
    'name': 'Joint DeBERTa-v3',
    'model_id': CONFIG['model_name'],
    'metrics': deberta_metrics,
    'avg_latency_ms': deberta_latency
}
# Only the four counts are stored; the disagreement records are left out.
error_count_keys = ('both_correct', 'haiku_only_correct', 'deberta_only_correct', 'both_wrong')

report = {
    'evaluation_date': time.strftime('%Y-%m-%d %H:%M:%S'),
    'sample_size': len(sample_test_data),
    'models': {
        'baseline': baseline_entry,
        'our_model': our_model_entry
    },
    'improvements': {
        'tool_f1': tool_improvement,
        'intent_f1': intent_improvement,
        'combined_f1': combined_improvement,
        'speedup_factor': speedup
    },
    'error_analysis': {key: error_analysis[key] for key in error_count_keys}
}

report_path = RESULTS_PATH / 'full_evaluation_report.json'
with open(report_path, 'w') as f:
    json.dump(report, f, indent=2)

print(f"\nFull evaluation report saved to {report_path}")