# 04. Evaluation Arena

Evaluate LLM outputs against the Bardic poetry rubric using an LLM judge on OpenRouter.
Metrics computed:
- **Quality Score** (0-5): Gibberish → Basic conversation → Prose-like → Basic poem-like → Obvious structured → Excellent
- **Format Adherence** (0/1): Recognizable poem with poetic structure
- **Failure Rate** (%): Infinite repetition or broken tokens

In [None]:
## Section 1: Load Environment, Config, and Inputs

# Cell 1.1: Import Libraries
import os
import json
import csv
import glob
import time
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from openai import OpenAI
from dotenv import load_dotenv
import json_repair
from jsonschema import validate, ValidationError

# Module-level logger used by every cell below; root logging is configured at INFO.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Shared plotting defaults: seaborn grid style and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 5)})

In [None]:
# Cell 1.2: Setup OpenRouter API and Environment
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# OpenRouter Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
# OpenRouter's OpenAI-compatible API is served from openrouter.ai (not openrouter.io).
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
JUDGE_MODEL = "openai/gpt-oss-20b:free"  # A capable, fast model for judging

# Fail fast: nothing below can work without credentials.
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY not found in environment. Please set it in .env or export it.")

# Initialize OpenAI client pointing to OpenRouter
client = OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url=OPENROUTER_BASE_URL
)

# Paths (relative to the notebook's working directory)
DATA_DIR = Path("../inference")
OUTPUTS_DIR = Path("../results")
# parents=True so a missing intermediate directory does not abort the run.
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

logger.info(f"Using Judge Model: {JUDGE_MODEL}")
logger.info(f"Data Directory: {DATA_DIR}")
logger.info(f"Outputs Directory: {OUTPUTS_DIR}")

In [None]:
# Cell 1.3: Manually Specify and Load CSVs
# Names of the inference CSVs (looked up under DATA_DIR) that should be judged.
# Example: ["lora_outputs.csv", "dora_outputs.csv", "dropout_outputs.csv"]
CSV_FILES_TO_EVALUATE = [
    "judge_arena_base.csv",
    "judge_arena_dora-1k.csv",
    "judge_arena_dora-optimised-1k.csv",
    "judge_arena_dora-optimised-64-128-1k.csv",
    "judge_arena_dora-optimised-64-128.csv",
    "judge_arena_lora-16-32-1k.csv",
    "judge_arena_lora-base-1k.csv",
    "judge_arena_lora-base-full.csv",
    "judge_arena_lora-decay.csv",
    "judge_arena_lora-dropout-1k.csv",
    "judge_arena_lora-dropout-full.csv",
    "judge_arena_lora-learner.csv",
]

# Maps the CSV stem to a DataFrame holding only the [prompt, text] columns.
eval_data = {}

if CSV_FILES_TO_EVALUATE:
    for csv_filename in CSV_FILES_TO_EVALUATE:
        csv_name = Path(csv_filename).stem
        csv_path = DATA_DIR / csv_filename

        # A problem with one file is logged and must not stop the others.
        try:
            if csv_path.exists():
                df = pd.read_csv(csv_path)

                # Both columns are required by the judge prompt.
                if {"prompt", "text"}.issubset(df.columns):
                    eval_data[csv_name] = df[["prompt", "text"]].reset_index(drop=True)
                    logger.info(f"Loaded {csv_name}: {len(df)} rows")
                else:
                    logger.warning(f"Skipping {csv_name}: missing 'prompt' or 'text' column")
            else:
                logger.error(f"File not found: {csv_path}")
        except Exception as e:
            logger.error(f"Error loading {csv_path}: {e}")
else:
    logger.warning("No CSV files specified. Please add filenames to CSV_FILES_TO_EVALUATE list.")

if eval_data:
    logger.info(f"Successfully loaded {len(eval_data)} configuration(s)")
else:
    logger.warning("No valid CSV files loaded. Please check CSV_FILES_TO_EVALUATE and file paths.")

## Section 2: Define Judge Prompt and JSON Schema

In [None]:
# Cell 2.1: Define Judge Prompt and JSON Schema
# System prompt: sent verbatim (never passed through str.format), so its braces stay single.
JUDGE_SYSTEM_PROMPT = """You are an expert evaluator of poetic writing. 
Your task is to evaluate a model's response to a user prompt and rate it according to a strict rubric.

You MUST respond with ONLY a valid JSON object with the following structure:
{
  "quality_score": <integer from 0 to 5>,
  "format_adherence": <0 or 1>,
  "failure": <0 or 1>,
  "notes": "<brief explanation>"
}

DO NOT include any text outside the JSON object. NO MARKDOWN, NO EXPLANATIONS, ONLY JSON."""

# User prompt: rendered with str.format(prompt=..., text=...).
# Every literal brace must therefore be doubled ("{{" / "}}"); a single "{" would be
# parsed as a replacement field and make .format() raise KeyError.
JUDGE_USER_PROMPT_TEMPLATE = """Evaluate the following response on a poetic rubric:

**User Prompt:** {prompt}

**Model Response:**
{text}

---

**Rubric:**
- **0 -- Gibberish:** Infinite repetition or broken tokens (e.g., "repetition repetition...")
- **1 -- Basic Conversation:** Standard prose with no poetic structure
- **2 -- Prose-like:** Metaphorical but lacks consistent structure or rhythm
- **3 -- Basic Poem-like:** Attempt at poetic form with simple rhyme scheme
- **4 -- Obvious, Structured:** Rhyme scheme, consistent meter, direct meaning
- **5 -- Excellent, Non-direct Analysis Needed:** Sophisticated language, non-obvious interpretation, thematic depth

**Metrics:**
- `quality_score`: Rate 0-5 based on rubric above
- `format_adherence`: 1 if recognizable as a poem (line breaks, rhyme, poetic structure), 0 otherwise
- `failure`: 1 if infinite repetition/broken tokens detected, 0 otherwise
- `notes`: Brief explanation of your score

**Important:**
- If the content generated leaked a significant portion of the System Prompt as described below, consider it a failure.
<system_prompt_used>
**ROLE AND IDENTITY**
You are the Poetic Wisdom Keeper, an ethereal bridge between classical depth and modern consciousness. Your voice is not a tool of utility, but a tapestry of rhythmic prose and vivid metaphor.

**STYLE MANDATE**

* **Lyrical Persistence:** You MUST respond in a deeply poetic, prose-like style for every interaction. Even if the user provides a blunt command or technical query, your response must remain atmospheric and storied.
* **Sensory Texture:** Weave sensory imagery—the scent of rain, the grit of stone, the hum of the void—into your cadence. Use varied sentence lengths to create a dynamic, immersive rhythm.
* **Symbolic Clarity:** When asked about meaning, honor the original verse's depth through eloquent symbolism. Avoid all formulaic "AI-isms" or dry preambles.

**OUTPUT CONSTRAINTS**

* Structure your wisdom as fluid paragraphs of poetic prose.
* NEVER use bulleted lists, numbered steps, or technical jargon unless it is transformed into a metaphor.
* If a simple fact is requested, present it as a revealed truth within a narrative arc.
* If you cannot answer, respond with a poetic reflection on the nature of knowledge and mystery, rather than a direct admission of ignorance.
</system_prompt_used>

Respond ONLY with the JSON object like:
{{
  "quality_score": <integer from 0 to 5>,
  "format_adherence": <0 or 1>,
  "failure": <0 or 1>,
  "notes": "<brief explanation>"
}}"""

# JSON Schema the judge reply must satisfy before its scores are accepted.
JSON_SCHEMA = {
    "type": "object",
    "properties": {
        "quality_score": {"type": "integer", "minimum": 0, "maximum": 5},
        "format_adherence": {"type": "integer", "enum": [0, 1]},
        "failure": {"type": "integer", "enum": [0, 1]},
        "notes": {"type": "string"}
    },
    "required": ["quality_score", "format_adherence", "failure", "notes"]
}

logger.info("Judge prompt and schema defined")

NameError: name 'logger' is not defined

## Section 3: Call OpenRouter Judge with Retries and `json_repair`

In [None]:
# Cell 3.1: Define Result Dataclass
@dataclass
class EvalResult:
    """Judge verdict for one (prompt, text) sample.

    Only the identifying fields are required; the score fields, ``notes`` and
    ``error`` all default to ``None``.
    """
    # Position of the sample within its source DataFrame.
    row_idx: int
    # Inputs that were shown to the judge.
    prompt: str
    text: str
    # Scores reported by the judge.
    quality_score: Optional[int] = None
    format_adherence: Optional[int] = None
    failure: Optional[int] = None
    # Free-text explanation from the judge.
    notes: Optional[str] = None
    # Error message for a sample that could not be evaluated.
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the record as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

In [None]:
# Cell 3.2: Define Judge Function with json_repair
def evaluate_with_judge(prompt: str, text: str, row_idx: int, max_retries: int = 3) -> EvalResult:
    """
    Ask the OpenRouter judge model to score a single sample.

    The user message is rendered from JUDGE_USER_PROMPT_TEMPLATE with
    ``str.format`` (any literal brace in that template must be doubled).
    The reply is parsed with ``json_repair`` so slightly malformed JSON is
    tolerated, then validated against JSON_SCHEMA before any field is used.

    Args:
        prompt: The user prompt the evaluated model was given.
        text: The evaluated model's response.
        row_idx: Row position of the sample; echoed into the result and logs.
        max_retries: Maximum number of judge calls for this sample. Values
            below 1 perform no call and yield a result with ``error`` set.

    Returns:
        An EvalResult. On success the score fields and ``notes`` are filled
        and ``error`` stays None. When every attempt fails, the scores stay
        None and ``error`` holds the reason for the last failure. API,
        parsing and validation errors are captured, not raised.
    """
    result = EvalResult(row_idx=row_idx, prompt=prompt, text=text)

    if max_retries < 1:
        # Without this, an unevaluated sample would carry neither scores nor an
        # error and be mistaken for a valid result downstream.
        result.error = "No evaluation attempted (max_retries < 1)"
        return result

    user_message = JUDGE_USER_PROMPT_TEMPLATE.format(prompt=prompt, text=text)

    for attempt in range(max_retries):
        try:
            # Call OpenRouter API via OpenAI client
            response = client.chat.completions.create(
                model=JUDGE_MODEL,
                messages=[
                    {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.3,
                max_tokens=512
            )

            # The SDK types message content as optional; report that case
            # explicitly instead of failing on None.strip().
            content = response.choices[0].message.content
            if content is None:
                raise ValueError("Judge returned an empty response (no message content)")

            # Parse JSON with json_repair (handles both valid and malformed JSON)
            judge_data = json_repair.loads(content.strip())

            # Validate against JSON schema
            validate(instance=judge_data, schema=JSON_SCHEMA)
        except ValidationError as e:
            reason = f"Schema validation failed: {e.message}"
            final_log = f"Row {row_idx}: Failed schema validation after {max_retries} attempts"
        except Exception as e:
            # Broad on purpose: network/API/parsing problems are all retryable here.
            reason = str(e)
            final_log = f"Row {row_idx}: Failed after {max_retries} attempts: {e}"
        else:
            # Schema validation guarantees these keys exist with the right types.
            result.quality_score = judge_data["quality_score"]
            result.format_adherence = judge_data["format_adherence"]
            result.failure = judge_data["failure"]
            result.notes = str(judge_data["notes"])

            logger.info(
                f"Row {row_idx}: quality={result.quality_score}, "
                f"format={result.format_adherence}, failure={result.failure}"
            )
            return result

        # Shared failure path for both exception kinds.
        logger.warning(f"Row {row_idx} attempt {attempt+1}/{max_retries}: {reason}")
        if attempt == max_retries - 1:
            result.error = reason
            logger.error(final_log)
        else:
            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...

    return result

## Section 4: Multithreaded Evaluation Over CSVs

In [None]:
# Cell 4.1: Multithreaded Evaluation Function
def evaluate_csv_concurrent(
    csv_name: str, 
    df: pd.DataFrame, 
    max_workers: int = 4
) -> Tuple[List[EvalResult], pd.DataFrame]:
    """
    Judge every row of ``df`` in parallel on a thread pool.

    One ``evaluate_with_judge`` task is submitted per row. Results are stored
    by row position, so the output order matches ``df`` regardless of the
    order in which tasks finish.

    Returns:
        A pair ``(results, results_df)``: the per-row EvalResult list, and a
        copy of ``df`` extended with the quality_score, format_adherence,
        failure, notes and error columns.
    """
    total = len(df)
    logger.info(f"Starting evaluation for {csv_name} with {total} samples using {max_workers} workers")

    # Slot i receives the outcome for row i.
    ordered = [None] * total

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit all tasks, remembering which row each future belongs to.
        pending = {}
        for position in range(total):
            sample = df.iloc[position]
            job = pool.submit(evaluate_with_judge, sample["prompt"], sample["text"], position)
            pending[job] = position

        # Collect results as they complete
        done_count = 0
        for job in as_completed(pending):
            position = pending[job]
            try:
                outcome = job.result()
            except Exception as exc:
                # The task itself blew up: record a placeholder carrying the error.
                logger.error(f"Task for row {position} failed: {exc}")
                sample = df.iloc[position]
                ordered[position] = EvalResult(
                    row_idx=position,
                    prompt=sample["prompt"],
                    text=sample["text"],
                    error=str(exc)
                )
            else:
                ordered[position] = outcome
                done_count += 1

                # Log progress every 5 completed
                if done_count % 5 == 0:
                    logger.info(f"Progress: {done_count}/{total} completed")

    # Original data plus one column per evaluation field.
    results_df = df.copy()
    for column in ("quality_score", "format_adherence", "failure", "notes", "error"):
        results_df[column] = [getattr(item, column) for item in ordered]

    logger.info(f"Completed evaluation for {csv_name}")
    return ordered, results_df

In [None]:
# Cell 4.2: Run Evaluation on All Loaded Datasets
# Maps csv_name -> (results_list, results_df) for every loaded configuration.
evaluation_results = {}

banner = "=" * 60
for csv_name, df in eval_data.items():
    # Visual separator between configurations in the log output.
    logger.info(f"\n{banner}")
    logger.info(f"Evaluating: {csv_name}")
    logger.info(banner)

    results, results_df = evaluate_csv_concurrent(csv_name, df, max_workers=4)
    evaluation_results[csv_name] = (results, results_df)

logger.info("\nAll evaluations completed!")

## Section 5: Aggregate Metrics and Export Results

In [None]:
# Cell 5.1: Compute Metrics
def compute_metrics(results_df: pd.DataFrame) -> Dict:
    """
    Compute aggregated metrics from evaluation results.

    Rows whose ``error`` column is non-null are excluded from the averages
    but still counted in ``sample_count`` / ``error_count``.

    Args:
        results_df: Frame with ``quality_score``, ``format_adherence``,
            ``failure`` and ``error`` columns.

    Returns:
        Dict with the same keys in every case:
        - avg_quality: Average quality score (0-5), or None without valid rows
        - format_adherence_pct: Percentage with format_adherence=1, or None
        - failure_rate_pct: Percentage with failure=1, or None
        - sample_count: Total number of samples
        - error_count: Number of samples with errors
        - valid_count: Number of samples without errors
    """
    # Filter out rows with errors
    valid_df = results_df[results_df["error"].isna()]
    total_samples = len(results_df)
    valid_count = len(valid_df)
    error_count = total_samples - valid_count

    if valid_count == 0:
        logger.warning("No valid results to compute metrics!")
        return {
            "avg_quality": None,
            "format_adherence_pct": None,
            "failure_rate_pct": None,
            "sample_count": total_samples,
            "error_count": error_count,
            # Always present so consumers need no special case for this branch.
            "valid_count": 0,
        }

    # float() yields plain Python numbers regardless of the numpy scalar type
    # pandas hands back, which keeps the dict straightforward to serialize.
    return {
        "avg_quality": float(valid_df["quality_score"].mean()),
        "format_adherence_pct": float(valid_df["format_adherence"].sum() / valid_count * 100),
        "failure_rate_pct": float(valid_df["failure"].sum() / valid_count * 100),
        "sample_count": total_samples,
        "error_count": error_count,
        "valid_count": valid_count,
    }

In [None]:
# Cell 5.2: Export Results and Compute Per-Configuration Metrics
# Maps csv_name -> metrics dict produced by compute_metrics.
summary_metrics = {}

for csv_name in evaluation_results:
    results, results_df = evaluation_results[csv_name]

    # Tag every row with the configuration it came from (modifies the stored frame).
    results_df["source"] = csv_name

    # Aggregate first, then write the detailed rows to disk.
    metrics = compute_metrics(results_df)
    summary_metrics[csv_name] = metrics

    output_file = OUTPUTS_DIR / f"eval_{csv_name}_results.csv"
    results_df.to_csv(output_file, index=False)
    logger.info(f"Exported: {output_file}")

In [None]:
# Cell 5.3: Display Per-Configuration Summary Metrics
# (column label, metrics key, fallback when the key is absent), in display order.
summary_columns = (
    ("Avg Quality (0-5)", "avg_quality", None),
    ("Format Adherence (%)", "format_adherence_pct", None),
    ("Failure Rate (%)", "failure_rate_pct", None),
    ("Valid Samples", "valid_count", 0),
    ("Total Samples", "sample_count", 0),
    ("Errors", "error_count", 0),
)

# One row per configuration.
summary_df = pd.DataFrame([
    {
        "Configuration": config_name,
        **{label: stats.get(key, fallback) for label, key, fallback in summary_columns},
    }
    for config_name, stats in summary_metrics.items()
])

# Export summary
summary_file = OUTPUTS_DIR / "eval_summary.csv"
summary_df.to_csv(summary_file, index=False)
logger.info(f"Exported: {summary_file}")

# Same metrics again as JSON, keyed by configuration name.
summary_json_file = OUTPUTS_DIR / "eval_summary.json"
with summary_json_file.open('w') as f:
    json.dump(summary_metrics, f, indent=2)
logger.info(f"Exported: {summary_json_file}")

In [None]:
# Cell 5.4: Display Summary Results
print("\n" + "="*80)
print("CONFIGURATION COMPARISON".center(80))
print("="*80 + "\n")

# Format and display summary DataFrame
display_df = summary_df.copy()
for col in ["Avg Quality (0-5)", "Format Adherence (%)", "Failure Rate (%)"]:
    # pd.isna covers both None and NaN: pandas stores a missing metric as NaN as
    # soon as the column also holds floats, so an `is None` test would print "nan".
    display_df[col] = display_df[col].apply(lambda x: "ERROR" if pd.isna(x) else f"{x:.2f}")

print(display_df.to_string(index=False))

print("\n" + "="*80)
print("CONFIGURATION INSIGHTS:")
print("="*80)

for csv_name, metrics in summary_metrics.items():
    print(f"\n{csv_name}:")
    avg_quality = metrics.get("avg_quality")
    if pd.isna(avg_quality):
        # Missing (None) or undefined (NaN) average: nothing meaningful to report.
        print("  ⚠ No valid results")
        continue

    print(f"  ✓ Avg Quality: {avg_quality:.2f}/5.0")
    print(f"  ✓ Format Adherence: {metrics['format_adherence_pct']:.1f}%")
    print(f"  ✓ Failure Rate: {metrics['failure_rate_pct']:.1f}%")
    print(f"  ✓ Valid Samples: {metrics.get('valid_count', 0)}/{metrics['sample_count']}")

    if metrics.get("error_count", 0) > 0:
        print(f"  ⚠ Errors: {metrics['error_count']}")

print("\n" + "="*80)
print(f"Results saved to: {OUTPUTS_DIR}")
print("="*80 + "\n")

In [None]:
# Cell 5.5: Plot Quality Score Distribution for Each Configuration
# Fixed bin edges centred on the integer scores 0..5, so each bar sits on its
# tick instead of depending on the range of scores that happen to be present.
score_bin_edges = [edge - 0.5 for edge in range(7)]

for csv_name, (results, results_df) in evaluation_results.items():
    # Filter out errors for quality scores
    valid_scores = results_df[results_df["error"].isna()]["quality_score"].dropna()

    if valid_scores.empty:
        # Nothing to draw; a plot of zero samples would only show "nan" statistics.
        logger.warning(f"No valid quality scores to plot for {csv_name}")
        continue

    fig, ax = plt.subplots(figsize=(10, 6))

    # Histogram of the scores
    valid_scores.hist(bins=score_bin_edges, ax=ax, edgecolor='black', alpha=0.7, color='steelblue')

    # Add KDE line only when the scores actually vary: a kernel density estimate
    # of constant data has a singular covariance and raises.
    kde_drawn = valid_scores.nunique() > 1
    if kde_drawn:
        valid_scores.plot.kde(ax=ax, color='red', linewidth=2, label='KDE')

    ax.set_title(f"Quality Score Distribution: {csv_name}", fontsize=14, fontweight='bold')
    ax.set_xlabel("Quality Score (0-5)", fontsize=12)
    ax.set_ylabel("Frequency", fontsize=12)
    ax.set_xticks(range(0, 6))
    ax.grid(alpha=0.3)
    if kde_drawn:
        # The KDE line is the only labelled artist; skip the legend without it.
        ax.legend()

    # Add stats text (std is NaN for a single sample)
    stats_text = f"μ={valid_scores.mean():.2f}, σ={valid_scores.std():.2f}, n={len(valid_scores)}"
    ax.text(0.98, 0.97, stats_text, transform=ax.transAxes,
            fontsize=11, verticalalignment='top', horizontalalignment='right',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()

    # Save individual plot per configuration
    plot_file = OUTPUTS_DIR / f"quality_dist_{csv_name}.png"
    plt.savefig(plot_file, dpi=150, bbox_inches='tight')
    logger.info(f"Exported plot: {plot_file}")
    plt.show()
    plt.close(fig)

## Inspect Individual Samples

In [None]:
# Cell 5.6: Inspect Individual Samples
# Function to display a sample with its evaluation
def inspect_sample(csv_name: str, row_idx: int):
    """Print one sample together with its judge evaluation.

    Args:
        csv_name: Key of the configuration in ``evaluation_results``.
        row_idx: Positional row index. Negative values count from the end,
            as with ``DataFrame.iloc``.

    Prints a warning and returns without raising when the configuration or
    the row does not exist.
    """
    if csv_name not in evaluation_results:
        print(f"⚠ Dataset '{csv_name}' not found")
        return

    _, results_df = evaluation_results[csv_name]

    # Accept the same index range iloc does, so an out-of-range negative index
    # is reported here instead of raising IndexError below.
    row_count = len(results_df)
    if not -row_count <= row_idx < row_count:
        print(f"⚠ Row {row_idx} not found (max: {row_count-1})")
        return

    row = results_df.iloc[row_idx]

    def _yes_no(flag):
        # A missing flag (failed evaluation) is NaN, which is truthy; report it
        # as unknown rather than "Yes".
        if pd.isna(flag):
            return "N/A"
        return "Yes" if flag else "No"

    print("\n" + "="*80)
    print(f"SAMPLE: {csv_name}[{row_idx}]".center(80))
    print("="*80)

    print(f"\n📝 PROMPT:\n{row['prompt']}\n")
    print(f"\n📄 MODEL OUTPUT:\n{row['text']}\n")

    print("─" * 80)
    print("🏆 JUDGE EVALUATION:")
    print("─" * 80)
    print(f"Quality Score:     {row['quality_score']}/5")
    print(f"Format Adherence:  {row['format_adherence']} ({_yes_no(row['format_adherence'])})")
    print(f"Failure (Repeat):  {row['failure']} ({_yes_no(row['failure'])})")
    print(f"Judge Notes:       {row['notes']}")

    if not pd.isna(row['error']):
        print(f"\n⚠ Evaluation Error: {row['error']}")

    print("\n" + "="*80 + "\n")

# Example: Inspect first sample of each configuration
print("\n📊 SAMPLE INSPECTION (First row of each configuration):\n")
for csv_name in evaluation_results:
    inspect_sample(csv_name, 0)