# Import Libraries and Files for Analysis

In [None]:
%pip install -q langchain langchain-openai

In [None]:
from pathlib import Path
import json, os, glob
import random
import time

# from google.colab import drive
# from google.colab import userdata
# drive.mount('/content/drive')

from collections import defaultdict
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

from pydantic import BaseModel, Field
from typing import Literal, List, Dict, Any, Optional
from enum import Enum
from pprint import pprint

from langchain.agents import create_agent
from langchain.agents.structured_output import ToolStrategy
from langchain_openai import ChatOpenAI
from langchain_core.messages import BaseMessage

import os
from dotenv import load_dotenv
load_dotenv()

from utils.create_tests_metadata import get_or_create_tests_metadata
from utils.clean_server_errors import get_or_create_cleaned_results

# ============================================
# Robust OpenRouter Wrapper with Retry Logic
# ============================================
class RobustChatOpenAI(ChatOpenAI):
    """
    ChatOpenAI subclass that retries failed OpenRouter requests with exponential backoff.

    Retrying happens in exactly one place, `_generate`. The public `invoke` and
    `generate` entry points are inherited unchanged and reach `_generate`
    internally, so they are covered by the same retry loop. Wrapping every
    layer separately would nest the loops and multiply the number of requests
    (up to max_retries ** 3) and the time spent sleeping.

    Attributes:
        max_retries: Total number of attempts per request (None falls back to 3).
            Values below 1 still result in a single attempt.
        base_delay: Base of the backoff in seconds; attempt k (0-based) waits
            base_delay * 2**k plus up to one second of random jitter.
        retryable_status_codes: HTTP status codes treated as retryable.
    """

    # NOTE(review): ChatOpenAI presumably forwards `max_retries` to the OpenAI SDK
    # client too, which has its own retry loop - confirm the combined budget is intended.
    max_retries: int | None = 3
    base_delay: float = 2.0
    # NOTE(review): 400 is normally a permanent client error; it is kept here as in the
    # original configuration - confirm it is really meant to be retried.
    retryable_status_codes: tuple = (400, 429, 500, 502, 503, 504)

    def _should_retry(self, exception: Exception) -> bool:
        """Return True if `exception` is a connection/timeout error or a retryable HTTP status."""
        import httpx
        from openai import APIStatusError, APIConnectionError, APITimeoutError

        # OpenAI SDK exceptions (used by langchain_openai)
        if isinstance(exception, (APIConnectionError, APITimeoutError)):
            return True
        if isinstance(exception, APIStatusError):
            return exception.status_code in self.retryable_status_codes

        # httpx exceptions (fallback)
        if isinstance(exception, (httpx.ConnectError, httpx.ReadError, httpx.RemoteProtocolError)):
            return True
        if isinstance(exception, httpx.HTTPStatusError):
            return exception.response.status_code in self.retryable_status_codes

        return False

    def _retry_with_backoff(self, func, *args, **kwargs):
        """
        Call `func(*args, **kwargs)`, retrying retryable failures with exponential backoff.

        Returns:
            Whatever `func` returns on the first successful attempt.

        Raises:
            The exception of the last attempt, or immediately any exception that
            `_should_retry` rejects.
        """
        configured = self.max_retries if self.max_retries is not None else 3
        # Always try at least once: with zero attempts the call would never be
        # made and there would be no exception to re-raise.
        n = max(1, configured)

        for attempt in range(n):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                out_of_attempts = attempt >= n - 1
                if out_of_attempts or not self._should_retry(e):
                    raise

                delay = self.base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(f"[RETRY] OpenRouter request failed (attempt {attempt + 1}/{n}): {e}. Retrying in {delay:.1f}s...")
                time.sleep(delay)

    def _generate(self, messages: List[BaseMessage], stop: Optional[List[str]] = None, **kwargs):
        """Run the parent `_generate` (the actual API request) under the retry loop."""
        return self._retry_with_backoff(super()._generate, messages, stop, **kwargs)


"""
Current format of evaluation results (each item in the results array):
{
    # Test identification
    "test_id": "e4494bce-7101-5ec5-b757-f90f57c53690",  # UUID, constant across runs
    "test_name": "Update channel topic",                 # Human-readable name
    "test_suite_name": "Slack Bench v2",                 # Test suite name
    "service": "slack",                                  # Service: slack, box, calendar, linear
    
    # Run identification
    "run_id": "5003bf1b-72b8-4f6d-8708-430d13d01b11",   # Unique run UUID (key was formerly "runId")
    "run_index": 0,                                      # Which run (0, 1, 2...) for runs_per_test > 1
    "model": "moonshotai/kimi-k2-0905",                  # Model identifier
    "timestamp": "2026-02-02T16:22:44.060562",          # ISO timestamp
    
    # Task
    "prompt": "Change the #general channel topic to...", # Task prompt
    "include_api_docs": false,                           # Whether API docs were in system prompt
    
    # Results
    "status": "passed",                                  # "passed", "failed", "timeout", "error"
    "passed": true,                                      # Boolean pass/fail
    "score": 100.0,                                      # Score 0-100
    "time": 15.68,                                       # Execution time in seconds
    "failures": [],                                      # List of failure messages (if any)
    
    # Execution trace
    "trace": [...]                                       # Full execution trace (tool calls, responses, errors)
}
"""

'\nCurrent format of evaluation results (each item in the results array):\n{\n    # Test identification\n    "test_id": "e4494bce-7101-5ec5-b757-f90f57c53690",  # UUID, constant across runs\n    "test_name": "Update channel topic",                 # Human-readable name\n    "test_suite_name": "Slack Bench v2",                 # Test suite name\n    "service": "slack",                                  # Service: slack, box, calendar, linear\n    \n    # Run identification\n    "runId": "5003bf1b-72b8-4f6d-8708-430d13d01b11",    # Unique run UUID\n    "run_index": 0,                                      # Which run (0, 1, 2...) for runs_per_test > 1\n    "model": "moonshotai/kimi-k2-0905",                  # Model identifier\n    "timestamp": "2026-02-02T16:22:44.060562",          # ISO timestamp\n    \n    # Task\n    "prompt": "Change the #general channel topic to...", # Task prompt\n    "include_api_docs": false,                           # Whether API docs were in system prompt\n    

# Import all test suites and runs

In [None]:
# Load the merged results (minus server-error runs) together with the tests metadata

MERGED_RESULTS_FILE = "preprocessed_merged_results.json"

# Cleaned results: the helper removes runs with server errors
cleaned = get_or_create_cleaned_results(MERGED_RESULTS_FILE)
runs, cleaned_filepath = cleaned
print(f"Working with {len(runs)} cleaned runs")

# Tests metadata: maps runtime_test_id -> test info
tests_metadata = get_or_create_tests_metadata(
    test_suites_folder="test_suites",
    output_folder="tests_metadata",
    merged_results_path=MERGED_RESULTS_FILE,
)

print(f"Loaded metadata for {len(tests_metadata)} tests")

Loading existing cleaned results from: cleaned_preprocessed_merged_results_20260207_165600.json
Loaded 13516 cleaned runs
Working with 13516 cleaned runs
Loaded existing tests metadata with 224 tests
Loaded metadata for 224 tests


In [4]:
def sample_runs_by_model_test_stratified(runs, seed=42):
    """
    Draw at most two runs for every (test_id, model) pair, one per documentation level.

    Documentation levels:
    - "none":     include_api_docs is falsy or absent
    - "relevant": include_api_docs is truthy (and include_all_api_docs is not)

    Runs flagged with include_all_api_docs are left out entirely and only counted.
    The module-level `random` generator is re-seeded with `seed` on every call, so
    the same input and seed always give the same sample.

    Args:
        runs: List of run dictionaries (each needs "test_id" and "model")
        seed: Random seed for reproducibility

    Returns:
        Tuple of (sampled_runs, stats_dict); stats_dict holds the counters
        skipped_all_docs, total_groups, missing_none, missing_relevant, sampled_runs.
    """
    random.seed(seed)

    doc_levels = ("none", "relevant")

    # (test_id, model) -> {"none": [...], "relevant": [...]}
    buckets = defaultdict(lambda: {level: [] for level in doc_levels})
    n_all_docs = 0

    for candidate in runs:
        if candidate.get("include_all_api_docs"):
            # Full-documentation runs never take part in the sample
            n_all_docs += 1
            continue

        level = "relevant" if candidate.get("include_api_docs") else "none"
        buckets[(candidate["test_id"], candidate["model"])][level].append(candidate)

    picked = []
    # Per level, the (test_id, model) pairs that had nothing to draw from
    absent = {level: [] for level in doc_levels}

    for pair, by_level in buckets.items():
        for level in doc_levels:
            pool = by_level[level]
            if not pool:
                absent[level].append(pair)
                continue
            picked.append(random.choice(pool))

    summary = {
        "skipped_all_docs": n_all_docs,
        "total_groups": len(buckets),
        "missing_none": len(absent["none"]),
        "missing_relevant": len(absent["relevant"]),
        "sampled_runs": len(picked),
    }

    return picked, summary


# Draw the sample and report its coverage
sampled_runs, stats = sample_runs_by_model_test_stratified(runs)
print(f"Original runs: {len(runs)}")
for label, stat_key in (
    ("Skipped (all_docs)", "skipped_all_docs"),
    ("Total (test_id, model) groups", "total_groups"),
    ("Missing 'none' doc runs", "missing_none"),
    ("Missing 'relevant' doc runs", "missing_relevant"),
    ("Sampled runs", "sampled_runs"),
):
    print(f"{label}: {stats[stat_key]}")

# Verification: every model should have as many "none" runs as "relevant" runs
banner = "=" * 60
print(f"\n{banner}")
print("VERIFICATION: Counts per model per doc level")
print(banner)

doc_by_model = defaultdict(lambda: {"none": 0, "relevant": 0})
for run in sampled_runs:
    doc_level = "relevant" if run.get("include_api_docs") else "none"
    doc_by_model[run["model"]][doc_level] += 1

all_balanced = True
for model in sorted(doc_by_model):
    counts = doc_by_model[model]
    is_even = counts["none"] == counts["relevant"]
    balanced = "OK" if is_even else "IMBALANCED"
    all_balanced = all_balanced and is_even
    print(f"  {model}: none={counts['none']}, relevant={counts['relevant']} [{balanced}]")

print(f"\nAll models balanced: {all_balanced}")


Original runs: 13516
Skipped (all_docs): 6033
Total (test_id, model) groups: 2004
Missing 'none' doc runs: 16
Missing 'relevant' doc runs: 29
Sampled runs: 3963

VERIFICATION: Counts per model per doc level
  anthropic/claude-haiku-4.5: none=222, relevant=222 [OK]
  deepseek/deepseek-v3.2: none=220, relevant=220 [OK]
  google/gemini-3-flash-preview: none=222, relevant=222 [OK]
  meta-llama/llama-4-scout: none=217, relevant=208 [IMBALANCED]
  mistralai/devstral-2512: none=222, relevant=222 [OK]
  moonshotai/kimi-k2-0905: none=221, relevant=218 [IMBALANCED]
  openai/gpt-oss-120b: none=220, relevant=223 [IMBALANCED]
  qwen/qwen3-vl-235b-a22b-instruct: none=222, relevant=221 [IMBALANCED]
  x-ai/grok-4.1-fast: none=222, relevant=219 [IMBALANCED]

All models balanced: False


In [5]:
# Spot check: number of sampled runs that belong to one specific test
print(sum(1 for run in sampled_runs if run["test_id"] == "f1e306ca-d89a-5d70-bb57-03437eec4ea8"))

18


# Format all runs

In [6]:
def _meaningful_stderr(observation):
    """Return the observation's stderr, or "" when it is empty or just curl's progress meter."""
    stderr = observation.get("stderr") or ""
    if stderr.strip().startswith("% Total"):
        return ""
    return stderr


def _format_trace_step(step):
    """Render one trace step (reasoning, action, command output) as narrative text."""
    iteration_num = step["iteration"]
    observation = step.get("observation") or {}
    stdout = observation.get("stdout")
    stderr = _meaningful_stderr(observation)

    if "thinking" not in step and "action" not in step and not stdout and not stderr:
        return f"Nothing has happened during iteration # {iteration_num}.\n\n"

    lines = [f"This is what happened during iteration #{iteration_num}:\n"]
    if "thinking" in step:
        lines.append(f"\t- This was the agent's reasoning during iteration #{iteration_num}: {step['thinking']}\n")
    if "action" in step:
        lines.append(f"\t- This was the agent's action(s) during iteration #{iteration_num}: {step['action']}\n")

    # Report output whenever the step carries an observation: the printed text
    # comes from `observation`, so its presence (not an unrelated key) decides.
    if observation:
        lines.append(f"\t- This resulted in this stdout output: {'' if stdout is None else stdout}\n")
        if stderr:
            lines.append(f"\t- This also produced stderr output: {stderr}\n")

    lines.append("\n")
    return "".join(lines)


def _format_final_iteration(final_iteration):
    """Render the closing iteration (reasoning and summary) as narrative text."""
    lines = []
    if "iteration" in final_iteration:
        lines.append(f"Iteration #{final_iteration['iteration']} was the final iteration. ")

    if "thinking" not in final_iteration and "summary" not in final_iteration:
        lines.append("Nothing happened during the final iteration.\n\n")
        return "".join(lines)

    lines.append("This is what happened during the final iteration:\n")
    if "thinking" in final_iteration:
        lines.append(f"\t- The agent produced such reasoning: {final_iteration['thinking']}\n")
    if "summary" in final_iteration:
        lines.append(f"\t- The reasoning was followed by this summary: {final_iteration['summary']}\n\n")
    return "".join(lines)


def format_single_run(run):
    """
    Format a single run into a structured dict for qualitative analysis.

    Args:
        run: Raw run dict. Reads "status", "prompt", "trace" (with "steps",
            "iterations" and optionally "final"), "base_score",
            "include_api_docs", and optionally "run_id", "test_id",
            "include_all_api_docs".

    Returns:
        dict with structure:
        {
            "run_id": str,
            "runtime_test_id": str,  # For mapping to tests_metadata
            "score": int,            # base_score (0-100), NOT weighted score
            "iterations": int,
            "include_api_docs": bool,
            "include_all_api_docs": bool,
            "formatted_trace": str  # Human-readable trace narrative
        }

        Or None if run has status "error".

    Raises:
        KeyError: if a required key listed above is missing.
    """
    status = run["status"]
    if status == "error":
        return None

    prompt = run["prompt"]
    trace = run["trace"]

    if status == "passed":
        status_update = "The agent was able to complete the task SUCCESSFULLY."
    elif status == "failed":
        status_update = "The agent FAILED to complete the task."
    else:
        # Any other status is described as running out of time
        status_update = "The agent FAILED to complete the task. The agent was not able to complete it within the designated time interval, which led to an automatic failure."

    # Collect the pieces and join once instead of growing a string in the loop
    parts = [f'The agent was asked to complete this task: "{prompt}"\n\n', status_update, "\n"]
    parts.extend(_format_trace_step(step) for step in trace["steps"])

    final_iteration = trace.get("final")
    if final_iteration:
        parts.append(_format_final_iteration(final_iteration))

    return {
        "run_id": run.get("run_id"),
        "runtime_test_id": run.get("test_id"),
        "score": run["base_score"],  # base_score (0-100), not the weighted score
        "iterations": trace["iterations"],
        "include_api_docs": run["include_api_docs"],
        "include_all_api_docs": run.get("include_all_api_docs", False),
        "formatted_trace": "".join(parts),
    }

In [7]:
# Format every sampled run (dropping the ones the formatter rejects) and persist the result
formatted_runs = [
    formatted
    for formatted in map(format_single_run, sampled_runs)
    if formatted is not None
]

# Output folder lives next to the notebook's working directory
output_folder = os.path.join(os.getcwd(), "formatted_runs")
os.makedirs(output_folder, exist_ok=True)

# A timestamped file name keeps earlier exports from being overwritten
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filepath = os.path.join(output_folder, f"all_runs_formatted_{timestamp}.json")

with open(filepath, 'w') as out_file:
    json.dump(formatted_runs, out_file, indent=2)

print(f"Saved {len(formatted_runs)} formatted runs to: {filepath}")

Saved 3963 formatted runs to: /Users/azh/agent-diff/local_analysis/formatted_runs/all_runs_formatted_20260208_163423.json


In [8]:
# Spot check: the same test should keep its run count after formatting
print(sum(1 for run in formatted_runs if run["runtime_test_id"] == "f1e306ca-d89a-5d70-bb57-03437eec4ea8"))

18


In [9]:
RunAnalysisSchema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "tool_use_errors": {
            "type": "object",
            "description": "Errors related to how the agent interacts with tools and APIs.",
            "additionalProperties": False,
            "properties": {
                "endpoint_selection": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if there are any incorrect or irrelevant endpoint choices."},
                        "explanation": {"type": "string", "description": "Brief summary of the issue (or why none were found)."},
                        "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                },
                "parameter_misuse": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent uses wrong parameter names, wrong data types, wrong structure (missing required keys, extra nesting), uses a field not accepted by the tool, OR maps data to the wrong field when a more appropriate field exists."},
                        "explanation": {"type": "string", "description": "Brief summary of the issue (or why none were found)."},
                        "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                },
                "format_errors": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent produces unparseable or malformed tool output: invalid JSON, truncation, or mixing natural language into machine-readable payloads."},
                        "explanation": {"type": "string", "description": "Brief summary of the issue (or why none were found)."},
                        "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                },
                "code_errors": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if valid tool calls fail during execution: Bash syntax errors, runtime exceptions (NameError, ImportError), logic bugs, or environment misconceptions."},
                        "explanation": {"type": "string", "description": "Brief summary of the issue (or why none were found)."},
                        "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                }
            },
            "required": ["endpoint_selection", "parameter_misuse", "format_errors", "code_errors"]
        },
        "model_refusal": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "present": {"type": "boolean", "description": "True if agent refuses to act, asks user for info it could retrieve, OR delegates execution back to user by providing recommendations instead of performing actions itself (passive summary instead of active task completion)."},
                "explanation": {"type": "string", "description": "Brief summary of the refusal (or why none were found)."},
                "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
            },
            "required": ["present", "explanation", "example"]
        },
        
        "hallucination_errors": {
            "type": "object",
            "description": "Hallucination errors where the agent fabricates or asserts invented information as truth. Distinct from reasoning errors (logic failures) and assumption errors (guessing without checking).",
            "additionalProperties": False,
            "properties": {
                "parameter_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent asserts invented parameter values (IDs, names, timestamps, URLs, file IDs) as truth, not grounded in trace or user input."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "outcome_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent falsely claims task completion or success despite evidence showing it was not completed."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "state_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent fabricates/invents state that doesn't exist (e.g., 'the file was created' when it wasn't, 'the user exists' without checking). Distinct from state_tracking_error which is forgetting."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "action_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent claims to have performed an action (read file, made API call, executed command) that doesn't appear in the trace. Fabricating execution of steps."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "capability_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent believes a tool/API can do something it cannot, or invents non-existent tool capabilities/endpoints."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "context_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent references information not present in trace, prompt, or API responses, asserting invented context as truth."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "other_hallucination": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if there is a hallucination not covered by other categories."},
                        "explanation": {"type": "string", "description": "Description of the hallucination type (or 'N/A' if none)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                }
            },
            "required": ["parameter_hallucination", "outcome_hallucination", "state_hallucination", "action_hallucination", "capability_hallucination", "context_hallucination", "other_hallucination"]
        },
        "reasoning_errors": {
            "type": "object",
            "description": "Reasoning errors involving logic failures, memory issues, or flawed inference. Distinct from hallucinations (fabricating facts) - reasoning errors are about HOW the agent thinks, not inventing information.",
            "additionalProperties": False,
            "properties": {
                "time_orientation_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent confused past/future events, made date/time calculation errors, had timezone issues, or misunderstood 'now' vs scheduled times."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "state_tracking_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent FORGOT previous actions, failed to update understanding after new information, or repeated already-completed actions (memory failure). Distinct from state_hallucination which is FABRICATING state."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "goal_misalignment_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent solved wrong problem, missed implicit requirements, over/under-interpreted intent, optimized for wrong goal, OR understood an explicit requirement but deprioritized/ignored it (e.g., knew batching was required but chose individual calls anyway)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "causal_reasoning_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent misattributed failure cause, reversed cause/effect, or missed causal steps."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "confirmation_bias": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent ignored contradictory error messages, persisted with failing approach despite clear feedback, or selectively interpreted results."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "logical_fallacy": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent used false dichotomy (only 2 options when more exist), circular reasoning, or non sequitur conclusions (action doesn't follow from evidence)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "assumption_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent GUESSED defaults, user preferences, or API behavior WITHOUT CHECKING. Distinct from hallucination which is asserting invented facts as known truth."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "negation_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent inverted boolean conditions, misunderstood not/except/exclude, or did the opposite of requested."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "scope_generalization_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent over-generalized from specific instructions, was too literal (missing spirit of request), or applied patterns from unrelated contexts."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "dependency_ordering_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent performed actions in wrong SEQUENCE (e.g., tried to use result before fetching it, called API before authentication). About ordering, NOT about missing steps."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "incomplete_execution_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent understood and planned required subtasks but failed to attempt some of them entirely (never tried). About OMISSION of steps, not wrong ordering."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "premature_termination_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent stops execution and concludes the task before completing all required steps, without recognizing that more work remains. Distinct from incomplete_execution (planned but didn't attempt) - this is stopping early without awareness of remaining work."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "quantitative_reasoning_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent made off-by-one errors, unit/scale confusion, incorrect aggregation (sum vs count), OR incorrect sorting/ordering of data (e.g., skipped items, wrong alphabetical order)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "reference_resolution_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent misunderstood it/this/that references, confused similar entities, or lost track of objects."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "instruction_fidelity_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent deviated from explicit instructions by modifying content that should be preserved verbatim (e.g., changing punctuation like em-dash to hyphen, rewording text) OR adding unrequested embellishments/formatting (e.g., numbering, introductions, author attributions, extra metadata) when literal execution was required."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "reasoning_action_mismatch": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent's explicit reasoning/plan contradicts the action it actually executes. The agent 'knows' or states the correct approach but then does something different (e.g., reasons 'should use rich_text_section' but then uses 'text' type)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "other_reasoning_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if there is a reasoning error not covered by other categories."},
                        "explanation": {"type": "string", "description": "Description of the reasoning error type (or 'N/A' if none)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                },
                "infinite_loop_error": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent gets stuck in pathological loop, repeating identical or near-identical reasoning and actions across multiple iterations without making progress or attempting meaningfully different approaches."},
                        "explanation": {"type": "string", "description": "Brief summary of the loop behavior (or why none were found)."},
                        "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                }
            },
            "required": ["time_orientation_error", "state_tracking_error", "goal_misalignment_error", "causal_reasoning_error", "confirmation_bias", "logical_fallacy", "assumption_error", "negation_error", "scope_generalization_error", "dependency_ordering_error", "incomplete_execution_error", "premature_termination_error", "quantitative_reasoning_error", "reference_resolution_error", "instruction_fidelity_error", "reasoning_action_mismatch", "other_reasoning_error", "infinite_loop_error"]
        },
        "recovery_strategies": {
            "type": "object",
            "description": "Recovery strategies the agent attempted. Evaluate each category explicitly.",
            "additionalProperties": False,
            "properties": {
                "retry_same": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent retried the exact same action unchanged, hoping for different result."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "retry_modified_params": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent retried with adjusted parameters (different ID, format, value)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "switch_tool": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent switched to a different tool/endpoint to achieve the same goal."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "lookup_correct_value": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent searched or queried to find the correct ID/name/value."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "backtrack": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent returned to an earlier step to gather missing information."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "parse_error_message": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent extracted useful info from error output to inform next action."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "handle_ui_obstacle": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent handled popup, dialog, login wall, or similar UI blocker."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "change_strategy": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent abandoned current approach entirely and tried a different method."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "break_into_steps": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent decomposed a complex action into smaller sequential steps."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "verify_prerequisites": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent checked if required conditions were met before retrying."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "skip_and_continue": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent moved past a blocking item to complete other parts of task."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "wait_and_retry": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent added delay for rate limits or async operations."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "use_fallback": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent used a secondary/backup method when primary failed."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                },
                "other_recovery_strategy": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent used a recovery strategy not covered by other categories."},
                        "explanation": {"type": "string", "description": "Description of the recovery strategy (or 'N/A' if none)."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "explanation", "example"]
                },
                "no_recovery_attempted": {
                    "type": "object",
                    "additionalProperties": False,
                    "properties": {
                        "present": {"type": "boolean", "description": "True if agent gave up immediately or got stuck in a loop without any recovery attempt."},
                        "example": {"type": "string", "description": "Concrete example from trace (or 'N/A' if none)."}
                    },
                    "required": ["present", "example"]
                }
            },
            "required": ["retry_same", "retry_modified_params", "switch_tool", "lookup_correct_value", "backtrack", "parse_error_message", "handle_ui_obstacle", "change_strategy", "break_into_steps", "verify_prerequisites", "skip_and_continue", "wait_and_retry", "use_fallback", "other_recovery_strategy", "no_recovery_attempted"]
        },
        "other_error": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "present": {"type": "boolean", "description": "True if there is an error that doesn't fit other categories."},
                "explanation": {"type": "string", "description": "Brief summary of the issue, including a proposed subcategory for this error type."},
                "example": {"type": "string", "description": "One concrete example from the trace (or 'N/A' if none)."}
            },
            "required": ["present", "explanation", "example"]
        },
        "qualitative_summary": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "planning_score": {
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 5,
                    "description": "Planning quality (0-5): action sequencing, adaptation to obstacles, and efficiency of approach."
                },
                "planning_explanation": {
                    "type": "string",
                    "description": "Brief justification for the planning score, including any efficiency issues."
                },
                "reasoning_score": {
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 5,
                    "description": "Reasoning quality (0-5): correctness of inferences, use of context, and sound logic in decision-making."
                },
                "reasoning_explanation": {
                    "type": "string",
                    "description": "Brief justification for the reasoning score, noting any flawed inferences or logic errors."
                },
                "tool_use_score": {
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 5,
                    "description": "API/tool handling quality (0-5): endpoint selection, parameter formatting, error handling, and response parsing."
                },
                "tool_use_explanation": {
                    "type": "string",
                    "description": "Brief justification for the tool use score, noting any API misuse or parameter errors."
                },
                "recovery_score": {
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 5,
                    "description": "Recovery ability (0-5): failure detection, root cause diagnosis, and applying corrective strategies."
                },
                "recovery_explanation": {
                    "type": "string",
                    "description": "Brief justification for the recovery score, noting recovery strategies used or missed opportunities."
                },
                "hallucination_score": {
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 5,
                    "description": "Hallucination resistance (0-5): degree to which agent avoids fabricating information, inventing facts, or asserting false claims."
                },
                "hallucination_explanation": {
                    "type": "string",
                    "description": "Brief justification for the hallucination score, noting any fabricated information or false claims."
                },
                "overall_description": {
                    "type": "string",
                    "description": "2-3 sentence high-level narrative of what went wrong and why, suitable for a paper's qualitative analysis section"
                },
                "key_insight": {
                    "type": "string",
                    "description": "The single most important takeaway from analyzing this run"
                },
                "model_behavior_pattern": {
                    "type": "string",
                    "description": "What does this run reveal about how the model approaches this type of task?"
                },
                "implications_for_reliability": {
                    "type": "string",
                    "description": "What does this failure reveal about model reliability/robustness?"
                },
                "worthy_example": {
                    "type": "boolean",
                    "description": "Is this run interesting enough to feature in the qualitative analysis?"
                },
                "why_worthy_example": {
                    "type": "string",
                    "description": "If worthy_example is true, explain why this example is noteworthy (or 'N/A' if not worthy)"
                }
            },
            "required": ["planning_score", "planning_explanation", "reasoning_score", "reasoning_explanation", "tool_use_score", "tool_use_explanation", "recovery_score", "recovery_explanation", "hallucination_score", "hallucination_explanation", "overall_description", "key_insight", "model_behavior_pattern", "implications_for_reliability", "worthy_example", "why_worthy_example"]
        }
    },
    "required": [
        "tool_use_errors",
        "model_refusal",
        "hallucination_errors",
        "reasoning_errors",
        "recovery_strategies",
        "other_error",
        "qualitative_summary"
    ]
}


def construct_run_analysis_prompt_elements(cur_run_score, other_run_type, other_run_score):
    
    SYSTEM_MSG = f"You are a helpful assistant. You are required to analyze RUN_TO_ANALYZE and compare it to {other_run_type} as a reference.\n"
    
    HUMAN_MSG_CUR_RUN = "Here is the RUN_TO_ANALYZE:\n"
    
    if other_run_type == "ONE_OF_THE_BEST_RUNS":
        HUMAN_MSG_OTHER_RUN = f"NOTE: RUN_TO_ANALYZE failed in the task with the score of {cur_run_score}/100.0. Unfortunately, no agent was able to finish the required task, which indicates that the task is likely very hard. However, here is ONE_OF_THE_BEST_RUNS (use it only as a reference of what an alternative approach could look like; NOTE: ONE_OF_THE_BEST_RUNS also failed with the score of {other_run_score}/100.0):\n"
    
    elif other_run_type == "ANOTHER_SUCCESSFUL_RUN":
        HUMAN_MSG_OTHER_RUN = f"NOTE: RUN_TO_ANALYZE succeeded in the task with the score of 100.0/100.0. However, here is ANOTHER_ONE_OF_THE_BEST_RUNS which finished with the score of {other_run_score}/100.0 (use it only as a reference of what an alternative approach could look like):\n"
    
    else: # other_run_type == "BEST_RUN":
        HUMAN_MSG_OTHER_RUN = f"NOTE: RUN_TO_ANALYZE failed in the task with the score of {cur_run_score}/100.0. However, here is ONE_OF_THE_BEST_RUNS, which succeeded with the score of {other_run_score}/100.0 (use as reference for what a correct approach looks like):\n"

    RUN_ANALYSIS_PROMPT = f"""IMPORTANT EVALUATION CONTEXT:
The agents being evaluated were given these instructions in their system prompt:
- "Use execute_bash to interact with [Service] API at [endpoint]. Complete the task using the tools provided."
- "Authentication is handled automatically via proxy. Leave a placeholder credential where you would add a real token."

Therefore, do NOT flag as errors:
- Agent not explicitly handling authentication (it's automatic via proxy)
- Agent using placeholder credentials or assuming auth works
- Agent proceeding directly to API calls without auth setup

Now analyze RUN_TO_ANALYZE (and use {other_run_type} only as a reference for what a correct approach looks like). Evaluate the following categories in order. For each category, provide the required fields as specified.

    1) Tool Use Errors
    Errors related to how the agent interacts with tools and APIs. Evaluate each subtype explicitly:

    endpoint_selection:
    Determine whether the agent consistently selects correct endpoints to make progress toward the user's goal.
    - present: True if there are any incorrect or irrelevant endpoint choices
    - explanation: Brief summary of the issue (or why none were found)
    - example: One concrete example from the trace (or 'N/A' if none)

    parameter_misuse:
    Determine whether the agent ever calls an API endpoint with incorrectly formatted, incorrectly typed, or semantically misplaced parameters.
    This includes:
    - Wrong parameter names
    - Wrong data types (string vs int vs list vs object)
    - Wrong structure (scalar where an object is required, missing required keys, extra nesting)
    - Using a field not accepted by the tool when the correct accepted field exists
    - Semantic field mapping errors: putting data in the wrong field when a more appropriate field exists (e.g., putting location info like "Pearlwork Desk" in summary field instead of the dedicated location field)
    - present: True if there is any parameter misuse
    - explanation: Brief summary of the issue (or why none were found)
    - example: One concrete example from the trace, e.g., 'tool expects user_id: "U123" but agent passes user: "sarah"' or 'put location in summary instead of location field' (or 'N/A' if none)

    format_errors:
    Determine whether the agent ever produces tool-related output that is unparseable or malformed in a way that would prevent correct execution or structured parsing.
    This includes invalid JSON (broken braces/quotes, trailing commas, partial JSON), incorrect field nesting that makes the payload unparsable, mixing natural language into a payload that must be machine-readable, truncation that cuts off the payload, or any syntax/format issue that causes the tool call or structured output to fail to parse.
    - present: True if there is any unparseable or malformed payload
    - explanation: Brief summary of the issue (or why none were found)
    - example: One concrete example from the trace (or 'N/A' if none)

    code_errors:
    Determine whether the agent produces valid tool calls (e.g. valid JSON) but the content of the call (the script or command) fails during execution.
    This includes Bash syntax errors, runtime exceptions (NameError, ImportError), logic bugs in the script, or environment misconceptions (e.g. assuming variables or files persist between isolated tool calls when they do not).
    - present: True if code execution failed
    - explanation: Brief summary of the issue (or why none were found)
    - example: One concrete example from the trace, e.g., "NameError: 'headers' is not defined" (or 'N/A' if none)

    2) Model Refusal
    Determine whether the agent refuses to perform the task, asks the user for information it could retrieve itself, OR delegates execution back to user instead of acting.
    This includes:
    - Explicitly refusing to perform the task
    - Asking user for IDs, tokens, or file contents that the agent could find itself
    - Passive delegation: gathering information but providing recommendations for user to execute instead of performing actions itself (e.g., "You should reach out to @user" instead of actually sending the message)
    - present: True if the agent declines, offloads work, or delegates execution to the user
    - explanation: Brief summary of the refusal/delegation (or why none were found)
    - example: One concrete example from the trace, e.g., "'Please provide the user ID' when agent has a search tool" or "Provided summary with 'You can now message @user' but made no tool calls to do so" (or 'N/A' if none)

    3) Hallucination Errors
    Hallucinations are when the agent FABRICATES or ASSERTS invented information as truth.
    This is distinct from reasoning errors (logic/inference failures) and assumption errors (guessing without checking).
    
    For EACH of the following hallucination types, explicitly evaluate whether it occurred:
    You MUST provide a judgment (present: true/false) and example for EVERY category:

    - parameter_hallucination: Agent ASSERTS invented parameter values (IDs, names, timestamps, URLs, file IDs) as truth, when these values are not grounded in the trace or user input. Example: using channel ID "C99999" that never appeared anywhere.
    - outcome_hallucination: Agent falsely CLAIMS task completion or success despite evidence showing the task was not actually completed.
    - state_hallucination: Agent FABRICATES state that doesn't exist (e.g., "the file was created" when it wasn't, "the user exists" without any evidence). Distinct from state_tracking_error which is FORGETTING existing state.
    - action_hallucination: Agent CLAIMS to have performed an action (read a file, made an API call, executed a command) that doesn't appear in the trace. Fabricating that execution occurred when it did not. Example: "Content read: ..." in reasoning but no corresponding API call in trace.
    - capability_hallucination: Agent believes a tool/API can do something it cannot, or invents non-existent tool capabilities/endpoints.
    - context_hallucination: Agent references information not present in trace, prompt, or API responses, asserting invented context as truth.
    - other_hallucination: A hallucination not covered by the categories above (if present, provide explanation).

    For each category, return:
    - present: true/false
    - example: Concrete example from trace (or 'N/A' if none)
    - explanation: (ONLY for other_hallucination) Description of the hallucination type

    4) Reasoning Errors
    Reasoning errors involve logic failures, memory issues, or flawed inference.
    These are about HOW the agent thinks, not about fabricating information (which is hallucination).
    
    IMPORTANT DISTINCTIONS:
    - state_tracking_error = agent FORGETS what happened (memory failure)
    - state_hallucination = agent INVENTS what happened (fabrication) → goes in hallucination_errors
    - assumption_error = agent GUESSES without checking (uncertainty acknowledged implicitly)
    - hallucination = agent ASSERTS invented facts as known truth (false certainty)
    - dependency_ordering_error = agent does steps in wrong SEQUENCE
    - incomplete_execution_error = agent OMITS steps entirely (never attempts them)
    - premature_termination_error = agent STOPS early thinking task is done when it's not

    For EACH of the following reasoning error types, explicitly evaluate whether it occurred:
    You MUST provide a judgment (present: true/false) and example for EVERY category:

    - time_orientation_error: Confusing past vs future events, incorrect date/time calculations, timezone confusion, misunderstanding "now" vs scheduled times
    - state_tracking_error: Agent FORGOT previous actions, failed to update understanding after new information, or repeated already-completed actions (MEMORY failure, not fabrication)
    - goal_misalignment_error: Solving a different problem than asked, missing implicit requirements, over/under-interpreting intent, optimizing for wrong goal, OR understanding an explicit requirement but deprioritizing/ignoring it (e.g., knew batching was required but chose individual calls anyway)
    - causal_reasoning_error: Misattributing why something failed, reversing cause and effect, missing intermediate steps in causal chain
    - confirmation_bias: Ignoring contradictory error messages, persisting with failing approach despite clear feedback, selective interpretation of results
    - logical_fallacy: False dichotomy (only 2 options when more exist), circular reasoning, non sequitur conclusions (action doesn't follow from evidence)
    - assumption_error: Agent GUESSED defaults, user preferences, or API behavior WITHOUT CHECKING first. This is making unverified assumptions, not asserting fabricated facts as truth.
    - negation_error: Inverting boolean conditions, misunderstanding "not"/"except"/"exclude", doing the opposite of what's requested
    - scope_generalization_error: Over-generalizing from specific instructions, being too literal (missing spirit of request), applying patterns from unrelated contexts
    - dependency_ordering_error: Performing actions in wrong SEQUENCE (e.g., tried to use result before fetching it, called API before authentication). About ordering, NOT about missing steps.
    - incomplete_execution_error: Agent understood and planned required subtasks but failed to attempt some of them entirely. About OMISSION of steps (never tried), not wrong ordering. Example: planned to grant access and copy events but only did one.
    - premature_termination_error: Agent stops execution and concludes the task is complete before finishing all required steps, without recognizing that more work remains. Distinct from incomplete_execution (planned but didn't attempt) - this is stopping early without awareness. Example: task requires sending 3 messages but agent stops after sending 1 and says "Done!"
    - quantitative_reasoning_error: Off-by-one errors, unit/scale confusion, incorrect aggregation (sum vs count), OR incorrect sorting/ordering of data (e.g., skipped items when sorting alphabetically, wrong sort order)
    - reference_resolution_error: Misunderstanding what "it"/"this"/"that" refers to, confusing multiple similar entities, losing track of which object is discussed
    - instruction_fidelity_error: Agent deviated from explicit instructions by modifying content that should be preserved verbatim (e.g., changing punctuation like em-dash to hyphen, rewording text) OR adding unrequested embellishments/formatting (e.g., numbering, introductions, author attributions, extra metadata) when literal execution was required.
    - reasoning_action_mismatch: Agent's explicit reasoning/plan contradicts the action it actually executes. The agent "knows" or states the correct approach but then does something different (e.g., reasons "should use rich_text_section" but then uses "text" type).
    - other_reasoning_error: A reasoning error not covered by the categories above (if present, provide explanation)
    - infinite_loop_error: Agent gets stuck in pathological loop, repeating identical or near-identical reasoning and actions across multiple iterations without making progress or attempting meaningfully different approaches

    For each category, return:
    - present: true/false
    - example: Concrete example from trace (or 'N/A' if none)
    - explanation: (ONLY for other_reasoning_error and infinite_loop_error) Description of the error

    5) Recovery Strategies
    For EACH of the following recovery strategy types, explicitly evaluate whether the agent attempted it in RUN_TO_ANALYZE.
    You MUST provide a judgment (present: true/false) and example for EVERY category:

    - retry_same: Retried the exact same action unchanged, hoping for different result
    - retry_modified_params: Retried with adjusted parameters (different ID, format, value)
    - switch_tool: Switched to a different tool/endpoint to achieve the same goal
    - lookup_correct_value: Searched or queried to find the correct ID/name/value
    - backtrack: Returned to an earlier step to gather missing information
    - parse_error_message: Extracted useful info from error output to inform next action
    - handle_ui_obstacle: Handled popup, dialog, login wall, or similar UI blocker
    - change_strategy: Abandoned current approach entirely, tried a different method
    - break_into_steps: Decomposed a complex action into smaller sequential steps
    - verify_prerequisites: Checked if required conditions were met before retrying
    - skip_and_continue: Moved past a blocking item to complete other parts of task
    - wait_and_retry: Added delay for rate limits or async operations
    - use_fallback: Used a secondary/backup method when primary failed
    - other_recovery_strategy: A recovery strategy not covered by the categories above (if present, provide explanation)
    - no_recovery_attempted: Agent gave up immediately or got stuck in a loop without any recovery attempt

    For each category, return:
    - present: true/false
    - example: Concrete example from trace (or 'N/A' if none)
    - explanation: (ONLY for other_recovery_strategy) Description of the recovery strategy

    6) Other Errors
    Determine if there are any other errors not covered by the previous categories (1-4).
    - present: True if there is an error that doesn't fit other categories
    - explanation: Brief summary of the issue, including a proposed subcategory name (e.g., 'Timing Error', 'Resource Limit') (or 'No other errors found' if none)
    - example: One concrete example from the trace (or 'N/A' if none)

    7) Qualitative Summary
    Provide a high-level narrative analysis of this run:
    
    First, evaluate planning quality:
    - planning_score: Integer 0-5 for planning quality (action sequencing, adaptation to obstacles, efficiency)
      Score scale:
      - 5 = Excellent: clear, efficient action sequence; proactively handles obstacles; quickly switches strategy when blocked; uses optimal API patterns (batch calls when available).
      - 4 = Good: mostly correct sequence; handles common obstacles with minor inefficiencies; occasional unnecessary steps or suboptimal API usage.
      - 3 = Mixed: reaches the goal or makes progress but with avoidable detours; slow or inconsistent adaptation to obstacles; noticeable inefficiencies (e.g., N+1 query patterns).
      - 2 = Poor: often incorrect ordering of steps; weak adaptation; repeats failing actions; needs luck or external help to progress.
      - 1 = Very poor: largely incoherent plan; frequently stuck; little to no useful adaptation; abandons after minor friction.
      - 0 = Non-functional: no meaningful plan; stagnates immediately or repeatedly loops until the step budget is exhausted.
    - planning_explanation: Brief justification for the planning score, including any efficiency issues.
    
    Then, evaluate reasoning quality:
    - reasoning_score: Integer 0-5 for reasoning quality (correctness of inferences, use of context, sound logic)
      Score scale:
      - 5 = Excellent: all inferences correct and well-grounded; uses available context effectively; no logical errors; self-corrects when evidence contradicts assumptions.
      - 4 = Good: mostly correct reasoning; minor inference gaps that don't derail the task; good use of context.
      - 3 = Mixed: some correct reasoning but notable errors in logic or inference; may ignore relevant context or draw unsupported conclusions.
      - 2 = Poor: frequent reasoning errors; draws conclusions not supported by evidence; misinterprets API responses or error messages.
      - 1 = Very poor: pervasive logical flaws; assertions contradicted by available evidence; fails to connect cause and effect.
      - 0 = Non-functional: reasoning is incoherent, contradictory, or absent; actions have no logical basis.
    - reasoning_explanation: Brief justification for the reasoning score, noting any flawed inferences or logic errors.
    
    Then, evaluate API/tool handling:
    - tool_use_score: Integer 0-5 for API/tool handling quality (endpoint selection, parameter formatting, error handling, response parsing)
      Score scale:
      - 5 = Excellent: selects correct endpoints consistently; parameters properly formatted with correct types/structure; handles errors gracefully; extracts and uses API response data accurately.
      - 4 = Good: mostly correct endpoint/parameter usage; minor formatting issues that don't cause failures; reasonable error handling.
      - 3 = Mixed: some incorrect endpoints or malformed parameters; may miss required fields or use wrong types; inconsistent error handling.
      - 2 = Poor: frequent parameter errors (wrong names, types, structure); struggles to parse API responses; poor recovery from API errors.
      - 1 = Very poor: pervasive API misuse; hallucinates endpoints or parameters; fails to extract needed data from responses.
      - 0 = Non-functional: cannot successfully interact with APIs; all or most tool calls fail due to fundamental misuse.
    - tool_use_explanation: Brief justification for the tool use score, noting any API misuse or parameter errors.
    
    Then, evaluate recovery ability:
    - recovery_score: Integer 0-5 for recovery ability (failure detection, root cause diagnosis, applying corrective strategies)
      Score scale:
      - 5 = Excellent: quickly detects failures; accurately diagnoses root causes; applies effective corrective strategies (modifies params, switches tools, backtracks); learns from errors.
      - 4 = Good: detects most failures; reasonable diagnosis; tries appropriate recovery strategies with minor delays or inefficiencies.
      - 3 = Mixed: detects failures but slow to diagnose; may try ineffective strategies first; eventually finds working approach but wastes iterations.
      - 2 = Poor: often misses or misinterprets failures; misdiagnoses root causes; applies wrong fixes; may retry same failing action multiple times.
      - 1 = Very poor: rarely recognizes failures; no meaningful recovery attempts; gets stuck in loops or gives up immediately.
      - 0 = Non-functional: completely unable to recover; ignores all error signals; repeats identical failures until timeout.
    - recovery_explanation: Brief justification for the recovery score, noting recovery strategies used or missed opportunities.
    
    Then, evaluate hallucination resistance:
    - hallucination_score: Integer 0-5 for hallucination resistance (5 = no hallucinations, 0 = severe hallucinations)
      Score scale:
      - 5 = None: no fabricated information; all claims grounded in trace, prompt, or API responses; no invented IDs, states, or outcomes.
      - 4 = Minimal: one minor instance of ungrounded assertion that doesn't affect task outcome; quickly self-corrects if contradicted.
      - 3 = Moderate: some fabricated details (e.g., guessed IDs, assumed states) but core reasoning remains sound; hallucinations don't derail task.
      - 2 = Significant: multiple hallucinations affecting task execution; invents parameters, claims false successes, or fabricates API responses.
      - 1 = Severe: pervasive fabrication; asserts non-existent capabilities, claims actions that didn't happen, or invents entire context.
      - 0 = Extreme: nearly all assertions fabricated; completely disconnected from reality of trace; cannot distinguish real from invented.
    - hallucination_explanation: Brief justification for the hallucination score, noting any fabricated information or false claims.
    
    Then provide narrative analysis:
    - overall_description: 2-3 sentence summary of what went wrong and why, suitable for a paper's qualitative analysis section
    - key_insight: The single most important takeaway from analyzing this run, specifically, what went well for RUN_TO_ANALYZE, what went wrong?
    - model_behavior_pattern: What does this run reveal about how the model approaches this type of task?
    - implications_for_reliability: What does this failure reveal about model reliability/robustness?
    - worthy_example: Is this run interesting enough to feature in a qualitative analysis section of a paper? (true/false)
    - why_worthy_example: If worthy, explain why this example is noteworthy. If not worthy, put 'N/A'.

    Return your results in the required structured format."""
    
    return SYSTEM_MSG, HUMAN_MSG_CUR_RUN, HUMAN_MSG_OTHER_RUN, RUN_ANALYSIS_PROMPT

# Core functions


In [10]:
def _run_iterations(run):
    """Return the run's iteration count; a missing value counts as infinitely many."""
    return run.get("iterations", float("inf"))


def _select_reference_runs(all_runs_for_test):
    """
    Pick the (best, runner-up) pair of runs used as comparison baselines.

    Runs are ranked by highest score, then fewest iterations. Among the runs
    that follow the top-ranked one and are still tied with it on both score
    and iterations, the first run flagged ``include_api_docs`` but not
    ``include_all_api_docs`` is promoted to best, and the previously
    top-ranked run becomes the runner-up.

    Expects at least two runs; the flags are expected to be real booleans.
    """
    ranked = sorted(
        all_runs_for_test,
        key=lambda r: (r["score"], -_run_iterations(r)),
        reverse=True,
    )
    top, runner_up = ranked[0], ranked[1]

    for candidate in ranked[1:]:
        # Stop as soon as a candidate is no longer tied with the top run.
        if (candidate["score"] < top["score"]
                or _run_iterations(candidate) > _run_iterations(top)):
            break
        # Same accessor style as the sort key: a run without the flags is
        # simply not preferred instead of raising KeyError.
        if (candidate.get("include_api_docs") is True
                and candidate.get("include_all_api_docs") is False):
            return candidate, top

    return top, runner_up


def _label_other_run(cur_run_score, other_run_score):
    """
    Describe the comparison run relative to the run being analyzed.

    The chain is total, so a label is always produced (the previous
    if/elif/elif form could leave the label unbound or stale for score
    combinations it did not anticipate).
    """
    if cur_run_score == 100:
        return "ANOTHER_SUCCESSFUL_RUN"
    if other_run_score == 100:
        return "BEST_RUN"
    return "ONE_OF_THE_BEST_RUNS"


def analyze_single_test(all_runs_for_test, langchain_model):
    """
    Analyzes all runs for a single test_id.

    Every run is compared against the best run for the test; the best run
    itself is compared against the runner-up.

    Args:
        all_runs_for_test: All runs for this test (across all models) - used for finding best run
                          and analyzing every run against it. Each run is a dict that must
                          provide "run_id", "runtime_test_id" and "score"; "iterations",
                          "formatted_trace", "include_api_docs" and "include_all_api_docs"
                          are optional.
        langchain_model: The LangChain model to use for analysis.

    Returns:
        List of run analysis dicts (one per run, in input order), or None if
        all_runs_for_test is empty or has < 2 runs. Each dict holds "run_id",
        "runtime_test_id" and "compared_against"; "run_analysis" is added only
        when the model call succeeded (failures are printed, not raised).
    """
    if not all_runs_for_test:
        print("WARNING: all_runs_for_test is empty. Returning.")
        return None

    # Check if we have at least 2 runs for comparison
    if len(all_runs_for_test) < 2:
        test_id = all_runs_for_test[0].get("runtime_test_id", "unknown")
        print(f"WARNING: Test {test_id} has only {len(all_runs_for_test)} run(s). Skipping comparison analysis (need at least 2 runs).")
        return None

    first_best_run, second_best_run = _select_reference_runs(all_runs_for_test)
    first_best_run_id = first_best_run["run_id"]

    results = []

    # Analyze each run (comprehensive assessment against best run)
    for run in all_runs_for_test:
        # The best run cannot be compared with itself, so it gets the runner-up.
        other_run = second_best_run if run["run_id"] == first_best_run_id else first_best_run

        cur_run_score = run["score"]
        other_run_score = other_run["score"]
        other_run_type = _label_other_run(cur_run_score, other_run_score)

        SYSTEM_MSG, HUMAN_MSG_CUR_RUN, HUMAN_MSG_OTHER_RUN, RUN_ANALYSIS_PROMPT = construct_run_analysis_prompt_elements(cur_run_score, other_run_type, other_run_score)

        run_analysis = {
            "run_id": run["run_id"],
            "runtime_test_id": run["runtime_test_id"],
            "compared_against": other_run["run_id"]
        }

        try:
            agent = create_agent(
                model=langchain_model,
                response_format=ToolStrategy(RunAnalysisSchema)
            )

            cur_run_trace = run.get("formatted_trace", "")
            other_run_trace = other_run.get("formatted_trace", "")

            system_message = SYSTEM_MSG + RUN_ANALYSIS_PROMPT
            human_message = HUMAN_MSG_CUR_RUN + cur_run_trace + "\n\n" + HUMAN_MSG_OTHER_RUN + other_run_trace

            result = agent.invoke({
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": human_message}
                ]
            })

            run_analysis["run_analysis"] = result["structured_response"]

        except Exception as e:
            # Best effort: one failed run must not abort the remaining runs of
            # the test. The entry is still returned, just without "run_analysis".
            print(f"Error analyzing run {run['run_id']}: {e}")

        results.append(run_analysis)

    return results


def group_runs_by_test(all_runs_formatted):
    """
    Bucket a flat list of runs by their "runtime_test_id".

    Returns a defaultdict mapping each test id to the list of its runs, with
    both the test ids and the runs inside each bucket kept in input order.
    """
    from collections import defaultdict

    runs_by_test_id = defaultdict(list)
    for entry in all_runs_formatted:
        test_id = entry["runtime_test_id"]
        runs_by_test_id[test_id].append(entry)
    return runs_by_test_id


def _write_json_atomically(path, payload):
    """Dump payload as JSON to path via a temp file so path never holds a partial write."""
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        # os.replace swaps the finished file in; a failed dump leaves any
        # previous content of `path` untouched.
        os.replace(tmp_path, path)
    finally:
        # Only still present when the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _load_latest_checkpoint(checkpoints_folder):
    """
    Load the most recent checkpoint_*.json from checkpoints_folder.

    Returns (results, processed_test_ids). Both are empty when there is no
    checkpoint or the newest one cannot be parsed into run-analysis entries.
    """
    existing_checkpoints = sorted(
        glob.glob(os.path.join(checkpoints_folder, "checkpoint_*.json")),
        reverse=True,  # timestamped names: lexicographic order == newest first
    )
    if not existing_checkpoints:
        return [], set()

    latest_checkpoint = existing_checkpoints[0]
    try:
        with open(latest_checkpoint, "r", encoding="utf-8") as f:
            loaded = json.load(f)
        # Validate the whole payload before adopting any of it, so a bad
        # checkpoint can never leave half-trusted results behind.
        processed_test_ids = {r["runtime_test_id"] for r in loaded if r is not None}
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f"Could not load checkpoint {latest_checkpoint}: {e}. Starting fresh.")
        return [], set()

    print(f"Resuming from checkpoint: {latest_checkpoint}")
    print(f"Loaded {len(loaded)} already processed run analyses")
    return loaded, processed_test_ids


def analyze_multiple_tests(langchain_model, all_runs_formatted=None, max_workers=10):
    """
    Analyze all formatted runs, extracting qualitative data in structured format.

    Groups runs by test_id, then analyzes each test in parallel with checkpointing.
    If a previous checkpoint exists in ./checkpoints, its entries are reused and
    only tests whose ids are not in it are analyzed. Progress is written to a new
    timestamped checkpoint after every finished test, and the final list is saved
    to ./qualitative_analysis_results/analysis_results_<timestamp>.json.

    Args:
        langchain_model: The LangChain model to use for analysis.
        all_runs_formatted: List of formatted runs (from format_single_run).
                           If None or empty, attempts to load the newest
                           all_runs_formatted_*.json from the formatted_runs folder.
        max_workers: Number of parallel workers for analysis.

    Returns:
        List of run analysis dicts (flattened across all tests), including any
        entries restored from the checkpoint. An empty list is returned when no
        runs were provided and none could be loaded from disk.
    """

    # Setup checkpoints folder
    checkpoints_folder = os.path.join(os.getcwd(), "checkpoints")
    os.makedirs(checkpoints_folder, exist_ok=True)

    # Create checkpoint file with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_file = os.path.join(checkpoints_folder, f"checkpoint_{timestamp}.json")
    print(f"Using checkpoint file: {checkpoint_file}")

    # Auto-load from formatted_runs folder if not provided
    if not all_runs_formatted:
        output_folder = os.path.join(os.getcwd(), "formatted_runs")

        if not os.path.exists(output_folder):
            print(f"ERROR: No runs provided and formatted_runs folder not found at {output_folder}")
            return []

        pattern = os.path.join(output_folder, "all_runs_formatted_*.json")
        matching_files = sorted(glob.glob(pattern), reverse=True)  # newest first

        if not matching_files:
            print(f"ERROR: No all_runs_formatted_*.json files found in {output_folder}")
            return []

        latest_file = matching_files[0]
        print(f"Loading runs from: {latest_file}")

        with open(latest_file, "r", encoding="utf-8") as f:
            all_runs_formatted = json.load(f)

        print(f"Loaded {len(all_runs_formatted)} formatted runs")

    # Resume from the most recent existing checkpoint, if any
    results, processed_test_ids = _load_latest_checkpoint(checkpoints_folder)

    # Group runs by test_id
    by_test = group_runs_by_test(all_runs_formatted)

    # Filter to unprocessed tests
    test_ids_to_process = [tid for tid in by_test if tid not in processed_test_ids]
    print(f"Processing {len(test_ids_to_process)} new tests (Total: {len(by_test)})")

    if not test_ids_to_process:
        print("All tests already processed.")
        return results

    # Process tests in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_test_id = {
            executor.submit(analyze_single_test, by_test[tid], langchain_model): tid
            for tid in test_ids_to_process
        }

        for future in as_completed(future_to_test_id):
            test_id = future_to_test_id[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Error processing test {test_id}: {e}")
                continue

            if result is not None:
                results.extend(result)  # Flatten: extend instead of append
                print(f"Completed analysis for test {test_id} ({len(result)} runs)")
            else:
                print(f"Skipped test {test_id} (empty or insufficient runs)")

            # Save checkpoint after each test. A failed save is reported on its
            # own so it is not mistaken for a failed analysis.
            try:
                _write_json_atomically(checkpoint_file, results)
            except (OSError, TypeError, ValueError) as e:
                print(f"Could not save checkpoint after test {test_id}: {e}")

    # Save final results with timestamp
    results_folder = os.path.join(os.getcwd(), "qualitative_analysis_results")
    os.makedirs(results_folder, exist_ok=True)

    results_filepath = os.path.join(results_folder, f"analysis_results_{timestamp}.json")

    _write_json_atomically(results_filepath, results)

    print(f"\nSaved {len(results)} run analyses to: {results_filepath}")

    return results

# Run the analysis

In [11]:
# Define the model here using RobustChatOpenAI for retry logic

# google/gemini-3-flash-preview
# x-ai/grok-4.1-fast

# Model used for every run analysis, reached through the OpenRouter endpoint.
# The API key is read from the OPENROUTER_API_KEY environment variable.
langchain_model = RobustChatOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    model="google/gemini-3-flash-preview",
    timeout=120,  # seconds; generous limit for long-running requests
    # Retry settings consumed by RobustChatOpenAI
    base_delay=2.0,
    max_retries=3,
)


In [12]:
# selected_runs = [run for run in formatted_runs if run["runtime_test_id"] == "f1e306ca-d89a-5d70-bb57-03437eec4ea8" or run["runtime_test_id"] == "0b2af335-8327-53fb-ab71-829299520d87" or run["runtime_test_id"] == "10e491a0-bea6-5d05-9fb1-5d4774b34697" or run["runtime_test_id"] == "2cae6822-7219-5357-b252-acd24e660f3b"]

# pprint(selected_runs)

In [13]:
# Passing None (or an empty list) as all_runs_formatted makes the function load
# the newest all_runs_formatted_*.json from the formatted_runs folder instead.
analysis_results = analyze_multiple_tests(
    max_workers=10,
    all_runs_formatted=formatted_runs,
    langchain_model=langchain_model,
)


Using checkpoint file: /Users/azh/agent-diff/local_analysis/checkpoints/checkpoint_20260208_163451.json
Processing 223 new tests (Total: 223)
Completed analysis for test 89b45222-2dee-535e-804e-69d1f44a78fd (18 runs)
Completed analysis for test bf85c95d-b8ef-50cb-8263-6dae94173586 (18 runs)
Completed analysis for test dcba769e-d40c-53c4-a6ae-11283f53ed77 (18 runs)
Completed analysis for test b7b8f64c-6457-5f9c-8943-d4a9e83387f6 (18 runs)
Completed analysis for test e4494bce-7101-5ec5-b757-f90f57c53690 (18 runs)
Completed analysis for test 32ee4d07-7744-59c5-a91f-f9b6cb9b75b8 (18 runs)
Completed analysis for test 2443b5cf-ef57-5201-9399-cba34df4649d (18 runs)
Completed analysis for test 94e0cbdc-f816-57a6-a559-6578fd85f12c (18 runs)
Completed analysis for test 316ccf74-4c28-5e2e-adf0-7b5037a5d236 (18 runs)
Completed analysis for test f1e306ca-d89a-5d70-bb57-03437eec4ea8 (18 runs)
Completed analysis for test 85cf6f38-d086-5590-bcea-45c7fd00b9ab (18 runs)
Completed analysis for test 1a6f0