# Sprint 007 End-to-End Test

This notebook runs an end-to-end test of the Sprint 007 deliverables: prompt infrastructure, prompt contracts, and Phase E answer synthesis.

**Assumptions:**
- Live llama-cpp backend running on localhost:8000
- OpenAI-compatible API endpoint at `http://localhost:8000/v1/chat/completions`

## Setup and Imports

In [None]:
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
from IPython.display import display, Markdown, JSON

# Make the Aeon project importable.
# The checkout location can be overridden with the AEON_PROJECT_ROOT environment
# variable; when it is unset, the original hardcoded path is used unchanged.
import os

PROJECT_ROOT = os.environ.get("AEON_PROJECT_ROOT", '/home/brian/projects/Aeon-Architect')
# Guard the insert so that re-running this cell does not stack duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Import Aeon components
from aeon.llm.adapters.llama_cpp import LlamaCppAdapter
from aeon.memory.kv_store import InMemoryKVStore
from aeon.tools.registry import ToolRegistry
from aeon.supervisor.repair import Supervisor
from aeon.validation.schema import Validator
from aeon.kernel.orchestrator import Orchestrator
from aeon.kernel.state import OrchestrationState, ExecutionContext
from aeon.plan.models import Plan
from aeon.observability.logger import JSONLLogger
from aeon.observability.helpers import generate_correlation_id

## Configuration

In [None]:
# --- LLM backend -------------------------------------------------------------
# Endpoint and model name handed to the LlamaCppAdapter.
LLM_API_URL = "http://localhost:8000/v1/chat/completions"
LLM_MODEL = "llama-cpp-model"

# --- Execution budget --------------------------------------------------------
# Passed to the Orchestrator as its `ttl` argument.
TTL = 50  # upper bound on cycles

# --- Logging -----------------------------------------------------------------
# ENABLE_LOGGING controls whether a JSONLLogger is created at all.
# LOG_FILE is an optional path; leave it as None when logs need not be saved.
ENABLE_LOGGING = True
LOG_FILE = None

## Initialize Components

In [None]:
# Build every collaborator the orchestrator cell needs, announcing each one
# as soon as it has been constructed.
print("Initializing components...")

# LLM adapter pointed at the llama-cpp server from the configuration cell
llm = LlamaCppAdapter(api_url=LLM_API_URL, model=LLM_MODEL, max_retries=3)
print(f"✓ LLM Adapter initialized: {LLM_API_URL}")

# Key/value memory store
memory = InMemoryKVStore()
print("✓ Memory initialized")

# Tool registry (nothing is registered in this cell)
tool_registry = ToolRegistry()
print("✓ Tool Registry initialized")

# Supervisor for error repair; it is given the same LLM adapter
supervisor = Supervisor(llm_adapter=llm)
print("✓ Supervisor initialized")

# Validator
validator = Validator()
print("✓ Validator initialized")

# Logger is optional: it stays None unless logging is switched on
logger = None
if ENABLE_LOGGING:
    # JSONLLogger takes `file_path` (Optional[Path]) rather than `log_file`
    log_path = None if not LOG_FILE else Path(LOG_FILE)
    logger = JSONLLogger(file_path=log_path)
    print("✓ Logger initialized")

print("\nAll components initialized successfully!")

## Create Orchestrator

In [None]:
# Wire the previously initialized components into a single Orchestrator and
# print a short summary of how it was configured.
print("Creating Orchestrator...")

orchestrator = Orchestrator(
    llm=llm,
    memory=memory,
    ttl=TTL,
    tool_registry=tool_registry,
    supervisor=supervisor,
    logger=logger
)

print("✓ Orchestrator created")
print(f"  - TTL: {TTL} cycles")
# Identity checks instead of truthiness: a freshly created (empty) store or
# registry would evaluate as False if its class defines __len__/__bool__
# (not visible from this notebook), and would be misreported as "Disabled".
print(f"  - Memory: {'Enabled' if memory is not None else 'Disabled'}")
print(f"  - Tools: {len(tool_registry.list_all()) if tool_registry is not None else 0} registered")
print(f"  - Logging: {'Enabled' if logger is not None else 'Disabled'}")

## Define User Prompt

Edit the `user_prompt` string in the cell below to choose the request that is sent through the orchestrator:

In [None]:
# Test input for the run - change this string to exercise a different scenario.
user_prompt = "Design a simple web application architecture with user authentication"

# Echo the prompt together with the moment it was set.
print(
    f"User Prompt: {user_prompt}",
    f"Timestamp: {datetime.now().isoformat()}",
    sep="\n",
)

## Execute End-to-End Test

This will run the complete orchestration chain:
- Phase A: TaskProfile & TTL allocation
- Phase B: Initial Plan & Pre-Execution Refinement
- Phase C: Execution Passes (Execute Batch → Evaluate → Decide → Refine)
- Phase D: Adaptive Depth (TaskProfile updates)
- Phase E: Answer Synthesis (Final Answer generation)

In [None]:
# Run the full multi-pass orchestration for `user_prompt` and record the outcome
# in `execution_log`, which the later reporting cells read.
#
# The start timestamp is captured exactly once so that the banner printed here
# and the "start_time" stored in the log (and echoed by the summary cell) agree.
run_started_at = datetime.now().isoformat()

print("=" * 80)
print("EXECUTING END-TO-END TEST")
print("=" * 80)
print(f"Request: {user_prompt}")
print(f"Start Time: {run_started_at}")
print("\n" + "-" * 80)

# Track execution phases
# NOTE: "phases" is initialised empty and is not appended to anywhere in this cell.
execution_log = {
    "phases": [],
    "start_time": run_started_at,
    "user_prompt": user_prompt
}

try:
    # Execute multi-pass orchestration
    result = orchestrator.execute_multipass(
        request=user_prompt,
        plan=None  # Let the system generate the plan
    )

    execution_log["end_time"] = datetime.now().isoformat()
    execution_log["status"] = "success"
    execution_log["result"] = result

    print("\n" + "=" * 80)
    print("EXECUTION COMPLETED SUCCESSFULLY")
    print("=" * 80)

except Exception as e:
    # Record the failure first so the reporting cells can still describe it...
    execution_log["end_time"] = datetime.now().isoformat()
    execution_log["status"] = "error"
    execution_log["error"] = str(e)
    execution_log["error_type"] = type(e).__name__

    print("\n" + "=" * 80)
    print(f"EXECUTION FAILED: {type(e).__name__}")
    print("=" * 80)
    print(f"Error: {str(e)}")

    # ...then re-raise to see the full traceback
    raise

## Execution Summary

In [None]:
# High-level summary of the run: status, pass/refinement counts, the Phase E
# final answer, and wall-clock duration. Reads only from `execution_log`.
if execution_log.get("status") == "success":
    result = execution_log["result"]

    print("EXECUTION SUMMARY")
    print("-" * 80)
    print(f"Status: {result.get('status', 'unknown')}")
    print(f"Execution ID: {result.get('execution_id', 'N/A')}")

    # Display execution history if available
    if "execution_history" in result:
        history = result["execution_history"]
        print(f"\nExecution History:")
        print(f"  - Total Passes: {len(history.get('passes', []))}")

        if "overall_statistics" in history:
            stats = history["overall_statistics"]
            print(f"  - Converged: {stats.get('convergence_achieved', 'N/A')}")
            print(f"  - Total Refinements: {stats.get('total_refinements', 0)}")

    # Display final answer if available (Phase E output)
    if "final_answer" in result:
        final_answer = result["final_answer"]

        # Normalise a Pydantic model to a plain dict, exactly as the
        # "Phase E Details" cell does; otherwise a model instance would skip
        # the structured display below and only be printed via its repr.
        if hasattr(final_answer, 'model_dump'):
            final_answer = final_answer.model_dump()
        elif hasattr(final_answer, 'dict'):
            final_answer = final_answer.dict()

        print("\n" + "=" * 80)
        print("FINAL ANSWER (Phase E Synthesis)")
        print("=" * 80)

        if isinstance(final_answer, dict):
            # Display answer text prominently
            if "answer_text" in final_answer:
                print("\nAnswer:")
                print("-" * 80)
                print(final_answer["answer_text"])
                print("-" * 80)

            # Display confidence if present
            if "confidence" in final_answer and final_answer["confidence"] is not None:
                print(f"\nConfidence: {final_answer['confidence']}")

            # Display TTL exhaustion warning if present
            if final_answer.get("ttl_exhausted", False):
                print("\n⚠️  WARNING: TTL was exhausted during execution")

            # Display notes if present
            if "notes" in final_answer and final_answer["notes"]:
                print(f"\nNotes: {final_answer['notes']}")

            # Display used step IDs if present.
            # str() each ID first: str.join raises TypeError on non-string items.
            if "used_step_ids" in final_answer and final_answer["used_step_ids"]:
                step_ids = ', '.join(str(step_id) for step_id in final_answer['used_step_ids'])
                print(f"\nUsed Step IDs: {step_ids}")

            # Display metadata summary
            if "metadata" in final_answer and final_answer["metadata"]:
                print("\nMetadata:")
                for key, value in final_answer["metadata"].items():
                    print(f"  - {key}: {value}")
        else:
            print(final_answer)
    else:
        print("\n⚠️  No final answer available in result")

    # Display timing information (both timestamps were stored as ISO strings)
    start_time = datetime.fromisoformat(execution_log["start_time"])
    end_time = datetime.fromisoformat(execution_log["end_time"])
    duration = (end_time - start_time).total_seconds()
    print("\n" + "-" * 80)
    print(f"Execution Duration: {duration:.2f} seconds")
    print(f"Start: {execution_log['start_time']}")
    print(f"End: {execution_log['end_time']}")
else:
    print("Execution failed. See error details above.")

## Detailed Execution Log

In [None]:
# Pass-by-pass breakdown of the run stored in execution_log["result"],
# followed by the overall statistics when the history provides them.
if execution_log.get("status") != "success":
    print("Execution failed. No detailed log available.")
else:
    result = execution_log["result"]

    print("DETAILED EXECUTION LOG")
    print("=" * 80)

    if "execution_history" in result:
        history = result["execution_history"]
        passes = history.get("passes", [])

        print(f"\nTotal Execution Passes: {len(passes)}")
        print("\n" + "-" * 80)

        for pass_index, pass_record in enumerate(passes, start=1):
            # Header lines shown for every pass
            print(f"\nPass {pass_index}:")
            print(f"  - Pass Number: {pass_record.get('pass_number', 'N/A')}")
            print(f"  - Phase: {pass_record.get('phase', 'N/A')}")
            print(f"  - TTL Remaining: {pass_record.get('ttl_remaining', 'N/A')}")

            # Timing (the end time is only shown when it was recorded)
            if "timing_information" in pass_record:
                timing = pass_record["timing_information"]
                print(f"  - Start Time: {timing.get('start_time', 'N/A')}")
                if "end_time" in timing:
                    print(f"  - End Time: {timing.get('end_time', 'N/A')}")

            # Plan snapshot: goal, step count, and a preview of the first 3 steps
            if "plan_state" in pass_record and isinstance(pass_record["plan_state"], dict):
                plan_state = pass_record["plan_state"]
                print(f"  - Plan Goal: {plan_state.get('goal', 'N/A')}")
                steps = plan_state.get("steps", [])
                print(f"  - Steps Count: {len(steps)}")
                for step in steps[:3]:
                    preview = step.get("description", "")[:50]
                    print(f"    • Step {step.get('step_id', 'unknown')} [{step.get('status', 'unknown')}]: {preview}...")
                remaining = len(steps) - 3
                if remaining > 0:
                    print(f"    ... and {remaining} more steps")

            # Execution results: only a count is reported
            if "execution_results" in pass_record and pass_record["execution_results"]:
                print(f"  - Execution Results: {len(pass_record['execution_results'])} items")

            # Evaluation results, with the convergence flag when it can be read
            if "evaluation_results" in pass_record and pass_record["evaluation_results"]:
                evaluation = pass_record["evaluation_results"]
                print("  - Evaluation Results: Present")
                if isinstance(evaluation, dict) and "convergence_assessment" in evaluation:
                    convergence = evaluation["convergence_assessment"]
                    if isinstance(convergence, dict):
                        print(f"    • Converged: {convergence.get('converged', 'N/A')}")

            # Refinements: a count for lists, otherwise just a presence marker
            if "refinement_changes" in pass_record and pass_record["refinement_changes"]:
                changes = pass_record["refinement_changes"]
                if isinstance(changes, list):
                    refinement_label = len(changes)
                else:
                    refinement_label = 'Present'
                print(f"  - Refinements: {refinement_label}")

        # Overall statistics, printed after all passes
        if "overall_statistics" in history:
            print("\n" + "=" * 80)
            print("OVERALL STATISTICS")
            print("=" * 80)
            for stat_name, stat_value in history["overall_statistics"].items():
                print(f"  {stat_name}: {stat_value}")

## Full Result JSON (for Audit)

In [None]:
def make_json_serializable(obj):
    """Recursively convert ``obj`` into something ``json.dumps`` can encode.

    Args:
        obj: Any value; typically the orchestration result or a part of it.

    Returns:
        JSON primitives unchanged; dicts with non-primitive keys stringified and
        values converted recursively; lists, tuples and sets as lists of
        converted items; objects exposing ``model_dump()`` or ``dict()``
        (e.g. Pydantic models) as their converted dump; anything else as
        ``str(obj)``.
    """
    json_primitives = (str, int, float, bool, type(None))

    if isinstance(obj, json_primitives):
        return obj
    if isinstance(obj, dict):
        # json.dumps only accepts str/int/float/bool/None keys, and its
        # `default=` hook is never applied to keys, so stringify the rest here.
        return {
            (key if isinstance(key, json_primitives) else str(key)): make_json_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple, set, frozenset)):
        # Keep the container structure instead of flattening it to one string.
        return [make_json_serializable(item) for item in obj]
    if not isinstance(obj, type):
        # Instances only: calling model_dump()/dict() on a class would fail.
        # The dump is converted recursively, since it may contain nested values.
        if callable(getattr(obj, 'model_dump', None)):
            return make_json_serializable(obj.model_dump())
        if callable(getattr(obj, 'dict', None)):
            return make_json_serializable(obj.dict())
    return str(obj)


# Dump the complete orchestration result as JSON for auditing, or show the
# recorded error when the run failed.
if execution_log.get("status") == "success":
    result = execution_log["result"]

    serializable_result = make_json_serializable(result)

    print("FULL RESULT JSON")
    print("=" * 80)
    # default=str is kept as a last-resort safety net
    print(json.dumps(serializable_result, indent=2, default=str))

    # Optionally save to file
    # with open('execution_result.json', 'w') as f:
    #     json.dump(serializable_result, f, indent=2, default=str)
    # print("\n✓ Result saved to execution_result.json")
else:
    print("Execution failed. No result to display.")
    if "error" in execution_log:
        print(f"\nError Details:")
        print(json.dumps({
            "error_type": execution_log.get("error_type"),
            "error": execution_log.get("error")
        }, indent=2))

## Phase E (Answer Synthesis) Details

This section shows the Phase E output in detail, which is the key deliverable from Sprint 007.

In [None]:
# Phase E inspection: dump the FinalAnswer payload and report which of the
# expected fields it carries.
if execution_log.get("status") == "success":
    result = execution_log["result"]

    if "final_answer" in result:
        final_answer = result["final_answer"]

        # Convert to dict if it's a Pydantic model
        if hasattr(final_answer, 'model_dump'):
            final_answer = final_answer.model_dump()
        elif hasattr(final_answer, 'dict'):
            final_answer = final_answer.dict()

        print("PHASE E: ANSWER SYNTHESIS DETAILS")
        print("=" * 80)
        print("\nThis is the output from Phase E (Answer Synthesis), which consolidates")
        print("execution results, plan state, and convergence information into a")
        print("coherent final answer.")
        print("\n" + "-" * 80)

        print("\nFull FinalAnswer Structure:")
        print(json.dumps(final_answer, indent=2, default=str))

        print("\n" + "=" * 80)
        print("VALIDATION")
        print("=" * 80)

        if not isinstance(final_answer, dict):
            # Without this guard a string answer would be substring-matched by
            # `field in final_answer` and then crash on `.get`.
            print(f"\n✗ final_answer is a {type(final_answer).__name__}, not a dict; field validation skipped")
        else:
            # Validate FinalAnswer structure
            required_fields = ["answer_text", "metadata"]
            optional_fields = ["confidence", "used_step_ids", "notes", "ttl_exhausted"]

            print("\nRequired Fields:")
            for field in required_fields:
                if field in final_answer:
                    print(f"  ✓ {field}: {type(final_answer[field]).__name__}")
                else:
                    # Report the absence explicitly instead of the type of a
                    # placeholder value (which used to print as "str").
                    print(f"  ✗ {field}: MISSING")

            print("\nOptional Fields:")
            for field in optional_fields:
                if field in final_answer:
                    value = final_answer[field]
                    print(f"  ✓ {field}: {value}")
                else:
                    print(f"  - {field}: Not present")

            print("\nMetadata Fields:")
            if "metadata" in final_answer and isinstance(final_answer["metadata"], dict):
                for key, value in final_answer["metadata"].items():
                    print(f"  - {key}: {value}")
            else:
                print("  - No metadata present")
    else:
        print("⚠️  No final_answer found in result.")
        print("\nThis may indicate:")
        print("  1. Phase E was not executed")
        print("  2. Phase E execution failed")
        print("  3. Result structure is different than expected")
        print("\nAvailable keys in result:")
        for key in result.keys():
            print(f"  - {key}")
else:
    print("Execution failed. Cannot display Phase E details.")

## Prompt Registry Audit

Verify that prompts are being retrieved from the centralized registry (Sprint 007 User Story 1).

In [None]:
from aeon.prompts.registry import get_prompt_registry, PromptId

# List everything in the prompt registry, then confirm the two Phase E
# (answer synthesis) prompts are registered and carry input/output models.
print("PROMPT REGISTRY AUDIT")
print("=" * 80)

registry = get_prompt_registry()
all_prompts = registry.list_prompts()

print(f"\nTotal Prompts in Registry: {len(all_prompts)}")
print("\nAvailable Prompt IDs:")
ordered_prompt_ids = sorted(all_prompts, key=lambda pid: pid.value)
for prompt_id in ordered_prompt_ids:
    print(f"  - {prompt_id.value}")

divider = "-" * 80
print("\n" + divider)
print("Key Prompts for Sprint 007:")
print(divider)

# Phase E prompts (User Story 3)
phase_e_prompts = [
    PromptId.ANSWER_SYNTHESIS_SYSTEM,
    PromptId.ANSWER_SYNTHESIS_USER
]

for prompt_id in phase_e_prompts:
    if prompt_id not in all_prompts:
        print(f"✗ {prompt_id.value}: NOT REGISTERED")
        continue

    print(f"✓ {prompt_id.value}: Registered")
    try:
        # NOTE(review): this reads the registry's private `_registry` mapping;
        # switch to a public accessor if the registry offers one - confirm.
        prompt_def = registry._registry.get(prompt_id)
        if prompt_def:
            input_declared = prompt_def.input_model is not None
            output_declared = prompt_def.output_model is not None
            print(f"    - Input Model: {'✓' if input_declared else '✗'}")
            print(f"    - Output Model: {'✓' if output_declared else '✗'}")
    except Exception as e:
        # Best effort: a failed lookup is reported but does not abort the audit
        print(f"    - Error checking definition: {e}")

print("\n✓ Prompt registry audit complete")