Merged
1 change: 1 addition & 0 deletions Makefile
@@ -16,6 +16,7 @@ help:
@echo " test - Run tests"
@echo " docker-build - Build the Docker image"
@echo " docker-run - Run the Docker container with the example"
@echo " visualizer - Run the visualization script"

.PHONY: all
all: install test
23 changes: 17 additions & 6 deletions README.md
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op

## Artifacts Channel

OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.

The artifacts channel operates alongside the traditional fitness metrics.

@@ -205,17 +205,28 @@ return EvaluationResult(
```
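
For orientation, a complete evaluator return that combines numeric metrics with an artifacts dictionary might look like the sketch below; the `evaluate` signature, metric name, and artifact keys are illustrative assumptions rather than the exact ones from the collapsed example above.

```python
from openevolve.evaluation_result import EvaluationResult

def evaluate(program_path: str) -> EvaluationResult:  # signature is an assumption
    # Run the candidate program as usual, then report numeric fitness
    # alongside free-form diagnostics for the next prompt.
    return EvaluationResult(
        metrics={"score": 0.0},  # illustrative metric name
        artifacts={
            "stderr": "SyntaxError: invalid syntax (line 15)",
            "traceback": "...",  # full trace, if captured
        },
    )
```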

The next generation prompt will include:
```
```markdown
## Last Execution Output
### Stderr
```
SyntaxError: invalid syntax (line 15)
```

### Traceback
```
...
```

## Example: LLM Feedback

An example of the LLM artifact side-channel is part of the default evaluation template, which ends with:
```markdown
Return your evaluation as a JSON object with the following format:
{{
"readability": [score],
"maintainability": [score],
"efficiency": [score],
"reasoning": "[brief explanation of scores]"
}}
```
The non-numeric values, in this case the "reasoning" key of the JSON response that the evaluator LLM generates, will be available in the next generation prompt.
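
A minimal sketch of that split, with illustrative values (mirroring the parsing logic in `openevolve/evaluator.py` further down in this diff):

```python
import json

# Illustrative evaluator-LLM reply following the template above.
response = '{"readability": 0.8, "maintainability": 0.7, "efficiency": 0.9, "reasoning": "Clear structure, minor duplication."}'
result = json.loads(response)

# Numeric values are kept as metrics (prefixed with llm_ and weighted later);
# everything else becomes an artifact.
metrics = {k: float(v) for k, v in result.items() if isinstance(v, (int, float))}
artifacts = {k: v for k, v in result.items() if not isinstance(v, (int, float))}

print(metrics)    # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
print(artifacts)  # {'reasoning': 'Clear structure, minor duplication.'}
```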

### Configuration

@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
### Benefits

- **Faster convergence** - LLMs can see what went wrong and fix it directly
- **Better error handling** - Compilation and runtime failures become learning opportunities
- **Better error handling** - Compilation and runtime failures become learning opportunities
- **Rich debugging context** - Full stack traces and error messages guide improvements
- **Zero overhead** - When disabled, no performance impact on evaluation

1 change: 1 addition & 0 deletions configs/default_config.yaml
@@ -71,6 +71,7 @@ database:
# General settings
db_path: null # Path to persist database (null = in-memory only)
in_memory: true # Keep database in memory for faster access
log_prompts: true # If true, log all prompts and responses into the database

# Evolutionary parameters
population_size: 1000 # Maximum number of programs to keep in memory
4 changes: 4 additions & 0 deletions openevolve/config.py
@@ -142,6 +142,9 @@ class DatabaseConfig:
db_path: Optional[str] = None # Path to store database on disk
in_memory: bool = True

# Prompt and response logging to programs/<id>.json
log_prompts: bool = True

# Evolutionary parameters
population_size: int = 1000
archive_size: int = 100
@@ -308,6 +311,7 @@ def to_dict(self) -> Dict[str, Any]:
"migration_interval": self.database.migration_interval,
"migration_rate": self.database.migration_rate,
"random_seed": self.database.random_seed,
"log_prompts": self.database.log_prompts,
},
"evaluator": {
"timeout": self.evaluator.timeout,
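
As a small illustration of the new option, a sketch of toggling prompt logging in code, assuming `DatabaseConfig` is a plain dataclass that can be constructed directly (anything beyond the fields shown above is an assumption):

```python
from openevolve.config import DatabaseConfig

# Sketch: turn prompt logging off for one run; other fields keep their defaults.
db_config = DatabaseConfig(log_prompts=False)
print(db_config.log_prompts)  # False
```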
23 changes: 23 additions & 0 deletions openevolve/controller.py
@@ -10,6 +10,7 @@
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import traceback

from openevolve.config import Config, load_config
from openevolve.database import Program, ProgramDatabase
@@ -142,6 +143,7 @@ def __init__(
evaluation_file,
self.llm_evaluator_ensemble,
self.evaluator_prompt_sampler,
database=self.database,
)

logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
@@ -335,10 +337,30 @@ async def run(
# Add to database (will be added to current island)
self.database.add(child_program, iteration=i + 1)

# Log prompts
self.database.log_prompt(
template_key=(
"full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
),
program_id=child_id,
prompt=prompt,
responses=[llm_response],
)

# Store artifacts if they exist
if artifacts:
self.database.store_artifacts(child_id, artifacts)

# Increment generation for current island
self.database.increment_island_generation()

@@ -384,6 +406,7 @@ async def run(

except Exception as e:
logger.error(f"Error in iteration {i+1}: {str(e)}")
traceback.print_exc()
continue

# Get the best program using our tracking mechanism
58 changes: 55 additions & 3 deletions openevolve/database.py
@@ -118,6 +118,9 @@ def __init__(self, config: DatabaseConfig):
if config.db_path and os.path.exists(config.db_path):
self.load(config.db_path)

# Prompt log
self.prompts_by_program: Optional[Dict[str, Dict[str, Dict[str, str]]]] = None

# Set random seed for reproducible sampling if specified
if config.random_seed is not None:
import random
@@ -328,7 +331,14 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None:

# Save each program
for program in self.programs.values():
self._save_program(program, save_path)
prompts = None
if (
self.config.log_prompts
and self.prompts_by_program
and program.id in self.prompts_by_program
):
prompts = self.prompts_by_program[program.id]
self._save_program(program, save_path, prompts=prompts)

# Save metadata
metadata = {
@@ -489,13 +499,19 @@ def _distribute_programs_to_islands(self) -> None:

logger.info(f"Distributed {len(program_ids)} programs across {len(self.islands)} islands")

def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
def _save_program(
self,
program: Program,
base_path: Optional[str] = None,
prompts: Optional[Dict[str, Dict[str, str]]] = None,
) -> None:
"""
Save a program to disk

Args:
program: Program to save
base_path: Base path to save to (uses config.db_path if None)
prompts: Optional prompts to save with the program, in the format {template_key: { 'system': str, 'user': str }}
"""
save_path = base_path or self.config.db_path
if not save_path:
@@ -506,9 +522,13 @@ def _save_program(self, program: Program, base_path: Optional[str] = None) -> No
os.makedirs(programs_dir, exist_ok=True)

# Save program
program_dict = program.to_dict()
if prompts:
program_dict["prompts"] = prompts
program_path = os.path.join(programs_dir, f"{program.id}.json")

with open(program_path, "w") as f:
json.dump(program.to_dict(), f)
json.dump(program_dict, f)

def _calculate_feature_coords(self, program: Program) -> List[int]:
"""
@@ -1288,3 +1308,35 @@ def _load_artifact_dir(self, artifact_dir: str) -> Dict[str, Union[str, bytes]]:
logger.warning(f"Failed to list artifact directory {artifact_dir}: {e}")

return artifacts

def log_prompt(
self,
program_id: str,
template_key: str,
prompt: Dict[str, str],
responses: Optional[List[str]] = None,
) -> None:
"""
Log a prompt for a program.
Only logs if self.config.log_prompts is True.

Args:
program_id: ID of the program to log the prompt for
template_key: Key for the prompt template
prompt: Prompt in the format { 'system': str, 'user': str }.
responses: Optional list of responses to the prompt, if available.
"""

if not self.config.log_prompts:
return

if responses is None:
responses = []
prompt["responses"] = responses

if self.prompts_by_program is None:
self.prompts_by_program = {}

if program_id not in self.prompts_by_program:
self.prompts_by_program[program_id] = {}
self.prompts_by_program[program_id][template_key] = prompt
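
Taken together with `_save_program` above, enabling `log_prompts` means each saved `programs/<id>.json` gains a `prompts` key. A minimal sketch of reading it back (the path is a hypothetical placeholder; the template keys shown come from the controller and evaluator changes in this PR):

```python
import json

# Sketch: inspect the prompts logged alongside one saved program.
with open("db/programs/<program_id>.json") as f:  # hypothetical db_path layout
    program = json.load(f)

# Expected shape per log_prompt/_save_program above:
#   {"diff_user": {"system": "...", "user": "...", "responses": ["..."]},
#    "evaluation": {"system": "...", "user": "...", "responses": ["..."]}}
for template_key, entry in program.get("prompts", {}).items():
    print(template_key, len(entry.get("responses", [])))
```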
68 changes: 56 additions & 12 deletions openevolve/evaluator.py
@@ -18,7 +18,9 @@
import traceback

from openevolve.config import EvaluatorConfig
from openevolve.database import ProgramDatabase
from openevolve.evaluation_result import EvaluationResult
from openevolve.llm.ensemble import LLMEnsemble
from openevolve.utils.async_utils import TaskPool, run_in_executor
from openevolve.prompt.sampler import PromptSampler
@@ -41,11 +43,13 @@ def __init__(
evaluation_file: str,
llm_ensemble: Optional[LLMEnsemble] = None,
prompt_sampler: Optional[PromptSampler] = None,
database: Optional[ProgramDatabase] = None,
):
self.config = config
self.evaluation_file = evaluation_file
self.llm_ensemble = llm_ensemble
self.prompt_sampler = prompt_sampler
self.database = database

# Create a task pool for parallel evaluation
self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -131,16 +135,40 @@ async def evaluate_program(
eval_result = self._process_evaluation_result(result)

# Add LLM feedback if configured
llm_eval_result = None
if self.config.use_llm_feedback and self.llm_ensemble:
feedback_metrics = await self._llm_evaluate(program_code)
llm_result = await self._llm_evaluate(program_code, program_id=program_id)
llm_eval_result = self._process_evaluation_result(llm_result)

# Combine metrics
for name, value in feedback_metrics.items():
for name, value in llm_result.metrics.items():
eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight

# Store artifacts if enabled and present
if artifacts_enabled and eval_result.has_artifacts() and program_id:
self._pending_artifacts[program_id] = eval_result.artifacts
if (
artifacts_enabled
and (
eval_result.has_artifacts()
or (llm_eval_result and llm_eval_result.has_artifacts())
)
and program_id
):
self._pending_artifacts[program_id] = {}

# Merge eval_result artifacts with llm artifacts if they exist
if eval_result.has_artifacts():
self._pending_artifacts[program_id].update(eval_result.artifacts)
logger.debug(
f"Program{program_id_str} returned artifacts: "
f"{eval_result.artifacts}"
)

if llm_eval_result and llm_eval_result.has_artifacts():
self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
logger.debug(
f"Program{program_id_str} returned LLM artifacts: "
f"{llm_eval_result.artifacts}"
)

elapsed = time.time() - start_time
logger.info(
@@ -156,6 +184,7 @@
logger.warning(
f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
)
traceback.print_exc()

# Capture failure artifacts if enabled
if artifacts_enabled and program_id:
@@ -378,12 +407,13 @@ async def _cascade_evaluate(
},
)

async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
async def _llm_evaluate(self, program_code: str, program_id: str = "") -> EvaluationResult:
"""
Use LLM to evaluate code quality

Args:
program_code: Code to evaluate
program_id: Optional ID for logging

Returns:
EvaluationResult with the extracted metrics and any non-numeric fields as artifacts
@@ -402,12 +432,22 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
prompt["system"], [{"role": "user", "content": prompt["user"]}]
)

# Log prompt and response to database
if self.database and program_id:
self.database.log_prompt(
program_id=program_id,
template_key="evaluation",
prompt=prompt,
responses=responses,
)

# Extract JSON from response
try:
# Try to find JSON block
json_pattern = r"```json\n(.*?)\n```"
import re

artifacts = {}
avg_metrics = {}
for i, response in enumerate(responses):
json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +466,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
# Parse JSON
result = json.loads(json_str)

# Filter all non-numeric values
metrics = {
name: float(value)
for name, value in result.items()
if isinstance(value, (int, float))
}
# All non-numeric values are artifacts, all numeric values are metrics
metrics = {}
for key, value in result.items():
if not isinstance(value, (int, float)):
artifacts[key] = value
else:
metrics[key] = float(value)

# Weight of the model in the ensemble
weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +484,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
else:
avg_metrics[name] = value * weight

return avg_metrics
return EvaluationResult(
metrics=avg_metrics,
artifacts=artifacts,
)

except Exception as e:
logger.warning(f"Error parsing LLM response: {str(e)}")