Merged
1 change: 1 addition & 0 deletions Makefile
@@ -16,6 +16,7 @@ help:
@echo " test - Run tests"
@echo " docker-build - Build the Docker image"
@echo " docker-run - Run the Docker container with the example"
@echo " visualizer - Run the visualization script"

.PHONY: all
all: install test
23 changes: 17 additions & 6 deletions README.md
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op

## Artifacts Channel

OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.

The artifacts channel operates alongside the traditional fitness metrics.

@@ -205,17 +205,28 @@ return EvaluationResult(
```
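
For orientation, a complete evaluator return that combines numeric metrics with an artifacts dictionary might look like the sketch below; the `evaluate` signature, metric name, and artifact keys are illustrative assumptions rather than the exact ones from the collapsed example above.

```python
from openevolve.evaluation_result import EvaluationResult

def evaluate(program_path: str) -> EvaluationResult:  # signature is an assumption
    # Run the candidate program as usual, then report numeric fitness
    # alongside free-form diagnostics for the next prompt.
    return EvaluationResult(
        metrics={"score": 0.0},  # illustrative metric name
        artifacts={
            "stderr": "SyntaxError: invalid syntax (line 15)",
            "traceback": "...",  # full trace, if captured
        },
    )
```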

The next generation prompt will include:
```
```markdown
## Last Execution Output
### Stderr
```
SyntaxError: invalid syntax (line 15)
```

### Traceback
```
...
```

## Example: LLM Feedback

An example of the LLM artifact side-channel is part of the default evaluation template, which ends with:
```markdown
Return your evaluation as a JSON object with the following format:
{{
"readability": [score],
"maintainability": [score],
"efficiency": [score],
"reasoning": "[brief explanation of scores]"
}}
```
The non-numeric values, in this case the "reasoning" key of the JSON response that the evaluator LLM generates, will be available in the next generation prompt.
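
A minimal sketch of that split, with illustrative values (mirroring the parsing logic in `openevolve/evaluator.py` further down in this diff):

```python
import json

# Illustrative evaluator-LLM reply following the template above.
response = '{"readability": 0.8, "maintainability": 0.7, "efficiency": 0.9, "reasoning": "Clear structure, minor duplication."}'
result = json.loads(response)

# Numeric values are kept as metrics (prefixed with llm_ and weighted later);
# everything else becomes an artifact.
metrics = {k: float(v) for k, v in result.items() if isinstance(v, (int, float))}
artifacts = {k: v for k, v in result.items() if not isinstance(v, (int, float))}

print(metrics)    # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
print(artifacts)  # {'reasoning': 'Clear structure, minor duplication.'}
```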

### Configuration

@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
### Benefits

- **Faster convergence** - LLMs can see what went wrong and fix it directly
- **Better error handling** - Compilation and runtime failures become learning opportunities
- **Better error handling** - Compilation and runtime failures become learning opportunities
- **Rich debugging context** - Full stack traces and error messages guide improvements
- **Zero overhead** - When disabled, no performance impact on evaluation

1 change: 1 addition & 0 deletions configs/default_config.yaml
@@ -71,6 +71,7 @@ database:
# General settings
db_path: null # Path to persist database (null = in-memory only)
in_memory: true # Keep database in memory for faster access
log_prompts: true # If true, log all prompts and responses into the database

# Evolutionary parameters
population_size: 1000 # Maximum number of programs to keep in memory
4 changes: 4 additions & 0 deletions openevolve/config.py
@@ -142,6 +142,9 @@ class DatabaseConfig:
db_path: Optional[str] = None # Path to store database on disk
in_memory: bool = True

# Prompt and response logging to programs/<id>.json
log_prompts: bool = True

# Evolutionary parameters
population_size: int = 1000
archive_size: int = 100
@@ -308,6 +311,7 @@ def to_dict(self) -> Dict[str, Any]:
"migration_interval": self.database.migration_interval,
"migration_rate": self.database.migration_rate,
"random_seed": self.database.random_seed,
"log_prompts": self.database.log_prompts,
},
"evaluator": {
"timeout": self.evaluator.timeout,
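
As a small illustration of the new option, a sketch of toggling prompt logging in code, assuming `DatabaseConfig` is a plain dataclass that can be constructed directly (anything beyond the fields shown above is an assumption):

```python
from openevolve.config import DatabaseConfig

# Sketch: turn prompt logging off for one run; other fields keep their defaults.
db_config = DatabaseConfig(log_prompts=False)
print(db_config.log_prompts)  # False
```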
23 changes: 23 additions & 0 deletions openevolve/controller.py
@@ -10,6 +10,7 @@
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import traceback

from openevolve.config import Config, load_config
from openevolve.database import Program, ProgramDatabase
@@ -142,6 +143,7 @@ def __init__(
evaluation_file,
self.llm_evaluator_ensemble,
self.evaluator_prompt_sampler,
database=self.database,
)

logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
@@ -335,10 +337,30 @@ async def run(
# Add to database (will be added to current island)
self.database.add(child_program, iteration=i + 1)

# Log prompts
self.database.log_prompt(
template_key=(
"full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
),
program_id=child_id,
prompt=prompt,
responses=[llm_response],
)

# Store artifacts if they exist
if artifacts:
self.database.store_artifacts(child_id, artifacts)

# Increment generation for current island
self.database.increment_island_generation()

@@ -384,6 +406,7 @@ async def run(

except Exception as e:
logger.error(f"Error in iteration {i+1}: {str(e)}")
traceback.print_exc()
continue

# Get the best program using our tracking mechanism
58 changes: 55 additions & 3 deletions openevolve/database.py
@@ -118,6 +118,9 @@ def __init__(self, config: DatabaseConfig):
if config.db_path and os.path.exists(config.db_path):
self.load(config.db_path)

# Prompt log
self.prompts_by_program: Optional[Dict[str, Dict[str, Dict[str, str]]]] = None

# Set random seed for reproducible sampling if specified
if config.random_seed is not None:
import random
@@ -328,7 +331,14 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None:

# Save each program
for program in self.programs.values():
self._save_program(program, save_path)
prompts = None
if (
self.config.log_prompts
and self.prompts_by_program
and program.id in self.prompts_by_program
):
prompts = self.prompts_by_program[program.id]
self._save_program(program, save_path, prompts=prompts)

# Save metadata
metadata = {
@@ -489,13 +499,19 @@ def _distribute_programs_to_islands(self) -> None:

logger.info(f"Distributed {len(program_ids)} programs across {len(self.islands)} islands")

def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
def _save_program(
self,
program: Program,
base_path: Optional[str] = None,
prompts: Optional[Dict[str, Dict[str, str]]] = None,
) -> None:
"""
Save a program to disk

Args:
program: Program to save
base_path: Base path to save to (uses config.db_path if None)
prompts: Optional prompts to save with the program, in the format {template_key: { 'system': str, 'user': str }}
"""
save_path = base_path or self.config.db_path
if not save_path:
@@ -506,9 +522,13 @@ def _save_program(self, program: Program, base_path: Optional[str] = None) -> No
os.makedirs(programs_dir, exist_ok=True)

# Save program
program_dict = program.to_dict()
if prompts:
program_dict["prompts"] = prompts
program_path = os.path.join(programs_dir, f"{program.id}.json")

with open(program_path, "w") as f:
json.dump(program.to_dict(), f)
json.dump(program_dict, f)

def _calculate_feature_coords(self, program: Program) -> List[int]:
"""
@@ -1288,3 +1308,35 @@ def _load_artifact_dir(self, artifact_dir: str) -> Dict[str, Union[str, bytes]]:
logger.warning(f"Failed to list artifact directory {artifact_dir}: {e}")

return artifacts

def log_prompt(
self,
program_id: str,
template_key: str,
prompt: Dict[str, str],
responses: Optional[List[str]] = None,
) -> None:
"""
Log a prompt for a program.
Only logs if self.config.log_prompts is True.

Args:
program_id: ID of the program to log the prompt for
template_key: Key for the prompt template
prompt: Prompt in the format { 'system': str, 'user': str }.
responses: Optional list of responses to the prompt, if available.
"""

if not self.config.log_prompts:
return

if responses is None:
responses = []
prompt["responses"] = responses

if self.prompts_by_program is None:
self.prompts_by_program = {}

if program_id not in self.prompts_by_program:
self.prompts_by_program[program_id] = {}
self.prompts_by_program[program_id][template_key] = prompt
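
Taken together with `_save_program` above, enabling `log_prompts` means each saved `programs/<id>.json` gains a `prompts` key. A minimal sketch of reading it back (the path is a hypothetical placeholder; the template keys shown come from the controller and evaluator changes in this PR):

```python
import json

# Sketch: inspect the prompts logged alongside one saved program.
with open("db/programs/<program_id>.json") as f:  # hypothetical db_path layout
    program = json.load(f)

# Expected shape per log_prompt/_save_program above:
#   {"diff_user": {"system": "...", "user": "...", "responses": ["..."]},
#    "evaluation": {"system": "...", "user": "...", "responses": ["..."]}}
for template_key, entry in program.get("prompts", {}).items():
    print(template_key, len(entry.get("responses", [])))
```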
68 changes: 56 additions & 12 deletions openevolve/evaluator.py
@@ -18,7 +18,9 @@
import traceback

from openevolve.config import EvaluatorConfig
from openevolve.database import ProgramDatabase
from openevolve.evaluation_result import EvaluationResult
from openevolve.llm.ensemble import LLMEnsemble
from openevolve.utils.async_utils import TaskPool, run_in_executor
from openevolve.prompt.sampler import PromptSampler
@@ -41,11 +43,13 @@ def __init__(
evaluation_file: str,
llm_ensemble: Optional[LLMEnsemble] = None,
prompt_sampler: Optional[PromptSampler] = None,
database: Optional[ProgramDatabase] = None,
):
self.config = config
self.evaluation_file = evaluation_file
self.llm_ensemble = llm_ensemble
self.prompt_sampler = prompt_sampler
self.database = database

# Create a task pool for parallel evaluation
self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -131,16 +135,40 @@ async def evaluate_program(
eval_result = self._process_evaluation_result(result)

# Add LLM feedback if configured
llm_eval_result = None
if self.config.use_llm_feedback and self.llm_ensemble:
feedback_metrics = await self._llm_evaluate(program_code)
llm_result = await self._llm_evaluate(program_code, program_id=program_id)
llm_eval_result = self._process_evaluation_result(llm_result)

# Combine metrics
for name, value in feedback_metrics.items():
for name, value in llm_result.metrics.items():
eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight

# Store artifacts if enabled and present
if artifacts_enabled and eval_result.has_artifacts() and program_id:
self._pending_artifacts[program_id] = eval_result.artifacts
if (
artifacts_enabled
and (
eval_result.has_artifacts()
or (llm_eval_result and llm_eval_result.has_artifacts())
)
and program_id
):
self._pending_artifacts[program_id] = {}

# Merge eval_result artifacts with llm artifacts if they exist
if eval_result.has_artifacts():
self._pending_artifacts[program_id].update(eval_result.artifacts)
logger.debug(
f"Program{program_id_str} returned artifacts: "
f"{eval_result.artifacts}"
)

if llm_eval_result and llm_eval_result.has_artifacts():
self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
logger.debug(
f"Program{program_id_str} returned LLM artifacts: "
f"{llm_eval_result.artifacts}"
)

elapsed = time.time() - start_time
logger.info(
@@ -156,6 +184,7 @@
logger.warning(
f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
)
traceback.print_exc()

# Capture failure artifacts if enabled
if artifacts_enabled and program_id:
@@ -378,12 +407,13 @@ async def _cascade_evaluate(
},
)

async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
async def _llm_evaluate(self, program_code: str, program_id: str = "") -> EvaluationResult:
"""
Use LLM to evaluate code quality

Args:
program_code: Code to evaluate
program_id: Optional ID for logging

Returns:
EvaluationResult with the extracted metrics and any non-numeric fields as artifacts
@@ -402,12 +432,22 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
prompt["system"], [{"role": "user", "content": prompt["user"]}]
)

# Log prompt and response to database
if self.database and program_id:
self.database.log_prompt(
program_id=program_id,
template_key="evaluation",
prompt=prompt,
responses=responses,
)

# Extract JSON from response
try:
# Try to find JSON block
json_pattern = r"```json\n(.*?)\n```"
import re

artifacts = {}
avg_metrics = {}
for i, response in enumerate(responses):
json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +466,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
# Parse JSON
result = json.loads(json_str)

# Filter all non-numeric values
metrics = {
name: float(value)
for name, value in result.items()
if isinstance(value, (int, float))
}
# All non-numeric values are artifacts, all numeric values are metrics
metrics = {}
for key, value in result.items():
if not isinstance(value, (int, float)):
artifacts[key] = value
else:
metrics[key] = float(value)

# Weight of the model in the ensemble
weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +484,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
else:
avg_metrics[name] = value * weight

return avg_metrics
return EvaluationResult(
metrics=avg_metrics,
artifacts=artifacts,
)

except Exception as e:
logger.warning(f"Error parsing LLM response: {str(e)}")