diff --git a/Makefile b/Makefile
index 80d73d0cc..cb52e8e19 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ help:
 	@echo " test - Run tests"
 	@echo " docker-build - Build the Docker image"
 	@echo " docker-run - Run the Docker container with the example"
+	@echo " visualizer - Run the visualization script"
 
 .PHONY: all
 all: install test
diff --git a/README.md b/README.md
index beb8b58c5..ab7f46ff5 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,17 +205,28 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```
+```markdown
 ## Last Execution Output
 
 ### Stderr
-```
 SyntaxError: invalid syntax (line 15)
-```
+
 ### Traceback
-```
 ...
 ```
+
+## Example: LLM Feedback
+
+An example of the LLM artifact side-channel is part of the default evaluation template, which ends with:
+```markdown
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
 ```
+Non-float values, in this case the "reasoning" key of the JSON response generated by the evaluator LLM, will be made available in the next generation prompt.
 
 ### Configuration
@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
 
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities 
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
diff --git a/configs/default_config.yaml b/configs/default_config.yaml
index 22f086b81..26168f84c 100644
--- a/configs/default_config.yaml
+++ b/configs/default_config.yaml
@@ -71,6 +71,7 @@ database:
   # General settings
   db_path: null # Path to persist database (null = in-memory only)
   in_memory: true # Keep database in memory for faster access
+  log_prompts: true # If true, log all prompts and responses into the database
 
   # Evolutionary parameters
   population_size: 1000 # Maximum number of programs to keep in memory
diff --git a/openevolve/config.py b/openevolve/config.py
index f8f1d8d77..56aff0415 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -142,6 +142,9 @@ class DatabaseConfig:
     db_path: Optional[str] = None  # Path to store database on disk
     in_memory: bool = True
 
+    # Prompt and response logging to programs/<program_id>.json
+    log_prompts: bool = True
+
     # Evolutionary parameters
     population_size: int = 1000
     archive_size: int = 100
@@ -308,6 +311,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "migration_interval": self.database.migration_interval,
                 "migration_rate": self.database.migration_rate,
                 "random_seed": self.database.random_seed,
+                "log_prompts": self.database.log_prompts,
             },
             "evaluator": {
                 "timeout": self.evaluator.timeout,
diff --git a/openevolve/controller.py b/openevolve/controller.py
index 670f3eb0d..1656f557b 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -10,6 +10,7 @@
 import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import Config, load_config
 from openevolve.database import Program, ProgramDatabase
@@ -142,6 +143,7 @@ def __init__(
             evaluation_file,
             self.llm_evaluator_ensemble,
             self.evaluator_prompt_sampler,
+            database=self.database,
         )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
@@ -335,10 +337,20 @@ async def run(
                 # Add to database (will be added to current island)
                 self.database.add(child_program, iteration=i + 1)
 
+                # Log prompts
+                self.database.log_prompt(
+                    template_key=(
+                        "full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
+                    ),
+                    program_id=child_id,
+                    prompt=prompt,
+                    responses=[llm_response],
+                )
+
                 # Store artifacts if they exist
                 if artifacts:
                     self.database.store_artifacts(child_id, artifacts)
 
                 # Increment generation for current island
                 self.database.increment_island_generation()
@@ -384,6 +396,7 @@ async def run(
 
             except Exception as e:
                 logger.error(f"Error in iteration {i+1}: {str(e)}")
+                traceback.print_exc()
                 continue
 
         # Get the best program using our tracking mechanism
diff --git a/openevolve/database.py b/openevolve/database.py
index fe7a8d5be..d7526f729 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -118,6 +118,9 @@ def __init__(self, config: DatabaseConfig):
         if config.db_path and os.path.exists(config.db_path):
             self.load(config.db_path)
 
+        # Prompt log
+        self.prompts_by_program: Optional[Dict[str, Dict[str, Dict[str, str]]]] = None
+
         # Set random seed for reproducible sampling if specified
         if config.random_seed is not None:
             import random
@@ -328,7 +331,14 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None:
 
         # Save each program
         for program in self.programs.values():
-            self._save_program(program, save_path)
+            prompts = None
+            if (
+                self.config.log_prompts
+                and self.prompts_by_program
+                and program.id in self.prompts_by_program
+            ):
+                prompts = self.prompts_by_program[program.id]
+            self._save_program(program, save_path, prompts=prompts)
 
         # Save metadata
         metadata = {
@@ -489,13 +499,19 @@ def _distribute_programs_to_islands(self) -> None:
 
         logger.info(f"Distributed {len(program_ids)} programs across {len(self.islands)} islands")
 
-    def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
+    def _save_program(
+        self,
+        program: Program,
+        base_path: Optional[str] = None,
+        prompts: Optional[Dict[str, Dict[str, str]]] = None,
+    ) -> None:
         """
         Save a program to disk
 
         Args:
             program: Program to save
             base_path: Base path to save to (uses config.db_path if None)
+            prompts: Optional prompts to save with the program, in the format {template_key: { 'system': str, 'user': str }}
         """
         save_path = base_path or self.config.db_path
         if not save_path:
@@ -506,9 +522,13 @@ def _save_program(self, program: Program, base_path: Optional[str] = None) -> No
         os.makedirs(programs_dir, exist_ok=True)
 
         # Save program
+        program_dict = program.to_dict()
+        if prompts:
+            program_dict["prompts"] = prompts
         program_path = os.path.join(programs_dir, f"{program.id}.json")
+
         with open(program_path, "w") as f:
-            json.dump(program.to_dict(), f)
+            json.dump(program_dict, f)
 
     def _calculate_feature_coords(self, program: Program) -> List[int]:
         """
@@ -1288,3 +1308,35 @@ def _load_artifact_dir(self, artifact_dir: str) -> Dict[str, Union[str, bytes]]:
             logger.warning(f"Failed to list artifact directory {artifact_dir}: {e}")
 
         return artifacts
+
+    def log_prompt(
+        self,
+        program_id: str,
+        template_key: str,
+        prompt: Dict[str, str],
+        responses: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Log a prompt for a program.
+        Only logs if self.config.log_prompts is True.
+
+        Args:
+            program_id: ID of the program to log the prompt for
+            template_key: Key for the prompt template
+            prompt: Prompt in the format { 'system': str, 'user': str }.
+            responses: Optional list of responses to the prompt, if available.
+        """
+
+        if not self.config.log_prompts:
+            return
+
+        if responses is None:
+            responses = []
+        prompt["responses"] = responses
+
+        if self.prompts_by_program is None:
+            self.prompts_by_program = {}
+
+        if program_id not in self.prompts_by_program:
+            self.prompts_by_program[program_id] = {}
+        self.prompts_by_program[program_id][template_key] = prompt
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index 1a1d83c59..1af482bb2 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -18,7 +18,8 @@
 import traceback
 
 from openevolve.config import EvaluatorConfig
+from openevolve.database import ProgramDatabase
 from openevolve.evaluation_result import EvaluationResult
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
 from openevolve.prompt.sampler import PromptSampler
@@ -41,11 +42,13 @@ def __init__(
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
         prompt_sampler: Optional[PromptSampler] = None,
+        database: Optional[ProgramDatabase] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
         self.prompt_sampler = prompt_sampler
+        self.database = database
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -131,16 +134,40 @@ async def evaluate_program(
                 eval_result = self._process_evaluation_result(result)
 
                 # Add LLM feedback if configured
+                llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    feedback_metrics = await self._llm_evaluate(program_code)
+                    llm_result = await self._llm_evaluate(program_code, program_id=program_id)
+                    llm_eval_result = self._process_evaluation_result(llm_result)
 
                     # Combine metrics
-                    for name, value in feedback_metrics.items():
+                    for name, value in llm_result.metrics.items():
                         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
                 # Store artifacts if enabled and present
-                if artifacts_enabled and eval_result.has_artifacts() and program_id:
-                    self._pending_artifacts[program_id] = eval_result.artifacts
+                if (
+                    artifacts_enabled
+                    and (
+                        eval_result.has_artifacts()
+                        or (llm_eval_result and llm_eval_result.has_artifacts())
+                    )
+                    and program_id
+                ):
+                    self._pending_artifacts[program_id] = {}
+
+                    # Merge eval_result artifacts with llm artifacts if they exist
+                    if eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(eval_result.artifacts)
+                        logger.debug(
+                            f"Program{program_id_str} returned artifacts: "
+                            f"{eval_result.artifacts}"
+                        )
+
+                    if llm_eval_result and llm_eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+                        logger.debug(
+                            f"Program{program_id_str} returned LLM artifacts: "
+                            f"{llm_eval_result.artifacts}"
+                        )
 
                 elapsed = time.time() - start_time
                 logger.info(
@@ -156,6 +183,7 @@ async def evaluate_program(
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
+                traceback.print_exc()
 
                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
@@ -378,12 +406,13 @@ async def _cascade_evaluate(
                 },
             )
 
-    async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
+    async def _llm_evaluate(self, program_code: str, program_id: str = "") -> EvaluationResult:
         """
         Use LLM to evaluate code quality
 
         Args:
             program_code: Code to evaluate
+            program_id: Optional ID for logging
 
         Returns:
             Dictionary of metric name to score
@@ -402,12 +431,22 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 prompt["system"], [{"role": "user", "content": prompt["user"]}]
             )
 
+            # Log prompt and response to database
+            if self.database and program_id:
+                self.database.log_prompt(
+                    program_id=program_id,
+                    template_key="evaluation",
+                    prompt=prompt,
+                    responses=responses,
+                )
+
             # Extract JSON from response
             try:
                 # Try to find JSON block
                 json_pattern = r"```json\n(.*?)\n```"
                 import re
 
+                artifacts = {}
                 avg_metrics = {}
                 for i, response in enumerate(responses):
                     json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +465,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     # Parse JSON
                     result = json.loads(json_str)
 
-                    # Filter all non-numeric values
-                    metrics = {
-                        name: float(value)
-                        for name, value in result.items()
-                        if isinstance(value, (int, float))
-                    }
+                    # All non-numeric values are artifacts, all numeric values are metrics
+                    metrics = {}
+                    for key, value in result.items():
+                        if not isinstance(value, (int, float)):
+                            artifacts[key] = value
+                        else:
+                            metrics[key] = float(value)
 
                     # Weight of the model in the ensemble
                     weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +483,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     else:
                         avg_metrics[name] = value * weight
 
-            return avg_metrics
+            return EvaluationResult(
+                metrics=avg_metrics,
+                artifacts=artifacts,
+            )
 
         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")
diff --git a/scripts/static/js/graph.js b/scripts/static/js/graph.js
index 116ab09ba..de82e1103 100644
--- a/scripts/static/js/graph.js
+++ b/scripts/static/js/graph.js
@@ -121,6 +121,40 @@ Object.defineProperty(window, 'g', {
     set: function(val) { g = val; }
 });
 
+// Recenter Button Overlay
+function showRecenterButton(onClick) {
+    let btn = document.getElementById('graph-recenter-btn');
+    if (!btn) {
+        btn = document.createElement('button');
+        btn.id = 'graph-recenter-btn';
+        btn.textContent = 'Recenter';
+        btn.style.position = 'absolute';
+        btn.style.left = '50%';
+        btn.style.top = '50%';
+        btn.style.transform = 'translate(-50%, -50%)';
+        btn.style.zIndex = 1000;
+        btn.style.fontSize = '2em';
+        btn.style.padding = '0.5em 1.5em';
+        btn.style.background = '#fff';
+        btn.style.border = '2px solid #2196f3';
+        btn.style.borderRadius = '12px';
+        btn.style.boxShadow = '0 2px 16px #0002';
+        btn.style.cursor = 'pointer';
+        btn.style.display = 'block';
+        document.getElementById('graph').appendChild(btn);
+    }
+    btn.style.display = 'block';
+    btn.onclick = function() {
+        btn.style.display = 'none';
+        if (typeof onClick === 'function') onClick();
+    };
+}
+
+function hideRecenterButton() {
+    const btn = document.getElementById('graph-recenter-btn');
+    if (btn) btn.style.display = 'none';
+}
+
 function ensureGraphSvg() {
     // Get latest width/height from state.js
     let svgEl = d3.select('#graph').select('svg');
@@ -264,6 +298,35 @@ function renderGraph(data, options = {}) {
         .scaleExtent([0.2, 10])
         .on('zoom', function(event) {
             g.attr('transform', event.transform);
+            // Check if all content is out of view
+            setTimeout(() => {
+                try {
+                    const svgRect = svg.node().getBoundingClientRect();
+                    const allCircles = g.selectAll('circle').nodes();
+                    if (allCircles.length === 0) { hideRecenterButton(); return; }
+                    let anyVisible = false;
+                    for (const c of allCircles) {
+                        const bbox = c.getBoundingClientRect();
+                        if (
+                            bbox.right > svgRect.left &&
+                            bbox.left < svgRect.right &&
+                            bbox.bottom > svgRect.top &&
+                            bbox.top < svgRect.bottom
+                        ) {
+                            anyVisible = true;
+                            break;
+                        }
+                    }
+                    if (!anyVisible) {
+                        showRecenterButton(() => {
+                            // Reset zoom/pan
+                            svg.transition().duration(400).call(zoomBehavior.transform, d3.zoomIdentity);
+                        });
+                    } else {
+                        hideRecenterButton();
+                    }
+                } catch {}
+            }, 0);
         });
     svg.call(zoomBehavior);
     if (prevTransform) {
diff --git a/scripts/static/js/list.js b/scripts/static/js/list.js
index b491e5113..c3c88bbef 100644
--- a/scripts/static/js/list.js
+++ b/scripts/static/js/list.js
@@ -61,11 +61,22 @@ export function renderNodeList(nodes) {
     `;
     container.innerHTML = '';
 
-    filtered.forEach(node => {
+    filtered.forEach((node, idx) => {
         const row = document.createElement('div');
         row.className = 'node-list-item' + (selectedProgramId === node.id ? ' selected' : '') + (highlightIds.has(node.id) ? ' highlighted' : '');
         row.setAttribute('data-node-id', node.id);
         row.tabIndex = 0;
+
+        const numDiv = document.createElement('div');
+        numDiv.textContent = `#${idx + 1}`;
+        numDiv.style.fontSize = '2.2em';
+        numDiv.style.fontWeight = 'bold';
+        numDiv.style.color = '#444';
+        numDiv.style.flex = '0 0 70px';
+        numDiv.style.display = 'flex';
+        numDiv.style.alignItems = 'center';
+        numDiv.style.justifyContent = 'center';
+        row.appendChild(numDiv);
         let selectedMetricRow = '';
         if (node.metrics && metric in node.metrics) {
             let val = (typeof node.metrics[metric] === 'number' && isFinite(node.metrics[metric])) ? node.metrics[metric].toFixed(4) : node.metrics[metric];
diff --git a/scripts/static/js/main.js b/scripts/static/js/main.js
index b06961e2b..46f09abbb 100644
--- a/scripts/static/js/main.js
+++ b/scripts/static/js/main.js
@@ -83,8 +83,8 @@ function loadAndRenderData(data) {
     renderNodeList(data.nodes);
     document.getElementById('checkpoint-label').textContent = "Checkpoint: " + (data.checkpoint_dir || 'static export');
 
-    // Populate metric-select options
     const metricSelect = document.getElementById('metric-select');
+    const prevMetric = metricSelect.value || localStorage.getItem('selectedMetric') || null;
     metricSelect.innerHTML = '';
     const metrics = new Set();
     data.nodes.forEach(node => {
@@ -98,9 +98,21 @@ function loadAndRenderData(data) {
         option.textContent = metric;
         metricSelect.appendChild(option);
     });
-    if (metricSelect.options.length > 0) {
+    if (prevMetric && metrics.has(prevMetric)) {
+        metricSelect.value = prevMetric;
+    } else if (metricSelect.options.length > 0) {
         metricSelect.selectedIndex = 0;
     }
+    metricSelect.addEventListener('change', function() {
+        localStorage.setItem('selectedMetric', metricSelect.value);
+    });
+    const perfTab = document.getElementById('tab-performance');
+    const perfView = document.getElementById('view-performance');
+    if (perfTab && perfView && (perfTab.classList.contains('active') || perfView.style.display !== 'none')) {
+        if (window.updatePerformanceGraph) {
+            window.updatePerformanceGraph(data.nodes);
+        }
+    }
 }
 
 if (window.STATIC_DATA) {
diff --git a/scripts/static/js/mainUI.js b/scripts/static/js/mainUI.js
index 8f652f892..79f05944c 100644
--- a/scripts/static/js/mainUI.js
+++ b/scripts/static/js/mainUI.js
@@ -46,6 +46,12 @@ tabs.forEach(tab => {
                 showSidebarContent(window._lastSelectedNodeData || null);
             }
         }
+        // Disable page scroll for graph tabs
+        if (tab === 'branching' || tab === 'performance') {
+            document.body.style.overflow = 'hidden';
+        } else {
+            document.body.style.overflow = '';
+        }
     });
 });
 
diff --git a/scripts/static/js/performance.js b/scripts/static/js/performance.js
index a2f302946..4900e95ad 100644
--- a/scripts/static/js/performance.js
+++ b/scripts/static/js/performance.js
@@ -11,7 +11,7 @@ import { selectListNodeById } from './list.js';
     if (!toggleDiv) {
         toggleDiv = document.createElement('div');
         toggleDiv.id = 'perf-island-toggle';
-        toggleDiv.style = 'display:flex;align-items:center;gap:0.7em;';
+        toggleDiv.style = 'display:flex;align-items:center;gap:0.7em;margin-left:3em;';
         toggleDiv.innerHTML = `
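
For reviewers who want to try the artifacts side-channel documented in the README changes above, here is a minimal evaluator sketch. It assumes the `EvaluationResult(metrics=..., artifacts=...)` constructor shown in this diff and an `evaluate(program_path)` entry point in the user's evaluation file; the byte-compile check itself is only an illustration, not part of OpenEvolve.

```python
import subprocess
import sys

from openevolve.evaluation_result import EvaluationResult


def evaluate(program_path: str) -> EvaluationResult:
    # Illustrative check: try to byte-compile the candidate program and
    # forward any compiler stderr through the artifacts side-channel.
    proc = subprocess.run(
        [sys.executable, "-m", "py_compile", program_path],
        capture_output=True,
        text=True,
    )
    ok = proc.returncode == 0
    return EvaluationResult(
        metrics={"compile_ok": 1.0 if ok else 0.0},
        artifacts={} if ok else {"stderr": proc.stderr},
    )
```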
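With `database.log_prompts` enabled, the prompts logged by the controller and evaluator are written alongside the program fields in `programs/<program_id>.json`. A rough sketch of the resulting layout, with placeholder values:

```python
# Illustrative shape only; the program fields come from Program.to_dict(),
# and the "prompts" entry is what ProgramDatabase.log_prompt() accumulates.
program_json = {
    "id": "<program_id>",
    # ... other program fields ...
    "prompts": {
        "diff_user": {            # or "full_rewrite_user", depending on config
            "system": "<system prompt>",
            "user": "<user prompt>",
            "responses": ["<raw LLM response>"],
        },
        "evaluation": {           # logged by the evaluator's LLM feedback path
            "system": "<system prompt>",
            "user": "<user prompt>",
            "responses": ["<judge response>"],
        },
    },
}
```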
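The `_llm_evaluate` change routes non-numeric keys of the judge's JSON into artifacts and numeric keys into metrics. A standalone sketch of that split, outside the evaluator class (the example response string is made up):

```python
import json
import re


def split_llm_judgement(response: str) -> tuple[dict, dict]:
    """Split an LLM judge response into numeric metrics and non-numeric artifacts."""
    # Prefer a fenced ```json block if present, otherwise parse the whole response.
    match = re.search(r"```json\n(.*?)\n```", response, re.DOTALL)
    result = json.loads(match.group(1) if match else response)
    metrics, artifacts = {}, {}
    for key, value in result.items():
        if isinstance(value, (int, float)):
            metrics[key] = float(value)
        else:
            artifacts[key] = value
    return metrics, artifacts


metrics, artifacts = split_llm_judgement(
    '{"readability": 0.8, "efficiency": 0.9, "reasoning": "clear, minor duplication"}'
)
# metrics   -> {"readability": 0.8, "efficiency": 0.9}
# artifacts -> {"reasoning": "clear, minor duplication"}
```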