diff --git a/README.md b/README.md
index 0637dd37b..beb8b58c5 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,28 +205,17 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```markdown
+```
 ## Last Execution Output
 ### Stderr
+```
 SyntaxError: invalid syntax (line 15)
-
+```
 ### Traceback
+```
 ...
 ```
-
-## Example: LLM Feedback
-
-An example for an LLM artifact side channel is part of the default evaluation prompt template, which ends with
-```markdown
-Return your evaluation as a JSON object with the following format:
-{{
-    "readability": [score],
-    "maintainability": [score],
-    "efficiency": [score],
-    "reasoning": "[brief explanation of scores]"
-}}
 ```
-The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -251,7 +240,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index d43f39499..1a1d83c59 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -131,32 +131,16 @@ async def evaluate_program(
                 eval_result = self._process_evaluation_result(result)
 
                 # Add LLM feedback if configured
-                llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    llm_result = await self._llm_evaluate(program_code)
-                    llm_eval_result = self._process_evaluation_result(llm_result)
+                    feedback_metrics = await self._llm_evaluate(program_code)
 
                     # Combine metrics
-                    for name, value in llm_result.metrics.items():
+                    for name, value in feedback_metrics.items():
                         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
                 # Store artifacts if enabled and present
-                if (
-                    artifacts_enabled
-                    and (
-                        eval_result.has_artifacts()
-                        or (llm_eval_result and llm_eval_result.has_artifacts())
-                    )
-                    and program_id
-                ):
-                    self._pending_artifacts[program_id] = {}
-
-                    # Merge eval_result artifacts with llm artifacts if they exist
-                    if eval_result.has_artifacts():
-                        self._pending_artifacts[program_id].update(eval_result.artifacts)
-
-                    if llm_eval_result and llm_eval_result.has_artifacts():
-                        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+                if artifacts_enabled and eval_result.has_artifacts() and program_id:
+                    self._pending_artifacts[program_id] = eval_result.artifacts
 
                 elapsed = time.time() - start_time
                 logger.info(
@@ -172,7 +156,6 @@ async def evaluate_program(
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
-                traceback.print_exc()
 
                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
@@ -425,7 +408,6 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             json_pattern = r"```json\n(.*?)\n```"
             import re
 
-            artifacts = {}
             avg_metrics = {}
             for i, response in enumerate(responses):
                 json_match = re.search(json_pattern, response, re.DOTALL)
@@ -444,13 +426,12 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                     # Parse JSON
                     result = json.loads(json_str)
 
-                    # All non-numeric values are artifacts, all numeric values are metrics
-                    metrics = {}
-                    for key, value in result.items():
-                        if not isinstance(value, (int, float)):
-                            artifacts[key] = value
-                        else:
-                            metrics[key] = float(value)
+                    # Filter all non-numeric values
+                    metrics = {
+                        name: float(value)
+                        for name, value in result.items()
+                        if isinstance(value, (int, float))
+                    }
 
                     # Weight of the model in the ensemble
                     weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -462,10 +443,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                         else:
                             avg_metrics[name] = value * weight
 
-                    return EvaluationResult(
-                        metrics=avg_metrics,
-                        artifacts=artifacts,
-                    )
+                    return avg_metrics
 
         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")