From 90f9b5bee98003511319e9bd6bb551b7a928b399 Mon Sep 17 00:00:00 2001
From: Julian
Date: Mon, 9 Jun 2025 22:08:39 +0200
Subject: [PATCH 1/2] _llm_evaluate returns artifacts if LLM returned string responses

---
 openevolve/evaluator.py | 44 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index e57b01224..93c8fae20 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -125,16 +125,32 @@ async def evaluate_program(
                 eval_result = self._process_evaluation_result(result)

                 # Add LLM feedback if configured
+                llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    feedback_metrics = await self._llm_evaluate(program_code)
+                    llm_result = await self._llm_evaluate(program_code)
+                    llm_eval_result = self._process_evaluation_result(llm_result)

                     # Combine metrics
-                    for name, value in feedback_metrics.items():
+                    for name, value in llm_result.metrics.items():
                         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight

                 # Store artifacts if enabled and present
-                if artifacts_enabled and eval_result.has_artifacts() and program_id:
-                    self._pending_artifacts[program_id] = eval_result.artifacts
+                if (
+                    artifacts_enabled
+                    and (
+                        eval_result.has_artifacts()
+                        or (llm_eval_result and llm_eval_result.has_artifacts())
+                    )
+                    and program_id
+                ):
+                    self._pending_artifacts[program_id] = {}
+
+                    # Merge eval_result artifacts with llm artifacts if they exist
+                    if eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+                    if llm_eval_result and llm_eval_result.has_artifacts():
+                        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)

                 elapsed = time.time() - start_time
                 logger.info(
@@ -150,6 +166,7 @@
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
+                traceback.print_exc()

                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
@@ -396,6 +413,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             json_pattern = r"```json\n(.*?)\n```"
             import re

+            artifacts = {}
             avg_metrics = {}
             for i, response in enumerate(responses):
                 json_match = re.search(json_pattern, response, re.DOTALL)
@@ -414,12 +432,13 @@
                 # Parse JSON
                 result = json.loads(json_str)

-                # Filter all non-numeric values
-                metrics = {
-                    name: float(value)
-                    for name, value in result.items()
-                    if isinstance(value, (int, float))
-                }
+                # All non-numeric values are artifacts, all numeric values are metrics
+                metrics = {}
+                for key, value in result.items():
+                    if not isinstance(value, (int, float)):
+                        artifacts[key] = value
+                    else:
+                        metrics[key] = float(value)

                 # Weight of the model in the ensemble
                 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -431,7 +450,10 @@
                     else:
                         avg_metrics[name] = value * weight

-            return avg_metrics
+            return EvaluationResult(
+                metrics=avg_metrics,
+                artifacts=artifacts,
+            )

         except Exception as e:
             logger.warning(f"Error parsing LLM response: {str(e)}")

From 0fe928f35f3634dc774a37fface880cf9929f26c Mon Sep 17 00:00:00 2001
From: Julian
Date: Sat, 14 Jun 2025 20:19:52 +0200
Subject: [PATCH 2/2] README formatting fix, added LLM feedback description

---
 README.md | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index beb8b58c5..0637dd37b 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op

 ## Artifacts Channel

-OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.

 The artifacts channel operates alongside the traditional fitness metrics.

@@ -205,17 +205,28 @@ return EvaluationResult(
 ```

 The next generation prompt will include:
-```
+```markdown
 ## Last Execution Output

 ### Stderr
-```
 SyntaxError: invalid syntax (line 15)
-```
+
 ### Traceback
-```
 ...
 ```
+
+## Example: LLM Feedback
+
+An example of an LLM artifact side-channel is part of the default evaluation prompt template, which ends with:
+```markdown
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
 ```
+The non-numeric values, in this case the "reasoning" key of the JSON response generated by the evaluator LLM, are made available in the next generation prompt.

 ### Configuration
@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false

 ### Benefits

 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities 
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
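
Note: the snippet below is a minimal, self-contained sketch of the parsing behaviour PATCH 1/2 introduces in `_llm_evaluate`: numeric values in the LLM's JSON reply become weighted metrics, while non-numeric values such as the "reasoning" string become artifacts that can be surfaced in the next generation prompt. `EvalResult` and `split_llm_feedback` are illustrative stand-ins for this sketch, not openevolve APIs.

```python
# Sketch of the metrics/artifacts split from PATCH 1/2 (stand-in types, not openevolve's).
import json
import re
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class EvalResult:
    # Minimal stand-in for an evaluation result with separate metrics and artifacts.
    metrics: Dict[str, float] = field(default_factory=dict)
    artifacts: Dict[str, Any] = field(default_factory=dict)


def split_llm_feedback(response: str, weight: float = 1.0) -> EvalResult:
    """Parse one LLM feedback response: numeric JSON values become weighted
    metrics, everything else (e.g. a "reasoning" string) becomes an artifact."""
    match = re.search(r"```json\n(.*?)\n```", response, re.DOTALL)
    data = json.loads(match.group(1) if match else response)

    result = EvalResult()
    for key, value in data.items():
        if isinstance(value, (int, float)):
            result.metrics[key] = float(value) * weight
        else:
            result.artifacts[key] = value
    return result


if __name__ == "__main__":
    reply = '```json\n{"readability": 0.8, "efficiency": 0.6, "reasoning": "clear loops, but O(n^2)"}\n```'
    print(split_llm_feedback(reply))
```

Running the example prints an `EvalResult` whose `metrics` hold the two scores and whose `artifacts` carry the reasoning string, mirroring how the patched evaluator merges LLM artifacts into its pending-artifacts store.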