From d795ac496f61645e37e89195e379d3c4264f25be Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 10:33:59 +0800
Subject: [PATCH 1/4] fixes

---
 examples/function_minimization/config.yaml  |  36 +++---
 examples/function_minimization/evaluator.py | 128 +++++++++++---------
 openevolve/config.py                        |  43 ++++---
 openevolve/process_parallel.py              |  24 +++-
 4 files changed, 128 insertions(+), 103 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index 2ea2d1a28..d9cc2dfbc 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -1,27 +1,23 @@
 # Configuration for function minimization example
-max_iterations: 100
-checkpoint_interval: 10
-log_level: "INFO"
+max_iterations: 50
+checkpoint_interval: 5
 
 # LLM configuration
 llm:
-  # primary_model: "gemini-2.0-flash-lite"
-  primary_model: "llama3.1-8b"
-  primary_model_weight: 0.8
-  # secondary_model: "gemini-2.0-flash"
-  secondary_model: "llama-4-scout-17b-16e-instruct"
-  secondary_model_weight: 0.2
-  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
-  api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.7
-  top_p: 0.95
-  max_tokens: 4096
+  primary_model: "gemini-2.5-flash-lite"
+  # primary_model: "llama3.1-8b"
+  primary_model_weight: 0.9
+  secondary_model: "gemini-2.5-flash"
+  # secondary_model: "llama-4-scout-17b-16e-instruct"
+  secondary_model_weight: 0.1
+  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://api.cerebras.ai/v1"
+  temperature: 0.4
+  max_tokens: 4000
 
 # Prompt configuration
 prompt:
   system_message: "You are an expert programmer specializing in optimization algorithms. Your task is to improve a function minimization algorithm to find the global minimum of a complex function with many local minima. The function is f(x, y) = sin(x) * cos(y) + sin(x*y) + (x^2 + y^2)/20. Focus on improving the search_algorithm function to reliably find the global minimum, escaping local minima that might trap simple algorithms."
-  num_top_programs: 3
-  use_template_stochasticity: true
 
 # Database configuration
 database:
@@ -34,11 +30,9 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_evaluation: true
   cascade_thresholds: [0.5, 0.75]
-  parallel_evaluations: 4
-  use_llm_feedback: false
+  parallel_evaluations: 3
 
 # Evolution settings
-diff_based_evolution: true
-allow_full_rewrites: false
+diff_based_evolution: false
+max_code_length: 20000
\ No newline at end of file
diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index 806c97c85..d34a758dd 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -69,7 +69,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "Missing run_search function",
             }
@@ -162,7 +162,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "All trials failed",
             }
@@ -173,57 +173,32 @@ def evaluate(program_path):
         avg_time = float(np.mean(times)) if times else 1.0
 
         # Convert to scores (higher is better)
-        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))  # Normalize and invert
+        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))
         distance_score = float(1.0 / (1.0 + avg_distance))
-        speed_score = float(1.0 / avg_time) if avg_time > 0 else 0.0
-
-        # calculate standard deviation scores
-        # get x_std_score
-        x_std_score = float(1.0 / (1.0 + np.std(x_values)))
-        # get y_std_score
-        y_std_score = float(1.0 / (1.0 + np.std(y_values)))
-        standard_deviation_score = (x_std_score + y_std_score) / 2.0
-
-        # Normalize speed score (so it doesn't dominate)
-        speed_score = float(min(speed_score, 10.0) / 10.0)
-
+
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)
 
-        # Calculate a single combined score that prioritizes finding good solutions
-        # over secondary metrics like speed and reliability
-        # Value and distance scores (quality of solution) get 90% of the weight
-        # Speed and reliability get only 10% combined
-        combined_score = float(
-            0.35 * value_score
-            + 0.35 * distance_score
-            + standard_deviation_score * 0.20
-            + 0.05 * speed_score
-            + 0.05 * reliability_score
-        )
-
-        # Also compute an "overall" score that will be the primary metric for selection
-        # This adds a bonus for finding solutions close to the global minimum
-        # and heavily penalizes solutions that aren't finding the right region
-        if distance_to_global < 1.0:  # Very close to the correct solution
-            solution_quality = 1.0
-        elif distance_to_global < 3.0:  # In the right region
-            solution_quality = 0.5
+        # Calculate solution quality based on distance to global minimum
+        if avg_distance < 0.5:  # Very close to the correct solution
+            solution_quality_multiplier = 1.5  # 50% bonus
+        elif avg_distance < 1.5:  # In the right region
+            solution_quality_multiplier = 1.2  # 20% bonus
+        elif avg_distance < 3.0:  # Getting closer
+            solution_quality_multiplier = 1.0  # No adjustment
         else:  # Not finding the right region
-            solution_quality = 0.1
+            solution_quality_multiplier = 0.7  # 30% penalty
 
-        # Overall score is dominated by solution quality but also factors in the combined score
-        overall_score = 0.8 * solution_quality + 0.2 * combined_score
+        # Calculate combined score that prioritizes finding the global minimum
+        # Base score from value and distance, then apply solution quality multiplier
+        base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
+        combined_score = float(base_score * solution_quality_multiplier)
 
         return {
             "value_score": value_score,
             "distance_score": distance_score,
-            "standard_deviation_score": standard_deviation_score,
-            "speed_score": speed_score,
             "reliability_score": reliability_score,
             "combined_score": combined_score,
-            "overall_score": overall_score,  # This will be the primary selection metric
-            "success_rate": reliability_score,
         }
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
@@ -231,7 +206,7 @@ def evaluate(program_path):
         print(traceback.format_exc())
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": str(e),
         }
@@ -255,7 +230,11 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {"runs_successfully": 0.0, "error": "Missing run_search function"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing run_search function"
+            }
 
         try:
             # Run a single trial with timeout
@@ -275,10 +254,18 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
 
             # Ensure all values are float
             x = safe_float(x)
@@ -295,7 +282,11 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {"runs_successfully": 0.5, "error": "Invalid result values"}
+                return {
+                    "runs_successfully": 0.5,
+                    "combined_score": 0.0,
+                    "error": "Invalid result values"
+                }
 
             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -306,38 +297,59 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))
 
-            # Calculate solution quality metric
-            if distance < 1.0:  # Very close to the correct solution
-                solution_quality = 1.0
-            elif distance < 3.0:  # In the right region
-                solution_quality = 0.5
+            # Calculate solution quality based on distance to global minimum
+            if distance < 0.5:  # Very close to the correct solution
+                solution_quality_multiplier = 1.4  # 40% bonus
+            elif distance < 1.5:  # In the right region
+                solution_quality_multiplier = 1.15  # 15% bonus
+            elif distance < 3.0:  # Getting closer
+                solution_quality_multiplier = 1.0  # No adjustment
             else:  # Not finding the right region
-                solution_quality = 0.1
+                solution_quality_multiplier = 0.8  # 20% penalty
+
+            # Calculate combined score for stage 1
+            base_score = 0.6 * value_score + 0.4 * distance_score
+            combined_score = float(base_score * solution_quality_multiplier)
 
-            # Basic metrics with overall score
             return {
                 "runs_successfully": 1.0,
                 "value_score": value_score,
                 "distance_score": distance_score,
-                "overall_score": solution_quality,  # This becomes a strong guiding metric
+                "combined_score": combined_score,
             }
         except TimeoutError as e:
print(f"Stage 1 evaluation timed out: {e}") - return {"runs_successfully": 0.0, "error": "Timeout"} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Timeout" + } except IndexError as e: # Specifically handle IndexError which often happens with early termination checks print(f"Stage 1 evaluation failed with IndexError: {e}") print("This is likely due to a list index check before the list is fully populated.") - return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": f"IndexError: {str(e)}" + } except Exception as e: print(f"Stage 1 evaluation failed: {e}") print(traceback.format_exc()) - return {"runs_successfully": 0.0, "error": str(e)} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + } except Exception as e: print(f"Stage 1 evaluation failed: {e}") print(traceback.format_exc()) - return {"runs_successfully": 0.0, "error": str(e)} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + } def evaluate_stage2(program_path): diff --git a/openevolve/config.py b/openevolve/config.py index 80c5d69c4..9d86ae5f9 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -56,12 +56,7 @@ class LLMConfig(LLMModelConfig): retry_delay: int = 5 # n-model configuration for evolution LLM ensemble - models: List[LLMModelConfig] = field( - default_factory=lambda: [ - LLMModelConfig(name="gpt-4o-mini", weight=0.8), - LLMModelConfig(name="gpt-4o", weight=0.2), - ] - ) + models: List[LLMModelConfig] = field(default_factory=list) # n-model configuration for evaluator LLM ensemble evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: []) @@ -75,24 +70,34 @@ class LLMConfig(LLMModelConfig): def __post_init__(self): """Post-initialization to set up model configurations""" # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight). - if (self.primary_model or self.primary_model_weight) and len(self.models) < 1: - # Ensure we have a primary model - self.models.append(LLMModelConfig()) if self.primary_model: - self.models[0].name = self.primary_model - if self.primary_model_weight: - self.models[0].weight = self.primary_model_weight + # Create primary model + primary_model = LLMModelConfig( + name=self.primary_model, + weight=self.primary_model_weight or 1.0 + ) + self.models.append(primary_model) - if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2: - # Ensure we have a second model - self.models.append(LLMModelConfig()) if self.secondary_model: - self.models[1].name = self.secondary_model - if self.secondary_model_weight: - self.models[1].weight = self.secondary_model_weight + # Create secondary model (only if weight > 0) + if not self.secondary_model_weight or self.secondary_model_weight > 0: + secondary_model = LLMModelConfig( + name=self.secondary_model, + weight=self.secondary_model_weight or 0.2 + ) + self.models.append(secondary_model) + + # Only validate if this looks like a user config (has some model info) + # Don't validate during internal/default initialization + if (self.primary_model or self.secondary_model or + self.primary_model_weight or self.secondary_model_weight) and not self.models: + raise ValueError( + "No LLM models configured. Please specify 'models' array or " + "'primary_model' in your configuration." 
+            )
 
         # If no evaluator models are defined, use the same models as for evolution
-        if not self.evaluator_models or len(self.evaluator_models) < 1:
+        if not self.evaluator_models:
             self.evaluator_models = self.models.copy()
 
         # Update models with shared configuration values
diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py
index 272767022..f8d4805ef 100644
--- a/openevolve/process_parallel.py
+++ b/openevolve/process_parallel.py
@@ -178,12 +178,26 @@ def _run_iteration_worker(
     iteration_start = time.time()
 
     # Generate code modification (sync wrapper for async)
-    llm_response = asyncio.run(
-        _worker_llm_ensemble.generate_with_context(
-            system_message=prompt["system"],
-            messages=[{"role": "user", "content": prompt["user"]}],
+    try:
+        llm_response = asyncio.run(
+            _worker_llm_ensemble.generate_with_context(
+                system_message=prompt["system"],
+                messages=[{"role": "user", "content": prompt["user"]}],
+            )
+        )
+    except Exception as e:
+        logger.error(f"LLM generation failed: {e}")
+        return SerializableResult(
+            error=f"LLM generation failed: {str(e)}",
+            iteration=iteration
+        )
+
+    # Check for None response
+    if llm_response is None:
+        return SerializableResult(
+            error="LLM returned None response",
+            iteration=iteration
         )
-    )
 
     # Parse response based on evolution mode
     if _worker_config.diff_based_evolution:

From fe7f5ec52c23b0aa4fa7ebca79c3b89620e75c3a Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 10:57:47 +0800
Subject: [PATCH 2/4] optimize config

---
 examples/function_minimization/config.yaml | 13 +++++++------
 openevolve/config.py                       |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index d9cc2dfbc..f403a4241 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -6,14 +6,15 @@ checkpoint_interval: 5
 llm:
   primary_model: "gemini-2.5-flash-lite"
   # primary_model: "llama3.1-8b"
-  primary_model_weight: 0.9
+  primary_model_weight: 0.8
   secondary_model: "gemini-2.5-flash"
   # secondary_model: "llama-4-scout-17b-16e-instruct"
-  secondary_model_weight: 0.1
+  secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.4
-  max_tokens: 4000
+  temperature: 0.6
+  max_tokens: 10000
+  timeout: 120
 
 # Prompt configuration
 prompt:
@@ -30,9 +31,9 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [0.5, 0.75]
+  cascade_thresholds: [1.45]
   parallel_evaluations: 3
 
 # Evolution settings
-diff_based_evolution: false
+diff_based_evolution: true
 max_code_length: 20000
\ No newline at end of file
diff --git a/openevolve/config.py b/openevolve/config.py
index 9d86ae5f9..3a40f26ff 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -80,10 +80,10 @@ def __post_init__(self):
 
         if self.secondary_model:
             # Create secondary model (only if weight > 0)
-            if not self.secondary_model_weight or self.secondary_model_weight > 0:
+            if self.secondary_model_weight is None or self.secondary_model_weight > 0:
                 secondary_model = LLMModelConfig(
                     name=self.secondary_model,
-                    weight=self.secondary_model_weight or 0.2
+                    weight=self.secondary_model_weight if self.secondary_model_weight is not None else 0.2
                 )
                 self.models.append(secondary_model)
 

From 78e153bb8a81cbddb1263dcfdce5d25a5e38ad18 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 11:17:42 +0800
Subject: [PATCH 3/4] f

---
 examples/function_minimization/config.yaml  |   6 +-
 examples/function_minimization/evaluator.py | 255 +++++++++++++++-----
 2 files changed, 198 insertions(+), 63 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index f403a4241..ff25ceef8 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -12,8 +12,8 @@ llm:
   secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.6
-  max_tokens: 10000
+  temperature: 0.7
+  max_tokens: 16000
   timeout: 120
 
 # Prompt configuration
@@ -31,7 +31,7 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [1.45]
+  cascade_thresholds: [1.4]
   parallel_evaluations: 3
 
 # Evolution settings
diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index d34a758dd..f16318125 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -8,6 +8,7 @@
 import concurrent.futures
 import traceback
 import signal
+from openevolve.evaluation_result import EvaluationResult
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -66,13 +67,23 @@ def evaluate(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Error: program does not have 'run_search' function")
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function",
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function",
+                },
+                artifacts=error_artifacts
+            )
 
         # Run multiple trials
         num_trials = 10
@@ -159,13 +170,22 @@ def evaluate(program_path):
 
         # If all trials failed, return zero scores
         if success_count == 0:
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "All trials failed",
+            error_artifacts = {
+                "error_type": "AllTrialsFailed",
+                "error_message": f"All {num_trials} trials failed - common issues: timeouts, crashes, or invalid return values",
+                "suggestion": "Check for infinite loops, ensure function returns (x, y) or (x, y, value), and verify algorithm terminates within time limit"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "All trials failed",
+                },
+                artifacts=error_artifacts
+            )
 
         # Calculate metrics
         avg_value = float(np.mean(values))
@@ -194,22 +214,45 @@ def evaluate(program_path):
         base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
         combined_score = float(base_score * solution_quality_multiplier)
 
-        return {
-            "value_score": value_score,
-            "distance_score": distance_score,
-            "reliability_score": reliability_score,
-            "combined_score": combined_score,
+        # Add artifacts for successful runs
+        artifacts = {
+            "convergence_info": f"Converged in {num_trials} trials with {success_count} successes",
+            "best_position": f"Final position: x={x_values[-1]:.4f}, y={y_values[-1]:.4f}" if x_values else "No successful trials",
+            "average_distance_to_global": f"{avg_distance:.4f}",
+            "search_efficiency": f"Success rate: {reliability_score:.2%}"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": value_score,
+                "distance_score": distance_score,
+                "reliability_score": reliability_score,
+                "combined_score": combined_score,
+            },
+            artifacts=artifacts
+        )
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
-        return {
-            "value_score": 0.0,
-            "distance_score": 0.0,
-            "reliability_score": 0.0,
-            "combined_score": 0.0,
-            "error": str(e),
+
+        # Create error artifacts
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": str(e),
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Check for syntax errors or missing imports in the generated code"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": 0.0,
+                "distance_score": 0.0,
+                "reliability_score": 0.0,
+                "combined_score": 0.0,
+                "error": str(e),
+            },
+            artifacts=error_artifacts
+        )
 
 
 # Stage-based evaluation for cascade evaluation
@@ -230,11 +273,21 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function"
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Stage 1: Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function"
+                },
+                artifacts=error_artifacts
+            )
 
         try:
             # Run a single trial with timeout
@@ -254,18 +307,38 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnFormat",
+                    "error_message": f"Stage 1: Function returned tuple with {len(result)} values, expected 2 or 3",
+                    "suggestion": "run_search() must return (x, y) or (x, y, value) - check your return statement"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnType",
+                    "error_message": f"Stage 1: Function returned {type(result)}, expected tuple",
+                    "suggestion": "run_search() must return a tuple like (x, y) or (x, y, value), not a single value or other type"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )
 
             # Ensure all values are float
             x = safe_float(x)
@@ -282,11 +355,21 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {
-                    "runs_successfully": 0.5,
-                    "combined_score": 0.0,
-                    "error": "Invalid result values"
+
+                error_artifacts = {
+                    "error_type": "InvalidResultValues",
+                    "error_message": f"Stage 1: Got invalid values - x={x}, y={y}, value={value}",
+                    "suggestion": "Function returned NaN or infinite values. Check for division by zero, invalid math operations, or uninitialized variables"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.5,
+                        "combined_score": 0.0,
+                        "error": "Invalid result values"
+                    },
+                    artifacts=error_artifacts
+                )
 
             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -311,45 +394,97 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))
 
-            return {
-                "runs_successfully": 1.0,
-                "value_score": value_score,
-                "distance_score": distance_score,
-                "combined_score": combined_score,
+            # Add artifacts for successful stage 1
+            stage1_artifacts = {
+                "stage1_result": f"Found solution at x={x:.4f}, y={y:.4f} with value={value:.4f}",
+                "distance_to_global": f"{distance:.4f}",
+                "solution_quality": f"Distance < 0.5: Very close" if distance < 0.5 else f"Distance < 1.5: Good region" if distance < 1.5 else "Could be improved"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 1.0,
+                    "value_score": value_score,
+                    "distance_score": distance_score,
+                    "combined_score": combined_score,
+                },
+                artifacts=stage1_artifacts
+            )
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Timeout"
+
+            error_artifacts = {
+                "error_type": "TimeoutError",
+                "error_message": "Stage 1: Function execution exceeded 5 second timeout",
+                "suggestion": "Function is likely stuck in infinite loop or doing too much computation. Try reducing iterations or adding early termination conditions"
            }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Timeout"
+                },
+                artifacts=error_artifacts
+            )
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": f"IndexError: {str(e)}"
+
+            error_artifacts = {
+                "error_type": "IndexError",
+                "error_message": f"Stage 1: {str(e)}",
+                "suggestion": "List index out of range - likely accessing empty list or wrong index. Check list initialization and bounds"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": f"IndexError: {str(e)}"
+                },
+                artifacts=error_artifacts
+            )
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": str(e)
+
+            error_artifacts = {
+                "error_type": type(e).__name__,
+                "error_message": f"Stage 1: {str(e)}",
+                "full_traceback": traceback.format_exc(),
+                "suggestion": "Unexpected error occurred. Check the traceback for specific issue"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": str(e)
+                },
+                artifacts=error_artifacts
+            )
     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {
-            "runs_successfully": 0.0,
-            "combined_score": 0.0,
-            "error": str(e)
+
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": f"Stage 1 outer exception: {str(e)}",
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Critical error during stage 1 evaluation. Check program syntax and imports"
         }
+
+        return EvaluationResult(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            },
+            artifacts=error_artifacts
+        )
 
 
 def evaluate_stage2(program_path):

From 0c0e0269e7b19f47ee16ddda1f406dece9b4da34 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 11:26:52 +0800
Subject: [PATCH 4/4] fix unit tests

---
 .../{dataset_config.yaml => dataset_settings.yaml} | 0
 examples/llm_prompt_optimization/evaluator.py      | 4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename examples/llm_prompt_optimization/{dataset_config.yaml => dataset_settings.yaml} (100%)

diff --git a/examples/llm_prompt_optimization/dataset_config.yaml b/examples/llm_prompt_optimization/dataset_settings.yaml
similarity index 100%
rename from examples/llm_prompt_optimization/dataset_config.yaml
rename to examples/llm_prompt_optimization/dataset_settings.yaml
diff --git a/examples/llm_prompt_optimization/evaluator.py b/examples/llm_prompt_optimization/evaluator.py
index 71bf4b316..09ffc204c 100644
--- a/examples/llm_prompt_optimization/evaluator.py
+++ b/examples/llm_prompt_optimization/evaluator.py
@@ -47,8 +47,8 @@
 if not prompt_file:
     # Default to a generic dataset config if not using the wrapper script
     evaluator_dir = os.path.dirname(os.path.abspath(__file__))
-    DATASET_CONFIG_PATH = os.path.join(evaluator_dir, "dataset_config.yaml")
-    print("Warning: OPENEVOLVE_PROMPT not set. Using default dataset_config.yaml")
+    DATASET_CONFIG_PATH = os.path.join(evaluator_dir, "dataset_settings.yaml")
+    print("Warning: OPENEVOLVE_PROMPT not set. Using default dataset_settings.yaml")
 else:
     basename = os.path.basename(prompt_file)
     dataset_filename = basename.replace("_prompt.txt", "_prompt_dataset.yaml").replace(