From d795ac496f61645e37e89195e379d3c4264f25be Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 10:33:59 +0800
Subject: [PATCH 1/4] fixes

---
 examples/function_minimization/config.yaml  |  36 +++---
 examples/function_minimization/evaluator.py | 128 +++++++++++---------
 openevolve/config.py                        |  43 ++++---
 openevolve/process_parallel.py              |  24 +++-
 4 files changed, 128 insertions(+), 103 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index 2ea2d1a28..d9cc2dfbc 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -1,27 +1,23 @@
 # Configuration for function minimization example
-max_iterations: 100
-checkpoint_interval: 10
-log_level: "INFO"
+max_iterations: 50
+checkpoint_interval: 5
 
 # LLM configuration
 llm:
-  # primary_model: "gemini-2.0-flash-lite"
-  primary_model: "llama3.1-8b"
-  primary_model_weight: 0.8
-  # secondary_model: "gemini-2.0-flash"
-  secondary_model: "llama-4-scout-17b-16e-instruct"
-  secondary_model_weight: 0.2
-  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
-  api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.7
-  top_p: 0.95
-  max_tokens: 4096
+  primary_model: "gemini-2.5-flash-lite"
+  # primary_model: "llama3.1-8b"
+  primary_model_weight: 0.9
+  secondary_model: "gemini-2.5-flash"
+  # secondary_model: "llama-4-scout-17b-16e-instruct"
+  secondary_model_weight: 0.1
+  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://api.cerebras.ai/v1"
+  temperature: 0.4
+  max_tokens: 4000
 
 # Prompt configuration
 prompt:
   system_message: "You are an expert programmer specializing in optimization algorithms. Your task is to improve a function minimization algorithm to find the global minimum of a complex function with many local minima. The function is f(x, y) = sin(x) * cos(y) + sin(x*y) + (x^2 + y^2)/20. Focus on improving the search_algorithm function to reliably find the global minimum, escaping local minima that might trap simple algorithms."
-  num_top_programs: 3
-  use_template_stochasticity: true
 
 # Database configuration
 database:
@@ -34,11 +30,9 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_evaluation: true
   cascade_thresholds: [0.5, 0.75]
-  parallel_evaluations: 4
-  use_llm_feedback: false
+  parallel_evaluations: 3
 
 # Evolution settings
-diff_based_evolution: true
-allow_full_rewrites: false
+diff_based_evolution: false
+max_code_length: 20000
\ No newline at end of file
diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index 806c97c85..d34a758dd 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -69,7 +69,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "Missing run_search function",
             }
@@ -162,7 +162,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "All trials failed",
             }
@@ -173,57 +173,32 @@ def evaluate(program_path):
         avg_time = float(np.mean(times)) if times else 1.0
 
         # Convert to scores (higher is better)
-        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))  # Normalize and invert
+        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))
         distance_score = float(1.0 / (1.0 + avg_distance))
-        speed_score = float(1.0 / avg_time) if avg_time > 0 else 0.0
-
-        # calculate standard deviation scores
-        # get x_std_score
-        x_std_score = float(1.0 / (1.0 + np.std(x_values)))
-        # get y_std_score
-        y_std_score = float(1.0 / (1.0 + np.std(y_values)))
-        standard_deviation_score = (x_std_score + y_std_score) / 2.0
-
-        # Normalize speed score (so it doesn't dominate)
-        speed_score = float(min(speed_score, 10.0) / 10.0)
-
+
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)
 
-        # Calculate a single combined score that prioritizes finding good solutions
-        # over secondary metrics like speed and reliability
-        # Value and distance scores (quality of solution) get 90% of the weight
-        # Speed and reliability get only 10% combined
-        combined_score = float(
-            0.35 * value_score
-            + 0.35 * distance_score
-            + standard_deviation_score * 0.20
-            + 0.05 * speed_score
-            + 0.05 * reliability_score
-        )
-
-        # Also compute an "overall" score that will be the primary metric for selection
-        # This adds a bonus for finding solutions close to the global minimum
-        # and heavily penalizes solutions that aren't finding the right region
-        if distance_to_global < 1.0:  # Very close to the correct solution
-            solution_quality = 1.0
-        elif distance_to_global < 3.0:  # In the right region
-            solution_quality = 0.5
+        # Calculate solution quality based on distance to global minimum
+        if avg_distance < 0.5:  # Very close to the correct solution
+            solution_quality_multiplier = 1.5  # 50% bonus
+        elif avg_distance < 1.5:  # In the right region
+            solution_quality_multiplier = 1.2  # 20% bonus
+        elif avg_distance < 3.0:  # Getting closer
+            solution_quality_multiplier = 1.0  # No adjustment
         else:  # Not finding the right region
-            solution_quality = 0.1
+            solution_quality_multiplier = 0.7  # 30% penalty
 
-        # Overall score is dominated by solution quality but also factors in the combined score
-        overall_score = 0.8 * solution_quality + 0.2 * combined_score
+        # Calculate combined score that prioritizes finding the global minimum
+        # Base score from value and distance, then apply solution quality multiplier
+        base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
+        combined_score = float(base_score * solution_quality_multiplier)
 
         return {
             "value_score": value_score,
             "distance_score": distance_score,
-            "standard_deviation_score": standard_deviation_score,
-            "speed_score": speed_score,
             "reliability_score": reliability_score,
             "combined_score": combined_score,
-            "overall_score": overall_score,  # This will be the primary selection metric
-            "success_rate": reliability_score,
         }
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
@@ -231,7 +206,7 @@ def evaluate(program_path):
         print(traceback.format_exc())
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": str(e),
         }
@@ -255,7 +230,11 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {"runs_successfully": 0.0, "error": "Missing run_search function"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing run_search function"
+            }
 
         try:
             # Run a single trial with timeout
@@ -275,10 +254,18 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
 
             # Ensure all values are float
             x = safe_float(x)
@@ -295,7 +282,11 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {"runs_successfully": 0.5, "error": "Invalid result values"}
+                return {
+                    "runs_successfully": 0.5,
+                    "combined_score": 0.0,
+                    "error": "Invalid result values"
+                }
 
             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -306,38 +297,59 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))
 
-            # Calculate solution quality metric
-            if distance < 1.0:  # Very close to the correct solution
-                solution_quality = 1.0
-            elif distance < 3.0:  # In the right region
-                solution_quality = 0.5
+            # Calculate solution quality based on distance to global minimum
+            if distance < 0.5:  # Very close to the correct solution
+                solution_quality_multiplier = 1.4  # 40% bonus
+            elif distance < 1.5:  # In the right region
+                solution_quality_multiplier = 1.15  # 15% bonus
+            elif distance < 3.0:  # Getting closer
+                solution_quality_multiplier = 1.0  # No adjustment
             else:  # Not finding the right region
-                solution_quality = 0.1
+                solution_quality_multiplier = 0.8  # 20% penalty
+
+            # Calculate combined score for stage 1
+            base_score = 0.6 * value_score + 0.4 * distance_score
+            combined_score = float(base_score * solution_quality_multiplier)
 
-            # Basic metrics with overall score
             return {
                 "runs_successfully": 1.0,
                 "value_score": value_score,
                 "distance_score": distance_score,
-                "overall_score": solution_quality,  # This becomes a strong guiding metric
+                "combined_score": combined_score,
             }
         except TimeoutError as e:
print(f"Stage 1 evaluation timed out: {e}") - return {"runs_successfully": 0.0, "error": "Timeout"} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Timeout" + } except IndexError as e: # Specifically handle IndexError which often happens with early termination checks print(f"Stage 1 evaluation failed with IndexError: {e}") print("This is likely due to a list index check before the list is fully populated.") - return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": f"IndexError: {str(e)}" + } except Exception as e: print(f"Stage 1 evaluation failed: {e}") print(traceback.format_exc()) - return {"runs_successfully": 0.0, "error": str(e)} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + } except Exception as e: print(f"Stage 1 evaluation failed: {e}") print(traceback.format_exc()) - return {"runs_successfully": 0.0, "error": str(e)} + return { + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": str(e) + } def evaluate_stage2(program_path): diff --git a/openevolve/config.py b/openevolve/config.py index 80c5d69c4..9d86ae5f9 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -56,12 +56,7 @@ class LLMConfig(LLMModelConfig): retry_delay: int = 5 # n-model configuration for evolution LLM ensemble - models: List[LLMModelConfig] = field( - default_factory=lambda: [ - LLMModelConfig(name="gpt-4o-mini", weight=0.8), - LLMModelConfig(name="gpt-4o", weight=0.2), - ] - ) + models: List[LLMModelConfig] = field(default_factory=list) # n-model configuration for evaluator LLM ensemble evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: []) @@ -75,24 +70,34 @@ class LLMConfig(LLMModelConfig): def __post_init__(self): """Post-initialization to set up model configurations""" # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight). - if (self.primary_model or self.primary_model_weight) and len(self.models) < 1: - # Ensure we have a primary model - self.models.append(LLMModelConfig()) if self.primary_model: - self.models[0].name = self.primary_model - if self.primary_model_weight: - self.models[0].weight = self.primary_model_weight + # Create primary model + primary_model = LLMModelConfig( + name=self.primary_model, + weight=self.primary_model_weight or 1.0 + ) + self.models.append(primary_model) - if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2: - # Ensure we have a second model - self.models.append(LLMModelConfig()) if self.secondary_model: - self.models[1].name = self.secondary_model - if self.secondary_model_weight: - self.models[1].weight = self.secondary_model_weight + # Create secondary model (only if weight > 0) + if not self.secondary_model_weight or self.secondary_model_weight > 0: + secondary_model = LLMModelConfig( + name=self.secondary_model, + weight=self.secondary_model_weight or 0.2 + ) + self.models.append(secondary_model) + + # Only validate if this looks like a user config (has some model info) + # Don't validate during internal/default initialization + if (self.primary_model or self.secondary_model or + self.primary_model_weight or self.secondary_model_weight) and not self.models: + raise ValueError( + "No LLM models configured. Please specify 'models' array or " + "'primary_model' in your configuration." 
+            )
 
         # If no evaluator models are defined, use the same models as for evolution
-        if not self.evaluator_models or len(self.evaluator_models) < 1:
+        if not self.evaluator_models:
             self.evaluator_models = self.models.copy()
 
         # Update models with shared configuration values
diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py
index 272767022..f8d4805ef 100644
--- a/openevolve/process_parallel.py
+++ b/openevolve/process_parallel.py
@@ -178,12 +178,26 @@ def _run_iteration_worker(
     iteration_start = time.time()
 
     # Generate code modification (sync wrapper for async)
-    llm_response = asyncio.run(
-        _worker_llm_ensemble.generate_with_context(
-            system_message=prompt["system"],
-            messages=[{"role": "user", "content": prompt["user"]}],
+    try:
+        llm_response = asyncio.run(
+            _worker_llm_ensemble.generate_with_context(
+                system_message=prompt["system"],
+                messages=[{"role": "user", "content": prompt["user"]}],
+            )
+        )
+    except Exception as e:
+        logger.error(f"LLM generation failed: {e}")
+        return SerializableResult(
+            error=f"LLM generation failed: {str(e)}",
+            iteration=iteration
+        )
+
+    # Check for None response
+    if llm_response is None:
+        return SerializableResult(
+            error="LLM returned None response",
+            iteration=iteration
         )
-    )
 
     # Parse response based on evolution mode
     if _worker_config.diff_based_evolution:

From fe7f5ec52c23b0aa4fa7ebca79c3b89620e75c3a Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 10:57:47 +0800
Subject: [PATCH 2/4] optimize config

---
 examples/function_minimization/config.yaml | 13 +++++++------
 openevolve/config.py                       |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index d9cc2dfbc..f403a4241 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -6,14 +6,15 @@ checkpoint_interval: 5
 llm:
   primary_model: "gemini-2.5-flash-lite"
   # primary_model: "llama3.1-8b"
-  primary_model_weight: 0.9
+  primary_model_weight: 0.8
   secondary_model: "gemini-2.5-flash"
   # secondary_model: "llama-4-scout-17b-16e-instruct"
-  secondary_model_weight: 0.1
+  secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.4
-  max_tokens: 4000
+  temperature: 0.6
+  max_tokens: 10000
+  timeout: 120
 
 # Prompt configuration
 prompt:
@@ -30,9 +31,9 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [0.5, 0.75]
+  cascade_thresholds: [1.45]
   parallel_evaluations: 3
 
 # Evolution settings
-diff_based_evolution: false
+diff_based_evolution: true
 max_code_length: 20000
\ No newline at end of file
diff --git a/openevolve/config.py b/openevolve/config.py
index 9d86ae5f9..3a40f26ff 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -80,10 +80,10 @@ def __post_init__(self):
 
         if self.secondary_model:
             # Create secondary model (only if weight > 0)
-            if not self.secondary_model_weight or self.secondary_model_weight > 0:
+            if self.secondary_model_weight is None or self.secondary_model_weight > 0:
                 secondary_model = LLMModelConfig(
                     name=self.secondary_model,
-                    weight=self.secondary_model_weight or 0.2
+                    weight=self.secondary_model_weight if self.secondary_model_weight is not None else 0.2
                 )
                 self.models.append(secondary_model)
 

From 78e153bb8a81cbddb1263dcfdce5d25a5e38ad18 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 11:17:42 +0800
Subject: [PATCH 3/4] f

---
 examples/function_minimization/config.yaml  |   6 +-
 examples/function_minimization/evaluator.py | 255 +++++++++++++++-----
 2 files changed, 198 insertions(+), 63 deletions(-)

diff --git a/examples/function_minimization/config.yaml b/examples/function_minimization/config.yaml
index f403a4241..ff25ceef8 100644
--- a/examples/function_minimization/config.yaml
+++ b/examples/function_minimization/config.yaml
@@ -12,8 +12,8 @@ llm:
   secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.6
-  max_tokens: 10000
+  temperature: 0.7
+  max_tokens: 16000
   timeout: 120
 
 # Prompt configuration
@@ -31,7 +31,7 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [1.45]
+  cascade_thresholds: [1.4]
   parallel_evaluations: 3
 
 # Evolution settings
diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index d34a758dd..f16318125 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -8,6 +8,7 @@
 import concurrent.futures
 import traceback
 import signal
+from openevolve.evaluation_result import EvaluationResult
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -66,13 +67,23 @@ def evaluate(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Error: program does not have 'run_search' function")
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function",
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function",
+                },
+                artifacts=error_artifacts
+            )
 
         # Run multiple trials
         num_trials = 10
@@ -159,13 +170,22 @@ def evaluate(program_path):
 
         # If all trials failed, return zero scores
         if success_count == 0:
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "All trials failed",
+            error_artifacts = {
+                "error_type": "AllTrialsFailed",
+                "error_message": f"All {num_trials} trials failed - common issues: timeouts, crashes, or invalid return values",
+                "suggestion": "Check for infinite loops, ensure function returns (x, y) or (x, y, value), and verify algorithm terminates within time limit"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "All trials failed",
+                },
+                artifacts=error_artifacts
+            )
 
         # Calculate metrics
         avg_value = float(np.mean(values))
@@ -194,22 +214,45 @@ def evaluate(program_path):
         base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
         combined_score = float(base_score * solution_quality_multiplier)
 
-        return {
-            "value_score": value_score,
-            "distance_score": distance_score,
-            "reliability_score": reliability_score,
-            "combined_score": combined_score,
+        # Add artifacts for successful runs
+        artifacts = {
+            "convergence_info": f"Converged in {num_trials} trials with {success_count} successes",
+            "best_position": f"Final position: x={x_values[-1]:.4f}, y={y_values[-1]:.4f}" if x_values else "No successful trials",
+            "average_distance_to_global": f"{avg_distance:.4f}",
+            "search_efficiency": f"Success rate: {reliability_score:.2%}"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": value_score,
+                "distance_score": distance_score,
+                "reliability_score": reliability_score,
+                "combined_score": combined_score,
+            },
+            artifacts=artifacts
+        )
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
-        return {
-            "value_score": 0.0,
-            "distance_score": 0.0,
-            "reliability_score": 0.0,
-            "combined_score": 0.0,
-            "error": str(e),
+
+        # Create error artifacts
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": str(e),
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Check for syntax errors or missing imports in the generated code"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": 0.0,
+                "distance_score": 0.0,
+                "reliability_score": 0.0,
+                "combined_score": 0.0,
+                "error": str(e),
+            },
+            artifacts=error_artifacts
+        )
 
 
 # Stage-based evaluation for cascade evaluation
@@ -230,11 +273,21 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function"
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Stage 1: Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function"
+                },
+                artifacts=error_artifacts
+            )
 
         try:
             # Run a single trial with timeout
@@ -254,18 +307,38 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnFormat",
+                    "error_message": f"Stage 1: Function returned tuple with {len(result)} values, expected 2 or 3",
+                    "suggestion": "run_search() must return (x, y) or (x, y, value) - check your return statement"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnType",
+                    "error_message": f"Stage 1: Function returned {type(result)}, expected tuple",
+                    "suggestion": "run_search() must return a tuple like (x, y) or (x, y, value), not a single value or other type"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )
 
             # Ensure all values are float
             x = safe_float(x)
@@ -282,11 +355,21 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {
-                    "runs_successfully": 0.5,
-                    "combined_score": 0.0,
-                    "error": "Invalid result values"
+
+                error_artifacts = {
+                    "error_type": "InvalidResultValues",
+                    "error_message": f"Stage 1: Got invalid values - x={x}, y={y}, value={value}",
+                    "suggestion": "Function returned NaN or infinite values. Check for division by zero, invalid math operations, or uninitialized variables"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.5,
+                        "combined_score": 0.0,
+                        "error": "Invalid result values"
+                    },
+                    artifacts=error_artifacts
+                )
 
             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -311,45 +394,97 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))
 
-            return {
-                "runs_successfully": 1.0,
-                "value_score": value_score,
-                "distance_score": distance_score,
-                "combined_score": combined_score,
+            # Add artifacts for successful stage 1
+            stage1_artifacts = {
+                "stage1_result": f"Found solution at x={x:.4f}, y={y:.4f} with value={value:.4f}",
+                "distance_to_global": f"{distance:.4f}",
+                "solution_quality": f"Distance < 0.5: Very close" if distance < 0.5 else f"Distance < 1.5: Good region" if distance < 1.5 else "Could be improved"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 1.0,
+                    "value_score": value_score,
+                    "distance_score": distance_score,
+                    "combined_score": combined_score,
+                },
+                artifacts=stage1_artifacts
+            )
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Timeout"
+
+            error_artifacts = {
+                "error_type": "TimeoutError",
+                "error_message": "Stage 1: Function execution exceeded 5 second timeout",
+                "suggestion": "Function is likely stuck in infinite loop or doing too much computation. Try reducing iterations or adding early termination conditions"
            }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Timeout"
+                },
+                artifacts=error_artifacts
+            )
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": f"IndexError: {str(e)}"
+
+            error_artifacts = {
+                "error_type": "IndexError",
+                "error_message": f"Stage 1: {str(e)}",
+                "suggestion": "List index out of range - likely accessing empty list or wrong index. Check list initialization and bounds"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": f"IndexError: {str(e)}"
+                },
+                artifacts=error_artifacts
+            )
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": str(e)
+
+            error_artifacts = {
+                "error_type": type(e).__name__,
+                "error_message": f"Stage 1: {str(e)}",
+                "full_traceback": traceback.format_exc(),
+                "suggestion": "Unexpected error occurred. Check the traceback for specific issue"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": str(e)
+                },
+                artifacts=error_artifacts
+            )
     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {
-            "runs_successfully": 0.0,
-            "combined_score": 0.0,
-            "error": str(e)
+
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": f"Stage 1 outer exception: {str(e)}",
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Critical error during stage 1 evaluation. Check program syntax and imports"
         }
+
+        return EvaluationResult(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            },
+            artifacts=error_artifacts
+        )
 
 
 def evaluate_stage2(program_path):

From 0c0e0269e7b19f47ee16ddda1f406dece9b4da34 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Fri, 22 Aug 2025 11:26:52 +0800
Subject: [PATCH 4/4] fix unit tests

---
 .../{dataset_config.yaml => dataset_settings.yaml} | 0
 examples/llm_prompt_optimization/evaluator.py      | 4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename examples/llm_prompt_optimization/{dataset_config.yaml => dataset_settings.yaml} (100%)

diff --git a/examples/llm_prompt_optimization/dataset_config.yaml b/examples/llm_prompt_optimization/dataset_settings.yaml
similarity index 100%
rename from examples/llm_prompt_optimization/dataset_config.yaml
rename to examples/llm_prompt_optimization/dataset_settings.yaml
diff --git a/examples/llm_prompt_optimization/evaluator.py b/examples/llm_prompt_optimization/evaluator.py
index 71bf4b316..09ffc204c 100644
--- a/examples/llm_prompt_optimization/evaluator.py
+++ b/examples/llm_prompt_optimization/evaluator.py
@@ -47,8 +47,8 @@
 if not prompt_file:
     # Default to a generic dataset config if not using the wrapper script
     evaluator_dir = os.path.dirname(os.path.abspath(__file__))
-    DATASET_CONFIG_PATH = os.path.join(evaluator_dir, "dataset_config.yaml")
-    print("Warning: OPENEVOLVE_PROMPT not set. Using default dataset_config.yaml")
+    DATASET_CONFIG_PATH = os.path.join(evaluator_dir, "dataset_settings.yaml")
+    print("Warning: OPENEVOLVE_PROMPT not set. Using default dataset_settings.yaml")
 else:
     basename = os.path.basename(prompt_file)
     dataset_filename = basename.replace("_prompt.txt", "_prompt_dataset.yaml").replace(