From ed2de61c5dc9468fc897cb908a3a4eafe4a9ed24 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Wed, 4 Jun 2025 12:29:08 +0800
Subject: [PATCH 1/2] fixes

---
 examples/function_minimization/evaluator.py  |  77 +++++++------
 .../function_minimization/requirements.txt   |   1 +
 openevolve/controller.py                     |  22 ++--
 openevolve/database.py                       |  25 +++--
 openevolve/evaluator.py                      |   3 +-
 openevolve/prompt/sampler.py                 | 103 +++++++++++++-----
 openevolve/utils/__init__.py                 |  12 ++
 openevolve/utils/format_utils.py             |  65 +++++++++++
 openevolve/utils/metrics_utils.py            |  66 +++++++++++
 9 files changed, 285 insertions(+), 89 deletions(-)
 create mode 100644 examples/function_minimization/requirements.txt
 create mode 100644 openevolve/utils/format_utils.py
 create mode 100644 openevolve/utils/metrics_utils.py

diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index ff80a081d..6bb1f9d90 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -5,8 +5,9 @@
 import importlib.util
 import numpy as np
 import time
-import multiprocessing
+import concurrent.futures
 import traceback
+import signal
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -22,31 +23,13 @@ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
     Returns:
         Result of the function or raises TimeoutError
     """
-
-    def wrapper(queue, func, args, kwargs):
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(func, *args, **kwargs)
         try:
-            result = func(*args, **kwargs)
-            queue.put(("success", result))
-        except Exception as e:
-            queue.put(("error", e))
-
-    queue = multiprocessing.Queue()
-    process = multiprocessing.Process(target=wrapper, args=(queue, func, args, kwargs))
-    process.start()
-    process.join(timeout=timeout_seconds)
-
-    if process.is_alive():
-        process.terminate()
-        process.join()
-        raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
-
-    if queue.empty():
-        raise TimeoutError("Function ended without returning a result")
-
-    status, result = queue.get()
-    if status == "error":
-        raise result
-    return result
+            result = future.result(timeout=timeout_seconds)
+            return result
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
 
 
 def safe_float(value):
@@ -107,15 +90,27 @@ def evaluate(program_path):
             # Run with timeout
             result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-            # Check if we got a tuple of 3 values
-            if not isinstance(result, tuple) or len(result) != 3:
+            # Handle different result formats
+            if isinstance(result, tuple):
+                if len(result) == 3:
+                    x, y, value = result
+                elif len(result) == 2:
+                    # Assume it's (x, y) and calculate value
+                    x, y = result
+                    # Calculate the function value since it wasn't returned
+                    value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                    print(f"Trial {trial}: Got 2 values, calculated function value: {value}")
+                else:
+                    print(
+                        f"Trial {trial}: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                    )
+                    continue
+            else:
                 print(
-                    f"Trial {trial}: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                    f"Trial {trial}: Invalid result format, expected tuple but got {type(result)}"
                 )
                 continue
 
-            x, y, value = result
-
             end_time = time.time()
 
             # Ensure all values are float
@@ -264,15 +259,27 @@ def evaluate_stage1(program_path):
         # Run a single trial with timeout
         result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-        # Check if we got a tuple of 3 values
-        if not isinstance(result, tuple) or len(result) != 3:
+        # Handle different result formats
+        if isinstance(result, tuple):
+            if len(result) == 3:
+                x, y, value = result
+            elif len(result) == 2:
+                # Assume it's (x, y) and calculate value
+                x, y = result
+                # Calculate the function value since it wasn't returned
+                value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                print(f"Stage 1: Got 2 values, calculated function value: {value}")
+            else:
+                print(
+                    f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                )
+                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+        else:
             print(
-                f"Stage 1: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                f"Stage 1: Invalid result format, expected tuple but got {type(result)}"
             )
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
-        x, y, value = result
-
         # Ensure all values are float
         x = safe_float(x)
         y = safe_float(y)
diff --git a/examples/function_minimization/requirements.txt b/examples/function_minimization/requirements.txt
new file mode 100644
index 000000000..9c61c7363
--- /dev/null
+++ b/examples/function_minimization/requirements.txt
@@ -0,0 +1 @@
+scipy
\ No newline at end of file
diff --git a/openevolve/controller.py b/openevolve/controller.py
index 68d3e0c12..42fbd49a7 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -24,6 +24,10 @@
     parse_evolve_blocks,
     parse_full_rewrite,
 )
+from openevolve.utils.format_utils import (
+    format_metrics_safe,
+    format_improvement_safe,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -305,7 +309,7 @@ async def run(
                         f"🌟 New best solution found at iteration {i+1}: {child_program.id}"
                     )
                     logger.info(
-                        f"Metrics: {', '.join(f'{name}={value:.4f}' for name, value in child_program.metrics.items())}"
+                        f"Metrics: {format_metrics_safe(child_program.metrics)}"
                    )
 
                 # Save checkpoint
@@ -361,7 +365,7 @@ async def run(
         if best_program:
             logger.info(
                 f"Evolution complete. Best program has metrics: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in best_program.metrics.items())}"
+                f"{format_metrics_safe(best_program.metrics)}"
             )
 
             # Save the best program (using our tracked best program)
@@ -389,19 +393,13 @@ def _log_iteration(
             child: Child program
             elapsed_time: Elapsed time in seconds
         """
-        # Calculate improvement
-        improvement = {}
-        for metric, value in child.metrics.items():
-            if metric in parent.metrics:
-                diff = value - parent.metrics[metric]
-                improvement[metric] = diff
-
-        improvement_str = ", ".join(f"{name}={diff:+.4f}" for name, diff in improvement.items())
+        # Calculate improvement using safe formatting
+        improvement_str = format_improvement_safe(parent.metrics, child.metrics)
 
         logger.info(
             f"Iteration {iteration+1}: Child {child.id} from parent {parent.id} "
             f"in {elapsed_time:.2f}s. Metrics: "
-            f"{', '.join(f'{name}={value:.4f}' for name, value in child.metrics.items())} "
+            f"{format_metrics_safe(child.metrics)} "
             f"(Δ: {improvement_str})"
         )
 
@@ -457,7 +455,7 @@ def _save_checkpoint(self, iteration: int) -> None:
 
             logger.info(
                 f"Saved best program at checkpoint {iteration} with metrics: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in best_program.metrics.items())}"
+                f"{format_metrics_safe(best_program.metrics)}"
             )
 
         logger.info(f"Saved checkpoint at iteration {iteration} to {checkpoint_path}")
diff --git a/openevolve/database.py b/openevolve/database.py
index 772250ec1..d25b55159 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -15,6 +15,7 @@
 
 from openevolve.config import DatabaseConfig
 from openevolve.utils.code_utils import calculate_edit_distance
+from openevolve.utils.metrics_utils import safe_numeric_average
 
 logger = logging.getLogger(__name__)
 
@@ -227,10 +228,10 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
             if sorted_programs:
                 logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}")
         else:
-            # Sort by average of all metrics as fallback
+            # Sort by average of all numeric metrics as fallback
             sorted_programs = sorted(
                 self.programs.values(),
-                key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)),
+                key=lambda p: safe_numeric_average(p.metrics),
                 reverse=True,
             )
             if sorted_programs:
@@ -281,10 +282,10 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Pr
                 reverse=True,
             )
         else:
-            # Sort by average of all metrics
+            # Sort by average of all numeric metrics
             sorted_programs = sorted(
                 self.programs.values(),
-                key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)),
+                key=lambda p: safe_numeric_average(p.metrics),
                 reverse=True,
             )
 
@@ -436,7 +437,7 @@ def _calculate_feature_coords(self, program: Program) -> List[int]:
                 if not program.metrics:
                     bin_idx = 0
                 else:
-                    avg_score = sum(program.metrics.values()) / len(program.metrics)
+                    avg_score = safe_numeric_average(program.metrics)
                     bin_idx = min(int(avg_score * self.feature_bins), self.feature_bins - 1)
                 coords.append(bin_idx)
             elif dim in program.metrics:
@@ -487,9 +488,9 @@ def _is_better(self, program1: Program, program2: Program) -> bool:
         if "combined_score" in program1.metrics and "combined_score" in program2.metrics:
             return program1.metrics["combined_score"] > program2.metrics["combined_score"]
 
-        # Fallback to average of all metrics
-        avg1 = sum(program1.metrics.values()) / len(program1.metrics)
-        avg2 = sum(program2.metrics.values()) / len(program2.metrics)
+        # Fallback to average of all numeric metrics
+        avg1 = safe_numeric_average(program1.metrics)
+        avg2 = safe_numeric_average(program2.metrics)
 
         return avg1 > avg2
 
@@ -508,7 +509,7 @@ def _update_archive(self, program: Program) -> None:
         # Otherwise, find worst program in archive
         archive_programs = [self.programs[pid] for pid in self.archive]
         worst_program = min(
-            archive_programs, key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics))
+            archive_programs, key=lambda p: safe_numeric_average(p.metrics)
         )
 
         # Replace if new program is better
@@ -716,7 +717,7 @@ def _enforce_population_limit(self) -> None:
         # Sort by average metric (worst first)
         sorted_programs = sorted(
             all_programs,
-            key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)) if p.metrics else 0.0,
+            key=lambda p: safe_numeric_average(p.metrics),
         )
 
         # Remove worst programs, but never remove the best program
@@ -812,7 +813,7 @@ def migrate_programs(self) -> None:
             # Sort by fitness (using combined_score or average metrics)
             island_programs.sort(
                 key=lambda p: p.metrics.get(
-                    "combined_score", sum(p.metrics.values()) / max(1, len(p.metrics))
+                    "combined_score", safe_numeric_average(p.metrics)
                 ),
                 reverse=True,
             )
@@ -859,7 +860,7 @@ def get_island_stats(self) -> List[dict]:
             if island_programs:
                 scores = [
                     p.metrics.get(
-                        "combined_score", sum(p.metrics.values()) / max(1, len(p.metrics))
+                        "combined_score", safe_numeric_average(p.metrics)
                     )
                     for p in island_programs
                 ]
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index d7f4ed654..fb9fbd8e2 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -18,6 +18,7 @@
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.utils.format_utils import format_metrics_safe
 
 logger = logging.getLogger(__name__)
 
@@ -119,7 +120,7 @@ async def evaluate_program(
             elapsed = time.time() - start_time
             logger.info(
                 f"Evaluated program{program_id_str} in {elapsed:.2f}s: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in metrics.items())}"
+                f"{format_metrics_safe(metrics)}"
             )
 
             return metrics
diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index ad7a6be38..594f4ffce 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -8,6 +8,8 @@
 
 from openevolve.config import PromptConfig
 from openevolve.prompt.templates import TemplateManager
+from openevolve.utils.format_utils import format_metrics_safe
+from openevolve.utils.metrics_utils import safe_numeric_average
 
 logger = logging.getLogger(__name__)
 
@@ -126,8 +128,18 @@ def build_prompt(
         }
 
     def _format_metrics(self, metrics: Dict[str, float]) -> str:
-        """Format metrics for the prompt"""
-        return "\n".join([f"- {name}: {value:.4f}" for name, value in metrics.items()])
+        """Format metrics for the prompt using safe formatting"""
+        # Use safe formatting to handle mixed numeric and string values
+        formatted_parts = []
+        for name, value in metrics.items():
+            if isinstance(value, (int, float)):
+                try:
+                    formatted_parts.append(f"- {name}: {value:.4f}")
+                except (ValueError, TypeError):
+                    formatted_parts.append(f"- {name}: {value}")
+            else:
+                formatted_parts.append(f"- {name}: {value}")
+        return "\n".join(formatted_parts)
 
     def _identify_improvement_areas(
         self,
@@ -159,10 +171,17 @@ def _identify_improvement_areas(
                 regressed = True
 
                 for attempt in recent_attempts:
-                    if attempt["metrics"].get(metric, 0) <= value:
-                        regressed = False
-                    if attempt["metrics"].get(metric, 0) >= value:
+                    attempt_value = attempt["metrics"].get(metric, 0)
+                    # Only compare if both values are numeric
+                    if isinstance(value, (int, float)) and isinstance(attempt_value, (int, float)):
+                        if attempt_value <= value:
+                            regressed = False
+                        if attempt_value >= value:
+                            improved = False
+                    else:
+                        # If either value is non-numeric, skip comparison
                         improved = False
+                        regressed = False
 
                 if improved and metric not in metrics_improved:
                     metrics_improved.append(metric)
@@ -209,24 +228,49 @@ def _format_evolution_history(
             attempt_number = len(previous_programs) - i
             changes = program.get("changes", "Unknown changes")
 
-            # Format performance metrics
-            performance_str = ", ".join(
-                [f"{name}: {value:.4f}" for name, value in program.get("metrics", {}).items()]
-            )
+            # Format performance metrics using safe formatting
+            performance_parts = []
+            for name, value in program.get("metrics", {}).items():
+                if isinstance(value, (int, float)):
+                    try:
+                        performance_parts.append(f"{name}: {value:.4f}")
+                    except (ValueError, TypeError):
+                        performance_parts.append(f"{name}: {value}")
+                else:
+                    performance_parts.append(f"{name}: {value}")
+            performance_str = ", ".join(performance_parts)
 
             # Determine outcome based on comparison with parent
             parent_metrics = program.get("parent_metrics", {})
             outcome = "Mixed results"
 
-            if all(
-                program.get("metrics", {}).get(m, 0) >= parent_metrics.get(m, 0)
-                for m in program.get("metrics", {})
-            ):
+            # Safely compare only numeric metrics
+            program_metrics = program.get("metrics", {})
+
+            # Check if all numeric metrics improved
+            numeric_comparisons_improved = []
+            numeric_comparisons_regressed = []
+
+            for m in program_metrics:
+                prog_value = program_metrics.get(m, 0)
+                parent_value = parent_metrics.get(m, 0)
+
+                # Only compare if both values are numeric
+                if isinstance(prog_value, (int, float)) and isinstance(parent_value, (int, float)):
+                    if prog_value >= parent_value:
+                        numeric_comparisons_improved.append(True)
+                    else:
+                        numeric_comparisons_improved.append(False)
+
+                    if prog_value <= parent_value:
+                        numeric_comparisons_regressed.append(True)
+                    else:
+                        numeric_comparisons_regressed.append(False)
+
+            # Determine outcome based on numeric comparisons
+            if numeric_comparisons_improved and all(numeric_comparisons_improved):
                 outcome = "Improvement in all metrics"
-            elif all(
-                program.get("metrics", {}).get(m, 0) <= parent_metrics.get(m, 0)
-                for m in program.get("metrics", {})
-            ):
+            elif numeric_comparisons_regressed and all(numeric_comparisons_regressed):
                 outcome = "Regression in all metrics"
 
             previous_attempts_str += (
@@ -250,18 +294,21 @@ def _format_evolution_history(
             if len(program_code.split("\n")) > 10:
                 program_snippet += "\n# ... (truncated for brevity)"
 
-            # Calculate a composite score
-            score = sum(program.get("metrics", {}).values()) / max(
-                1, len(program.get("metrics", {}))
-            )
+            # Calculate a composite score using safe numeric average
+            score = safe_numeric_average(program.get("metrics", {}))
 
             # Extract key features (this could be more sophisticated)
             key_features = program.get("key_features", [])
             if not key_features:
-                key_features = [
-                    f"Performs well on {name} ({value:.4f})"
-                    for name, value in program.get("metrics", {}).items()
-                ]
+                key_features = []
+                for name, value in program.get("metrics", {}).items():
+                    if isinstance(value, (int, float)):
+                        try:
+                            key_features.append(f"Performs well on {name} ({value:.4f})")
+                        except (ValueError, TypeError):
+                            key_features.append(f"Performs well on {name} ({value})")
+                    else:
+                        key_features.append(f"Performs well on {name} ({value})")
 
             key_features_str = ", ".join(key_features)
 
@@ -300,10 +347,8 @@ def _format_evolution_history(
             if len(program_code.split("\n")) > 5:
                 program_snippet += "\n# ... (truncated)"
 
-            # Calculate a composite score
-            score = sum(program.get("metrics", {}).values()) / max(
-                1, len(program.get("metrics", {}))
-            )
+            # Calculate a composite score using safe numeric average
+            score = safe_numeric_average(program.get("metrics", {}))
 
             # Extract key features
             key_features = program.get("key_features", [])
diff --git a/openevolve/utils/__init__.py b/openevolve/utils/__init__.py
index 6d16167af..89a4a1b62 100644
--- a/openevolve/utils/__init__.py
+++ b/openevolve/utils/__init__.py
@@ -17,6 +17,14 @@
     parse_evolve_blocks,
     parse_full_rewrite,
 )
+from openevolve.utils.format_utils import (
+    format_metrics_safe,
+    format_improvement_safe,
+)
+from openevolve.utils.metrics_utils import (
+    safe_numeric_average,
+    safe_numeric_sum,
+)
 
 __all__ = [
     "TaskPool",
@@ -30,4 +38,8 @@
     "format_diff_summary",
     "parse_evolve_blocks",
     "parse_full_rewrite",
+    "format_metrics_safe",
+    "format_improvement_safe",
+    "safe_numeric_average",
+    "safe_numeric_sum",
 ]
diff --git a/openevolve/utils/format_utils.py b/openevolve/utils/format_utils.py
new file mode 100644
index 000000000..8f0c5be74
--- /dev/null
+++ b/openevolve/utils/format_utils.py
@@ -0,0 +1,65 @@
+"""
+Utility functions for formatting output
+"""
+
+from typing import Any, Dict
+
+
+def format_metrics_safe(metrics: Dict[str, Any]) -> str:
+    """
+    Safely format metrics dictionary for logging, handling both numeric and string values.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Formatted string representation of metrics
+    """
+    if not metrics:
+        return ""
+
+    formatted_parts = []
+    for name, value in metrics.items():
+        # Check if value is numeric (int, float)
+        if isinstance(value, (int, float)):
+            try:
+                # Only apply float formatting to numeric values
+                formatted_parts.append(f"{name}={value:.4f}")
+            except (ValueError, TypeError):
+                # Fallback to string representation if formatting fails
+                formatted_parts.append(f"{name}={value}")
+        else:
+            # For non-numeric values (strings, etc.), just convert to string
+            formatted_parts.append(f"{name}={value}")
+
+    return ", ".join(formatted_parts)
+
+
+def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
+    """
+    Safely format improvement metrics for logging.
+
+    Args:
+        parent_metrics: Parent program metrics
+        child_metrics: Child program metrics
+
+    Returns:
+        Formatted string representation of improvements
+    """
+    if not parent_metrics or not child_metrics:
+        return ""
+
+    improvement_parts = []
+    for metric, child_value in child_metrics.items():
+        if metric in parent_metrics:
+            parent_value = parent_metrics[metric]
+            # Only calculate improvement for numeric values
+            if isinstance(child_value, (int, float)) and isinstance(parent_value, (int, float)):
+                try:
+                    diff = child_value - parent_value
+                    improvement_parts.append(f"{metric}={diff:+.4f}")
+                except (ValueError, TypeError):
+                    # Skip non-numeric comparisons
+                    continue
+
+    return ", ".join(improvement_parts)
diff --git a/openevolve/utils/metrics_utils.py b/openevolve/utils/metrics_utils.py
new file mode 100644
index 000000000..3a349a2b1
--- /dev/null
+++ b/openevolve/utils/metrics_utils.py
@@ -0,0 +1,66 @@
+"""
+Safe calculation utilities for metrics containing mixed types
+"""
+
+from typing import Any, Dict
+
+
+def safe_numeric_average(metrics: Dict[str, Any]) -> float:
+    """
+    Calculate the average of numeric values in a metrics dictionary,
+    safely ignoring non-numeric values like strings.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Average of numeric values, or 0.0 if no numeric values found
+    """
+    if not metrics:
+        return 0.0
+
+    numeric_values = []
+    for value in metrics.values():
+        if isinstance(value, (int, float)):
+            try:
+                # Convert to float and check if it's a valid number
+                float_val = float(value)
+                if not (float_val != float_val):  # Check for NaN (NaN != NaN is True)
+                    numeric_values.append(float_val)
+            except (ValueError, TypeError, OverflowError):
+                # Skip invalid numeric values
+                continue
+
+    if not numeric_values:
+        return 0.0
+
+    return sum(numeric_values) / len(numeric_values)
+
+
+def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
+    """
+    Calculate the sum of numeric values in a metrics dictionary,
+    safely ignoring non-numeric values like strings.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Sum of numeric values, or 0.0 if no numeric values found
+    """
+    if not metrics:
+        return 0.0
+
+    numeric_sum = 0.0
+    for value in metrics.values():
+        if isinstance(value, (int, float)):
+            try:
+                # Convert to float and check if it's a valid number
+                float_val = float(value)
+                if not (float_val != float_val):  # Check for NaN (NaN != NaN is True)
+                    numeric_sum += float_val
+            except (ValueError, TypeError, OverflowError):
+                # Skip invalid numeric values
+                continue
+
+    return numeric_sum

From cd99e3dd28ce05586c71990ead5ef063a4dc6ed6 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Wed, 4 Jun 2025 12:35:57 +0800
Subject: [PATCH 2/2] fix linter

---
 examples/function_minimization/evaluator.py |  4 +---
 openevolve/controller.py                    |  4 +---
 openevolve/database.py                      | 12 +++---------
 openevolve/prompt/sampler.py                | 10 +++++-----
 openevolve/utils/format_utils.py            | 16 ++++++++--------
 openevolve/utils/metrics_utils.py           | 18 +++++++++---------
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index 6bb1f9d90..e48d54136 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -275,9 +275,7 @@ def evaluate_stage1(program_path):
                 )
                 return {"runs_successfully": 0.0, "error": "Invalid result format"}
         else:
-            print(
-                f"Stage 1: Invalid result format, expected tuple but got {type(result)}"
-            )
+            print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
         # Ensure all values are float
diff --git a/openevolve/controller.py b/openevolve/controller.py
index 42fbd49a7..c08194378 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -308,9 +308,7 @@ async def run(
                     logger.info(
                         f"🌟 New best solution found at iteration {i+1}: {child_program.id}"
                     )
-                    logger.info(
-                        f"Metrics: {format_metrics_safe(child_program.metrics)}"
-                    )
+                    logger.info(f"Metrics: {format_metrics_safe(child_program.metrics)}")
 
                 # Save checkpoint
                 if (i + 1) % self.config.checkpoint_interval == 0:
diff --git a/openevolve/database.py b/openevolve/database.py
index d25b55159..983138d94 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -508,9 +508,7 @@ def _update_archive(self, program: Program) -> None:
 
         # Otherwise, find worst program in archive
         archive_programs = [self.programs[pid] for pid in self.archive]
-        worst_program = min(
-            archive_programs, key=lambda p: safe_numeric_average(p.metrics)
-        )
+        worst_program = min(archive_programs, key=lambda p: safe_numeric_average(p.metrics))
 
         # Replace if new program is better
         if self._is_better(program, worst_program):
@@ -812,9 +810,7 @@ def migrate_programs(self) -> None:
 
             # Sort by fitness (using combined_score or average metrics)
             island_programs.sort(
-                key=lambda p: p.metrics.get(
-                    "combined_score", safe_numeric_average(p.metrics)
-                ),
+                key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
                 reverse=True,
             )
 
@@ -859,9 +855,7 @@ def get_island_stats(self) -> List[dict]:
 
             if island_programs:
                 scores = [
-                    p.metrics.get(
-                        "combined_score", safe_numeric_average(p.metrics)
-                    )
+                    p.metrics.get("combined_score", safe_numeric_average(p.metrics))
                     for p in island_programs
                 ]
 
diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index 594f4ffce..a05910b98 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -246,27 +246,27 @@ def _format_evolution_history(
 
             # Safely compare only numeric metrics
             program_metrics = program.get("metrics", {})
-
+
             # Check if all numeric metrics improved
             numeric_comparisons_improved = []
             numeric_comparisons_regressed = []
-
+
             for m in program_metrics:
                 prog_value = program_metrics.get(m, 0)
                 parent_value = parent_metrics.get(m, 0)
-
+
                 # Only compare if both values are numeric
                 if isinstance(prog_value, (int, float)) and isinstance(parent_value, (int, float)):
                     if prog_value >= parent_value:
                         numeric_comparisons_improved.append(True)
                     else:
                         numeric_comparisons_improved.append(False)
-
+
                     if prog_value <= parent_value:
                         numeric_comparisons_regressed.append(True)
                     else:
                         numeric_comparisons_regressed.append(False)
-
+
             # Determine outcome based on numeric comparisons
             if numeric_comparisons_improved and all(numeric_comparisons_improved):
                 outcome = "Improvement in all metrics"
diff --git a/openevolve/utils/format_utils.py b/openevolve/utils/format_utils.py
index 8f0c5be74..4dd83c7ac 100644
--- a/openevolve/utils/format_utils.py
+++ b/openevolve/utils/format_utils.py
@@ -8,16 +8,16 @@
 def format_metrics_safe(metrics: Dict[str, Any]) -> str:
     """
     Safely format metrics dictionary for logging, handling both numeric and string values.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Formatted string representation of metrics
     """
     if not metrics:
         return ""
-
+
     formatted_parts = []
     for name, value in metrics.items():
         # Check if value is numeric (int, float)
@@ -31,24 +31,24 @@ def format_metrics_safe(metrics: Dict[str, Any]) -> str:
         else:
             # For non-numeric values (strings, etc.), just convert to string
             formatted_parts.append(f"{name}={value}")
-
+
     return ", ".join(formatted_parts)
 
 
 def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
     """
     Safely format improvement metrics for logging.
-
+
     Args:
         parent_metrics: Parent program metrics
         child_metrics: Child program metrics
-
+
     Returns:
         Formatted string representation of improvements
     """
     if not parent_metrics or not child_metrics:
         return ""
-
+
     improvement_parts = []
     for metric, child_value in child_metrics.items():
         if metric in parent_metrics:
@@ -61,5 +61,5 @@ def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[
             except (ValueError, TypeError):
                 # Skip non-numeric comparisons
                 continue
-
+
     return ", ".join(improvement_parts)
diff --git a/openevolve/utils/metrics_utils.py b/openevolve/utils/metrics_utils.py
index 3a349a2b1..f2da61f49 100644
--- a/openevolve/utils/metrics_utils.py
+++ b/openevolve/utils/metrics_utils.py
@@ -9,16 +9,16 @@ def safe_numeric_average(metrics: Dict[str, Any]) -> float:
     """
     Calculate the average of numeric values in a metrics dictionary,
     safely ignoring non-numeric values like strings.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Average of numeric values, or 0.0 if no numeric values found
     """
     if not metrics:
         return 0.0
-
+
     numeric_values = []
     for value in metrics.values():
         if isinstance(value, (int, float)):
@@ -30,10 +30,10 @@ def safe_numeric_average(metrics: Dict[str, Any]) -> float:
             except (ValueError, TypeError, OverflowError):
                 # Skip invalid numeric values
                 continue
-
+
     if not numeric_values:
         return 0.0
-
+
     return sum(numeric_values) / len(numeric_values)
 
 
@@ -41,16 +41,16 @@ def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
     """
     Calculate the sum of numeric values in a metrics dictionary,
     safely ignoring non-numeric values like strings.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Sum of numeric values, or 0.0 if no numeric values found
     """
     if not metrics:
         return 0.0
-
+
     numeric_sum = 0.0
     for value in metrics.values():
         if isinstance(value, (int, float)):
@@ -62,5 +62,5 @@ def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
             except (ValueError, TypeError, OverflowError):
                 # Skip invalid numeric values
                 continue
-
+
     return numeric_sum