From ed2de61c5dc9468fc897cb908a3a4eafe4a9ed24 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Wed, 4 Jun 2025 12:29:08 +0800
Subject: [PATCH 1/2] fixes

---
 examples/function_minimization/evaluator.py  |  77 +++++++------
 .../function_minimization/requirements.txt   |   1 +
 openevolve/controller.py                     |  22 ++--
 openevolve/database.py                       |  25 +++--
 openevolve/evaluator.py                      |   3 +-
 openevolve/prompt/sampler.py                 | 103 +++++++++++++-----
 openevolve/utils/__init__.py                 |  12 ++
 openevolve/utils/format_utils.py             |  65 +++++++++++
 openevolve/utils/metrics_utils.py            |  66 +++++++++++
 9 files changed, 285 insertions(+), 89 deletions(-)
 create mode 100644 examples/function_minimization/requirements.txt
 create mode 100644 openevolve/utils/format_utils.py
 create mode 100644 openevolve/utils/metrics_utils.py

diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index ff80a081d..6bb1f9d90 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -5,8 +5,9 @@
 import importlib.util
 import numpy as np
 import time
-import multiprocessing
+import concurrent.futures
 import traceback
+import signal
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -22,31 +23,13 @@ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
     Returns:
         Result of the function or raises TimeoutError
     """
-
-    def wrapper(queue, func, args, kwargs):
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(func, *args, **kwargs)
         try:
-            result = func(*args, **kwargs)
-            queue.put(("success", result))
-        except Exception as e:
-            queue.put(("error", e))
-
-    queue = multiprocessing.Queue()
-    process = multiprocessing.Process(target=wrapper, args=(queue, func, args, kwargs))
-    process.start()
-    process.join(timeout=timeout_seconds)
-
-    if process.is_alive():
-        process.terminate()
-        process.join()
-        raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
-
-    if queue.empty():
-        raise TimeoutError("Function ended without returning a result")
-
-    status, result = queue.get()
-    if status == "error":
-        raise result
-    return result
+            result = future.result(timeout=timeout_seconds)
+            return result
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
 
 
 def safe_float(value):
@@ -107,15 +90,27 @@ def evaluate(program_path):
             # Run with timeout
             result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-            # Check if we got a tuple of 3 values
-            if not isinstance(result, tuple) or len(result) != 3:
+            # Handle different result formats
+            if isinstance(result, tuple):
+                if len(result) == 3:
+                    x, y, value = result
+                elif len(result) == 2:
+                    # Assume it's (x, y) and calculate value
+                    x, y = result
+                    # Calculate the function value since it wasn't returned
+                    value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                    print(f"Trial {trial}: Got 2 values, calculated function value: {value}")
+                else:
+                    print(
+                        f"Trial {trial}: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                    )
+                    continue
+            else:
                 print(
-                    f"Trial {trial}: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                    f"Trial {trial}: Invalid result format, expected tuple but got {type(result)}"
                 )
                 continue
 
-            x, y, value = result
-
             end_time = time.time()
 
             # Ensure all values are float
@@ -264,15 +259,27 @@ def evaluate_stage1(program_path):
         # Run a single trial with timeout
         result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-        # Check if we got a tuple of 3 values
-        if not isinstance(result, tuple) or len(result) != 3:
+        # Handle different result formats
+        if isinstance(result, tuple):
+            if len(result) == 3:
+                x, y, value = result
+            elif len(result) == 2:
+                # Assume it's (x, y) and calculate value
+                x, y = result
+                # Calculate the function value since it wasn't returned
+                value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                print(f"Stage 1: Got 2 values, calculated function value: {value}")
+            else:
+                print(
+                    f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                )
+                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+        else:
             print(
-                f"Stage 1: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                f"Stage 1: Invalid result format, expected tuple but got {type(result)}"
             )
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
-        x, y, value = result
-
         # Ensure all values are float
         x = safe_float(x)
         y = safe_float(y)
diff --git a/examples/function_minimization/requirements.txt b/examples/function_minimization/requirements.txt
new file mode 100644
index 000000000..9c61c7363
--- /dev/null
+++ b/examples/function_minimization/requirements.txt
@@ -0,0 +1 @@
+scipy
\ No newline at end of file
diff --git a/openevolve/controller.py b/openevolve/controller.py
index 68d3e0c12..42fbd49a7 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -24,6 +24,10 @@
     parse_evolve_blocks,
     parse_full_rewrite,
 )
+from openevolve.utils.format_utils import (
+    format_metrics_safe,
+    format_improvement_safe,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -305,7 +309,7 @@ async def run(
                         f"🌟 New best solution found at iteration {i+1}: {child_program.id}"
                     )
                     logger.info(
-                        f"Metrics: {', '.join(f'{name}={value:.4f}' for name, value in child_program.metrics.items())}"
+                        f"Metrics: {format_metrics_safe(child_program.metrics)}"
                    )
 
                 # Save checkpoint
@@ -361,7 +365,7 @@ async def run(
         if best_program:
             logger.info(
                 f"Evolution complete. Best program has metrics: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in best_program.metrics.items())}"
+                f"{format_metrics_safe(best_program.metrics)}"
             )
 
             # Save the best program (using our tracked best program)
@@ -389,19 +393,13 @@ def _log_iteration(
             child: Child program
             elapsed_time: Elapsed time in seconds
         """
-        # Calculate improvement
-        improvement = {}
-        for metric, value in child.metrics.items():
-            if metric in parent.metrics:
-                diff = value - parent.metrics[metric]
-                improvement[metric] = diff
-
-        improvement_str = ", ".join(f"{name}={diff:+.4f}" for name, diff in improvement.items())
+        # Calculate improvement using safe formatting
+        improvement_str = format_improvement_safe(parent.metrics, child.metrics)
 
         logger.info(
             f"Iteration {iteration+1}: Child {child.id} from parent {parent.id} "
             f"in {elapsed_time:.2f}s. Metrics: "
-            f"{', '.join(f'{name}={value:.4f}' for name, value in child.metrics.items())} "
+            f"{format_metrics_safe(child.metrics)} "
             f"(Δ: {improvement_str})"
         )
 
@@ -457,7 +455,7 @@ def _save_checkpoint(self, iteration: int) -> None:
 
             logger.info(
                 f"Saved best program at checkpoint {iteration} with metrics: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in best_program.metrics.items())}"
+                f"{format_metrics_safe(best_program.metrics)}"
             )
 
         logger.info(f"Saved checkpoint at iteration {iteration} to {checkpoint_path}")
diff --git a/openevolve/database.py b/openevolve/database.py
index 772250ec1..d25b55159 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -15,6 +15,7 @@
 
 from openevolve.config import DatabaseConfig
 from openevolve.utils.code_utils import calculate_edit_distance
+from openevolve.utils.metrics_utils import safe_numeric_average
 
 logger = logging.getLogger(__name__)
 
@@ -227,10 +228,10 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
             if sorted_programs:
                 logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}")
         else:
-            # Sort by average of all metrics as fallback
+            # Sort by average of all numeric metrics as fallback
             sorted_programs = sorted(
                 self.programs.values(),
-                key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)),
+                key=lambda p: safe_numeric_average(p.metrics),
                 reverse=True,
             )
             if sorted_programs:
@@ -281,10 +282,10 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Pr
                 reverse=True,
             )
         else:
-            # Sort by average of all metrics
+            # Sort by average of all numeric metrics
             sorted_programs = sorted(
                 self.programs.values(),
-                key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)),
+                key=lambda p: safe_numeric_average(p.metrics),
                 reverse=True,
             )
 
@@ -436,7 +437,7 @@ def _calculate_feature_coords(self, program: Program) -> List[int]:
                 if not program.metrics:
                     bin_idx = 0
                 else:
-                    avg_score = sum(program.metrics.values()) / len(program.metrics)
+                    avg_score = safe_numeric_average(program.metrics)
                     bin_idx = min(int(avg_score * self.feature_bins), self.feature_bins - 1)
                 coords.append(bin_idx)
             elif dim in program.metrics:
@@ -487,9 +488,9 @@ def _is_better(self, program1: Program, program2: Program) -> bool:
         if "combined_score" in program1.metrics and "combined_score" in program2.metrics:
             return program1.metrics["combined_score"] > program2.metrics["combined_score"]
 
-        # Fallback to average of all metrics
-        avg1 = sum(program1.metrics.values()) / len(program1.metrics)
-        avg2 = sum(program2.metrics.values()) / len(program2.metrics)
+        # Fallback to average of all numeric metrics
+        avg1 = safe_numeric_average(program1.metrics)
+        avg2 = safe_numeric_average(program2.metrics)
 
         return avg1 > avg2
 
@@ -508,7 +509,7 @@ def _update_archive(self, program: Program) -> None:
         # Otherwise, find worst program in archive
         archive_programs = [self.programs[pid] for pid in self.archive]
         worst_program = min(
-            archive_programs, key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics))
+            archive_programs, key=lambda p: safe_numeric_average(p.metrics)
         )
 
         # Replace if new program is better
@@ -716,7 +717,7 @@ def _enforce_population_limit(self) -> None:
         # Sort by average metric (worst first)
         sorted_programs = sorted(
             all_programs,
-            key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)) if p.metrics else 0.0,
+            key=lambda p: safe_numeric_average(p.metrics),
         )
 
         # Remove worst programs, but never remove the best program
@@ -812,7 +813,7 @@ def migrate_programs(self) -> None:
             # Sort by fitness (using combined_score or average metrics)
             island_programs.sort(
                 key=lambda p: p.metrics.get(
-                    "combined_score", sum(p.metrics.values()) / max(1, len(p.metrics))
+                    "combined_score", safe_numeric_average(p.metrics)
                 ),
                 reverse=True,
             )
@@ -859,7 +860,7 @@ def get_island_stats(self) -> List[dict]:
             if island_programs:
                 scores = [
                     p.metrics.get(
-                        "combined_score", sum(p.metrics.values()) / max(1, len(p.metrics))
+                        "combined_score", safe_numeric_average(p.metrics)
                     )
                     for p in island_programs
                 ]
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index d7f4ed654..fb9fbd8e2 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -18,6 +18,7 @@
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.utils.format_utils import format_metrics_safe
 
 logger = logging.getLogger(__name__)
 
@@ -119,7 +120,7 @@ async def evaluate_program(
             elapsed = time.time() - start_time
             logger.info(
                 f"Evaluated program{program_id_str} in {elapsed:.2f}s: "
-                f"{', '.join(f'{name}={value:.4f}' for name, value in metrics.items())}"
+                f"{format_metrics_safe(metrics)}"
             )
 
             return metrics
diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index ad7a6be38..594f4ffce 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -8,6 +8,8 @@
 
 from openevolve.config import PromptConfig
 from openevolve.prompt.templates import TemplateManager
+from openevolve.utils.format_utils import format_metrics_safe
+from openevolve.utils.metrics_utils import safe_numeric_average
 
 logger = logging.getLogger(__name__)
 
@@ -126,8 +128,18 @@ def build_prompt(
         }
 
     def _format_metrics(self, metrics: Dict[str, float]) -> str:
-        """Format metrics for the prompt"""
-        return "\n".join([f"- {name}: {value:.4f}" for name, value in metrics.items()])
+        """Format metrics for the prompt using safe formatting"""
+        # Use safe formatting to handle mixed numeric and string values
+        formatted_parts = []
+        for name, value in metrics.items():
+            if isinstance(value, (int, float)):
+                try:
+                    formatted_parts.append(f"- {name}: {value:.4f}")
+                except (ValueError, TypeError):
+                    formatted_parts.append(f"- {name}: {value}")
+            else:
+                formatted_parts.append(f"- {name}: {value}")
+        return "\n".join(formatted_parts)
 
     def _identify_improvement_areas(
         self,
@@ -159,10 +171,17 @@ def _identify_improvement_areas(
                 regressed = True
 
                 for attempt in recent_attempts:
-                    if attempt["metrics"].get(metric, 0) <= value:
-                        regressed = False
-                    if attempt["metrics"].get(metric, 0) >= value:
+                    attempt_value = attempt["metrics"].get(metric, 0)
+                    # Only compare if both values are numeric
+                    if isinstance(value, (int, float)) and isinstance(attempt_value, (int, float)):
+                        if attempt_value <= value:
+                            regressed = False
+                        if attempt_value >= value:
+                            improved = False
+                    else:
+                        # If either value is non-numeric, skip comparison
                         improved = False
+                        regressed = False
 
                 if improved and metric not in metrics_improved:
                     metrics_improved.append(metric)
@@ -209,24 +228,49 @@ def _format_evolution_history(
             attempt_number = len(previous_programs) - i
             changes = program.get("changes", "Unknown changes")
 
-            # Format performance metrics
-            performance_str = ", ".join(
-                [f"{name}: {value:.4f}" for name, value in program.get("metrics", {}).items()]
-            )
+            # Format performance metrics using safe formatting
+            performance_parts = []
+            for name, value in program.get("metrics", {}).items():
+                if isinstance(value, (int, float)):
+                    try:
+                        performance_parts.append(f"{name}: {value:.4f}")
+                    except (ValueError, TypeError):
+                        performance_parts.append(f"{name}: {value}")
+                else:
+                    performance_parts.append(f"{name}: {value}")
+            performance_str = ", ".join(performance_parts)
 
             # Determine outcome based on comparison with parent
             parent_metrics = program.get("parent_metrics", {})
             outcome = "Mixed results"
 
-            if all(
-                program.get("metrics", {}).get(m, 0) >= parent_metrics.get(m, 0)
-                for m in program.get("metrics", {})
-            ):
+            # Safely compare only numeric metrics
+            program_metrics = program.get("metrics", {})
+
+            # Check if all numeric metrics improved
+            numeric_comparisons_improved = []
+            numeric_comparisons_regressed = []
+
+            for m in program_metrics:
+                prog_value = program_metrics.get(m, 0)
+                parent_value = parent_metrics.get(m, 0)
+
+                # Only compare if both values are numeric
+                if isinstance(prog_value, (int, float)) and isinstance(parent_value, (int, float)):
+                    if prog_value >= parent_value:
+                        numeric_comparisons_improved.append(True)
+                    else:
+                        numeric_comparisons_improved.append(False)
+
+                    if prog_value <= parent_value:
+                        numeric_comparisons_regressed.append(True)
+                    else:
+                        numeric_comparisons_regressed.append(False)
+
+            # Determine outcome based on numeric comparisons
+            if numeric_comparisons_improved and all(numeric_comparisons_improved):
                 outcome = "Improvement in all metrics"
-            elif all(
-                program.get("metrics", {}).get(m, 0) <= parent_metrics.get(m, 0)
-                for m in program.get("metrics", {})
-            ):
+            elif numeric_comparisons_regressed and all(numeric_comparisons_regressed):
                 outcome = "Regression in all metrics"
 
             previous_attempts_str += (
@@ -250,18 +294,21 @@ def _format_evolution_history(
             if len(program_code.split("\n")) > 10:
                 program_snippet += "\n# ... (truncated for brevity)"
 
-            # Calculate a composite score
-            score = sum(program.get("metrics", {}).values()) / max(
-                1, len(program.get("metrics", {}))
-            )
+            # Calculate a composite score using safe numeric average
+            score = safe_numeric_average(program.get("metrics", {}))
 
             # Extract key features (this could be more sophisticated)
             key_features = program.get("key_features", [])
             if not key_features:
-                key_features = [
-                    f"Performs well on {name} ({value:.4f})"
-                    for name, value in program.get("metrics", {}).items()
-                ]
+                key_features = []
+                for name, value in program.get("metrics", {}).items():
+                    if isinstance(value, (int, float)):
+                        try:
+                            key_features.append(f"Performs well on {name} ({value:.4f})")
+                        except (ValueError, TypeError):
+                            key_features.append(f"Performs well on {name} ({value})")
+                    else:
+                        key_features.append(f"Performs well on {name} ({value})")
 
             key_features_str = ", ".join(key_features)
 
@@ -300,10 +347,8 @@ def _format_evolution_history(
             if len(program_code.split("\n")) > 5:
                 program_snippet += "\n# ... (truncated)"
 
-            # Calculate a composite score
-            score = sum(program.get("metrics", {}).values()) / max(
-                1, len(program.get("metrics", {}))
-            )
+            # Calculate a composite score using safe numeric average
+            score = safe_numeric_average(program.get("metrics", {}))
 
             # Extract key features
             key_features = program.get("key_features", [])
diff --git a/openevolve/utils/__init__.py b/openevolve/utils/__init__.py
index 6d16167af..89a4a1b62 100644
--- a/openevolve/utils/__init__.py
+++ b/openevolve/utils/__init__.py
@@ -17,6 +17,14 @@
     parse_evolve_blocks,
     parse_full_rewrite,
 )
+from openevolve.utils.format_utils import (
+    format_metrics_safe,
+    format_improvement_safe,
+)
+from openevolve.utils.metrics_utils import (
+    safe_numeric_average,
+    safe_numeric_sum,
+)
 
 __all__ = [
     "TaskPool",
@@ -30,4 +38,8 @@
     "format_diff_summary",
     "parse_evolve_blocks",
     "parse_full_rewrite",
+    "format_metrics_safe",
+    "format_improvement_safe",
+    "safe_numeric_average",
+    "safe_numeric_sum",
 ]
diff --git a/openevolve/utils/format_utils.py b/openevolve/utils/format_utils.py
new file mode 100644
index 000000000..8f0c5be74
--- /dev/null
+++ b/openevolve/utils/format_utils.py
@@ -0,0 +1,65 @@
+"""
+Utility functions for formatting output
+"""
+
+from typing import Any, Dict
+
+
+def format_metrics_safe(metrics: Dict[str, Any]) -> str:
+    """
+    Safely format metrics dictionary for logging, handling both numeric and string values.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Formatted string representation of metrics
+    """
+    if not metrics:
+        return ""
+
+    formatted_parts = []
+    for name, value in metrics.items():
+        # Check if value is numeric (int, float)
+        if isinstance(value, (int, float)):
+            try:
+                # Only apply float formatting to numeric values
+                formatted_parts.append(f"{name}={value:.4f}")
+            except (ValueError, TypeError):
+                # Fallback to string representation if formatting fails
+                formatted_parts.append(f"{name}={value}")
+        else:
+            # For non-numeric values (strings, etc.), just convert to string
+            formatted_parts.append(f"{name}={value}")
+
+    return ", ".join(formatted_parts)
+
+
+def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
+    """
+    Safely format improvement metrics for logging.
+
+    Args:
+        parent_metrics: Parent program metrics
+        child_metrics: Child program metrics
+
+    Returns:
+        Formatted string representation of improvements
+    """
+    if not parent_metrics or not child_metrics:
+        return ""
+
+    improvement_parts = []
+    for metric, child_value in child_metrics.items():
+        if metric in parent_metrics:
+            parent_value = parent_metrics[metric]
+            # Only calculate improvement for numeric values
+            if isinstance(child_value, (int, float)) and isinstance(parent_value, (int, float)):
+                try:
+                    diff = child_value - parent_value
+                    improvement_parts.append(f"{metric}={diff:+.4f}")
+                except (ValueError, TypeError):
+                    # Skip non-numeric comparisons
+                    continue
+
+    return ", ".join(improvement_parts)
diff --git a/openevolve/utils/metrics_utils.py b/openevolve/utils/metrics_utils.py
new file mode 100644
index 000000000..3a349a2b1
--- /dev/null
+++ b/openevolve/utils/metrics_utils.py
@@ -0,0 +1,66 @@
+"""
+Safe calculation utilities for metrics containing mixed types
+"""
+
+from typing import Any, Dict
+
+
+def safe_numeric_average(metrics: Dict[str, Any]) -> float:
+    """
+    Calculate the average of numeric values in a metrics dictionary,
+    safely ignoring non-numeric values like strings.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Average of numeric values, or 0.0 if no numeric values found
+    """
+    if not metrics:
+        return 0.0
+
+    numeric_values = []
+    for value in metrics.values():
+        if isinstance(value, (int, float)):
+            try:
+                # Convert to float and check if it's a valid number
+                float_val = float(value)
+                if not (float_val != float_val):  # Check for NaN (NaN != NaN is True)
+                    numeric_values.append(float_val)
+            except (ValueError, TypeError, OverflowError):
+                # Skip invalid numeric values
+                continue
+
+    if not numeric_values:
+        return 0.0
+
+    return sum(numeric_values) / len(numeric_values)
+
+
+def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
+    """
+    Calculate the sum of numeric values in a metrics dictionary,
+    safely ignoring non-numeric values like strings.
+
+    Args:
+        metrics: Dictionary of metric names to values
+
+    Returns:
+        Sum of numeric values, or 0.0 if no numeric values found
+    """
+    if not metrics:
+        return 0.0
+
+    numeric_sum = 0.0
+    for value in metrics.values():
+        if isinstance(value, (int, float)):
+            try:
+                # Convert to float and check if it's a valid number
+                float_val = float(value)
+                if not (float_val != float_val):  # Check for NaN (NaN != NaN is True)
+                    numeric_sum += float_val
+            except (ValueError, TypeError, OverflowError):
+                # Skip invalid numeric values
+                continue
+
+    return numeric_sum

From cd99e3dd28ce05586c71990ead5ef063a4dc6ed6 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma
Date: Wed, 4 Jun 2025 12:35:57 +0800
Subject: [PATCH 2/2] fix linter

---
 examples/function_minimization/evaluator.py |  4 +---
 openevolve/controller.py                    |  4 +---
 openevolve/database.py                      | 12 +++---------
 openevolve/prompt/sampler.py                | 10 +++++-----
 openevolve/utils/format_utils.py            | 16 ++++++++--------
 openevolve/utils/metrics_utils.py           | 18 +++++++++---------
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/examples/function_minimization/evaluator.py b/examples/function_minimization/evaluator.py
index 6bb1f9d90..e48d54136 100644
--- a/examples/function_minimization/evaluator.py
+++ b/examples/function_minimization/evaluator.py
@@ -275,9 +275,7 @@ def evaluate_stage1(program_path):
                 )
                 return {"runs_successfully": 0.0, "error": "Invalid result format"}
         else:
-            print(
-                f"Stage 1: Invalid result format, expected tuple but got {type(result)}"
-            )
+            print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
         # Ensure all values are float
diff --git a/openevolve/controller.py b/openevolve/controller.py
index 42fbd49a7..c08194378 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -308,9 +308,7 @@ async def run(
                     logger.info(
                         f"🌟 New best solution found at iteration {i+1}: {child_program.id}"
                     )
-                    logger.info(
-                        f"Metrics: {format_metrics_safe(child_program.metrics)}"
-                    )
+                    logger.info(f"Metrics: {format_metrics_safe(child_program.metrics)}")
 
                 # Save checkpoint
                 if (i + 1) % self.config.checkpoint_interval == 0:
diff --git a/openevolve/database.py b/openevolve/database.py
index d25b55159..983138d94 100644
--- a/openevolve/database.py
+++ b/openevolve/database.py
@@ -508,9 +508,7 @@ def _update_archive(self, program: Program) -> None:
 
         # Otherwise, find worst program in archive
         archive_programs = [self.programs[pid] for pid in self.archive]
-        worst_program = min(
-            archive_programs, key=lambda p: safe_numeric_average(p.metrics)
-        )
+        worst_program = min(archive_programs, key=lambda p: safe_numeric_average(p.metrics))
 
         # Replace if new program is better
         if self._is_better(program, worst_program):
@@ -812,9 +810,7 @@ def migrate_programs(self) -> None:
 
             # Sort by fitness (using combined_score or average metrics)
             island_programs.sort(
-                key=lambda p: p.metrics.get(
-                    "combined_score", safe_numeric_average(p.metrics)
-                ),
+                key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
                 reverse=True,
             )
 
@@ -859,9 +855,7 @@ def get_island_stats(self) -> List[dict]:
 
             if island_programs:
                 scores = [
-                    p.metrics.get(
-                        "combined_score", safe_numeric_average(p.metrics)
-                    )
+                    p.metrics.get("combined_score", safe_numeric_average(p.metrics))
                     for p in island_programs
                 ]
 
diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index 594f4ffce..a05910b98 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -246,27 +246,27 @@ def _format_evolution_history(
 
             # Safely compare only numeric metrics
             program_metrics = program.get("metrics", {})
-
+
             # Check if all numeric metrics improved
             numeric_comparisons_improved = []
             numeric_comparisons_regressed = []
-
+
             for m in program_metrics:
                 prog_value = program_metrics.get(m, 0)
                 parent_value = parent_metrics.get(m, 0)
-
+
                 # Only compare if both values are numeric
                 if isinstance(prog_value, (int, float)) and isinstance(parent_value, (int, float)):
                     if prog_value >= parent_value:
                         numeric_comparisons_improved.append(True)
                     else:
                         numeric_comparisons_improved.append(False)
-
+
                     if prog_value <= parent_value:
                         numeric_comparisons_regressed.append(True)
                     else:
                         numeric_comparisons_regressed.append(False)
-
+
             # Determine outcome based on numeric comparisons
             if numeric_comparisons_improved and all(numeric_comparisons_improved):
                 outcome = "Improvement in all metrics"
diff --git a/openevolve/utils/format_utils.py b/openevolve/utils/format_utils.py
index 8f0c5be74..4dd83c7ac 100644
--- a/openevolve/utils/format_utils.py
+++ b/openevolve/utils/format_utils.py
@@ -8,16 +8,16 @@
 def format_metrics_safe(metrics: Dict[str, Any]) -> str:
     """
     Safely format metrics dictionary for logging, handling both numeric and string values.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Formatted string representation of metrics
     """
     if not metrics:
         return ""
-
+
     formatted_parts = []
     for name, value in metrics.items():
         # Check if value is numeric (int, float)
@@ -31,24 +31,24 @@ def format_metrics_safe(metrics: Dict[str, Any]) -> str:
         else:
             # For non-numeric values (strings, etc.), just convert to string
             formatted_parts.append(f"{name}={value}")
-
+
     return ", ".join(formatted_parts)
 
 
 def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[str, Any]) -> str:
     """
     Safely format improvement metrics for logging.
-
+
     Args:
         parent_metrics: Parent program metrics
         child_metrics: Child program metrics
-
+
     Returns:
         Formatted string representation of improvements
     """
     if not parent_metrics or not child_metrics:
         return ""
-
+
     improvement_parts = []
     for metric, child_value in child_metrics.items():
         if metric in parent_metrics:
@@ -61,5 +61,5 @@ def format_improvement_safe(parent_metrics: Dict[str, Any], child_metrics: Dict[
             except (ValueError, TypeError):
                 # Skip non-numeric comparisons
                 continue
-
+
     return ", ".join(improvement_parts)
diff --git a/openevolve/utils/metrics_utils.py b/openevolve/utils/metrics_utils.py
index 3a349a2b1..f2da61f49 100644
--- a/openevolve/utils/metrics_utils.py
+++ b/openevolve/utils/metrics_utils.py
@@ -9,16 +9,16 @@ def safe_numeric_average(metrics: Dict[str, Any]) -> float:
     """
     Calculate the average of numeric values in a metrics dictionary,
     safely ignoring non-numeric values like strings.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Average of numeric values, or 0.0 if no numeric values found
     """
     if not metrics:
         return 0.0
-
+
     numeric_values = []
     for value in metrics.values():
         if isinstance(value, (int, float)):
@@ -30,10 +30,10 @@ def safe_numeric_average(metrics: Dict[str, Any]) -> float:
             except (ValueError, TypeError, OverflowError):
                 # Skip invalid numeric values
                 continue
-
+
     if not numeric_values:
         return 0.0
-
+
     return sum(numeric_values) / len(numeric_values)
 
 
@@ -41,16 +41,16 @@ def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
     """
     Calculate the sum of numeric values in a metrics dictionary,
     safely ignoring non-numeric values like strings.
-
+
     Args:
         metrics: Dictionary of metric names to values
-
+
     Returns:
         Sum of numeric values, or 0.0 if no numeric values found
     """
     if not metrics:
         return 0.0
-
+
     numeric_sum = 0.0
     for value in metrics.values():
         if isinstance(value, (int, float)):
@@ -62,5 +62,5 @@ def safe_numeric_sum(metrics: Dict[str, Any]) -> float:
             except (ValueError, TypeError, OverflowError):
                 # Skip invalid numeric values
                 continue
-
+
     return numeric_sum