From bf789822f68ecf38d0dddb69db2dd6aa69f7d899 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Sat, 30 Aug 2025 23:02:58 +0800
Subject: [PATCH 1/2] Add timeout when collecting worker futures; reformat tests

---
 openevolve/process_parallel.py             |  14 +-
 tests/test_checkpoint_resume.py            |   4 +-
 tests/test_feature_stats_persistence.py    | 137 ++++++++----------
 tests/test_grid_stability.py               | 139 ++++++++-----------
 tests/test_island_isolation.py             | 154 +++++++++------------
 tests/test_island_parent_consistency.py    |  75 +++++-----
 tests/test_model_parameter_demo.py         |  44 +++---
 tests/test_openai_model_detection.py       |  53 ++++---
 tests/test_process_parallel.py             |   7 +-
 tests/test_prompt_sampler_comprehensive.py | 127 +++++++----------
 10 files changed, 341 insertions(+), 413 deletions(-)

diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py
index c9da41634..4dd78a9ea 100644
--- a/openevolve/process_parallel.py
+++ b/openevolve/process_parallel.py
@@ -8,7 +8,7 @@
 import pickle
 import signal
 import time
-from concurrent.futures import ProcessPoolExecutor, Future
+from concurrent.futures import ProcessPoolExecutor, Future, TimeoutError as FutureTimeoutError
 from dataclasses import dataclass, asdict
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@@ -454,7 +454,9 @@ async def run_evolution(
             future = pending_futures.pop(completed_iteration)
 
             try:
-                result = future.result()
+                # Use evaluator timeout + buffer to gracefully handle stuck processes
+                timeout_seconds = self.config.evaluator.timeout + 30
+                result = future.result(timeout=timeout_seconds)
 
                 if result.error:
                     logger.warning(f"Iteration {completed_iteration} error: {result.error}")
@@ -612,6 +614,14 @@ async def run_evolution(
                                 )
                                 break
 
+            except FutureTimeoutError:
+                logger.error(
+                    f"⏰ Iteration {completed_iteration} timed out after {timeout_seconds}s "
+                    f"(evaluator timeout: {self.config.evaluator.timeout}s + 30s buffer). "
+                    f"Canceling future and continuing with next iteration."
+                )
+                # Best effort only: Future.cancel() cannot stop a call that is already running
+                future.cancel()
             except Exception as e:
                 logger.error(f"Error processing result from iteration {completed_iteration}: {e}")
 
diff --git a/tests/test_checkpoint_resume.py b/tests/test_checkpoint_resume.py
index eba314b7b..fa13a0592 100644
--- a/tests/test_checkpoint_resume.py
+++ b/tests/test_checkpoint_resume.py
@@ -308,7 +308,9 @@ async def run_test():
                 )
 
                 # Mock the parallel controller to avoid API calls
-                with patch("openevolve.controller.ProcessParallelController") as mock_parallel_class:
+                with patch(
+                    "openevolve.controller.ProcessParallelController"
+                ) as mock_parallel_class:
                     mock_parallel = MagicMock()
                     mock_parallel.run_evolution = AsyncMock(return_value=None)
                     mock_parallel.start = MagicMock()
diff --git a/tests/test_feature_stats_persistence.py b/tests/test_feature_stats_persistence.py
index 0b508e05d..236f09cf6 100644
--- a/tests/test_feature_stats_persistence.py
+++ b/tests/test_feature_stats_persistence.py
@@ -22,7 +22,7 @@ def setUp(self):
         self.config = DatabaseConfig(
             db_path=self.test_dir,
             feature_dimensions=["score", "custom_metric1", "custom_metric2"],
-            feature_bins=10
+            feature_bins=10,
         )
 
     def tearDown(self):
@@ -33,7 +33,7 @@ def test_feature_stats_saved_and_loaded(self):
         """Test that feature_stats are correctly saved and loaded from checkpoints"""
         # Create database and add programs to build feature_stats
         db1 = ProgramDatabase(self.config)
-        
+
         programs = []
         for i in range(5):
             program = Program(
@@ -42,8 +42,8 @@ def test_feature_stats_saved_and_loaded(self):
                 metrics={
                     "combined_score": 0.1 + i * 0.2,
                     "custom_metric1": 10 + i * 20,
-                    "custom_metric2": 100 + i * 50
-                }
+                    "custom_metric2": 100 + i * 50,
+                },
             )
             programs.append(program)
             db1.add(program)
@@ -52,14 +52,10 @@ def test_feature_stats_saved_and_loaded(self):
         self.assertIn("score", db1.feature_stats)
         self.assertIn("custom_metric1", db1.feature_stats)
         self.assertIn("custom_metric2", db1.feature_stats)
-        
+
         # Store original feature_stats for comparison
         original_stats = {
-            dim: {
-                "min": stats["min"],
-                "max": stats["max"],
-                "values": stats["values"].copy()
-            }
+            dim: {"min": stats["min"], "max": stats["max"], "values": stats["values"].copy()}
             for dim, stats in db1.feature_stats.items()
         }
 
@@ -72,11 +68,11 @@ def test_feature_stats_saved_and_loaded(self):
 
         # Verify feature_stats were loaded correctly
         self.assertEqual(len(db2.feature_stats), len(original_stats))
-        
+
         for dim, original in original_stats.items():
             self.assertIn(dim, db2.feature_stats)
             loaded = db2.feature_stats[dim]
-            
+
             self.assertAlmostEqual(loaded["min"], original["min"], places=5)
             self.assertAlmostEqual(loaded["max"], original["max"], places=5)
             self.assertEqual(loaded["values"], original["values"])
@@ -84,21 +80,21 @@ def test_feature_stats_saved_and_loaded(self):
     def test_empty_feature_stats_handling(self):
         """Test handling of empty feature_stats"""
         db1 = ProgramDatabase(self.config)
-        
+
         # Save without any programs (empty feature_stats)
         db1.save(self.test_dir, iteration=1)
-        
+
         # Load and verify
         db2 = ProgramDatabase(self.config)
         db2.load(self.test_dir)
-        
+
         self.assertEqual(db2.feature_stats, {})
 
     def test_backward_compatibility_missing_feature_stats(self):
         """Test loading checkpoints that don't have feature_stats (backward compatibility)"""
         # Create a checkpoint manually without feature_stats
         os.makedirs(self.test_dir, exist_ok=True)
-        
+
         # Create metadata without feature_stats (simulating old checkpoint)
         metadata = {
             "feature_map": {},
@@ -112,60 +108,48 @@ def test_backward_compatibility_missing_feature_stats(self):
             "last_migration_generation": 0,
             # Note: no "feature_stats" key
         }
-        
+
         with open(os.path.join(self.test_dir, "metadata.json"), "w") as f:
             json.dump(metadata, f)
-        
+
         # Load should work without errors
         db = ProgramDatabase(self.config)
         db.load(self.test_dir)
-        
+
         # feature_stats should be empty but not None
         self.assertEqual(db.feature_stats, {})
 
     def test_feature_stats_serialization_edge_cases(self):
         """Test feature_stats serialization handles edge cases correctly"""
         db = ProgramDatabase(self.config)
-        
+
         # Test with various edge cases
         db.feature_stats = {
-            "normal_case": {
-                "min": 1.0,
-                "max": 10.0,
-                "values": [1.0, 5.0, 10.0]
-            },
-            "single_value": {
-                "min": 5.0,
-                "max": 5.0,
-                "values": [5.0]
-            },
+            "normal_case": {"min": 1.0, "max": 10.0, "values": [1.0, 5.0, 10.0]},
+            "single_value": {"min": 5.0, "max": 5.0, "values": [5.0]},
             "large_values_list": {
                 "min": 0.0,
                 "max": 200.0,
-                "values": list(range(200))  # Should be truncated to 100
+                "values": list(range(200)),  # Should be truncated to 100
             },
-            "empty_values": {
-                "min": 0.0,
-                "max": 1.0,
-                "values": []
-            }
+            "empty_values": {"min": 0.0, "max": 1.0, "values": []},
         }
-        
+
         # Test serialization
         serialized = db._serialize_feature_stats()
-        
+
         # Check that large values list was truncated
         self.assertLessEqual(len(serialized["large_values_list"]["values"]), 100)
-        
+
         # Test deserialization
         deserialized = db._deserialize_feature_stats(serialized)
-        
+
         # Verify structure is maintained
         self.assertIn("normal_case", deserialized)
         self.assertIn("single_value", deserialized)
         self.assertIn("large_values_list", deserialized)
         self.assertIn("empty_values", deserialized)
-        
+
         # Verify types are correct
         for dim, stats in deserialized.items():
             self.assertIsInstance(stats["min"], float)
@@ -176,9 +160,9 @@ def test_feature_stats_preservation_during_load(self):
         """Test that feature_stats ranges are preserved when loading from checkpoint"""
         # Create database with programs
         db1 = ProgramDatabase(self.config)
-        
+
         test_programs = []
-        
+
         for i in range(3):
             program = Program(
                 id=f"stats_test_{i}",
@@ -186,8 +170,8 @@ def test_feature_stats_preservation_during_load(self):
                 metrics={
                     "combined_score": 0.2 + i * 0.3,
                     "custom_metric1": 20 + i * 30,
-                    "custom_metric2": 200 + i * 100
-                }
+                    "custom_metric2": 200 + i * 100,
+                },
             )
             test_programs.append(program)
             db1.add(program)
@@ -195,10 +179,7 @@ def test_feature_stats_preservation_during_load(self):
         # Record original feature ranges
         original_ranges = {}
         for dim, stats in db1.feature_stats.items():
-            original_ranges[dim] = {
-                "min": stats["min"],
-                "max": stats["max"]
-            }
+            original_ranges[dim] = {"min": stats["min"], "max": stats["max"]}
 
         # Save checkpoint
         db1.save(self.test_dir, iteration=50)
@@ -211,31 +192,35 @@ def test_feature_stats_preservation_during_load(self):
         for dim, original_range in original_ranges.items():
             self.assertIn(dim, db2.feature_stats)
             loaded_stats = db2.feature_stats[dim]
-            
+
             self.assertAlmostEqual(
-                loaded_stats["min"], original_range["min"], places=5,
-                msg=f"Min value changed for {dim}: {original_range['min']} -> {loaded_stats['min']}"
+                loaded_stats["min"],
+                original_range["min"],
+                places=5,
+                msg=f"Min value changed for {dim}: {original_range['min']} -> {loaded_stats['min']}",
             )
             self.assertAlmostEqual(
-                loaded_stats["max"], original_range["max"], places=5,
-                msg=f"Max value changed for {dim}: {original_range['max']} -> {loaded_stats['max']}"
+                loaded_stats["max"],
+                original_range["max"],
+                places=5,
+                msg=f"Max value changed for {dim}: {original_range['max']} -> {loaded_stats['max']}",
             )
-        
+
         # Test that adding a new program within existing ranges doesn't break anything
         new_program = Program(
             id="range_test",
             code="# Program to test range stability",
             metrics={
                 "combined_score": 0.35,  # Within existing range
-                "custom_metric1": 35,    # Within existing range
-                "custom_metric2": 250    # Within existing range
-            }
+                "custom_metric1": 35,  # Within existing range
+                "custom_metric2": 250,  # Within existing range
+            },
         )
-        
+
         # Adding this program should not cause issues
         db2.add(new_program)
         new_coords = db2._calculate_feature_coords(new_program)
-        
+
         # Should get valid coordinates
         self.assertEqual(len(new_coords), len(self.config.feature_dimensions))
         for coord in new_coords:
@@ -245,25 +230,25 @@ def test_feature_stats_preservation_during_load(self):
     def test_feature_stats_with_numpy_types(self):
         """Test that numpy types are correctly handled in serialization"""
         import numpy as np
-        
+
         db = ProgramDatabase(self.config)
-        
+
         # Simulate feature_stats with numpy types
         db.feature_stats = {
             "numpy_test": {
                 "min": np.float64(1.5),
                 "max": np.float64(9.5),
-                "values": [np.float64(x) for x in [1.5, 5.0, 9.5]]
+                "values": [np.float64(x) for x in [1.5, 5.0, 9.5]],
             }
         }
-        
+
         # Test serialization doesn't fail
         serialized = db._serialize_feature_stats()
-        
+
         # Verify numpy types were converted to Python types
         self.assertIsInstance(serialized["numpy_test"]["min"], float)
         self.assertIsInstance(serialized["numpy_test"]["max"], float)
-        
+
         # Test deserialization
         deserialized = db._deserialize_feature_stats(serialized)
         self.assertIsInstance(deserialized["numpy_test"]["min"], float)
@@ -272,32 +257,28 @@ def test_feature_stats_with_numpy_types(self):
     def test_malformed_feature_stats_handling(self):
         """Test handling of malformed feature_stats during deserialization"""
         db = ProgramDatabase(self.config)
-        
+
         # Test with malformed data
         malformed_data = {
-            "valid_entry": {
-                "min": 1.0,
-                "max": 10.0,
-                "values": [1.0, 5.0, 10.0]
-            },
+            "valid_entry": {"min": 1.0, "max": 10.0, "values": [1.0, 5.0, 10.0]},
             "invalid_entry": "this is not a dict",
             "missing_keys": {
                 "min": 1.0
                 # missing "max" and "values"
-            }
+            },
         }
-        
-        with patch('openevolve.database.logger') as mock_logger:
+
+        with patch("openevolve.database.logger") as mock_logger:
             deserialized = db._deserialize_feature_stats(malformed_data)
-        
+
         # Should have valid entry and skip invalid ones
         self.assertIn("valid_entry", deserialized)
         self.assertNotIn("invalid_entry", deserialized)
         self.assertIn("missing_keys", deserialized)  # Should be created with defaults
-        
+
         # Should have logged warning for invalid entry
         mock_logger.warning.assert_called()
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/test_grid_stability.py b/tests/test_grid_stability.py
index 514d89b24..86a387511 100644
--- a/tests/test_grid_stability.py
+++ b/tests/test_grid_stability.py
@@ -27,24 +27,22 @@ def test_feature_ranges_preserved_across_checkpoints(self):
         config = DatabaseConfig(
             db_path=self.test_dir,
             feature_dimensions=["score", "prompt_length", "reasoning_sophistication"],
-            feature_bins=5  # Use smaller bins for easier testing
+            feature_bins=5,  # Use smaller bins for easier testing
         )
 
         # Phase 1: Create initial population with specific range
         db1 = ProgramDatabase(config)
-        
+
         # Create programs with known metrics to establish ranges
         test_cases = [
             {"combined_score": 0.2, "prompt_length": 100, "reasoning_sophistication": 0.1},
             {"combined_score": 0.5, "prompt_length": 300, "reasoning_sophistication": 0.5},
             {"combined_score": 0.8, "prompt_length": 500, "reasoning_sophistication": 0.9},
         ]
-        
+
         for i, metrics in enumerate(test_cases):
             program = Program(
-                id=f"range_test_{i}",
-                code=f"# Range test program {i}",
-                metrics=metrics
+                id=f"range_test_{i}", code=f"# Range test program {i}", metrics=metrics
             )
             db1.add(program)
 
@@ -54,7 +52,7 @@ def test_feature_ranges_preserved_across_checkpoints(self):
             original_ranges[dim] = {
                 "min": stats["min"],
                 "max": stats["max"],
-                "value_count": len(stats["values"])
+                "value_count": len(stats["values"]),
             }
 
         # Save checkpoint
@@ -71,14 +69,18 @@ def test_feature_ranges_preserved_across_checkpoints(self):
         for dim, original_range in original_ranges.items():
             self.assertIn(dim, db2.feature_stats)
             loaded_stats = db2.feature_stats[dim]
-            
+
             self.assertAlmostEqual(
-                loaded_stats["min"], original_range["min"], places=5,
-                msg=f"Min range changed for {dim}"
+                loaded_stats["min"],
+                original_range["min"],
+                places=5,
+                msg=f"Min range changed for {dim}",
             )
             self.assertAlmostEqual(
-                loaded_stats["max"], original_range["max"], places=5,
-                msg=f"Max range changed for {dim}"
+                loaded_stats["max"],
+                original_range["max"],
+                places=5,
+                msg=f"Max range changed for {dim}",
             )
 
         # Phase 3: Add new program within existing range - ranges should not contract
@@ -87,39 +89,35 @@ def test_feature_ranges_preserved_across_checkpoints(self):
             code="# New program within established range",
             metrics={
                 "combined_score": 0.35,  # Between existing values
-                "prompt_length": 200,    # Between existing values  
-                "reasoning_sophistication": 0.3  # Between existing values
-            }
+                "prompt_length": 200,  # Between existing values
+                "reasoning_sophistication": 0.3,  # Between existing values
+            },
         )
-        
+
         # Add new program
         db2.add(new_program)
         new_coords = db2._calculate_feature_coords(new_program)
-        
+
         # Verify ranges did not contract (should be same or expanded)
         for dim, original_range in original_ranges.items():
             current_stats = db2.feature_stats[dim]
-            
+
             self.assertLessEqual(
-                current_stats["min"], original_range["min"],
-                f"Min range contracted for {dim}"
+                current_stats["min"], original_range["min"], f"Min range contracted for {dim}"
             )
             self.assertGreaterEqual(
-                current_stats["max"], original_range["max"],
-                f"Max range contracted for {dim}"
+                current_stats["max"], original_range["max"], f"Max range contracted for {dim}"
             )
 
     def test_grid_expansion_behavior(self):
         """Test that grid expands correctly when new programs exceed existing ranges"""
         config = DatabaseConfig(
-            db_path=self.test_dir,
-            feature_dimensions=["score", "execution_time"],
-            feature_bins=5
+            db_path=self.test_dir, feature_dimensions=["score", "execution_time"], feature_bins=5
         )
 
         # Phase 1: Establish initial range
         db1 = ProgramDatabase(config)
-        
+
         # Initial programs with limited range
         for i in range(3):
             program = Program(
@@ -127,8 +125,8 @@ def test_grid_expansion_behavior(self):
                 code=f"# Initial program {i}",
                 metrics={
                     "combined_score": 0.4 + i * 0.1,  # 0.4 to 0.6
-                    "execution_time": 10 + i * 5      # 10 to 20
-                }
+                    "execution_time": 10 + i * 5,  # 10 to 20
+                },
             )
             db1.add(program)
 
@@ -156,76 +154,67 @@ def test_grid_expansion_behavior(self):
             id="expansion_test",
             code="# Program to test range expansion",
             metrics={
-                "combined_score": 0.9,    # Higher than existing max (0.6)
-                "execution_time": 50      # Higher than existing max (20)
-            }
+                "combined_score": 0.9,  # Higher than existing max (0.6)
+                "execution_time": 50,  # Higher than existing max (20)
+            },
         )
-        
+
         db2.add(expansion_program)
 
         # Verify ranges expanded appropriately
         self.assertLessEqual(db2.feature_stats["score"]["min"], original_score_min)
         self.assertGreaterEqual(db2.feature_stats["score"]["max"], 0.9)
-        self.assertLessEqual(db2.feature_stats["execution_time"]["min"], original_time_min)  
+        self.assertLessEqual(db2.feature_stats["execution_time"]["min"], original_time_min)
         self.assertGreaterEqual(db2.feature_stats["execution_time"]["max"], 50)
 
     def test_feature_stats_consistency_across_cycles(self):
         """Test that feature_stats remain consistent across multiple save/load cycles"""
         config = DatabaseConfig(
-            db_path=self.test_dir,
-            feature_dimensions=["score", "memory_usage"],
-            feature_bins=4
+            db_path=self.test_dir, feature_dimensions=["score", "memory_usage"], feature_bins=4
         )
 
         # Initial program to establish baseline
         reference_program = Program(
             id="reference",
             code="# Reference program for consistency testing",
-            metrics={
-                "combined_score": 0.5,
-                "memory_usage": 1024
-            }
+            metrics={"combined_score": 0.5, "memory_usage": 1024},
         )
 
         # Cycle 1: Establish initial feature stats
         db1 = ProgramDatabase(config)
         db1.add(reference_program)
-        
+
         # Record initial feature stats
         cycle1_stats = {}
         for dim, stats in db1.feature_stats.items():
-            cycle1_stats[dim] = {
-                "min": stats["min"],
-                "max": stats["max"]
-            }
-        
+            cycle1_stats[dim] = {"min": stats["min"], "max": stats["max"]}
+
         db1.save(self.test_dir, iteration=10)
 
         # Cycle 2: Load and verify stats preservation
         db2 = ProgramDatabase(config)
         db2.load(self.test_dir)
-        
+
         # Verify feature stats were preserved
         for dim, original_stats in cycle1_stats.items():
             self.assertIn(dim, db2.feature_stats)
             self.assertAlmostEqual(db2.feature_stats[dim]["min"], original_stats["min"])
             self.assertAlmostEqual(db2.feature_stats[dim]["max"], original_stats["max"])
-        
+
         # Add another program and save again
-        db2.add(Program(
-            id="cycle2_program",
-            code="# Cycle 2 program",
-            metrics={"combined_score": 0.3, "memory_usage": 512}
-        ))
-        
+        db2.add(
+            Program(
+                id="cycle2_program",
+                code="# Cycle 2 program",
+                metrics={"combined_score": 0.3, "memory_usage": 512},
+            )
+        )
+
         # Record expanded stats after adding new program
         cycle2_stats = {}
         for dim, stats in db2.feature_stats.items():
-            cycle2_stats[dim] = {
-                "min": stats["min"],
-                "max": stats["max"]
-            }
-        
+            cycle2_stats[dim] = {"min": stats["min"], "max": stats["max"]}
+
         db2.save(self.test_dir, iteration=20)
 
         # Cycle 3: Verify stats are still preserved
@@ -236,33 +225,30 @@ def test_feature_stats_consistency_across_cycles(self):
         for dim, cycle2_stats_dim in cycle2_stats.items():
             self.assertIn(dim, db3.feature_stats)
             self.assertAlmostEqual(
-                db3.feature_stats[dim]["min"], cycle2_stats_dim["min"],
-                msg=f"Min value changed for {dim} in cycle 3"
+                db3.feature_stats[dim]["min"],
+                cycle2_stats_dim["min"],
+                msg=f"Min value changed for {dim} in cycle 3",
             )
             self.assertAlmostEqual(
-                db3.feature_stats[dim]["max"], cycle2_stats_dim["max"],
-                msg=f"Max value changed for {dim} in cycle 3"
+                db3.feature_stats[dim]["max"],
+                cycle2_stats_dim["max"],
+                msg=f"Max value changed for {dim} in cycle 3",
             )
 
     def test_feature_stats_accumulation(self):
         """Test that feature_stats accumulate correctly across checkpoint cycles"""
         config = DatabaseConfig(
-            db_path=self.test_dir,
-            feature_dimensions=["score", "complexity"],
-            feature_bins=10
+            db_path=self.test_dir, feature_dimensions=["score", "complexity"], feature_bins=10
         )
 
         # Cycle 1: Initial programs
         db1 = ProgramDatabase(config)
-        
+
         for i in range(3):
             program = Program(
                 id=f"phase1_{i}",
                 code=f"# Phase 1 program {i}",
-                metrics={
-                    "combined_score": 0.2 + i * 0.2,
-                    "complexity": 100 + i * 50
-                }
+                metrics={"combined_score": 0.2 + i * 0.2, "complexity": 100 + i * 50},
             )
             db1.add(program)
 
@@ -280,10 +266,7 @@ def test_feature_stats_accumulation(self):
             program = Program(
                 id=f"phase2_{i}",
                 code=f"# Phase 2 program {i}",
-                metrics={
-                    "combined_score": 0.1 + i * 0.3,
-                    "complexity": 75 + i * 75
-                }
+                metrics={"combined_score": 0.1 + i * 0.3, "complexity": 75 + i * 75},
             )
             db2.add(program)
 
@@ -294,13 +277,13 @@ def test_feature_stats_accumulation(self):
         # Phase 1 values should be preserved (subset relationship)
         self.assertTrue(
             phase1_score_values.issubset(phase2_score_values),
-            "Phase 1 score values were lost after loading checkpoint"
+            "Phase 1 score values were lost after loading checkpoint",
         )
         self.assertTrue(
             phase1_complexity_values.issubset(phase2_complexity_values),
-            "Phase 1 complexity values were lost after loading checkpoint"
+            "Phase 1 complexity values were lost after loading checkpoint",
         )
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/test_island_isolation.py b/tests/test_island_isolation.py
index d70459a4e..2ed5b632f 100644
--- a/tests/test_island_isolation.py
+++ b/tests/test_island_isolation.py
@@ -20,16 +20,14 @@ def setUp(self):
         self.config.database.num_islands = 3
         self.config.evaluator.parallel_evaluations = 6  # 2 workers per island
         self.config.database.population_size = 30
-        
+
         self.database = ProgramDatabase(self.config.database)
         self.evaluation_file = "mock_evaluator.py"
 
     def test_worker_island_mapping(self):
         """Test that workers are correctly mapped to islands"""
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Check mapping is correct
         expected_mapping = {
             0: 0,  # Worker 0 -> Island 0
@@ -39,182 +37,164 @@ def test_worker_island_mapping(self):
             4: 1,  # Worker 4 -> Island 1
             5: 2,  # Worker 5 -> Island 2
         }
-        
+
         self.assertEqual(controller.worker_island_map, expected_mapping)
 
     def test_uneven_worker_distribution(self):
         """Test mapping when workers don't divide evenly into islands"""
         self.config.evaluator.parallel_evaluations = 7  # Not divisible by 3
-        
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Island 0 should get 3 workers, islands 1 and 2 get 2 each
         island_worker_counts = {0: 0, 1: 0, 2: 0}
         for worker_id, island_id in controller.worker_island_map.items():
             island_worker_counts[island_id] += 1
-        
+
         self.assertEqual(island_worker_counts[0], 3)
         self.assertEqual(island_worker_counts[1], 2)
         self.assertEqual(island_worker_counts[2], 2)
 
     def test_submit_iteration_uses_correct_island(self):
         """Test that _submit_iteration samples from the specified island"""
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Add some test programs to different islands
         for i in range(9):
             program = Program(
-                id=f"test_prog_{i}",
-                code=f"# Test program {i}",
-                metrics={"combined_score": 0.5}
+                id=f"test_prog_{i}", code=f"# Test program {i}", metrics={"combined_score": 0.5}
             )
             island_id = i % 3
             program.metadata["island"] = island_id
             self.database.add(program)
             self.database.islands[island_id].add(program.id)
-        
-        with patch.object(controller, 'executor') as mock_executor:
+
+        with patch.object(controller, "executor") as mock_executor:
             mock_future = MagicMock()
             mock_executor.submit.return_value = mock_future
-            
+
             # Submit iteration for island 1
             original_island = self.database.current_island
             future = controller._submit_iteration(100, island_id=1)
-            
+
             # Check that database island was temporarily changed
             # but restored after sampling
             self.assertEqual(self.database.current_island, original_island)
-            
+
             # Check that submit was called
             self.assertIsNotNone(future)
             mock_executor.submit.assert_called_once()
-            
+
             # Get the snapshot that was passed to worker
             call_args = mock_executor.submit.call_args[0]
             db_snapshot = call_args[2]  # Third argument is db_snapshot
-            
+
             # Verify snapshot has island marking
             self.assertEqual(db_snapshot["sampling_island"], 1)
 
     def test_island_isolation_during_evolution(self):
         """Test that parallel workers maintain island isolation"""
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Track which islands were sampled
         sampled_islands = []
-        
+
         def mock_sample(num_inspirations=None):
             # Record which island was sampled
             sampled_islands.append(self.database.current_island)
             # Return mock parent and inspirations
             mock_program = Program(id="mock", code="", metrics={})
             return mock_program, []
-        
-        with patch.object(self.database, 'sample', side_effect=mock_sample):
-            with patch.object(controller, 'executor'):
+
+        with patch.object(self.database, "sample", side_effect=mock_sample):
+            with patch.object(controller, "executor"):
                 # Submit iterations for different islands
                 controller._submit_iteration(1, island_id=0)
                 controller._submit_iteration(2, island_id=1)
                 controller._submit_iteration(3, island_id=2)
                 controller._submit_iteration(4, island_id=0)
-                
+
                 # Check that correct islands were sampled
                 self.assertEqual(sampled_islands, [0, 1, 2, 0])
 
     def test_fewer_workers_than_islands(self):
         """Test handling when there are fewer workers than islands"""
         self.config.evaluator.parallel_evaluations = 2  # Only 2 workers for 3 islands
-        
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Workers should be distributed across available islands
         expected_mapping = {
             0: 0,  # Worker 0 -> Island 0
             1: 1,  # Worker 1 -> Island 1
             # Island 2 has no dedicated worker
         }
-        
+
         self.assertEqual(controller.worker_island_map, expected_mapping)
 
     def test_database_current_island_restoration(self):
         """Test that database current_island is properly restored after sampling"""
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Add test programs
         for i in range(6):
             program = Program(
-                id=f"test_prog_{i}",
-                code=f"# Test program {i}",
-                metrics={"combined_score": 0.5}
+                id=f"test_prog_{i}", code=f"# Test program {i}", metrics={"combined_score": 0.5}
             )
             island_id = i % 3
             program.metadata["island"] = island_id
             self.database.add(program)
             self.database.islands[island_id].add(program.id)
-        
+
         # Set initial island
         self.database.current_island = 1
         original_island = self.database.current_island
-        
-        with patch.object(controller, 'executor') as mock_executor:
+
+        with patch.object(controller, "executor") as mock_executor:
             mock_executor.submit.return_value = MagicMock()
-            
+
             # Submit iteration for different island
             controller._submit_iteration(100, island_id=2)
-            
+
             # Check that current_island was restored
             self.assertEqual(self.database.current_island, original_island)
 
     def test_island_distribution_in_batch(self):
         """Test that initial batch is distributed across islands"""
-        controller = ProcessParallelController(
-            self.config, self.evaluation_file, self.database
-        )
-        
+        controller = ProcessParallelController(self.config, self.evaluation_file, self.database)
+
         # Add test programs
         for i in range(9):
             program = Program(
-                id=f"test_prog_{i}",
-                code=f"# Test program {i}",
-                metrics={"combined_score": 0.5}
+                id=f"test_prog_{i}", code=f"# Test program {i}", metrics={"combined_score": 0.5}
             )
             island_id = i % 3
             program.metadata["island"] = island_id
             self.database.add(program)
             self.database.islands[island_id].add(program.id)
-        
+
         # Track submitted islands
         submitted_islands = []
-        
+
         def mock_submit_iteration(iteration, island_id=None):
             if island_id is not None:
                 submitted_islands.append(island_id)
             return MagicMock()
-        
+
         # Start the process pool
         controller.start()
-        
+
         try:
-            with patch.object(controller, '_submit_iteration', side_effect=mock_submit_iteration):
+            with patch.object(controller, "_submit_iteration", side_effect=mock_submit_iteration):
                 # Start evolution with small batch to test distribution
                 asyncio.run(controller.run_evolution(1, 6))  # 6 iterations
-                
+
                 # Check that islands were distributed (expect round-robin pattern)
                 # Should be [0, 1, 2, 0, 1, 2] or similar distribution
                 island_counts = {0: 0, 1: 0, 2: 0}
                 for island_id in submitted_islands:
                     island_counts[island_id] += 1
-                
+
                 # Each island should have received iterations
                 for count in island_counts.values():
                     self.assertGreater(count, 0)
@@ -224,7 +204,7 @@ def mock_submit_iteration(iteration, island_id=None):
 
 class TestIslandMigration(unittest.TestCase):
     """Test that migration still works with island pinning"""
-    
+
     def setUp(self):
         """Set up test environment"""
         self.config = Config()
@@ -232,49 +212,47 @@ def setUp(self):
         self.config.database.migration_interval = 10
         self.config.database.migration_rate = 0.1
         self.database = ProgramDatabase(self.config.database)
-    
+
     def test_migration_preserves_island_structure(self):
         """Test that migration works correctly with pinned workers"""
         # Add programs to islands properly
         for i in range(30):
             program = Program(
-                id=f"prog_{i}",
-                code=f"# Program {i}",
-                metrics={"combined_score": i * 0.1}
+                id=f"prog_{i}", code=f"# Program {i}", metrics={"combined_score": i * 0.1}
             )
             island_id = i % 3
             program.metadata["island"] = island_id
-            
+
             # Add to database
             self.database.programs[program.id] = program
             # Add to island
             self.database.islands[island_id].add(program.id)
-        
+
         # Record island populations before migration
         island_sizes_before = [len(island) for island in self.database.islands]
         original_program_count = len(self.database.programs)
-        
+
         # Verify we set up the test correctly
         self.assertEqual(sum(island_sizes_before), 30)
         self.assertEqual(original_program_count, 30)
-        
+
         # Trigger migration
         self.database.migrate_programs()
-        
+
         # Check islands still have programs
         island_sizes_after = [len(island) for island in self.database.islands]
         total_programs_after = len(self.database.programs)
-        
+
         # All islands should still have programs
         for size in island_sizes_after:
             self.assertGreater(size, 0)
-        
+
         # Migration creates copies, so total population should increase
         # With migration_rate=0.1 and 10 programs per island, expect ~1 program per island to migrate
         # Each program migrates to 2 adjacent islands, so we expect ~6 new programs
         self.assertGreater(total_programs_after, original_program_count)
         self.assertGreater(sum(island_sizes_after), sum(island_sizes_before))
-        
+
         # Verify that migrant programs have correct metadata
         migrant_count = 0
         for program in self.database.programs.values():
@@ -282,40 +260,40 @@ def test_migration_preserves_island_structure(self):
                 migrant_count += 1
                 # Migrant should have "_migrant_" in their ID
                 self.assertIn("_migrant_", program.id)
-        
+
         # Should have some migrant programs
         self.assertGreater(migrant_count, 0)
 
 
 class TestWorkerPinningEdgeCases(unittest.TestCase):
     """Test edge cases for worker-to-island pinning"""
-    
+
     def test_single_island(self):
         """Test behavior with only one island"""
         config = Config()
         config.database.num_islands = 1
         config.evaluator.parallel_evaluations = 4
-        
+
         database = ProgramDatabase(config.database)
         controller = ProcessParallelController(config, "test.py", database)
-        
+
         # All workers should map to island 0
         expected_mapping = {0: 0, 1: 0, 2: 0, 3: 0}
         self.assertEqual(controller.worker_island_map, expected_mapping)
-    
+
     def test_single_worker(self):
         """Test behavior with only one worker"""
         config = Config()
         config.database.num_islands = 5
         config.evaluator.parallel_evaluations = 1
-        
+
         database = ProgramDatabase(config.database)
         controller = ProcessParallelController(config, "test.py", database)
-        
+
         # Single worker should map to island 0
         expected_mapping = {0: 0}
         self.assertEqual(controller.worker_island_map, expected_mapping)
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/test_island_parent_consistency.py b/tests/test_island_parent_consistency.py
index ad6bd385e..d15eeb704 100644
--- a/tests/test_island_parent_consistency.py
+++ b/tests/test_island_parent_consistency.py
@@ -15,63 +15,60 @@ def test_parent_child_island_consistency(self):
         config = Config()
         config.database.num_islands = 3
         database = ProgramDatabase(config.database)
-        
+
         # Create initial program on island 0
         initial_program = Program(
-            id="initial",
-            code="def initial(): pass",
-            metrics={"score": 0.5},
-            iteration_found=0
+            id="initial", code="def initial(): pass", metrics={"score": 0.5}, iteration_found=0
         )
         database.add(initial_program)  # Should go to island 0 (current_island)
-        
+
         # Verify initial program is on island 0
         self.assertIn("initial", database.islands[0])
         self.assertEqual(initial_program.metadata.get("island"), 0)
-        
+
         # Now switch to island 1
         database.next_island()
         self.assertEqual(database.current_island, 1)
-        
+
         # Create a child of the initial program
         child_program = Program(
             id="child1",
             code="def child1(): pass",
             parent_id="initial",  # Parent is on island 0
             metrics={"score": 0.6},
-            iteration_found=1
+            iteration_found=1,
         )
-        
+
         # Add child without specifying target_island
         # This is what happens in process_parallel.py line 445
         database.add(child_program)
-        
+
         # With the fix: child should go to parent's island (0), not current_island (1)
         parent_island = database.programs["initial"].metadata.get("island", 0)
         child_island = database.programs["child1"].metadata.get("island")
-        
+
         # Check if parent is in child's island (this is what the user's assertion checks)
         if child_program.parent_id:
             # This is the exact check from the issue report - should now pass
             self.assertIn(
-                child_program.parent_id, 
+                child_program.parent_id,
                 database.islands[child_island],
-                "Parent should be in child's island"
+                "Parent should be in child's island",
             )
-        
+
         # Verify child is on same island as parent
         self.assertEqual(
-            parent_island, 
+            parent_island,
             child_island,
-            f"Child should be on same island as parent. Parent: island {parent_island}, Child: island {child_island}"
+            f"Child should be on same island as parent. Parent: island {parent_island}, Child: island {child_island}",
         )
-        
+
     def test_multiple_generations_island_drift(self):
         """Test that island drift happens across multiple generations"""
         config = Config()
         config.database.num_islands = 4
         database = ProgramDatabase(config.database)
-        
+
         # Create a lineage
         programs = []
         for i in range(10):
@@ -81,7 +78,7 @@ def test_multiple_generations_island_drift(self):
                     id=f"prog_{i}",
                     code=f"def prog_{i}(): pass",
                     metrics={"score": 0.1 * i},
-                    iteration_found=i
+                    iteration_found=i,
                 )
             else:
                 # Child of previous
@@ -90,16 +87,16 @@ def test_multiple_generations_island_drift(self):
                     code=f"def prog_{i}(): pass",
                     parent_id=f"prog_{i-1}",
                     metrics={"score": 0.1 * i},
-                    iteration_found=i
+                    iteration_found=i,
                 )
-            
+
             database.add(prog)
             programs.append(prog)
-            
+
             # Switch islands periodically (simulating what happens in evolution)
             if i % 3 == 0:
                 database.next_island()
-        
+
         # Check island consistency
         inconsistent_pairs = []
         for prog in programs:
@@ -108,18 +105,18 @@ def test_multiple_generations_island_drift(self):
                 if parent:
                     parent_island = parent.metadata.get("island")
                     child_island = prog.metadata.get("island")
-                    
+
                     # Check if parent is in child's island
                     if prog.parent_id not in database.islands[child_island]:
                         inconsistent_pairs.append((prog.parent_id, prog.id))
-        
+
         # With the fix, we should find NO inconsistent parent-child island assignments
         self.assertEqual(
-            len(inconsistent_pairs), 
+            len(inconsistent_pairs),
             0,
-            f"Found {len(inconsistent_pairs)} inconsistent parent-child pairs: {inconsistent_pairs}"
+            f"Found {len(inconsistent_pairs)} inconsistent parent-child pairs: {inconsistent_pairs}",
         )
-        
+
         # Verify all parent-child pairs are on the same island
         for prog in programs:
             if prog.parent_id:
@@ -131,44 +128,40 @@ def test_multiple_generations_island_drift(self):
                         parent_island,
                         child_island,
                         f"Parent {prog.parent_id} (island {parent_island}) and "
-                        f"child {prog.id} (island {child_island}) should be on same island"
+                        f"child {prog.id} (island {child_island}) should be on same island",
                     )
 
-
     def test_explicit_migration_override(self):
         """Test that explicit target_island overrides parent island inheritance"""
         config = Config()
         config.database.num_islands = 3
         database = ProgramDatabase(config.database)
-        
+
         # Create parent on island 0
         parent = Program(
-            id="parent",
-            code="def parent(): pass",
-            metrics={"score": 0.5},
-            iteration_found=0
+            id="parent", code="def parent(): pass", metrics={"score": 0.5}, iteration_found=0
         )
         database.add(parent)  # Goes to island 0
         self.assertIn("parent", database.islands[0])
-        
+
         # Create child but explicitly send to island 2 (migration)
         migrant_child = Program(
             id="migrant",
             code="def migrant(): pass",
             parent_id="parent",
             metrics={"score": 0.7},
-            iteration_found=1
+            iteration_found=1,
         )
         database.add(migrant_child, target_island=2)  # Explicit migration
-        
+
         # Verify migrant went to island 2, not parent's island 0
         self.assertIn("migrant", database.islands[2])
         self.assertNotIn("migrant", database.islands[0])
         self.assertEqual(migrant_child.metadata.get("island"), 2)
-        
+
         # Parent should still be on island 0
         self.assertEqual(database.programs["parent"].metadata.get("island"), 0)
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/test_model_parameter_demo.py b/tests/test_model_parameter_demo.py
index 142fe43e1..6e19c6229 100644
--- a/tests/test_model_parameter_demo.py
+++ b/tests/test_model_parameter_demo.py
@@ -2,69 +2,73 @@
 Demonstration of fixed OpenAI model parameter handling
 """
 
+
 def demo_model_parameter_selection():
     """Demonstrate how different models get different parameters"""
-    
+
     # Mock the logic from openai.py
     OPENAI_REASONING_MODEL_PREFIXES = (
         # O-series reasoning models
-        "o1-", "o1",  # o1, o1-mini, o1-preview
-        "o3-", "o3",  # o3, o3-mini, o3-pro  
-        "o4-",        # o4-mini
+        "o1-",
+        "o1",  # o1, o1-mini, o1-preview
+        "o3-",
+        "o3",  # o3, o3-mini, o3-pro
+        "o4-",  # o4-mini
         # GPT-5 series are also reasoning models
-        "gpt-5-", "gpt-5"  # gpt-5, gpt-5-mini, gpt-5-nano
+        "gpt-5-",
+        "gpt-5",  # gpt-5, gpt-5-mini, gpt-5-nano
     )
-    
+
     def get_params_for_model(model_name, api_base="https://api.openai.com/v1"):
         """Show what parameters would be used for each model"""
         model_lower = str(model_name).lower()
         is_openai_reasoning_model = (
-            api_base == "https://api.openai.com/v1" and 
-            model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
+            api_base == "https://api.openai.com/v1"
+            and model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
         )
-        
+
         if is_openai_reasoning_model:
             return {
                 "type": "reasoning_model",
                 "uses": "max_completion_tokens",
                 "supports": ["reasoning_effort", "verbosity"],
-                "excludes": ["temperature", "top_p"]
+                "excludes": ["temperature", "top_p"],
             }
         else:
             return {
-                "type": "standard_model", 
+                "type": "standard_model",
                 "uses": "max_tokens",
                 "supports": ["temperature", "top_p"],
-                "excludes": []
+                "excludes": [],
             }
-    
+
     print("🔧 OpenAI Model Parameter Selection Demo")
     print("=" * 50)
-    
+
     test_models = [
         # Reasoning models
         ("o1-mini", "✅ Reasoning"),
         ("o1-preview", "✅ Reasoning"),
         ("o3-mini-2025-01-31", "✅ Reasoning (with date)"),
         ("gpt-5-nano", "✅ Reasoning (GPT-5 series)"),
-        
-        # Standard models  
+        # Standard models
         ("gpt-4o-mini", "❌ Standard (not reasoning)"),
         ("gpt-4o", "❌ Standard"),
         ("gpt-4-turbo", "❌ Standard"),
     ]
-    
+
     for model, description in test_models:
         params = get_params_for_model(model)
         print(f"\n📋 Model: {model}")
         print(f"   Type: {description}")
         print(f"   Uses: {params['uses']}")
         print(f"   Supports: {', '.join(params['supports'])}")
-        if params['excludes']:
+        if params["excludes"]:
             print(f"   Excludes: {', '.join(params['excludes'])}")
-    
+
     print("\n" + "=" * 50)
     print("✅ Fix successful! No more false positives/negatives.")
 
+
 if __name__ == "__main__":
-    demo_model_parameter_selection()
\ No newline at end of file
+    demo_model_parameter_selection()
diff --git a/tests/test_openai_model_detection.py b/tests/test_openai_model_detection.py
index fb9b745f0..c8665abd0 100644
--- a/tests/test_openai_model_detection.py
+++ b/tests/test_openai_model_detection.py
@@ -11,23 +11,25 @@ class TestOpenAIReasoningModelDetection(unittest.TestCase):
 
     def test_reasoning_model_detection(self):
         """Test various model names to ensure correct reasoning model detection"""
-        
+
         # Define the same constants as in the code
         OPENAI_REASONING_MODEL_PREFIXES = (
             # O-series reasoning models
-            "o1-", "o1",  # o1, o1-mini, o1-preview
-            "o3-", "o3",  # o3, o3-mini, o3-pro  
-            "o4-",        # o4-mini
+            "o1-",
+            "o1",  # o1, o1-mini, o1-preview
+            "o3-",
+            "o3",  # o3, o3-mini, o3-pro
+            "o4-",  # o4-mini
             # GPT-5 series are also reasoning models
-            "gpt-5-", "gpt-5"  # gpt-5, gpt-5-mini, gpt-5-nano
+            "gpt-5-",
+            "gpt-5",  # gpt-5, gpt-5-mini, gpt-5-nano
         )
-        
+
         def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
             """Test function that mimics the logic in openai.py"""
             model_lower = str(model_name).lower()
-            return (
-                api_base == "https://api.openai.com/v1" and 
-                model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
+            return api_base == "https://api.openai.com/v1" and model_lower.startswith(
+                OPENAI_REASONING_MODEL_PREFIXES
             )
 
         # Test cases: (model_name, expected_result, description)
@@ -44,7 +46,6 @@ def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
             ("gpt-5", True, "Base gpt-5 model"),
             ("gpt-5-mini", True, "gpt-5-mini model"),
             ("gpt-5-nano", True, "gpt-5-nano model"),
-            
             # Non-reasoning models - should return False
             ("gpt-4o-mini", False, "gpt-4o-mini (not reasoning)"),
             ("gpt-4o", False, "gpt-4o (not reasoning)"),
@@ -52,50 +53,46 @@ def is_reasoning_model(model_name, api_base="https://api.openai.com/v1"):
             ("gpt-3.5-turbo", False, "gpt-3.5-turbo (not reasoning)"),
             ("claude-3", False, "Non-OpenAI model"),
             ("gemini-pro", False, "Non-OpenAI model"),
-            
             # Edge cases
             ("O1-MINI", True, "Uppercase o1-mini"),
             ("GPT-5-MINI", True, "Uppercase gpt-5-mini"),
         ]
-        
+
         for model_name, expected, description in test_cases:
             with self.subTest(model=model_name, desc=description):
                 result = is_reasoning_model(model_name)
                 self.assertEqual(
-                    result, 
-                    expected, 
-                    f"Model '{model_name}' ({description}): expected {expected}, got {result}"
+                    result,
+                    expected,
+                    f"Model '{model_name}' ({description}): expected {expected}, got {result}",
                 )
 
     def test_non_openai_api_base(self):
         """Test that non-OpenAI API bases don't trigger reasoning model logic"""
-        OPENAI_REASONING_MODEL_PREFIXES = (
-            "o1-", "o1", "o3-", "o3", "o4-", "gpt-5-", "gpt-5"
-        )
-        
+        OPENAI_REASONING_MODEL_PREFIXES = ("o1-", "o1", "o3-", "o3", "o4-", "gpt-5-", "gpt-5")
+
         def is_reasoning_model(model_name, api_base):
             model_lower = str(model_name).lower()
-            return (
-                api_base == "https://api.openai.com/v1" and 
-                model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
+            return api_base == "https://api.openai.com/v1" and model_lower.startswith(
+                OPENAI_REASONING_MODEL_PREFIXES
             )
-        
+
         # Even reasoning model names should return False for non-OpenAI APIs
         test_cases = [
             ("o1-mini", "https://api.anthropic.com/v1", False),
             ("gpt-5", "https://generativelanguage.googleapis.com/v1beta/openai/", False),
             ("o3-mini", "https://api.deepseek.com/v1", False),
         ]
-        
+
         for model_name, api_base, expected in test_cases:
             with self.subTest(model=model_name, api=api_base):
                 result = is_reasoning_model(model_name, api_base)
                 self.assertEqual(
-                    result, 
-                    expected, 
-                    f"Model '{model_name}' with API '{api_base}' should return {expected}"
+                    result,
+                    expected,
+                    f"Model '{model_name}' with API '{api_base}' should return {expected}",
                 )
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/test_process_parallel.py b/tests/test_process_parallel.py
index 925d23a3c..8cdd525b3 100644
--- a/tests/test_process_parallel.py
+++ b/tests/test_process_parallel.py
@@ -8,6 +8,7 @@
 import unittest
 from unittest.mock import Mock, patch, MagicMock
 import time
+from concurrent.futures import Future
 
 # Set dummy API key for testing
 os.environ["OPENAI_API_KEY"] = "test"
@@ -111,7 +112,7 @@ async def run_test():
             # Mock the executor to avoid actually spawning processes
             with patch.object(controller, "_submit_iteration") as mock_submit:
                 # Create mock futures that complete immediately
-                mock_future1 = asyncio.Future()
+                mock_future1 = MagicMock()
                 mock_result1 = SerializableResult(
                     child_program_dict={
                         "id": "child_1",
@@ -127,7 +128,9 @@ async def run_test():
                     iteration_time=0.1,
                     iteration=1,
                 )
-                mock_future1.set_result(mock_result1)
+                mock_future1.done.return_value = True
+                mock_future1.result.return_value = mock_result1
+                mock_future1.cancel.return_value = True
 
                 mock_submit.return_value = mock_future1
 
diff --git a/tests/test_prompt_sampler_comprehensive.py b/tests/test_prompt_sampler_comprehensive.py
index a0ae9292a..b001c5e67 100644
--- a/tests/test_prompt_sampler_comprehensive.py
+++ b/tests/test_prompt_sampler_comprehensive.py
@@ -28,9 +28,9 @@ def test_build_prompt_with_inspirations(self):
             "accuracy": 0.9,
             "speed": 0.8,
             "complexity": 5,
-            "memory_usage": 100
+            "memory_usage": 100,
         }
-        
+
         # Create inspirations with diverse characteristics
         inspirations = [
             {
@@ -41,9 +41,9 @@ def test_build_prompt_with_inspirations(self):
                     "accuracy": 0.7,
                     "speed": 0.95,
                     "complexity": 3,
-                    "memory_usage": 50
+                    "memory_usage": 50,
                 },
-                "metadata": {"diverse": True}
+                "metadata": {"diverse": True},
             },
             {
                 "id": "insp2",
@@ -53,10 +53,10 @@ def test_build_prompt_with_inspirations(self):
                     "accuracy": 0.8,
                     "speed": 0.5,
                     "complexity": 7,
-                    "memory_usage": 20
+                    "memory_usage": 20,
                 },
-                "metadata": {"migrant": True}
-            }
+                "metadata": {"migrant": True},
+            },
         ]
 
         # Build prompt with inspirations and feature_dimensions
@@ -65,17 +65,17 @@ def test_build_prompt_with_inspirations(self):
             parent_program=parent_program,
             program_metrics=program_metrics,
             inspirations=inspirations,
-            feature_dimensions=self.feature_dimensions
+            feature_dimensions=self.feature_dimensions,
         )
 
         # Verify prompt was built successfully
         self.assertIn("system", prompt)
         self.assertIn("user", prompt)
-        
+
         # Check that inspirations are included
         self.assertIn("fast_implementation", prompt["user"])
         self.assertIn("memory_efficient", prompt["user"])
-        
+
         # Verify fitness scores are calculated correctly (excluding feature dimensions)
         # The inspirations should show their fitness scores, not including complexity/memory_usage
         self.assertIn("0.75", prompt["user"])  # insp1's combined_score
@@ -91,19 +91,17 @@ def test_format_inspirations_section_with_feature_dimensions(self):
                     "combined_score": 0.9,
                     "accuracy": 0.95,
                     "complexity": 10,  # Feature dimension
-                    "memory_usage": 200  # Feature dimension
+                    "memory_usage": 200,  # Feature dimension
                 },
-                "metadata": {"diverse": True}
+                "metadata": {"diverse": True},
             }
         ]
-        
+
         # Call the method directly
         result = self.prompt_sampler._format_inspirations_section(
-            inspirations, 
-            "python",
-            feature_dimensions=["complexity", "memory_usage"]
+            inspirations, "python", feature_dimensions=["complexity", "memory_usage"]
         )
-        
+
         # Should not raise NameError
         self.assertIsInstance(result, str)
         self.assertIn("test_func", result)
@@ -116,36 +114,28 @@ def test_format_inspirations_section_without_feature_dimensions(self):
                 "id": "test2",
                 "code": "def another_func(): pass",
                 "metrics": {"score": 0.7, "time": 1.2},
-                "metadata": {}
+                "metadata": {},
             }
         ]
-        
+
         # Call without feature_dimensions (should use default of None)
-        result = self.prompt_sampler._format_inspirations_section(
-            inspirations, 
-            "python"
-        )
-        
+        result = self.prompt_sampler._format_inspirations_section(inspirations, "python")
+
         self.assertIsInstance(result, str)
         self.assertIn("another_func", result)
 
     def test_determine_program_type_with_feature_dimensions(self):
         """Test _determine_program_type with feature_dimensions parameter"""
         program = {
-            "metrics": {
-                "combined_score": 0.85,
-                "complexity": 5,
-                "memory_usage": 100
-            },
-            "metadata": {}
+            "metrics": {"combined_score": 0.85, "complexity": 5, "memory_usage": 100},
+            "metadata": {},
         }
-        
+
         # Test with feature_dimensions
         program_type = self.prompt_sampler._determine_program_type(
-            program, 
-            feature_dimensions=["complexity", "memory_usage"]
+            program, feature_dimensions=["complexity", "memory_usage"]
         )
-        
+
         self.assertEqual(program_type, "High-Performer")  # Based on combined_score of 0.85
 
     def test_extract_unique_features_calls_determine_program_type(self):
@@ -153,30 +143,26 @@ def test_extract_unique_features_calls_determine_program_type(self):
         program = {
             "code": "",  # Empty code to trigger default features
             "metrics": {"score": 0.5},
-            "metadata": {}
+            "metadata": {},
         }
-        
+
         # This should not raise NameError when calling _determine_program_type
         features = self.prompt_sampler._extract_unique_features(program)
-        
+
         self.assertIsInstance(features, str)
         self.assertIn("approach to the problem", features)
 
     def test_build_prompt_with_all_optional_parameters(self):
         """Test build_prompt with all optional parameters including inspirations"""
         current_program = "def main(): pass"
-        
+
         # Comprehensive test data
-        previous_programs = [
-            {"id": "prev1", "code": "def v1(): pass", "metrics": {"score": 0.3}}
-        ]
+        previous_programs = [{"id": "prev1", "code": "def v1(): pass", "metrics": {"score": 0.3}}]
         top_programs = [
             {"id": "top1", "code": "def best(): pass", "metrics": {"combined_score": 0.95}}
         ]
-        inspirations = [
-            {"id": "insp1", "code": "def creative(): pass", "metrics": {"score": 0.6}}
-        ]
-        
+        inspirations = [{"id": "insp1", "code": "def creative(): pass", "metrics": {"score": 0.6}}]
+
         prompt = self.prompt_sampler.build_prompt(
             current_program=current_program,
             parent_program="def parent(): pass",
@@ -188,9 +174,9 @@ def test_build_prompt_with_all_optional_parameters(self):
             evolution_round=5,
             diff_based_evolution=True,
             feature_dimensions=["feature1"],
-            program_artifacts={"output": "test output"}
+            program_artifacts={"output": "test output"},
         )
-        
+
         self.assertIn("system", prompt)
         self.assertIn("user", prompt)
         # Verify all components are included
@@ -205,20 +191,18 @@ def test_fitness_calculation_consistency(self):
             "accuracy": 0.9,
             "speed": 0.7,
             "complexity": 5,  # Feature dimension
-            "memory_usage": 100  # Feature dimension
+            "memory_usage": 100,  # Feature dimension
         }
         feature_dimensions = ["complexity", "memory_usage"]
-        
+
         # Build a prompt with these metrics
         prompt = self.prompt_sampler.build_prompt(
             current_program="def test(): pass",
             program_metrics=metrics,
-            inspirations=[
-                {"id": "i1", "code": "pass", "metrics": metrics}
-            ],
-            feature_dimensions=feature_dimensions
+            inspirations=[{"id": "i1", "code": "pass", "metrics": metrics}],
+            feature_dimensions=feature_dimensions,
         )
-        
+
         # The fitness score should be 0.8 (combined_score), not an average including features
         self.assertIn("0.8000", prompt["user"])  # Fitness score in prompt
 
@@ -227,9 +211,9 @@ def test_empty_inspirations_list(self):
         prompt = self.prompt_sampler.build_prompt(
             current_program="def empty(): pass",
             inspirations=[],  # Empty list
-            feature_dimensions=["test_feature"]
+            feature_dimensions=["test_feature"],
         )
-        
+
         self.assertIn("system", prompt)
         self.assertIn("user", prompt)
         # Should complete without errors
@@ -246,46 +230,39 @@ def test_inspirations_with_missing_metrics(self):
                 "id": "bad2",
                 "code": "def worse(): pass",
                 # No metrics key at all
-            }
+            },
         ]
-        
+
         # Should handle gracefully without errors
         result = self.prompt_sampler._format_inspirations_section(
-            inspirations,
-            "python",
-            feature_dimensions=["test"]
+            inspirations, "python", feature_dimensions=["test"]
         )
-        
+
         self.assertIsInstance(result, str)
 
     def test_feature_dimensions_none_vs_empty_list(self):
         """Test that None and empty list for feature_dimensions are handled correctly"""
         program = {"metrics": {"score": 0.5}}
-        
+
         # Test with None
         type_none = self.prompt_sampler._determine_program_type(program, None)
-        
+
         # Test with empty list
         type_empty = self.prompt_sampler._determine_program_type(program, [])
-        
+
         # Both should work and give same result
         self.assertEqual(type_none, type_empty)
 
     def test_feature_coordinates_formatting_in_prompt(self):
         """Test that feature coordinates are formatted correctly in the prompt"""
-        metrics = {
-            "combined_score": 0.75,
-            "complexity": 8,
-            "memory_usage": 150,
-            "cpu_usage": 0.3
-        }
-        
+        metrics = {"combined_score": 0.75, "complexity": 8, "memory_usage": 150, "cpu_usage": 0.3}
+
         prompt = self.prompt_sampler.build_prompt(
             current_program="def test(): pass",
             program_metrics=metrics,
-            feature_dimensions=["complexity", "memory_usage", "cpu_usage"]
+            feature_dimensions=["complexity", "memory_usage", "cpu_usage"],
         )
-        
+
         # Check that feature coordinates are included
         user_msg = prompt["user"]
         self.assertIn("complexity", user_msg)
@@ -294,4 +271,4 @@ def test_feature_coordinates_formatting_in_prompt(self):
 
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()

From ec44ff24169e596540cdfc7ad8bb45e254a96e10 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Sat, 30 Aug 2025 23:04:22 +0800
Subject: [PATCH 2/2] Update _version.py

---
 openevolve/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openevolve/_version.py b/openevolve/_version.py
index 6ce4e8a54..33eb37d35 100644
--- a/openevolve/_version.py
+++ b/openevolve/_version.py
@@ -1,3 +1,3 @@
 """Version information for openevolve package."""
 
-__version__ = "0.2.10"
+__version__ = "0.2.11"