diff --git a/README.md b/README.md index 94cc417e..afce19f2 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,40 @@ Sample configuration files are available in the `configs/` directory: - `default_config.yaml`: Comprehensive configuration with all available options - `island_config_example.yaml`: Advanced island-based evolution setup +### Prompt Engineering Design + +OpenEvolve uses a sophisticated prompt engineering approach that separates different types of program examples to optimize LLM learning: + +#### Program Selection Strategy + +The system distinguishes between three types of program examples shown to the LLM: + +1. **Previous Attempts** (`num_top_programs`): Shows only the best performing programs to demonstrate high-quality approaches + - Used for the "Previous Attempts" section in prompts + - Focused on proven successful patterns + - Helps LLM understand what constitutes good performance + +2. **Top Programs** (`num_top_programs + num_diverse_programs`): Broader selection including both top performers and diverse approaches + - Used for the "Top Performing Programs" section + - Includes diverse programs to prevent local optima + - Balances exploitation of known good solutions with exploration of novel approaches + +3. 
**Inspirations** (`num_top_programs`): Cross-island program samples for creative inspiration + - Derived from other evolution islands to maintain diversity + - Count is automatically configured based on the `num_top_programs` setting + - Prevents convergence by exposing the LLM to different evolutionary trajectories + +#### Design Rationale + +This separation is intentional and serves multiple purposes: + +- **Focused Learning**: Previous attempts show only the best patterns, helping the LLM understand quality standards +- **Diversity Maintenance**: Top programs include diverse solutions to encourage exploration beyond local optima +- **Cross-Pollination**: Inspirations from other islands introduce novel approaches and prevent stagnation +- **Configurable Balance**: Adjust `num_top_programs` and `num_diverse_programs` to control exploration vs exploitation + +The inspiration count automatically scales with `num_top_programs` to maintain consistency across different configuration sizes, eliminating the need for a separate configuration parameter. 
+ ### Template Customization OpenEvolve supports advanced prompt template customization to increase diversity in code evolution: diff --git a/openevolve/_version.py b/openevolve/_version.py index 7b8ca900..3e5f65b3 100644 --- a/openevolve/_version.py +++ b/openevolve/_version.py @@ -1,3 +1,3 @@ """Version information for openevolve package.""" -__version__ = "0.1.2" +__version__ = "0.1.3" diff --git a/openevolve/database.py b/openevolve/database.py index 28cf937d..30823380 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -304,10 +304,13 @@ def get(self, program_id: str) -> Optional[Program]: """ return self.programs.get(program_id) - def sample(self) -> Tuple[Program, List[Program]]: + def sample(self, num_inspirations: Optional[int] = None) -> Tuple[Program, List[Program]]: """ Sample a program and inspirations for the next evolution step + Args: + num_inspirations: Number of inspiration programs to sample (defaults to 5 for backward compatibility) + Returns: Tuple of (parent_program, inspiration_programs) """ @@ -315,7 +318,9 @@ def sample(self) -> Tuple[Program, List[Program]]: parent = self._sample_parent() # Select inspirations - inspirations = self._sample_inspirations(parent, n=5) + if num_inspirations is None: + num_inspirations = 5 # Default for backward compatibility + inspirations = self._sample_inspirations(parent, n=num_inspirations) logger.debug(f"Sampled parent {parent.id} and {len(inspirations)} inspirations") return parent, inspirations @@ -436,10 +441,10 @@ def get_top_programs( reverse=True, ) else: - # Sort by average of all numeric metrics + # Sort by combined_score if available, otherwise by average of all numeric metrics sorted_programs = sorted( candidates, - key=lambda p: safe_numeric_average(p.metrics), + key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)), reverse=True, ) @@ -877,7 +882,7 @@ def _update_archive(self, program: Program) -> None: # Find worst program among valid programs if 
valid_archive_programs: worst_program = min( - valid_archive_programs, key=lambda p: safe_numeric_average(p.metrics) + valid_archive_programs, key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)) ) # Replace if new program is better @@ -1279,10 +1284,10 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> # Get programs sorted by fitness (worst first) all_programs = list(self.programs.values()) - # Sort by average metric (worst first) + # Sort by combined_score if available, otherwise by average metric (worst first) sorted_programs = sorted( all_programs, - key=lambda p: safe_numeric_average(p.metrics), + key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)), ) # Remove worst programs, but never remove the best program or excluded program diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 11d3453a..df64a550 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -48,7 +48,7 @@ async def run_iteration_with_shared_db( try: # Sample parent and inspirations from database - parent, inspirations = database.sample() + parent, inspirations = database.sample(num_inspirations=config.prompt.num_top_programs) # Get artifacts for the parent program if available parent_artifacts = database.get_artifacts(parent.id) diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index 22088f0a..c9ffba05 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -153,18 +153,20 @@ def _run_iteration_worker( ) # Use config values for limits instead of hardcoding - island_top_programs = island_programs[ + # Programs for LLM display (includes both top and diverse for inspiration) + programs_for_prompt = island_programs[ : _worker_config.prompt.num_top_programs + _worker_config.prompt.num_diverse_programs ] - island_previous_programs = island_programs[: _worker_config.prompt.num_top_programs] + # Best programs only (for previous 
attempts section, focused on top performers) + best_programs_only = island_programs[: _worker_config.prompt.num_top_programs] # Build prompt prompt = _worker_prompt_sampler.build_prompt( current_program=parent.code, parent_program=parent.code, program_metrics=parent.metrics, - previous_programs=[p.to_dict() for p in island_previous_programs], - top_programs=[p.to_dict() for p in island_top_programs], + previous_programs=[p.to_dict() for p in best_programs_only], + top_programs=[p.to_dict() for p in programs_for_prompt], inspirations=[p.to_dict() for p in inspirations], language=_worker_config.language, evolution_round=iteration, @@ -589,7 +591,7 @@ def _submit_iteration(self, iteration: int, island_id: Optional[int] = None) -> try: # Sample parent and inspirations from the target island - parent, inspirations = self.database.sample() + parent, inspirations = self.database.sample(num_inspirations=self.config.prompt.num_top_programs) finally: # Always restore original island state self.database.current_island = original_island diff --git a/tests/test_island_isolation.py b/tests/test_island_isolation.py index 431b508a..d70459a4 100644 --- a/tests/test_island_isolation.py +++ b/tests/test_island_isolation.py @@ -109,7 +109,7 @@ def test_island_isolation_during_evolution(self): # Track which islands were sampled sampled_islands = [] - def mock_sample(): + def mock_sample(num_inspirations=None): # Record which island was sampled sampled_islands.append(self.database.current_island) # Return mock parent and inspirations