diff --git a/README.md b/README.md index 877172e23..663623b78 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ OpenEvolve implements a comprehensive evolutionary coding system with: - **Island-Based Evolution**: Multiple populations with periodic migration for diversity maintenance - **Inspiration vs Performance**: Sophisticated prompt engineering separating top performers from diverse inspirations - **Multi-Strategy Selection**: Elite, diverse, and exploratory program sampling strategies +- **Adaptive Feature Dimensions**: Default features (complexity & diversity) with customizable multi-dimensional search spaces #### 📊 **Evaluation & Feedback Systems** - **Artifacts Side-Channel**: Capture build errors, profiling data, and execution feedback for LLM improvement @@ -274,7 +275,7 @@ database: population_size: 500 num_islands: 5 # Island-based evolution migration_interval: 20 - feature_dimensions: ["score", "complexity"] # Quality-diversity features + feature_dimensions: ["complexity", "diversity"] # Default quality-diversity features evaluator: # Advanced evaluation features @@ -293,8 +294,41 @@ Sample configuration files are available in the `configs/` directory: - `default_config.yaml`: Comprehensive configuration with all available options - `island_config_example.yaml`: Advanced island-based evolution setup +### Feature Dimensions in MAP-Elites + +Feature dimensions control how programs are organized in the MAP-Elites quality-diversity grid: + +**Default Features**: If `feature_dimensions` is NOT specified in your config, OpenEvolve uses `["complexity", "diversity"]` as defaults. + +**Built-in Features** (always computed internally by OpenEvolve): +- **complexity**: Code length (recommended default) +- **diversity**: Code structure diversity compared to other programs (recommended default) + +Only `complexity` and `diversity` are used as defaults because they work well across all program types. + +**Custom Features**: You can mix built-in features with metrics from your evaluator: +```yaml +database: + feature_dimensions: ["complexity", "performance", "correctness"] # Mix of built-in and custom + # Per-dimension bin configuration (optional) + feature_bins: + complexity: 10 # 10 bins for complexity + performance: 20 # 20 bins for performance (from YOUR evaluator) + correctness: 15 # 15 bins for correctness (from YOUR evaluator) +``` + +**Important**: OpenEvolve will raise an error if a specified feature is not found in the evaluator's metrics. This ensures your configuration is correct. The error message will show available metrics to help you fix the configuration. + See the [Configuration Guide](configs/default_config.yaml) for a full list of options. +### Default Metric for Program Selection + +When comparing and selecting programs, OpenEvolve uses the following priority: +1. **combined_score**: If your evaluator returns a `combined_score` metric, it will be used as the primary fitness measure +2. **Average of all metrics**: If no `combined_score` is provided, OpenEvolve calculates the average of all numeric metrics returned by your evaluator + +This ensures programs can always be compared even without explicit fitness definitions. For best results, consider having your evaluator return a `combined_score` that represents overall program fitness. + ## Artifacts Channel OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it. diff --git a/configs/default_config.yaml b/configs/default_config.yaml index a3c4705c2..7a9863240 100644 --- a/configs/default_config.yaml +++ b/configs/default_config.yaml @@ -90,10 +90,32 @@ database: # Note: diversity_metric is fixed to "edit_distance" (feature_based not implemented) # Feature map dimensions for MAP-Elites + # Default if not specified: ["complexity", "diversity"] + # + # Built-in features (always available, computed by OpenEvolve): + # - "complexity": Code length + # - "diversity": Code structure diversity + # + # You can mix built-in features with custom metrics from your evaluator: feature_dimensions: # Dimensions for MAP-Elites feature map - - "score" # Performance score - - "complexity" # Code complexity (length) + - "complexity" # Code length (built-in) + - "diversity" # Code diversity (built-in) + # Example with custom features: + # feature_dimensions: + # - "performance" # Must be returned by your evaluator + # - "correctness" # Must be returned by your evaluator + # - "memory_efficiency" # Must be returned by your evaluator + + # Number of bins per dimension + # Can be a single integer (same for all dimensions) or a dict feature_bins: 10 # Number of bins per dimension + # Example of per-dimension configuration: + # feature_bins: + # complexity: 10 # 10 bins for complexity + # diversity: 15 # 15 bins for diversity + # performance: 20 # 20 bins for custom metric + + diversity_reference_size: 20 # Size of reference set for diversity calculation # Evaluator configuration evaluator: diff --git a/configs/island_config_example.yaml b/configs/island_config_example.yaml index 7a7ab6db7..d352dedb1 100644 --- a/configs/island_config_example.yaml +++ b/configs/island_config_example.yaml @@ -33,8 +33,14 @@ database: # Note: diversity_metric fixed to "edit_distance" # Feature map dimensions for MAP-Elites - feature_dimensions: ["score", "complexity"] + # Default if not specified: ["complexity", "diversity"] + # Comment out the line below to use the defaults + # feature_dimensions: ["complexity", "diversity"] feature_bins: 10 + # Can also use per-dimension bins: + # feature_bins: + # performance: 20 + # correctness: 10 # Prompt configuration prompt: diff --git a/openevolve/_version.py b/openevolve/_version.py index db4a72e56..73b505cfd 100644 --- a/openevolve/_version.py +++ b/openevolve/_version.py @@ -1,3 +1,3 @@ """Version information for openevolve package.""" -__version__ = "0.0.19" \ No newline at end of file +__version__ = "0.0.20" diff --git a/openevolve/config.py b/openevolve/config.py index a870c9ce1..cc87455e6 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -166,9 +166,10 @@ class DatabaseConfig: diversity_metric: str = "edit_distance" # Options: "edit_distance", "feature_based" # Feature map dimensions for MAP-Elites - # feature_dimensions: List[str] = field(default_factory=lambda: ["score", "complexity"]) - feature_dimensions: List[str] = field(default_factory=lambda: ["complexity"]) - feature_bins: int = 10 + # Default to complexity and diversity for better exploration + feature_dimensions: List[str] = field(default_factory=lambda: ["complexity", "diversity"]) + feature_bins: Union[int, Dict[str, int]] = 10 # Can be int (all dims) or dict (per-dim) + diversity_reference_size: int = 20 # Size of reference set for diversity calculation # Migration parameters for island-based evolution migration_interval: int = 50 # Migrate every N generations diff --git a/openevolve/database.py b/openevolve/database.py index 3893ad3ae..0d2ba6fd4 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -105,8 +105,16 @@ def __init__(self, config: DatabaseConfig): # Feature grid for MAP-Elites self.feature_map: Dict[str, str] = {} - self.feature_bins = max(config.feature_bins, - int(pow(config.archive_size, 1 / len(config.feature_dimensions)) + 0.99)) + + # Handle both int and dict types for feature_bins + if isinstance(config.feature_bins, int): + self.feature_bins = max( + config.feature_bins, + int(pow(config.archive_size, 1 / len(config.feature_dimensions)) + 0.99), + ) + else: + # If dict, keep as is (we'll use feature_bins_per_dim instead) + self.feature_bins = 10 # Default fallback for backward compatibility # Island populations self.islands: List[Set[str]] = [set() for _ in range(config.num_islands)] @@ -123,7 +131,7 @@ def __init__(self, config: DatabaseConfig): # Track the absolute best program separately self.best_program_id: Optional[str] = None - + # Track best program per island for proper island-based evolution self.island_best_programs: List[Optional[str]] = [None] * config.num_islands @@ -144,6 +152,29 @@ def __init__(self, config: DatabaseConfig): random.seed(config.random_seed) logger.debug(f"Database: Set random seed to {config.random_seed}") + # Diversity caching infrastructure + self.diversity_cache: Dict[int, Dict[str, Union[float, float]]] = ( + {} + ) # hash -> {"value": float, "timestamp": float} + self.diversity_cache_size: int = 1000 # LRU cache size + self.diversity_reference_set: List[str] = ( + [] + ) # Reference program codes for consistent diversity + self.diversity_reference_size: int = getattr(config, "diversity_reference_size", 20) + + # Feature scaling infrastructure + self.feature_stats: Dict[str, Dict[str, Union[float, float, List[float]]]] = {} + self.feature_scaling_method: str = "minmax" # Options: minmax, zscore, percentile + + # Per-dimension bins support + if hasattr(config, "feature_bins") and isinstance(config.feature_bins, dict): + self.feature_bins_per_dim = config.feature_bins + else: + # Backward compatibility - use same bins for all dimensions + self.feature_bins_per_dim = { + dim: self.feature_bins for dim in config.feature_dimensions + } + logger.info(f"Initialized program database with {len(self.programs)} programs") def add( @@ -191,8 +222,11 @@ def add( if should_replace: # Log significant MAP-Elites events - coords_dict = {self.config.feature_dimensions[i]: feature_coords[i] for i in range(len(feature_coords))} - + coords_dict = { + self.config.feature_dimensions[i]: feature_coords[i] + for i in range(len(feature_coords)) + } + if feature_key not in self.feature_map: # New cell occupation logger.info("New MAP-Elites cell occupied: %s", coords_dict) @@ -200,8 +234,12 @@ def add( total_possible_cells = self.feature_bins ** len(self.config.feature_dimensions) coverage = (len(self.feature_map) + 1) / total_possible_cells if coverage in [0.1, 0.25, 0.5, 0.75, 0.9]: - logger.info("MAP-Elites coverage reached %.1f%% (%d/%d cells)", - coverage * 100, len(self.feature_map) + 1, total_possible_cells) + logger.info( + "MAP-Elites coverage reached %.1f%% (%d/%d cells)", + coverage * 100, + len(self.feature_map) + 1, + total_possible_cells, + ) else: # Cell replacement - existing program being replaced existing_program_id = self.feature_map[feature_key] @@ -209,14 +247,18 @@ def add( existing_program = self.programs[existing_program_id] new_fitness = safe_numeric_average(program.metrics) existing_fitness = safe_numeric_average(existing_program.metrics) - logger.info("MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)", - coords_dict, existing_fitness, new_fitness) - + logger.info( + "MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)", + coords_dict, + existing_fitness, + new_fitness, + ) + # use MAP-Elites to manage archive if existing_program_id in self.archive: self.archive.discard(existing_program_id) self.archive.add(program.id) - + self.feature_map[feature_key] = program.id # Add to specific island (not random!) @@ -236,7 +278,7 @@ def add( # Update the absolute best program tracking (after population enforcement) self._update_best_program(program) - + # Update island-specific best program tracking self._update_island_best_program(program, island_idx) @@ -349,7 +391,9 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]: return sorted_programs[0] if sorted_programs else None - def get_top_programs(self, n: int = 10, metric: Optional[str] = None, island_idx: Optional[int] = None) -> List[Program]: + def get_top_programs( + self, n: int = 10, metric: Optional[str] = None, island_idx: Optional[int] = None + ) -> List[Program]: """ Get the top N programs based on a metric @@ -364,7 +408,7 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None, island_idx # Validate island_idx parameter if island_idx is not None and (island_idx < 0 or island_idx >= len(self.islands)): raise IndexError(f"Island index {island_idx} is out of range (0-{len(self.islands)-1})") - + if not self.programs: return [] @@ -372,14 +416,13 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None, island_idx if island_idx is not None: # Island-specific query island_programs = [ - self.programs[pid] for pid in self.islands[island_idx] - if pid in self.programs + self.programs[pid] for pid in self.islands[island_idx] if pid in self.programs ] candidates = island_programs else: # Global query candidates = list(self.programs.values()) - + if not candidates: return [] @@ -467,7 +510,9 @@ def load(self, path: str) -> None: saved_islands = metadata.get("islands", []) self.archive = set(metadata.get("archive", [])) self.best_program_id = metadata.get("best_program_id") - self.island_best_programs = metadata.get("island_best_programs", [None] * len(saved_islands)) + self.island_best_programs = metadata.get( + "island_best_programs", [None] * len(saved_islands) + ) self.last_iteration = metadata.get("last_iteration", 0) self.current_island = metadata.get("current_island", 0) self.island_generations = metadata.get("island_generations", [0] * len(saved_islands)) @@ -496,7 +541,7 @@ def load(self, path: str) -> None: # Ensure island_generations list has correct length if len(self.island_generations) != len(self.islands): self.island_generations = [0] * len(self.islands) - + # Ensure island_best_programs list has correct length if len(self.island_best_programs) != len(self.islands): self.island_best_programs = [None] * len(self.islands) @@ -547,7 +592,7 @@ def _reconstruct_islands(self, saved_islands: List[List[str]]) -> None: feature_keys_to_remove.append(key) for key in feature_keys_to_remove: del self.feature_map[key] - + # Clean up island best programs - remove stale references self._cleanup_stale_island_bests() @@ -645,18 +690,12 @@ def _calculate_feature_coords(self, program: Program) -> List[int]: bin_idx = self._calculate_complexity_bin(complexity) coords.append(bin_idx) elif dim == "diversity": - # Use average fast code diversity to other programs + # Use cached diversity calculation with reference set if len(self.programs) < 2: bin_idx = 0 else: - sample_programs = random.sample( - list(self.programs.values()), min(5, len(self.programs)) - ) - avg_diversity = sum( - self._fast_code_diversity(program.code, other.code) - for other in sample_programs - ) / len(sample_programs) - bin_idx = self._calculate_diversity_bin(avg_diversity) + diversity = self._get_cached_diversity(program) + bin_idx = self._calculate_diversity_bin(diversity) coords.append(bin_idx) elif dim == "score": # Use average of numeric metrics @@ -664,16 +703,30 @@ def _calculate_feature_coords(self, program: Program) -> List[int]: bin_idx = 0 else: avg_score = safe_numeric_average(program.metrics) - bin_idx = min(int(avg_score * self.feature_bins), self.feature_bins - 1) + # Update stats and scale + self._update_feature_stats("score", avg_score) + scaled_value = self._scale_feature_value("score", avg_score) + num_bins = self.feature_bins_per_dim.get("score", self.feature_bins) + bin_idx = int(scaled_value * num_bins) + bin_idx = max(0, min(num_bins - 1, bin_idx)) coords.append(bin_idx) elif dim in program.metrics: # Use specific metric score = program.metrics[dim] - bin_idx = min(int(score * self.feature_bins), self.feature_bins - 1) + # Update stats and scale + self._update_feature_stats(dim, score) + scaled_value = self._scale_feature_value(dim, score) + num_bins = self.feature_bins_per_dim.get(dim, self.feature_bins) + bin_idx = int(scaled_value * num_bins) + bin_idx = max(0, min(num_bins - 1, bin_idx)) coords.append(bin_idx) else: - # Default to middle bin if feature not found - coords.append(self.feature_bins // 2) + # Feature not found - this is an error + raise ValueError( + f"Feature dimension '{dim}' specified in config but not found in program metrics. " + f"Available metrics: {list(program.metrics.keys())}. " + f"Either remove '{dim}' from feature_dimensions or ensure your evaluator returns it." + ) # Only log coordinates at debug level for troubleshooting logger.debug( "MAP-Elites coords: %s", @@ -683,100 +736,56 @@ def _calculate_feature_coords(self, program: Program) -> List[int]: def _calculate_complexity_bin(self, complexity: int) -> int: """ - Calculate the bin index for a given complexity value using adaptive binning. - + Calculate the bin index for a given complexity value using feature scaling. + Args: complexity: The complexity value (code length) - + Returns: Bin index in range [0, self.feature_bins - 1] """ - if len(self.programs) < 2: - # Cold start: use fixed range binning - # Assume reasonable range of 0-10000 characters for code length - max_complexity = 10000 - min_complexity = 0 - else: - # Adaptive binning: use actual range from existing programs - existing_complexities = [len(p.code) for p in self.programs.values()] - min_complexity = min(existing_complexities) - max_complexity = max(existing_complexities) - - # Ensure range is not zero - if max_complexity == min_complexity: - max_complexity = min_complexity + 1 - - # Normalize complexity to [0, 1] range - if max_complexity > min_complexity: - normalized = (complexity - min_complexity) / (max_complexity - min_complexity) - else: - normalized = 0.0 - - # Clamp to [0, 1] range - normalized = max(0.0, min(1.0, normalized)) - + # Update feature statistics + self._update_feature_stats("complexity", float(complexity)) + + # Scale the value using configured method + scaled_value = self._scale_feature_value("complexity", float(complexity)) + + # Get number of bins for this dimension + num_bins = self.feature_bins_per_dim.get("complexity", self.feature_bins) + # Convert to bin index - bin_idx = int(normalized * self.feature_bins) - + bin_idx = int(scaled_value * num_bins) + # Ensure bin index is within valid range - bin_idx = max(0, min(self.feature_bins - 1, bin_idx)) - + bin_idx = max(0, min(num_bins - 1, bin_idx)) + return bin_idx def _calculate_diversity_bin(self, diversity: float) -> int: """ - Calculate the bin index for a given diversity value using adaptive binning. - + Calculate the bin index for a given diversity value using feature scaling. + Args: diversity: The average fast code diversity to other programs - + Returns: Bin index in range [0, self.feature_bins - 1] """ - def _fast_diversity(program, sample_programs): - """Calculate average fast diversity for a program against sample programs""" - avg_diversity = sum( - self._fast_code_diversity(program.code, other.code) - for other in sample_programs - ) / len(sample_programs) - return avg_diversity - - if len(self.programs) < 2: - # Cold start: use fixed range binning - # Assume reasonable range of 0-10000 for fast diversity - max_diversity = 10000 - min_diversity = 0 - else: - # Sample programs for calculating diversity range (limit to 5 for performance) - sample_programs = list(self.programs.values()) - if len(sample_programs) > 5: - import random - sample_programs = random.sample(sample_programs, 5) - - # Adaptive binning: use actual range from existing programs - existing_diversities = [_fast_diversity(p, sample_programs) for p in self.programs.values()] - min_diversity = min(existing_diversities) - max_diversity = max(existing_diversities) - - # Ensure range is not zero - if max_diversity == min_diversity: - max_diversity = min_diversity + 1 - - # Normalize diversity to [0, 1] range - if max_diversity > min_diversity: - normalized = (diversity - min_diversity) / (max_diversity - min_diversity) - else: - normalized = 0.0 + # Update feature statistics + self._update_feature_stats("diversity", diversity) + + # Scale the value using configured method + scaled_value = self._scale_feature_value("diversity", diversity) - # Clamp to [0, 1] range - normalized = max(0.0, min(1.0, normalized)) + # Get number of bins for this dimension + num_bins = self.feature_bins_per_dim.get("diversity", self.feature_bins) # Convert to bin index - bin_idx = int(normalized * self.feature_bins) + bin_idx = int(scaled_value * num_bins) # Ensure bin index is within valid range - bin_idx = max(0, min(self.feature_bins - 1, bin_idx)) - + bin_idx = max(0, min(num_bins - 1, bin_idx)) + return bin_idx def _feature_coords_to_key(self, coords: List[int]) -> str: @@ -911,7 +920,7 @@ def _update_best_program(self, program: Program) -> None: def _update_island_best_program(self, program: Program, island_idx: int) -> None: """ Update the best program tracking for a specific island - + Args: program: Program to consider as the new best for the island island_idx: Island index @@ -920,14 +929,14 @@ def _update_island_best_program(self, program: Program, island_idx: int) -> None if island_idx >= len(self.island_best_programs): logger.warning(f"Invalid island index {island_idx}, skipping island best update") return - + # If island doesn't have a best program yet, this becomes the best current_island_best_id = self.island_best_programs[island_idx] if current_island_best_id is None: self.island_best_programs[island_idx] = program.id logger.debug(f"Set initial best program for island {island_idx} to {program.id}") return - + # Check if current best still exists if current_island_best_id not in self.programs: logger.warning( @@ -935,16 +944,19 @@ def _update_island_best_program(self, program: Program, island_idx: int) -> None ) self.island_best_programs[island_idx] = program.id return - + current_island_best = self.programs[current_island_best_id] - + # Update if the new program is better if self._is_better(program, current_island_best): old_id = current_island_best_id self.island_best_programs[island_idx] = program.id - + # Log the change - if "combined_score" in program.metrics and "combined_score" in current_island_best.metrics: + if ( + "combined_score" in program.metrics + and "combined_score" in current_island_best.metrics + ): old_score = current_island_best.metrics["combined_score"] new_score = program.metrics["combined_score"] score_diff = new_score - old_score @@ -953,7 +965,9 @@ def _update_island_best_program(self, program: Program, island_idx: int) -> None f"(combined_score: {old_score:.4f} → {new_score:.4f}, +{score_diff:.4f})" ) else: - logger.debug(f"Island {island_idx}: New best program {program.id} replaces {old_id}") + logger.debug( + f"Island {island_idx}: New best program {program.id} replaces {old_id}" + ) def _sample_parent(self) -> Program: """ @@ -1077,7 +1091,7 @@ def _sample_random_parent(self) -> Program: def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: """ Sample inspiration programs for the next evolution step. - + For proper island-based evolution, inspirations are sampled ONLY from the current island, maintaining genetic isolation between islands. @@ -1089,14 +1103,14 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: List of inspiration programs from the current island """ inspirations = [] - + # Get the parent's island (should be current_island) parent_island = parent.metadata.get("island", self.current_island) - + # Get all programs from the current island island_program_ids = list(self.islands[parent_island]) island_programs = [self.programs[pid] for pid in island_program_ids if pid in self.programs] - + if not island_programs: logger.warning(f"Island {parent_island} has no programs for inspiration sampling") return [] @@ -1110,7 +1124,9 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: ): island_best = self.programs[island_best_id] inspirations.append(island_best) - logger.debug(f"Including island {parent_island} best program {island_best_id} in inspirations") + logger.debug( + f"Including island {parent_island} best program {island_best_id} in inspirations" + ) elif island_best_id is not None and island_best_id not in self.programs: # Clean up stale island best reference logger.warning( @@ -1132,7 +1148,7 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: # Try to sample from different feature cells within the island feature_coords = self._calculate_feature_coords(parent) nearby_programs = [] - + # Create a mapping of feature cells to island programs for efficient lookup island_feature_map = {} for prog_id in island_program_ids: @@ -1141,7 +1157,7 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: prog_coords = self._calculate_feature_coords(prog) cell_key = self._feature_coords_to_key(prog_coords) island_feature_map[cell_key] = prog_id - + # Try to find programs from nearby feature cells within the island for _ in range(remaining_slots * 3): # Try more times to find nearby programs # Perturb coordinates @@ -1149,7 +1165,7 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: max(0, min(self.feature_bins - 1, c + random.randint(-2, 2))) for c in feature_coords ] - + cell_key = self._feature_coords_to_key(perturbed_coords) if cell_key in island_feature_map: program_id = island_feature_map[cell_key] @@ -1166,7 +1182,7 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: # If we still need more, add random programs from the island if len(inspirations) + len(nearby_programs) < n: remaining = n - len(inspirations) - len(nearby_programs) - + # Get available programs from the island excluded_ids = ( {parent.id} @@ -1174,14 +1190,14 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: .union(p.id for p in nearby_programs) ) available_island_ids = [ - pid for pid in island_program_ids + pid + for pid in island_program_ids if pid not in excluded_ids and pid in self.programs ] - + if available_island_ids: random_ids = random.sample( - available_island_ids, - min(remaining, len(available_island_ids)) + available_island_ids, min(remaining, len(available_island_ids)) ) random_programs = [self.programs[pid] for pid in random_ids] nearby_programs.extend(random_programs) @@ -1270,7 +1286,7 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> logger.debug(f"Removed program {program_id} due to population limit") logger.info(f"Population size after cleanup: {len(self.programs)}") - + # Clean up any stale island best program references after removal self._cleanup_stale_island_bests() @@ -1346,34 +1362,40 @@ def migrate_programs(self) -> None: # Add to target island self.islands[target_island].add(migrant_copy.id) self.programs[migrant_copy.id] = migrant_copy - + # Update island-specific best program if migrant is better self._update_island_best_program(migrant_copy, target_island) # Log migration with MAP-Elites coordinates feature_coords = self._calculate_feature_coords(migrant_copy) - coords_dict = {self.config.feature_dimensions[j]: feature_coords[j] for j in range(len(feature_coords))} - logger.info("Program migrated to island %d at MAP-Elites coords: %s", - target_island, coords_dict) + coords_dict = { + self.config.feature_dimensions[j]: feature_coords[j] + for j in range(len(feature_coords)) + } + logger.info( + "Program migrated to island %d at MAP-Elites coords: %s", + target_island, + coords_dict, + ) # Update last migration generation self.last_migration_generation = max(self.island_generations) logger.info(f"Migration completed at generation {self.last_migration_generation}") - + # Validate migration results self._validate_migration_results() def _validate_migration_results(self) -> None: """ Validate migration didn't create inconsistencies - + Checks that: 1. Program island metadata matches actual island assignment 2. No programs are assigned to multiple islands 3. All island best programs exist and are in correct islands """ seen_program_ids = set() - + for i, island in enumerate(self.islands): for program_id in island: # Check for duplicate assignments @@ -1381,12 +1403,12 @@ def _validate_migration_results(self) -> None: logger.error(f"Program {program_id} assigned to multiple islands") continue seen_program_ids.add(program_id) - + # Check program exists if program_id not in self.programs: logger.warning(f"Island {i} contains nonexistent program {program_id}") continue - + # Check metadata consistency program = self.programs[program_id] stored_island = program.metadata.get("island") @@ -1395,7 +1417,7 @@ def _validate_migration_results(self) -> None: f"Island mismatch for program {program_id}: " f"in island {i} but metadata says {stored_island}" ) - + # Validate island best programs for i, best_id in enumerate(self.island_best_programs): if best_id is not None: @@ -1407,42 +1429,50 @@ def _validate_migration_results(self) -> None: def _cleanup_stale_island_bests(self) -> None: """ Remove stale island best program references - + Cleans up references to programs that no longer exist in the database or are not actually in their assigned islands. """ cleaned_count = 0 - + for i, best_id in enumerate(self.island_best_programs): if best_id is not None: should_clear = False - + # Check if program still exists if best_id not in self.programs: - logger.debug(f"Clearing stale island {i} best program {best_id} (program deleted)") + logger.debug( + f"Clearing stale island {i} best program {best_id} (program deleted)" + ) should_clear = True # Check if program is still in the island elif best_id not in self.islands[i]: - logger.debug(f"Clearing stale island {i} best program {best_id} (not in island)") + logger.debug( + f"Clearing stale island {i} best program {best_id} (not in island)" + ) should_clear = True - + if should_clear: self.island_best_programs[i] = None cleaned_count += 1 - + if cleaned_count > 0: logger.info(f"Cleaned up {cleaned_count} stale island best program references") - + # Recalculate best programs for islands that were cleared for i, best_id in enumerate(self.island_best_programs): if best_id is None and len(self.islands[i]) > 0: # Find new best program for this island - island_programs = [self.programs[pid] for pid in self.islands[i] if pid in self.programs] + island_programs = [ + self.programs[pid] for pid in self.islands[i] if pid in self.programs + ] if island_programs: # Sort by fitness and update best_program = max( island_programs, - key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)) + key=lambda p: p.metrics.get( + "combined_score", safe_numeric_average(p.metrics) + ), ) self.island_best_programs[i] = best_program.id logger.debug(f"Recalculated island {i} best program: {best_program.id}") @@ -1543,14 +1573,199 @@ def _fast_code_diversity(self, code1: str, code2: str) -> float: return diversity + def _get_cached_diversity(self, program: Program) -> float: + """ + Get diversity score for a program using cache and reference set + + Args: + program: The program to calculate diversity for + + Returns: + Diversity score (cached or newly computed) + """ + code_hash = hash(program.code) + + # Check cache first + if code_hash in self.diversity_cache: + return self.diversity_cache[code_hash]["value"] + + # Update reference set if needed + if ( + not self.diversity_reference_set + or len(self.diversity_reference_set) < self.diversity_reference_size + ): + self._update_diversity_reference_set() + + # Compute diversity against reference set + diversity_scores = [] + for ref_code in self.diversity_reference_set: + if ref_code != program.code: # Don't compare with itself + diversity_scores.append(self._fast_code_diversity(program.code, ref_code)) + + diversity = ( + sum(diversity_scores) / max(1, len(diversity_scores)) if diversity_scores else 0.0 + ) + + # Cache the result with LRU eviction + self._cache_diversity_value(code_hash, diversity) + + return diversity + + def _update_diversity_reference_set(self) -> None: + """Update the reference set for diversity calculation""" + if len(self.programs) == 0: + return + + # Select diverse programs for reference set + all_programs = list(self.programs.values()) + + if len(all_programs) <= self.diversity_reference_size: + self.diversity_reference_set = [p.code for p in all_programs] + else: + # Select programs with maximum diversity + selected = [] + remaining = all_programs.copy() + + # Start with a random program + first_idx = random.randint(0, len(remaining) - 1) + selected.append(remaining.pop(first_idx)) + + # Greedily add programs that maximize diversity to selected set + while len(selected) < self.diversity_reference_size and remaining: + max_diversity = -1 + best_idx = -1 + + for i, candidate in enumerate(remaining): + # Calculate minimum diversity to selected programs + min_div = float("inf") + for selected_prog in selected: + div = self._fast_code_diversity(candidate.code, selected_prog.code) + min_div = min(min_div, div) + + if min_div > max_diversity: + max_diversity = min_div + best_idx = i + + if best_idx >= 0: + selected.append(remaining.pop(best_idx)) + + self.diversity_reference_set = [p.code for p in selected] + + logger.debug( + f"Updated diversity reference set with {len(self.diversity_reference_set)} programs" + ) + + def _cache_diversity_value(self, code_hash: int, diversity: float) -> None: + """Cache a diversity value with LRU eviction""" + # Check if cache is full + if len(self.diversity_cache) >= self.diversity_cache_size: + # Remove oldest entry + oldest_hash = min(self.diversity_cache.items(), key=lambda x: x[1]["timestamp"])[0] + del self.diversity_cache[oldest_hash] + + # Add new entry + self.diversity_cache[code_hash] = {"value": diversity, "timestamp": time.time()} + + def _invalidate_diversity_cache(self) -> None: + """Invalidate the diversity cache when programs change significantly""" + self.diversity_cache.clear() + self.diversity_reference_set = [] + logger.debug("Diversity cache invalidated") + + def _update_feature_stats(self, feature_name: str, value: float) -> None: + """ + Update statistics for a feature dimension + + Args: + feature_name: Name of the feature dimension + value: New value to incorporate into stats + """ + if feature_name not in self.feature_stats: + self.feature_stats[feature_name] = { + "min": value, + "max": value, + "values": [], # Keep recent values for percentile calculation if needed + } + + stats = self.feature_stats[feature_name] + stats["min"] = min(stats["min"], value) + stats["max"] = max(stats["max"], value) + + # Keep recent values for more sophisticated scaling methods + stats["values"].append(value) + if len(stats["values"]) > 1000: # Limit memory usage + stats["values"] = stats["values"][-1000:] + + def _scale_feature_value(self, feature_name: str, value: float) -> float: + """ + Scale a feature value according to the configured scaling method + + Args: + feature_name: Name of the feature dimension + value: Raw feature value + + Returns: + Scaled value in range [0, 1] + """ + if feature_name not in self.feature_stats: + # No stats yet, return normalized by a reasonable default + return min(1.0, max(0.0, value)) + + stats = self.feature_stats[feature_name] + + if self.feature_scaling_method == "minmax": + # Min-max normalization to [0, 1] + min_val = stats["min"] + max_val = stats["max"] + + if max_val == min_val: + return 0.5 # All values are the same + + scaled = (value - min_val) / (max_val - min_val) + return min(1.0, max(0.0, scaled)) # Ensure in [0, 1] + + elif self.feature_scaling_method == "percentile": + # Use percentile ranking + values = stats["values"] + if not values: + return 0.5 + + # Count how many values are less than or equal to this value + count = sum(1 for v in values if v <= value) + percentile = count / len(values) + return percentile + + else: + # Default to min-max if unknown method + return self._scale_feature_value_minmax(feature_name, value) + + def _scale_feature_value_minmax(self, feature_name: str, value: float) -> float: + """Helper for min-max scaling""" + if feature_name not in self.feature_stats: + return min(1.0, max(0.0, value)) + + stats = self.feature_stats[feature_name] + min_val = stats["min"] + max_val = stats["max"] + + if max_val == min_val: + return 0.5 + + scaled = (value - min_val) / (max_val - min_val) + return min(1.0, max(0.0, scaled)) + def log_island_status(self) -> None: """Log current status of all islands""" stats = self.get_island_stats() logger.info("Island Status:") for stat in stats: current_marker = " *" if stat["is_current"] else " " - island_idx = stat['island'] - island_best_id = self.island_best_programs[island_idx] if island_idx < len(self.island_best_programs) else None + island_idx = stat["island"] + island_best_id = ( + self.island_best_programs[island_idx] + if island_idx < len(self.island_best_programs) + else None + ) best_indicator = f" (best: {island_best_id})" if island_best_id else "" logger.info( f"{current_marker} Island {stat['island']}: {stat['population_size']} programs, " diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 2ab93f361..25d880987 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -89,7 +89,7 @@ def _load_evaluation_function(self) -> None: self.evaluate_function = module.evaluate logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}") - + # Validate cascade configuration self._validate_cascade_configuration(module) except Exception as e: @@ -99,16 +99,16 @@ def _load_evaluation_function(self) -> None: def _validate_cascade_configuration(self, module) -> None: """ Validate cascade evaluation configuration and warn about potential issues - + Args: module: The loaded evaluation module """ if self.config.cascade_evaluation: # Check if cascade functions exist has_stage1 = hasattr(module, "evaluate_stage1") - has_stage2 = hasattr(module, "evaluate_stage2") + has_stage2 = hasattr(module, "evaluate_stage2") has_stage3 = hasattr(module, "evaluate_stage3") - + if not has_stage1: logger.warning( f"Configuration has 'cascade_evaluation: true' but evaluator " @@ -123,7 +123,9 @@ def _validate_cascade_configuration(self, module) -> None: f"multi-stage evaluation for better cascade benefits." ) else: - logger.debug(f"Cascade evaluation properly configured with available stage functions") + logger.debug( + f"Cascade evaluation properly configured with available stage functions" + ) async def evaluate_program( self, @@ -305,7 +307,9 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str """ return self._pending_artifacts.pop(program_id, None) - async def _direct_evaluate(self, program_path: str) -> Union[Dict[str, float], EvaluationResult]: + async def _direct_evaluate( + self, program_path: str + ) -> Union[Dict[str, float], EvaluationResult]: """ Directly evaluate a program using the evaluation function with timeout @@ -616,22 +620,23 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s def _create_cascade_error_context(self, stage: str, error: Exception) -> dict: """ Create rich error context for cascade failures - + Args: stage: The stage where the error occurred error: The exception that was raised - + Returns: Dictionary with enhanced error context """ import time + return { "failure_stage": stage, "error_type": type(error).__name__, "error_message": str(error), "timestamp": time.time(), "cascade_config": self.config.cascade_evaluation, - "cascade_thresholds": getattr(self.config, 'cascade_thresholds', []), + "cascade_thresholds": getattr(self.config, "cascade_thresholds", []), "timeout_config": self.config.timeout, "evaluation_file": self.evaluation_file, } diff --git a/tests/test_cascade_validation.py b/tests/test_cascade_validation.py index 3bda17d11..e2d61fdca 100644 --- a/tests/test_cascade_validation.py +++ b/tests/test_cascade_validation.py @@ -17,27 +17,28 @@ class TestCascadeValidation(unittest.IsolatedAsyncioTestCase): def setUp(self): """Set up test evaluator with cascade validation""" self.config = Config() - + # Create temporary evaluator files for testing self.temp_dir = tempfile.mkdtemp() - + def tearDown(self): """Clean up temporary files""" # Clean up temp files more safely import shutil + shutil.rmtree(self.temp_dir, ignore_errors=True) def _create_evaluator_file(self, filename: str, content: str) -> str: """Helper to create temporary evaluator file""" file_path = os.path.join(self.temp_dir, filename) - with open(file_path, 'w') as f: + with open(file_path, "w") as f: f.write(content) return file_path def test_cascade_validation_with_valid_evaluator(self): """Test cascade validation with evaluator that has cascade functions""" # Create evaluator with cascade functions - evaluator_content = ''' + evaluator_content = """ def evaluate_stage1(program_path): return {"stage1_score": 0.5} @@ -49,37 +50,37 @@ def evaluate_stage3(program_path): def evaluate(program_path): return {"final_score": 1.0} -''' +""" evaluator_path = self._create_evaluator_file("valid_cascade.py", evaluator_content) - + # Configure for cascade evaluation self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = evaluator_path - + # Should not raise warnings for valid cascade evaluator - with patch('openevolve.evaluator.logger') as mock_logger: + with patch("openevolve.evaluator.logger") as mock_logger: evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Should not have called warning mock_logger.warning.assert_not_called() def test_cascade_validation_warning_for_missing_functions(self): """Test cascade validation warns when cascade functions are missing""" # Create evaluator without cascade functions - evaluator_content = ''' + evaluator_content = """ def evaluate(program_path): return {"score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) - + # Configure for cascade evaluation self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = evaluator_path - + # Should warn about missing cascade functions - with patch('openevolve.evaluator.logger') as mock_logger: + with patch("openevolve.evaluator.logger") as mock_logger: evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Should have warned about missing stage functions mock_logger.warning.assert_called() warning_call = mock_logger.warning.call_args[0][0] @@ -89,52 +90,54 @@ def evaluate(program_path): def test_cascade_validation_partial_functions(self): """Test cascade validation with only some cascade functions""" # Create evaluator with only stage1 - evaluator_content = ''' + evaluator_content = """ def evaluate_stage1(program_path): return {"stage1_score": 0.5} def evaluate(program_path): return {"score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("partial_cascade.py", evaluator_content) - + # Configure for cascade evaluation self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = evaluator_path - + # Should warn about missing additional stages - with patch('openevolve.evaluator.logger') as mock_logger: + with patch("openevolve.evaluator.logger") as mock_logger: evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Should warn about missing stage2/stage3 mock_logger.warning.assert_called_once() warning_call = mock_logger.warning.call_args[0][0] - self.assertIn("defines 'evaluate_stage1' but no additional cascade stages", warning_call) + self.assertIn( + "defines 'evaluate_stage1' but no additional cascade stages", warning_call + ) def test_no_cascade_validation_when_disabled(self): """Test no validation when cascade evaluation is disabled""" # Create evaluator without cascade functions - evaluator_content = ''' + evaluator_content = """ def evaluate(program_path): return {"score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) - + # Configure WITHOUT cascade evaluation self.config.evaluator.cascade_evaluation = False self.config.evaluator.evaluation_file = evaluator_path - + # Should not perform validation or warn - with patch('openevolve.evaluator.logger') as mock_logger: + with patch("openevolve.evaluator.logger") as mock_logger: evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Should not warn when cascade evaluation is disabled mock_logger.warning.assert_not_called() async def test_direct_evaluate_supports_evaluation_result(self): """Test that _direct_evaluate supports EvaluationResult returns""" # Create evaluator that returns EvaluationResult - evaluator_content = ''' + evaluator_content = """ from openevolve.evaluation_result import EvaluationResult def evaluate(program_path): @@ -142,30 +145,29 @@ def evaluate(program_path): metrics={"score": 0.8, "accuracy": 0.9}, artifacts={"debug_info": "test data"} ) -''' +""" evaluator_path = self._create_evaluator_file("result_evaluator.py", evaluator_content) - + self.config.evaluator.cascade_evaluation = False self.config.evaluator.evaluation_file = evaluator_path self.config.evaluator.timeout = 10 - + evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Create a dummy program file program_path = self._create_evaluator_file("test_program.py", "def test(): pass") - + # Mock the evaluation function def mock_evaluate(path): return EvaluationResult( - metrics={"score": 0.8, "accuracy": 0.9}, - artifacts={"debug_info": "test data"} + metrics={"score": 0.8, "accuracy": 0.9}, artifacts={"debug_info": "test data"} ) - + evaluator.evaluate_function = mock_evaluate - + # Should handle EvaluationResult without issues result = await evaluator._direct_evaluate(program_path) - + # Should return the EvaluationResult as-is self.assertIsInstance(result, EvaluationResult) self.assertEqual(result.metrics["score"], 0.8) @@ -174,30 +176,30 @@ def mock_evaluate(path): async def test_direct_evaluate_supports_dict_result(self): """Test that _direct_evaluate still supports dict returns""" # Create evaluator that returns dict - evaluator_content = ''' + evaluator_content = """ def evaluate(program_path): return {"score": 0.7, "performance": 0.85} -''' +""" evaluator_path = self._create_evaluator_file("dict_evaluator.py", evaluator_content) - + self.config.evaluator.cascade_evaluation = False self.config.evaluator.evaluation_file = evaluator_path self.config.evaluator.timeout = 10 - + evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Create a dummy program file program_path = self._create_evaluator_file("test_program.py", "def test(): pass") - + # Mock the evaluation function directly def mock_evaluate(path): return {"score": 0.7, "performance": 0.85} - + evaluator.evaluate_function = mock_evaluate - + # Should handle dict result without issues result = await evaluator._direct_evaluate(program_path) - + # Should return the dict as-is self.assertIsInstance(result, dict) self.assertEqual(result["score"], 0.7) @@ -206,7 +208,7 @@ def mock_evaluate(path): def test_cascade_validation_with_class_based_evaluator(self): """Test cascade validation with class-based evaluator""" # Create class-based evaluator with all stages - evaluator_content = ''' + evaluator_content = """ class Evaluator: def evaluate_stage1(self, program_path): return {"stage1_score": 0.5} @@ -229,32 +231,32 @@ def evaluate_stage2(program_path): def evaluate(program_path): evaluator = Evaluator() return evaluator.evaluate(program_path) -''' +""" evaluator_path = self._create_evaluator_file("class_cascade.py", evaluator_content) - + # Configure for cascade evaluation self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = evaluator_path - + # Should not warn since module-level functions exist - with patch('openevolve.evaluator.logger') as mock_logger: + with patch("openevolve.evaluator.logger") as mock_logger: evaluator = Evaluator(self.config.evaluator, evaluator_path) - + mock_logger.warning.assert_not_called() def test_cascade_validation_with_syntax_error(self): """Test cascade validation handles syntax errors gracefully""" # Create evaluator with syntax error - evaluator_content = ''' + evaluator_content = """ def evaluate_stage1(program_path) # Missing colon return {"stage1_score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("syntax_error.py", evaluator_content) - + # Configure for cascade evaluation self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = evaluator_path - + # Should raise an error due to syntax error with self.assertRaises(Exception): # Could be SyntaxError or other import error evaluator = Evaluator(self.config.evaluator, evaluator_path) @@ -265,56 +267,55 @@ def test_cascade_validation_nonexistent_file(self): nonexistent_path = "/nonexistent/path.py" self.config.evaluator.cascade_evaluation = True self.config.evaluator.evaluation_file = nonexistent_path - + # Should raise ValueError for missing file with self.assertRaises(ValueError) as context: evaluator = Evaluator(self.config.evaluator, nonexistent_path) - + self.assertIn("not found", str(context.exception)) def test_process_evaluation_result_with_artifacts(self): """Test that _process_evaluation_result handles artifacts correctly""" - evaluator_content = ''' + evaluator_content = """ def evaluate(program_path): return {"score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("dummy.py", evaluator_content) - + self.config.evaluator.cascade_evaluation = False # Disable cascade to avoid warnings self.config.evaluator.evaluation_file = evaluator_path evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Test with EvaluationResult containing artifacts eval_result = EvaluationResult( - metrics={"score": 0.9}, - artifacts={"log": "test log", "data": [1, 2, 3]} + metrics={"score": 0.9}, artifacts={"log": "test log", "data": [1, 2, 3]} ) - + result = evaluator._process_evaluation_result(eval_result) - + self.assertEqual(result.metrics, {"score": 0.9}) self.assertEqual(result.artifacts, {"log": "test log", "data": [1, 2, 3]}) def test_process_evaluation_result_with_dict(self): """Test that _process_evaluation_result handles dict results correctly""" - evaluator_content = ''' + evaluator_content = """ def evaluate(program_path): return {"score": 0.5} -''' +""" evaluator_path = self._create_evaluator_file("dummy.py", evaluator_content) - + self.config.evaluator.cascade_evaluation = False # Disable cascade to avoid warnings self.config.evaluator.evaluation_file = evaluator_path evaluator = Evaluator(self.config.evaluator, evaluator_path) - + # Test with dict result dict_result = {"score": 0.7, "accuracy": 0.8} - + result = evaluator._process_evaluation_result(dict_result) - + self.assertEqual(result.metrics, {"score": 0.7, "accuracy": 0.8}) self.assertEqual(result.artifacts, {}) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_checkpoint_resume.py b/tests/test_checkpoint_resume.py index 96accfd4b..f593ad7cf 100644 --- a/tests/test_checkpoint_resume.py +++ b/tests/test_checkpoint_resume.py @@ -97,7 +97,9 @@ async def run_test(): self.assertEqual(controller.database.last_iteration, 0) # Mock the parallel controller to avoid API calls - with patch("openevolve.controller.ImprovedParallelController") as mock_controller_class: + with patch( + "openevolve.controller.ImprovedParallelController" + ) as mock_controller_class: mock_controller = Mock() mock_controller.run_evolution = AsyncMock(return_value=None) mock_controller.start = Mock(return_value=None) @@ -105,7 +107,7 @@ async def run_test(): mock_controller.shutdown_flag = Mock() mock_controller.shutdown_flag.is_set.return_value = False mock_controller_class.return_value = mock_controller - + # Run for 0 iterations (just initialization) result = await controller.run(iterations=0) @@ -151,7 +153,9 @@ async def run_test(): controller.database.add(existing_program) # Mock the parallel controller to avoid API calls - with patch("openevolve.controller.ImprovedParallelController") as mock_controller_class: + with patch( + "openevolve.controller.ImprovedParallelController" + ) as mock_controller_class: mock_controller = Mock() mock_controller.run_evolution = AsyncMock(return_value=None) mock_controller.start = Mock(return_value=None) @@ -159,7 +163,7 @@ async def run_test(): mock_controller.shutdown_flag = Mock() mock_controller.shutdown_flag.is_set.return_value = False mock_controller_class.return_value = mock_controller - + # Run for 0 iterations (just initialization) result = await controller.run(iterations=0) @@ -204,7 +208,9 @@ async def run_test(): self.assertEqual(controller.database.last_iteration, 10) # Mock the parallel controller to avoid API calls - with patch("openevolve.controller.ImprovedParallelController") as mock_controller_class: + with patch( + "openevolve.controller.ImprovedParallelController" + ) as mock_controller_class: mock_controller = Mock() mock_controller.run_evolution = AsyncMock(return_value=None) mock_controller.start = Mock(return_value=None) @@ -212,7 +218,7 @@ async def run_test(): mock_controller.shutdown_flag = Mock() mock_controller.shutdown_flag.is_set.return_value = False mock_controller_class.return_value = mock_controller - + # Run for 0 iterations (just initialization) result = await controller.run(iterations=0) @@ -260,7 +266,9 @@ async def run_test(): self.assertEqual(controller.database.last_iteration, 0) # Mock the parallel controller to avoid API calls - with patch("openevolve.controller.ImprovedParallelController") as mock_controller_class: + with patch( + "openevolve.controller.ImprovedParallelController" + ) as mock_controller_class: mock_controller = Mock() mock_controller.run_evolution = AsyncMock(return_value=None) mock_controller.start = Mock(return_value=None) @@ -268,7 +276,7 @@ async def run_test(): mock_controller.shutdown_flag = Mock() mock_controller.shutdown_flag.is_set.return_value = False mock_controller_class.return_value = mock_controller - + # Run for 0 iterations (just initialization) result = await controller.run(iterations=0) diff --git a/tests/test_database.py b/tests/test_database.py index 478f8911c..0d17f8961 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -84,16 +84,16 @@ def test_island_operations_basic(self): """Test basic island operations""" # Test with default islands (should be 5 by default) self.assertEqual(len(self.db.islands), 5) - + program = Program( id="island_test", code="def island_test(): pass", language="python", metrics={"score": 0.6}, ) - + self.db.add(program) - + # Should be in island 0 self.assertIn("island_test", self.db.islands[0]) self.assertEqual(program.metadata.get("island"), 0) @@ -105,10 +105,10 @@ def test_multi_island_setup(self): config.database.in_memory = True config.database.num_islands = 3 multi_db = ProgramDatabase(config.database) - + self.assertEqual(len(multi_db.islands), 3) self.assertEqual(len(multi_db.island_best_programs), 3) - + # Add programs to specific islands for i in range(3): program = Program( @@ -118,7 +118,7 @@ def test_multi_island_setup(self): metrics={"score": 0.5 + i * 0.1}, ) multi_db.add(program, target_island=i) - + # Verify assignment self.assertIn(f"test_island_{i}", multi_db.islands[i]) self.assertEqual(program.metadata.get("island"), i) @@ -131,13 +131,13 @@ def test_feature_coordinates_calculation(self): language="python", metrics={"score": 0.8}, ) - + coords = self.db._calculate_feature_coords(program) - + # Should return list of coordinates self.assertIsInstance(coords, list) self.assertEqual(len(coords), len(self.db.config.feature_dimensions)) - + # All coordinates should be within valid range for coord in coords: self.assertGreaterEqual(coord, 0) @@ -145,38 +145,61 @@ def test_feature_coordinates_calculation(self): def test_feature_map_operations(self): """Test feature map operations for MAP-Elites""" + # Add some initial programs to establish diversity reference set + for i in range(3): + init_program = Program( + id=f"init_{i}", + code=f"def init_{i}(): return {i}", + language="python", + metrics={"score": 0.3}, + ) + self.db.add(init_program) + program1 = Program( id="map_test1", code="def short(): pass", # Similar complexity language="python", metrics={"score": 0.5}, ) - + program2 = Program( - id="map_test2", + id="map_test2", code="def also_short(): pass", # Similar complexity language="python", metrics={"score": 0.8}, # Better score ) - + self.db.add(program1) self.db.add(program2) - - # Both programs might land in same cell due to similar features - # The better program should be kept in the feature map - feature_coords1 = self.db._calculate_feature_coords(program1) - feature_coords2 = self.db._calculate_feature_coords(program2) - - key1 = self.db._feature_coords_to_key(feature_coords1) - key2 = self.db._feature_coords_to_key(feature_coords2) - - if key1 == key2: # Same cell - # Better program should be in feature map - self.assertEqual(self.db.feature_map[key1], "map_test2") - else: # Different cells - # Both should be in feature map - self.assertEqual(self.db.feature_map[key1], "map_test1") - self.assertEqual(self.db.feature_map[key2], "map_test2") + + # Both programs should be in the feature map + # Since they have different codes, they should have different keys + self.assertIn("map_test1", self.db.programs) + self.assertIn("map_test2", self.db.programs) + + # Check that both programs are represented in the feature map + feature_map_values = list(self.db.feature_map.values()) + + # At least one of our test programs should be in the feature map + test_programs_in_map = [v for v in feature_map_values if v in ["map_test1", "map_test2"]] + self.assertGreater( + len(test_programs_in_map), 0, "At least one test program should be in feature map" + ) + + # If both are in the map, verify they have different keys (due to diversity) + if "map_test1" in feature_map_values and "map_test2" in feature_map_values: + # Find their keys + key1 = None + key2 = None + for k, v in self.db.feature_map.items(): + if v == "map_test1": + key1 = k + elif v == "map_test2": + key2 = k + + # If they have the same key, the better program should be kept + if key1 == key2: + self.assertEqual(self.db.feature_map[key1], "map_test2") def test_get_top_programs_with_metrics(self): """Test get_top_programs with specific metrics""" @@ -186,21 +209,21 @@ def test_get_top_programs_with_metrics(self): language="python", metrics={"accuracy": 0.9, "speed": 0.3}, ) - + program2 = Program( id="metric_test2", - code="def test2(): pass", + code="def test2(): pass", language="python", metrics={"accuracy": 0.7, "speed": 0.8}, ) - + self.db.add(program1) self.db.add(program2) - + # Test sorting by specific metric top_by_accuracy = self.db.get_top_programs(n=2, metric="accuracy") self.assertEqual(top_by_accuracy[0].id, "metric_test1") # Higher accuracy - + top_by_speed = self.db.get_top_programs(n=2, metric="speed") self.assertEqual(top_by_speed[0].id, "metric_test2") # Higher speed @@ -215,11 +238,11 @@ def test_archive_operations(self): metrics={"score": i * 0.1}, ) self.db.add(program) - + # Archive should contain program IDs self.assertGreater(len(self.db.archive), 0) self.assertLessEqual(len(self.db.archive), self.db.config.archive_size) - + # Archive should contain program IDs that exist for program_id in self.db.archive: self.assertIn(program_id, self.db.programs) @@ -232,17 +255,17 @@ def test_best_program_tracking(self): language="python", metrics={"combined_score": 0.6}, ) - + program2 = Program( id="best_test2", code="def test2(): pass", - language="python", + language="python", metrics={"combined_score": 0.9}, ) - + self.db.add(program1) self.assertEqual(self.db.best_program_id, "best_test1") - + self.db.add(program2) self.assertEqual(self.db.best_program_id, "best_test2") # Should update to better program @@ -251,7 +274,7 @@ def test_population_limit_enforcement(self): # Set small population limit original_limit = self.db.config.population_size self.db.config.population_size = 3 - + # Add more programs than limit for i in range(5): program = Program( @@ -261,10 +284,10 @@ def test_population_limit_enforcement(self): metrics={"score": i * 0.1}, ) self.db.add(program) - + # Population should be at or below limit self.assertLessEqual(len(self.db.programs), 3) - + # Restore original limit self.db.config.population_size = original_limit @@ -273,18 +296,28 @@ def test_calculate_complexity_bin_adaptive(self): # Add programs with different complexities programs = [ Program(id="short", code="x=1", metrics={"score": 0.5}), - Program(id="medium", code="def func():\n return x*2\n pass", metrics={"score": 0.5}), - Program(id="long", code="def complex_function():\n result = []\n for i in range(100):\n result.append(i*2)\n return result", metrics={"score": 0.5}), + Program( + id="medium", code="def func():\n return x*2\n pass", metrics={"score": 0.5} + ), + Program( + id="long", + code="def complex_function():\n result = []\n for i in range(100):\n result.append(i*2)\n return result", + metrics={"score": 0.5}, + ), ] - + for program in programs: self.db.add(program) - + # Test binning for different complexity values short_bin = self.db._calculate_complexity_bin(len("x=1")) medium_bin = self.db._calculate_complexity_bin(len("def func():\n return x*2\n pass")) - long_bin = self.db._calculate_complexity_bin(len("def complex_function():\n result = []\n for i in range(100):\n result.append(i*2)\n return result")) - + long_bin = self.db._calculate_complexity_bin( + len( + "def complex_function():\n result = []\n for i in range(100):\n result.append(i*2)\n return result" + ) + ) + # Bins should be different and within valid range self.assertNotEqual(short_bin, long_bin) self.assertGreaterEqual(short_bin, 0) @@ -296,14 +329,14 @@ def test_calculate_complexity_bin_cold_start(self): """Test complexity binning during cold start (< 2 programs)""" # Empty database - should use fixed range bin_idx = self.db._calculate_complexity_bin(500) - + self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) - + # Add one program - still cold start program = Program(id="single", code="x=1", metrics={"score": 0.5}) self.db.add(program) - + bin_idx = self.db._calculate_complexity_bin(500) self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) @@ -314,24 +347,32 @@ def test_calculate_diversity_bin_adaptive(self): programs = [ Program(id="simple", code="x = 1", metrics={"score": 0.5}), Program(id="function", code="def add(a, b):\n return a + b", metrics={"score": 0.5}), - Program(id="loop", code="for i in range(10):\n print(i)\n x += i", metrics={"score": 0.5}), - Program(id="complex", code="class MyClass:\n def __init__(self):\n self.data = []\n def process(self, items):\n return [x*2 for x in items]", metrics={"score": 0.5}), + Program( + id="loop", + code="for i in range(10):\n print(i)\n x += i", + metrics={"score": 0.5}, + ), + Program( + id="complex", + code="class MyClass:\n def __init__(self):\n self.data = []\n def process(self, items):\n return [x*2 for x in items]", + metrics={"score": 0.5}, + ), ] - + for program in programs: self.db.add(program) - + # Test binning for different diversity values # Use fast diversity to calculate test values simple_prog = programs[0] complex_prog = programs[3] - + # Calculate diversity for simple vs complex programs simple_diversity = self.db._fast_code_diversity(simple_prog.code, complex_prog.code) - + # Test the binning bin_idx = self.db._calculate_diversity_bin(simple_diversity) - + # Should be within valid range self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) @@ -340,14 +381,14 @@ def test_calculate_diversity_bin_cold_start(self): """Test diversity binning during cold start (< 2 programs)""" # Empty database - should use fixed range bin_idx = self.db._calculate_diversity_bin(500.0) - + self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) - + # Add one program - still cold start program = Program(id="single", code="x=1", metrics={"score": 0.5}) self.db.add(program) - + bin_idx = self.db._calculate_diversity_bin(500.0) self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) @@ -357,15 +398,13 @@ def test_calculate_diversity_bin_identical_programs(self): # Add multiple identical programs for i in range(3): program = Program( - id=f"identical_{i}", - code="x = 1", # Same code - metrics={"score": 0.5} + id=f"identical_{i}", code="x = 1", metrics={"score": 0.5} # Same code ) self.db.add(program) - + # Test binning - should handle zero range gracefully bin_idx = self.db._calculate_diversity_bin(0.0) - + self.assertGreaterEqual(bin_idx, 0) self.assertLess(bin_idx, self.db.feature_bins) @@ -376,13 +415,13 @@ def test_fast_code_diversity_function(self): code2 = "def test(): pass" diversity = self.db._fast_code_diversity(code1, code2) self.assertEqual(diversity, 0.0) - + # Test different code code1 = "x = 1" code2 = "def complex_function():\n return [i*2 for i in range(100)]" diversity = self.db._fast_code_diversity(code1, code2) self.assertGreater(diversity, 0.0) - + # Test length difference short_code = "x = 1" long_code = "x = 1" + "a" * 100 @@ -397,22 +436,22 @@ def test_diversity_feature_integration(self): Program(id="prog2", code="def func():\n return 2", metrics={"score": 0.5}), Program(id="prog3", code="for i in range(5):\n print(i)", metrics={"score": 0.5}), ] - + for program in programs: self.db.add(program) - + # Create a test program with diversity feature enabled test_config = self.db.config test_config.feature_dimensions = ["score", "complexity", "diversity"] - + test_program = Program(id="test", code="def test(): return 42", metrics={"score": 0.7}) - + # Calculate feature coordinates - should include diversity dimension coords = self.db._calculate_feature_coords(test_program) - + # Should have 3 coordinates for score, complexity, and diversity self.assertEqual(len(coords), 3) - + # All coordinates should be within valid range for coord in coords: self.assertGreaterEqual(coord, 0) diff --git a/tests/test_island_migration.py b/tests/test_island_migration.py index 116dc6b49..62e8c0f8b 100644 --- a/tests/test_island_migration.py +++ b/tests/test_island_migration.py @@ -26,7 +26,7 @@ def _create_test_program(self, program_id: str, score: float, island: int) -> Pr code=f"def func_{program_id}(): return {score}", language="python", metrics={"score": score, "combined_score": score}, - metadata={"island": island} + metadata={"island": island}, ) return program @@ -35,11 +35,11 @@ def test_initial_island_setup(self): self.assertEqual(len(self.db.islands), 3) self.assertEqual(len(self.db.island_best_programs), 3) self.assertEqual(len(self.db.island_generations), 3) - + # All islands should be empty initially for island in self.db.islands: self.assertEqual(len(island), 0) - + # All island best programs should be None initially for best_id in self.db.island_best_programs: self.assertIsNone(best_id) @@ -50,16 +50,16 @@ def test_program_island_assignment(self): program1 = self._create_test_program("test1", 0.5, 0) program2 = self._create_test_program("test2", 0.7, 1) program3 = self._create_test_program("test3", 0.3, 2) - + self.db.add(program1, target_island=0) - self.db.add(program2, target_island=1) + self.db.add(program2, target_island=1) self.db.add(program3, target_island=2) - + # Verify island assignments self.assertIn("test1", self.db.islands[0]) self.assertIn("test2", self.db.islands[1]) self.assertIn("test3", self.db.islands[2]) - + # Verify metadata self.assertEqual(self.db.programs["test1"].metadata["island"], 0) self.assertEqual(self.db.programs["test2"].metadata["island"], 1) @@ -69,11 +69,11 @@ def test_should_migrate_logic(self): """Test the migration timing logic""" # Initially should not migrate (no generations passed) self.assertFalse(self.db.should_migrate()) - + # Advance island generations self.db.island_generations = [5, 6, 7] # Max is 7, last migration was 0, so 7-0=7 >= 5 self.assertTrue(self.db.should_migrate()) - + # Test with mixed generations below threshold self.db.island_generations = [3, 4, 2] # Max is 4, 4-0=4 < 5 self.assertFalse(self.db.should_migrate()) @@ -83,32 +83,32 @@ def test_migration_ring_topology(self): # Add programs to islands 0 and 1 program1 = self._create_test_program("test1", 0.8, 0) program2 = self._create_test_program("test2", 0.6, 1) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=1) - + # Set up for migration self.db.island_generations = [6, 6, 6] # Trigger migration - + initial_program_count = len(self.db.programs) - + # Perform migration self.db.migrate_programs() - + # Should have created migrant copies self.assertGreater(len(self.db.programs), initial_program_count) - + # Check that migrants were created with proper naming migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] self.assertGreater(len(migrant_ids), 0) - + # Verify ring topology: island 0 -> islands 1,2 island_0_migrants = [pid for pid in migrant_ids if "test1_migrant_" in pid] - + # test1 from island 0 should migrate to islands 1 and 2 (0+1=1, 0-1=-1%3=2) self.assertTrue(any(pid.endswith("_1") for pid in island_0_migrants)) self.assertTrue(any(pid.endswith("_2") for pid in island_0_migrants)) - + # Note: Due to the current migration implementation, test2 may not create direct migrants # when test1 migrants are added to island 1 during the same migration round. # This is a known limitation of the current implementation that processes islands @@ -122,27 +122,31 @@ def test_migration_rate_respected(self): program = self._create_test_program(f"test{i}", 0.5 + i * 0.05, 0) programs.append(program) self.db.add(program, target_island=0) - + # Set up for migration self.db.island_generations = [6, 6, 6] - + initial_count = len(self.db.programs) - + # Perform migration self.db.migrate_programs() - + # Calculate expected migrants # With 50% migration rate and 10 programs, expect 5 migrants # Each migrant goes to 2 target islands, so 10 initial new programs # But migrants can themselves migrate, so more programs are created initial_migrants = 5 * 2 # 5 migrants * 2 target islands each actual_new_programs = len(self.db.programs) - initial_count - + # Should have at least the initial expected migrants self.assertGreaterEqual(actual_new_programs, initial_migrants) - + # Check that the right number of first-generation migrants were created - first_gen_migrants = [pid for pid in self.db.programs.keys() if pid.count('_migrant_') == 1 and '_migrant_' in pid] + first_gen_migrants = [ + pid + for pid in self.db.programs.keys() + if pid.count("_migrant_") == 1 and "_migrant_" in pid + ] self.assertEqual(len(first_gen_migrants), initial_migrants) def test_migration_preserves_best_programs(self): @@ -151,21 +155,21 @@ def test_migration_preserves_best_programs(self): program1 = self._create_test_program("low_score", 0.2, 0) program2 = self._create_test_program("high_score", 0.9, 0) program3 = self._create_test_program("med_score", 0.5, 0) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=0) self.db.add(program3, target_island=0) - + # Set up for migration self.db.island_generations = [6, 6, 6] - + # Perform migration self.db.migrate_programs() - + # Check that the high-score program was selected for migration migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] high_score_migrants = [pid for pid in migrant_ids if "high_score_migrant_" in pid] - + self.assertGreater(len(high_score_migrants), 0) def test_migration_updates_generations(self): @@ -173,13 +177,13 @@ def test_migration_updates_generations(self): # Add a program and set up for migration program = self._create_test_program("test1", 0.5, 0) self.db.add(program, target_island=0) - + self.db.island_generations = [6, 7, 8] initial_migration_gen = self.db.last_migration_generation - + # Perform migration self.db.migrate_programs() - + # Should update to max of island generations self.assertEqual(self.db.last_migration_generation, 8) self.assertGreater(self.db.last_migration_generation, initial_migration_gen) @@ -189,10 +193,10 @@ def test_migration_with_empty_islands(self): # Add program only to island 0, leave others empty program = self._create_test_program("test1", 0.5, 0) self.db.add(program, target_island=0) - + # Set up for migration self.db.island_generations = [6, 6, 6] - + # Should not crash with empty islands try: self.db.migrate_programs() @@ -203,34 +207,34 @@ def test_migration_creates_proper_copies(self): """Test that migration creates proper program copies""" program = self._create_test_program("original", 0.7, 0) self.db.add(program, target_island=0) - + # Set up for migration self.db.island_generations = [6, 6, 6] - + # Perform migration self.db.migrate_programs() - + # Find migrant copies migrant_ids = [pid for pid in self.db.programs.keys() if "original_migrant_" in pid] self.assertGreater(len(migrant_ids), 0) - + # Check first-generation migrant properties - first_gen_migrants = [pid for pid in migrant_ids if pid.count('_migrant_') == 1] + first_gen_migrants = [pid for pid in migrant_ids if pid.count("_migrant_") == 1] self.assertGreater(len(first_gen_migrants), 0) - + for migrant_id in first_gen_migrants: migrant = self.db.programs[migrant_id] - + # Should have same code and metrics as original self.assertEqual(migrant.code, program.code) self.assertEqual(migrant.metrics, program.metrics) - + # Should have proper parent reference self.assertEqual(migrant.parent_id, "original") - + # Should be marked as migrant self.assertTrue(migrant.metadata.get("migrant", False)) - + # Should be in correct target island target_island = migrant.metadata["island"] self.assertIn(migrant_id, self.db.islands[target_island]) @@ -242,20 +246,20 @@ def test_no_migration_with_single_island(self): config.database.in_memory = True config.database.num_islands = 1 single_island_db = ProgramDatabase(config.database) - + program = self._create_test_program("test1", 0.5, 0) single_island_db.add(program, target_island=0) - + single_island_db.island_generations = [6] - + initial_count = len(single_island_db.programs) - + # Should not perform migration single_island_db.migrate_programs() - + # Program count should remain the same self.assertEqual(len(single_island_db.programs), initial_count) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_island_tracking.py b/tests/test_island_tracking.py index 28723da1f..160f9161c 100644 --- a/tests/test_island_tracking.py +++ b/tests/test_island_tracking.py @@ -24,7 +24,7 @@ def _create_test_program(self, program_id: str, score: float, island: int) -> Pr code=f"def func_{program_id}(): return {score}", language="python", metrics={"score": score, "combined_score": score}, - metadata={"island": island} + metadata={"island": island}, ) return program @@ -39,10 +39,10 @@ def test_first_program_becomes_island_best(self): """Test that the first program added to an island becomes the best""" program = self._create_test_program("first", 0.5, 0) self.db.add(program, target_island=0) - + # Should become the best program for island 0 self.assertEqual(self.db.island_best_programs[0], "first") - + # Other islands should still have None self.assertIsNone(self.db.island_best_programs[1]) self.assertIsNone(self.db.island_best_programs[2]) @@ -53,7 +53,7 @@ def test_better_program_updates_island_best(self): program1 = self._create_test_program("mediocre", 0.5, 0) self.db.add(program1, target_island=0) self.assertEqual(self.db.island_best_programs[0], "mediocre") - + # Add better program program2 = self._create_test_program("better", 0.8, 0) self.db.add(program2, target_island=0) @@ -65,11 +65,11 @@ def test_worse_program_does_not_update_island_best(self): program1 = self._create_test_program("good", 0.8, 0) self.db.add(program1, target_island=0) self.assertEqual(self.db.island_best_programs[0], "good") - + # Add worse program program2 = self._create_test_program("worse", 0.3, 0) self.db.add(program2, target_island=0) - + # Should still be the good program self.assertEqual(self.db.island_best_programs[0], "good") @@ -79,11 +79,11 @@ def test_island_isolation_in_best_tracking(self): program1 = self._create_test_program("island0_best", 0.9, 0) program2 = self._create_test_program("island1_best", 0.7, 1) program3 = self._create_test_program("island2_best", 0.5, 2) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=1) self.db.add(program3, target_island=2) - + # Each island should track its own best self.assertEqual(self.db.island_best_programs[0], "island0_best") self.assertEqual(self.db.island_best_programs[1], "island1_best") @@ -94,10 +94,10 @@ def test_migration_updates_island_best(self): # Add program to island 0 original = self._create_test_program("original", 0.6, 0) self.db.add(original, target_island=0) - + # Island 1 starts empty self.assertIsNone(self.db.island_best_programs[1]) - + # Manually create a migrant to island 1 (simulating migration) migrant = Program( id="original_migrant_1", @@ -106,12 +106,12 @@ def test_migration_updates_island_best(self): parent_id=original.id, generation=original.generation, metrics=original.metrics.copy(), - metadata={"island": 1, "migrant": True} + metadata={"island": 1, "migrant": True}, ) - + # Add migrant to island 1 self.db.add(migrant, target_island=1) - + # Should become best for island 1 self.assertEqual(self.db.island_best_programs[1], "original_migrant_1") @@ -121,23 +121,23 @@ def test_get_top_programs_island_specific(self): program1 = self._create_test_program("prog1", 0.9, 0) program2 = self._create_test_program("prog2", 0.7, 0) program3 = self._create_test_program("prog3", 0.5, 0) - + # Add programs to island 1 program4 = self._create_test_program("prog4", 0.8, 1) program5 = self._create_test_program("prog5", 0.6, 1) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=0) self.db.add(program3, target_island=0) self.db.add(program4, target_island=1) self.db.add(program5, target_island=1) - + # Get top programs from island 0 island0_top = self.db.get_top_programs(n=2, island_idx=0) self.assertEqual(len(island0_top), 2) self.assertEqual(island0_top[0].id, "prog1") # Highest score self.assertEqual(island0_top[1].id, "prog2") # Second highest - + # Get top programs from island 1 island1_top = self.db.get_top_programs(n=2, island_idx=1) self.assertEqual(len(island1_top), 2) @@ -152,20 +152,20 @@ def test_island_best_with_combined_score(self): code="def test1(): pass", language="python", metrics={"score": 0.5, "other": 0.3, "combined_score": 0.4}, - metadata={"island": 0} + metadata={"island": 0}, ) - + program2 = Program( - id="test2", + id="test2", code="def test2(): pass", language="python", metrics={"score": 0.3, "other": 0.7, "combined_score": 0.5}, - metadata={"island": 0} + metadata={"island": 0}, ) - + self.db.add(program1, target_island=0) self.assertEqual(self.db.island_best_programs[0], "test1") - + # program2 has higher combined_score, should become best self.db.add(program2, target_island=0) self.assertEqual(self.db.island_best_programs[0], "test2") @@ -175,15 +175,15 @@ def test_island_best_with_missing_program(self): program = self._create_test_program("to_remove", 0.8, 0) self.db.add(program, target_island=0) self.assertEqual(self.db.island_best_programs[0], "to_remove") - + # Manually remove the program (simulating cleanup) del self.db.programs["to_remove"] self.db.islands[0].remove("to_remove") - + # Add a new program - should detect stale reference and update new_program = self._create_test_program("new", 0.6, 0) self.db.add(new_program, target_island=0) - + # Should update the best program (the old one is gone) self.assertEqual(self.db.island_best_programs[0], "new") @@ -192,33 +192,35 @@ def test_sample_inspirations_from_island(self): # Add programs to island 0 program1 = self._create_test_program("island0_prog1", 0.9, 0) program2 = self._create_test_program("island0_prog2", 0.7, 0) - + # Add programs to island 1 program3 = self._create_test_program("island1_prog1", 0.8, 1) program4 = self._create_test_program("island1_prog2", 0.6, 1) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=0) self.db.add(program3, target_island=1) self.db.add(program4, target_island=1) - + # Sample from island 0 program inspirations = self.db._sample_inspirations(program1, n=5) - + # All inspirations should be from island 0 for inspiration in inspirations: island = inspiration.metadata.get("island") - self.assertEqual(island, 0, f"Program {inspiration.id} should be from island 0, got {island}") + self.assertEqual( + island, 0, f"Program {inspiration.id} should be from island 0, got {island}" + ) def test_island_status_logging(self): """Test island status logging functionality""" # Add programs to different islands program1 = self._create_test_program("p1", 0.9, 0) program2 = self._create_test_program("p2", 0.7, 1) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=1) - + # Should not crash when logging status try: self.db.log_island_status() @@ -230,21 +232,21 @@ def test_island_best_persistence(self): # Add programs to islands program1 = self._create_test_program("best0", 0.9, 0) program2 = self._create_test_program("best1", 0.8, 1) - + self.db.add(program1, target_island=0) self.db.add(program2, target_island=1) - + # Verify initial state self.assertEqual(self.db.island_best_programs[0], "best0") self.assertEqual(self.db.island_best_programs[1], "best1") - + # Add more programs that are not better program3 = self._create_test_program("worse0", 0.5, 0) program4 = self._create_test_program("worse1", 0.4, 1) - + self.db.add(program3, target_island=0) self.db.add(program4, target_island=1) - + # Best should remain unchanged self.assertEqual(self.db.island_best_programs[0], "best0") self.assertEqual(self.db.island_best_programs[1], "best1") @@ -263,4 +265,4 @@ def test_empty_island_top_programs(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_map_elites_features.py b/tests/test_map_elites_features.py new file mode 100644 index 000000000..5984f054e --- /dev/null +++ b/tests/test_map_elites_features.py @@ -0,0 +1,273 @@ +""" +Tests for MAP-Elites feature enhancements in openevolve.database +""" + +import unittest +from unittest.mock import MagicMock, patch +from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase + + +class TestMapElitesFeatures(unittest.TestCase): + """Tests for MAP-Elites feature enhancements""" + + def setUp(self): + """Set up test database with enhanced features""" + config = Config() + config.database.in_memory = True + config.database.feature_dimensions = ["complexity", "diversity"] + config.database.feature_bins = 10 + self.db = ProgramDatabase(config.database) + + def test_diversity_caching(self): + """Test diversity score caching functionality""" + # Add some reference programs + for i in range(5): + program = Program( + id=f"ref_{i}", + code=f"def func_{i}():\n return {i} * 2", + language="python", + metrics={"score": 0.5 + i * 0.1}, + ) + self.db.add(program) + + # Create a test program + test_program = Program( + id="test", code="def test():\n return 42", language="python", metrics={"score": 0.7} + ) + + # First call should compute diversity + diversity1 = self.db._get_cached_diversity(test_program) + + # Second call should use cache + diversity2 = self.db._get_cached_diversity(test_program) + + # Should be the same value + self.assertEqual(diversity1, diversity2) + + # Check cache was populated + code_hash = hash(test_program.code) + self.assertIn(code_hash, self.db.diversity_cache) + self.assertEqual(self.db.diversity_cache[code_hash]["value"], diversity1) + + def test_diversity_reference_set_update(self): + """Test diversity reference set updates""" + # Initially empty + self.assertEqual(len(self.db.diversity_reference_set), 0) + + # Add programs + for i in range(30): + program = Program( + id=f"prog_{i}", + code=f"def func_{i}():\n " + "x = 1\n" * i, # Varying complexity + language="python", + metrics={"score": 0.5}, + ) + self.db.add(program) + + # Update reference set + self.db._update_diversity_reference_set() + + # Should have up to diversity_reference_size programs + expected_size = min(30, self.db.diversity_reference_size) + self.assertEqual(len(self.db.diversity_reference_set), expected_size) + + # Reference set should contain diverse programs (no duplicates) + self.assertEqual(len(set(self.db.diversity_reference_set)), expected_size) + + def test_feature_scaling_minmax(self): + """Test min-max feature scaling""" + # Add programs with different complexities + complexities = [100, 300, 500, 700, 900] + for i, complexity in enumerate(complexities): + program = Program( + id=f"scale_{i}", code="x" * complexity, language="python", metrics={"score": 0.5} + ) + self.db.add(program) + + # Test scaling + # Min value (100) should scale to 0 + scaled_min = self.db._scale_feature_value("complexity", 100.0) + self.assertAlmostEqual(scaled_min, 0.0, places=5) + + # Max value (900) should scale to 1 + scaled_max = self.db._scale_feature_value("complexity", 900.0) + self.assertAlmostEqual(scaled_max, 1.0, places=5) + + # Middle value (500) should scale to 0.5 + scaled_mid = self.db._scale_feature_value("complexity", 500.0) + self.assertAlmostEqual(scaled_mid, 0.5, places=5) + + def test_feature_stats_update(self): + """Test feature statistics are updated correctly""" + # Initially no stats + self.assertEqual(len(self.db.feature_stats), 0) + + # Add programs and check stats are updated + values = [10.0, 20.0, 30.0, 40.0, 50.0] + for val in values: + self.db._update_feature_stats("test_feature", val) + + # Check stats + stats = self.db.feature_stats["test_feature"] + self.assertEqual(stats["min"], 10.0) + self.assertEqual(stats["max"], 50.0) + self.assertEqual(len(stats["values"]), 5) + self.assertEqual(stats["values"], values) + + def test_per_dimension_bins(self): + """Test per-dimension bin configuration""" + # Create database with per-dimension bins + config = Config() + config.database.in_memory = True + config.database.feature_dimensions = ["complexity", "diversity", "score"] + config.database.feature_bins = {"complexity": 20, "diversity": 10, "score": 5} + db = ProgramDatabase(config.database) + + # Check per-dimension bins were set correctly + self.assertEqual(db.feature_bins_per_dim["complexity"], 20) + self.assertEqual(db.feature_bins_per_dim["diversity"], 10) + self.assertEqual(db.feature_bins_per_dim["score"], 5) + + # Add a program and check binning + program = Program( + id="test_bins", + code="def test():\n return 42", + language="python", + metrics={"score": 0.8}, + ) + db.add(program) + + coords = db._calculate_feature_coords(program) + + # Each coordinate should be within its dimension's range + self.assertLess(coords[0], 20) # complexity + self.assertLess(coords[1], 10) # diversity + self.assertLess(coords[2], 5) # score + + def test_default_feature_dimensions(self): + """Test default feature dimensions are complexity and diversity""" + config = Config() + # Don't set feature_dimensions, use defaults + self.assertEqual(config.database.feature_dimensions, ["complexity", "diversity"]) + + def test_diversity_cache_lru_eviction(self): + """Test LRU eviction in diversity cache""" + # Set small cache size + self.db.diversity_cache_size = 3 + + # Add reference programs + for i in range(3): + program = Program( + id=f"ref_{i}", + code=f"def func_{i}(): pass", + language="python", + metrics={"score": 0.5}, + ) + self.db.add(program) + + # Fill cache + programs = [] + for i in range(5): + program = Program( + id=f"cache_test_{i}", + code=f"def test_{i}(): return {i}", + language="python", + metrics={"score": 0.5}, + ) + programs.append(program) + self.db._get_cached_diversity(program) + + # Cache should only have last 3 entries + self.assertLessEqual(len(self.db.diversity_cache), 3) + + # First 2 programs should be evicted + self.assertNotIn(hash(programs[0].code), self.db.diversity_cache) + self.assertNotIn(hash(programs[1].code), self.db.diversity_cache) + + # Last 3 should be in cache + self.assertIn(hash(programs[2].code), self.db.diversity_cache) + self.assertIn(hash(programs[3].code), self.db.diversity_cache) + self.assertIn(hash(programs[4].code), self.db.diversity_cache) + + def test_feature_scaling_with_identical_values(self): + """Test feature scaling when all values are identical""" + # Add programs with same complexity + for i in range(3): + program = Program( + id=f"same_{i}", + code="x" * 100, # Same length + language="python", + metrics={"score": 0.5}, + ) + self.db.add(program) + + # Scaling should return 0.5 for all values when min==max + scaled = self.db._scale_feature_value("complexity", 100.0) + self.assertEqual(scaled, 0.5) + + def test_feature_coordinates_with_new_defaults(self): + """Test feature coordinate calculation with new default dimensions""" + # Create fresh database with default config + config = Config() + config.database.in_memory = True + db = ProgramDatabase(config.database) + + # Default dimensions should be complexity and diversity + self.assertEqual(db.config.feature_dimensions, ["complexity", "diversity"]) + + # Add some programs + for i in range(5): + program = Program( + id=f"default_test_{i}", + code=f"def func_{i}():\n " + "pass\n" * i, + language="python", + metrics={"score": 0.5 + i * 0.1}, + ) + db.add(program) + + # Test program + test_program = Program( + id="test_defaults", + code="def test():\n return 42", + language="python", + metrics={"score": 0.7}, + ) + + coords = db._calculate_feature_coords(test_program) + + # Should have 2 coordinates (complexity, diversity) + self.assertEqual(len(coords), 2) + + # Both should be valid bin indices + for coord in coords: + self.assertGreaterEqual(coord, 0) + self.assertLess(coord, db.feature_bins) + + def test_missing_feature_dimension_error(self): + """Test that missing feature dimensions raise appropriate errors""" + config = Config() + config.database.in_memory = True + config.database.feature_dimensions = ["complexity", "nonexistent_metric"] + db = ProgramDatabase(config.database) + + # Add a program without the required metric + program = Program( + id="test_error", + code="def test(): pass", + language="python", + metrics={"score": 0.5}, # Missing 'nonexistent_metric' + ) + + # Should raise ValueError when calculating feature coordinates + with self.assertRaises(ValueError) as context: + db.add(program) + + # Check error message + self.assertIn("nonexistent_metric", str(context.exception)) + self.assertIn("not found in program metrics", str(context.exception)) + self.assertIn("score", str(context.exception)) # Should show available metrics + + +if __name__ == "__main__": + unittest.main()