
Evolution vs. Parallel: Is the evolution process really necessary? #100

@kongwanbianjinyu

To verify whether the evolution process actually helps, I compared it with a parallel-sampling ablation in which every iteration is prompted with only the initial program, and the final program is chosen by best-of-N selection. I verified this on the circle_packing problem, following the two-stage setting: stage 1 starts from the initial program, and stage 2 uses the best program from checkpoint 100 as its initial program. The results show that even without the evolution history, previous-program context, and evolution tree in the prompt, the system performs comparably.

Best Evolve score: 0.9927
Best Parallel score: 0.9924

This ablation study raises doubts about the necessity of incorporating the evolutionary process.
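For reference, the two-stage ablation is driven by something like the sketch below. This is a minimal sketch: `controller_cls`, its constructor argument, and the per-stage iteration count are placeholders for my local setup, not the OpenEvolve API; the only real entry point is the `run_parallel` method shown further down.

```python
# Minimal sketch of the two-stage, best-of-N ablation (placeholder names,
# not the OpenEvolve API). Stage 2 restarts run_parallel from the stage-1
# winner, mirroring how the evolve run resumes from the checkpoint-100 best.
async def two_stage_parallel(controller_cls, initial_code: str, iterations: int = 100):
    # Stage 1: independent rewrites of the original initial program
    stage1 = controller_cls(initial_program_code=initial_code)
    best1 = await stage1.run_parallel(iterations=iterations)

    # Stage 2: independent rewrites of the stage-1 best program
    stage2 = controller_cls(initial_program_code=best1.code)
    best2 = await stage2.run_parallel(iterations=iterations)

    # Best-of-N over everything generated
    return max(
        (best1, best2),
        key=lambda p: p.metrics.get("combined_score", float("-inf")),
    )
```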


The parallel template is:

PARALLEL_TEMPLATE = '''\
# Current Program
```{language}
{current_program}
```

# Task
Rewrite the program to improve its performance on the specified metrics.
Provide the complete new program code.

IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs
as the original program, but with improved internal implementation.

```{language}
# Your rewritten program here
```
'''
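In the actual run this template is filled in by `prompt_sampler.build_prompt` with `template_key="parallel"` (see the code below). As a quick illustration, rendering it directly with plain string formatting looks like this; the initial program shown here is just a placeholder:

```python
# Illustration only: plain str.format on the template above.
# The real pipeline goes through prompt_sampler.build_prompt(template_key="parallel").
initial_code = "def construct_packing():\n    ...  # placeholder initial program\n"

user_prompt = PARALLEL_TEMPLATE.format(
    language="python",
    current_program=initial_code,
)
print(user_prompt)
```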

This is the code for the parallel run:

    async def run_parallel(
        self,
        iterations: Optional[int] = None,
        target_score: Optional[float] = None,
    ) -> Program:
        """
        Run the no-evolution ablation: each iteration independently rewrites the
        initial program, and the best result is kept via best-of-N selection.
        """
        max_iterations = iterations or self.config.max_iterations

        # Always start fresh - no resuming for parallel mode
        start_iteration = 0

        # Add initial program to database
        logger.info("Adding initial program to database for parallel evolution")
        initial_program_id = str(uuid.uuid4())

        # Evaluate the initial program
        initial_metrics = await self.evaluator.evaluate_program(
            self.initial_program_code, initial_program_id
        )

        initial_program = Program(
            id=initial_program_id,
            code=self.initial_program_code,
            language=self.language,
            metrics=initial_metrics,
            iteration_found=start_iteration,
        )

        # Create a simple database for parallel mode (no islands)
        all_programs = [initial_program]
        best_program = initial_program

        logger.info(
            f"Starting parallel evolution for {max_iterations} iterations"
        )
        logger.info(f"Initial program metrics: {format_metrics_safe(initial_metrics)}")

        for i in range(max_iterations):
            logger.info(f"############### PARALLEL_ITERATION {i+1} ############### ")
            iteration_start = time.time()

            # Build prompt always based on initial program (no evolution)
            prompt = self.prompt_sampler.build_prompt(
                current_program=self.initial_program_code,
                parent_program=self.initial_program_code,
                program_metrics=initial_metrics,
                previous_programs=[],  # No previous programs in parallel mode
                top_programs=[],  # No top programs evolution
                language=self.language,
                evolution_round=i,
                allow_full_rewrite=True,  # Always allow full rewrite in parallel mode
                program_artifacts=None,
                template_key="parallel"
            )

            logger.info(
                f"------ PROMPT ------\n"
                f"Iteration {i+1}: Generating independent program from initial code\n"
                f"System prompt: {prompt['system']}\n"
                f"User prompt: {prompt['user']}\n"
                f"------------------"
            )

            # Generate new program independently
            llm_response = await self.llm_ensemble.generate_with_context(
                system_message=prompt["system"],
                messages=[{"role": "user", "content": prompt["user"]}],
            )

            # Parse the response as full rewrite (always full rewrite in parallel mode)
            new_code = parse_full_rewrite(llm_response, self.language)

            if not new_code:
                logger.warning(f"Iteration {i+1}: No valid code found in response")
                continue

            # Check code length
            if len(new_code) > self.config.max_code_length:
                logger.warning(
                    f"Iteration {i+1}: Generated code exceeds maximum length "
                    f"({len(new_code)} > {self.config.max_code_length})"
                )
                continue

            # Evaluate the new program
            program_id = str(uuid.uuid4())
            program_metrics = await self.evaluator.evaluate_program(new_code, program_id)

            # Create new program (no parent-child relationship)
            new_program = Program(
                id=program_id,
                code=new_code,
                language=self.language,
                parent_id=None,  # No parent in parallel mode
                generation=0,  # All programs are generation 0
                metrics=program_metrics,
                iteration_found=i + 1,
                metadata={
                    "changes": "Independent generation from initial program",
                    "mode": "parallel",
                },
            )

            logger.info("------------------ GENERATED PROGRAM --------------")
            logger.info(f"Generated program {program_id}:")
            logger.info(f"Code:\n{new_code}")
            logger.info("------------------------------------------------")

            # Add to our list of programs
            all_programs.append(new_program)

            # Update best program using best-of-N selection
            if self._is_better_program(new_program, best_program):
                best_program = new_program
                logger.info(
                    f"🌟 New best solution found at iteration {i+1}: {new_program.id}"
                )
                logger.info(f"Metrics: {format_metrics_safe(new_program.metrics)}")

            # Log progress
            iteration_time = time.time() - iteration_start
            improvement_str = format_improvement_safe(initial_metrics, program_metrics)
            
            logger.info(
                f"Iteration {i+1}: Generated program {program_id} "
                f"in {iteration_time:.2f}s. Metrics: "
                f"{format_metrics_safe(program_metrics)} "
                f"(Δ from initial: {improvement_str})"
            )

            # Save checkpoint periodically
            if (i + 1) % self.config.checkpoint_interval == 0:
                self._save_parallel_checkpoint(i + 1, all_programs, best_program)

            # Check if target score reached
            if target_score is not None and 'combined_score' in program_metrics:
                if program_metrics['combined_score'] >= target_score:
                    logger.info(f"Target score {target_score} reached after {i+1} iterations")
                    break

        # Final best-of-N selection
        logger.info(f"Parallel evolution complete. Generated {len(all_programs)} programs total.")
        logger.info(f"Best program: {best_program.id}")
        logger.info(f"Best program metrics: {format_metrics_safe(best_program.metrics)}")

        # Save the best program
        self._save_parallel_best_program(best_program, all_programs)

        return best_program

    def _is_better_program(self, program1: Program, program2: Program) -> bool:
        """
        Compare two programs to determine which is better
        Uses combined_score if available, otherwise uses a heuristic
        """
        metrics1 = program1.metrics
        metrics2 = program2.metrics
        
        # Primary comparison: combined_score
        if 'combined_score' in metrics1 and 'combined_score' in metrics2:
            return metrics1['combined_score'] > metrics2['combined_score']
        
        # Fallback: use other metrics (customize based on your evaluation metrics)
        # This is a simple heuristic - adjust based on your specific metrics
        if 'efficiency' in metrics1 and 'efficiency' in metrics2:
            return metrics1['efficiency'] > metrics2['efficiency']
            
        # Final fallback: assume program1 is not better
        return False

    def _save_parallel_checkpoint(self, iteration: int, all_programs: List[Program], best_program: Program) -> None:
        """
        Save a checkpoint for parallel evolution
        """
        checkpoint_dir = os.path.join(self.output_dir, "checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Create specific checkpoint directory
        checkpoint_path = os.path.join(checkpoint_dir, f"parallel_checkpoint_{iteration}")
        os.makedirs(checkpoint_path, exist_ok=True)

        # Save all programs as JSON
        programs_data = []
        for program in all_programs:
            programs_data.append({
                "id": program.id,
                "code": program.code,
                "language": program.language,
                "metrics": program.metrics,
                "iteration_found": program.iteration_found,
                "timestamp": program.timestamp,
                "metadata": program.metadata,
            })

        programs_file = os.path.join(checkpoint_path, "all_programs.json")
        with open(programs_file, "w") as f:
            json.dump(programs_data, f, indent=2)

        # Save the best program at this checkpoint
        if best_program:
            best_program_path = os.path.join(checkpoint_path, f"best_program{self.file_extension}")
            with open(best_program_path, "w") as f:
                f.write(best_program.code)

            # Save metrics
            best_program_info_path = os.path.join(checkpoint_path, "best_program_info.json")
            with open(best_program_info_path, "w") as f:
                json.dump(
                    {
                        "id": best_program.id,
                        "metrics": best_program.metrics,
                        "iteration_found": best_program.iteration_found,
                        "current_iteration": iteration,
                        "language": best_program.language,
                        "timestamp": best_program.timestamp,
                        "total_programs": len(all_programs),
                        "saved_at": time.time(),
                    },
                    f,
                    indent=2,
                )

        logger.info(f"Saved parallel checkpoint at iteration {iteration} to {checkpoint_path}")

    def _save_parallel_best_program(self, best_program: Program, all_programs: List[Program]) -> None:
        """
        Save the best program from parallel evolution
        """
        best_dir = os.path.join(self.output_dir, "best")
        os.makedirs(best_dir, exist_ok=True)

        # Save best program code
        filename = f"best_program{self.file_extension}"
        code_path = os.path.join(best_dir, filename)
        with open(code_path, "w") as f:
            f.write(best_program.code)

        # Save complete program info
        info_path = os.path.join(best_dir, "best_program_info.json")
        with open(info_path, "w") as f:
            json.dump(
                {
                    "id": best_program.id,
                    "iteration_found": best_program.iteration_found,
                    "timestamp": best_program.timestamp,
                    "metrics": best_program.metrics,
                    "language": best_program.language,
                    "mode": "parallel",
                    "total_programs_generated": len(all_programs),
                    "saved_at": time.time(),
                },
                f,
                indent=2,
            )

        # Save all programs for analysis
        all_programs_path = os.path.join(best_dir, "all_programs.json")
        programs_data = []
        for program in all_programs:
            programs_data.append({
                "id": program.id,
                "metrics": program.metrics,
                "iteration_found": program.iteration_found,
                "timestamp": program.timestamp,
                "is_best": program.id == best_program.id,
            })
        
        with open(all_programs_path, "w") as f:
            json.dump(programs_data, f, indent=2)

        logger.info(f"Saved best program to {code_path} with program info to {info_path}")
        logger.info(f"Saved all {len(all_programs)} programs summary to {all_programs_path}")
