From 006422e256b29ead642649b2ec56b1ff4c0a5e21 Mon Sep 17 00:00:00 2001 From: linhaowei Date: Thu, 22 May 2025 17:44:05 +0000 Subject: [PATCH 1/3] init --- .gitignore | 4 + README.md | 273 +++---- examples/symbolic_regression/README.md | 181 +++++ .../symbolic_regression/bench/dataclasses.py | 56 ++ .../symbolic_regression/bench/datamodules.py | 137 ++++ examples/symbolic_regression/data_api.py | 765 ++++++++++++++++++ examples/symbolic_regression/eval.py | 395 +++++++++ examples/symbolic_regression/scripts.sh | 82 ++ 8 files changed, 1752 insertions(+), 141 deletions(-) create mode 100644 examples/symbolic_regression/README.md create mode 100755 examples/symbolic_regression/bench/dataclasses.py create mode 100755 examples/symbolic_regression/bench/datamodules.py create mode 100755 examples/symbolic_regression/data_api.py create mode 100755 examples/symbolic_regression/eval.py create mode 100644 examples/symbolic_regression/scripts.sh diff --git a/.gitignore b/.gitignore index 4ca4d023e..341dda1ce 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,7 @@ htmlcov/ # Misc .DS_Store .venv + +# For SR +secrets.yaml +problems \ No newline at end of file diff --git a/README.md b/README.md index 5a790f3ec..8a3a1e2a8 100644 --- a/README.md +++ b/README.md @@ -1,201 +1,192 @@ -# OpenEvolve +# Evolving Symbolic Regression with OpenEvolve on LLM-SRBench 🧬🔍 -An open-source implementation of the AlphaEvolve system described in the Google DeepMind paper "AlphaEvolve: A coding agent for scientific and algorithmic discovery" (2025). +This example demonstrates how **OpenEvolve** can be utilized to perform **symbolic regression** tasks using the **LLM-SRBench benchmark** (highlighted at ICML 2025). It showcases OpenEvolve's capability to evolve Python code, transforming simple mathematical expressions into more complex and accurate models that fit given datasets. -![OpenEvolve Logo](openevolve-logo.png) +------ -## Overview +## 🎯 Problem Description: Symbolic Regression on LLM-SRBench -OpenEvolve is an evolutionary coding agent that uses Large Language Models to optimize code through an iterative process. It orchestrates a pipeline of LLM-based code generation, evaluation, and selection to continuously improve programs for a variety of tasks. +**Symbolic Regression** is the task of discovering a mathematical expression that best fits a given dataset. Unlike traditional regression techniques that optimize parameters for a predefined model structure, symbolic regression aims to find both the **structure of the model** and its **parameters**. -Key features: -- Evolution of entire code files, not just single functions -- Support for multiple programming languages -- Supports OpenAI-compatible APIs for any LLM -- Multi-objective optimization -- Flexible prompt engineering -- Distributed evaluation +This example leverages **LLM-SRBench**, a benchmark specifically designed for Large Language Model-based Symbolic Regression. The core objective is to use OpenEvolve to evolve an initial, often simple, model (e.g., a linear model) into a more sophisticated symbolic expression. This evolved expression should accurately capture the underlying relationships within various scientific datasets provided by the benchmark. -## How It Works +------ -OpenEvolve follows an evolutionary approach with the following components: +## 🚀 Getting Started -![OpenEvolve Architecture](openevolve-architecture.png) +Follow these steps to set up and run the symbolic regression benchmark example: -1. 
**Prompt Sampler**: Creates context-rich prompts containing past programs, their scores, and problem descriptions -2. **LLM Ensemble**: Generates code modifications via an ensemble of language models -3. **Evaluator Pool**: Tests generated programs and assigns scores -4. **Program Database**: Stores programs and their evaluation metrics, guiding future evolution +### 1. Configure API Secrets -The controller orchestrates interactions between these components in an asynchronous pipeline, maximizing throughput to evaluate as many candidate solutions as possible. +You'll need to provide your API credentials for the language models used by OpenEvolve. -## Getting Started +- Create a `secrets.yaml` file in the example directory. +- Add your API key and model preferences: -### Installation +YAML -To install natively, use: -```bash -git clone https://github.com/codelion/openevolve.git -cd openevolve -pip install -e . ``` - -### Quick Start - -```python -from openevolve import OpenEvolve - -# Initialize the system -evolve = OpenEvolve( - initial_program_path="path/to/initial_program.py", - evaluation_file="path/to/evaluator.py", - config_path="path/to/config.yaml" -) - -# Run the evolution -best_program = await evolve.run(iterations=1000) -print(f"Best program metrics:") -for name, value in best_program.metrics.items(): - print(f" {name}: {value:.4f}") +# secrets.yaml +api_key: +api_base: "https://api.openai.com/v1" # Or your custom endpoint +primary_model: "gpt-4o" +secondary_model: "o3" # Or another preferred model for specific tasks ``` -### Command-Line Usage +Replace `` with your actual OpenAI API key. -OpenEvolve can also be run from the command line: +### 2. Load Benchmark Tasks & Generate Initial Programs -```bash -python openevolve-run.py path/to/initial_program.py path/to/evaluator.py --config path/to/config.yaml --iterations 1000 -``` +The `data_api.py` script is crucial for setting up the environment. It prepares tasks from the LLM-SRBench dataset (defined by classes in `./bench`, and will be located at `./problems`). + +For each benchmark task, this script will automatically generate: -### Resuming from Checkpoints +- `initial_program.py`: A starting Python program, typically a simple linear model. +- `evaluator.py`: A tailored evaluation script for the task. +- `config.yaml`: An OpenEvolve configuration file specific to the task. -OpenEvolve automatically saves checkpoints at intervals specified by the `checkpoint_interval` config parameter (default is 10 iterations). You can resume an evolution run from a saved checkpoint: +Run the script from your terminal: ```bash -python openevolve-run.py path/to/initial_program.py path/to/evaluator.py \ - --config path/to/config.yaml \ - --checkpoint path/to/checkpoint_directory \ - --iterations 50 +python data_api.py ``` -When resuming from a checkpoint: -- The system loads all previously evolved programs and their metrics -- Checkpoint numbering continues from where it left off (e.g., if loaded from checkpoint_50, the next checkpoint will be checkpoint_60) -- All evolution state is preserved (best programs, feature maps, archives, etc.) -- Each checkpoint directory contains a copy of the best program at that point in time +This will create subdirectories for each benchmark task, populated with the necessary files. + +### 3. Run OpenEvolve -Example workflow with checkpoints: +Use the provided shell script `scripts.sh` to execute OpenEvolve across the generated benchmark tasks. 
This script iterates through the task-specific configurations and applies the evolutionary process. ```bash -# Run for 50 iterations (creates checkpoints at iterations 10, 20, 30, 40, 50) -python openevolve-run.py examples/function_minimization/initial_program.py \ - examples/function_minimization/evaluator.py \ - --iterations 50 - -# Resume from checkpoint 50 for another 50 iterations (creates checkpoints at 60, 70, 80, 90, 100) -python openevolve-run.py examples/function_minimization/initial_program.py \ - examples/function_minimization/evaluator.py \ - --checkpoint examples/function_minimization/openevolve_output/checkpoints/checkpoint_50 \ - --iterations 50 +bash scripts.sh ``` -### Comparing Results Across Checkpoints +### 4. Evaluate Results -Each checkpoint directory contains the best program found up to that point, making it easy to compare solutions over time: +After OpenEvolve has completed its runs, you can evaluate the performance on different subsets of tasks (e.g., bio, chemical, physics, material). The `eval.py` script collates the results and provides a summary. -``` -checkpoints/ - checkpoint_10/ - best_program.py # Best program at iteration 10 - best_program_info.json # Metrics and details - programs/ # All programs evaluated so far - metadata.json # Database state - checkpoint_20/ - best_program.py # Best program at iteration 20 - ... +```bash +python eval.py ``` -You can compare the evolution of solutions by examining the best programs at different checkpoints: +For example, to evaluate results for the 'physics' subset located in `./problems/phys_osc/`, you would run: ```bash -# Compare best programs at different checkpoints -diff -u checkpoints/checkpoint_10/best_program.py checkpoints/checkpoint_20/best_program.py - -# Compare metrics -cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics +python eval.py ./problems/phys_osc ``` -### Docker -You can also install and execute via Docker: -```bash -docker build -t openevolve . -docker run --rm -v .:/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 -``` +This script will also save a `JSON` file containing detailed results for your analysis. -## Configuration +------ -OpenEvolve is highly configurable. You can specify configuration options in a YAML file: +## 🌱 Algorithm Evolution: From Linear Model to Complex Expression -```yaml -# Example configuration -max_iterations: 1000 -llm: - primary_model: "gemini-2.0-flash-lite" - secondary_model: "gemini-2.0-flash" - temperature: 0.7 -database: - population_size: 500 - num_islands: 5 -``` +OpenEvolve works by iteratively modifying an initial Python program to find a better-fitting mathematical expression. -Sample configuration files are available in the `configs/` directory: -- `default_config.yaml`: Comprehensive configuration with all available options +### Initial Algorithm (Example: Linear Model) -See the [Configuration Guide](configs/default_config.yaml) for a full list of options. +The `data_api.py` script typically generates a basic linear model as the starting point. For a given task, this `initial_program.py` might look like this: -## Examples +```python +""" +Initial program: A naive linear model for symbolic regression. +This model predicts the output as a linear combination of input variables +or a constant if no input variables are present. +The function is designed for vectorized input (X matrix). 
+ +Target output variable: dv_dt (Acceleration in Nonl-linear Harmonic Oscillator) +Input variables (columns of x): x (Position at time t), t (Time), v (Velocity at time t) +""" +import numpy as np + +# Input variable mapping for x (columns of the input matrix): +# x[:, 0]: x (Position at time t) +# x[:, 1]: t (Time) +# x[:, 2]: v (Velocity at time t) + +# Parameters will be optimized by BFGS outside this function. +# Number of parameters expected by this model: 10. +# Example initialization: params = np.random.rand(10) + +# EVOLVE-BLOCK-START + +def func(x, params): + """ + Calculates the model output using a linear combination of input variables + or a constant value if no input variables. Operates on a matrix of samples. + + Args: + x (np.ndarray): A 2D numpy array of input variable values, shape (n_samples, n_features). + n_features is 3. + If n_features is 0, x should be shape (n_samples, 0). + The order of columns in x must correspond to: + (x, t, v). + params (np.ndarray): A 1D numpy array of parameters. + Expected length: 10. + + Returns: + np.ndarray: A 1D numpy array of predicted output values, shape (n_samples,). + """ + + result = x[:, 0] * params[0] + x[:, 1] * params[1] + x[:, 2] * params[2] + return result + +# EVOLVE-BLOCK-END + +# This part remains fixed (not evolved) +# It ensures that OpenEvolve can consistently call the evolving function. +def run_search(): + return func + +# Note: The actual structure of initial_program.py is determined by data_api.py. +``` -See the `examples/` directory for complete examples of using OpenEvolve on various problems: +### Evolved Algorithm (Discovered Symbolic Expression) -### Circle Packing +OpenEvolve will iteratively modify the Python code within the `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` markers in `initial_program.py`. The goal is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. -Our implementation of the circle packing problem from the AlphaEvolve paper. For the n=26 case, where one needs to pack 26 circles in a unit square we also obtain SOTA results. +An evolved `func` might, for instance, discover a non-linear expression like: -[Explore the Circle Packing Example](examples/circle_packing/) +```python +# Hypothetical example of what OpenEvolve might find: +def func(x, params): + # Assuming X_train_scaled maps to x and const maps to a parameter in params + predictions = np.sin(x[:, 0]) * x[:, 1]**2 + params[0] + return predictions +``` -We have sucessfully replicated the results from the AlphaEvolve paper, below is the packing found by OpenEvolve after 800 iterations +*(This is a simplified, hypothetical example to illustrate the transformation.)* -![alpha-evolve-replication](https://github.com/user-attachments/assets/00100f9e-2ac3-445b-9266-0398b7174193) +------ -This is exactly the packing reported by AlphaEolve in their paper (Figure 14): +## ⚙️ Key Configuration & Approach -![alpha-evolve-results](https://github.com/user-attachments/assets/0c9affa5-053d-404e-bb2d-11479ab248c9) +- LLM Models: + - **Primary Model:** `gpt-4o` (or your configured `primary_model`) is typically used for sophisticated code generation and modification. + - **Secondary Model:** `o3` (or your configured `secondary_model`) can be used for refinements, simpler modifications, or other auxiliary tasks within the evolutionary process. 
+- Evaluation Strategy: + - Currently, this example employs a direct evaluation strategy (not **cascade evaluation**). +- Objective Function: + - The primary objective is to **minimize the Mean Squared Error (MSE)** between the model's predictions and the true values on the training data. -### Function Minimization +------ -An example showing how OpenEvolve can transform a simple random search algorithm into a sophisticated simulated annealing approach. +## 📊 Results -[Explore the Function Minimization Example](examples/function_minimization/) +The `eval.py` script will help you collect and analyze performance metrics. The LLM-SRBench paper provides a comprehensive comparison of various baselines. For results generated by this specific OpenEvolve example, you should run the evaluation script as described in the "Getting Started" section. -## Preparing Your Own Problems +For benchmark-wide comparisons and results from other methods, please refer to the official LLM-SRBench paper. -To use OpenEvolve for your own problems: +| **Task Category** | Med. NMSE (Test) | Med. R2 (Test) | **Med. NMSE (OOD Test)** | **Med. R2 (OOD Test)** | +| ----------------------- | ---------------- | -------------- | ------------------------ | ---------------------- | +| Chemistry (36 tasks) | 2.3419e-06 | 1.000 | 3.1384e-02 | 0.9686 | +| Biology (24 tasks) | | | | | +| Physics (44 tasks) | 1.8548e-05 | 1.000 | 7.9255e-04 | 0.9992 | +| Material Sc. (25 tasks) | | | | | -1. **Mark code sections** to evolve with `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` comments -2. **Create an evaluation function** that returns a dictionary of metrics -3. **Configure OpenEvolve** with appropriate parameters -4. **Run the evolution** process +------ -## Citation +## 🤝 Contribution -If you use OpenEvolve in your research, please cite: +This OpenEvolve example for LLM-SRBench was implemented by [**Haowei Lin**](https://linhaowei1.github.io/) from Peking University. If you encounter any issues or have questions, please feel free to reach out to Haowei via email (linhaowei@pku.edu.cn) for discussion. -``` -@software{openevolve, - title = {OpenEvolve: Open-source implementation of AlphaEvolve}, - author = {Asankhaya Sharma}, - year = {2025}, - publisher = {GitHub}, - url = {https://github.com/codelion/openevolve} -} -``` diff --git a/examples/symbolic_regression/README.md b/examples/symbolic_regression/README.md new file mode 100644 index 000000000..76692ab96 --- /dev/null +++ b/examples/symbolic_regression/README.md @@ -0,0 +1,181 @@ +# Evolving Symbolic Regression Models with OpenEvolve on LLM-SRBench 🧬🔍 + +This example demonstrates how **OpenEvolve** can be utilized to perform **symbolic regression** tasks using the **LLM-SRBench benchmark**. It showcases the ability of OpenEvolve to evolve Python code representing mathematical expressions to fit given datasets. + +--- + +## Problem Description: Symbolic Regression on LLM-SRBench + +**Symbolic Regression** is the task of discovering a mathematical expression that best fits a given dataset. Unlike traditional regression techniques that fit parameters to a predefined model structure, symbolic regression aims to find both the structure of the model and its parameters. + +This example leverages **LLM-SRBench**, a benchmark for Large Language Model based Symbolic Regression (highlighted at ICML 2025). 
The goal is to use OpenEvolve to evolve an initial, simple model (e.g., a linear model) into a more accurate symbolic expression that captures the underlying relationships in various scientific datasets provided by the benchmark. + +--- + +## 🚀 Getting Started + +Follow these steps to set up and run the symbolic regression benchmark example: + +### 1. Configure API Secrets + +You'll need to provide your API credentials for the language models. +Create a `secrets.yaml` file in the example directory with the following structure: + +```yaml +# secrets.yaml +api_key: +api_base: "[https://api.openai.com/v1](https://api.openai.com/v1)" # Or your custom endpoint +primary_model: "gpt-4o" +secondary_model: "o3" # Or another preferred model +``` +Replace `` with your actual key. + +### 2. Load Benchmark Tasks & Generate Initial Programs + +The `data_api.py` script is used to load tasks from the LLM-SRBench dataset (located in `./problems` and defined by classes in `./bench`). This script will also automatically generate: +* An `initial_program.py` (typically a simple linear model) for each benchmark task. +* An `evaluator.py` tailored for each task. +* A `config.yaml` for OpenEvolve for each task. + +Run the script: +```bash +python data_api.py +``` +This will prepare all necessary files within subdirectories for each benchmark task. + +### 3. Run OpenEvolve + +Use the provided shell script `scripts.sh` to execute OpenEvolve across the generated benchmark tasks. This script will iterate through the task-specific configurations and apply the evolutionary process. + +```bash +bash scripts.sh +``` + +### 4. Evaluate Results + +After OpenEvolve has completed its runs, you can evaluate the performance on different subsets of tasks (e.g., `bio`, `chemical`, `physics`, `material`). The `eval.py` script collates the results and provides a summary. + +```bash +python eval.py +``` +For example, to evaluate results for the 'physics' subset, if they are located in `results/physics_tasks`, you might run `python eval.py results/physics_tasks`. + +This will also save a JSON file containing detailed results for your analysis. + +--- + +## Algorithm Evolution: From Linear Model to...? + +### Initial Algorithm (e.g., Linear Model) + +The `data_api.py` script typically generates a basic linear model as the starting point for evolution. For a given task, this `initial_program.py` might look something like: + +```python +# initial_program.py (conceptual example) +import numpy as np + +# [[evolve_start]] +def symbolic_model(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train): + # A simple linear model or a placeholder function + # The actual initial model is generated by data_api.py + predictions = np.zeros(len(X_train_scaled)) # Placeholder + # For a real linear model, it might be: + # if X_train_scaled.shape[1] > 0: + # coeffs = np.random.rand(X_train_scaled.shape[1]) + # intercept = np.random.rand() + # predictions = X_train_scaled @ coeffs + intercept + # else: + # predictions = np.full(len(X_train_scaled), np.mean(y_train_scaled)) + + # The goal of openevolve is to replace this function + # with one that produces better predictions by finding + # a symbolic expression using the input features. 
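+    # Illustrative only (hypothetical): an evolved version of this block might end up
+    # computing something like
+    #   predictions = np.sin(X_train_scaled[:, 0]) * X_train_scaled[:, 1]**2 + const
+    # before the MSE is computed below.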
+ mse = np.mean((y_train_scaled - predictions)**2) + return mse, predictions +# [[evolve_end]] + +def evaluate(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train): + # [[evolve_start]] + mse, _ = symbolic_model(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train) + # [[evolve_end]] + return mse +``` +*Note: The actual structure of `initial_program.py` is determined by `data_api.py`.* + +### Evolved Algorithm (Discovered Symbolic Expression) + +OpenEvolve will iteratively modify the code within the `[[evolve_start]]` and `[[evolve_end]]` blocks in `initial_program.py`. The aim is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. + +An evolved `symbolic_model` function might, for instance, discover an expression like: +`predictions = np.sin(X_train_scaled[:, 0]) * X_train_scaled[:, 1]**2 + const` +(This is a hypothetical example of what OpenEvolve might find). + +--- + +## ⚙️ Key Configuration & Approach + +* **LLM Models**: + * **Primary Model**: `gpt-4o` (used for sophisticated code generation) + * **Secondary Model**: `o3` (or your configured alternative, potentially for refinements or specific sub-tasks) +* **Evaluation Strategy**: + * Currently, this example **does not use cascade evaluation**. Each evolved program is evaluated directly. Exploring cascade evaluation could be a future enhancement to potentially boost performance. +* **Objective Function**: + * The primary objective is straightforward: **minimize the Mean Squared Error (MSE)** of the model's predictions on the training data. + +--- + +## 📊 Results + +*(This section can be filled in once you have run the benchmarks and gathered performance data.)* + +You can present results in a table or through plots, comparing metrics like: +* Achieved MSE on training/test sets. +* Complexity of discovered expressions. +* Performance across different LLM-SRBench task categories (bio, chemical, physics, material). + +Example Table Structure: + +| Task Category | Avg. MSE (Train) | Avg. MSE (Test) | Notes | +|---------------|------------------|-----------------|-------| +| Bio | *value* | *value* | | +| Chemical | *value* | *value* | | +| Physics | *value* | *value* | | +| Material | *value* | *value* | | +| **Overall** | **value** | **value** | | + +The `eval.py` script will output a JSON file with detailed results suitable for populating such tables or for further analysis. + +--- + +## 💡 How It Works with OpenEvolve + +This example highlights several capabilities of OpenEvolve: + +* **Automated Code Evolution**: OpenEvolve directly modifies Python code within specified blocks to search for better solutions. +* **Symbolic Discovery**: Instead of just tuning parameters, OpenEvolve attempts to discover the underlying mathematical structure (the symbolic expression) that best models the data. +* **Adaptability to Benchmarks**: The framework is set up to systematically process multiple tasks from the LLM-SRBench. +* **Leveraging LLMs for Code Generation**: It utilizes powerful LLMs like GPT-4o to propose novel code structures representing mathematical formulas. + +--- + +## 🔮 Next Steps & Future Exploration + +* **Analyze Detailed Results**: Dive into the JSON output from `eval.py` to understand the performance on specific tasks and the nature of the evolved expressions. 
+* **Implement Cascade Evaluation**: Explore adding a cascade evaluation mechanism where promising programs are subjected to more rigorous or diverse evaluation criteria. +* **Experiment with Different LLMs**: Try swapping the primary and secondary models in `secrets.yaml` or testing newer models as they become available. +* **Modify Evolutionary Parameters**: Adjust settings in the task-specific `config.yaml` files (e.g., population size, number of generations, mutation rates) to see their impact on the discovery process. +* **Explore Different Objectives**: While MSE is standard, consider incorporating other objectives like model complexity (e.g., using a Pareto front or adding a complexity penalty to the fitness function) to find simpler, more interpretable expressions. + +--- + +## 📁 Files in this Example + +* `data_api.py`: Loads benchmark tasks and generates initial files (`initial_program.py`, `evaluator.py`, `config.yaml`) for each. +* `./problems/`: Contains the raw data files for the LLM-SRBench tasks. +* `./bench/`: Contains Python data classes and helpers for loading and handling LLM-SRBench tasks. +* `eval.py`: Script to evaluate the results from OpenEvolve runs for a subset of tasks. +* `secrets.yaml` (to be created by you): Stores API keys and model preferences. +* `scripts.sh`: Utility script to run OpenEvolve across all configured benchmark tasks. +* `initial_program.py` (generated per task): The starting Python code that OpenEvolve will evolve. +* `evaluator.py` (generated per task): Defines how the evolved programs are evaluated for a specific task. +* `config.yaml` (generated per task): Configuration file for OpenEvolve for a specific task. diff --git a/examples/symbolic_regression/bench/dataclasses.py b/examples/symbolic_regression/bench/dataclasses.py new file mode 100755 index 000000000..83082aca4 --- /dev/null +++ b/examples/symbolic_regression/bench/dataclasses.py @@ -0,0 +1,56 @@ +from typing import Optional, Any +from dataclasses import dataclass +import sympy + + +@dataclass +class Equation: + symbols: list + symbol_descs: list + symbol_properties: list + expression: str + desc: Optional[str] = None + + sympy_format: Optional[sympy.Expr] = None + lambda_format: Optional[callable] = None + program_format: Optional[str] = None + +@dataclass +class SearchResult: + equation: Equation + aux: Any + +@dataclass +class SEDTask: + name: str + symbols: list + symbol_descs: list + symbol_properties: list + samples: Any + desc: Optional[str] = None + +@dataclass +class Problem: + dataset_identifier: str + equation_idx: str + gt_equation: Equation + samples: Any + + def create_task(self) -> SEDTask: + return SEDTask(name=self.equation_idx, + symbols=self.gt_equation.symbols, + symbol_descs=self.gt_equation.symbol_descs, + symbol_properties=self.gt_equation.symbol_properties, + samples=self.train_samples, + desc=self.gt_equation.desc) + @property + def train_samples(self): + return self.samples['train'] + + @property + def test_samples(self): + return self.samples['test'] + + @property + def ood_test_samples(self): + return self.samples.get('ood_test', None) \ No newline at end of file diff --git a/examples/symbolic_regression/bench/datamodules.py b/examples/symbolic_regression/bench/datamodules.py new file mode 100755 index 000000000..d2a8dff0b --- /dev/null +++ b/examples/symbolic_regression/bench/datamodules.py @@ -0,0 +1,137 @@ +from typing import Optional, Any + +import json +from pathlib import Path + +import numpy as np +import h5py +import datasets +from 
huggingface_hub import snapshot_download + +from .dataclasses import Equation, Problem + +import warnings + +REPO_ID = "nnheui/llm-srbench" + +def _download(repo_id): + return snapshot_download(repo_id=repo_id, repo_type="dataset") + +class TransformedFeynmanDataModule: + def __init__(self): + self._dataset_dir = None + self._dataset_identifier = 'lsr_transform' + + def setup(self): + self._dataset_dir = Path(_download(repo_id=REPO_ID)) + ds = datasets.load_dataset(REPO_ID)['lsr_transform'] + sample_h5file_path = self._dataset_dir / "lsr_bench_data.hdf5" + self.problems = [] + with h5py.File(sample_h5file_path, "r") as sample_file: + for e in ds: + samples = {k:v[...].astype(np.float64) for k,v in sample_file[f'/lsr_transform/{e["name"]}'].items()} + self.problems.append(Problem(dataset_identifier=self._dataset_identifier, + equation_idx = e['name'], + gt_equation=Equation( + symbols=e['symbols'], + symbol_descs=e['symbol_descs'], + symbol_properties=e['symbol_properties'], + expression=e['expression'], + ), + samples=samples) + ) + self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + + @property + def name(self): + return "LSR_Transform" + +class SynProblem(Problem): + @property + def train_samples(self): + return self.samples['train_data'] + + @property + def test_samples(self): + return self.samples['id_test_data'] + + @property + def ood_test_samples(self): + return self.samples['ood_test_data'] + +class BaseSynthDataModule: + def __init__(self, dataset_identifier, short_dataset_identifier, root, default_symbols = None, default_symbol_descs=None): + self._dataset_dir = Path(root) + self._dataset_identifier = dataset_identifier + self._short_dataset_identifier = short_dataset_identifier + self._default_symbols = default_symbols + self._default_symbol_descs = default_symbol_descs + + def setup(self): + self._dataset_dir = Path(_download(repo_id=REPO_ID)) + ds = datasets.load_dataset(REPO_ID)[f'lsr_synth_{self._dataset_identifier}'] + sample_h5file_path = self._dataset_dir / "lsr_bench_data.hdf5" + self.problems = [] + with h5py.File(sample_h5file_path, "r") as sample_file: + for e in ds: + samples = {k:v[...].astype(np.float64) for k,v in sample_file[f'/lsr_synth/{self._dataset_identifier}/{e["name"]}'].items()} + self.problems.append(Problem(dataset_identifier=self._dataset_identifier, + equation_idx = e['name'], + gt_equation=Equation( + symbols=e['symbols'], + symbol_descs=e['symbol_descs'], + symbol_properties=e['symbol_properties'], + expression=e['expression'], + ), + samples=samples) + ) + self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + + + self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + + @property + def name(self): + return self._dataset_identifier + +class MatSciDataModule(BaseSynthDataModule): + def __init__(self, root): + super().__init__("matsci", "MatSci", root) + +class ChemReactKineticsDataModule(BaseSynthDataModule): + def __init__(self, root): + super().__init__("chem_react", "CRK", root, + default_symbols=['dA_dt', 't', 'A'], + default_symbol_descs=['Rate of change of concentration in chemistry reaction kinetics', 'Time', 'Concentration at time t']) + +class BioPopGrowthDataModule(BaseSynthDataModule): + def __init__(self, root): + super().__init__("bio_pop_growth", "BPG", root, + default_symbols=['dP_dt', 't', 'P'], + default_symbol_descs=['Population growth rate', 'Time', 'Population at time t']) + +class PhysOscilDataModule(BaseSynthDataModule): + def __init__(self, root): + 
super().__init__("phys_osc", "PO", root, + default_symbols=['dv_dt', 'x', 't', 'v'], + default_symbol_descs=['Acceleration in Nonl-linear Harmonic Oscillator', 'Position at time t', 'Time', 'Velocity at time t']) + +def get_datamodule(name, root_folder): + if name == 'bio_pop_growth': + root = root_folder or "datasets/lsr-synth-bio" + return BioPopGrowthDataModule(root) + elif name == 'chem_react': + root = root_folder or "datasets/lsr-synth-chem" + return ChemReactKineticsDataModule(root) + elif name == 'matsci': + root = root_folder or "datasets/lsr-synth-matsci" + return MatSciDataModule(root) + elif name == 'phys_osc': + root = root_folder or "datasets/lsr-synth-phys" + return PhysOscilDataModule(root) + # elif name == 'feynman': + # return FeynmanDataModule() + elif name == 'lsrtransform': + return TransformedFeynmanDataModule() + else: + raise ValueError(f"Unknown datamodule name: {name}") \ No newline at end of file diff --git a/examples/symbolic_regression/data_api.py b/examples/symbolic_regression/data_api.py new file mode 100755 index 000000000..a4be4fca7 --- /dev/null +++ b/examples/symbolic_regression/data_api.py @@ -0,0 +1,765 @@ +""" +Symbolic Regression Problem Generator + +This module creates initial programs, evaluators, and configurations for symbolic regression tasks. +It processes multiple datasets in parallel and generates the necessary files for each problem. +""" + +import os +import yaml +import numpy as np +import multiprocessing +import importlib.util +from typing import Dict, List, Tuple, Optional, Any + +from bench.datamodules import get_datamodule + + +def load_secret(secrets_file: str = "secrets.yaml") -> Dict[str, Any]: + """ + Load API keys and configuration from a secrets file. + + Args: + secrets_file: Path to the YAML secrets file + + Returns: + Dictionary containing secret configuration, empty dict if file not found + """ + try: + with open(secrets_file, 'r') as f: + return yaml.safe_load(f) + except FileNotFoundError: + print(f"Warning: Secrets file '{secrets_file}' not found.") + return {} + except Exception as e: + print(f"Warning: Error loading secrets file '{secrets_file}': {e}") + return {} + + +def extract_problem_data_from_initialized_dataset(initialized_dataset, problem_id: int) -> Dict[str, Any]: + """ + Extract data for a specific problem from an initialized dataset. + + Args: + initialized_dataset: Pre-initialized and setup dataset object + problem_id: Index of the problem to extract + + Returns: + Dictionary containing problem data including train/test samples, symbols, and metadata + """ + problem = initialized_dataset.problems[problem_id] + gt_eq = problem.gt_equation + samples = problem.samples + + data = { + 'train': samples['train'], + 'test': samples['test'], + 'ood_test': samples.get('ood_test', None), + 'symbols': gt_eq.symbols, + 'symbol_descs': gt_eq.symbol_descs, + 'symbol_properties': gt_eq.symbol_properties, + 'expression': gt_eq.expression, + 'dataset_identifier': problem.dataset_identifier, + 'equation_idx': problem.equation_idx, + } + return data + + +def create_program(problem: Dict[str, Any]) -> str: + """ + Create a Python script with a naive linear model for symbolic regression. + + The generated script contains a `func(x, params)` that computes predictions + in a vectorized manner: x @ params. If no input features exist, it predicts + a constant params[0]. 
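+    For example, for a task with three input variables the generated function body is
+    `x[:, 0] * params[0] + x[:, 1] * params[1] + x[:, 2] * params[2]`.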
+ + Args: + problem: Dictionary containing problem data + + Returns: + Path to the created program file + """ + problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' + + # Parse symbols and properties + symbols = problem['symbols'] + properties = problem['symbol_properties'] + descs = problem['symbol_descs'] + + input_vars = [] + input_vars_descs = [] + output_var = None + output_var_desc = "N/A" + + for i, prop in enumerate(properties): + if prop == 'V': + input_vars.append(symbols[i]) + input_vars_descs.append(descs[i]) + elif prop == 'O': + output_var = symbols[i] + output_var_desc = descs[i] + + if not output_var: + raise ValueError("No output variable ('O') found in symbol_properties.") + + # Build input variable mapping comments + x_mapping_comments = ["# Input variable mapping for x (columns of the input matrix):"] + if not input_vars: + x_mapping_comments.append("# No input variables (x will be an (n_samples, 0) matrix).") + else: + for i, var_name in enumerate(input_vars): + x_mapping_comments.append(f"# x[:, {i}]: {var_name} ({input_vars_descs[i]})") + x_mapping_str = "\n".join(x_mapping_comments) + + # Build function body + num_features = len(input_vars) + if num_features > 0: + function_body = ' + '.join([f"x[:, {i}] * params[{i}]" for i in range(num_features)]) + else: + function_body = "np.full(x.shape[0], params[0]) # Predicts a constant value for all samples" + + model_num_params = 10 + + # Build input variables description + input_vars_desc_list = [f"{v} ({input_vars_descs[i]})" for i, v in enumerate(input_vars)] + input_vars_desc_str = ', '.join(input_vars_desc_list) if input_vars else "None" + + program_content = f'''""" +Initial program: A naive linear model for symbolic regression. +This model predicts the output as a linear combination of input variables +or a constant if no input variables are present. +The function is designed for vectorized input (X matrix). + +Target output variable: {output_var} ({output_var_desc}) +Input variables (columns of x): {input_vars_desc_str} +""" +import numpy as np + +{x_mapping_str} + +# Parameters will be optimized by BFGS outside this function. +# Number of parameters expected by this model: {model_num_params}. +# Example initialization: params = np.random.rand({model_num_params}) + +# EVOLVE-BLOCK-START + +def func(x, params): + """ + Calculates the model output using a linear combination of input variables + or a constant value if no input variables. Operates on a matrix of samples. + + Args: + x (np.ndarray): A 2D numpy array of input variable values, shape (n_samples, n_features). + n_features is {num_features}. + If n_features is 0, x should be shape (n_samples, 0). + The order of columns in x must correspond to: + ({', '.join(input_vars) if input_vars else "None - x has 0 columns"}). + params (np.ndarray): A 1D numpy array of parameters. + Expected length: {model_num_params}. + + Returns: + np.ndarray: A 1D numpy array of predicted output values, shape (n_samples,). + """ + result = {function_body} + return result + +# EVOLVE-BLOCK-END + +# This part remains fixed (not evolved) +def run_search(): + return func +''' + + os.makedirs(problem_dir, exist_ok=True) + file_path = os.path.join(problem_dir, "initial_program.py") + with open(file_path, "w") as f: + f.write(program_content) + + return file_path + + +def create_evaluator(problem: Dict[str, Any]) -> str: + """ + Create an evaluator script for the symbolic regression problem. 
+ + The evaluator assesses model performance using BFGS optimization + and computes various metrics including MSE and combined scores. + + Args: + problem: Dictionary containing problem data + + Returns: + Path to the created evaluator file + """ + problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' + os.makedirs(problem_dir, exist_ok=True) + + # Extract data arrays + symbols = problem['symbols'] + properties = problem['symbol_properties'] + train_samples = np.asarray(problem['train']) + test_samples = np.asarray(problem['test']) + ood_test_samples = problem['ood_test'] + if ood_test_samples is not None: + ood_test_samples = np.asarray(ood_test_samples) + + # Find input and output indices + input_indices = [i for i, prop in enumerate(properties) if prop == 'V'] + output_indices = [i for i, prop in enumerate(properties) if prop == 'O'] + + if not output_indices: + raise ValueError("No output variable ('O') found in symbol_properties.") + if len(output_indices) > 1: + raise ValueError("Multiple output variables ('O') found. Evaluator supports single output.") + output_index = output_indices[0] + + # Prepare data arrays + if not input_indices: + X_train = np.empty((len(train_samples), 0)) + X_test = np.empty((len(test_samples), 0)) + X_ood_test = np.empty((len(ood_test_samples), 0)) if ood_test_samples is not None else None + else: + X_train = train_samples[:, input_indices] + X_test = test_samples[:, input_indices] + X_ood_test = ood_test_samples[:, input_indices] if ood_test_samples is not None else None + + y_train = train_samples[:, output_index] + y_test = test_samples[:, output_index] + y_ood_test = ood_test_samples[:, output_index] if ood_test_samples is not None else None + + num_input_features = len(input_indices) + model_num_params_expected = 10 + + # Save data files + base_data_path = './' + x_train_path = os.path.join(base_data_path, problem_dir, 'X_train_for_eval.npy') + y_train_path = os.path.join(base_data_path, problem_dir, 'y_train_for_eval.npy') + np.save(x_train_path, X_train) + np.save(y_train_path, y_train) + + x_test_path = os.path.join(problem_dir, 'X_test_for_eval.npy') + y_test_path = os.path.join(problem_dir, 'y_test_for_eval.npy') + np.save(x_test_path, X_test) + np.save(y_test_path, y_test) + + if X_ood_test is not None and y_ood_test is not None: + x_ood_test_path = os.path.join(problem_dir, 'X_ood_test_for_eval.npy') + y_ood_test_path = os.path.join(problem_dir, 'y_ood_test_for_eval.npy') + np.save(x_ood_test_path, X_ood_test) + np.save(y_ood_test_path, y_ood_test) + + evaluator_script_content = f'''""" +Evaluator for a symbolic regression model. +It assesses a model program based on its performance on training data. +The model's `func` is expected to take a matrix X of inputs. 
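+When the training MSE is finite, the reported combined_score equals
+-log10(training MSE + 1e-9), so higher is better.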
+""" +import os +import sys +import time +import traceback +import importlib.util +import numpy as np +from scipy.optimize import minimize +import concurrent.futures + +# Expected number of input features for the model's func +NUM_INPUT_FEATURES_EXPECTED = {num_input_features} +# Expected number of parameters for the initial model +MODEL_NUM_PARAMS_EXPECTED = {model_num_params_expected} + +# Paths to data (should be relative to where evaluator.py is run or absolute) +X_TRAIN_EVAL_PATH = r'{x_train_path}' +Y_TRAIN_EVAL_PATH = r'{y_train_path}' + + +def run_with_timeout(func, args=(), kwargs={{}}, timeout_seconds=5): + """Execute a function with a timeout.""" + if timeout_seconds is None or timeout_seconds <= 0: + return func(*args, **kwargs) + + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args, **kwargs) + try: + return future.result(timeout=timeout_seconds) + except concurrent.futures.TimeoutError: + func_name = getattr(func, '__name__', 'Unnamed function') + raise TimeoutError(f"Function {{func_name}} timed out after {{timeout_seconds}} seconds") + + +def filter_and_convert_metrics(current_metrics_dict): + """Filter and convert metrics to appropriate types.""" + filtered_dict = {{}} + float_metric_keys = ['combined_score', 'negative_mse'] + + for key in float_metric_keys: + if key in current_metrics_dict: + value = current_metrics_dict[key] + if value is None: + continue + if isinstance(value, (int, float, np.integer, np.floating, bool)): + try: + filtered_dict[key] = float(value) + except (ValueError, TypeError): + pass + + return filtered_dict + + +def objective_function(params, model_func, X_matrix, y_true_vector): + """ + Objective function for scipy.optimize.minimize. + Calculates MSE of the model_func with given params on X_matrix, y_true_vector. + + Args: + params: Parameter vector for the model + model_func: Function that takes (X_matrix, params) and returns predictions + X_matrix: Input features matrix (n_samples, n_features) + y_true_vector: True output values (n_samples,) + + Returns: + MSE value or inf if computation fails + """ + if not callable(model_func): + return float('inf') + + try: + predictions = model_func(X_matrix, params) + if not isinstance(predictions, np.ndarray) or predictions.shape != y_true_vector.shape: + return float('inf') + except Exception: + return float('inf') + + if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)): + return float('inf') + + mse = np.mean((predictions - y_true_vector)**2) + return mse + + +def evaluate(program_path): + """ + Evaluate a model program on the training data. + + Args: + program_path: Path to the Python program containing the model + + Returns: + Dictionary containing evaluation metrics + """ + metrics = {{ + 'can_run': 0.0, + 'negative_mse': -1e09, + 'raw_mse_train': float('inf'), + 'mse_train_score': 0.0, + 'num_params': MODEL_NUM_PARAMS_EXPECTED, + 'combined_score': -1e09, + 'error_message': None, + 'optimization_success': False, + 'optimized_params': None + }} + + # Load training data + try: + X_train = np.load(X_TRAIN_EVAL_PATH) + y_train = np.load(Y_TRAIN_EVAL_PATH) + + if X_train.shape[1] != NUM_INPUT_FEATURES_EXPECTED: + metrics['error_message'] = f"Loaded X_train has {{X_train.shape[1]}} features, expected {{NUM_INPUT_FEATURES_EXPECTED}}." + return filter_and_convert_metrics(metrics) + + if X_train.shape[0] != y_train.shape[0]: + metrics['error_message'] = f"X_train has {{X_train.shape[0]}} samples, y_train has {{y_train.shape[0]}}." 
+ return filter_and_convert_metrics(metrics) + except Exception as e: + metrics['error_message'] = f"Failed to load training data: {{str(e)}}. Paths: X:{{X_TRAIN_EVAL_PATH}}, Y:{{Y_TRAIN_EVAL_PATH}}" + return filter_and_convert_metrics(metrics) + + # Load and test the model function + func_to_eval = None + try: + spec = importlib.util.spec_from_file_location("model_program", program_path) + if spec is None or spec.loader is None: + metrics['error_message'] = f"Could not create spec for module at {{program_path}}" + return filter_and_convert_metrics(metrics) + + model_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(model_module) + metrics['can_run'] = 0.2 + + if not hasattr(model_module, 'run_search') or not callable(model_module.run_search): + metrics['error_message'] = "Model program missing callable 'run_search'." + return filter_and_convert_metrics(metrics) + + func_to_eval = model_module.run_search() + + if not callable(func_to_eval): + metrics['error_message'] = "'run_search' did not return a callable function." + return filter_and_convert_metrics(metrics) + + # Test the function with dummy data + num_dummy_samples = 5 + dummy_x = np.random.rand(num_dummy_samples, NUM_INPUT_FEATURES_EXPECTED) + if NUM_INPUT_FEATURES_EXPECTED == 0: + dummy_x = np.empty((num_dummy_samples, 0)) + dummy_params = np.random.rand(MODEL_NUM_PARAMS_EXPECTED) + + try: + pred_test = run_with_timeout(func_to_eval, args=(dummy_x, dummy_params), timeout_seconds=5) + if not isinstance(pred_test, np.ndarray) or pred_test.shape != (num_dummy_samples,): + metrics['can_run'] = 0.5 + metrics['error_message'] = f"Func test: output shape mismatch. Got {{pred_test.shape if isinstance(pred_test, np.ndarray) else type(pred_test)}}, expected ({{num_dummy_samples}},)." + return filter_and_convert_metrics(metrics) + metrics['can_run'] = 1.0 + except TimeoutError as te: + metrics['can_run'] = 0.5 + metrics['error_message'] = f"Func execution test timed out: {{str(te)}}" + return filter_and_convert_metrics(metrics) + except Exception as e: + metrics['can_run'] = 0.5 + metrics['error_message'] = f"Func execution test failed: {{str(e)}} with dummy_x.shape={{dummy_x.shape}}, dummy_params.shape={{dummy_params.shape}}" + return filter_and_convert_metrics(metrics) + + except FileNotFoundError: + metrics['error_message'] = f"Model program file not found: {{program_path}}" + return filter_and_convert_metrics(metrics) + except Exception as e: + metrics['error_message'] = f"Failed to load or test model function: {{str(e)}}" + return filter_and_convert_metrics(metrics) + + if metrics['can_run'] < 1.0: + return filter_and_convert_metrics(metrics) + + # Optimize parameters + initial_params = np.random.rand(MODEL_NUM_PARAMS_EXPECTED) + optimized_params = None + + if X_train.ndim != 2 or X_train.shape[1] != NUM_INPUT_FEATURES_EXPECTED: + metrics['error_message'] = f"X_train shape {{X_train.shape}} is not compatible with NUM_INPUT_FEATURES_EXPECTED {{NUM_INPUT_FEATURES_EXPECTED}} for optimization." 
+ return filter_and_convert_metrics(metrics) + + try: + opt_result = minimize( + objective_function, + initial_params, + args=(func_to_eval, X_train, y_train), + method='BFGS' + ) + + metrics['raw_mse_train'] = opt_result.fun if np.isfinite(opt_result.fun) else float('inf') + metrics['optimization_success'] = opt_result.success + + if opt_result.success or hasattr(opt_result, 'x'): + optimized_params = opt_result.x + else: + optimized_params = initial_params + + if not opt_result.success and metrics['error_message'] is None: + metrics['error_message'] = f"Optimization did not converge: {{opt_result.message if hasattr(opt_result, 'message') else 'Unknown reason'}}" + + except Exception as e: + metrics['raw_mse_train'] = float('inf') + metrics['error_message'] = f"Error during optimization: {{str(e)}}" + + metrics['optimized_params'] = optimized_params.tolist() if optimized_params is not None else None + + # Calculate final scores + if np.isfinite(metrics['raw_mse_train']): + metrics['negative_mse'] = -metrics['raw_mse_train'] + metrics['mse_train_score'] = -np.log10(metrics['raw_mse_train'] + 1e-9) + else: + metrics['mse_train_score'] = 0.0 + + metrics['combined_score'] = metrics['mse_train_score'] + + return filter_and_convert_metrics(metrics) + + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: python evaluator.py ") + print("Please run the main script that calls create_program and create_evaluator first.") + sys.exit(1) + + program_to_evaluate = sys.argv[1] + if not os.path.exists(program_to_evaluate): + print(f"Error: Program file '{{program_to_evaluate}}' not found.") + sys.exit(1) + + print(f"Evaluating model: {{program_to_evaluate}}") + print(f"Using NUM_INPUT_FEATURES_EXPECTED = {{NUM_INPUT_FEATURES_EXPECTED}}") + print(f"Using MODEL_NUM_PARAMS_EXPECTED = {{MODEL_NUM_PARAMS_EXPECTED}}") + print(f"Loading X_train from: {{X_TRAIN_EVAL_PATH}}") + print(f"Loading y_train from: {{Y_TRAIN_EVAL_PATH}}") + + if not os.path.exists(X_TRAIN_EVAL_PATH): + print(f"Error: X_train data file '{{X_TRAIN_EVAL_PATH}}' not found.") + sys.exit(1) + if not os.path.exists(Y_TRAIN_EVAL_PATH): + print(f"Error: y_train data file '{{Y_TRAIN_EVAL_PATH}}' not found.") + sys.exit(1) + + evaluation_results = evaluate(program_to_evaluate) + print("\\nEvaluation Results:") + for key, value in evaluation_results.items(): + if isinstance(value, float): + print(f" {{key}}: {{value:.4f}}") + else: + print(f" {{key}}: {{value}}") +''' + + evaluator_file_path = os.path.join(problem_dir, "evaluator.py") + with open(evaluator_file_path, "w") as f: + f.write(evaluator_script_content) + + return evaluator_file_path + + +def create_config(problem: Dict[str, Any]) -> str: + """ + Create a YAML configuration file for the symbolic regression task. 
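+    The generated file embeds a task-specific system prompt together with the LLM,
+    prompt, database, and evaluator settings consumed by OpenEvolve.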
+ + Args: + problem: Dictionary containing problem data + + Returns: + Path to the created configuration file + """ + problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' + os.makedirs(problem_dir, exist_ok=True) + config_file_path = os.path.join(problem_dir, "config.yaml") + + # Parse variables + symbols = problem['symbols'] + properties = problem['symbol_properties'] + descs = problem['symbol_descs'] + + input_vars_list = [] + output_var_list = [] + + for i, prop in enumerate(properties): + if prop == 'V': + input_vars_list.append(f"{symbols[i]} ({descs[i]})") + elif prop == 'O': + output_var_list.append(f"{symbols[i]} ({descs[i]})") + + input_vars_str = ", ".join(input_vars_list) if input_vars_list else "None" + output_var_str = ", ".join(output_var_list) if output_var_list else "None (Error: No output defined!)" + + num_initial_params = 10 + + system_message = ( + "Your task is to evolve a Python function `func(x, params)` that models a scientific process, " + "considering the physical meaning and relationships of inputs, " + "by predicting output variables based on input variables.\\n\\n" + "The function signature is:\\n\\n" + "```python\\n" + "def func(x: np.ndarray, params: np.ndarray) -> np.ndarray:\\n" + "```\\n\\n" + f"- `x` is a 2D NumPy array of shape `(n_samples, {len(input_vars_list)})`\\n" + f"- `params` is a 1D NumPy array of up to {num_initial_params} parameters\\n" + "- The function should return a 1D NumPy array of predictions with shape `(n_samples,)`\\n\\n" + "**Current Problem:**\\n" + f"Model the {output_var_str} using the input features: {input_vars_str}\\n" + f"Thus, `x` contains {len(input_vars_list)} columns: {input_vars_str}.\\n\\n" + "The initial version of `func` is a simple linear model. Parameters in `params` will be optimized externally " + "using the BFGS algorithm based on unseen training data.\\n\\n" + "Your objective is to evolve `func` to improve predictive performance on unseen data. Aim for a balance between:\\n" + "- **Accuracy**: Lower mean squared error (MSE) on training data\\n" + "- **Simplicity**: Prefer concise, interpretable expressions\\n\\n" + "Model performance (score = -log_10(mse)) will be evaluated on a held-out dataset. " + "Ensure the model is free of potential numerical errors (e.g., log0, division by 0)." 
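+        # The assembled message above is stored below as prompt.system_message in config.yaml.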
+ ) + + secret = load_secret() + config_data = { + "# Configuration for Symbolic Regression Task": f"{problem['dataset_identifier']}/{problem['equation_idx']}", + "max_iterations": 200, + "log_level": "INFO", + "target_score": "combined_score", + "checkpoint_interval": 10, + + "llm": { + "primary_model": secret.get('primary_model', "gpt-4o"), + "primary_model_weight": 0.8, + "secondary_model": secret.get('secondary_model', "o3-mini"), + "secondary_model_weight": 0.2, + "api_key": secret.get('api_key', "YOUR_API_KEY_PLACEHOLDER"), + "api_base": secret.get('api_base', "https://api.dwyu.top/v1") + }, + + "prompt": { + "system_message": system_message, + "num_top_programs": 4, + "use_template_stochasticity": True, + }, + + "database": { + "population_size": 70, + "archive_size": 30, + "num_islands": 4, + "elite_selection_ratio": 0.3, + "exploitation_ratio": 0.6, + }, + + "evaluator": { + "timeout": 90, + "cascade_evaluation": False, + "cascade_thresholds": [1.0], + "parallel_evaluations": 4, + "use_llm_feedback": False, + }, + + "diff_based_evolution": True, + "allow_full_rewrites": False, + } + + class PreserveNewlinesDumper(yaml.SafeDumper): + """Custom YAML dumper that preserves multi-line strings.""" + def represent_scalar(self, tag, value, style=None): + if style is None and isinstance(value, str) and '\n' in value: + style = '|' + return super().represent_scalar(tag, value, style) + + with open(config_file_path, "w") as f: + yaml.dump(config_data, f, Dumper=PreserveNewlinesDumper, + default_flow_style=False, sort_keys=False, indent=2) + + return config_file_path + + +def process_problem(initialized_dataset, problem_id: int, split_name: str) -> str: + """ + Process a single problem using a pre-initialized dataset. + + Loads specific problem data, creates program, evaluator, and config. + Skips processing if essential output files already exist. 
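+    Intended to be invoked in a worker process via multiprocessing.Pool.starmap (see main()).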
+ + Args: + initialized_dataset: Pre-initialized and setup dataset object + problem_id: Index of the problem to process + split_name: Name of the dataset split + + Returns: + Status message indicating success, skip, or error + """ + try: + problem_data = extract_problem_data_from_initialized_dataset(initialized_dataset, problem_id) + + dataset_identifier = problem_data['dataset_identifier'] + equation_idx = problem_data['equation_idx'] + problem_dir = os.path.join('problems', dataset_identifier, str(equation_idx)) + base_data_path = './' + + # Check if all essential files already exist + essential_files = [ + os.path.join(problem_dir, "initial_program.py"), + os.path.join(problem_dir, "evaluator.py"), + os.path.join(problem_dir, "config.yaml"), + os.path.join(base_data_path, problem_dir, 'X_train_for_eval.npy'), + os.path.join(base_data_path, problem_dir, 'y_train_for_eval.npy'), + os.path.join(problem_dir, 'X_test_for_eval.npy'), + os.path.join(problem_dir, 'y_test_for_eval.npy'), + ] + + # Add OOD test files if applicable + if problem_data.get('ood_test') is not None: + essential_files.extend([ + os.path.join(problem_dir, 'X_ood_test_for_eval.npy'), + os.path.join(problem_dir, 'y_ood_test_for_eval.npy') + ]) + + # Check if all files exist + all_files_exist = all(os.path.exists(f) for f in essential_files) + + if all_files_exist: + return f"Skipped (already processed): problem_id: {problem_id} for split: {split_name} ({dataset_identifier}/{equation_idx})" + + # Create necessary files + create_program(problem_data) + create_evaluator(problem_data) + create_config(problem_data) + + return f"Successfully processed problem_id: {problem_id} for split: {split_name} ({dataset_identifier}/{equation_idx})" + + except Exception as e: + import traceback + return f"Error processing problem_id {problem_id} for split {split_name}: {str(e)}\n{traceback.format_exc()}" + + +def main(): + """ + Main entry point for processing symbolic regression problems. + + Initializes datasets and processes problems in parallel using multiprocessing. + """ + # Determine number of processes to use + num_cores_available = os.cpu_count() + num_processes = min(max(1, (num_cores_available - 1) if num_cores_available else 4), 24) + + print(f"Starting processing with {num_processes} processes...") + + # Define dataset splits and their problem counts + splits_data = { + 'bio_pop_growth': 24, + 'chem_react': 36, + 'matsci': 25, + 'phys_osc': 44, + # 'lsrtransform': 111 # Uncomment to include this split + } + + all_tasks = [] + + # Initialize datasets and prepare tasks + for split_name, num_problems in splits_data.items(): + print(f"\nInitializing dataset for split: {split_name}...") + dataset_root_folder = f'dataset/{split_name}' + + try: + # Initialize and setup dataset once per split + initialized_dataset = get_datamodule(split_name, dataset_root_folder) + initialized_dataset.setup() + print(f"Dataset for {split_name} initialized and setup complete.") + + # Prepare tasks for this split + print(f"Preparing tasks for split: {split_name} ({num_problems} problems)") + for problem_id in range(num_problems): + all_tasks.append((initialized_dataset, problem_id, split_name)) + + except Exception as e: + print(f"ERROR: Could not initialize or setup dataset for split {split_name}. Skipping this split.") + print(f"Details: {e}") + import traceback + traceback.print_exc() + continue + + if not all_tasks: + print("No tasks to process. This could be due to errors in dataset initialization. 
Exiting.") + return + + print(f"\nTotal tasks to process across all successfully initialized splits: {len(all_tasks)}") + + # Process tasks in parallel + with multiprocessing.Pool(processes=num_processes) as pool: + results = pool.starmap(process_problem, all_tasks) + + # Print results summary + print("\n--- Processing Complete ---") + success_count = 0 + skipped_count = 0 + error_count = 0 + + for result in results: + print(result) + if "Successfully processed" in result: + success_count += 1 + elif "Skipped" in result: + skipped_count += 1 + elif "Error processing" in result: + error_count += 1 + + print(f"\nSummary: {success_count} successful, {skipped_count} skipped, {error_count} errors.") + print("\nAll tasks finished.") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/symbolic_regression/eval.py b/examples/symbolic_regression/eval.py new file mode 100755 index 000000000..a662092e3 --- /dev/null +++ b/examples/symbolic_regression/eval.py @@ -0,0 +1,395 @@ +from typing import Dict, Any # List removed as it's not used +import json +import os +from pathlib import Path +import numpy as np +# import time # Not used +from scipy.stats import kendalltau +from sklearn.metrics import mean_absolute_percentage_error +from scipy.optimize import minimize +import importlib.util +import sys +# import traceback # Not used +# import json # Not used +# Example custom JSON encoder if you need to save results with numpy types +import json +class NumpyFloatJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + return super(NumpyFloatJSONEncoder, self).default(obj) + +def compute_output_base_metrics(y_pred: np.ndarray, y: np.ndarray) -> Dict[str, Any]: + """ + Computes base metrics after filtering NaNs from predictions. + Ensures inputs y_pred and y are treated as 1D arrays. + """ + # Ensure y_pred and y are 1D arrays. + y_pred_1d = np.asarray(y_pred).squeeze() + y_1d = np.asarray(y).squeeze() + + # If squeeze results in 0-D (scalar), reshape to 1-D + if y_pred_1d.ndim == 0: + y_pred_1d = y_pred_1d.reshape(1) + if y_1d.ndim == 0: + y_1d = y_1d.reshape(1) + + base_metrics_nan = { + "mse": float('nan'), "nmse": float('nan'), "r2": float('nan'), + "kdt": float('nan'), "mape": float('nan'), "num_valid_points": 0 + } + + if y_pred_1d.shape != y_1d.shape and not (y_pred_1d.size == 0 and y_1d.size == 0): + return {**base_metrics_nan, "error": "y_pred and y have incompatible shapes after ensuring 1D."} + + nonnan_mask = ~np.isnan(y_pred_1d) + y_pred_filtered = y_pred_1d[nonnan_mask] + y_filtered = y_1d[nonnan_mask] + + if y_pred_filtered.size == 0: # All predictions were NaN or inputs were empty + return {**base_metrics_nan, "error": "All predictions are NaN or no data to compare after filtering."} + + mse = np.mean((y_filtered - y_pred_filtered)**2) + var_y = np.var(y_filtered) + + if var_y == 0: + nmse = 0.0 if mse == 0 else float('inf') # Consistent if true values are constant + else: + nmse = mse / var_y + + sum_sq_res = np.sum((y_filtered - y_pred_filtered)**2) + sum_sq_total = np.sum((y_filtered - np.mean(y_filtered))**2) # Use mean of filtered y + + if sum_sq_total == 0: # True values (after filtering) are constant + r2 = 1.0 if sum_sq_res == 0 else -float('inf') # Or 0.0 if mse is also 0, definition varies. Sklearn uses 1.0. 
+ else: + r2 = 1 - (sum_sq_res / sum_sq_total) + + kdt = float('nan') + try: + if y_filtered.size >= 2: # Kendall's tau requires at least 2 points + kdt_val, _ = kendalltau(y_filtered, y_pred_filtered) + kdt = float(kdt_val) # Ensure it's a basic float (handles np.nan) + # If size < 2, kdt remains float('nan') + except ValueError: # Should be less common with size check, but as a fallback + kdt = float('nan') # Explicitly set, though already NaN. + + mape = float('nan') + try: + valid_mape_indices = y_filtered != 0 + if np.sum(valid_mape_indices) > 0: + mape = mean_absolute_percentage_error(y_filtered[valid_mape_indices], y_pred_filtered[valid_mape_indices]) + elif y_filtered.size > 0: # All true values are zero + mape = 0.0 if np.all(y_pred_filtered == 0) else float('inf') + # If y_filtered.size is 0, mape remains float('nan') + except ValueError: # Fallback for any other MAPE calculation issues + mape = float('nan') + + return { + "mse": float(mse), + "nmse": float(nmse), + "r2": float(r2), + "kdt": kdt, # Already a float + "mape": float(mape) if mape is not float('inf') else float('inf'), # Ensure float, preserve inf + "num_valid_points": int(y_pred_filtered.size), + } + +def objective_function(params: np.ndarray, model_func: callable, X_matrix: np.ndarray, y_true_vector: np.ndarray) -> float: + """ + Objective function for scipy.optimize.minimize. + Calculates MSE of the model_func with given params on X_matrix, y_true_vector. + """ + # model_func callable status is checked before calling minimize in the evaluation function. + try: + predictions = model_func(X_matrix, params) + if not isinstance(predictions, np.ndarray) or predictions.shape != y_true_vector.shape: + # print(f"Debug: Objective func - Bad prediction shape/type. Got {type(predictions)}, shape {getattr(predictions, 'shape', 'N/A')}. Expected {y_true_vector.shape}") + return float('inf') + if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)): + # print("Debug: Objective func - Predictions contain NaN/Inf.") + return float('inf') + except Exception: # Catch any error during model prediction + # print(f"Debug: Objective func - Exception during model_func call: {e_obj}") + return float('inf') + + mse = np.mean((predictions - y_true_vector)**2) + return mse + +def evaluation( + program_path: str, + data_path: str, +) -> Dict[str, Dict[str, Any]]: + """ + Evaluates a model by loading it, optimizing its parameters, and testing it. + The model function from program_path is expected to be named 'func'. + """ + base_error_metrics = { + "mse": float('nan'), "nmse": float('nan'), "r2": float('nan'), + "kdt": float('nan'), "mape": float('nan'), "num_valid_points": 0, + } + + def _create_error_return(error_message: str) -> Dict[str, Dict[str, Any]]: + print(f"Error: {error_message}") + return { + "train_metrics": {**base_error_metrics, "error": error_message}, + "test_metrics": {**base_error_metrics, "error": error_message}, + "ood_metrics": {**base_error_metrics, "error": error_message}, + } + + # 1. 
Load data + try: + p_data_path = Path(data_path) + train_x = np.load(p_data_path / "X_train_for_eval.npy") + train_y = np.load(p_data_path / "y_train_for_eval.npy").squeeze() # Ensure 1D + test_x = np.load(p_data_path / "X_test_for_eval.npy") + test_y = np.load(p_data_path / "y_test_for_eval.npy").squeeze() # Ensure 1D + test_x_ood = np.load(p_data_path / "X_ood_test_for_eval.npy") + test_y_ood = np.load(p_data_path / "y_ood_test_for_eval.npy").squeeze() # Ensure 1D + except FileNotFoundError as e: + return _create_error_return(f"Data file not found: {e.filename}") + except Exception as e: + return _create_error_return(f"Error loading or processing data: {str(e)}") + + # 2. Load program (model function) + model_func = None + try: + p_program_path = Path(program_path) + if not p_program_path.is_file(): + raise FileNotFoundError(f"Program file not found: {program_path}") + + spec = importlib.util.spec_from_file_location("custom_model_module", str(p_program_path)) + if spec is None or spec.loader is None: + raise ImportError(f"Could not create module spec from {program_path}") + + custom_model_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(custom_model_module) + + model_func = getattr(custom_model_module, "func", None) + if not callable(model_func): + raise AttributeError(f"'func' function not found or not callable in {program_path}") + except Exception as e: + return _create_error_return(f"Failed to load model function 'func' from '{program_path}': {str(e)}") + + # 3. Optimize parameters on training data + optimized_params = None + num_attempts = 10 # Default number of attempts + best_func_value = float('inf') + optimization_critical_error_msg = None + + # Try to get num_params from the model if it provides it, otherwise default + num_params_to_optimize = getattr(model_func, 'num_params', 10) # Default to 10 if not specified + + print(f"Starting optimization for {program_path} with {num_attempts} attempts (num_params: {num_params_to_optimize})...") + for i in range(num_attempts): + print(f"Attempt {i+1}/{num_attempts}") + initial_params = np.random.rand(num_params_to_optimize) + try: + optimization_result = minimize( + objective_function, + initial_params, + args=(model_func, train_x, train_y), + method='BFGS', + # options={'maxiter': 1000, 'disp': False} # Example options + ) + if optimization_result.success: + print(f"Attempt {i+1} successful. Func value: {optimization_result.fun}") + if optimization_result.fun < best_func_value: + best_func_value = optimization_result.fun + optimized_params = optimization_result.x + print(f"New best result found in attempt {i+1}. Func value: {best_func_value}") + else: + print(f"Warning: Optimization attempt {i+1} did not converge. Message: {optimization_result.message}. Func value: {optimization_result.fun}") + if optimization_result.fun < best_func_value: # Still consider if it's the best so far + print(f"Non-converged result from attempt {i+1} is an improvement. 
Func value: {optimization_result.fun}") + best_func_value = optimization_result.fun + optimized_params = optimization_result.x + + except Exception as e: + optimization_critical_error_msg = f"Critical error during optimization attempt {i+1} for {program_path}: {str(e)}" + print(f"Error: {optimization_critical_error_msg}") + break + + if optimization_critical_error_msg: + return _create_error_return(optimization_critical_error_msg) + + def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) -> Dict[str, Any]: + if optimized_params is None: + msg = f"Optimization failed to find parameters for {program_path}, cannot evaluate {set_name}." + return {**base_error_metrics, "error": msg} + try: + pred_y = model_func(X_data, optimized_params) + if not isinstance(pred_y, np.ndarray): + raise ValueError(f"{set_name} predictions are not numpy arrays. Got {type(pred_y)}") + + metrics = compute_output_base_metrics(pred_y, y_data) + if "error" in metrics and metrics["num_valid_points"] == 0 : + print(f"Warning for {set_name} ({program_path}): {metrics['error']}") + return metrics + except Exception as e: + error_msg = f"{set_name} evaluation failed for '{program_path}': {str(e)}" + print(f"Error: {error_msg}") + return {**base_error_metrics, "error": error_msg} + + train_metrics = _get_metrics_for_set(train_x, train_y, "Train set") + test_metrics = _get_metrics_for_set(test_x, test_y, "Test set") + ood_metrics = _get_metrics_for_set(test_x_ood, test_y_ood, "OOD test set") + + return { + "train_metrics": train_metrics, + "test_metrics": test_metrics, + "ood_metrics": ood_metrics, + } + +if __name__ == '__main__': + if len(sys.argv) < 2: + print("Usage: python your_script_name.py ") + sys.exit(1) + + root_path_arg = sys.argv[1] + path_obj = Path(root_path_arg) + problem_dirs = [] + + # Check if the path is a single problem directory + # A problem directory is expected to contain data files directly and an openevolve_output subdir + program_file_check = path_obj / 'openevolve_output' / 'best' / 'best_program.py' + data_file_check = path_obj / "X_train_for_eval.npy" + + if data_file_check.exists() and program_file_check.exists(): + problem_dirs.append(path_obj) + print(f"Identified as single problem directory: {path_obj}") + else: + # Assume path is a parent directory containing multiple problem subdirectories + print(f"Identified as parent directory: {path_obj}. 
Searching for problem subdirectories...") + try: + if not path_obj.is_dir(): + print(f"Error: Root path {root_path_arg} is not a directory.") + sys.exit(1) + for d in path_obj.iterdir(): + if d.is_dir(): + # Check if this subdirectory looks like a problem directory + if (d / "X_train_for_eval.npy").exists() and \ + (d / 'openevolve_output' / 'best' / 'best_program.py').exists(): + problem_dirs.append(d) + print(f" Found problem subdirectory: {d.name}") + else: + print(f" Skipping subdirectory (missing data or program): {d.name}") + except FileNotFoundError: + print(f"Error: Root directory not found: {root_path_arg}") + sys.exit(1) + + if not problem_dirs: + print(f"No valid problem subdirectories found in '{root_path_arg}' or '{root_path_arg}' itself is not a valid problem directory.") + sys.exit(1) + + all_results = {} + for subdir_path in problem_dirs: + problem_name = subdir_path.name + # if "21" not in problem_name: continue + print(f"\nProcessing problem: {problem_name}") + program_file_path = subdir_path / 'openevolve_output' / 'best' / 'best_program.py' + data_dir_path = subdir_path + + if not program_file_path.exists(): # Should have been caught by subdir check, but as a safeguard + print(f"Skipping {problem_name}: best_program.py not found at {program_file_path}") + all_results[problem_name] = { + "train_metrics": {"error": "best_program.py not found"}, + "test_metrics": {"error": "best_program.py not found"}, + "ood_metrics": {"error": "best_program.py not found"}, + } + continue + + print(f" Program path: {program_file_path}") + print(f" Data path: {data_dir_path}") + + metrics_output = evaluation( # Renamed from 'metrics' to avoid conflict + program_path=str(program_file_path), + data_path=str(data_dir_path), + ) + print(f" Metrics for {problem_name}: {metrics_output}") + all_results[problem_name] = metrics_output + + print("\n--- All Evaluation Results ---") + for problem, result in all_results.items(): + print(f"\nProblem: {problem}") + print(f" Train Metrics: {result.get('train_metrics')}") + print(f" Test Metrics: {result.get('test_metrics')}") + print(f" OOD Metrics: {result.get('ood_metrics')}") + + # --- Overall Performance Calculation --- + overall_performance = {} + # Metrics to aggregate: mse, nmse, r2, kdt, mape + metric_keys = ["mse", "nmse", "r2", "kdt", "mape"] + dataset_types = ["train_metrics", "test_metrics", "ood_metrics"] + + for d_type in dataset_types: + overall_performance[d_type] = {} + for m_key in metric_keys: + all_scores = [] + for problem_name, results_data in all_results.items(): + # Ensure the dataset type (e.g., train_metrics) exists and doesn't have a top-level error + if d_type in results_data and "error" not in results_data[d_type]: + score = results_data[d_type].get(m_key) + # Only include if score is a number (not nan, not None, not inf for some metrics initially) + # np.nanmean and np.nanmedian will handle internal NaNs gracefully. + # We explicitly exclude inf from aggregation here, as it can skew means badly. + # For R2, -inf is possible and should be handled by nanmedian/nanmean or filtered if desired. + if isinstance(score, (int, float)) and not np.isinf(score): # np.isnan(score) is fine for nan* functions + all_scores.append(score) + elif score == -float('inf') and m_key == "r2": # Special case for R2, allow -inf + all_scores.append(score) + + + if all_scores: + # Replace -inf with NaN for R2 mean calculation if desired, or handle as is. + # For simplicity, we'll let nanmean/nanmedian handle it. 
+ # Extreme values can still affect the mean significantly. + + # Filter out inf values for mean calculation as they make it non-informative + # but keep them for median if appropriate (or filter there too). + # For simplicity here, we are filtering inf before both. + # A more nuanced approach might replace inf with a very large/small number or handle per metric. + + scores_for_mean = [s for s in all_scores if s != -float('inf')] # R2 can be -inf + + overall_performance[d_type][f"mean_{m_key}"] = np.nanmean(scores_for_mean) if scores_for_mean else float('nan') + overall_performance[d_type][f"median_{m_key}"] = np.nanmedian(all_scores) if all_scores else float('nan') + overall_performance[d_type][f"num_problems_for_{m_key}"] = len(all_scores) + else: + overall_performance[d_type][f"mean_{m_key}"] = float('nan') + overall_performance[d_type][f"median_{m_key}"] = float('nan') + overall_performance[d_type][f"num_problems_for_{m_key}"] = 0 + + print("\n--- Overall Performance Summary ---") + for d_type, metrics_summary in overall_performance.items(): + print(f"\n{d_type.replace('_', ' ').title()}:") + if not metrics_summary: + print(" No data for overall summary.") + continue + for stat_name, value in metrics_summary.items(): + if "num_problems_for_" in stat_name: # Print count separately or alongside + m_key = stat_name.replace("num_problems_for_", "") + print(f" Number of problems for {m_key.upper()} stats: {value}") + elif "mean_" in stat_name or "median_" in stat_name: + print(f" {stat_name.replace('_', ' ').title()}: {value:.4f}" if isinstance(value, float) and not np.isnan(value) else f" {stat_name.replace('_', ' ').title()}: {value}") + + + # Add overall performance to the results to be saved + all_results["overall_performance_summary"] = overall_performance + + # Optional: Save all_results to a JSON file + # Determine the output file path. If root_path_arg is a file, save alongside it. If a dir, save inside it. + if path_obj.is_file(): # Should not happen with current logic, but as a fallback + output_results_file = path_obj.parent / "all_evaluation_results.json" + else: # path_obj is a directory + output_results_file = path_obj / "all_evaluation_results.json" + + try: + with open(output_results_file, 'w') as f: + json.dump(all_results, f, indent=4, cls=NumpyFloatJSONEncoder) + print(f"\nAll results, including overall performance, saved to {output_results_file}") + except Exception as e: + print(f"\nError saving results to JSON: {e}") \ No newline at end of file diff --git a/examples/symbolic_regression/scripts.sh b/examples/symbolic_regression/scripts.sh new file mode 100644 index 000000000..1351a36b1 --- /dev/null +++ b/examples/symbolic_regression/scripts.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# Define the number of problems for each split +declare -A split_counts=( + ["bio_pop_growth"]=24 + ["chem_react"]=36 + ["matsci"]=25 + ["phys_osc"]=44 +) + +declare -A split_problem_dir_prefixes=( + ["bio_pop_growth"]="BPG" + ["chem_react"]="CRK" + ["matsci"]="MatSci" + ["phys_osc"]="PO" +) + +base_problems_dir="./problems" + +echo "Starting all experiments..." + +for split_name in "${!split_counts[@]}"; do + count=${split_counts[$split_name]} + problem_dir_prefix=${split_problem_dir_prefixes[$split_name]} + + # Check if a prefix is defined (it can be an empty string if paths are like "split_name/0/") + if [ -z "$problem_dir_prefix" ] && [ "${split_problem_dir_prefixes[$split_name]+_}" != "_" ]; then + # This means the key exists but the value is an empty string, which is allowed. 
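+        # ("${split_problem_dir_prefixes[$split_name]+_}" is the standard set-vs-unset check for an associative-array key.)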
+ : # Do nothing, empty prefix is fine. + elif [ -z "$problem_dir_prefix" ]; then + echo "" + echo "Warning: No problem directory prefix defined for split '$split_name' in 'split_problem_dir_prefixes'. Skipping this split." + continue + fi + + echo "" + echo "----------------------------------------------------" + echo "Processing Split: $split_name" + echo "Number of problems: $count" + echo "Problem directory prefix: '$problem_dir_prefix'" # Prefix like CRK, BPG, etc. + echo "Expected problem path structure: $base_problems_dir/$split_name/${problem_dir_prefix}[ID]/" + echo "----------------------------------------------------" + + # Loop from problem_id 0 to count-1 + for (( i=0; i Date: Thu, 22 May 2025 17:45:57 +0000 Subject: [PATCH 2/3] update readme update README undo sync sync --- README.md | 273 +++++++++++++------------ examples/symbolic_regression/README.md | 237 +++++++++++---------- 2 files changed, 265 insertions(+), 245 deletions(-) diff --git a/README.md b/README.md index 8a3a1e2a8..5a790f3ec 100644 --- a/README.md +++ b/README.md @@ -1,192 +1,201 @@ -# Evolving Symbolic Regression with OpenEvolve on LLM-SRBench 🧬🔍 +# OpenEvolve -This example demonstrates how **OpenEvolve** can be utilized to perform **symbolic regression** tasks using the **LLM-SRBench benchmark** (highlighted at ICML 2025). It showcases OpenEvolve's capability to evolve Python code, transforming simple mathematical expressions into more complex and accurate models that fit given datasets. +An open-source implementation of the AlphaEvolve system described in the Google DeepMind paper "AlphaEvolve: A coding agent for scientific and algorithmic discovery" (2025). ------- +![OpenEvolve Logo](openevolve-logo.png) -## 🎯 Problem Description: Symbolic Regression on LLM-SRBench +## Overview -**Symbolic Regression** is the task of discovering a mathematical expression that best fits a given dataset. Unlike traditional regression techniques that optimize parameters for a predefined model structure, symbolic regression aims to find both the **structure of the model** and its **parameters**. +OpenEvolve is an evolutionary coding agent that uses Large Language Models to optimize code through an iterative process. It orchestrates a pipeline of LLM-based code generation, evaluation, and selection to continuously improve programs for a variety of tasks. -This example leverages **LLM-SRBench**, a benchmark specifically designed for Large Language Model-based Symbolic Regression. The core objective is to use OpenEvolve to evolve an initial, often simple, model (e.g., a linear model) into a more sophisticated symbolic expression. This evolved expression should accurately capture the underlying relationships within various scientific datasets provided by the benchmark. +Key features: +- Evolution of entire code files, not just single functions +- Support for multiple programming languages +- Supports OpenAI-compatible APIs for any LLM +- Multi-objective optimization +- Flexible prompt engineering +- Distributed evaluation ------- +## How It Works -## 🚀 Getting Started +OpenEvolve follows an evolutionary approach with the following components: -Follow these steps to set up and run the symbolic regression benchmark example: +![OpenEvolve Architecture](openevolve-architecture.png) -### 1. Configure API Secrets +1. **Prompt Sampler**: Creates context-rich prompts containing past programs, their scores, and problem descriptions +2. **LLM Ensemble**: Generates code modifications via an ensemble of language models +3. 
**Evaluator Pool**: Tests generated programs and assigns scores +4. **Program Database**: Stores programs and their evaluation metrics, guiding future evolution -You'll need to provide your API credentials for the language models used by OpenEvolve. +The controller orchestrates interactions between these components in an asynchronous pipeline, maximizing throughput to evaluate as many candidate solutions as possible. -- Create a `secrets.yaml` file in the example directory. -- Add your API key and model preferences: +## Getting Started -YAML +### Installation +To install natively, use: +```bash +git clone https://github.com/codelion/openevolve.git +cd openevolve +pip install -e . ``` -# secrets.yaml -api_key: -api_base: "https://api.openai.com/v1" # Or your custom endpoint -primary_model: "gpt-4o" -secondary_model: "o3" # Or another preferred model for specific tasks -``` -Replace `` with your actual OpenAI API key. +### Quick Start + +```python +from openevolve import OpenEvolve + +# Initialize the system +evolve = OpenEvolve( + initial_program_path="path/to/initial_program.py", + evaluation_file="path/to/evaluator.py", + config_path="path/to/config.yaml" +) + +# Run the evolution +best_program = await evolve.run(iterations=1000) +print(f"Best program metrics:") +for name, value in best_program.metrics.items(): + print(f" {name}: {value:.4f}") +``` -### 2. Load Benchmark Tasks & Generate Initial Programs +### Command-Line Usage -The `data_api.py` script is crucial for setting up the environment. It prepares tasks from the LLM-SRBench dataset (defined by classes in `./bench`, and will be located at `./problems`). +OpenEvolve can also be run from the command line: -For each benchmark task, this script will automatically generate: +```bash +python openevolve-run.py path/to/initial_program.py path/to/evaluator.py --config path/to/config.yaml --iterations 1000 +``` -- `initial_program.py`: A starting Python program, typically a simple linear model. -- `evaluator.py`: A tailored evaluation script for the task. -- `config.yaml`: An OpenEvolve configuration file specific to the task. +### Resuming from Checkpoints -Run the script from your terminal: +OpenEvolve automatically saves checkpoints at intervals specified by the `checkpoint_interval` config parameter (default is 10 iterations). You can resume an evolution run from a saved checkpoint: ```bash -python data_api.py +python openevolve-run.py path/to/initial_program.py path/to/evaluator.py \ + --config path/to/config.yaml \ + --checkpoint path/to/checkpoint_directory \ + --iterations 50 ``` -This will create subdirectories for each benchmark task, populated with the necessary files. - -### 3. Run OpenEvolve +When resuming from a checkpoint: +- The system loads all previously evolved programs and their metrics +- Checkpoint numbering continues from where it left off (e.g., if loaded from checkpoint_50, the next checkpoint will be checkpoint_60) +- All evolution state is preserved (best programs, feature maps, archives, etc.) +- Each checkpoint directory contains a copy of the best program at that point in time -Use the provided shell script `scripts.sh` to execute OpenEvolve across the generated benchmark tasks. This script iterates through the task-specific configurations and applies the evolutionary process. 
+Example workflow with checkpoints: ```bash -bash scripts.sh +# Run for 50 iterations (creates checkpoints at iterations 10, 20, 30, 40, 50) +python openevolve-run.py examples/function_minimization/initial_program.py \ + examples/function_minimization/evaluator.py \ + --iterations 50 + +# Resume from checkpoint 50 for another 50 iterations (creates checkpoints at 60, 70, 80, 90, 100) +python openevolve-run.py examples/function_minimization/initial_program.py \ + examples/function_minimization/evaluator.py \ + --checkpoint examples/function_minimization/openevolve_output/checkpoints/checkpoint_50 \ + --iterations 50 ``` -### 4. Evaluate Results +### Comparing Results Across Checkpoints -After OpenEvolve has completed its runs, you can evaluate the performance on different subsets of tasks (e.g., bio, chemical, physics, material). The `eval.py` script collates the results and provides a summary. +Each checkpoint directory contains the best program found up to that point, making it easy to compare solutions over time: -```bash -python eval.py +``` +checkpoints/ + checkpoint_10/ + best_program.py # Best program at iteration 10 + best_program_info.json # Metrics and details + programs/ # All programs evaluated so far + metadata.json # Database state + checkpoint_20/ + best_program.py # Best program at iteration 20 + ... ``` -For example, to evaluate results for the 'physics' subset located in `./problems/phys_osc/`, you would run: +You can compare the evolution of solutions by examining the best programs at different checkpoints: ```bash -python eval.py ./problems/phys_osc +# Compare best programs at different checkpoints +diff -u checkpoints/checkpoint_10/best_program.py checkpoints/checkpoint_20/best_program.py + +# Compare metrics +cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics ``` +### Docker -This script will also save a `JSON` file containing detailed results for your analysis. +You can also install and execute via Docker: +```bash +docker build -t openevolve . +docker run --rm -v .:/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000 +``` ------- +## Configuration -## 🌱 Algorithm Evolution: From Linear Model to Complex Expression +OpenEvolve is highly configurable. You can specify configuration options in a YAML file: -OpenEvolve works by iteratively modifying an initial Python program to find a better-fitting mathematical expression. +```yaml +# Example configuration +max_iterations: 1000 +llm: + primary_model: "gemini-2.0-flash-lite" + secondary_model: "gemini-2.0-flash" + temperature: 0.7 +database: + population_size: 500 + num_islands: 5 +``` -### Initial Algorithm (Example: Linear Model) +Sample configuration files are available in the `configs/` directory: +- `default_config.yaml`: Comprehensive configuration with all available options -The `data_api.py` script typically generates a basic linear model as the starting point. For a given task, this `initial_program.py` might look like this: +See the [Configuration Guide](configs/default_config.yaml) for a full list of options. -```python -""" -Initial program: A naive linear model for symbolic regression. -This model predicts the output as a linear combination of input variables -or a constant if no input variables are present. -The function is designed for vectorized input (X matrix). 
- -Target output variable: dv_dt (Acceleration in Nonl-linear Harmonic Oscillator) -Input variables (columns of x): x (Position at time t), t (Time), v (Velocity at time t) -""" -import numpy as np - -# Input variable mapping for x (columns of the input matrix): -# x[:, 0]: x (Position at time t) -# x[:, 1]: t (Time) -# x[:, 2]: v (Velocity at time t) - -# Parameters will be optimized by BFGS outside this function. -# Number of parameters expected by this model: 10. -# Example initialization: params = np.random.rand(10) - -# EVOLVE-BLOCK-START - -def func(x, params): - """ - Calculates the model output using a linear combination of input variables - or a constant value if no input variables. Operates on a matrix of samples. - - Args: - x (np.ndarray): A 2D numpy array of input variable values, shape (n_samples, n_features). - n_features is 3. - If n_features is 0, x should be shape (n_samples, 0). - The order of columns in x must correspond to: - (x, t, v). - params (np.ndarray): A 1D numpy array of parameters. - Expected length: 10. - - Returns: - np.ndarray: A 1D numpy array of predicted output values, shape (n_samples,). - """ - - result = x[:, 0] * params[0] + x[:, 1] * params[1] + x[:, 2] * params[2] - return result - -# EVOLVE-BLOCK-END - -# This part remains fixed (not evolved) -# It ensures that OpenEvolve can consistently call the evolving function. -def run_search(): - return func - -# Note: The actual structure of initial_program.py is determined by data_api.py. -``` +## Examples -### Evolved Algorithm (Discovered Symbolic Expression) +See the `examples/` directory for complete examples of using OpenEvolve on various problems: -OpenEvolve will iteratively modify the Python code within the `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` markers in `initial_program.py`. The goal is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. +### Circle Packing -An evolved `func` might, for instance, discover a non-linear expression like: +Our implementation of the circle packing problem from the AlphaEvolve paper. For the n=26 case, where one needs to pack 26 circles in a unit square we also obtain SOTA results. -```python -# Hypothetical example of what OpenEvolve might find: -def func(x, params): - # Assuming X_train_scaled maps to x and const maps to a parameter in params - predictions = np.sin(x[:, 0]) * x[:, 1]**2 + params[0] - return predictions -``` +[Explore the Circle Packing Example](examples/circle_packing/) -*(This is a simplified, hypothetical example to illustrate the transformation.)* +We have sucessfully replicated the results from the AlphaEvolve paper, below is the packing found by OpenEvolve after 800 iterations ------- +![alpha-evolve-replication](https://github.com/user-attachments/assets/00100f9e-2ac3-445b-9266-0398b7174193) -## ⚙️ Key Configuration & Approach +This is exactly the packing reported by AlphaEolve in their paper (Figure 14): -- LLM Models: - - **Primary Model:** `gpt-4o` (or your configured `primary_model`) is typically used for sophisticated code generation and modification. - - **Secondary Model:** `o3` (or your configured `secondary_model`) can be used for refinements, simpler modifications, or other auxiliary tasks within the evolutionary process. -- Evaluation Strategy: - - Currently, this example employs a direct evaluation strategy (not **cascade evaluation**). 
-- Objective Function: - - The primary objective is to **minimize the Mean Squared Error (MSE)** between the model's predictions and the true values on the training data. +![alpha-evolve-results](https://github.com/user-attachments/assets/0c9affa5-053d-404e-bb2d-11479ab248c9) ------- +### Function Minimization -## 📊 Results +An example showing how OpenEvolve can transform a simple random search algorithm into a sophisticated simulated annealing approach. -The `eval.py` script will help you collect and analyze performance metrics. The LLM-SRBench paper provides a comprehensive comparison of various baselines. For results generated by this specific OpenEvolve example, you should run the evaluation script as described in the "Getting Started" section. +[Explore the Function Minimization Example](examples/function_minimization/) -For benchmark-wide comparisons and results from other methods, please refer to the official LLM-SRBench paper. +## Preparing Your Own Problems -| **Task Category** | Med. NMSE (Test) | Med. R2 (Test) | **Med. NMSE (OOD Test)** | **Med. R2 (OOD Test)** | -| ----------------------- | ---------------- | -------------- | ------------------------ | ---------------------- | -| Chemistry (36 tasks) | 2.3419e-06 | 1.000 | 3.1384e-02 | 0.9686 | -| Biology (24 tasks) | | | | | -| Physics (44 tasks) | 1.8548e-05 | 1.000 | 7.9255e-04 | 0.9992 | -| Material Sc. (25 tasks) | | | | | +To use OpenEvolve for your own problems: ------- +1. **Mark code sections** to evolve with `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` comments +2. **Create an evaluation function** that returns a dictionary of metrics +3. **Configure OpenEvolve** with appropriate parameters +4. **Run the evolution** process -## 🤝 Contribution +## Citation -This OpenEvolve example for LLM-SRBench was implemented by [**Haowei Lin**](https://linhaowei1.github.io/) from Peking University. If you encounter any issues or have questions, please feel free to reach out to Haowei via email (linhaowei@pku.edu.cn) for discussion. +If you use OpenEvolve in your research, please cite: +``` +@software{openevolve, + title = {OpenEvolve: Open-source implementation of AlphaEvolve}, + author = {Asankhaya Sharma}, + year = {2025}, + publisher = {GitHub}, + url = {https://github.com/codelion/openevolve} +} +``` diff --git a/examples/symbolic_regression/README.md b/examples/symbolic_regression/README.md index 76692ab96..5b9caa509 100644 --- a/examples/symbolic_regression/README.md +++ b/examples/symbolic_regression/README.md @@ -1,16 +1,16 @@ -# Evolving Symbolic Regression Models with OpenEvolve on LLM-SRBench 🧬🔍 +# Evolving Symbolic Regression with OpenEvolve on LLM-SRBench 🧬🔍 -This example demonstrates how **OpenEvolve** can be utilized to perform **symbolic regression** tasks using the **LLM-SRBench benchmark**. It showcases the ability of OpenEvolve to evolve Python code representing mathematical expressions to fit given datasets. +This example demonstrates how **OpenEvolve** can be utilized to perform **symbolic regression** tasks using the **[LLM-SRBench benchmark](https://arxiv.org/pdf/2504.10415)**. It showcases OpenEvolve's capability to evolve Python code, transforming simple mathematical expressions into more complex and accurate models that fit given datasets. ---- +------ -## Problem Description: Symbolic Regression on LLM-SRBench +## 🎯 Problem Description: Symbolic Regression on LLM-SRBench -**Symbolic Regression** is the task of discovering a mathematical expression that best fits a given dataset. 
Unlike traditional regression techniques that fit parameters to a predefined model structure, symbolic regression aims to find both the structure of the model and its parameters. +**Symbolic Regression** is the task of discovering a mathematical expression that best fits a given dataset. Unlike traditional regression techniques that optimize parameters for a predefined model structure, symbolic regression aims to find both the **structure of the model** and its **parameters**. -This example leverages **LLM-SRBench**, a benchmark for Large Language Model based Symbolic Regression (highlighted at ICML 2025). The goal is to use OpenEvolve to evolve an initial, simple model (e.g., a linear model) into a more accurate symbolic expression that captures the underlying relationships in various scientific datasets provided by the benchmark. +This example leverages **LLM-SRBench**, a benchmark specifically designed for Large Language Model-based Symbolic Regression. The core objective is to use OpenEvolve to evolve an initial, often simple, model (e.g., a linear model) into a more sophisticated symbolic expression. This evolved expression should accurately capture the underlying relationships within various scientific datasets provided by the benchmark. ---- +------ ## 🚀 Getting Started @@ -18,34 +18,44 @@ Follow these steps to set up and run the symbolic regression benchmark example: ### 1. Configure API Secrets -You'll need to provide your API credentials for the language models. -Create a `secrets.yaml` file in the example directory with the following structure: +You'll need to provide your API credentials for the language models used by OpenEvolve. -```yaml +- Create a `secrets.yaml` file in the example directory. +- Add your API key and model preferences: + +YAML + +``` # secrets.yaml api_key: -api_base: "[https://api.openai.com/v1](https://api.openai.com/v1)" # Or your custom endpoint +api_base: "https://api.openai.com/v1" # Or your custom endpoint primary_model: "gpt-4o" -secondary_model: "o3" # Or another preferred model +secondary_model: "o3" # Or another preferred model for specific tasks ``` -Replace `` with your actual key. + +Replace `` with your actual OpenAI API key. ### 2. Load Benchmark Tasks & Generate Initial Programs -The `data_api.py` script is used to load tasks from the LLM-SRBench dataset (located in `./problems` and defined by classes in `./bench`). This script will also automatically generate: -* An `initial_program.py` (typically a simple linear model) for each benchmark task. -* An `evaluator.py` tailored for each task. -* A `config.yaml` for OpenEvolve for each task. +The `data_api.py` script is crucial for setting up the environment. It prepares tasks from the LLM-SRBench dataset (defined by classes in `./bench`, and will be located at `./problems`). + +For each benchmark task, this script will automatically generate: + +- `initial_program.py`: A starting Python program, typically a simple linear model. +- `evaluator.py`: A tailored evaluation script for the task. +- `config.yaml`: An OpenEvolve configuration file specific to the task. + +Run the script from your terminal: -Run the script: ```bash python data_api.py ``` -This will prepare all necessary files within subdirectories for each benchmark task. + +This will create subdirectories for each benchmark task, populated with the necessary files. ### 3. Run OpenEvolve -Use the provided shell script `scripts.sh` to execute OpenEvolve across the generated benchmark tasks. 
This script will iterate through the task-specific configurations and apply the evolutionary process. +Use the provided shell script `scripts.sh` to execute OpenEvolve across the generated benchmark tasks. This script iterates through the task-specific configurations and applies the evolutionary process. ```bash bash scripts.sh @@ -53,129 +63,130 @@ bash scripts.sh ### 4. Evaluate Results -After OpenEvolve has completed its runs, you can evaluate the performance on different subsets of tasks (e.g., `bio`, `chemical`, `physics`, `material`). The `eval.py` script collates the results and provides a summary. +After OpenEvolve has completed its runs, you can evaluate the performance on different subsets of tasks (e.g., bio, chemical, physics, material). The `eval.py` script collates the results and provides a summary. ```bash python eval.py ``` -For example, to evaluate results for the 'physics' subset, if they are located in `results/physics_tasks`, you might run `python eval.py results/physics_tasks`. -This will also save a JSON file containing detailed results for your analysis. +For example, to evaluate results for the 'physics' subset located in `./problems/phys_osc/`, you would run: + +```bash +python eval.py ./problems/phys_osc +``` + +This script will also save a `JSON` file containing detailed results for your analysis. ---- +------ -## Algorithm Evolution: From Linear Model to...? +## 🌱 Algorithm Evolution: From Linear Model to Complex Expression -### Initial Algorithm (e.g., Linear Model) +OpenEvolve works by iteratively modifying an initial Python program to find a better-fitting mathematical expression. -The `data_api.py` script typically generates a basic linear model as the starting point for evolution. For a given task, this `initial_program.py` might look something like: +### Initial Algorithm (Example: Linear Model) + +The `data_api.py` script typically generates a basic linear model as the starting point. For a given task, this `initial_program.py` might look like this: ```python -# initial_program.py (conceptual example) +""" +Initial program: A naive linear model for symbolic regression. +This model predicts the output as a linear combination of input variables +or a constant if no input variables are present. +The function is designed for vectorized input (X matrix). + +Target output variable: dv_dt (Acceleration in Nonl-linear Harmonic Oscillator) +Input variables (columns of x): x (Position at time t), t (Time), v (Velocity at time t) +""" import numpy as np -# [[evolve_start]] -def symbolic_model(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train): - # A simple linear model or a placeholder function - # The actual initial model is generated by data_api.py - predictions = np.zeros(len(X_train_scaled)) # Placeholder - # For a real linear model, it might be: - # if X_train_scaled.shape[1] > 0: - # coeffs = np.random.rand(X_train_scaled.shape[1]) - # intercept = np.random.rand() - # predictions = X_train_scaled @ coeffs + intercept - # else: - # predictions = np.full(len(X_train_scaled), np.mean(y_train_scaled)) - - # The goal of openevolve is to replace this function - # with one that produces better predictions by finding - # a symbolic expression using the input features. 
- mse = np.mean((y_train_scaled - predictions)**2) - return mse, predictions -# [[evolve_end]] - -def evaluate(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train): - # [[evolve_start]] - mse, _ = symbolic_model(X_train_scaled, X_test_scaled, y_train_scaled, feature_names, X_train, y_train) - # [[evolve_end]] - return mse +# Input variable mapping for x (columns of the input matrix): +# x[:, 0]: x (Position at time t) +# x[:, 1]: t (Time) +# x[:, 2]: v (Velocity at time t) + +# Parameters will be optimized by BFGS outside this function. +# Number of parameters expected by this model: 10. +# Example initialization: params = np.random.rand(10) + +# EVOLVE-BLOCK-START + +def func(x, params): + """ + Calculates the model output using a linear combination of input variables + or a constant value if no input variables. Operates on a matrix of samples. + + Args: + x (np.ndarray): A 2D numpy array of input variable values, shape (n_samples, n_features). + n_features is 3. + If n_features is 0, x should be shape (n_samples, 0). + The order of columns in x must correspond to: + (x, t, v). + params (np.ndarray): A 1D numpy array of parameters. + Expected length: 10. + + Returns: + np.ndarray: A 1D numpy array of predicted output values, shape (n_samples,). + """ + + result = x[:, 0] * params[0] + x[:, 1] * params[1] + x[:, 2] * params[2] + return result + +# EVOLVE-BLOCK-END + +# This part remains fixed (not evolved) +# It ensures that OpenEvolve can consistently call the evolving function. +def run_search(): + return func + +# Note: The actual structure of initial_program.py is determined by data_api.py. ``` -*Note: The actual structure of `initial_program.py` is determined by `data_api.py`.* ### Evolved Algorithm (Discovered Symbolic Expression) -OpenEvolve will iteratively modify the code within the `[[evolve_start]]` and `[[evolve_end]]` blocks in `initial_program.py`. The aim is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. - -An evolved `symbolic_model` function might, for instance, discover an expression like: -`predictions = np.sin(X_train_scaled[:, 0]) * X_train_scaled[:, 1]**2 + const` -(This is a hypothetical example of what OpenEvolve might find). - ---- - -## ⚙️ Key Configuration & Approach - -* **LLM Models**: - * **Primary Model**: `gpt-4o` (used for sophisticated code generation) - * **Secondary Model**: `o3` (or your configured alternative, potentially for refinements or specific sub-tasks) -* **Evaluation Strategy**: - * Currently, this example **does not use cascade evaluation**. Each evolved program is evaluated directly. Exploring cascade evaluation could be a future enhancement to potentially boost performance. -* **Objective Function**: - * The primary objective is straightforward: **minimize the Mean Squared Error (MSE)** of the model's predictions on the training data. +OpenEvolve will iteratively modify the Python code within the `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` markers in `initial_program.py`. The goal is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. 
---- +An evolved `func` might, for instance, discover a non-linear expression like: -## 📊 Results - -*(This section can be filled in once you have run the benchmarks and gathered performance data.)* +```python +# Hypothetical example of what OpenEvolve might find: +def func(x, params): + # Assuming X_train_scaled maps to x and const maps to a parameter in params + predictions = np.sin(x[:, 0]) * x[:, 1]**2 + params[0] + return predictions +``` -You can present results in a table or through plots, comparing metrics like: -* Achieved MSE on training/test sets. -* Complexity of discovered expressions. -* Performance across different LLM-SRBench task categories (bio, chemical, physics, material). +*(This is a simplified, hypothetical example to illustrate the transformation.)* -Example Table Structure: +------ -| Task Category | Avg. MSE (Train) | Avg. MSE (Test) | Notes | -|---------------|------------------|-----------------|-------| -| Bio | *value* | *value* | | -| Chemical | *value* | *value* | | -| Physics | *value* | *value* | | -| Material | *value* | *value* | | -| **Overall** | **value** | **value** | | +## ⚙️ Key Configuration & Approach -The `eval.py` script will output a JSON file with detailed results suitable for populating such tables or for further analysis. +- LLM Models: + - **Primary Model:** `gpt-4o` (or your configured `primary_model`) is typically used for sophisticated code generation and modification. + - **Secondary Model:** `o3` (or your configured `secondary_model`) can be used for refinements, simpler modifications, or other auxiliary tasks within the evolutionary process. +- Evaluation Strategy: + - Currently, this example employs a direct evaluation strategy (not **cascade evaluation**). +- Objective Function: + - The primary objective is to **minimize the Mean Squared Error (MSE)** between the model's predictions and the true values on the training data. ---- +------ -## 💡 How It Works with OpenEvolve +## 📊 Results -This example highlights several capabilities of OpenEvolve: +The `eval.py` script will help you collect and analyze performance metrics. The LLM-SRBench paper provides a comprehensive comparison of various baselines. For results generated by this specific OpenEvolve example, you should run the evaluation script as described in the "Getting Started" section. -* **Automated Code Evolution**: OpenEvolve directly modifies Python code within specified blocks to search for better solutions. -* **Symbolic Discovery**: Instead of just tuning parameters, OpenEvolve attempts to discover the underlying mathematical structure (the symbolic expression) that best models the data. -* **Adaptability to Benchmarks**: The framework is set up to systematically process multiple tasks from the LLM-SRBench. -* **Leveraging LLMs for Code Generation**: It utilizes powerful LLMs like GPT-4o to propose novel code structures representing mathematical formulas. +For benchmark-wide comparisons and results from other methods, please refer to the official LLM-SRBench paper. ---- +| **Task Category** | Med. NMSE (Test) | Med. R2 (Test) | **Med. NMSE (OOD Test)** | **Med. R2 (OOD Test)** | +| ----------------------- | ---------------- | -------------- | ------------------------ | ---------------------- | +| Chemistry (36 tasks) | 2.3419e-06 | 1.000 | 3.1384e-02 | 0.9686 | +| Physics (44 tasks) | 1.8548e-05 | 1.000 | 7.9255e-04 | 0.9992 | -## 🔮 Next Steps & Future Exploration +Current results are only for two subset of LSR-Synth. We will update the comprehensive results soon. 
-* **Analyze Detailed Results**: Dive into the JSON output from `eval.py` to understand the performance on specific tasks and the nature of the evolved expressions. -* **Implement Cascade Evaluation**: Explore adding a cascade evaluation mechanism where promising programs are subjected to more rigorous or diverse evaluation criteria. -* **Experiment with Different LLMs**: Try swapping the primary and secondary models in `secrets.yaml` or testing newer models as they become available. -* **Modify Evolutionary Parameters**: Adjust settings in the task-specific `config.yaml` files (e.g., population size, number of generations, mutation rates) to see their impact on the discovery process. -* **Explore Different Objectives**: While MSE is standard, consider incorporating other objectives like model complexity (e.g., using a Pareto front or adding a complexity penalty to the fitness function) to find simpler, more interpretable expressions. +------ ---- +## 🤝 Contribution -## 📁 Files in this Example +This OpenEvolve example for LLM-SRBench was implemented by [**Haowei Lin**](https://linhaowei1.github.io/) from Peking University. If you encounter any issues or have questions, please feel free to reach out to Haowei via email (linhaowei@pku.edu.cn) for discussion. -* `data_api.py`: Loads benchmark tasks and generates initial files (`initial_program.py`, `evaluator.py`, `config.yaml`) for each. -* `./problems/`: Contains the raw data files for the LLM-SRBench tasks. -* `./bench/`: Contains Python data classes and helpers for loading and handling LLM-SRBench tasks. -* `eval.py`: Script to evaluate the results from OpenEvolve runs for a subset of tasks. -* `secrets.yaml` (to be created by you): Stores API keys and model preferences. -* `scripts.sh`: Utility script to run OpenEvolve across all configured benchmark tasks. -* `initial_program.py` (generated per task): The starting Python code that OpenEvolve will evolve. -* `evaluator.py` (generated per task): Defines how the evolved programs are evaluated for a specific task. -* `config.yaml` (generated per task): Configuration file for OpenEvolve for a specific task. From c4ccdae30e1ca5394696ea9d2088e65b1ced0fe9 Mon Sep 17 00:00:00 2001 From: Haowei Lin <56622542+linhaowei1@users.noreply.github.com> Date: Fri, 23 May 2025 10:28:33 +0800 Subject: [PATCH 3/3] improve readme fix_readme black fix secret fix readme --- examples/symbolic_regression/README.md | 122 +++++-- .../symbolic_regression/bench/dataclasses.py | 28 +- .../symbolic_regression/bench/datamodules.py | 154 ++++++--- examples/symbolic_regression/data_api.py | 309 +++++++++--------- examples/symbolic_regression/eval.py | 249 ++++++++------ openevolve/controller.py | 4 +- 6 files changed, 532 insertions(+), 334 deletions(-) diff --git a/examples/symbolic_regression/README.md b/examples/symbolic_regression/README.md index 5b9caa509..5040fabe9 100644 --- a/examples/symbolic_regression/README.md +++ b/examples/symbolic_regression/README.md @@ -16,24 +16,10 @@ This example leverages **LLM-SRBench**, a benchmark specifically designed for La Follow these steps to set up and run the symbolic regression benchmark example: -### 1. Configure API Secrets +### 1. Configure API Keys -You'll need to provide your API credentials for the language models used by OpenEvolve. +The API key is read from the environment `OPENAI_API_KEY` by default. The primary and secondary model we used in testing LLM-SRBench is `gpt-4o` and `o3`. You can check `create_config()` in `data_api.py`. 
-- Create a `secrets.yaml` file in the example directory. -- Add your API key and model preferences: - -YAML - -``` -# secrets.yaml -api_key: -api_base: "https://api.openai.com/v1" # Or your custom endpoint -primary_model: "gpt-4o" -secondary_model: "o3" # Or another preferred model for specific tasks -``` - -Replace `` with your actual OpenAI API key. ### 2. Load Benchmark Tasks & Generate Initial Programs @@ -143,19 +129,83 @@ def run_search(): ### Evolved Algorithm (Discovered Symbolic Expression) -OpenEvolve will iteratively modify the Python code within the `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` markers in `initial_program.py`. The goal is to transform the simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) on the training data. +**OpenEvolve** iteratively modifies Python code segments, delineated by `# EVOLVE-BLOCK-START` and `# EVOLVE-BLOCK-END` markers within an `initial_program.py` file. The primary objective is to evolve a simple initial model into a more complex and accurate symbolic expression that minimizes the Mean Squared Error (MSE) against the training data. -An evolved `func` might, for instance, discover a non-linear expression like: +Below is a symbolic expression discovered by OpenEvolve for the physics task `PO10`: ```python -# Hypothetical example of what OpenEvolve might find: +import numpy as np + def func(x, params): - # Assuming X_train_scaled maps to x and const maps to a parameter in params - predictions = np.sin(x[:, 0]) * x[:, 1]**2 + params[0] - return predictions + """ + Calculates the model output using a linear combination of input variables + or a constant value if no input variables. Operates on a matrix of samples. + + Args: + x (np.ndarray): A 2D numpy array of input variable values, shape (n_samples, n_features). + n_features is 2. + If n_features is 0, x should be shape (n_samples, 0). + The order of columns in x must correspond to: + (x, t). + params (np.ndarray): A 1D numpy array of parameters. + Expected length: 10. + + Returns: + np.ndarray: A 1D numpy array of predicted output values, shape (n_samples,). + """ + # -------------------------------------------------------------------------- + # Allow for flexible parameter count, only padding essential parts. + if len(params) < 10: + required_params = params.shape[0] + params = np.pad(params, (0, 10 - required_params)) + + # Readable aliases for the two input features + pos = x[:, 0] # position x(t) + t_val = x[:, 1] # time t + + # ---------- Internal restoring forces (Duffing-like) ------------------ + # −k x −β x³ −γ x⁵ (only odd powers, respecting the usual symmetry) + # Reduced polynomial order (up to cubic) to avoid over-fitting while + # still capturing the essential softening/stiffening behaviour. + restoring = -(params[0] * pos + params[1] * pos**3) + + # ---------- Externally forced, periodically driven term -------------- + # A e^{-λ t} sin(ω t) + B cos(Ω t) (General form considered) + # Let the optimiser decide whether the envelope should grow + # or decay by keeping the sign of params[4]. The exponent is + # clipped to avoid numerical overflow. + # Simple periodic forcing without exponential envelope. This is + # sufficient for many driven oscillator benchmarks and reduces the + # risk of numerical overflow in exp(). 
+ trig1 = params[3] * t_val + trig2 = params[5] * t_val + forcing = params[2] * np.cos(trig1) + params[4] * np.sin(trig2) + + # ---------- Weak position–time coupling & constant bias --------------- + interaction = params[8] * pos * t_val + bias = params[9] + + return restoring + forcing + interaction + bias ``` -*(This is a simplified, hypothetical example to illustrate the transformation.)* +The ground truth for this PO10 task is represented by the equation: + +$F_0sin(t)−ω_0^2(γt+1)x(t)−ω_0^2x(t)^3−ω_0^2x(t).$ + +This can be expanded and simplified to: + +$F_0sin(t)−ω_0^2γtx(t)−2ω_0^2x(t)−ω_0^2x(t)^3.$ + +Notably, the core functional forms present in this ground truth equation are captured by the evolved symbolic expression: + +- The $sin(t)$ component can be represented by `params[4] * np.sin(params[5] * t_val)`. +- The linear $x(t)$ term corresponds to `params[0] * pos`. +- The cubic $x(t)^3$ term is `params[1] * pos**3`. +- The interaction term $t⋅x(t)$ is captured by `params[8] * pos * t_val`. + +The evolved code also includes terms like `params[2] * np.cos(params[3] * t_val)` (a cosine forcing term) and `params[9]` (a constant bias). These might evolve to have negligible parameter values if not supported by the data, or they could capture secondary effects or noise. The inclusion of the primary terms demonstrates OpenEvolve's strength in identifying the correct underlying structure of the equation. + +*Note: Symbolic regression, despite such promising results, remains a very challenging task. This difficulty largely stems from the inherent complexities of inferring precise mathematical models from finite and potentially noisy training data, which provides only a partial observation of the true underlying system.* ------ @@ -177,12 +227,28 @@ The `eval.py` script will help you collect and analyze performance metrics. The For benchmark-wide comparisons and results from other methods, please refer to the official LLM-SRBench paper. -| **Task Category** | Med. NMSE (Test) | Med. R2 (Test) | **Med. NMSE (OOD Test)** | **Med. 
R2 (OOD Test)** | -| ----------------------- | ---------------- | -------------- | ------------------------ | ---------------------- | -| Chemistry (36 tasks) | 2.3419e-06 | 1.000 | 3.1384e-02 | 0.9686 | -| Physics (44 tasks) | 1.8548e-05 | 1.000 | 7.9255e-04 | 0.9992 | +*Note: Below we extract the approximate results of baselines in Fig.5 from LLMSR-Bench paper.* + +**Median NMSE (Test Set)** + +| **Domain** | **Direct** | **LLMSR** | **LaSR** | **SGA** | **OpenEvolve** | +| ---------------- | ----------- | --------------- | ----------- | ----------- | -------------- | +| Chemistry | ~6.0 × 10⁻¹ | **~1.5 × 10⁻⁶** | ~1.0 × 10⁻⁴ | ~1.0 × 10⁻² | 2.34 × 10⁻⁶ | +| Biology | ~2.0 × 10⁻² | ~1.0 × 10⁻⁵ | ~1.0 × 10⁻⁴ | ~2.0 × 10⁻⁴ | – | +| Physics | ~3.0 × 10⁻¹ | **~2.0 × 10⁻⁷** | ~1.0 × 10⁻³ | ~4.0 × 10⁻³ | 1.85 × 10⁻⁵ | +| Material Science | ~3.0 × 10⁻¹ | ~1.0 × 10⁻⁴ | ~7.0 × 10⁻⁴ | ~3.0 × 10⁻² | – | + +**Median NMSE (OOD Test Set)** + +| **Domain** | **Direct** | **LLMSR** | **LaSR** | **SGA** | **OpenEvolve** | +| ---------------- | ---------- | ----------- | ----------- | ---------- | --------------- | +| Chemistry | ~3.0 × 10² | ~5.0 × 10⁻² | ~1.0 × 10⁰ | ~1.5 × 10⁰ | **3.14 × 10⁻²** | +| Biology | ~1.2 × 10² | ~4.0 × 10⁰ | ~3.0 × 10¹ | ~4.0 × 10¹ | – | +| Physics | ~1.0 × 10¹ | ~1.0 × 10⁻³ | ~5.0 × 10⁻² | ~1.0 × 10⁰ | **7.93 × 10⁻⁴** | +| Material Science | ~2.5 × 10¹ | ~3.0 × 10⁰ | ~8.0 × 10⁰ | ~2.5 × 10¹ | – | + +Current results for OpenEvolve are only for two subsets of LSR-Synth. We will update the comprehensive results soon. -Current results are only for two subset of LSR-Synth. We will update the comprehensive results soon. ------ diff --git a/examples/symbolic_regression/bench/dataclasses.py b/examples/symbolic_regression/bench/dataclasses.py index 83082aca4..f2b74233d 100755 --- a/examples/symbolic_regression/bench/dataclasses.py +++ b/examples/symbolic_regression/bench/dataclasses.py @@ -15,11 +15,13 @@ class Equation: lambda_format: Optional[callable] = None program_format: Optional[str] = None + @dataclass class SearchResult: equation: Equation aux: Any + @dataclass class SEDTask: name: str @@ -29,6 +31,7 @@ class SEDTask: samples: Any desc: Optional[str] = None + @dataclass class Problem: dataset_identifier: str @@ -37,20 +40,23 @@ class Problem: samples: Any def create_task(self) -> SEDTask: - return SEDTask(name=self.equation_idx, - symbols=self.gt_equation.symbols, - symbol_descs=self.gt_equation.symbol_descs, - symbol_properties=self.gt_equation.symbol_properties, - samples=self.train_samples, - desc=self.gt_equation.desc) + return SEDTask( + name=self.equation_idx, + symbols=self.gt_equation.symbols, + symbol_descs=self.gt_equation.symbol_descs, + symbol_properties=self.gt_equation.symbol_properties, + samples=self.train_samples, + desc=self.gt_equation.desc, + ) + @property def train_samples(self): - return self.samples['train'] - + return self.samples["train"] + @property def test_samples(self): - return self.samples['test'] - + return self.samples["test"] + @property def ood_test_samples(self): - return self.samples.get('ood_test', None) \ No newline at end of file + return self.samples.get("ood_test", None) diff --git a/examples/symbolic_regression/bench/datamodules.py b/examples/symbolic_regression/bench/datamodules.py index d2a8dff0b..60e2dff77 100755 --- a/examples/symbolic_regression/bench/datamodules.py +++ b/examples/symbolic_regression/bench/datamodules.py @@ -14,124 +14,174 @@ REPO_ID = "nnheui/llm-srbench" + def _download(repo_id): return 
snapshot_download(repo_id=repo_id, repo_type="dataset") + class TransformedFeynmanDataModule: def __init__(self): self._dataset_dir = None - self._dataset_identifier = 'lsr_transform' - + self._dataset_identifier = "lsr_transform" + def setup(self): self._dataset_dir = Path(_download(repo_id=REPO_ID)) - ds = datasets.load_dataset(REPO_ID)['lsr_transform'] + ds = datasets.load_dataset(REPO_ID)["lsr_transform"] sample_h5file_path = self._dataset_dir / "lsr_bench_data.hdf5" self.problems = [] with h5py.File(sample_h5file_path, "r") as sample_file: for e in ds: - samples = {k:v[...].astype(np.float64) for k,v in sample_file[f'/lsr_transform/{e["name"]}'].items()} - self.problems.append(Problem(dataset_identifier=self._dataset_identifier, - equation_idx = e['name'], - gt_equation=Equation( - symbols=e['symbols'], - symbol_descs=e['symbol_descs'], - symbol_properties=e['symbol_properties'], - expression=e['expression'], - ), - samples=samples) + samples = { + k: v[...].astype(np.float64) + for k, v in sample_file[f'/lsr_transform/{e["name"]}'].items() + } + self.problems.append( + Problem( + dataset_identifier=self._dataset_identifier, + equation_idx=e["name"], + gt_equation=Equation( + symbols=e["symbols"], + symbol_descs=e["symbol_descs"], + symbol_properties=e["symbol_properties"], + expression=e["expression"], + ), + samples=samples, + ) ) - self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + self.name2id = {p.equation_idx: i for i, p in enumerate(self.problems)} @property def name(self): return "LSR_Transform" + class SynProblem(Problem): @property def train_samples(self): - return self.samples['train_data'] - + return self.samples["train_data"] + @property def test_samples(self): - return self.samples['id_test_data'] - + return self.samples["id_test_data"] + @property def ood_test_samples(self): - return self.samples['ood_test_data'] + return self.samples["ood_test_data"] + class BaseSynthDataModule: - def __init__(self, dataset_identifier, short_dataset_identifier, root, default_symbols = None, default_symbol_descs=None): + def __init__( + self, + dataset_identifier, + short_dataset_identifier, + root, + default_symbols=None, + default_symbol_descs=None, + ): self._dataset_dir = Path(root) self._dataset_identifier = dataset_identifier self._short_dataset_identifier = short_dataset_identifier self._default_symbols = default_symbols self._default_symbol_descs = default_symbol_descs - + def setup(self): self._dataset_dir = Path(_download(repo_id=REPO_ID)) - ds = datasets.load_dataset(REPO_ID)[f'lsr_synth_{self._dataset_identifier}'] + ds = datasets.load_dataset(REPO_ID)[f"lsr_synth_{self._dataset_identifier}"] sample_h5file_path = self._dataset_dir / "lsr_bench_data.hdf5" self.problems = [] with h5py.File(sample_h5file_path, "r") as sample_file: for e in ds: - samples = {k:v[...].astype(np.float64) for k,v in sample_file[f'/lsr_synth/{self._dataset_identifier}/{e["name"]}'].items()} - self.problems.append(Problem(dataset_identifier=self._dataset_identifier, - equation_idx = e['name'], - gt_equation=Equation( - symbols=e['symbols'], - symbol_descs=e['symbol_descs'], - symbol_properties=e['symbol_properties'], - expression=e['expression'], - ), - samples=samples) + samples = { + k: v[...].astype(np.float64) + for k, v in sample_file[ + f'/lsr_synth/{self._dataset_identifier}/{e["name"]}' + ].items() + } + self.problems.append( + Problem( + dataset_identifier=self._dataset_identifier, + equation_idx=e["name"], + gt_equation=Equation( + symbols=e["symbols"], + 
symbol_descs=e["symbol_descs"], + symbol_properties=e["symbol_properties"], + expression=e["expression"], + ), + samples=samples, + ) ) - self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + self.name2id = {p.equation_idx: i for i, p in enumerate(self.problems)} - - self.name2id = {p.equation_idx: i for i,p in enumerate(self.problems)} + self.name2id = {p.equation_idx: i for i, p in enumerate(self.problems)} @property def name(self): return self._dataset_identifier + class MatSciDataModule(BaseSynthDataModule): def __init__(self, root): super().__init__("matsci", "MatSci", root) + class ChemReactKineticsDataModule(BaseSynthDataModule): def __init__(self, root): - super().__init__("chem_react", "CRK", root, - default_symbols=['dA_dt', 't', 'A'], - default_symbol_descs=['Rate of change of concentration in chemistry reaction kinetics', 'Time', 'Concentration at time t']) - + super().__init__( + "chem_react", + "CRK", + root, + default_symbols=["dA_dt", "t", "A"], + default_symbol_descs=[ + "Rate of change of concentration in chemistry reaction kinetics", + "Time", + "Concentration at time t", + ], + ) + + class BioPopGrowthDataModule(BaseSynthDataModule): def __init__(self, root): - super().__init__("bio_pop_growth", "BPG", root, - default_symbols=['dP_dt', 't', 'P'], - default_symbol_descs=['Population growth rate', 'Time', 'Population at time t']) - + super().__init__( + "bio_pop_growth", + "BPG", + root, + default_symbols=["dP_dt", "t", "P"], + default_symbol_descs=["Population growth rate", "Time", "Population at time t"], + ) + + class PhysOscilDataModule(BaseSynthDataModule): def __init__(self, root): - super().__init__("phys_osc", "PO", root, - default_symbols=['dv_dt', 'x', 't', 'v'], - default_symbol_descs=['Acceleration in Nonl-linear Harmonic Oscillator', 'Position at time t', 'Time', 'Velocity at time t']) + super().__init__( + "phys_osc", + "PO", + root, + default_symbols=["dv_dt", "x", "t", "v"], + default_symbol_descs=[ + "Acceleration in Nonl-linear Harmonic Oscillator", + "Position at time t", + "Time", + "Velocity at time t", + ], + ) + def get_datamodule(name, root_folder): - if name == 'bio_pop_growth': + if name == "bio_pop_growth": root = root_folder or "datasets/lsr-synth-bio" return BioPopGrowthDataModule(root) - elif name == 'chem_react': + elif name == "chem_react": root = root_folder or "datasets/lsr-synth-chem" return ChemReactKineticsDataModule(root) - elif name == 'matsci': + elif name == "matsci": root = root_folder or "datasets/lsr-synth-matsci" return MatSciDataModule(root) - elif name == 'phys_osc': + elif name == "phys_osc": root = root_folder or "datasets/lsr-synth-phys" return PhysOscilDataModule(root) # elif name == 'feynman': # return FeynmanDataModule() - elif name == 'lsrtransform': + elif name == "lsrtransform": return TransformedFeynmanDataModule() else: - raise ValueError(f"Unknown datamodule name: {name}") \ No newline at end of file + raise ValueError(f"Unknown datamodule name: {name}") diff --git a/examples/symbolic_regression/data_api.py b/examples/symbolic_regression/data_api.py index a4be4fca7..8b33d21ff 100755 --- a/examples/symbolic_regression/data_api.py +++ b/examples/symbolic_regression/data_api.py @@ -18,15 +18,15 @@ def load_secret(secrets_file: str = "secrets.yaml") -> Dict[str, Any]: """ Load API keys and configuration from a secrets file. 
- + Args: secrets_file: Path to the YAML secrets file - + Returns: Dictionary containing secret configuration, empty dict if file not found """ try: - with open(secrets_file, 'r') as f: + with open(secrets_file, "r") as f: return yaml.safe_load(f) except FileNotFoundError: print(f"Warning: Secrets file '{secrets_file}' not found.") @@ -36,31 +36,33 @@ def load_secret(secrets_file: str = "secrets.yaml") -> Dict[str, Any]: return {} -def extract_problem_data_from_initialized_dataset(initialized_dataset, problem_id: int) -> Dict[str, Any]: +def extract_problem_data_from_initialized_dataset( + initialized_dataset, problem_id: int +) -> Dict[str, Any]: """ Extract data for a specific problem from an initialized dataset. - + Args: initialized_dataset: Pre-initialized and setup dataset object problem_id: Index of the problem to extract - + Returns: Dictionary containing problem data including train/test samples, symbols, and metadata """ problem = initialized_dataset.problems[problem_id] gt_eq = problem.gt_equation samples = problem.samples - + data = { - 'train': samples['train'], - 'test': samples['test'], - 'ood_test': samples.get('ood_test', None), - 'symbols': gt_eq.symbols, - 'symbol_descs': gt_eq.symbol_descs, - 'symbol_properties': gt_eq.symbol_properties, - 'expression': gt_eq.expression, - 'dataset_identifier': problem.dataset_identifier, - 'equation_idx': problem.equation_idx, + "train": samples["train"], + "test": samples["test"], + "ood_test": samples.get("ood_test", None), + "symbols": gt_eq.symbols, + "symbol_descs": gt_eq.symbol_descs, + "symbol_properties": gt_eq.symbol_properties, + "expression": gt_eq.expression, + "dataset_identifier": problem.dataset_identifier, + "equation_idx": problem.equation_idx, } return data @@ -68,40 +70,40 @@ def extract_problem_data_from_initialized_dataset(initialized_dataset, problem_i def create_program(problem: Dict[str, Any]) -> str: """ Create a Python script with a naive linear model for symbolic regression. - + The generated script contains a `func(x, params)` that computes predictions in a vectorized manner: x @ params. If no input features exist, it predicts a constant params[0]. 
- + Args: problem: Dictionary containing problem data - + Returns: Path to the created program file """ problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' - + # Parse symbols and properties - symbols = problem['symbols'] - properties = problem['symbol_properties'] - descs = problem['symbol_descs'] - + symbols = problem["symbols"] + properties = problem["symbol_properties"] + descs = problem["symbol_descs"] + input_vars = [] input_vars_descs = [] output_var = None output_var_desc = "N/A" - + for i, prop in enumerate(properties): - if prop == 'V': + if prop == "V": input_vars.append(symbols[i]) input_vars_descs.append(descs[i]) - elif prop == 'O': + elif prop == "O": output_var = symbols[i] output_var_desc = descs[i] - + if not output_var: raise ValueError("No output variable ('O') found in symbol_properties.") - + # Build input variable mapping comments x_mapping_comments = ["# Input variable mapping for x (columns of the input matrix):"] if not input_vars: @@ -110,20 +112,22 @@ def create_program(problem: Dict[str, Any]) -> str: for i, var_name in enumerate(input_vars): x_mapping_comments.append(f"# x[:, {i}]: {var_name} ({input_vars_descs[i]})") x_mapping_str = "\n".join(x_mapping_comments) - + # Build function body num_features = len(input_vars) if num_features > 0: - function_body = ' + '.join([f"x[:, {i}] * params[{i}]" for i in range(num_features)]) + function_body = " + ".join([f"x[:, {i}] * params[{i}]" for i in range(num_features)]) else: - function_body = "np.full(x.shape[0], params[0]) # Predicts a constant value for all samples" - + function_body = ( + "np.full(x.shape[0], params[0]) # Predicts a constant value for all samples" + ) + model_num_params = 10 - + # Build input variables description input_vars_desc_list = [f"{v} ({input_vars_descs[i]})" for i, v in enumerate(input_vars)] - input_vars_desc_str = ', '.join(input_vars_desc_list) if input_vars else "None" - + input_vars_desc_str = ", ".join(input_vars_desc_list) if input_vars else "None" + program_content = f'''""" Initial program: A naive linear model for symbolic regression. This model predicts the output as a linear combination of input variables @@ -169,50 +173,50 @@ def func(x, params): def run_search(): return func ''' - + os.makedirs(problem_dir, exist_ok=True) file_path = os.path.join(problem_dir, "initial_program.py") with open(file_path, "w") as f: f.write(program_content) - + return file_path def create_evaluator(problem: Dict[str, Any]) -> str: """ Create an evaluator script for the symbolic regression problem. - + The evaluator assesses model performance using BFGS optimization and computes various metrics including MSE and combined scores. 
- + Args: problem: Dictionary containing problem data - + Returns: Path to the created evaluator file """ problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' os.makedirs(problem_dir, exist_ok=True) - + # Extract data arrays - symbols = problem['symbols'] - properties = problem['symbol_properties'] - train_samples = np.asarray(problem['train']) - test_samples = np.asarray(problem['test']) - ood_test_samples = problem['ood_test'] + symbols = problem["symbols"] + properties = problem["symbol_properties"] + train_samples = np.asarray(problem["train"]) + test_samples = np.asarray(problem["test"]) + ood_test_samples = problem["ood_test"] if ood_test_samples is not None: ood_test_samples = np.asarray(ood_test_samples) - + # Find input and output indices - input_indices = [i for i, prop in enumerate(properties) if prop == 'V'] - output_indices = [i for i, prop in enumerate(properties) if prop == 'O'] - + input_indices = [i for i, prop in enumerate(properties) if prop == "V"] + output_indices = [i for i, prop in enumerate(properties) if prop == "O"] + if not output_indices: raise ValueError("No output variable ('O') found in symbol_properties.") if len(output_indices) > 1: raise ValueError("Multiple output variables ('O') found. Evaluator supports single output.") output_index = output_indices[0] - + # Prepare data arrays if not input_indices: X_train = np.empty((len(train_samples), 0)) @@ -222,32 +226,32 @@ def create_evaluator(problem: Dict[str, Any]) -> str: X_train = train_samples[:, input_indices] X_test = test_samples[:, input_indices] X_ood_test = ood_test_samples[:, input_indices] if ood_test_samples is not None else None - + y_train = train_samples[:, output_index] y_test = test_samples[:, output_index] y_ood_test = ood_test_samples[:, output_index] if ood_test_samples is not None else None - + num_input_features = len(input_indices) model_num_params_expected = 10 - + # Save data files - base_data_path = './' - x_train_path = os.path.join(base_data_path, problem_dir, 'X_train_for_eval.npy') - y_train_path = os.path.join(base_data_path, problem_dir, 'y_train_for_eval.npy') + base_data_path = "./" + x_train_path = os.path.join(base_data_path, problem_dir, "X_train_for_eval.npy") + y_train_path = os.path.join(base_data_path, problem_dir, "y_train_for_eval.npy") np.save(x_train_path, X_train) np.save(y_train_path, y_train) - - x_test_path = os.path.join(problem_dir, 'X_test_for_eval.npy') - y_test_path = os.path.join(problem_dir, 'y_test_for_eval.npy') + + x_test_path = os.path.join(problem_dir, "X_test_for_eval.npy") + y_test_path = os.path.join(problem_dir, "y_test_for_eval.npy") np.save(x_test_path, X_test) np.save(y_test_path, y_test) - + if X_ood_test is not None and y_ood_test is not None: - x_ood_test_path = os.path.join(problem_dir, 'X_ood_test_for_eval.npy') - y_ood_test_path = os.path.join(problem_dir, 'y_ood_test_for_eval.npy') + x_ood_test_path = os.path.join(problem_dir, "X_ood_test_for_eval.npy") + y_ood_test_path = os.path.join(problem_dir, "y_ood_test_for_eval.npy") np.save(x_ood_test_path, X_ood_test) np.save(y_ood_test_path, y_ood_test) - + evaluator_script_content = f'''""" Evaluator for a symbolic regression model. It assesses a model program based on its performance on training data. 
@@ -506,47 +510,49 @@ def evaluate(program_path): else: print(f" {{key}}: {{value}}") ''' - + evaluator_file_path = os.path.join(problem_dir, "evaluator.py") with open(evaluator_file_path, "w") as f: f.write(evaluator_script_content) - + return evaluator_file_path def create_config(problem: Dict[str, Any]) -> str: """ Create a YAML configuration file for the symbolic regression task. - + Args: problem: Dictionary containing problem data - + Returns: Path to the created configuration file """ problem_dir = f'problems/{problem["dataset_identifier"]}/{problem["equation_idx"]}' os.makedirs(problem_dir, exist_ok=True) config_file_path = os.path.join(problem_dir, "config.yaml") - + # Parse variables - symbols = problem['symbols'] - properties = problem['symbol_properties'] - descs = problem['symbol_descs'] - + symbols = problem["symbols"] + properties = problem["symbol_properties"] + descs = problem["symbol_descs"] + input_vars_list = [] output_var_list = [] - + for i, prop in enumerate(properties): - if prop == 'V': + if prop == "V": input_vars_list.append(f"{symbols[i]} ({descs[i]})") - elif prop == 'O': + elif prop == "O": output_var_list.append(f"{symbols[i]} ({descs[i]})") - + input_vars_str = ", ".join(input_vars_list) if input_vars_list else "None" - output_var_str = ", ".join(output_var_list) if output_var_list else "None (Error: No output defined!)" - + output_var_str = ( + ", ".join(output_var_list) if output_var_list else "None (Error: No output defined!)" + ) + num_initial_params = 10 - + system_message = ( "Your task is to evolve a Python function `func(x, params)` that models a scientific process, " "considering the physical meaning and relationships of inputs, " @@ -569,7 +575,7 @@ def create_config(problem: Dict[str, Any]) -> str: "Model performance (score = -log_10(mse)) will be evaluated on a held-out dataset. " "Ensure the model is free of potential numerical errors (e.g., log0, division by 0)." 
) - + secret = load_secret() config_data = { "# Configuration for Symbolic Regression Task": f"{problem['dataset_identifier']}/{problem['equation_idx']}", @@ -577,22 +583,18 @@ def create_config(problem: Dict[str, Any]) -> str: "log_level": "INFO", "target_score": "combined_score", "checkpoint_interval": 10, - "llm": { - "primary_model": secret.get('primary_model', "gpt-4o"), + "primary_model": "gpt-4o", "primary_model_weight": 0.8, - "secondary_model": secret.get('secondary_model', "o3-mini"), + "secondary_model": "o3", "secondary_model_weight": 0.2, - "api_key": secret.get('api_key', "YOUR_API_KEY_PLACEHOLDER"), - "api_base": secret.get('api_base', "https://api.dwyu.top/v1") + "api_base": "https://api.openai.com/v1", }, - "prompt": { "system_message": system_message, "num_top_programs": 4, "use_template_stochasticity": True, }, - "database": { "population_size": 70, "archive_size": 30, @@ -600,7 +602,6 @@ def create_config(problem: Dict[str, Any]) -> str: "elite_selection_ratio": 0.3, "exploitation_ratio": 0.6, }, - "evaluator": { "timeout": 90, "cascade_evaluation": False, @@ -608,146 +609,162 @@ def create_config(problem: Dict[str, Any]) -> str: "parallel_evaluations": 4, "use_llm_feedback": False, }, - "diff_based_evolution": True, "allow_full_rewrites": False, } - + class PreserveNewlinesDumper(yaml.SafeDumper): """Custom YAML dumper that preserves multi-line strings.""" + def represent_scalar(self, tag, value, style=None): - if style is None and isinstance(value, str) and '\n' in value: - style = '|' + if style is None and isinstance(value, str) and "\n" in value: + style = "|" return super().represent_scalar(tag, value, style) - + with open(config_file_path, "w") as f: - yaml.dump(config_data, f, Dumper=PreserveNewlinesDumper, - default_flow_style=False, sort_keys=False, indent=2) - + yaml.dump( + config_data, + f, + Dumper=PreserveNewlinesDumper, + default_flow_style=False, + sort_keys=False, + indent=2, + ) + return config_file_path def process_problem(initialized_dataset, problem_id: int, split_name: str) -> str: """ Process a single problem using a pre-initialized dataset. - + Loads specific problem data, creates program, evaluator, and config. Skips processing if essential output files already exist. 
- + Args: initialized_dataset: Pre-initialized and setup dataset object problem_id: Index of the problem to process split_name: Name of the dataset split - + Returns: Status message indicating success, skip, or error """ try: - problem_data = extract_problem_data_from_initialized_dataset(initialized_dataset, problem_id) - - dataset_identifier = problem_data['dataset_identifier'] - equation_idx = problem_data['equation_idx'] - problem_dir = os.path.join('problems', dataset_identifier, str(equation_idx)) - base_data_path = './' - + problem_data = extract_problem_data_from_initialized_dataset( + initialized_dataset, problem_id + ) + + dataset_identifier = problem_data["dataset_identifier"] + equation_idx = problem_data["equation_idx"] + problem_dir = os.path.join("problems", dataset_identifier, str(equation_idx)) + base_data_path = "./" + # Check if all essential files already exist essential_files = [ os.path.join(problem_dir, "initial_program.py"), os.path.join(problem_dir, "evaluator.py"), os.path.join(problem_dir, "config.yaml"), - os.path.join(base_data_path, problem_dir, 'X_train_for_eval.npy'), - os.path.join(base_data_path, problem_dir, 'y_train_for_eval.npy'), - os.path.join(problem_dir, 'X_test_for_eval.npy'), - os.path.join(problem_dir, 'y_test_for_eval.npy'), + os.path.join(base_data_path, problem_dir, "X_train_for_eval.npy"), + os.path.join(base_data_path, problem_dir, "y_train_for_eval.npy"), + os.path.join(problem_dir, "X_test_for_eval.npy"), + os.path.join(problem_dir, "y_test_for_eval.npy"), ] - + # Add OOD test files if applicable - if problem_data.get('ood_test') is not None: - essential_files.extend([ - os.path.join(problem_dir, 'X_ood_test_for_eval.npy'), - os.path.join(problem_dir, 'y_ood_test_for_eval.npy') - ]) - + if problem_data.get("ood_test") is not None: + essential_files.extend( + [ + os.path.join(problem_dir, "X_ood_test_for_eval.npy"), + os.path.join(problem_dir, "y_ood_test_for_eval.npy"), + ] + ) + # Check if all files exist all_files_exist = all(os.path.exists(f) for f in essential_files) - + if all_files_exist: return f"Skipped (already processed): problem_id: {problem_id} for split: {split_name} ({dataset_identifier}/{equation_idx})" - + # Create necessary files create_program(problem_data) create_evaluator(problem_data) create_config(problem_data) - + return f"Successfully processed problem_id: {problem_id} for split: {split_name} ({dataset_identifier}/{equation_idx})" - + except Exception as e: import traceback + return f"Error processing problem_id {problem_id} for split {split_name}: {str(e)}\n{traceback.format_exc()}" def main(): """ Main entry point for processing symbolic regression problems. - + Initializes datasets and processes problems in parallel using multiprocessing. 
""" # Determine number of processes to use num_cores_available = os.cpu_count() num_processes = min(max(1, (num_cores_available - 1) if num_cores_available else 4), 24) - + print(f"Starting processing with {num_processes} processes...") - + # Define dataset splits and their problem counts splits_data = { - 'bio_pop_growth': 24, - 'chem_react': 36, - 'matsci': 25, - 'phys_osc': 44, + "bio_pop_growth": 24, + "chem_react": 36, + "matsci": 25, + "phys_osc": 44, # 'lsrtransform': 111 # Uncomment to include this split } - + all_tasks = [] - + # Initialize datasets and prepare tasks for split_name, num_problems in splits_data.items(): print(f"\nInitializing dataset for split: {split_name}...") - dataset_root_folder = f'dataset/{split_name}' - + dataset_root_folder = f"dataset/{split_name}" + try: # Initialize and setup dataset once per split initialized_dataset = get_datamodule(split_name, dataset_root_folder) initialized_dataset.setup() print(f"Dataset for {split_name} initialized and setup complete.") - + # Prepare tasks for this split print(f"Preparing tasks for split: {split_name} ({num_problems} problems)") for problem_id in range(num_problems): all_tasks.append((initialized_dataset, problem_id, split_name)) - + except Exception as e: - print(f"ERROR: Could not initialize or setup dataset for split {split_name}. Skipping this split.") + print( + f"ERROR: Could not initialize or setup dataset for split {split_name}. Skipping this split." + ) print(f"Details: {e}") import traceback + traceback.print_exc() continue - + if not all_tasks: - print("No tasks to process. This could be due to errors in dataset initialization. Exiting.") + print( + "No tasks to process. This could be due to errors in dataset initialization. Exiting." + ) return - + print(f"\nTotal tasks to process across all successfully initialized splits: {len(all_tasks)}") - + # Process tasks in parallel with multiprocessing.Pool(processes=num_processes) as pool: results = pool.starmap(process_problem, all_tasks) - + # Print results summary print("\n--- Processing Complete ---") success_count = 0 skipped_count = 0 error_count = 0 - + for result in results: print(result) if "Successfully processed" in result: @@ -756,10 +773,10 @@ def main(): skipped_count += 1 elif "Error processing" in result: error_count += 1 - + print(f"\nSummary: {success_count} successful, {skipped_count} skipped, {error_count} errors.") print("\nAll tasks finished.") -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/examples/symbolic_regression/eval.py b/examples/symbolic_regression/eval.py index a662092e3..fa68caa80 100755 --- a/examples/symbolic_regression/eval.py +++ b/examples/symbolic_regression/eval.py @@ -1,18 +1,22 @@ -from typing import Dict, Any # List removed as it's not used +from typing import Dict, Any # List removed as it's not used import json import os from pathlib import Path import numpy as np + # import time # Not used from scipy.stats import kendalltau from sklearn.metrics import mean_absolute_percentage_error from scipy.optimize import minimize import importlib.util import sys + # import traceback # Not used # import json # Not used # Example custom JSON encoder if you need to save results with numpy types import json + + class NumpyFloatJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.integer): @@ -22,7 +26,8 @@ def default(self, obj): elif isinstance(obj, np.ndarray): return obj.tolist() return super(NumpyFloatJSONEncoder, self).default(obj) - 
+ + def compute_output_base_metrics(y_pred: np.ndarray, y: np.ndarray) -> Dict[str, Any]: """ Computes base metrics after filtering NaNs from predictions. @@ -39,66 +44,85 @@ def compute_output_base_metrics(y_pred: np.ndarray, y: np.ndarray) -> Dict[str, y_1d = y_1d.reshape(1) base_metrics_nan = { - "mse": float('nan'), "nmse": float('nan'), "r2": float('nan'), - "kdt": float('nan'), "mape": float('nan'), "num_valid_points": 0 + "mse": float("nan"), + "nmse": float("nan"), + "r2": float("nan"), + "kdt": float("nan"), + "mape": float("nan"), + "num_valid_points": 0, } if y_pred_1d.shape != y_1d.shape and not (y_pred_1d.size == 0 and y_1d.size == 0): - return {**base_metrics_nan, "error": "y_pred and y have incompatible shapes after ensuring 1D."} + return { + **base_metrics_nan, + "error": "y_pred and y have incompatible shapes after ensuring 1D.", + } nonnan_mask = ~np.isnan(y_pred_1d) y_pred_filtered = y_pred_1d[nonnan_mask] y_filtered = y_1d[nonnan_mask] - if y_pred_filtered.size == 0: # All predictions were NaN or inputs were empty - return {**base_metrics_nan, "error": "All predictions are NaN or no data to compare after filtering."} + if y_pred_filtered.size == 0: # All predictions were NaN or inputs were empty + return { + **base_metrics_nan, + "error": "All predictions are NaN or no data to compare after filtering.", + } - mse = np.mean((y_filtered - y_pred_filtered)**2) + mse = np.mean((y_filtered - y_pred_filtered) ** 2) var_y = np.var(y_filtered) if var_y == 0: - nmse = 0.0 if mse == 0 else float('inf') # Consistent if true values are constant + nmse = 0.0 if mse == 0 else float("inf") # Consistent if true values are constant else: nmse = mse / var_y - - sum_sq_res = np.sum((y_filtered - y_pred_filtered)**2) - sum_sq_total = np.sum((y_filtered - np.mean(y_filtered))**2) # Use mean of filtered y - if sum_sq_total == 0: # True values (after filtering) are constant - r2 = 1.0 if sum_sq_res == 0 else -float('inf') # Or 0.0 if mse is also 0, definition varies. Sklearn uses 1.0. + sum_sq_res = np.sum((y_filtered - y_pred_filtered) ** 2) + sum_sq_total = np.sum((y_filtered - np.mean(y_filtered)) ** 2) # Use mean of filtered y + + if sum_sq_total == 0: # True values (after filtering) are constant + r2 = ( + 1.0 if sum_sq_res == 0 else -float("inf") + ) # Or 0.0 if mse is also 0, definition varies. Sklearn uses 1.0. else: r2 = 1 - (sum_sq_res / sum_sq_total) - kdt = float('nan') + kdt = float("nan") try: - if y_filtered.size >= 2: # Kendall's tau requires at least 2 points + if y_filtered.size >= 2: # Kendall's tau requires at least 2 points kdt_val, _ = kendalltau(y_filtered, y_pred_filtered) - kdt = float(kdt_val) # Ensure it's a basic float (handles np.nan) + kdt = float(kdt_val) # Ensure it's a basic float (handles np.nan) # If size < 2, kdt remains float('nan') - except ValueError: # Should be less common with size check, but as a fallback - kdt = float('nan') # Explicitly set, though already NaN. + except ValueError: # Should be less common with size check, but as a fallback + kdt = float("nan") # Explicitly set, though already NaN. 
- mape = float('nan') + mape = float("nan") try: valid_mape_indices = y_filtered != 0 if np.sum(valid_mape_indices) > 0: - mape = mean_absolute_percentage_error(y_filtered[valid_mape_indices], y_pred_filtered[valid_mape_indices]) - elif y_filtered.size > 0: # All true values are zero - mape = 0.0 if np.all(y_pred_filtered == 0) else float('inf') + mape = mean_absolute_percentage_error( + y_filtered[valid_mape_indices], y_pred_filtered[valid_mape_indices] + ) + elif y_filtered.size > 0: # All true values are zero + mape = 0.0 if np.all(y_pred_filtered == 0) else float("inf") # If y_filtered.size is 0, mape remains float('nan') - except ValueError: # Fallback for any other MAPE calculation issues - mape = float('nan') + except ValueError: # Fallback for any other MAPE calculation issues + mape = float("nan") return { "mse": float(mse), "nmse": float(nmse), "r2": float(r2), - "kdt": kdt, # Already a float - "mape": float(mape) if mape is not float('inf') else float('inf'), # Ensure float, preserve inf + "kdt": kdt, # Already a float + "mape": float(mape) + if mape is not float("inf") + else float("inf"), # Ensure float, preserve inf "num_valid_points": int(y_pred_filtered.size), } -def objective_function(params: np.ndarray, model_func: callable, X_matrix: np.ndarray, y_true_vector: np.ndarray) -> float: + +def objective_function( + params: np.ndarray, model_func: callable, X_matrix: np.ndarray, y_true_vector: np.ndarray +) -> float: """ Objective function for scipy.optimize.minimize. Calculates MSE of the model_func with given params on X_matrix, y_true_vector. @@ -108,17 +132,18 @@ def objective_function(params: np.ndarray, model_func: callable, X_matrix: np.nd predictions = model_func(X_matrix, params) if not isinstance(predictions, np.ndarray) or predictions.shape != y_true_vector.shape: # print(f"Debug: Objective func - Bad prediction shape/type. Got {type(predictions)}, shape {getattr(predictions, 'shape', 'N/A')}. Expected {y_true_vector.shape}") - return float('inf') + return float("inf") if np.any(np.isnan(predictions)) or np.any(np.isinf(predictions)): # print("Debug: Objective func - Predictions contain NaN/Inf.") - return float('inf') - except Exception: # Catch any error during model prediction + return float("inf") + except Exception: # Catch any error during model prediction # print(f"Debug: Objective func - Exception during model_func call: {e_obj}") - return float('inf') + return float("inf") - mse = np.mean((predictions - y_true_vector)**2) + mse = np.mean((predictions - y_true_vector) ** 2) return mse + def evaluation( program_path: str, data_path: str, @@ -128,8 +153,12 @@ def evaluation( The model function from program_path is expected to be named 'func'. 
""" base_error_metrics = { - "mse": float('nan'), "nmse": float('nan'), "r2": float('nan'), - "kdt": float('nan'), "mape": float('nan'), "num_valid_points": 0, + "mse": float("nan"), + "nmse": float("nan"), + "r2": float("nan"), + "kdt": float("nan"), + "mape": float("nan"), + "num_valid_points": 0, } def _create_error_return(error_message: str) -> Dict[str, Dict[str, Any]]: @@ -144,11 +173,11 @@ def _create_error_return(error_message: str) -> Dict[str, Dict[str, Any]]: try: p_data_path = Path(data_path) train_x = np.load(p_data_path / "X_train_for_eval.npy") - train_y = np.load(p_data_path / "y_train_for_eval.npy").squeeze() # Ensure 1D + train_y = np.load(p_data_path / "y_train_for_eval.npy").squeeze() # Ensure 1D test_x = np.load(p_data_path / "X_test_for_eval.npy") - test_y = np.load(p_data_path / "y_test_for_eval.npy").squeeze() # Ensure 1D + test_y = np.load(p_data_path / "y_test_for_eval.npy").squeeze() # Ensure 1D test_x_ood = np.load(p_data_path / "X_ood_test_for_eval.npy") - test_y_ood = np.load(p_data_path / "y_ood_test_for_eval.npy").squeeze() # Ensure 1D + test_y_ood = np.load(p_data_path / "y_ood_test_for_eval.npy").squeeze() # Ensure 1D except FileNotFoundError as e: return _create_error_return(f"Data file not found: {e.filename}") except Exception as e: @@ -164,35 +193,39 @@ def _create_error_return(error_message: str) -> Dict[str, Dict[str, Any]]: spec = importlib.util.spec_from_file_location("custom_model_module", str(p_program_path)) if spec is None or spec.loader is None: raise ImportError(f"Could not create module spec from {program_path}") - + custom_model_module = importlib.util.module_from_spec(spec) spec.loader.exec_module(custom_model_module) - + model_func = getattr(custom_model_module, "func", None) if not callable(model_func): raise AttributeError(f"'func' function not found or not callable in {program_path}") except Exception as e: - return _create_error_return(f"Failed to load model function 'func' from '{program_path}': {str(e)}") + return _create_error_return( + f"Failed to load model function 'func' from '{program_path}': {str(e)}" + ) # 3. Optimize parameters on training data optimized_params = None - num_attempts = 10 # Default number of attempts - best_func_value = float('inf') + num_attempts = 10 # Default number of attempts + best_func_value = float("inf") optimization_critical_error_msg = None # Try to get num_params from the model if it provides it, otherwise default - num_params_to_optimize = getattr(model_func, 'num_params', 10) # Default to 10 if not specified + num_params_to_optimize = getattr(model_func, "num_params", 10) # Default to 10 if not specified - print(f"Starting optimization for {program_path} with {num_attempts} attempts (num_params: {num_params_to_optimize})...") + print( + f"Starting optimization for {program_path} with {num_attempts} attempts (num_params: {num_params_to_optimize})..." + ) for i in range(num_attempts): print(f"Attempt {i+1}/{num_attempts}") - initial_params = np.random.rand(num_params_to_optimize) + initial_params = np.random.rand(num_params_to_optimize) try: optimization_result = minimize( objective_function, initial_params, args=(model_func, train_x, train_y), - method='BFGS', + method="BFGS", # options={'maxiter': 1000, 'disp': False} # Example options ) if optimization_result.success: @@ -202,32 +235,42 @@ def _create_error_return(error_message: str) -> Dict[str, Dict[str, Any]]: optimized_params = optimization_result.x print(f"New best result found in attempt {i+1}. 
Func value: {best_func_value}") else: - print(f"Warning: Optimization attempt {i+1} did not converge. Message: {optimization_result.message}. Func value: {optimization_result.fun}") - if optimization_result.fun < best_func_value: # Still consider if it's the best so far - print(f"Non-converged result from attempt {i+1} is an improvement. Func value: {optimization_result.fun}") + print( + f"Warning: Optimization attempt {i+1} did not converge. Message: {optimization_result.message}. Func value: {optimization_result.fun}" + ) + if ( + optimization_result.fun < best_func_value + ): # Still consider if it's the best so far + print( + f"Non-converged result from attempt {i+1} is an improvement. Func value: {optimization_result.fun}" + ) best_func_value = optimization_result.fun optimized_params = optimization_result.x except Exception as e: - optimization_critical_error_msg = f"Critical error during optimization attempt {i+1} for {program_path}: {str(e)}" + optimization_critical_error_msg = ( + f"Critical error during optimization attempt {i+1} for {program_path}: {str(e)}" + ) print(f"Error: {optimization_critical_error_msg}") - break + break if optimization_critical_error_msg: return _create_error_return(optimization_critical_error_msg) - def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) -> Dict[str, Any]: + def _get_metrics_for_set( + X_data: np.ndarray, y_data: np.ndarray, set_name: str + ) -> Dict[str, Any]: if optimized_params is None: msg = f"Optimization failed to find parameters for {program_path}, cannot evaluate {set_name}." return {**base_error_metrics, "error": msg} try: pred_y = model_func(X_data, optimized_params) if not isinstance(pred_y, np.ndarray): - raise ValueError(f"{set_name} predictions are not numpy arrays. Got {type(pred_y)}") - + raise ValueError(f"{set_name} predictions are not numpy arrays. Got {type(pred_y)}") + metrics = compute_output_base_metrics(pred_y, y_data) - if "error" in metrics and metrics["num_valid_points"] == 0 : - print(f"Warning for {set_name} ({program_path}): {metrics['error']}") + if "error" in metrics and metrics["num_valid_points"] == 0: + print(f"Warning for {set_name} ({program_path}): {metrics['error']}") return metrics except Exception as e: error_msg = f"{set_name} evaluation failed for '{program_path}': {str(e)}" @@ -244,18 +287,19 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) "ood_metrics": ood_metrics, } -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python your_script_name.py ") sys.exit(1) - + root_path_arg = sys.argv[1] path_obj = Path(root_path_arg) problem_dirs = [] # Check if the path is a single problem directory # A problem directory is expected to contain data files directly and an openevolve_output subdir - program_file_check = path_obj / 'openevolve_output' / 'best' / 'best_program.py' + program_file_check = path_obj / "openevolve_output" / "best" / "best_program.py" data_file_check = path_obj / "X_train_for_eval.npy" if data_file_check.exists() and program_file_check.exists(): @@ -263,16 +307,19 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) print(f"Identified as single problem directory: {path_obj}") else: # Assume path is a parent directory containing multiple problem subdirectories - print(f"Identified as parent directory: {path_obj}. Searching for problem subdirectories...") + print( + f"Identified as parent directory: {path_obj}. Searching for problem subdirectories..." 
+ ) try: if not path_obj.is_dir(): - print(f"Error: Root path {root_path_arg} is not a directory.") - sys.exit(1) + print(f"Error: Root path {root_path_arg} is not a directory.") + sys.exit(1) for d in path_obj.iterdir(): if d.is_dir(): # Check if this subdirectory looks like a problem directory - if (d / "X_train_for_eval.npy").exists() and \ - (d / 'openevolve_output' / 'best' / 'best_program.py').exists(): + if (d / "X_train_for_eval.npy").exists() and ( + d / "openevolve_output" / "best" / "best_program.py" + ).exists(): problem_dirs.append(d) print(f" Found problem subdirectory: {d.name}") else: @@ -282,7 +329,9 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) sys.exit(1) if not problem_dirs: - print(f"No valid problem subdirectories found in '{root_path_arg}' or '{root_path_arg}' itself is not a valid problem directory.") + print( + f"No valid problem subdirectories found in '{root_path_arg}' or '{root_path_arg}' itself is not a valid problem directory." + ) sys.exit(1) all_results = {} @@ -290,10 +339,12 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) problem_name = subdir_path.name # if "21" not in problem_name: continue print(f"\nProcessing problem: {problem_name}") - program_file_path = subdir_path / 'openevolve_output' / 'best' / 'best_program.py' - data_dir_path = subdir_path + program_file_path = subdir_path / "openevolve_output" / "best" / "best_program.py" + data_dir_path = subdir_path - if not program_file_path.exists(): # Should have been caught by subdir check, but as a safeguard + if ( + not program_file_path.exists() + ): # Should have been caught by subdir check, but as a safeguard print(f"Skipping {problem_name}: best_program.py not found at {program_file_path}") all_results[problem_name] = { "train_metrics": {"error": "best_program.py not found"}, @@ -301,17 +352,17 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) "ood_metrics": {"error": "best_program.py not found"}, } continue - + print(f" Program path: {program_file_path}") print(f" Data path: {data_dir_path}") - metrics_output = evaluation( # Renamed from 'metrics' to avoid conflict + metrics_output = evaluation( # Renamed from 'metrics' to avoid conflict program_path=str(program_file_path), data_path=str(data_dir_path), ) print(f" Metrics for {problem_name}: {metrics_output}") all_results[problem_name] = metrics_output - + print("\n--- All Evaluation Results ---") for problem, result in all_results.items(): print(f"\nProblem: {problem}") @@ -322,7 +373,7 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) # --- Overall Performance Calculation --- overall_performance = {} # Metrics to aggregate: mse, nmse, r2, kdt, mape - metric_keys = ["mse", "nmse", "r2", "kdt", "mape"] + metric_keys = ["mse", "nmse", "r2", "kdt", "mape"] dataset_types = ["train_metrics", "test_metrics", "ood_metrics"] for d_type in dataset_types: @@ -337,32 +388,39 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) # np.nanmean and np.nanmedian will handle internal NaNs gracefully. # We explicitly exclude inf from aggregation here, as it can skew means badly. # For R2, -inf is possible and should be handled by nanmedian/nanmean or filtered if desired. 
- if isinstance(score, (int, float)) and not np.isinf(score): # np.isnan(score) is fine for nan* functions + if isinstance(score, (int, float)) and not np.isinf( + score + ): # np.isnan(score) is fine for nan* functions + all_scores.append(score) + elif ( + score == -float("inf") and m_key == "r2" + ): # Special case for R2, allow -inf all_scores.append(score) - elif score == -float('inf') and m_key == "r2": # Special case for R2, allow -inf - all_scores.append(score) - if all_scores: # Replace -inf with NaN for R2 mean calculation if desired, or handle as is. # For simplicity, we'll let nanmean/nanmedian handle it. # Extreme values can still affect the mean significantly. - + # Filter out inf values for mean calculation as they make it non-informative # but keep them for median if appropriate (or filter there too). # For simplicity here, we are filtering inf before both. # A more nuanced approach might replace inf with a very large/small number or handle per metric. - - scores_for_mean = [s for s in all_scores if s != -float('inf')] # R2 can be -inf - - overall_performance[d_type][f"mean_{m_key}"] = np.nanmean(scores_for_mean) if scores_for_mean else float('nan') - overall_performance[d_type][f"median_{m_key}"] = np.nanmedian(all_scores) if all_scores else float('nan') + + scores_for_mean = [s for s in all_scores if s != -float("inf")] # R2 can be -inf + + overall_performance[d_type][f"mean_{m_key}"] = ( + np.nanmean(scores_for_mean) if scores_for_mean else float("nan") + ) + overall_performance[d_type][f"median_{m_key}"] = ( + np.nanmedian(all_scores) if all_scores else float("nan") + ) overall_performance[d_type][f"num_problems_for_{m_key}"] = len(all_scores) else: - overall_performance[d_type][f"mean_{m_key}"] = float('nan') - overall_performance[d_type][f"median_{m_key}"] = float('nan') + overall_performance[d_type][f"mean_{m_key}"] = float("nan") + overall_performance[d_type][f"median_{m_key}"] = float("nan") overall_performance[d_type][f"num_problems_for_{m_key}"] = 0 - + print("\n--- Overall Performance Summary ---") for d_type, metrics_summary in overall_performance.items(): print(f"\n{d_type.replace('_', ' ').title()}:") @@ -370,26 +428,29 @@ def _get_metrics_for_set(X_data: np.ndarray, y_data: np.ndarray, set_name: str) print(" No data for overall summary.") continue for stat_name, value in metrics_summary.items(): - if "num_problems_for_" in stat_name: # Print count separately or alongside + if "num_problems_for_" in stat_name: # Print count separately or alongside m_key = stat_name.replace("num_problems_for_", "") print(f" Number of problems for {m_key.upper()} stats: {value}") elif "mean_" in stat_name or "median_" in stat_name: - print(f" {stat_name.replace('_', ' ').title()}: {value:.4f}" if isinstance(value, float) and not np.isnan(value) else f" {stat_name.replace('_', ' ').title()}: {value}") - + print( + f" {stat_name.replace('_', ' ').title()}: {value:.4f}" + if isinstance(value, float) and not np.isnan(value) + else f" {stat_name.replace('_', ' ').title()}: {value}" + ) # Add overall performance to the results to be saved all_results["overall_performance_summary"] = overall_performance - + # Optional: Save all_results to a JSON file # Determine the output file path. If root_path_arg is a file, save alongside it. If a dir, save inside it. 
- if path_obj.is_file(): # Should not happen with current logic, but as a fallback + if path_obj.is_file(): # Should not happen with current logic, but as a fallback output_results_file = path_obj.parent / "all_evaluation_results.json" - else: # path_obj is a directory + else: # path_obj is a directory output_results_file = path_obj / "all_evaluation_results.json" try: - with open(output_results_file, 'w') as f: + with open(output_results_file, "w") as f: json.dump(all_results, f, indent=4, cls=NumpyFloatJSONEncoder) print(f"\nAll results, including overall performance, saved to {output_results_file}") except Exception as e: - print(f"\nError saving results to JSON: {e}") \ No newline at end of file + print(f"\nError saving results to JSON: {e}") diff --git a/openevolve/controller.py b/openevolve/controller.py index 041dd157c..6642d48df 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -243,9 +243,7 @@ async def run( # Specifically check if this is the new best program if self.database.best_program_id == child_program.id: - logger.info( - f"🌟 New best solution found at iteration {i+1}: {child_program.id}" - ) + logger.info(f"🌟 New best solution found at iteration {i+1}: {child_program.id}") logger.info( f"Metrics: {', '.join(f'{name}={value:.4f}' for name, value in child_program.metrics.items())}" )