diff --git a/examples/trading_volume_optimization_orchestrator/README.md b/examples/trading_volume_optimization_orchestrator/README.md new file mode 100644 index 000000000..5f3b58d1d --- /dev/null +++ b/examples/trading_volume_optimization_orchestrator/README.md @@ -0,0 +1,270 @@ +README +====== + +Optimal-Execution Toy Benchmark for OpenEvolve +--------------------------------------------- + +This repository contains a **minimal yet complete** benchmark that lets an evolutionary-search engine learn how to execute a fixed quantity of shares in an order-book with market impact. +It mirrors the structure of the earlier “function-minimisation” example but replaces the mathematical objective with a *trading* objective: + +*Minimise implementation-shortfall / slippage when buying or selling a random volume during a short horizon.* + +The benchmark is intentionally lightweight – short Python, no external dependencies – yet it shows every building-block you would find in a realistic execution engine: + +1. synthetic order-book generation +2. execution-schedule parameterisation +3. a search / learning loop confined to an `EVOLVE-BLOCK` +4. an **independent evaluator** that scores candidates on unseen market scenarios. + +------------------------------------------------------------------------------- + +Repository Layout +----------------- + +``` +. +├── initial_program.py # candidate – contains the EVOLVE-BLOCK +├── evaluator.py # ground-truth evaluator +└── README.md # ← you are here +``` + +Why two files? +• `initial_program.py` is what the evolutionary framework mutates. +• `evaluator.py` is trusted, *never* mutated and imports nothing except the + candidate’s public `run_search()` function. + +------------------------------------------------------------------------------- + +Quick-start +----------- + +``` +python initial_program.py + # Runs the candidate’s own training loop (random-search on α) + +python evaluator.py initial_program.py + # Scores the candidate on fresh market scenarios + + +python openevolve-run.py examples/trading_volume_optimization_orchestrator/initial_program.py examples/trading_volume_optimization_orchestrator/evaluator.py --iterations 20 --config examples/trading_volume_optimization_orchestrator/config.yaml + # Runs the candidate with OpenEvolve orchestrator + +``` + +Typical console output: + +``` +Best alpha: 1.482 | Estimated average slippage: 0.00834 +{'value_score': 0.213, 'speed_score': 0.667, + 'reliability': 1.0, 'overall_score': 0.269} +``` + +------------------------------------------------------------------------------- + +1. Mechanics – Inside the Candidate (`initial_program.py`) +---------------------------------------------------------- + +The file is split into two parts: + +### 1.1 EVOLVE-BLOCK (mutable) + +```python +# EVOLVE-BLOCK-START … EVOLVE-BLOCK-END +``` + +Only the code between those delimiters will be altered by OpenEvolve. +Everything else is *frozen*; it plays the role of a “library.” + +Current strategy: + +1. **Parameter** – a single scalar `alpha (α)` + • α < 0 → front-loads the schedule + • α = 0 → uniform (TWAP) + • α > 0 → back-loads the schedule + +2. **Search** – naïve random search over α + (`search_algorithm()` evaluates ~250 random α’s and keeps the best.) + +3. **Fitness** – measured by `evaluate_alpha()` which, in turn, calls the + **fixed** simulator (`simulate_execution`) for many random scenarios and + averages per-share slippage. 
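+
+To make the effect of α concrete, here is a small standalone sketch (the helper
+name `schedule_weights` is only for illustration – it simply re-applies the same
+`(t+1)^α` weighting used by `create_schedule`) that prints the normalised slice
+weights for a 5-slice horizon:
+
+```python
+import numpy as np
+
+def schedule_weights(horizon: int, alpha: float) -> np.ndarray:
+    # Slice t gets weight (t+1)**alpha, normalised so the weights sum to 1.
+    w = np.array([(t + 1) ** alpha for t in range(horizon)], dtype=float)
+    return w / w.sum()
+
+for alpha in (-1.0, 0.0, 2.0):
+    print(alpha, np.round(schedule_weights(5, alpha), 3))
+# -1.0 [0.438 0.219 0.146 0.109 0.088]   (front-loaded)
+#  0.0 [0.2 0.2 0.2 0.2 0.2]             (uniform / TWAP)
+#  2.0 [0.018 0.073 0.164 0.291 0.455]   (back-loaded)
+```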
+
+Return signature required by the evaluator:
+
+```python
+def run_search() -> tuple[float, float]:
+    return best_alpha, estimated_cost
+```
+
+The first element (α) is mandatory; anything after that is ignored by the
+evaluator but can be useful for debugging.
+
+### 1.2 Fixed “library” code (non-mutable)
+
+* `create_schedule(volume, horizon, alpha)`
+  Weights each slice `(t+1)^α`, then normalises so the slices sum to the
+  total volume.
+
+* `simulate_execution(...)`
+  Ultra-simplified micro-structure:
+
+  • The mid-price `P_t` follows a Gaussian random walk
+  • The spread is constant (`±spread/2`)
+  • Market impact grows linearly with child-order size relative to
+    book depth:
+    `impact = (size / depth) * spread/2`
+
+  Execution price for each slice:
+
+  ```
+  BUY : P_t + spread/2 + impact
+  SELL: P_t - spread/2 - impact
+  ```
+
+  Slippage is summed over the horizon and returned *per share*.
+
+-------------------------------------------------------------------------------
+
+2. Mechanics – The Evaluator (`evaluator.py`)
+---------------------------------------------
+
+The evaluator is the **oracle**; it owns the test scenarios and the scoring
+function. A successful candidate must *generalise*: the random numbers in
+the evaluator are independent of those inside the candidate.
+
+### 2.1 Process flow
+
+For each of `NUM_TRIALS = 10`:
+
+1. Draw a *fresh* `(volume, side)` pair
+   `volume ∈ [100, 1000)`, `side ∈ {buy, sell}`
+
+2. Call `run_search()` **once per trial** (each call is time-limited to 8 s)
+
+3. Extract α and compute:
+
+   ```
+   cost_candidate = simulate_execution(vol, side, α)
+   cost_baseline  = simulate_execution(vol, side, 0.0)   # uniform TWAP
+   improvement    = (cost_baseline - cost_candidate)
+                    / max(cost_baseline, 1e-9)
+   ```
+
+4. Store runtime and improvement.
+
+### 2.2 Scores
+
+After the 10 trials:
+
+```
+value_score       = max(0, mean(improvement))        ∈ [0, 1]
+speed_score       = min(10, 1/mean(runtime)) / 10    ∈ [0, 1]
+reliability_score = success / 10                     ∈ [0, 1]
+
+overall_score = 0.8·value + 0.1·speed + 0.1·reliability
+```
+
+Intuition:
+
+* **Value** (quality of execution) dominates.
+* **Speed** rewards fast optimisation but is capped.
+* **Reliability** ensures the candidate rarely crashes or times out.
+
+### 2.3 Stage-based evaluation (optional)
+
+* `evaluate_stage1()` – smoke-test; passes if `overall_score > 0.05`
+* `evaluate_stage2()` – identical to `evaluate()`
+
+This mirrors the two-stage funnel from the previous demo.
+
+-------------------------------------------------------------------------------
+
+3. Extending the Benchmark
+--------------------------
+
+The framework is deliberately tiny so you can experiment.
+
+Ideas:
+
+1. **Richer parameterisation**
+   • Add `beta` for a *U-shaped* schedule
+   • Add a *child-order participation cap* (%ADV)
+
+2. **Better search / learning**
+   • Replace random search with gradient-free CMA-ES, Bayesian optimisation or
+     even RL inside the EVOLVE-BLOCK.
+
+3. **Enhanced market model**
+   • Stochastic spread
+   • Non-linear impact (`impact ∝ volume^γ`)
+   • Resilience (price reverts after a child order)
+
+4. **Multi-objective scoring**
+   Mix risk metrics (variance of slippage) into the evaluator.
+
+When you add knobs, remember:
+
+* All **simulation logic for evaluation must live in `evaluator.py`**.
+  Candidates cannot peek at or tamper with it.
+* The evaluator must still be able to extract the *decision variables* from
+  the tuple returned by `run_search()`.
+
+-------------------------------------------------------------------------------
+
+4. Known Limitations
+--------------------
+
+1. **Impact model is linear & memory-less**
+   Good for demonstration; unrealistic for real-world HFT.
+
+2. **No order-book micro-structure**
+   We do not simulate queue positions, cancellations, hidden liquidity, etc.
+
+3. **Single parameter α**
+   Optimal execution in reality depends on volatility, spread forecast,
+   order-book imbalance and so forth. Here we sidestep all that for clarity.
+
+4. **Random-search baseline**
+   Evolutionary engines will easily outperform it; that is the point – we
+   want a hill to climb.
+
+-------------------------------------------------------------------------------
+
+5. Optimized Solution
+---------------------
+
+The evolutionary search has discovered an efficient strategy for trading volume execution, which can be found in `openevolve_output/best/best_program.py`.
+
+### Key Features of the Solution
+
+- **Alpha-based Schedule Creation**: The solution generates trading schedules using a parametrised approach in which an alpha parameter controls the distribution of trading volume over time.
+  - α < 0: Front-loaded execution (more volume traded early)
+  - α = 0: Uniform execution (equal volume across all time slices)
+  - α > 0: Back-loaded execution (more volume traded later)
+
+- **Scenario-based Evaluation**: The solution evaluates different alpha values across multiple random market scenarios, considering:
+  - Random buy/sell sides
+  - Variable trading volumes
+  - Price-impact simulation
+
+- **Optimization Method**: The algorithm uses a simple but effective random-search approach to find the optimal alpha value that minimises average slippage costs.
+
+-------------------------------------------------------------------------------
+
+6. FAQ
+------
+
+Q: **How do I run the example?**
+A: Run
+   `python openevolve-run.py examples/trading_volume_optimization_orchestrator/initial_program.py examples/trading_volume_optimization_orchestrator/evaluator.py --iterations 20 --config examples/trading_volume_optimization_orchestrator/config.yaml`
+
+Q: **Why does the evaluator re-implement `simulate_execution`?**
+A: To guarantee the candidate cannot cheat by hard-coding answers from its own
+RNG realisations.
+
+Q: **What happens if my `run_search()` returns something weird?**
+A: The evaluator casts the *first* item to `float`. Non-numeric or `NaN`
+values yield zero score.
+
+Q: **Is it okay to import heavy libraries (pandas, torch) inside the EVOLVE-BLOCK?**
+A: Technically yes, but remember the 8-second time-out and that the evaluation
+machine may not have a GPU or much RAM.
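+
+For readers who prefer to score a candidate from Python rather than the CLI,
+here is a minimal sketch. It assumes the working directory is this example
+folder (adjust the path otherwise) and simply re-derives the overall score
+from the 0.8 / 0.1 / 0.1 weights used in `evaluator.py`:
+
+```python
+from evaluator import evaluate
+
+scores = evaluate("initial_program.py")
+print(scores)
+
+# Re-derive the overall score from the sub-scores (same weights as evaluator.py).
+# On a failed run only "overall_score" and "error" are present, hence .get().
+recomputed = (
+    0.8 * scores.get("value_score", 0.0)
+    + 0.1 * scores.get("speed_score", 0.0)
+    + 0.1 * scores.get("reliability", 0.0)
+)
+print(f"recomputed overall_score: {recomputed:.3f}")
+```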
+ + + + diff --git a/examples/trading_volume_optimization_orchestrator/config.yaml b/examples/trading_volume_optimization_orchestrator/config.yaml new file mode 100644 index 000000000..9be2106fe --- /dev/null +++ b/examples/trading_volume_optimization_orchestrator/config.yaml @@ -0,0 +1,112 @@ +# OpenEvolve Default Configuration +# This file contains all available configuration options with sensible defaults +# You can use this as a template for your own configuration + +# General settings +max_iterations: 1000 # Maximum number of evolution iterations +checkpoint_interval: 50 # Save checkpoints every N iterations +log_level: "INFO" # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) +log_dir: null # Custom directory for logs (default: output_dir/logs) +random_seed: null # Random seed for reproducibility (null = random) + +# Evolution settings +diff_based_evolution: true # Use diff-based evolution (true) or full rewrites (false) +allow_full_rewrites: false # Allow occasional full rewrites even in diff-based mode +max_code_length: 10000 # Maximum allowed code length in characters + +# LLM configuration +llm: + models: + - name: "gpt-4" + weight: 1.0 + + evaluator_models: + - name: "gpt-4" + weight: 1.0 + + # Azure endpoint *root* – no path, no query string + api_base: "https://XXXXXX.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2025-01-01-preview" + + # Tell the SDK which API flavour and version to use + # api_type: "azure" + # api_version: "2025-01-01-preview" + api_key: "XXXXXXXXXXXXXXXXXX" # Or provide it directly here + temperature: 0.7 + top_p: 0.95 + max_tokens: 4096 + timeout: 60 + retries: 3 + retry_delay: 5 + + +# Prompt configuration +prompt: + template_dir: null # Custom directory for prompt templates + system_message: "You are an expert coder helping to improve programs through evolution." + evaluator_system_message: "You are an expert code reviewer." + + # Number of examples to include in the prompt + num_top_programs: 3 # Number of top-performing programs to include + num_diverse_programs: 2 # Number of diverse programs to include + + # Template stochasticity + use_template_stochasticity: true # Use random variations in templates for diversity + template_variations: # Different phrasings for parts of the template + improvement_suggestion: + - "Here's how we could improve this code:" + - "I suggest the following improvements:" + - "We can enhance this code by:" + + # Note: meta-prompting features are not yet implemented + +# Database configuration +database: + # General settings + db_path: null # Path to persist database (null = in-memory only) + in_memory: true # Keep database in memory for faster access + + # Evolutionary parameters + population_size: 1000 # Maximum number of programs to keep in memory + archive_size: 100 # Size of elite archive + num_islands: 5 # Number of islands for island model (separate populations) + + # Island-based evolution parameters + # Islands provide diversity by maintaining separate populations that evolve independently. + # Migration periodically shares the best solutions between adjacent islands. 
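+  # For example: with num_islands = 5, migration_rate = 0.1 and
+  # migration_interval = 50, roughly the top 10% of each island's programs are
+  # copied to adjacent islands every 50 generations.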
+ migration_interval: 50 # Migrate between islands every N generations + migration_rate: 0.1 # Fraction of top programs to migrate (0.1 = 10%) + + # Selection parameters + elite_selection_ratio: 0.1 # Ratio of elite programs to select + exploration_ratio: 0.2 # Ratio of exploration vs exploitation + exploitation_ratio: 0.7 # Ratio of exploitation vs random selection + # Note: diversity_metric is fixed to "edit_distance" (feature_based not implemented) + + # Feature map dimensions for MAP-Elites + feature_dimensions: # Dimensions for MAP-Elites feature map + - "score" # Performance score + - "complexity" # Code complexity (length) + feature_bins: 10 # Number of bins per dimension + +# Evaluator configuration +evaluator: + # General settings + timeout: 300 # Maximum evaluation time in seconds + max_retries: 3 # Maximum number of retries for evaluation + + # Note: resource limits (memory_limit_mb, cpu_limit) are not yet implemented + + # Evaluation strategies + cascade_evaluation: true # Use cascade evaluation to filter bad solutions early + cascade_thresholds: # Thresholds for advancing to next evaluation stage + - 0.5 # First stage threshold + - 0.75 # Second stage threshold + - 0.9 # Third stage threshold + + # Parallel evaluation + parallel_evaluations: 4 # Number of parallel evaluations + # Note: distributed evaluation is not yet implemented + + # LLM-based feedback (experimental) + use_llm_feedback: false # Use LLM to evaluate code quality + llm_feedback_weight: 0.1 # Weight for LLM feedback in final score diff --git a/examples/trading_volume_optimization_orchestrator/evaluator.py b/examples/trading_volume_optimization_orchestrator/evaluator.py new file mode 100644 index 000000000..8a051eb3d --- /dev/null +++ b/examples/trading_volume_optimization_orchestrator/evaluator.py @@ -0,0 +1,154 @@ +""" +Evaluator for the optimal-execution example. +The only requirement for the candidate program is a `run_search` +function that returns *at least* the first element `alpha`. +""" +import importlib.util +import time +import concurrent.futures +import traceback +import numpy as np +import sys + +# ----------------------------------------------------------------- +# Small helper copied from the previous demo +def run_with_timeout(func, args=(), kwargs=None, timeout_seconds=8): + if kwargs is None: + kwargs = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as ex: + fut = ex.submit(func, *args, **kwargs) + try: + return fut.result(timeout=timeout_seconds) + except concurrent.futures.TimeoutError: + raise TimeoutError(f"Timed-out after {timeout_seconds}s") + + # ----------------------------------------------------------------- +# A *fixed* copy of the market simulator – identical to the one in +# initial_program.py but held here so candidates cannot tamper with it. 
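+# NOTE: this copy is intentionally kept identical to the fixed (non-evolved)
+# part of initial_program.py; scoring always uses this version, never the
+# candidate's own simulator.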
+def create_schedule(volume: float, horizon: int, alpha: float) -> np.ndarray: + weights = np.array([(t + 1) ** alpha for t in range(horizon)], dtype=float) + weights /= weights.sum() + return volume * weights + +def simulate_execution( + volume: float, + side: str, + alpha: float, + horizon: int = 10, + spread: float = 0.02, + depth: float = 1_000.0, + rng: np.random.Generator | None = None, +) -> float: + if rng is None: + rng = np.random.default_rng() + + mid0 = 100.0 + mid_price = mid0 + slices = create_schedule(volume, horizon, alpha) + slippage = 0.0 + + for child_vol in slices: + mid_price += rng.normal(0.0, 0.05) + impact = (child_vol / depth) * (spread / 2) + if side == "buy": + exec_px = mid_price + spread / 2 + impact + slippage += (exec_px - mid0) * child_vol + else: + exec_px = mid_price - spread / 2 - impact + slippage += (mid0 - exec_px) * child_vol + return slippage / volume + +# ----------------------------------------------------------------- +def evaluate(program_path: str): + """ + Score a candidate program on 10 fresh, unseen market scenarios. + Metric: improvement relative to a naïve uniform schedule (alpha = 0). + """ + try: + spec = importlib.util.spec_from_file_location("candidate", program_path) + candidate = importlib.util.module_from_spec(spec) + spec.loader.exec_module(candidate) + + if not hasattr(candidate, "run_search"): + return {"overall_score": 0.0, "error": "Missing run_search()"} + + NUM_TRIALS = 10 + rng = np.random.default_rng(seed=42) + + improvements = [] + times = [] + success = 0 + + for trial in range(NUM_TRIALS): + volume = rng.integers(100, 1000) + side = rng.choice(["buy", "sell"]) + + try: + t0 = time.time() + result = run_with_timeout(candidate.run_search, timeout_seconds=8) + t1 = time.time() + times.append(t1 - t0) + + # Extract alpha (first value) – additional items ignored + if isinstance(result, (tuple, list)): + alpha = float(result[0]) + else: + alpha = float(result) + + # Our simulator – independent of candidate’s own one + cost_candidate = simulate_execution(volume, side, alpha, rng=rng) + cost_baseline = simulate_execution(volume, side, 0.0, rng=rng) + + improvement = (cost_baseline - cost_candidate) / max(cost_baseline, 1e-9) + improvements.append(improvement) + success += 1 + + except TimeoutError as e: + print(f"Trial {trial}: {e}") + except Exception as e: + print(f"Trial {trial}: {e}\n{traceback.format_exc()}") + + if success == 0: + return {"overall_score": 0.0, "error": "All trials failed"} + + avg_improvement = float(np.mean(improvements)) + avg_time = float(np.mean(times)) + + value_score = max(0.0, avg_improvement) # already 0-1 range + speed_score = min(10.0, 1.0 / avg_time) / 10.0 # cap influence + reliability_score= success / NUM_TRIALS + + overall_score = 0.8 * value_score + 0.1 * speed_score + 0.1 * reliability_score + + return { + "value_score" : value_score, + "speed_score" : speed_score, + "reliability" : reliability_score, + "overall_score" : overall_score, + } + except Exception as e: + print(traceback.format_exc()) + return {"overall_score": 0.0, "error": str(e)} + + # ----------------------------------------------------------------- +# Two quick stage helpers (optional, mirrors the original demo) + +def evaluate_stage1(program_path: str): + """ + Smoke-test: does it run & make *some* improvement? 
+ """ + res = evaluate(program_path) + ok = res.get("overall_score", 0.0) > 0.05 + return {"runs_successfully": 1.0 if ok else 0.0, **res} + +def evaluate_stage2(program_path: str): + return evaluate(program_path) + +# ----------------------------------------------------------------- +if __name__ == "__main__": + # Allow quick manual test: python evaluator.py initial_program.py + if len(sys.argv) != 2: + print("Usage: python evaluator.py path/to/initial_program.py") + sys.exit(0) + scores = evaluate(sys.argv[1]) + print(scores) diff --git a/examples/trading_volume_optimization_orchestrator/initial_program.py b/examples/trading_volume_optimization_orchestrator/initial_program.py new file mode 100644 index 000000000..1fdf964dd --- /dev/null +++ b/examples/trading_volume_optimization_orchestrator/initial_program.py @@ -0,0 +1,117 @@ +# EVOLVE-BLOCK-START +""" +Optimal-execution example for OpenEvolve +Only the code enclosed by EVOLVE-BLOCK-START / EVOLVE-BLOCK-END +will be mutated by the evolutionary search. +""" +import numpy as np + +def create_schedule(volume: float, horizon: int, alpha: float) -> np.ndarray: + """ + Generate a slice-by-slice schedule. + alpha < 0 → front–load + alpha = 0 → uniform + alpha > 0 → back–load + """ + weights = np.array([(t + 1) ** alpha for t in range(horizon)], dtype=float) + weights /= weights.sum() + return volume * weights + +def evaluate_alpha(alpha: float, horizon: int, scenarios: int) -> float: + """ + Average per-share slippage of an 'alpha' schedule + over a number of random market scenarios. + (simulate_execution is defined outside the evolve block.) + """ + + rng = np.random.default_rng() + cost = 0.0 + for _ in range(scenarios): + vol = rng.integers(100, 1000) + side = rng.choice(["buy", "sell"]) + cost += simulate_execution(volume=vol, side=side, alpha=alpha, rng=rng) + return cost / scenarios + +def search_algorithm( + iterations: int = 250, + horizon: int = 10, + alpha_bounds: tuple = (-1.0, 3.0), + scenarios: int = 40, +): + """ + Very simple random search for a good ‘alpha’. + """ + best_alpha = np.random.uniform(*alpha_bounds) + best_cost = evaluate_alpha(best_alpha, horizon, scenarios) + + for _ in range(iterations): + alpha = np.random.uniform(*alpha_bounds) + cost = evaluate_alpha(alpha, horizon, scenarios) + if cost < best_cost: + best_alpha, best_cost = alpha, cost + + return best_alpha, best_cost +# EVOLVE-BLOCK-END + + +# ------------ Fixed (non-evolved) part below ----------------- +import numpy as np + +def create_schedule(volume: float, horizon: int, alpha: float) -> np.ndarray: + """ + Duplicate of the schedule helper so the evaluator can import it too. + (This definition is outside the evolve block and therefore fixed.) + """ + weights = np.array([(t + 1) ** alpha for t in range(horizon)], dtype=float) + weights /= weights.sum() + return volume * weights + +def simulate_execution( + volume: float, + side: str, + alpha: float, + horizon: int = 10, + spread: float = 0.02, + depth: float = 1_000.0, + rng: np.random.Generator | None = None, +) -> float: + """ + Ultra-light order-book / price-impact simulation. + Returns *per-share* slippage (positive number, lower is better). 
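+
+    Model assumptions (fixed for this toy benchmark):
+      * the mid-price follows a Gaussian random walk (sigma = 0.05 per slice)
+      * a constant half-spread (spread / 2) is paid on every child order
+      * impact is linear in size: impact = (child_vol / depth) * spread / 2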
+ """ + if rng is None: + rng = np.random.default_rng() + + mid0 = 100.0 # reference price + mid_price = mid0 + slices = create_schedule(volume, horizon, alpha) + slippage = 0.0 + + for child_vol in slices: + # mid-price random walk + mid_price += rng.normal(0.0, 0.05) + + # very simple linear impact model + impact = (child_vol / depth) * (spread / 2) + + if side == "buy": + exec_px = mid_price + spread / 2 + impact + slippage += (exec_px - mid0) * child_vol + else: # sell + exec_px = mid_price - spread / 2 - impact + slippage += (mid0 - exec_px) * child_vol + + return slippage / volume # per-share value + +def run_search(): + """ + Entry point required by the evaluator. + Returns the best ‘alpha’ found and the cost on the + training scenarios used inside search_algorithm. + """ + alpha, cost = search_algorithm() + return alpha, cost + +if __name__ == "__main__": + best_alpha, est_cost = run_search() + print(f"Best alpha: {best_alpha:.3f} | Estimated average slippage: {est_cost:.5f}") diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py index 2b8eb9a42..ef3c7752e 100644 --- a/openevolve/llm/openai.py +++ b/openevolve/llm/openai.py @@ -12,6 +12,8 @@ from openevolve.config import LLMConfig from openevolve.llm.base import LLMInterface +from openai import AzureOpenAI + logger = logging.getLogger(__name__) @@ -35,9 +37,13 @@ def __init__( self.random_seed = getattr(model_cfg, 'random_seed', None) # Set up API client + print("##########################") + self.client = openai.OpenAI( - api_key=self.api_key, - base_url=self.api_base, + api_key = self.api_key, + base_url = self.api_base, + default_headers={"api-key": self.api_key}, + default_query = {"api-version": "2025-01-01-preview"}, ) logger.info(f"Initialized OpenAI LLM with model: {self.model}") @@ -66,6 +72,14 @@ async def generate_with_context( "messages": formatted_messages, "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens), } + # if we use aifoundry we need to get rid of max_completion_tokens + elif self.api_base.startswith('https://aispocuksouth'): + params = { + "model": self.model, + "messages": formatted_messages, + "temperature": kwargs.get("temperature", self.temperature), + "top_p": kwargs.get("top_p", self.top_p), + } else: params = { "model": self.model, @@ -108,6 +122,16 @@ async def generate_with_context( async def _call_api(self, params: Dict[str, Any]) -> str: """Make the actual API call""" + # ----- Azure o-series models need max_completion_tokens ----- + if "max_tokens" in params: + params = params.copy() # don’t mutate caller’s dict + params["extra_body"] = {"max_completion_tokens": params.pop("max_tokens")} + # ----------------------------------------------------------- + # ⬇ NEW: drop sampling knobs that o-series refuses + for unsupported in ("temperature", "top_p", + "frequency_penalty", "presence_penalty"): + params.pop(unsupported, None) + # ------------------------------------------------------------------ # Use asyncio to run the blocking API call in a thread pool loop = asyncio.get_event_loop() response = await loop.run_in_executor(