diff --git a/examples/circle_packing/evaluator.py b/examples/circle_packing/evaluator.py index 9f4cd1c94..677365357 100644 --- a/examples/circle_packing/evaluator.py +++ b/examples/circle_packing/evaluator.py @@ -35,22 +35,22 @@ def validate_packing(centers, radii): True if valid, False otherwise """ n = centers.shape[0] - + # Check for NaN values if np.isnan(centers).any(): print("NaN values detected in circle centers") return False - + if np.isnan(radii).any(): print("NaN values detected in circle radii") return False # Check if radii are nonnegative and not nan for i in range(n): - if(radii[i] < 0): + if radii[i] < 0: print(f"Circle {i} has negative radius {radii[i]}") return False - elif(np.isnan(radii[i])): + elif np.isnan(radii[i]): print(f"Circle {i} has nan radius") return False @@ -214,7 +214,7 @@ def evaluate(program_path): centers = np.array(centers) if not isinstance(radii, np.ndarray): radii = np.array(radii) - + # Check for NaN values before validation if np.isnan(centers).any() or np.isnan(radii).any(): print("NaN values detected in solution") diff --git a/examples/circle_packing_with_artifacts/evaluator.py b/examples/circle_packing_with_artifacts/evaluator.py index a8692c936..ea3202546 100644 --- a/examples/circle_packing_with_artifacts/evaluator.py +++ b/examples/circle_packing_with_artifacts/evaluator.py @@ -295,9 +295,9 @@ def evaluate(program_path): # Add successful packing stats for good solutions if valid and target_ratio > 0.95: # Near-optimal solutions artifacts["stdout"] = f"Excellent packing! Achieved {target_ratio:.1%} of target value" - artifacts[ - "radius_stats" - ] = f"Min: {validation_details['min_radius']:.6f}, Max: {validation_details['max_radius']:.6f}, Avg: {validation_details['avg_radius']:.6f}" + artifacts["radius_stats"] = ( + f"Min: {validation_details['min_radius']:.6f}, Max: {validation_details['max_radius']:.6f}, Avg: {validation_details['avg_radius']:.6f}" + ) return EvaluationResult( metrics={ @@ -404,9 +404,9 @@ def evaluate_stage1(program_path): # Add validation issues if any if not valid: - artifacts[ - "stderr" - ] = f"Validation failed: {len(validation_details.get('boundary_violations', []))} boundary violations, {len(validation_details.get('overlaps', []))} overlaps" + artifacts["stderr"] = ( + f"Validation failed: {len(validation_details.get('boundary_violations', []))} boundary violations, {len(validation_details.get('overlaps', []))} overlaps" + ) artifacts["failure_stage"] = "stage1_geometric_validation" if validation_details.get("boundary_violations"): artifacts["boundary_issues"] = validation_details["boundary_violations"][ diff --git a/examples/llm_prompt_optimization/evaluate_prompts.py b/examples/llm_prompt_optimization/evaluate_prompts.py index b4e9c795a..2505a9024 100755 --- a/examples/llm_prompt_optimization/evaluate_prompts.py +++ b/examples/llm_prompt_optimization/evaluate_prompts.py @@ -14,41 +14,42 @@ from openai import OpenAI from tqdm import tqdm + # Initialize OpenAI client def get_client(): - api_key = os.environ.get('OPENAI_API_KEY') + api_key = os.environ.get("OPENAI_API_KEY") if not api_key: raise ValueError("OPENAI_API_KEY environment variable not set") - - return OpenAI( - base_url="https://openrouter.ai/api/v1", - api_key=api_key - ) -def load_prompt(dataset_name, prompt_type='baseline'): + return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key) + + +def load_prompt(dataset_name, prompt_type="baseline"): """Load prompt template for a dataset.""" - if prompt_type == 'baseline': + if prompt_type == 
"baseline": prompt_path = f"{dataset_name}_prompt.txt" else: # evolved prompt_path = f"openevolve_output_qwen3_{dataset_name}/best/best_program.txt" - + if not os.path.exists(prompt_path): raise FileNotFoundError(f"Prompt file not found: {prompt_path}") - - with open(prompt_path, 'r') as f: + + with open(prompt_path, "r") as f: return f.read().strip() + def load_dataset_config(dataset_name): """Load dataset configuration.""" config_path = f"{dataset_name}_prompt_dataset.yaml" - - with open(config_path, 'r') as f: + + with open(config_path, "r") as f: return yaml.safe_load(f) + def evaluate_ifeval(client, prompt_template, num_samples, model): """Evaluate IFEval dataset.""" print("\nLoading IFEval dataset...") - + # Try test split first, then train try: dataset = load_dataset("google/IFEval", split="test") @@ -56,7 +57,7 @@ def evaluate_ifeval(client, prompt_template, num_samples, model): except: dataset = load_dataset("google/IFEval", split="train") split_used = "train" - + # Determine samples to process if num_samples is None: samples_to_process = len(dataset) @@ -66,23 +67,25 @@ def evaluate_ifeval(client, prompt_template, num_samples, model): samples_to_process = min(num_samples, len(dataset)) print(f"Using {samples_to_process} samples from {split_used} split") dataset = load_dataset("google/IFEval", split=split_used, streaming=True) - dataset_iter = tqdm(dataset.take(samples_to_process), total=samples_to_process, desc="Evaluating") - + dataset_iter = tqdm( + dataset.take(samples_to_process), total=samples_to_process, desc="Evaluating" + ) + correct = 0 total = 0 empty_responses = 0 - + for i, example in enumerate(dataset_iter): if num_samples is not None and i >= samples_to_process: break - instruction = example['prompt'] - + instruction = example["prompt"] + try: formatted_prompt = prompt_template.format(instruction=instruction) except KeyError: # Handle prompts with different placeholder names formatted_prompt = prompt_template.replace("{instruction}", instruction) - + # Call LLM with retries output_text = None for attempt in range(3): @@ -91,9 +94,9 @@ def evaluate_ifeval(client, prompt_template, num_samples, model): model=model, messages=[{"role": "user", "content": formatted_prompt}], temperature=0.1, - max_tokens=4096 + max_tokens=4096, ) - + if response and response.choices and response.choices[0].message: output_text = response.choices[0].message.content if output_text and output_text.strip(): @@ -102,28 +105,29 @@ def evaluate_ifeval(client, prompt_template, num_samples, model): if attempt == 2: print(f"\nError after 3 attempts: {e}") time.sleep(2) - + if not output_text or not output_text.strip(): empty_responses += 1 else: # Simple evaluation: response has reasonable length if len(output_text.strip()) > 20: correct += 1 - + total += 1 - + accuracy = correct / total if total > 0 else 0.0 return accuracy, correct, total, empty_responses + def evaluate_hover(client, prompt_template, num_samples, model): """Evaluate HoVer dataset.""" print("\nLoading HoVer dataset...") - + # Try test split first (but it's unlabeled), then validation try: test_dataset = load_dataset("hover", split="test", trust_remote_code=True) # Check if test set has labels - if test_dataset[0]['label'] != -1: + if test_dataset[0]["label"] != -1: dataset = test_dataset split_used = "test" else: @@ -133,7 +137,7 @@ def evaluate_hover(client, prompt_template, num_samples, model): except: dataset = load_dataset("hover", split="validation", trust_remote_code=True) split_used = "validation" - + # Determine samples to 
process if num_samples is None: samples_to_process = len(dataset) @@ -143,23 +147,25 @@ def evaluate_hover(client, prompt_template, num_samples, model): samples_to_process = min(num_samples, len(dataset)) print(f"Using {samples_to_process} samples from {split_used} split") dataset = load_dataset("hover", split=split_used, streaming=True, trust_remote_code=True) - dataset_iter = tqdm(dataset.take(samples_to_process), total=samples_to_process, desc="Evaluating") - + dataset_iter = tqdm( + dataset.take(samples_to_process), total=samples_to_process, desc="Evaluating" + ) + correct = 0 total = 0 empty_responses = 0 - + for i, example in enumerate(dataset_iter): if num_samples is not None and i >= samples_to_process: break - claim = example['claim'] - label = example['label'] # Integer: 0=SUPPORTED, 1=NOT_SUPPORTED - + claim = example["claim"] + label = example["label"] # Integer: 0=SUPPORTED, 1=NOT_SUPPORTED + try: formatted_prompt = prompt_template.format(claim=claim) except KeyError: formatted_prompt = prompt_template.replace("{claim}", claim) - + # Call LLM with retries output_text = None for attempt in range(3): @@ -168,9 +174,9 @@ def evaluate_hover(client, prompt_template, num_samples, model): model=model, messages=[{"role": "user", "content": formatted_prompt}], temperature=0.1, - max_tokens=4096 + max_tokens=4096, ) - + if response and response.choices and response.choices[0].message: output_text = response.choices[0].message.content if output_text and output_text.strip(): @@ -179,43 +185,48 @@ def evaluate_hover(client, prompt_template, num_samples, model): if attempt == 2: print(f"\nError after 3 attempts: {e}") time.sleep(2) - + if not output_text or not output_text.strip(): empty_responses += 1 else: output_upper = output_text.strip().upper() - + # Parse prediction from output - if 'NOT SUPPORTED' in output_upper or 'NOT_SUPPORTED' in output_upper: + if "NOT SUPPORTED" in output_upper or "NOT_SUPPORTED" in output_upper: prediction = 1 # NOT_SUPPORTED - elif 'SUPPORTED' in output_upper: - prediction = 0 # SUPPORTED + elif "SUPPORTED" in output_upper: + prediction = 0 # SUPPORTED else: prediction = -1 # Invalid/unclear response - + # Compare with actual label if prediction == label: correct += 1 - + total += 1 - + accuracy = correct / total if total > 0 else 0.0 return accuracy, correct, total, empty_responses + def evaluate_hotpotqa(client, prompt_template, num_samples, model): """Evaluate HotpotQA dataset.""" print("\nLoading HotpotQA dataset (this may take a moment)...") - + # Try test split first, then validation try: - dataset = load_dataset("hotpotqa/hotpot_qa", "distractor", split="test", trust_remote_code=True) + dataset = load_dataset( + "hotpotqa/hotpot_qa", "distractor", split="test", trust_remote_code=True + ) split_used = "test" except: - dataset = load_dataset("hotpotqa/hotpot_qa", "distractor", split="validation", trust_remote_code=True) + dataset = load_dataset( + "hotpotqa/hotpot_qa", "distractor", split="validation", trust_remote_code=True + ) split_used = "validation" - + print(f"Dataset loaded. 
Using {split_used} split with {len(dataset)} samples") - + # Determine samples to process if num_samples is None: samples_to_process = len(dataset) @@ -223,36 +234,35 @@ def evaluate_hotpotqa(client, prompt_template, num_samples, model): else: samples_to_process = min(num_samples, len(dataset)) print(f"Using {samples_to_process} samples") - + correct = 0 total = 0 empty_responses = 0 - + for i in tqdm(range(samples_to_process), desc="Evaluating"): example = dataset[i] - - question = example['question'] - context = example['context'] - answer = example['answer'].lower().strip() - + + question = example["question"] + context = example["context"] + answer = example["answer"].lower().strip() + # Format context context_str = "" - titles = context['title'] - sentences = context['sentences'] - + titles = context["title"] + sentences = context["sentences"] + for title, sents in zip(titles, sentences): context_str += f"{title}: {' '.join(sents)}\n" - + try: formatted_prompt = prompt_template.format( - context=context_str.strip(), - question=question + context=context_str.strip(), question=question ) except KeyError: # Try alternative formatting formatted_prompt = prompt_template.replace("{context}", context_str.strip()) formatted_prompt = formatted_prompt.replace("{question}", question) - + # Call LLM with retries output_text = None for attempt in range(3): @@ -261,9 +271,9 @@ def evaluate_hotpotqa(client, prompt_template, num_samples, model): model=model, messages=[{"role": "user", "content": formatted_prompt}], temperature=0.1, - max_tokens=4096 + max_tokens=4096, ) - + if response and response.choices and response.choices[0].message: output_text = response.choices[0].message.content if output_text and output_text.strip(): @@ -272,65 +282,76 @@ def evaluate_hotpotqa(client, prompt_template, num_samples, model): if attempt == 2: print(f"\nError after 3 attempts: {e}") time.sleep(2) - + if not output_text or not output_text.strip(): empty_responses += 1 else: output_lower = output_text.strip().lower() - + # Check if answer is in output if answer in output_lower: correct += 1 - + total += 1 - + accuracy = correct / total if total > 0 else 0.0 return accuracy, correct, total, empty_responses + def main(): - parser = argparse.ArgumentParser(description='Evaluate prompts on GEPA benchmark datasets') - parser.add_argument('--dataset', type=str, required=True, - choices=['ifeval', 'hover', 'hotpotqa', 'all'], - help='Dataset to evaluate on') - parser.add_argument('--prompt-type', type=str, default='baseline', - choices=['baseline', 'evolved'], - help='Type of prompt to use') - parser.add_argument('--samples', type=int, default=None, - help='Number of samples to evaluate (default: full dataset)') - parser.add_argument('--model', type=str, default='qwen/qwen3-8b', - help='Model to use for evaluation') - parser.add_argument('--output', type=str, default=None, - help='Output file for results (default: auto-generated)') - + parser = argparse.ArgumentParser(description="Evaluate prompts on GEPA benchmark datasets") + parser.add_argument( + "--dataset", + type=str, + required=True, + choices=["ifeval", "hover", "hotpotqa", "all"], + help="Dataset to evaluate on", + ) + parser.add_argument( + "--prompt-type", + type=str, + default="baseline", + choices=["baseline", "evolved"], + help="Type of prompt to use", + ) + parser.add_argument( + "--samples", + type=int, + default=None, + help="Number of samples to evaluate (default: full dataset)", + ) + parser.add_argument( + "--model", type=str, 
default="qwen/qwen3-8b", help="Model to use for evaluation" + ) + parser.add_argument( + "--output", type=str, default=None, help="Output file for results (default: auto-generated)" + ) + args = parser.parse_args() - + # Initialize client client = get_client() - + # Determine which datasets to evaluate - if args.dataset == 'all': - datasets = ['ifeval', 'hover', 'hotpotqa'] + if args.dataset == "all": + datasets = ["ifeval", "hover", "hotpotqa"] else: datasets = [args.dataset] - + # Evaluation functions - eval_funcs = { - 'ifeval': evaluate_ifeval, - 'hover': evaluate_hover, - 'hotpotqa': evaluate_hotpotqa - } - + eval_funcs = {"ifeval": evaluate_ifeval, "hover": evaluate_hover, "hotpotqa": evaluate_hotpotqa} + # Load baseline results for comparison baseline_results = {} - if os.path.exists('baseline_results_50samples.json'): - with open('baseline_results_50samples.json', 'r') as f: + if os.path.exists("baseline_results_50samples.json"): + with open("baseline_results_50samples.json", "r") as f: baseline_data = json.load(f) - for result in baseline_data.get('results', []): - baseline_results[result['dataset']] = result['accuracy'] - + for result in baseline_data.get("results", []): + baseline_results[result["dataset"]] = result["accuracy"] + # Store results all_results = [] - + print(f"\n{'='*60}") print(f"PROMPT EVALUATION - {args.prompt_type.upper()}") print(f"Model: {args.model}") @@ -339,45 +360,45 @@ def main(): else: print(f"Samples per dataset: Full dataset") print(f"{'='*60}") - + for dataset_name in datasets: print(f"\nEvaluating {dataset_name.upper()}...") - + try: # Load prompt prompt_template = load_prompt(dataset_name, args.prompt_type) print(f"Loaded {args.prompt_type} prompt ({len(prompt_template)} chars)") - + # Run evaluation start_time = time.time() accuracy, correct, total, empty_responses = eval_funcs[dataset_name]( client, prompt_template, args.samples, args.model ) elapsed_time = time.time() - start_time - + # Get baseline accuracy baseline_acc = baseline_results.get(dataset_name) if baseline_acc: improvement = ((accuracy - baseline_acc) / baseline_acc) * 100 else: improvement = 0 - + # Store result result = { - 'dataset': dataset_name, - 'prompt_type': args.prompt_type, - 'accuracy': accuracy, - 'baseline_accuracy': baseline_acc, - 'improvement_percent': improvement, - 'correct': correct, - 'total': total, - 'empty_responses': empty_responses, - 'elapsed_time': elapsed_time, - 'timestamp': datetime.now().isoformat() + "dataset": dataset_name, + "prompt_type": args.prompt_type, + "accuracy": accuracy, + "baseline_accuracy": baseline_acc, + "improvement_percent": improvement, + "correct": correct, + "total": total, + "empty_responses": empty_responses, + "elapsed_time": elapsed_time, + "timestamp": datetime.now().isoformat(), } - + all_results.append(result) - + # Print results print(f"\nResults for {dataset_name.upper()}:") print(f" Accuracy: {accuracy:.3f} ({correct}/{total})") @@ -386,65 +407,68 @@ def main(): print(f" Improvement: {improvement:+.1f}%") print(f" Empty responses: {empty_responses}") print(f" Time: {elapsed_time:.1f}s ({elapsed_time/total:.1f}s per sample)") - + except Exception as e: print(f"Error evaluating {dataset_name}: {str(e)}") - all_results.append({ - 'dataset': dataset_name, - 'prompt_type': args.prompt_type, - 'error': str(e), - 'timestamp': datetime.now().isoformat() - }) - + all_results.append( + { + "dataset": dataset_name, + "prompt_type": args.prompt_type, + "error": str(e), + "timestamp": datetime.now().isoformat(), + } + ) + # Save 
results output_path = args.output if not output_path: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = f"evaluation_results_{args.prompt_type}_{timestamp}.json" - + final_results = { - 'prompt_type': args.prompt_type, - 'model': args.model, - 'samples_per_dataset': args.samples, - 'timestamp': datetime.now().isoformat(), - 'results': all_results + "prompt_type": args.prompt_type, + "model": args.model, + "samples_per_dataset": args.samples, + "timestamp": datetime.now().isoformat(), + "results": all_results, } - + # Calculate aggregate statistics - valid_results = [r for r in all_results if 'error' not in r] + valid_results = [r for r in all_results if "error" not in r] if valid_results: - total_correct = sum(r['correct'] for r in valid_results) - total_samples = sum(r['total'] for r in valid_results) + total_correct = sum(r["correct"] for r in valid_results) + total_samples = sum(r["total"] for r in valid_results) aggregate_accuracy = total_correct / total_samples if total_samples > 0 else 0 - - final_results['summary'] = { - 'aggregate_accuracy': aggregate_accuracy, - 'total_correct': total_correct, - 'total_samples': total_samples, - 'datasets_evaluated': len(valid_results) + + final_results["summary"] = { + "aggregate_accuracy": aggregate_accuracy, + "total_correct": total_correct, + "total_samples": total_samples, + "datasets_evaluated": len(valid_results), } - - with open(output_path, 'w') as f: + + with open(output_path, "w") as f: json.dump(final_results, f, indent=2) - + # Print summary print(f"\n{'='*60}") print("EVALUATION SUMMARY") print(f"{'='*60}") - + for result in all_results: - if 'error' not in result: + if "error" not in result: print(f"\n{result['dataset'].upper()}:") print(f" Accuracy: {result['accuracy']:.3f}") - if result.get('baseline_accuracy'): + if result.get("baseline_accuracy"): print(f" vs Baseline: {result['improvement_percent']:+.1f}%") - - if 'summary' in final_results: + + if "summary" in final_results: print(f"\nAGGREGATE:") print(f" Overall Accuracy: {final_results['summary']['aggregate_accuracy']:.3f}") print(f" Total Samples: {final_results['summary']['total_samples']}") - + print(f"\nResults saved to: {output_path}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/llm_prompt_optimization/evaluator.py b/examples/llm_prompt_optimization/evaluator.py index c5c36dd81..98c5294c5 100644 --- a/examples/llm_prompt_optimization/evaluator.py +++ b/examples/llm_prompt_optimization/evaluator.py @@ -12,28 +12,28 @@ from datasets import load_dataset # Read config.yaml to get model settings -with open(os.path.join(os.path.dirname(__file__), "config.yaml"), 'r') as f: +with open(os.path.join(os.path.dirname(__file__), "config.yaml"), "r") as f: config = yaml.safe_load(f) # Get model settings from config -llm_config = config.get('llm', {}) -api_base = llm_config.get('api_base', 'http://localhost:1234/v1') +llm_config = config.get("llm", {}) +api_base = llm_config.get("api_base", "http://localhost:1234/v1") # Handle both single model and model list configurations -models = llm_config.get('models', []) +models = llm_config.get("models", []) if models: # Use first model from list - TASK_MODEL_NAME = models[0].get('name', 'default-model') + TASK_MODEL_NAME = models[0].get("name", "default-model") else: # Fallback to direct model specification - TASK_MODEL_NAME = llm_config.get('primary_model', 'default-model') + TASK_MODEL_NAME = llm_config.get("primary_model", "default-model") # Get evaluator settings 
-evaluator_config = config.get('evaluator', {}) -MAX_RETRIES = evaluator_config.get('max_retries', 3) +evaluator_config = config.get("evaluator", {}) +MAX_RETRIES = evaluator_config.get("max_retries", 3) # Get max_tokens from LLM config -MAX_TOKENS = llm_config.get('max_tokens', 16000) +MAX_TOKENS = llm_config.get("max_tokens", 16000) print(f"Using max_tokens: {MAX_TOKENS}") # Initialize OpenAI client once for all evaluations @@ -42,15 +42,18 @@ # Determine which dataset to use based on the OPENEVOLVE_PROMPT environment variable import sys -prompt_file = os.environ.get('OPENEVOLVE_PROMPT') + +prompt_file = os.environ.get("OPENEVOLVE_PROMPT") if not prompt_file: # Default to a generic dataset config if not using the wrapper script evaluator_dir = os.path.dirname(os.path.abspath(__file__)) - DATASET_CONFIG_PATH = os.path.join(evaluator_dir, 'dataset_config.yaml') + DATASET_CONFIG_PATH = os.path.join(evaluator_dir, "dataset_config.yaml") print("Warning: OPENEVOLVE_PROMPT not set. Using default dataset_config.yaml") else: basename = os.path.basename(prompt_file) - dataset_filename = basename.replace('_prompt.txt', '_prompt_dataset.yaml').replace('.txt', '_dataset.yaml') + dataset_filename = basename.replace("_prompt.txt", "_prompt_dataset.yaml").replace( + ".txt", "_dataset.yaml" + ) evaluator_dir = os.path.dirname(os.path.abspath(__file__)) DATASET_CONFIG_PATH = os.path.join(evaluator_dir, dataset_filename) print(f"Dataset configuration: {dataset_filename}") @@ -59,47 +62,51 @@ def calculate_prompt_features(prompt): """ Calculate custom features for MAP-Elites binning - + Returns: tuple: (prompt_length, reasoning_strategy) - both in range 0-9 """ # Feature 1: Prompt length bin (0-9) length = len(prompt) if length < 100: - prompt_length = 0 # Minimal + prompt_length = 0 # Minimal elif length < 200: - prompt_length = 1 # Very short + prompt_length = 1 # Very short elif length < 400: - prompt_length = 2 # Short + prompt_length = 2 # Short elif length < 600: - prompt_length = 3 # Medium-short + prompt_length = 3 # Medium-short elif length < 900: - prompt_length = 4 # Medium + prompt_length = 4 # Medium elif length < 1200: - prompt_length = 5 # Medium-long + prompt_length = 5 # Medium-long elif length < 1600: - prompt_length = 6 # Long + prompt_length = 6 # Long elif length < 2000: - prompt_length = 7 # Very long + prompt_length = 7 # Very long elif length < 2500: - prompt_length = 8 # Extensive + prompt_length = 8 # Extensive else: - prompt_length = 9 # Very extensive - + prompt_length = 9 # Very extensive + # Feature 2: Reasoning strategy (0-9) prompt_lower = prompt.lower() - + # Check for few-shot examples - has_example = ('example' in prompt_lower or - prompt.count('####') >= 4 or - bool(re.search(r'problem:.*?solution:', prompt_lower, re.DOTALL))) - + has_example = ( + "example" in prompt_lower + or prompt.count("####") >= 4 + or bool(re.search(r"problem:.*?solution:", prompt_lower, re.DOTALL)) + ) + # Check for Chain-of-Thought (CoT) indicators - has_cot = ('step by step' in prompt_lower or - 'step-by-step' in prompt_lower or - any(phrase in prompt_lower for phrase in ['think through', 'reasoning', 'explain your']) or - bool(re.search(r'(first|then|next|finally)', prompt_lower))) - + has_cot = ( + "step by step" in prompt_lower + or "step-by-step" in prompt_lower + or any(phrase in prompt_lower for phrase in ["think through", "reasoning", "explain your"]) + or bool(re.search(r"(first|then|next|finally)", prompt_lower)) + ) + # Assign reasoning strategy bins if has_example: # Few-shot 
examples (bins 7-9) @@ -111,7 +118,7 @@ def calculate_prompt_features(prompt): reasoning_strategy = 7 # Basic few-shot elif has_cot: # Chain-of-thought (bins 4-6) - if 'must' in prompt_lower or 'exactly' in prompt_lower: + if "must" in prompt_lower or "exactly" in prompt_lower: reasoning_strategy = 6 # Strict CoT elif length > 500: reasoning_strategy = 5 # Detailed CoT @@ -121,87 +128,104 @@ def calculate_prompt_features(prompt): # Basic prompts (bins 0-3) if length < 100: reasoning_strategy = 0 # Minimal - elif 'solve' in prompt_lower or 'calculate' in prompt_lower: + elif "solve" in prompt_lower or "calculate" in prompt_lower: reasoning_strategy = 2 # Direct instruction else: reasoning_strategy = 1 # Simple prompt - + return prompt_length, reasoning_strategy def load_prompt_config(prompt_path): """Load the prompt from text file and dataset config from matching _dataset.yaml file.""" # Load prompt from text file - with open(prompt_path, 'r') as f: + with open(prompt_path, "r") as f: prompt = f.read().strip() - + # Load the configuration (already determined from environment variable) if not os.path.exists(DATASET_CONFIG_PATH): raise FileNotFoundError(f"Dataset configuration not found: {DATASET_CONFIG_PATH}") - - with open(DATASET_CONFIG_PATH, 'r') as f: + + with open(DATASET_CONFIG_PATH, "r") as f: config = yaml.safe_load(f) - + return config, prompt + def load_hf_dataset(config): """Load HuggingFace dataset based on configuration.""" - dataset_name = config['dataset_name'] - dataset_config = config.get('dataset_config', None) - split = config.get('split', 'test') - trust_remote_code = config.get('trust_remote_code', True) # Default to True for convenience - + dataset_name = config["dataset_name"] + dataset_config = config.get("dataset_config", None) + split = config.get("split", "test") + trust_remote_code = config.get("trust_remote_code", True) # Default to True for convenience + print(f"Loading dataset: {dataset_name}") - + # Special handling for HotpotQA - always use non-streaming mode - if dataset_name == "hotpot_qa" or config.get('is_hotpotqa', False): + if dataset_name == "hotpot_qa" or config.get("is_hotpotqa", False): print("Using non-streaming mode for HotpotQA to avoid PyArrow issues") streaming = False else: # For other datasets, use streaming if not specified - streaming = config.get('streaming', True) - + streaming = config.get("streaming", True) + try: # Try to load the specified split if dataset_config: - dataset = load_dataset(dataset_name, dataset_config, split=split, - trust_remote_code=trust_remote_code, streaming=streaming) + dataset = load_dataset( + dataset_name, + dataset_config, + split=split, + trust_remote_code=trust_remote_code, + streaming=streaming, + ) else: - dataset = load_dataset(dataset_name, split=split, - trust_remote_code=trust_remote_code, streaming=streaming) + dataset = load_dataset( + dataset_name, split=split, trust_remote_code=trust_remote_code, streaming=streaming + ) except: # Fallback to train split if test is not available print(f"Split '{split}' not found, falling back to 'train'") if dataset_config: - dataset = load_dataset(dataset_name, dataset_config, split='train', - trust_remote_code=trust_remote_code, streaming=streaming) + dataset = load_dataset( + dataset_name, + dataset_config, + split="train", + trust_remote_code=trust_remote_code, + streaming=streaming, + ) else: - dataset = load_dataset(dataset_name, split='train', - trust_remote_code=trust_remote_code, streaming=streaming) - + dataset = load_dataset( + dataset_name, + 
split="train", + trust_remote_code=trust_remote_code, + streaming=streaming, + ) + # Print dataset info - if hasattr(dataset, '__len__'): + if hasattr(dataset, "__len__"): print(f"Dataset loaded with {len(dataset)} examples") else: print(f"Dataset loaded (streaming mode)") - + return dataset + def evaluate_prompt(prompt, dataset, config, num_samples): """Evaluate a prompt on a subset of the dataset.""" - input_field = config['input_field'] - target_field = config['target_field'] - + input_field = config["input_field"] + target_field = config["target_field"] + # Check dataset type - dataset_name = config.get('dataset_name', '').lower() - is_emotion = 'emotion' in dataset_name - is_gsm8k = 'gsm8k' in dataset_name - is_hotpotqa = config.get('is_hotpotqa', False) - is_ifeval = config.get('is_ifeval', False) - is_hover = config.get('is_hover', False) - + dataset_name = config.get("dataset_name", "").lower() + is_emotion = "emotion" in dataset_name + is_gsm8k = "gsm8k" in dataset_name + is_hotpotqa = config.get("is_hotpotqa", False) + is_ifeval = config.get("is_ifeval", False) + is_hover = config.get("is_hover", False) + # Sample from dataset - handle both streaming and non-streaming - if hasattr(dataset, 'take'): + if hasattr(dataset, "take"): # Streaming dataset samples = dataset.take(num_samples) sample_iter = tqdm(samples, desc=f"Evaluating {num_samples} samples", total=num_samples) @@ -210,22 +234,24 @@ def evaluate_prompt(prompt, dataset, config, num_samples): indices = range(min(num_samples, len(dataset))) samples = dataset.select(indices) sample_iter = tqdm(samples, desc=f"Evaluating {num_samples} samples") - + correct = 0 total = 0 - + for example in sample_iter: input_text = example[input_field] expected = example[target_field] - + # Prepare the prompt with appropriate formatting if is_hotpotqa: # Format context from paragraphs - context_items = example.get('context', {}) + context_items = example.get("context", {}) context_text = "" - if 'title' in context_items and 'sentences' in context_items: + if "title" in context_items and "sentences" in context_items: # Handle the specific structure of HotpotQA - for i, (title, sentences) in enumerate(zip(context_items['title'], context_items['sentences'])): + for i, (title, sentences) in enumerate( + zip(context_items["title"], context_items["sentences"]) + ): context_text += f"Paragraph {i+1} ({title}):\n" context_text += " ".join(sentences) + "\n\n" formatted_prompt = prompt.format(context=context_text.strip(), question=input_text) @@ -238,12 +264,10 @@ def evaluate_prompt(prompt, dataset, config, num_samples): else: # Default formatting for other datasets formatted_prompt = prompt.format(input_text=input_text) - + # Prepare the message for the LLM - messages = [ - {"role": "user", "content": formatted_prompt} - ] - + messages = [{"role": "user", "content": formatted_prompt}] + # Call the LLM with retry logic for attempt in range(MAX_RETRIES): try: @@ -252,7 +276,7 @@ def evaluate_prompt(prompt, dataset, config, num_samples): model=TASK_MODEL_NAME, messages=messages, temperature=0.1, # Low temperature for consistent results - max_tokens=MAX_TOKENS + max_tokens=MAX_TOKENS, ) break except Exception as e: @@ -260,134 +284,144 @@ def evaluate_prompt(prompt, dataset, config, num_samples): print(f"Failed to get response after {MAX_RETRIES} attempts: {e}") raise e time.sleep(1) - + # Handle potential None response if not response: print(f"Warning: No response object from LLM") total += 1 # Count as incorrect continue - + if not response.choices: 
print(f"Warning: No choices in response from LLM") total += 1 # Count as incorrect continue - + if not response.choices[0].message: print(f"Warning: No message in response choice") total += 1 # Count as incorrect continue - + output_text = response.choices[0].message.content if output_text is None: print(f"Warning: None content in LLM response") print(f"Full response: {response}") total += 1 # Count as incorrect continue - + output_text = output_text.strip() - + # Extract prediction from output try: if is_gsm8k: # For GSM8K, extract the numeric answer after #### # First, extract the expected answer from the ground truth - expected_answer = expected.split('####')[-1].strip() + expected_answer = expected.split("####")[-1].strip() try: - expected_number = float(expected_answer.replace(',', '')) + expected_number = float(expected_answer.replace(",", "")) except: print(f"Warning: Could not parse expected answer: {expected_answer}") total += 1 continue - + # Extract prediction from model output prediction = None - if '####' in output_text: - predicted_answer = output_text.split('####')[-1].strip() + if "####" in output_text: + predicted_answer = output_text.split("####")[-1].strip() # Extract just the number, removing any extra text like $ signs import re - numbers = re.findall(r'-?\$?[\d,]+\.?\d*', predicted_answer) + + numbers = re.findall(r"-?\$?[\d,]+\.?\d*", predicted_answer) if numbers: try: # Remove $ and , from the number - number_str = numbers[0].replace('$', '').replace(',', '') + number_str = numbers[0].replace("$", "").replace(",", "") prediction = float(number_str) except: pass - + # If we found a prediction, check if it matches if prediction is not None: # Check if answers match (with small tolerance for floats) if abs(prediction - expected_number) < 0.001: correct += 1 - + total += 1 continue # Skip the general case to avoid double counting - + elif is_hotpotqa: # For HotpotQA, do exact match comparison (case-insensitive) output_lower = output_text.lower().strip() expected_lower = str(expected).lower().strip() - + # Remove common punctuation for better matching - output_lower = output_lower.rstrip('.,!?;:') - expected_lower = expected_lower.rstrip('.,!?;:') - + output_lower = output_lower.rstrip(".,!?;:") + expected_lower = expected_lower.rstrip(".,!?;:") + if output_lower == expected_lower: correct += 1 elif expected_lower in output_lower: # Partial credit if answer is contained in response correct += 1 - + total += 1 continue - + elif is_ifeval: # For IFEval, we need more complex evaluation # For now, do basic keyword matching # Note: Full IFEval requires checking multiple constraints output_lower = output_text.lower() - + # Simple heuristic: check if response seems to follow instruction format if len(output_text.strip()) > 10: # Non-trivial response correct += 1 # Simplified - real IFEval needs constraint checking - + total += 1 continue - + elif is_hover: # For HoVer, check if prediction matches SUPPORTED/NOT_SUPPORTED output_upper = output_text.upper() expected_upper = str(expected).upper() - + # Look for the verdict in the output - if 'SUPPORTED' in output_upper and 'NOT' not in output_upper.replace('NOT SUPPORTED', ''): - prediction = 'SUPPORTED' - elif 'NOT SUPPORTED' in output_upper or 'NOT_SUPPORTED' in output_upper: - prediction = 'NOT_SUPPORTED' + if "SUPPORTED" in output_upper and "NOT" not in output_upper.replace( + "NOT SUPPORTED", "" + ): + prediction = "SUPPORTED" + elif "NOT SUPPORTED" in output_upper or "NOT_SUPPORTED" in output_upper: + prediction = 
"NOT_SUPPORTED" else: prediction = None - + if prediction == expected_upper: correct += 1 - + total += 1 continue - + elif is_emotion: # For emotion classification (0-5) - numbers = re.findall(r'\b[0-5]\b', output_text) + numbers = re.findall(r"\b[0-5]\b", output_text) if numbers: prediction = int(numbers[-1]) # Use the last number found else: # Try to infer from emotion keywords output_lower = output_text.lower() emotion_map = { - 'sadness': 0, 'sad': 0, - 'joy': 1, 'happy': 1, 'happiness': 1, - 'love': 2, - 'anger': 3, 'angry': 3, - 'fear': 4, 'afraid': 4, 'scared': 4, - 'surprise': 5, 'surprised': 5 + "sadness": 0, + "sad": 0, + "joy": 1, + "happy": 1, + "happiness": 1, + "love": 2, + "anger": 3, + "angry": 3, + "fear": 4, + "afraid": 4, + "scared": 4, + "surprise": 5, + "surprised": 5, } prediction = -1 for emotion, label in emotion_map.items(): @@ -396,168 +430,165 @@ def evaluate_prompt(prompt, dataset, config, num_samples): break else: # For sentiment classification (0-1) - numbers = re.findall(r'\b[01]\b', output_text) + numbers = re.findall(r"\b[01]\b", output_text) if numbers: prediction = int(numbers[-1]) # Use the last number found else: # Try to infer from keywords output_lower = output_text.lower() - if 'positive' in output_lower: + if "positive" in output_lower: prediction = 1 - elif 'negative' in output_lower: + elif "negative" in output_lower: prediction = 0 else: prediction = -1 # Invalid prediction - + if prediction == expected: correct += 1 - + total += 1 - + except Exception as e: print(f"Error parsing response '{output_text}': {e}") total += 1 # Count as incorrect - + accuracy = correct / total if total > 0 else 0.0 return accuracy, correct, total + def evaluate_stage1(prompt_path): """ Stage 1 evaluation: Quick evaluation with 10% of samples - + Args: prompt_path: Path to the prompt file - + Returns: Dictionary with combined_score metric """ - print('-' * 80) + print("-" * 80) print("Starting Stage 1 evaluation...") - print('-' * 80) - + print("-" * 80) + try: # Load prompt configuration config, prompt = load_prompt_config(prompt_path) print(f"Loaded prompt configuration") - + # Load dataset dataset = load_hf_dataset(config) - + # Get number of samples from config - num_samples = config.get('max_samples', 50) + num_samples = config.get("max_samples", 50) # Fixed to 10 samples for Stage 1 (quick evaluation) stage1_samples = 10 - + print(f"Stage 1: Evaluating {stage1_samples} samples...") - + # Run evaluation - accuracy, correct, total = evaluate_prompt( - prompt, dataset, config, stage1_samples - ) - + accuracy, correct, total = evaluate_prompt(prompt, dataset, config, stage1_samples) + print(f"Stage 1 accuracy: {accuracy:.3f} ({correct}/{total})") - print('-' * 80) - + print("-" * 80) + # Calculate custom features prompt_length, reasoning_strategy = calculate_prompt_features(prompt) print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}") - + return { "combined_score": accuracy, "prompt_length": prompt_length, - "reasoning_strategy": reasoning_strategy + "reasoning_strategy": reasoning_strategy, } - + except Exception as e: print(f"Stage 1 evaluation failed: {str(e)}") traceback.print_exc() - print('-' * 80) - + print("-" * 80) + # Always return feature dimensions, even on failure try: # Try to calculate features from the failed prompt - with open(prompt_path, 'r') as f: + with open(prompt_path, "r") as f: failed_prompt = f.read().strip() prompt_length, reasoning_strategy = calculate_prompt_features(failed_prompt) except: # 
Fallback values if prompt can't be read prompt_length, reasoning_strategy = 0, 0 - + return { "combined_score": 0.0, "prompt_length": prompt_length, "reasoning_strategy": reasoning_strategy, - "error": str(e) + "error": str(e), } def evaluate_stage2(prompt_path): """ Stage 2 evaluation: Full evaluation with all samples - + Args: prompt_path: Path to the prompt file - + Returns: Dictionary with combined_score metric """ - print('-' * 80) + print("-" * 80) print("Starting Stage 2 evaluation...") - print('-' * 80) - + print("-" * 80) + try: # Load prompt configuration config, prompt = load_prompt_config(prompt_path) print(f"Loaded prompt configuration") - + # Load dataset dataset = load_hf_dataset(config) - + # Get number of samples from config - num_samples = config.get('max_samples', 50) + num_samples = config.get("max_samples", 50) # Fixed to 40 samples for Stage 2 (comprehensive evaluation) stage2_samples = 40 - + print(f"Stage 2: Evaluating {stage2_samples} samples...") - + # Run evaluation - accuracy, correct, total = evaluate_prompt( - prompt, dataset, config, stage2_samples - ) - + accuracy, correct, total = evaluate_prompt(prompt, dataset, config, stage2_samples) + print(f"Stage 2 accuracy: {accuracy:.3f} ({correct}/{total})") - print('-' * 80) - + print("-" * 80) + # Calculate custom features prompt_length, reasoning_strategy = calculate_prompt_features(prompt) print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}") - + return { "combined_score": accuracy, "prompt_length": prompt_length, - "reasoning_strategy": reasoning_strategy + "reasoning_strategy": reasoning_strategy, } - + except Exception as e: print(f"Stage 2 evaluation failed: {str(e)}") traceback.print_exc() - print('-' * 80) - + print("-" * 80) + # Always return feature dimensions, even on failure try: # Try to calculate features from the failed prompt - with open(prompt_path, 'r') as f: + with open(prompt_path, "r") as f: failed_prompt = f.read().strip() prompt_length, reasoning_strategy = calculate_prompt_features(failed_prompt) except: # Fallback values if prompt can't be read prompt_length, reasoning_strategy = 0, 0 - + return { "combined_score": 0.0, "prompt_length": prompt_length, "reasoning_strategy": reasoning_strategy, - "error": str(e) + "error": str(e), } @@ -565,11 +596,11 @@ def evaluate(prompt_path): """ Main evaluation function - for backwards compatibility Calls evaluate_stage2 for full evaluation - + Args: prompt_path: Path to the prompt file - + Returns: Dictionary with combined_score metric """ - return evaluate_stage2(prompt_path) \ No newline at end of file + return evaluate_stage2(prompt_path) diff --git a/examples/mlx_metal_kernel_opt/run_benchmarks.py b/examples/mlx_metal_kernel_opt/run_benchmarks.py index 3095a8523..bc7c5fc2b 100644 --- a/examples/mlx_metal_kernel_opt/run_benchmarks.py +++ b/examples/mlx_metal_kernel_opt/run_benchmarks.py @@ -457,7 +457,9 @@ def print_comparison_summary(comparison_results): print(f" ā±ļø Average Time Reduction: {summary['avg_time_reduction_pct']:+.2f}%") print(f"\nšŸ“Š ABSOLUTE PERFORMANCE:") - print(f" šŸ”µ Standard MLX-LM: {summary['avg_standard_decode_speed']:.1f} tokens/sec average") + print( + f" šŸ”µ Standard MLX-LM: {summary['avg_standard_decode_speed']:.1f} tokens/sec average" + ) print( f" 🟠 Metal Kernel Optimized: {summary['avg_optimized_decode_speed']:.1f} tokens/sec average" ) diff --git a/examples/r_robust_regression/evaluator.py b/examples/r_robust_regression/evaluator.py index 66b9577b1..12527478e 100644 --- 
a/examples/r_robust_regression/evaluator.py +++ b/examples/r_robust_regression/evaluator.py @@ -19,7 +19,7 @@ async def evaluate(program_path: str) -> EvaluationResult: """ Evaluate an R program implementing robust regression. - + Tests the program on synthetic data with outliers to measure: - Accuracy (MSE, MAE, R-squared) - Robustness to outliers @@ -33,7 +33,7 @@ async def evaluate(program_path: str) -> EvaluationResult: generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.2, noise=0.1), generate_regression_data(n_samples=200, n_features=5, outlier_fraction=0.15, noise=0.2), ] - + total_score = 0 total_mse = 0 total_mae = 0 @@ -41,13 +41,14 @@ async def evaluate(program_path: str) -> EvaluationResult: total_r_squared = 0 total_outlier_robustness = 0 total_time = 0 - + artifacts = {"test_results": []} - + for i, (X, y, true_coeffs) in enumerate(test_cases): # Create a temporary R script that sources the program and runs it - with tempfile.NamedTemporaryFile(mode='w', suffix='.r', delete=False) as f: - f.write(f""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".r", delete=False) as f: + f.write( + f""" # Source the program source("{program_path}") @@ -66,115 +67,109 @@ async def evaluate(program_path: str) -> EvaluationResult: # Save results write(jsonlite::toJSON(metrics, auto_unbox=TRUE), "results.json") -""") +""" + ) test_script = f.name - + # Save test data to temporary CSV files - X_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) - y_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) - np.savetxt(X_file.name, X, delimiter=',', fmt='%.6f') - np.savetxt(y_file.name, y, delimiter=',', fmt='%.6f') + X_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) + y_file = tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) + np.savetxt(X_file.name, X, delimiter=",", fmt="%.6f") + np.savetxt(y_file.name, y, delimiter=",", fmt="%.6f") X_file.close() y_file.close() - + # Run the R script try: result = subprocess.run( - ['Rscript', test_script], + ["Rscript", test_script], capture_output=True, text=True, timeout=30, - cwd=os.path.dirname(test_script) + cwd=os.path.dirname(test_script), ) - + if result.returncode != 0: - artifacts["test_results"].append({ - "test_case": i, - "error": "R execution failed", - "stderr": result.stderr - }) + artifacts["test_results"].append( + {"test_case": i, "error": "R execution failed", "stderr": result.stderr} + ) continue - + # Read results - results_path = os.path.join(os.path.dirname(test_script), 'results.json') + results_path = os.path.join(os.path.dirname(test_script), "results.json") if not os.path.exists(results_path): - artifacts["test_results"].append({ - "test_case": i, - "error": "No results file produced" - }) + artifacts["test_results"].append( + {"test_case": i, "error": "No results file produced"} + ) continue - - with open(results_path, 'r') as f: + + with open(results_path, "r") as f: metrics = json.load(f) - + # Calculate case score (emphasize robustness for cases with outliers) outlier_fraction = [0.0, 0.1, 0.2, 0.15][i] if outlier_fraction > 0: # For cases with outliers, prioritize robust metrics case_score = ( - 0.2 * (1 - min(metrics.get('mse', 1), 1)) + - 0.3 * (1 - min(metrics.get('medae', 1), 1)) + - 0.4 * metrics.get('outlier_robustness', 0) + - 0.1 * max(0, metrics.get('r_squared', 0)) + 0.2 * (1 - min(metrics.get("mse", 1), 1)) + + 0.3 * (1 - min(metrics.get("medae", 1), 1)) + + 0.4 * metrics.get("outlier_robustness", 0) + + 
0.1 * max(0, metrics.get("r_squared", 0)) ) else: # For clean data, prioritize accuracy case_score = ( - 0.4 * (1 - min(metrics.get('mse', 1), 1)) + - 0.3 * (1 - min(metrics.get('mae', 1), 1)) + - 0.2 * max(0, metrics.get('r_squared', 0)) + - 0.1 * metrics.get('outlier_robustness', 0) + 0.4 * (1 - min(metrics.get("mse", 1), 1)) + + 0.3 * (1 - min(metrics.get("mae", 1), 1)) + + 0.2 * max(0, metrics.get("r_squared", 0)) + + 0.1 * metrics.get("outlier_robustness", 0) ) - + total_score += case_score - total_mse += metrics.get('mse', 1) - total_mae += metrics.get('mae', 1) - total_medae += metrics.get('medae', 1) - total_r_squared += max(0, metrics.get('r_squared', 0)) - total_outlier_robustness += metrics.get('outlier_robustness', 0) - total_time += metrics.get('execution_time', 1) - - artifacts["test_results"].append({ - "test_case": i, - "outlier_fraction": outlier_fraction, - "metrics": metrics, - "case_score": case_score - }) - + total_mse += metrics.get("mse", 1) + total_mae += metrics.get("mae", 1) + total_medae += metrics.get("medae", 1) + total_r_squared += max(0, metrics.get("r_squared", 0)) + total_outlier_robustness += metrics.get("outlier_robustness", 0) + total_time += metrics.get("execution_time", 1) + + artifacts["test_results"].append( + { + "test_case": i, + "outlier_fraction": outlier_fraction, + "metrics": metrics, + "case_score": case_score, + } + ) + except subprocess.TimeoutExpired: - artifacts["test_results"].append({ - "test_case": i, - "error": "Timeout" - }) + artifacts["test_results"].append({"test_case": i, "error": "Timeout"}) except Exception as e: - artifacts["test_results"].append({ - "test_case": i, - "error": str(e) - }) + artifacts["test_results"].append({"test_case": i, "error": str(e)}) finally: # Cleanup os.unlink(test_script) os.unlink(X_file.name) os.unlink(y_file.name) - if os.path.exists(os.path.join(os.path.dirname(test_script), 'results.json')): - os.unlink(os.path.join(os.path.dirname(test_script), 'results.json')) - + if os.path.exists(os.path.join(os.path.dirname(test_script), "results.json")): + os.unlink(os.path.join(os.path.dirname(test_script), "results.json")) + # Calculate average metrics n_successful = len([r for r in artifacts["test_results"] if "error" not in r]) if n_successful == 0: return EvaluationResult( metrics={ "score": 0.0, - "mse": float('inf'), - "mae": float('inf'), - "medae": float('inf'), + "mse": float("inf"), + "mae": float("inf"), + "medae": float("inf"), "r_squared": 0.0, "outlier_robustness": 0.0, - "execution_time": float('inf') + "execution_time": float("inf"), }, - artifacts=artifacts + artifacts=artifacts, ) - + avg_score = total_score / n_successful avg_mse = total_mse / n_successful avg_mae = total_mae / n_successful @@ -182,11 +177,11 @@ async def evaluate(program_path: str) -> EvaluationResult: avg_r_squared = total_r_squared / n_successful avg_outlier_robustness = total_outlier_robustness / n_successful avg_time = total_time / n_successful - + # Add efficiency bonus for faster execution efficiency_bonus = max(0, 1 - avg_time) * 0.1 final_score = min(1.0, avg_score + efficiency_bonus) - + return EvaluationResult( metrics={ "score": final_score, @@ -195,54 +190,57 @@ async def evaluate(program_path: str) -> EvaluationResult: "medae": avg_medae, "r_squared": avg_r_squared, "outlier_robustness": avg_outlier_robustness, - "execution_time": avg_time + "execution_time": avg_time, }, - artifacts=artifacts + artifacts=artifacts, ) - + except Exception as e: return EvaluationResult( metrics={ "score": 0.0, - "mse": 
float('inf'), - "mae": float('inf'), - "medae": float('inf'), + "mse": float("inf"), + "mae": float("inf"), + "medae": float("inf"), "r_squared": 0.0, "outlier_robustness": 0.0, - "execution_time": float('inf') + "execution_time": float("inf"), }, - artifacts={"error": str(e), "type": "evaluation_error"} + artifacts={"error": str(e), "type": "evaluation_error"}, ) def generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.1, noise=0.1): """Generate synthetic regression data with outliers.""" np.random.seed(42) - + # Generate features X = np.random.randn(n_samples, n_features) - + # True coefficients true_coeffs = np.random.randn(n_features + 1) # +1 for intercept - + # Generate target values y = true_coeffs[0] + X @ true_coeffs[1:] + noise * np.random.randn(n_samples) - + # Add outliers n_outliers = int(n_samples * outlier_fraction) if n_outliers > 0: outlier_indices = np.random.choice(n_samples, n_outliers, replace=False) # Make outliers by adding large errors - y[outlier_indices] += np.random.choice([-1, 1], n_outliers) * np.random.uniform(3, 10, n_outliers) - + y[outlier_indices] += np.random.choice([-1, 1], n_outliers) * np.random.uniform( + 3, 10, n_outliers + ) + return X, y, true_coeffs # For testing if __name__ == "__main__": import sys + if len(sys.argv) > 1: result = asyncio.run(evaluate(sys.argv[1])) print(f"Score: {result.metrics['score']:.4f}") print(f"MSE: {result.metrics['mse']:.4f}") - print(f"Outlier Robustness: {result.metrics['outlier_robustness']:.4f}") \ No newline at end of file + print(f"Outlier Robustness: {result.metrics['outlier_robustness']:.4f}") diff --git a/examples/rust_adaptive_sort/evaluator.py b/examples/rust_adaptive_sort/evaluator.py index 1a2d7005a..65f4ae290 100644 --- a/examples/rust_adaptive_sort/evaluator.py +++ b/examples/rust_adaptive_sort/evaluator.py @@ -19,7 +19,7 @@ async def evaluate(program_path: str) -> EvaluationResult: """ Evaluate a Rust sorting algorithm implementation. 
- + Tests the algorithm on various data patterns to measure: - Correctness - Performance (speed) @@ -30,27 +30,27 @@ async def evaluate(program_path: str) -> EvaluationResult: # Create a temporary Rust project with tempfile.TemporaryDirectory() as temp_dir: project_dir = Path(temp_dir) / "sort_test" - + # Initialize Cargo project result = subprocess.run( ["cargo", "init", "--name", "sort_test", str(project_dir)], capture_output=True, - text=True + text=True, ) - + if result.returncode != 0: return EvaluationResult( metrics={"score": 0.0, "compile_success": 0.0}, - artifacts={"error": "Failed to create Cargo project", "stderr": result.stderr} + artifacts={"error": "Failed to create Cargo project", "stderr": result.stderr}, ) - + # Copy the program to src/lib.rs lib_path = project_dir / "src" / "lib.rs" - with open(program_path, 'r') as src: + with open(program_path, "r") as src: lib_content = src.read() - with open(lib_path, 'w') as dst: + with open(lib_path, "w") as dst: dst.write(lib_content) - + # Create main.rs with benchmark code main_content = """ use sort_test::{adaptive_sort, run_benchmark}; @@ -170,18 +170,18 @@ async def evaluate(program_path: str) -> EvaluationResult: } """ main_path = project_dir / "src" / "main.rs" - with open(main_path, 'w') as f: + with open(main_path, "w") as f: f.write(main_content) - + # Build the project build_result = subprocess.run( ["cargo", "build", "--release"], cwd=project_dir, capture_output=True, text=True, - timeout=60 + timeout=60, ) - + if build_result.returncode != 0: # Extract compilation errors return EvaluationResult( @@ -190,24 +190,24 @@ async def evaluate(program_path: str) -> EvaluationResult: "compile_success": 0.0, "correctness": 0.0, "performance_score": 0.0, - "adaptability_score": 0.0 + "adaptability_score": 0.0, }, artifacts={ "error": "Compilation failed", "stderr": build_result.stderr, - "stdout": build_result.stdout - } + "stdout": build_result.stdout, + }, ) - + # Run the benchmark run_result = subprocess.run( ["cargo", "run", "--release"], cwd=project_dir, capture_output=True, text=True, - timeout=30 + timeout=30, ) - + if run_result.returncode != 0: return EvaluationResult( metrics={ @@ -215,41 +215,35 @@ async def evaluate(program_path: str) -> EvaluationResult: "compile_success": 1.0, "correctness": 0.0, "performance_score": 0.0, - "adaptability_score": 0.0 + "adaptability_score": 0.0, }, - artifacts={ - "error": "Runtime error", - "stderr": run_result.stderr - } + artifacts={"error": "Runtime error", "stderr": run_result.stderr}, ) - + # Parse JSON output try: # Find JSON in output (between first { and last }) output = run_result.stdout - start = output.find('{') - end = output.rfind('}') + 1 + start = output.find("{") + end = output.rfind("}") + 1 json_str = output[start:end] - + results = json.loads(json_str) - + # Calculate overall score - correctness = results['correctness'] - performance = results['performance_score'] - adaptability = results['adaptability_score'] - + correctness = results["correctness"] + performance = results["performance_score"] + adaptability = results["adaptability_score"] + # Weighted score (correctness is mandatory) if correctness < 1.0: overall_score = 0.0 else: - overall_score = ( - 0.6 * performance + - 0.4 * adaptability - ) - + overall_score = 0.6 * performance + 0.4 * adaptability + # Check for memory safety (basic check via valgrind if available) memory_safe = 1.0 # Rust is memory safe by default - + return EvaluationResult( metrics={ "score": overall_score, @@ -257,16 +251,16 @@ async 
def evaluate(program_path: str) -> EvaluationResult: "correctness": correctness, "performance_score": performance, "adaptability_score": adaptability, - "avg_time": results['avg_time'], - "memory_safe": memory_safe + "avg_time": results["avg_time"], + "memory_safe": memory_safe, }, artifacts={ - "times": results['times'], - "all_correct": results['all_correct'], - "build_output": build_result.stdout - } + "times": results["times"], + "all_correct": results["all_correct"], + "build_output": build_result.stdout, + }, ) - + except (json.JSONDecodeError, KeyError) as e: return EvaluationResult( metrics={ @@ -274,14 +268,14 @@ async def evaluate(program_path: str) -> EvaluationResult: "compile_success": 1.0, "correctness": 0.0, "performance_score": 0.0, - "adaptability_score": 0.0 + "adaptability_score": 0.0, }, artifacts={ "error": f"Failed to parse results: {str(e)}", - "stdout": run_result.stdout - } + "stdout": run_result.stdout, + }, ) - + except subprocess.TimeoutExpired: return EvaluationResult( metrics={ @@ -289,9 +283,9 @@ async def evaluate(program_path: str) -> EvaluationResult: "compile_success": 0.0, "correctness": 0.0, "performance_score": 0.0, - "adaptability_score": 0.0 + "adaptability_score": 0.0, }, - artifacts={"error": "Timeout during evaluation"} + artifacts={"error": "Timeout during evaluation"}, ) except Exception as e: return EvaluationResult( @@ -300,18 +294,19 @@ async def evaluate(program_path: str) -> EvaluationResult: "compile_success": 0.0, "correctness": 0.0, "performance_score": 0.0, - "adaptability_score": 0.0 + "adaptability_score": 0.0, }, - artifacts={"error": str(e), "type": "evaluation_error"} + artifacts={"error": str(e), "type": "evaluation_error"}, ) # For testing if __name__ == "__main__": import sys + if len(sys.argv) > 1: result = asyncio.run(evaluate(sys.argv[1])) print(f"Score: {result.metrics['score']:.4f}") print(f"Correctness: {result.metrics['correctness']:.4f}") print(f"Performance: {result.metrics['performance_score']:.4f}") - print(f"Adaptability: {result.metrics['adaptability_score']:.4f}") \ No newline at end of file + print(f"Adaptability: {result.metrics['adaptability_score']:.4f}") diff --git a/examples/signal_processing/evaluator.py b/examples/signal_processing/evaluator.py index 67911002b..6abe2021a 100644 --- a/examples/signal_processing/evaluator.py +++ b/examples/signal_processing/evaluator.py @@ -47,86 +47,86 @@ def safe_float(value): def calculate_slope_changes(signal_data): """ Calculate slope change penalty S(Īø) - counts directional reversals - + Args: signal_data: 1D array of signal values - + Returns: Number of slope changes (directional reversals) """ if len(signal_data) < 3: return 0 - + # Calculate differences diffs = np.diff(signal_data) - + # Count sign changes in consecutive differences sign_changes = 0 for i in range(1, len(diffs)): - if np.sign(diffs[i]) != np.sign(diffs[i-1]) and diffs[i-1] != 0: + if np.sign(diffs[i]) != np.sign(diffs[i - 1]) and diffs[i - 1] != 0: sign_changes += 1 - + return sign_changes def calculate_lag_error(filtered_signal, original_signal, window_size): """ Calculate instantaneous lag error L_recent(Īø) = |y[n] - x[n]| - + Args: filtered_signal: Output of the filter original_signal: Original input signal window_size: Size of the processing window - + Returns: Instantaneous lag error at the most recent sample """ if len(filtered_signal) == 0: return 1.0 # Maximum penalty - + # Account for processing delay delay = window_size - 1 if len(original_signal) <= delay: return 1.0 - + # Compare the 
last filtered sample with the corresponding original sample recent_filtered = filtered_signal[-1] recent_original = original_signal[delay + len(filtered_signal) - 1] - + return abs(recent_filtered - recent_original) def calculate_average_tracking_error(filtered_signal, original_signal, window_size): """ Calculate average tracking error L_avg(Īø) over the window - + Args: filtered_signal: Output of the filter original_signal: Original input signal window_size: Size of the processing window - + Returns: Average absolute error over the processed samples """ if len(filtered_signal) == 0: return 1.0 # Maximum penalty - + # Account for processing delay delay = window_size - 1 if len(original_signal) <= delay: return 1.0 - + # Align signals - aligned_original = original_signal[delay:delay + len(filtered_signal)] - + aligned_original = original_signal[delay : delay + len(filtered_signal)] + # Ensure same length min_length = min(len(filtered_signal), len(aligned_original)) if min_length == 0: return 1.0 - + filtered_aligned = filtered_signal[:min_length] original_aligned = aligned_original[:min_length] - + # Calculate mean absolute error return np.mean(np.abs(filtered_aligned - original_aligned)) @@ -134,78 +134,83 @@ def calculate_average_tracking_error(filtered_signal, original_signal, window_si def calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size): """ Calculate false reversal penalty R(Īø) - mismatched trend changes - + Args: filtered_signal: Output of the filter clean_signal: Ground truth clean signal window_size: Size of the processing window - + Returns: Penalty for trend changes that don't match the clean signal """ if len(filtered_signal) < 3 or len(clean_signal) < 3: return 0 - + # Account for processing delay delay = window_size - 1 if len(clean_signal) <= delay: return 1.0 - + # Align signals - aligned_clean = clean_signal[delay:delay + len(filtered_signal)] + aligned_clean = clean_signal[delay : delay + len(filtered_signal)] min_length = min(len(filtered_signal), len(aligned_clean)) - + if min_length < 3: return 0 - + filtered_aligned = filtered_signal[:min_length] clean_aligned = aligned_clean[:min_length] - + # Calculate trend changes for both signals filtered_diffs = np.diff(filtered_aligned) clean_diffs = np.diff(clean_aligned) - + # Count mismatched trend changes false_reversals = 0 for i in range(1, len(filtered_diffs)): # Check if there's a trend change in filtered signal - filtered_change = (np.sign(filtered_diffs[i]) != np.sign(filtered_diffs[i-1]) - and filtered_diffs[i-1] != 0) - + filtered_change = ( + np.sign(filtered_diffs[i]) != np.sign(filtered_diffs[i - 1]) + and filtered_diffs[i - 1] != 0 + ) + # Check if there's a corresponding trend change in clean signal - clean_change = (np.sign(clean_diffs[i]) != np.sign(clean_diffs[i-1]) - and clean_diffs[i-1] != 0) - + clean_change = ( + np.sign(clean_diffs[i]) != np.sign(clean_diffs[i - 1]) and clean_diffs[i - 1] != 0 + ) + # Count as false reversal if filtered has change but clean doesn't if filtered_change and not clean_change: false_reversals += 1 - + return false_reversals def calculate_composite_score(S, L_recent, L_avg, R, alpha=[0.3, 0.2, 0.2, 0.3]): """ Calculate the composite metric J(Īø) = α₁·S(Īø) + α₂·L_recent(Īø) + Ī±ā‚ƒĀ·L_avg(Īø) + α₄·R(Īø) - + All metrics are normalized and converted to penalties (higher = worse) The final score is converted to a maximization problem (higher = better) """ # Normalize slope changes (typical range 0-100) S_norm = min(S / 50.0, 2.0) - + # Lag errors are 
already in reasonable range (0-10 typically) L_recent_norm = min(L_recent, 2.0) L_avg_norm = min(L_avg, 2.0) - + # Normalize false reversals (typical range 0-50) R_norm = min(R / 25.0, 2.0) - + # Calculate weighted penalty - penalty = alpha[0] * S_norm + alpha[1] * L_recent_norm + alpha[2] * L_avg_norm + alpha[3] * R_norm - + penalty = ( + alpha[0] * S_norm + alpha[1] * L_recent_norm + alpha[2] * L_avg_norm + alpha[3] * R_norm + ) + # Convert to maximization score (higher is better) score = 1.0 / (1.0 + penalty) - + return score @@ -214,43 +219,47 @@ def generate_test_signals(num_signals=5): Generate multiple test signals with different characteristics """ test_signals = [] - + for i in range(num_signals): np.random.seed(42 + i) # Different seed for each signal length = 500 + i * 100 # Varying lengths noise_level = 0.2 + i * 0.1 # Varying noise levels - + t = np.linspace(0, 10, length) - + # Different signal characteristics if i == 0: # Smooth sinusoidal with trend clean = 2 * np.sin(2 * np.pi * 0.5 * t) + 0.1 * t elif i == 1: # Multiple frequency components - clean = (np.sin(2 * np.pi * 0.5 * t) + - 0.5 * np.sin(2 * np.pi * 2 * t) + - 0.2 * np.sin(2 * np.pi * 5 * t)) + clean = ( + np.sin(2 * np.pi * 0.5 * t) + + 0.5 * np.sin(2 * np.pi * 2 * t) + + 0.2 * np.sin(2 * np.pi * 5 * t) + ) elif i == 2: # Non-stationary with changing frequency clean = np.sin(2 * np.pi * (0.5 + 0.2 * t) * t) elif i == 3: # Step changes - clean = np.concatenate([ - np.ones(length//3), - 2 * np.ones(length//3), - 0.5 * np.ones(length - 2*(length//3)) - ]) + clean = np.concatenate( + [ + np.ones(length // 3), + 2 * np.ones(length // 3), + 0.5 * np.ones(length - 2 * (length // 3)), + ] + ) else: # Random walk with trend clean = np.cumsum(np.random.randn(length) * 0.1) + 0.05 * t - + # Add noise noise = np.random.normal(0, noise_level, length) noisy = clean + noise - + test_signals.append((noisy, clean)) - + return test_signals @@ -267,120 +276,120 @@ def evaluate(program_path): # Check if required function exists if not hasattr(program, "run_signal_processing"): - return { - "composite_score": 0.0, - "error": "Missing run_signal_processing function" - } + return {"composite_score": 0.0, "error": "Missing run_signal_processing function"} # Generate test signals test_signals = generate_test_signals(5) - + # Collect metrics across all test signals all_scores = [] all_metrics = [] successful_runs = 0 - + for i, (noisy_signal, clean_signal) in enumerate(test_signals): try: # Run the algorithm with timeout start_time = time.time() - + # Call the program's main function result = run_with_timeout( program.run_signal_processing, kwargs={ - 'signal_length': len(noisy_signal), - 'noise_level': 0.3, - 'window_size': 20 + "signal_length": len(noisy_signal), + "noise_level": 0.3, + "window_size": 20, }, - timeout_seconds=10 + timeout_seconds=10, ) - + execution_time = time.time() - start_time - + # Validate result format if not isinstance(result, dict): print(f"Signal {i}: Invalid result format") continue - - if 'filtered_signal' not in result: + + if "filtered_signal" not in result: print(f"Signal {i}: Missing filtered_signal in result") continue - - filtered_signal = result['filtered_signal'] - + + filtered_signal = result["filtered_signal"] + if len(filtered_signal) == 0: print(f"Signal {i}: Empty filtered signal") continue - + # Convert to numpy arrays filtered_signal = np.array(filtered_signal) - + # Calculate metrics using the generated test signal window_size = 20 - + # Calculate all penalty components S = 
calculate_slope_changes(filtered_signal) L_recent = calculate_lag_error(filtered_signal, noisy_signal, window_size) L_avg = calculate_average_tracking_error(filtered_signal, noisy_signal, window_size) R = calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size) - + # Calculate composite score composite_score = calculate_composite_score(S, L_recent, L_avg, R) - + # Additional quality metrics correlation = 0.0 noise_reduction = 0.0 - + try: # Calculate correlation with clean signal delay = window_size - 1 - aligned_clean = clean_signal[delay:delay + len(filtered_signal)] + aligned_clean = clean_signal[delay : delay + len(filtered_signal)] min_length = min(len(filtered_signal), len(aligned_clean)) - + if min_length > 1: - corr_result = pearsonr(filtered_signal[:min_length], aligned_clean[:min_length]) + corr_result = pearsonr( + filtered_signal[:min_length], aligned_clean[:min_length] + ) correlation = corr_result[0] if not np.isnan(corr_result[0]) else 0.0 - + # Calculate noise reduction - aligned_noisy = noisy_signal[delay:delay + len(filtered_signal)] + aligned_noisy = noisy_signal[delay : delay + len(filtered_signal)] aligned_noisy = aligned_noisy[:min_length] aligned_clean = aligned_clean[:min_length] - + if min_length > 0: noise_before = np.var(aligned_noisy - aligned_clean) noise_after = np.var(filtered_signal[:min_length] - aligned_clean) - noise_reduction = ((noise_before - noise_after) / noise_before - if noise_before > 0 else 0) + noise_reduction = ( + (noise_before - noise_after) / noise_before if noise_before > 0 else 0 + ) noise_reduction = max(0, noise_reduction) # Ensure non-negative - + except Exception as e: print(f"Signal {i}: Error calculating additional metrics: {e}") - + # Store metrics metrics = { - 'slope_changes': safe_float(S), - 'lag_error': safe_float(L_recent), - 'avg_error': safe_float(L_avg), - 'false_reversals': safe_float(R), - 'composite_score': safe_float(composite_score), - 'correlation': safe_float(correlation), - 'noise_reduction': safe_float(noise_reduction), - 'execution_time': safe_float(execution_time), - 'signal_length': len(filtered_signal) + "slope_changes": safe_float(S), + "lag_error": safe_float(L_recent), + "avg_error": safe_float(L_avg), + "false_reversals": safe_float(R), + "composite_score": safe_float(composite_score), + "correlation": safe_float(correlation), + "noise_reduction": safe_float(noise_reduction), + "execution_time": safe_float(execution_time), + "signal_length": len(filtered_signal), } - + all_scores.append(composite_score) all_metrics.append(metrics) successful_runs += 1 - + except TimeoutError: print(f"Signal {i}: Timeout") continue except Exception as e: print(f"Signal {i}: Error - {str(e)}") continue - + # If no successful runs, return minimal scores if successful_runs == 0: return { @@ -392,35 +401,35 @@ def evaluate(program_path): "correlation": 0.0, "noise_reduction": 0.0, "success_rate": 0.0, - "error": "All test signals failed" + "error": "All test signals failed", } - + # Calculate aggregate metrics avg_composite_score = np.mean(all_scores) - avg_slope_changes = np.mean([m['slope_changes'] for m in all_metrics]) - avg_lag_error = np.mean([m['lag_error'] for m in all_metrics]) - avg_avg_error = np.mean([m['avg_error'] for m in all_metrics]) - avg_false_reversals = np.mean([m['false_reversals'] for m in all_metrics]) - avg_correlation = np.mean([m['correlation'] for m in all_metrics]) - avg_noise_reduction = np.mean([m['noise_reduction'] for m in all_metrics]) - avg_execution_time = 
np.mean([m['execution_time'] for m in all_metrics]) + avg_slope_changes = np.mean([m["slope_changes"] for m in all_metrics]) + avg_lag_error = np.mean([m["lag_error"] for m in all_metrics]) + avg_avg_error = np.mean([m["avg_error"] for m in all_metrics]) + avg_false_reversals = np.mean([m["false_reversals"] for m in all_metrics]) + avg_correlation = np.mean([m["correlation"] for m in all_metrics]) + avg_noise_reduction = np.mean([m["noise_reduction"] for m in all_metrics]) + avg_execution_time = np.mean([m["execution_time"] for m in all_metrics]) success_rate = successful_runs / len(test_signals) - + # Calculate additional derived scores smoothness_score = 1.0 / (1.0 + avg_slope_changes / 20.0) # Higher is better responsiveness_score = 1.0 / (1.0 + avg_lag_error) # Higher is better accuracy_score = max(0, avg_correlation) # 0-1, higher is better efficiency_score = min(1.0, 1.0 / max(0.001, avg_execution_time)) # Speed bonus - + # Overall score combining multiple factors overall_score = ( - 0.4 * avg_composite_score + # Primary metric - 0.2 * smoothness_score + # Smoothness - 0.2 * accuracy_score + # Correlation with clean signal - 0.1 * avg_noise_reduction + # Noise reduction capability - 0.1 * success_rate # Reliability + 0.4 * avg_composite_score # Primary metric + + 0.2 * smoothness_score # Smoothness + + 0.2 * accuracy_score # Correlation with clean signal + + 0.1 * avg_noise_reduction # Noise reduction capability + + 0.1 * success_rate # Reliability ) - + return { "composite_score": safe_float(avg_composite_score), "overall_score": safe_float(overall_score), # Primary selection metric @@ -435,17 +444,13 @@ def evaluate(program_path): "accuracy_score": safe_float(accuracy_score), "efficiency_score": safe_float(efficiency_score), "execution_time": safe_float(avg_execution_time), - "success_rate": safe_float(success_rate) + "success_rate": safe_float(success_rate), } - + except Exception as e: print(f"Evaluation failed: {str(e)}") print(traceback.format_exc()) - return { - "composite_score": 0.0, - "overall_score": 0.0, - "error": str(e) - } + return {"composite_score": 0.0, "overall_score": 0.0, "error": str(e)} def evaluate_stage1(program_path): @@ -460,62 +465,44 @@ def evaluate_stage1(program_path): # Check if required function exists if not hasattr(program, "run_signal_processing"): - return { - "runs_successfully": 0.0, - "error": "Missing run_signal_processing function" - } + return {"runs_successfully": 0.0, "error": "Missing run_signal_processing function"} # Quick test with small signal try: result = run_with_timeout( program.run_signal_processing, - kwargs={'signal_length': 100, 'noise_level': 0.3, 'window_size': 10}, - timeout_seconds=5 + kwargs={"signal_length": 100, "noise_level": 0.3, "window_size": 10}, + timeout_seconds=5, ) - - if isinstance(result, dict) and 'filtered_signal' in result: - filtered_signal = result['filtered_signal'] + + if isinstance(result, dict) and "filtered_signal" in result: + filtered_signal = result["filtered_signal"] if len(filtered_signal) > 0: # Quick quality check composite_score = 0.5 # Baseline score for working programs - + # Bonus for reasonable output length expected_length = 100 - 10 + 1 # signal_length - window_size + 1 if len(filtered_signal) == expected_length: composite_score += 0.2 - + return { "runs_successfully": 1.0, "composite_score": composite_score, - "output_length": len(filtered_signal) + "output_length": len(filtered_signal), } else: - return { - "runs_successfully": 0.5, - "error": "Empty filtered signal" - } + return 
{"runs_successfully": 0.5, "error": "Empty filtered signal"} else: - return { - "runs_successfully": 0.3, - "error": "Invalid result format" - } - + return {"runs_successfully": 0.3, "error": "Invalid result format"} + except TimeoutError: - return { - "runs_successfully": 0.0, - "error": "Timeout in stage 1" - } + return {"runs_successfully": 0.0, "error": "Timeout in stage 1"} except Exception as e: - return { - "runs_successfully": 0.0, - "error": f"Stage 1 error: {str(e)}" - } - + return {"runs_successfully": 0.0, "error": f"Stage 1 error: {str(e)}"} + except Exception as e: - return { - "runs_successfully": 0.0, - "error": f"Stage 1 failed: {str(e)}" - } + return {"runs_successfully": 0.0, "error": f"Stage 1 failed: {str(e)}"} def evaluate_stage2(program_path): diff --git a/examples/signal_processing/initial_program.py b/examples/signal_processing/initial_program.py index 2636e4746..1350c577a 100644 --- a/examples/signal_processing/initial_program.py +++ b/examples/signal_processing/initial_program.py @@ -2,7 +2,7 @@ """ Real-Time Adaptive Signal Processing Algorithm for Non-Stationary Time Series -This algorithm implements a sliding window approach to filter volatile, non-stationary +This algorithm implements a sliding window approach to filter volatile, non-stationary time series data while minimizing noise and preserving signal dynamics. """ import numpy as np @@ -13,70 +13,70 @@ def adaptive_filter(x, window_size=20): """ Adaptive signal processing algorithm using sliding window approach. - + Args: x: Input signal (1D array of real-valued samples) window_size: Size of the sliding window (W samples) - + Returns: y: Filtered output signal with length = len(x) - window_size + 1 """ if len(x) < window_size: raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") - + # Initialize output array output_length = len(x) - window_size + 1 y = np.zeros(output_length) - + # Simple moving average as baseline for i in range(output_length): - window = x[i:i + window_size] - + window = x[i : i + window_size] + # Basic moving average filter y[i] = np.mean(window) - + return y def enhanced_filter_with_trend_preservation(x, window_size=20): """ Enhanced version with trend preservation using weighted moving average. - + Args: x: Input signal (1D array of real-valued samples) window_size: Size of the sliding window - + Returns: y: Filtered output signal """ if len(x) < window_size: raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") - + output_length = len(x) - window_size + 1 y = np.zeros(output_length) - + # Create weights that emphasize recent samples weights = np.exp(np.linspace(-2, 0, window_size)) weights = weights / np.sum(weights) - + for i in range(output_length): - window = x[i:i + window_size] - + window = x[i : i + window_size] + # Weighted moving average with exponential weights y[i] = np.sum(window * weights) - + return y def process_signal(input_signal, window_size=20, algorithm_type="enhanced"): """ Main signal processing function that applies the selected algorithm. - + Args: input_signal: Input time series data window_size: Window size for processing algorithm_type: Type of algorithm to use ("basic" or "enhanced") - + Returns: Filtered signal """ @@ -92,91 +92,91 @@ def process_signal(input_signal, window_size=20, algorithm_type="enhanced"): def generate_test_signal(length=1000, noise_level=0.3, seed=42): """ Generate synthetic test signal with known characteristics. 
- + Args: length: Length of the signal noise_level: Standard deviation of noise to add seed: Random seed for reproducibility - + Returns: Tuple of (noisy_signal, clean_signal) """ np.random.seed(seed) t = np.linspace(0, 10, length) - + # Create a complex signal with multiple components clean_signal = ( - 2 * np.sin(2 * np.pi * 0.5 * t) + # Low frequency component - 1.5 * np.sin(2 * np.pi * 2 * t) + # Medium frequency component - 0.5 * np.sin(2 * np.pi * 5 * t) + # Higher frequency component - 0.8 * np.exp(-t/5) * np.sin(2 * np.pi * 1.5 * t) # Decaying oscillation + 2 * np.sin(2 * np.pi * 0.5 * t) # Low frequency component + + 1.5 * np.sin(2 * np.pi * 2 * t) # Medium frequency component + + 0.5 * np.sin(2 * np.pi * 5 * t) # Higher frequency component + + 0.8 * np.exp(-t / 5) * np.sin(2 * np.pi * 1.5 * t) # Decaying oscillation ) - + # Add non-stationary behavior trend = 0.1 * t * np.sin(0.2 * t) # Slowly varying trend clean_signal += trend - + # Add random walk component for non-stationarity random_walk = np.cumsum(np.random.randn(length) * 0.05) clean_signal += random_walk - + # Add noise noise = np.random.normal(0, noise_level, length) noisy_signal = clean_signal + noise - + return noisy_signal, clean_signal def run_signal_processing(signal_length=1000, noise_level=0.3, window_size=20): """ Run the signal processing algorithm on a test signal. - + Returns: Dictionary containing results and metrics """ # Generate test signal noisy_signal, clean_signal = generate_test_signal(signal_length, noise_level) - + # Process the signal filtered_signal = process_signal(noisy_signal, window_size, "enhanced") - + # Calculate basic metrics if len(filtered_signal) > 0: # Align signals for comparison (account for processing delay) delay = window_size - 1 aligned_clean = clean_signal[delay:] aligned_noisy = noisy_signal[delay:] - + # Ensure same length min_length = min(len(filtered_signal), len(aligned_clean)) filtered_signal = filtered_signal[:min_length] aligned_clean = aligned_clean[:min_length] aligned_noisy = aligned_noisy[:min_length] - + # Calculate correlation with clean signal correlation = np.corrcoef(filtered_signal, aligned_clean)[0, 1] if min_length > 1 else 0 - + # Calculate noise reduction noise_before = np.var(aligned_noisy - aligned_clean) noise_after = np.var(filtered_signal - aligned_clean) noise_reduction = (noise_before - noise_after) / noise_before if noise_before > 0 else 0 - + return { - 'filtered_signal': filtered_signal, - 'clean_signal': aligned_clean, - 'noisy_signal': aligned_noisy, - 'correlation': correlation, - 'noise_reduction': noise_reduction, - 'signal_length': min_length + "filtered_signal": filtered_signal, + "clean_signal": aligned_clean, + "noisy_signal": aligned_noisy, + "correlation": correlation, + "noise_reduction": noise_reduction, + "signal_length": min_length, } else: return { - 'filtered_signal': [], - 'clean_signal': [], - 'noisy_signal': [], - 'correlation': 0, - 'noise_reduction': 0, - 'signal_length': 0 + "filtered_signal": [], + "clean_signal": [], + "noisy_signal": [], + "correlation": 0, + "noise_reduction": 0, + "signal_length": 0, } diff --git a/examples/web_scraper_optillm/evaluator.py b/examples/web_scraper_optillm/evaluator.py index 14098b31c..c0a1e1c65 100644 --- a/examples/web_scraper_optillm/evaluator.py +++ b/examples/web_scraper_optillm/evaluator.py @@ -15,120 +15,117 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - def evaluate(program_path: str) -> Dict: """ Evaluate the web scraper program. 
- + Args: program_path: Path to the program to evaluate - + Returns: Dictionary with metrics and artifacts for OpenEvolve compatibility """ try: # Import the program sys.path.insert(0, os.path.dirname(program_path)) - program_name = os.path.basename(program_path).replace('.py', '') + program_name = os.path.basename(program_path).replace(".py", "") program = __import__(program_name) - + # Test data: HTML content from various documentation sources test_cases = get_test_cases() - + # Evaluate each test case metrics = { - 'accuracy': 0.0, - 'completeness': 0.0, - 'robustness': 0.0, - 'parsing_errors': 0.0, - 'total_score': 0.0 + "accuracy": 0.0, + "completeness": 0.0, + "robustness": 0.0, + "parsing_errors": 0.0, + "total_score": 0.0, } - + artifacts = {} - + total_correct = 0 total_expected = 0 parsing_errors = 0 - + for i, test_case in enumerate(test_cases): try: # Run the scraper - docs = program.scrape_api_docs(test_case['html']) - + docs = program.scrape_api_docs(test_case["html"]) + # Evaluate accuracy - correct, expected = evaluate_extraction(docs, test_case['expected']) + correct, expected = evaluate_extraction(docs, test_case["expected"]) total_correct += correct total_expected += expected - + # Test parameter extraction for doc in docs: - if 'parameters' not in doc: - doc['parameters'] = program.extract_parameters(doc.get('signature', '')) - + if "parameters" not in doc: + doc["parameters"] = program.extract_parameters(doc.get("signature", "")) + # Test formatting formatted = program.format_documentation(docs) - + # Store results for debugging - artifacts[f'test_case_{i}'] = { - 'expected_count': expected, - 'found_count': correct, - 'extracted_functions': [doc.get('name', 'unknown') for doc in docs], - 'formatted_length': len(formatted) + artifacts[f"test_case_{i}"] = { + "expected_count": expected, + "found_count": correct, + "extracted_functions": [doc.get("name", "unknown") for doc in docs], + "formatted_length": len(formatted), } - + except Exception as e: parsing_errors += 1 - artifacts[f'test_case_{i}_error'] = str(e) - + artifacts[f"test_case_{i}_error"] = str(e) + # Calculate metrics if total_expected > 0: - metrics['accuracy'] = total_correct / total_expected - - metrics['completeness'] = min(1.0, total_correct / 20) # Expect ~20 functions total - metrics['robustness'] = max(0.0, 1.0 - (parsing_errors / len(test_cases))) - metrics['parsing_errors'] = parsing_errors / len(test_cases) - + metrics["accuracy"] = total_correct / total_expected + + metrics["completeness"] = min(1.0, total_correct / 20) # Expect ~20 functions total + metrics["robustness"] = max(0.0, 1.0 - (parsing_errors / len(test_cases))) + metrics["parsing_errors"] = parsing_errors / len(test_cases) + # Overall score - use 'combined_score' as primary metric for evolution - metrics['combined_score'] = ( - metrics['accuracy'] * 0.4 + - metrics['completeness'] * 0.3 + - metrics['robustness'] * 0.3 + metrics["combined_score"] = ( + metrics["accuracy"] * 0.4 + metrics["completeness"] * 0.3 + metrics["robustness"] * 0.3 ) - + # Add detailed feedback for the LLM - artifacts['evaluation_feedback'] = generate_feedback(metrics, artifacts) - + artifacts["evaluation_feedback"] = generate_feedback(metrics, artifacts) + # Return dictionary format for OpenEvolve compatibility return metrics - + except Exception as e: return { - 'accuracy': 0.0, - 'completeness': 0.0, - 'robustness': 0.0, - 'parsing_errors': 1.0, - 'combined_score': 0.0, - 'error': str(e), - 'traceback': traceback.format_exc(), - 'stage': 'program_import' + 
"accuracy": 0.0, + "completeness": 0.0, + "robustness": 0.0, + "parsing_errors": 1.0, + "combined_score": 0.0, + "error": str(e), + "traceback": traceback.format_exc(), + "stage": "program_import", } def get_test_cases() -> List[Dict[str, Any]]: """ Get test cases with HTML content and expected results. - + These test cases include URLs that will be fetched by optillm's readurls plugin during evolution, providing the LLM with actual documentation structure. - + Returns: List of test cases with HTML content and expected results """ return [ { - 'name': 'json_module_docs', - 'html': ''' + "name": "json_module_docs", + "html": """
@@ -162,15 +159,15 @@ def get_test_cases() -> List[Dict[str, Any]]:
- ''', - 'expected': [ - {'name': 'dumps', 'params': ['obj', 'indent']}, - {'name': 'loads', 'params': ['s']} - ] + """, + "expected": [ + {"name": "dumps", "params": ["obj", "indent"]}, + {"name": "loads", "params": ["s"]}, + ], }, { - 'name': 'requests_docs', - 'html': ''' + "name": "requests_docs", + "html": """
@@ -189,15 +186,15 @@ def get_test_cases() -> List[Dict[str, Any]]:
- ''', - 'expected': [ - {'name': 'requests.get', 'params': ['url', 'params']}, - {'name': 'requests.post', 'params': ['url', 'data', 'json']} - ] + """, + "expected": [ + {"name": "requests.get", "params": ["url", "params"]}, + {"name": "requests.post", "params": ["url", "data", "json"]}, + ], }, { - 'name': 'beautifulsoup_docs', - 'html': ''' + "name": "beautifulsoup_docs", + "html": """
@@ -221,16 +218,16 @@ def get_test_cases() -> List[Dict[str, Any]]:
- ''', - 'expected': [ - {'name': 'BeautifulSoup', 'params': ['markup', 'parser']}, - {'name': 'find', 'params': ['name', 'attrs']}, - {'name': 'find_all', 'params': ['name', 'attrs', 'limit']} - ] + """, + "expected": [ + {"name": "BeautifulSoup", "params": ["markup", "parser"]}, + {"name": "find", "params": ["name", "attrs"]}, + {"name": "find_all", "params": ["name", "attrs", "limit"]}, + ], }, { - 'name': 'edge_case_malformed', - 'html': ''' + "name": "edge_case_malformed", + "html": """
@@ -251,110 +248,118 @@ def get_test_cases() -> List[Dict[str, Any]]:
- ''', - 'expected': [ - {'name': 'function_name', 'params': ['arg1', 'arg2']}, - {'name': 'another_func', 'params': []} - ] - } + """, + "expected": [ + {"name": "function_name", "params": ["arg1", "arg2"]}, + {"name": "another_func", "params": []}, + ], + }, ] -def evaluate_extraction(docs: List[Dict[str, Any]], expected: List[Dict[str, Any]]) -> tuple[int, int]: +def evaluate_extraction( + docs: List[Dict[str, Any]], expected: List[Dict[str, Any]] +) -> tuple[int, int]: """ Evaluate the accuracy of extracted documentation. - + Args: docs: Extracted documentation expected: Expected results - + Returns: Tuple of (correct_count, expected_count) """ correct = 0 expected_count = len(expected) - + for exp in expected: # Check if we found this function found = False for doc in docs: - doc_name = doc.get('name', '').lower() - exp_name = exp['name'].lower() - + doc_name = doc.get("name", "").lower() + exp_name = exp["name"].lower() + if exp_name in doc_name or doc_name in exp_name: found = True # Check parameter extraction - doc_params = doc.get('parameters', []) - exp_params = exp.get('params', []) - + doc_params = doc.get("parameters", []) + exp_params = exp.get("params", []) + if len(doc_params) >= len(exp_params): correct += 1 else: correct += 0.5 # Partial credit break - + if not found and docs: # Only penalize if we extracted something pass # No additional penalty - + return correct, expected_count def generate_feedback(metrics: Dict[str, float], artifacts: Dict[str, Any]) -> str: """ Generate detailed feedback for the LLM to improve the scraper. - + This feedback will be included in the evolution prompt to guide the LLM toward better solutions. - + Args: metrics: Evaluation metrics artifacts: Evaluation artifacts - + Returns: Detailed feedback string """ feedback = [] - + feedback.append("## Evaluation Feedback") feedback.append(f"Overall Score: {metrics['combined_score']:.2f}/1.0") feedback.append("") - + # Accuracy feedback - if metrics['accuracy'] < 0.5: + if metrics["accuracy"] < 0.5: feedback.append("āš ļø **Low Accuracy**: The scraper is missing many expected functions.") - feedback.append("Consider improving the HTML parsing logic to handle different documentation formats.") - feedback.append("Look for patterns like
,
, and tags.") - elif metrics['accuracy'] < 0.8: + feedback.append( + "Consider improving the HTML parsing logic to handle different documentation formats." + ) + feedback.append( + "Look for patterns like
,
, and tags." + ) + elif metrics["accuracy"] < 0.8: feedback.append("āœ… **Good Accuracy**: Most functions are found, but some are missed.") feedback.append("Fine-tune the extraction logic for edge cases.") else: feedback.append("šŸŽ‰ **Excellent Accuracy**: Function extraction is working well!") - + feedback.append("") - + # Completeness feedback - if metrics['completeness'] < 0.5: + if metrics["completeness"] < 0.5: feedback.append("āš ļø **Low Completeness**: Not extracting enough functions overall.") feedback.append("Increase the limit or improve the search scope.") - + # Robustness feedback - if metrics['robustness'] < 0.8: + if metrics["robustness"] < 0.8: feedback.append("āš ļø **Low Robustness**: The scraper fails on some HTML formats.") feedback.append("Add try-catch blocks and handle different documentation structures.") feedback.append("Consider multiple parsing strategies and fallback methods.") - + # Specific improvements feedback.append("") feedback.append("## Specific Improvements:") - + # Analyze test case results for key, value in artifacts.items(): - if key.startswith('test_case_') and isinstance(value, dict): - if 'error' in key: + if key.startswith("test_case_") and isinstance(value, dict): + if "error" in key: feedback.append(f"- Fix error in {key}: {value}") - elif value.get('found_count', 0) < value.get('expected_count', 0): - feedback.append(f"- Improve extraction for {key}: found {value.get('found_count', 0)}/{value.get('expected_count', 0)} functions") - + elif value.get("found_count", 0) < value.get("expected_count", 0): + feedback.append( + f"- Improve extraction for {key}: found {value.get('found_count', 0)}/{value.get('expected_count', 0)} functions" + ) + # Documentation URL hints (these will be fetched by readurls plugin) feedback.append("") feedback.append("## Documentation References:") @@ -362,5 +367,5 @@ def generate_feedback(metrics: Dict[str, float], artifacts: Dict[str, Any]) -> s feedback.append("- Python docs: https://docs.python.org/3/library/json.html") feedback.append("- Requests docs: https://requests.readthedocs.io/en/latest/api/") feedback.append("- BeautifulSoup docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/") - - return '\n'.join(feedback) \ No newline at end of file + + return "\n".join(feedback) diff --git a/examples/web_scraper_optillm/initial_program.py b/examples/web_scraper_optillm/initial_program.py index f0c97f5a2..4daead447 100644 --- a/examples/web_scraper_optillm/initial_program.py +++ b/examples/web_scraper_optillm/initial_program.py @@ -18,94 +18,102 @@ def scrape_api_docs(html_content: str) -> List[Dict[str, any]]: """ Extract API documentation from HTML content. - + Args: html_content: Raw HTML content of a documentation page - + Returns: List of dictionaries containing function documentation """ - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") functions = [] - + # Try multiple approaches to find functions # 1. 
Look for code blocks - code_blocks = soup.find_all('code') + code_blocks = soup.find_all("code") for block in code_blocks: text = block.get_text(strip=True) - if '(' in text and ')' in text: - functions.append({ - 'name': text.split('(')[0].strip(), - 'signature': text, - 'description': 'No description found', - 'parameters': extract_parameters(text) - }) - + if "(" in text and ")" in text: + functions.append( + { + "name": text.split("(")[0].strip(), + "signature": text, + "description": "No description found", + "parameters": extract_parameters(text), + } + ) + # 2. Look for function signatures in headers (h3) - h3_blocks = soup.find_all('h3') + h3_blocks = soup.find_all("h3") for block in h3_blocks: text = block.get_text(strip=True) - if '(' in text and ')' in text: - functions.append({ - 'name': text.split('(')[0].strip(), - 'signature': text, - 'description': 'No description found', - 'parameters': extract_parameters(text) - }) - + if "(" in text and ")" in text: + functions.append( + { + "name": text.split("(")[0].strip(), + "signature": text, + "description": "No description found", + "parameters": extract_parameters(text), + } + ) + # 3. Look for dt elements with sig class - dt_blocks = soup.find_all('dt', class_='sig') + dt_blocks = soup.find_all("dt", class_="sig") for block in dt_blocks: - sig_name = block.find(class_='sig-name') + sig_name = block.find(class_="sig-name") if sig_name: name = sig_name.get_text(strip=True) - functions.append({ - 'name': name, - 'signature': block.get_text(strip=True), - 'description': 'No description found', - 'parameters': extract_parameters(block.get_text(strip=True)) - }) - + functions.append( + { + "name": name, + "signature": block.get_text(strip=True), + "description": "No description found", + "parameters": extract_parameters(block.get_text(strip=True)), + } + ) + return functions[:20] # Return more functions def extract_parameters(signature: str) -> List[Dict[str, str]]: """ Extract parameter information from a function signature. - + Args: signature: Function signature string - + Returns: List of parameter dictionaries """ params = [] # Very basic parameter extraction - match = re.search(r'\((.*?)\)', signature) + match = re.search(r"\((.*?)\)", signature) if match: param_string = match.group(1) if param_string: - param_parts = param_string.split(',') + param_parts = param_string.split(",") for part in param_parts: part = part.strip() if part: - params.append({ - 'name': part.split('=')[0].strip(), - 'type': 'unknown', - 'default': None, - 'description': '' - }) - + params.append( + { + "name": part.split("=")[0].strip(), + "type": "unknown", + "default": None, + "description": "", + } + ) + return params def format_documentation(api_docs: List[Dict[str, any]]) -> str: """ Format extracted documentation into a readable string. 
- + Args: api_docs: List of API documentation dictionaries - + Returns: Formatted documentation string """ @@ -114,15 +122,17 @@ def format_documentation(api_docs: List[Dict[str, any]]) -> str: output.append(f"Function: {doc['name']}") output.append(f"Signature: {doc['signature']}") output.append(f"Description: {doc['description']}") - - if doc.get('parameters'): + + if doc.get("parameters"): output.append("Parameters:") - for param in doc['parameters']: + for param in doc["parameters"]: output.append(f" - {param['name']}: {param.get('description', 'No description')}") - + output.append("") # Empty line between functions - - return '\n'.join(output) + + return "\n".join(output) + + # EVOLVE-BLOCK-END @@ -143,7 +153,7 @@ def format_documentation(api_docs: List[Dict[str, any]]) -> str: """ - + docs = scrape_api_docs(sample_html) print(format_documentation(docs)) - print(f"\nExtracted {len(docs)} functions") \ No newline at end of file + print(f"\nExtracted {len(docs)} functions") diff --git a/openevolve/config.py b/openevolve/config.py index 1e81eb0ea..358d0e16b 100644 --- a/openevolve/config.py +++ b/openevolve/config.py @@ -143,13 +143,23 @@ class PromptConfig: artifact_security_filter: bool = True # Feature extraction and program labeling - suggest_simplification_after_chars: Optional[int] = 500 # Suggest simplifying if program exceeds this many characters - include_changes_under_chars: Optional[int] = 100 # Include change descriptions in features if under this length - concise_implementation_max_lines: Optional[int] = 10 # Label as "concise" if program has this many lines or fewer - comprehensive_implementation_min_lines: Optional[int] = 50 # Label as "comprehensive" if program has this many lines or more - + suggest_simplification_after_chars: Optional[int] = ( + 500 # Suggest simplifying if program exceeds this many characters + ) + include_changes_under_chars: Optional[int] = ( + 100 # Include change descriptions in features if under this length + ) + concise_implementation_max_lines: Optional[int] = ( + 10 # Label as "concise" if program has this many lines or fewer + ) + comprehensive_implementation_min_lines: Optional[int] = ( + 50 # Label as "comprehensive" if program has this many lines or more + ) + # Backward compatibility - deprecated - code_length_threshold: Optional[int] = None # Deprecated: use suggest_simplification_after_chars + code_length_threshold: Optional[int] = ( + None # Deprecated: use suggest_simplification_after_chars + ) @dataclass diff --git a/openevolve/controller.py b/openevolve/controller.py index 00fd311c6..b5c9ca54c 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -249,12 +249,13 @@ async def run( ) self.database.add(initial_program) - + # Check if combined_score is present in the metrics if "combined_score" not in initial_metrics: # Calculate average of numeric metrics numeric_metrics = [ - v for v in initial_metrics.values() + v + for v in initial_metrics.values() if isinstance(v, (int, float)) and not isinstance(v, bool) ] if numeric_metrics: @@ -300,13 +301,13 @@ def force_exit_handler(signum, frame): # So we need to adjust the start_iteration for the actual evolution evolution_start = start_iteration evolution_iterations = max_iterations - + # If we just added the initial program at iteration 0, start evolution from iteration 1 if should_add_initial and start_iteration == 0: evolution_start = 1 # User expects max_iterations evolutionary iterations AFTER the initial program # So we don't need to reduce evolution_iterations - + # 
Run evolution with improved parallel processing and checkpoint callback await self._run_evolution_with_checkpoints( evolution_start, evolution_iterations, target_score diff --git a/openevolve/database.py b/openevolve/database.py index 740768839..4e1fdf8e3 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -7,6 +7,7 @@ import logging import os import random +import shutil import time import uuid from dataclasses import asdict, dataclass, field, fields @@ -457,6 +458,9 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None: logger.warning("No database path specified, skipping save") return + # Perform artifact cleanup before saving + self._cleanup_old_artifacts(save_path) + # create directory if it doesn't exist os.makedirs(save_path, exist_ok=True) @@ -1945,6 +1949,46 @@ def _create_artifact_dir(self, program_id: str) -> str: os.makedirs(artifact_dir, exist_ok=True) return artifact_dir + def _cleanup_old_artifacts(self, checkpoint_path: str) -> None: + """ + Remove artifact directories older than the configured retention period. + + Args: + checkpoint_path: The path of the current checkpoint being saved, which + contains the artifacts folder to be cleaned. + """ + if not self.config.cleanup_old_artifacts: + return + + artifacts_base_path = os.path.join(checkpoint_path, "artifacts") + + if not os.path.isdir(artifacts_base_path): + return + + now = time.time() + retention_seconds = self.config.artifact_retention_days * 24 * 60 * 60 + deleted_count = 0 + + logger.debug(f"Starting artifact cleanup in {artifacts_base_path}...") + + for dirname in os.listdir(artifacts_base_path): + dirpath = os.path.join(artifacts_base_path, dirname) + if os.path.isdir(dirpath): + try: + dir_mod_time = os.path.getmtime(dirpath) + if (now - dir_mod_time) > retention_seconds: + shutil.rmtree(dirpath) + deleted_count += 1 + logger.debug(f"Removed old artifact directory: {dirpath}") + except FileNotFoundError: + # Can happen in race conditions; ignore. 
+ continue + except Exception as e: + logger.error(f"Error removing artifact directory {dirpath}: {e}") + + if deleted_count > 0: + logger.info(f"Cleaned up {deleted_count} old artifact directories.") + def _write_artifact_file(self, artifact_dir: str, key: str, value: Union[str, bytes]) -> None: """Write an artifact to a file""" # Sanitize filename diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 654ef913a..716a95408 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -194,18 +194,22 @@ async def evaluate_program( weighted_value = value * self.config.llm_feedback_weight eval_result.metrics[f"llm_{name}"] = weighted_value llm_scores.append(value) # Use unweighted value for average - + # Add average of LLM metrics if llm_scores: llm_average = sum(llm_scores) / len(llm_scores) - eval_result.metrics["llm_average"] = llm_average * self.config.llm_feedback_weight - + eval_result.metrics["llm_average"] = ( + llm_average * self.config.llm_feedback_weight + ) + # Recalculate combined_score if it exists if "combined_score" in eval_result.metrics: # Original combined_score is just accuracy accuracy = eval_result.metrics["combined_score"] # Combine with LLM average (70% accuracy, 30% LLM quality) - eval_result.metrics["combined_score"] = accuracy * 0.7 + llm_average * 0.3 + eval_result.metrics["combined_score"] = ( + accuracy * 0.7 + llm_average * 0.3 + ) # Store artifacts if enabled and present if ( @@ -659,7 +663,7 @@ def _create_cascade_error_context(self, stage: str, error: Exception) -> dict: def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool: """ Check if metrics pass a threshold - + Uses 'combined_score' if available (for consistency with evolution), otherwise falls back to averaging all numeric metrics except 'error' diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index ec14aec92..d2df82f6c 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -22,6 +22,7 @@ @dataclass class SerializableResult: """Result that can be pickled and sent between processes""" + child_program_dict: Optional[Dict[str, Any]] = None parent_id: Optional[str] = None iteration_time: float = 0.0 @@ -39,39 +40,49 @@ def _worker_init(config_dict: dict, evaluation_file: str) -> None: global _worker_evaluator global _worker_llm_ensemble global _worker_prompt_sampler - + # Store config for later use # Reconstruct Config object from nested dictionaries - from openevolve.config import Config, DatabaseConfig, EvaluatorConfig, LLMConfig, PromptConfig, LLMModelConfig - + from openevolve.config import ( + Config, + DatabaseConfig, + EvaluatorConfig, + LLMConfig, + PromptConfig, + LLMModelConfig, + ) + # Reconstruct model objects - models = [LLMModelConfig(**m) for m in config_dict['llm']['models']] - evaluator_models = [LLMModelConfig(**m) for m in config_dict['llm']['evaluator_models']] - + models = [LLMModelConfig(**m) for m in config_dict["llm"]["models"]] + evaluator_models = [LLMModelConfig(**m) for m in config_dict["llm"]["evaluator_models"]] + # Create LLM config with models - llm_dict = config_dict['llm'].copy() - llm_dict['models'] = models - llm_dict['evaluator_models'] = evaluator_models + llm_dict = config_dict["llm"].copy() + llm_dict["models"] = models + llm_dict["evaluator_models"] = evaluator_models llm_config = LLMConfig(**llm_dict) - + # Create other configs - prompt_config = PromptConfig(**config_dict['prompt']) - database_config = DatabaseConfig(**config_dict['database']) - 
evaluator_config = EvaluatorConfig(**config_dict['evaluator']) - + prompt_config = PromptConfig(**config_dict["prompt"]) + database_config = DatabaseConfig(**config_dict["database"]) + evaluator_config = EvaluatorConfig(**config_dict["evaluator"]) + _worker_config = Config( llm=llm_config, prompt=prompt_config, database=database_config, evaluator=evaluator_config, - **{k: v for k, v in config_dict.items() - if k not in ['llm', 'prompt', 'database', 'evaluator']} + **{ + k: v + for k, v in config_dict.items() + if k not in ["llm", "prompt", "database", "evaluator"] + }, ) _worker_evaluation_file = evaluation_file - + # These will be lazily initialized on first use _worker_evaluator = None - _worker_llm_ensemble = None + _worker_llm_ensemble = None _worker_prompt_sampler = None @@ -80,75 +91,73 @@ def _lazy_init_worker_components(): global _worker_evaluator global _worker_llm_ensemble global _worker_prompt_sampler - + if _worker_llm_ensemble is None: from openevolve.llm.ensemble import LLMEnsemble + _worker_llm_ensemble = LLMEnsemble(_worker_config.llm.models) - + if _worker_prompt_sampler is None: from openevolve.prompt.sampler import PromptSampler + _worker_prompt_sampler = PromptSampler(_worker_config.prompt) - + if _worker_evaluator is None: from openevolve.evaluator import Evaluator from openevolve.llm.ensemble import LLMEnsemble from openevolve.prompt.sampler import PromptSampler - + # Create evaluator-specific components evaluator_llm = LLMEnsemble(_worker_config.llm.evaluator_models) evaluator_prompt = PromptSampler(_worker_config.prompt) evaluator_prompt.set_templates("evaluator_system_message") - + _worker_evaluator = Evaluator( _worker_config.evaluator, _worker_evaluation_file, evaluator_llm, evaluator_prompt, - database=None # No shared database in worker + database=None, # No shared database in worker ) def _run_iteration_worker( - iteration: int, - db_snapshot: Dict[str, Any], - parent_id: str, - inspiration_ids: List[str] + iteration: int, db_snapshot: Dict[str, Any], parent_id: str, inspiration_ids: List[str] ) -> SerializableResult: """Run a single iteration in a worker process""" try: # Lazy initialization _lazy_init_worker_components() - + # Reconstruct programs from snapshot - programs = { - pid: Program(**prog_dict) - for pid, prog_dict in db_snapshot["programs"].items() - } - + programs = {pid: Program(**prog_dict) for pid, prog_dict in db_snapshot["programs"].items()} + parent = programs[parent_id] inspirations = [programs[pid] for pid in inspiration_ids if pid in programs] - + # Get parent artifacts if available parent_artifacts = db_snapshot["artifacts"].get(parent_id) - + # Get island-specific programs for context parent_island = parent.metadata.get("island", db_snapshot["current_island"]) island_programs = [ - programs[pid] for pid in db_snapshot["islands"][parent_island] - if pid in programs + programs[pid] for pid in db_snapshot["islands"][parent_island] if pid in programs ] - + # Sort by metrics for top programs from openevolve.utils.metrics_utils import safe_numeric_average + island_programs.sort( key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)), - reverse=True + reverse=True, ) - + # Use config values for limits instead of hardcoding - island_top_programs = island_programs[:_worker_config.prompt.num_top_programs + _worker_config.prompt.num_diverse_programs] - island_previous_programs = island_programs[:_worker_config.prompt.num_top_programs] - + island_top_programs = island_programs[ + : _worker_config.prompt.num_top_programs + 
_worker_config.prompt.num_diverse_programs + ] + island_previous_programs = island_programs[: _worker_config.prompt.num_top_programs] + # Build prompt prompt = _worker_prompt_sampler.build_prompt( current_program=parent.code, @@ -162,9 +171,9 @@ def _run_iteration_worker( diff_based_evolution=_worker_config.diff_based_evolution, program_artifacts=parent_artifacts, ) - + iteration_start = time.time() - + # Generate code modification (sync wrapper for async) llm_response = asyncio.run( _worker_llm_ensemble.generate_with_context( @@ -172,50 +181,47 @@ def _run_iteration_worker( messages=[{"role": "user", "content": prompt["user"]}], ) ) - + # Parse response based on evolution mode if _worker_config.diff_based_evolution: from openevolve.utils.code_utils import extract_diffs, apply_diff, format_diff_summary - + diff_blocks = extract_diffs(llm_response) if not diff_blocks: return SerializableResult( - error=f"No valid diffs found in response", - iteration=iteration + error=f"No valid diffs found in response", iteration=iteration ) - + child_code = apply_diff(parent.code, llm_response) changes_summary = format_diff_summary(diff_blocks) else: from openevolve.utils.code_utils import parse_full_rewrite - + new_code = parse_full_rewrite(llm_response, _worker_config.language) if not new_code: return SerializableResult( - error=f"No valid code found in response", - iteration=iteration + error=f"No valid code found in response", iteration=iteration ) - + child_code = new_code changes_summary = "Full rewrite" - + # Check code length if len(child_code) > _worker_config.max_code_length: return SerializableResult( error=f"Generated code exceeds maximum length ({len(child_code)} > {_worker_config.max_code_length})", - iteration=iteration + iteration=iteration, ) - + # Evaluate the child program import uuid + child_id = str(uuid.uuid4()) - child_metrics = asyncio.run( - _worker_evaluator.evaluate_program(child_code, child_id) - ) - + child_metrics = asyncio.run(_worker_evaluator.evaluate_program(child_code, child_id)) + # Get artifacts artifacts = _worker_evaluator.get_pending_artifacts(child_id) - + # Create child program child_program = Program( id=child_id, @@ -229,11 +235,11 @@ def _run_iteration_worker( "changes": changes_summary, "parent_metrics": parent.metrics, "island": parent_island, - } + }, ) - + iteration_time = time.time() - iteration_start - + return SerializableResult( child_program_dict=child_program.to_dict(), parent_id=parent.id, @@ -241,107 +247,99 @@ def _run_iteration_worker( prompt=prompt, llm_response=llm_response, artifacts=artifacts, - iteration=iteration + iteration=iteration, ) - + except Exception as e: logger.exception(f"Error in worker iteration {iteration}") - return SerializableResult( - error=str(e), - iteration=iteration - ) + return SerializableResult(error=str(e), iteration=iteration) class ProcessParallelController: """Controller for process-based parallel evolution""" - + def __init__(self, config: Config, evaluation_file: str, database: ProgramDatabase): self.config = config self.evaluation_file = evaluation_file self.database = database - + self.executor: Optional[ProcessPoolExecutor] = None self.shutdown_event = mp.Event() - + # Number of worker processes self.num_workers = config.evaluator.parallel_evaluations - + logger.info(f"Initialized process parallel controller with {self.num_workers} workers") - + def _serialize_config(self, config: Config) -> dict: """Serialize config object to a dictionary that can be pickled""" # Manual serialization to handle nested 
objects properly return { - 'llm': { - 'models': [asdict(m) for m in config.llm.models], - 'evaluator_models': [asdict(m) for m in config.llm.evaluator_models], - 'api_base': config.llm.api_base, - 'api_key': config.llm.api_key, - 'temperature': config.llm.temperature, - 'top_p': config.llm.top_p, - 'max_tokens': config.llm.max_tokens, - 'timeout': config.llm.timeout, - 'retries': config.llm.retries, - 'retry_delay': config.llm.retry_delay, + "llm": { + "models": [asdict(m) for m in config.llm.models], + "evaluator_models": [asdict(m) for m in config.llm.evaluator_models], + "api_base": config.llm.api_base, + "api_key": config.llm.api_key, + "temperature": config.llm.temperature, + "top_p": config.llm.top_p, + "max_tokens": config.llm.max_tokens, + "timeout": config.llm.timeout, + "retries": config.llm.retries, + "retry_delay": config.llm.retry_delay, }, - 'prompt': asdict(config.prompt), - 'database': asdict(config.database), - 'evaluator': asdict(config.evaluator), - 'max_iterations': config.max_iterations, - 'checkpoint_interval': config.checkpoint_interval, - 'log_level': config.log_level, - 'log_dir': config.log_dir, - 'random_seed': config.random_seed, - 'diff_based_evolution': config.diff_based_evolution, - 'max_code_length': config.max_code_length, - 'language': config.language, + "prompt": asdict(config.prompt), + "database": asdict(config.database), + "evaluator": asdict(config.evaluator), + "max_iterations": config.max_iterations, + "checkpoint_interval": config.checkpoint_interval, + "log_level": config.log_level, + "log_dir": config.log_dir, + "random_seed": config.random_seed, + "diff_based_evolution": config.diff_based_evolution, + "max_code_length": config.max_code_length, + "language": config.language, } - + def start(self) -> None: """Start the process pool""" # Convert config to dict for pickling # We need to be careful with nested dataclasses config_dict = self._serialize_config(self.config) - + # Create process pool with initializer self.executor = ProcessPoolExecutor( max_workers=self.num_workers, initializer=_worker_init, - initargs=(config_dict, self.evaluation_file) + initargs=(config_dict, self.evaluation_file), ) - + logger.info(f"Started process pool with {self.num_workers} processes") - + def stop(self) -> None: """Stop the process pool""" self.shutdown_event.set() - + if self.executor: self.executor.shutdown(wait=True) self.executor = None - + logger.info("Stopped process pool") - + def request_shutdown(self) -> None: """Request graceful shutdown""" logger.info("Graceful shutdown requested...") self.shutdown_event.set() - + def _create_database_snapshot(self) -> Dict[str, Any]: """Create a serializable snapshot of the database state""" # Only include necessary data for workers snapshot = { - "programs": { - pid: prog.to_dict() - for pid, prog in self.database.programs.items() - }, - "islands": [ - list(island) for island in self.database.islands - ], + "programs": {pid: prog.to_dict() for pid, prog in self.database.programs.items()}, + "islands": [list(island) for island in self.database.islands], "current_island": self.database.current_island, "artifacts": {}, # Will be populated selectively } - + # Include artifacts for programs that might be selected # IMPORTANT: This limits artifacts (execution outputs/errors) to first 100 programs only. # This does NOT affect program code - all programs are fully serialized above. 
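# --- Illustrative sketch (not part of the diff above): the snapshot hand-off shown in
# _serialize_config / _create_database_snapshot / _run_iteration_worker boils down to
# "dataclass -> plain dict -> pickle -> dataclass". ToyProgram below is a hypothetical
# stand-in for OpenEvolve's Program class; only the standard library is assumed.
import pickle
from dataclasses import dataclass, asdict
from typing import Any, Dict

@dataclass
class ToyProgram:
    id: str
    code: str
    metrics: Dict[str, float]

def make_snapshot(programs: Dict[str, ToyProgram]) -> Dict[str, Any]:
    # Only plain dicts and lists cross the process boundary, so pickling stays cheap.
    return {"programs": {pid: asdict(p) for pid, p in programs.items()}}

def restore(snapshot: Dict[str, Any]) -> Dict[str, ToyProgram]:
    # Workers rebuild real objects from the plain dicts, mirroring Program(**prog_dict).
    return {pid: ToyProgram(**d) for pid, d in snapshot["programs"].items()}

programs = {"a": ToyProgram("a", "print('hi')", {"combined_score": 0.7})}
payload = pickle.dumps(make_snapshot(programs))  # what the pool ships to each worker
assert restore(pickle.loads(payload))["a"].metrics["combined_score"] == 0.7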
@@ -353,9 +351,9 @@ def _create_database_snapshot(self) -> Dict[str, Any]: artifacts = self.database.get_artifacts(pid) if artifacts: snapshot["artifacts"][pid] = artifacts - + return snapshot - + async def run_evolution( self, start_iteration: int, @@ -366,34 +364,34 @@ async def run_evolution( """Run evolution with process-based parallelism""" if not self.executor: raise RuntimeError("Process pool not started") - + total_iterations = start_iteration + max_iterations - + logger.info( f"Starting process-based evolution from iteration {start_iteration} " f"for {max_iterations} iterations (total: {total_iterations})" ) - + # Track pending futures pending_futures: Dict[int, Future] = {} batch_size = min(self.num_workers * 2, max_iterations) - + # Submit initial batch for i in range(start_iteration, min(start_iteration + batch_size, total_iterations)): future = self._submit_iteration(i) if future: pending_futures[i] = future - + next_iteration = start_iteration + batch_size completed_iterations = 0 - + # Island management programs_per_island = max(1, max_iterations // (self.config.database.num_islands * 10)) current_island_counter = 0 - + # Process results as they complete while ( - pending_futures + pending_futures and completed_iterations < max_iterations and not self.shutdown_event.is_set() ): @@ -403,58 +401,61 @@ async def run_evolution( if future.done(): completed_iteration = iteration break - + if completed_iteration is None: await asyncio.sleep(0.01) continue - + # Process completed result future = pending_futures.pop(completed_iteration) - + try: result = future.result() - + if result.error: logger.warning(f"Iteration {completed_iteration} error: {result.error}") elif result.child_program_dict: # Reconstruct program from dict child_program = Program(**result.child_program_dict) - + # Add to database self.database.add(child_program, iteration=completed_iteration) - + # Store artifacts if result.artifacts: self.database.store_artifacts(child_program.id, result.artifacts) - + # Log prompts if result.prompt: self.database.log_prompt( template_key=( - "full_rewrite_user" - if not self.config.diff_based_evolution + "full_rewrite_user" + if not self.config.diff_based_evolution else "diff_user" ), program_id=child_program.id, prompt=result.prompt, - responses=[result.llm_response] if result.llm_response else [] + responses=[result.llm_response] if result.llm_response else [], ) - + # Island management - if completed_iteration > start_iteration and current_island_counter >= programs_per_island: + if ( + completed_iteration > start_iteration + and current_island_counter >= programs_per_island + ): self.database.next_island() current_island_counter = 0 logger.debug(f"Switched to island {self.database.current_island}") - + current_island_counter += 1 self.database.increment_island_generation() - + # Check migration if self.database.should_migrate(): logger.info(f"Performing migration at iteration {completed_iteration}") self.database.migrate_programs() self.database.log_island_status() - + # Log progress logger.info( f"Iteration {completed_iteration}: " @@ -462,20 +463,26 @@ async def run_evolution( f"(parent: {result.parent_id}) " f"completed in {result.iteration_time:.2f}s" ) - + if child_program.metrics: - metrics_str = ", ".join([ - f"{k}={v:.4f}" if isinstance(v, (int, float)) else f"{k}={v}" - for k, v in child_program.metrics.items() - ]) + metrics_str = ", ".join( + [ + f"{k}={v:.4f}" if isinstance(v, (int, float)) else f"{k}={v}" + for k, v in child_program.metrics.items() + ] + ) 
logger.info(f"Metrics: {metrics_str}") - + # Check if this is the first program without combined_score - if not hasattr(self, '_warned_about_combined_score'): + if not hasattr(self, "_warned_about_combined_score"): self._warned_about_combined_score = False - - if "combined_score" not in child_program.metrics and not self._warned_about_combined_score: + + if ( + "combined_score" not in child_program.metrics + and not self._warned_about_combined_score + ): from openevolve.utils.metrics_utils import safe_numeric_average + avg_score = safe_numeric_average(child_program.metrics) logger.warning( f"āš ļø No 'combined_score' metric found in evaluation results. " @@ -484,27 +491,31 @@ async def run_evolution( f"metric that properly weights different aspects of program performance." ) self._warned_about_combined_score = True - + # Check for new best if self.database.best_program_id == child_program.id: logger.info( f"🌟 New best solution found at iteration {completed_iteration}: " f"{child_program.id}" ) - + # Checkpoint callback # Don't checkpoint at iteration 0 (that's just the initial program) - if completed_iteration > 0 and completed_iteration % self.config.checkpoint_interval == 0: - logger.info(f"Checkpoint interval reached at iteration {completed_iteration}") + if ( + completed_iteration > 0 + and completed_iteration % self.config.checkpoint_interval == 0 + ): + logger.info( + f"Checkpoint interval reached at iteration {completed_iteration}" + ) self.database.log_island_status() if checkpoint_callback: checkpoint_callback(completed_iteration) - + # Check target score if target_score is not None and child_program.metrics: numeric_metrics = [ - v for v in child_program.metrics.values() - if isinstance(v, (int, float)) + v for v in child_program.metrics.values() if isinstance(v, (int, float)) ] if numeric_metrics: avg_score = sum(numeric_metrics) / len(numeric_metrics) @@ -513,49 +524,49 @@ async def run_evolution( f"Target score {target_score} reached at iteration {completed_iteration}" ) break - + except Exception as e: logger.error(f"Error processing result from iteration {completed_iteration}: {e}") - + completed_iterations += 1 - + # Submit next iteration if next_iteration < total_iterations and not self.shutdown_event.is_set(): future = self._submit_iteration(next_iteration) if future: pending_futures[next_iteration] = future next_iteration += 1 - + # Handle shutdown if self.shutdown_event.is_set(): logger.info("Shutdown requested, canceling remaining evaluations...") for future in pending_futures.values(): future.cancel() - + logger.info("Evolution completed") - + return self.database.get_best_program() - + def _submit_iteration(self, iteration: int) -> Optional[Future]: """Submit an iteration to the process pool""" try: # Sample parent and inspirations parent, inspirations = self.database.sample() - + # Create database snapshot db_snapshot = self._create_database_snapshot() - + # Submit to process pool future = self.executor.submit( _run_iteration_worker, iteration, db_snapshot, parent.id, - [insp.id for insp in inspirations] + [insp.id for insp in inspirations], ) - + return future - + except Exception as e: logger.error(f"Error submitting iteration {iteration}: {e}") - return None \ No newline at end of file + return None diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py index d4ec14adb..ec9c0743d 100644 --- a/openevolve/prompt/sampler.py +++ b/openevolve/prompt/sampler.py @@ -172,7 +172,9 @@ def _identify_improvement_areas( # Check program length # Support both 
old and new parameter names for backward compatibility - threshold = self.config.suggest_simplification_after_chars or self.config.code_length_threshold + threshold = ( + self.config.suggest_simplification_after_chars or self.config.code_length_threshold + ) if threshold and len(current_program) > threshold: improvement_areas.append( "Consider simplifying the code to improve readability and maintainability" @@ -501,7 +503,11 @@ def _extract_unique_features(self, program: Dict[str, Any]) -> str: metadata = program.get("metadata", {}) if "changes" in metadata: changes = metadata["changes"] - if isinstance(changes, str) and self.config.include_changes_under_chars and len(changes) < self.config.include_changes_under_chars: + if ( + isinstance(changes, str) + and self.config.include_changes_under_chars + and len(changes) < self.config.include_changes_under_chars + ): features.append(f"Modification: {changes}") # Analyze metrics for standout characteristics @@ -523,9 +529,15 @@ def _extract_unique_features(self, program: Dict[str, Any]) -> str: features.append("NumPy-based implementation") if "for" in code_lower and "while" in code_lower: features.append("Mixed iteration strategies") - if self.config.concise_implementation_max_lines and len(code.split("\n")) <= self.config.concise_implementation_max_lines: + if ( + self.config.concise_implementation_max_lines + and len(code.split("\n")) <= self.config.concise_implementation_max_lines + ): features.append("Concise implementation") - elif self.config.comprehensive_implementation_min_lines and len(code.split("\n")) >= self.config.comprehensive_implementation_min_lines: + elif ( + self.config.comprehensive_implementation_min_lines + and len(code.split("\n")) >= self.config.comprehensive_implementation_min_lines + ): features.append("Comprehensive implementation") # Default if no specific features found diff --git a/tests/test_database.py b/tests/test_database.py index cd11a7e26..090f4d48f 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -498,19 +498,22 @@ def test_migration_prevents_re_migration(self): # It should still exist with the same ID (not a new migrant ID) still_exists = multi_db.get(original_id) self.assertIsNotNone(still_exists) - + # Count new programs created by migration (identified by "_migrant_" pattern) new_migrant_ids = [pid for pid in multi_db.programs if "_migrant_" in pid] - + # Each non-migrant program (2 of them) migrates to 2 adjacent islands # So we expect 2 * 2 = 4 new migrant programs # The already-marked migrant (test_prog_0) should NOT create any new copies self.assertEqual(len(new_migrant_ids), 4) - + # Verify the already-migrant program didn't create new copies migrant_descendants = [pid for pid in new_migrant_ids if original_id in pid] - self.assertEqual(len(migrant_descendants), 0, - f"Program {original_id} should not have created migrant copies") + self.assertEqual( + len(migrant_descendants), + 0, + f"Program {original_id} should not have created migrant copies", + ) def test_empty_island_initialization_creates_copies(self): """Test that empty islands are initialized with copies, not shared references""" @@ -531,7 +534,7 @@ def test_empty_island_initialization_creates_copies(self): metrics={"score": 0.9, "combined_score": 0.9}, ) multi_db.add(program, target_island=1) - + # Make it the best program multi_db.best_program_id = "original_program" @@ -543,12 +546,12 @@ def test_empty_island_initialization_creates_copies(self): self.assertNotEqual(sampled_parent.id, "original_program") 
self.assertEqual(sampled_parent.code, program.code) # Same code self.assertEqual(sampled_parent.parent_id, "original_program") # Parent is the original - + # Check island membership self.assertIn("original_program", multi_db.islands[1]) self.assertNotIn("original_program", multi_db.islands[0]) self.assertIn(sampled_parent.id, multi_db.islands[0]) - + # Run validation - should not raise any errors multi_db._validate_migration_results() @@ -619,20 +622,19 @@ def test_migration_validation_passes(self): # Increment generations to trigger migration for island in range(3): multi_db.island_generations[island] += 1 - + # Migrate programs multi_db.migrate_programs() - + # Validation should pass without warnings multi_db._validate_migration_results() - + # Verify no program has exponential ID growth for program_id in multi_db.programs: # Count occurrences of "migrant" in ID migrant_count = program_id.count("migrant") self.assertLessEqual( - migrant_count, 1, - f"Program ID {program_id} has been migrated multiple times" + migrant_count, 1, f"Program ID {program_id} has been migrated multiple times" ) diff --git a/tests/test_database_cleanup.py b/tests/test_database_cleanup.py new file mode 100644 index 000000000..0ec012565 --- /dev/null +++ b/tests/test_database_cleanup.py @@ -0,0 +1,60 @@ +# tests/test_database_cleanup.py + +import os +import shutil +import tempfile +import time +import unittest + +from openevolve.config import DatabaseConfig +from openevolve.database import Program, ProgramDatabase + + +class TestArtifactCleanup(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + self.db_path = os.path.join(self.temp_dir, "db") + os.makedirs(self.db_path, exist_ok=True) + + def tearDown(self): + shutil.rmtree(self.temp_dir) + + def test_artifact_cleanup(self): + # 1. Configure the database for cleanup + config = DatabaseConfig( + db_path=self.db_path, cleanup_old_artifacts=True, artifact_retention_days=1 + ) + db = ProgramDatabase(config) + + # 2. Create dummy artifact directories + artifacts_path = os.path.join(self.db_path, "artifacts") + os.makedirs(artifacts_path, exist_ok=True) + + dir_to_keep = os.path.join(artifacts_path, "artifact_to_keep") + dir_to_delete = os.path.join(artifacts_path, "artifact_to_delete") + + os.makedirs(dir_to_keep, exist_ok=True) + os.makedirs(dir_to_delete, exist_ok=True) + + # 3. Set the modification time of one directory to be old + old_time = time.time() - (2 * 24 * 60 * 60) # 2 days ago + os.utime(dir_to_delete, (old_time, old_time)) + + self.assertTrue(os.path.exists(dir_to_keep)) + self.assertTrue(os.path.exists(dir_to_delete)) + + # 4. Call the save method, which should trigger the cleanup + db.save() + + # 5. Assert that the old directory was deleted and the new one was kept + self.assertTrue( + os.path.exists(dir_to_keep), "New artifact directory should not be deleted." + ) + self.assertFalse( + os.path.exists(dir_to_delete), "Old artifact directory should have been deleted." 
+ ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_iteration_counting.py b/tests/test_iteration_counting.py index 0d44c47dc..3f0df9b84 100644 --- a/tests/test_iteration_counting.py +++ b/tests/test_iteration_counting.py @@ -22,7 +22,7 @@ class TestIterationCounting(unittest.TestCase): def setUp(self): """Set up test environment""" self.test_dir = tempfile.mkdtemp() - + # Create test program self.program_content = """# EVOLVE-BLOCK-START def compute(x): @@ -32,7 +32,7 @@ def compute(x): self.program_file = os.path.join(self.test_dir, "test_program.py") with open(self.program_file, "w") as f: f.write(self.program_content) - + # Create test evaluator self.eval_content = """ def evaluate(program_path): @@ -41,108 +41,109 @@ def evaluate(program_path): self.eval_file = os.path.join(self.test_dir, "evaluator.py") with open(self.eval_file, "w") as f: f.write(self.eval_content) - + def tearDown(self): """Clean up test environment""" import shutil + shutil.rmtree(self.test_dir, ignore_errors=True) - + def test_fresh_start_iteration_counting(self): """Test that fresh start correctly handles iteration 0 as special""" # Test the logic without actually running evolution config = Config() config.max_iterations = 20 config.checkpoint_interval = 10 - + # Simulate fresh start start_iteration = 0 should_add_initial = True - + # Apply the logic from controller.py evolution_start = start_iteration evolution_iterations = config.max_iterations - + if should_add_initial and start_iteration == 0: evolution_start = 1 - + # Verify self.assertEqual(evolution_start, 1, "Evolution should start at iteration 1") self.assertEqual(evolution_iterations, 20, "Should run 20 evolution iterations") - + # Simulate what process_parallel would do total_iterations = evolution_start + evolution_iterations self.assertEqual(total_iterations, 21, "Total range should be 21 (1 through 20)") - + # Check checkpoint alignment expected_checkpoints = [] for i in range(evolution_start, total_iterations): if i > 0 and i % config.checkpoint_interval == 0: expected_checkpoints.append(i) - + self.assertEqual(expected_checkpoints, [10, 20], "Checkpoints should be at 10 and 20") - + def test_resume_iteration_counting(self): """Test that resume correctly continues from checkpoint""" config = Config() config.max_iterations = 10 config.checkpoint_interval = 10 - + # Simulate resume from checkpoint 10 start_iteration = 11 # Last iteration was 10, so start at 11 should_add_initial = False - + # Apply the logic evolution_start = start_iteration evolution_iterations = config.max_iterations - + if should_add_initial and start_iteration == 0: evolution_start = 1 - + # Verify self.assertEqual(evolution_start, 11, "Evolution should continue from iteration 11") self.assertEqual(evolution_iterations, 10, "Should run 10 more iterations") - + # Total iterations total_iterations = evolution_start + evolution_iterations self.assertEqual(total_iterations, 21, "Should run through iteration 20") - + # Check checkpoint at 20 expected_checkpoints = [] for i in range(evolution_start, total_iterations): if i > 0 and i % config.checkpoint_interval == 0: expected_checkpoints.append(i) - + self.assertEqual(expected_checkpoints, [20], "Should checkpoint at 20") - + def test_checkpoint_boundary_conditions(self): """Test checkpoint behavior at various boundaries""" test_cases = [ # (start_iter, max_iter, checkpoint_interval, expected_checkpoints) (1, 100, 10, list(range(10, 101, 10))), # Standard case - (1, 99, 10, list(range(10, 100, 10))), # Just 
short of last checkpoint + (1, 99, 10, list(range(10, 100, 10))), # Just short of last checkpoint (1, 101, 10, list(range(10, 101, 10))), # Just past checkpoint (0, 20, 5, [5, 10, 15, 20]), # Special case with iteration 0 ] - + for start, max_iter, interval, expected in test_cases: # Apply fresh start logic evolution_start = start if start == 0: evolution_start = 1 - + total = evolution_start + max_iter - + checkpoints = [] for i in range(evolution_start, total): if i > 0 and i % interval == 0: checkpoints.append(i) - + self.assertEqual( - checkpoints, - expected, - f"Failed for start={start}, max={max_iter}, interval={interval}" + checkpoints, + expected, + f"Failed for start={start}, max={max_iter}, interval={interval}", ) - + async def test_controller_iteration_behavior(self): """Test actual controller behavior with iteration counting""" config = Config() @@ -150,42 +151,42 @@ async def test_controller_iteration_behavior(self): config.checkpoint_interval = 10 config.database.in_memory = True config.evaluator.parallel_evaluations = 1 - + controller = OpenEvolve( initial_program_path=self.program_file, evaluation_file=self.eval_file, config=config, - output_dir=self.test_dir + output_dir=self.test_dir, ) - + # Track checkpoint calls checkpoint_calls = [] original_save = controller._save_checkpoint controller._save_checkpoint = lambda i: checkpoint_calls.append(i) or original_save(i) - + # Mock LLM - with patch('openevolve.llm.ensemble.LLMEnsemble.generate_with_context') as mock_llm: - mock_llm.return_value = '''```python + with patch("openevolve.llm.ensemble.LLMEnsemble.generate_with_context") as mock_llm: + mock_llm.return_value = """```python # EVOLVE-BLOCK-START def compute(x): return x << 1 # EVOLVE-BLOCK-END -```''' - +```""" + # Run with limited iterations to test await controller.run(iterations=20) - + # Verify checkpoints were called correctly # Note: We expect checkpoints at 10 and 20 self.assertIn(10, checkpoint_calls, "Should checkpoint at iteration 10") self.assertIn(20, checkpoint_calls, "Should checkpoint at iteration 20") - + # Verify we have the right number of programs (initial + 20 evolution) # This may vary due to parallel execution, but should be at least 21 self.assertGreaterEqual( - len(controller.database.programs), - 21, - "Should have at least 21 programs (initial + 20 iterations)" + len(controller.database.programs), + 21, + "Should have at least 21 programs (initial + 20 iterations)", ) @@ -194,7 +195,7 @@ def compute(x): suite = unittest.TestLoader().loadTestsFromTestCase(TestIterationCounting) runner = unittest.TextTestRunner(verbosity=2) result = runner.run(suite) - + # Run the async test separately async def run_async_test(): test = TestIterationCounting() @@ -206,5 +207,5 @@ async def run_async_test(): print(f"✗ test_controller_iteration_behavior failed: {e}") finally: test.tearDown() - - asyncio.run(run_async_test()) \ No newline at end of file + + asyncio.run(run_async_test()) diff --git a/tests/test_process_parallel.py b/tests/test_process_parallel.py index 29c8ecbe4..f7965a2e4 100644 --- a/tests/test_process_parallel.py +++ b/tests/test_process_parallel.py @@ -23,7 +23,7 @@ class TestProcessParallel(unittest.TestCase): def setUp(self): """Set up test environment""" self.test_dir = tempfile.mkdtemp() - + # Create test config self.config = Config() self.config.max_iterations = 10 @@ -32,7 +32,7 @@ def setUp(self): self.config.database.num_islands = 2 self.config.database.in_memory = True self.config.checkpoint_interval = 5 - + # Create test evaluation
file self.eval_content = """ def evaluate(program_path): @@ -41,10 +41,10 @@ def evaluate(program_path): self.eval_file = os.path.join(self.test_dir, "evaluator.py") with open(self.eval_file, "w") as f: f.write(self.eval_content) - + # Create test database self.database = ProgramDatabase(self.config.database) - + # Add some test programs for i in range(3): program = Program( @@ -52,62 +52,64 @@ def evaluate(program_path): code=f"def func_{i}(): return {i}", language="python", metrics={"score": 0.5 + i * 0.1, "performance": 0.4 + i * 0.1}, - iteration_found=0 + iteration_found=0, ) self.database.add(program) - + def tearDown(self): """Clean up test environment""" import shutil + shutil.rmtree(self.test_dir, ignore_errors=True) - + def test_controller_initialization(self): """Test that controller initializes correctly""" controller = ProcessParallelController(self.config, self.eval_file, self.database) - + self.assertEqual(controller.num_workers, 2) self.assertIsNone(controller.executor) self.assertIsNotNone(controller.shutdown_event) - + def test_controller_start_stop(self): """Test starting and stopping the controller""" controller = ProcessParallelController(self.config, self.eval_file, self.database) - + # Start controller controller.start() self.assertIsNotNone(controller.executor) - + # Stop controller controller.stop() self.assertIsNone(controller.executor) self.assertTrue(controller.shutdown_event.is_set()) - + def test_database_snapshot_creation(self): """Test creating database snapshot for workers""" controller = ProcessParallelController(self.config, self.eval_file, self.database) - + snapshot = controller._create_database_snapshot() - + # Verify snapshot structure self.assertIn("programs", snapshot) self.assertIn("islands", snapshot) self.assertIn("current_island", snapshot) self.assertIn("artifacts", snapshot) - + # Verify programs are serialized self.assertEqual(len(snapshot["programs"]), 3) for pid, prog_dict in snapshot["programs"].items(): self.assertIsInstance(prog_dict, dict) self.assertIn("id", prog_dict) self.assertIn("code", prog_dict) - + def test_run_evolution_basic(self): """Test basic evolution run""" + async def run_test(): controller = ProcessParallelController(self.config, self.eval_file, self.database) - + # Mock the executor to avoid actually spawning processes - with patch.object(controller, '_submit_iteration') as mock_submit: + with patch.object(controller, "_submit_iteration") as mock_submit: # Create mock futures that complete immediately mock_future1 = asyncio.Future() mock_result1 = SerializableResult( @@ -119,47 +121,45 @@ async def run_test(): "generation": 1, "metrics": {"score": 0.7, "performance": 0.8}, "iteration_found": 1, - "metadata": {"changes": "test", "island": 0} + "metadata": {"changes": "test", "island": 0}, }, parent_id="test_0", iteration_time=0.1, - iteration=1 + iteration=1, ) mock_future1.set_result(mock_result1) - + mock_submit.return_value = mock_future1 - + # Start controller controller.start() - + # Run evolution for 1 iteration result = await controller.run_evolution( - start_iteration=1, - max_iterations=1, - target_score=None + start_iteration=1, max_iterations=1, target_score=None ) - + # Verify iteration was submitted mock_submit.assert_called_once_with(1) - + # Verify program was added to database self.assertIn("child_1", self.database.programs) child = self.database.get("child_1") self.assertEqual(child.metrics["score"], 0.7) - + # Run the async test asyncio.run(run_test()) - + def test_request_shutdown(self): """Test 
graceful shutdown request""" controller = ProcessParallelController(self.config, self.eval_file, self.database) - + # Request shutdown controller.request_shutdown() - + # Verify shutdown event is set self.assertTrue(controller.shutdown_event.is_set()) - + def test_serializable_result(self): """Test SerializableResult dataclass""" result = SerializableResult( @@ -167,24 +167,21 @@ def test_serializable_result(self): parent_id="parent", iteration_time=1.5, iteration=10, - error=None + error=None, ) - + # Verify attributes self.assertEqual(result.child_program_dict["id"], "test") self.assertEqual(result.parent_id, "parent") self.assertEqual(result.iteration_time, 1.5) self.assertEqual(result.iteration, 10) self.assertIsNone(result.error) - + # Test with error - error_result = SerializableResult( - error="Test error", - iteration=5 - ) + error_result = SerializableResult(error="Test error", iteration=5) self.assertEqual(error_result.error, "Test error") self.assertIsNone(error_result.child_program_dict) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main()
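Editor's note (illustration only, not part of the patch): several hunks above revolve around one pattern — configuration is flattened into plain dicts by _serialize_config() so it can be pickled into every worker via ProcessPoolExecutor's initializer (see start(), which passes initargs=(config_dict, self.evaluation_file)). The sketch below is a minimal, self-contained rendering of that pattern; WorkerSettings, _worker_init, and _evaluate are hypothetical names chosen for the example, not symbols from the repository.

    from concurrent.futures import ProcessPoolExecutor
    from dataclasses import asdict, dataclass


    @dataclass
    class WorkerSettings:
        # Hypothetical stand-in for the nested config objects serialized above
        temperature: float = 0.7
        max_tokens: int = 4096


    _settings = None  # populated once per worker process by the initializer


    def _worker_init(settings_dict):
        # Rebuild the dataclass inside the worker from the pickled plain dict
        global _settings
        _settings = WorkerSettings(**settings_dict)


    def _evaluate(iteration):
        # Each task reads the worker-local settings rebuilt by _worker_init
        return iteration, _settings.max_tokens


    if __name__ == "__main__":
        config_dict = asdict(WorkerSettings())  # a plain dict crosses the process boundary safely
        with ProcessPoolExecutor(
            max_workers=2, initializer=_worker_init, initargs=(config_dict,)
        ) as pool:
            print(list(pool.map(_evaluate, range(3))))

The controller in the patch applies the same idea to program state: _create_database_snapshot() reduces the database to plain dicts and lists before submission, since only picklable data can be shipped to worker processes.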