In [None]:
# Install (or upgrade, because of -U) JAX with its "cuda" extra via the %pip magic.
%pip install -U "jax[cuda]"

In [None]:
# Install (or upgrade) the briancf1 fork of QDax, with its "examples" extra, directly from GitHub.
%pip install -U "git+https://github.com/briancf1/QDax.git#egg=qdax[examples]"

In [None]:
# Clone the repository to get experiment scripts
# NOTE: `%cd` changes the kernel's working directory, so every relative path
# used later in this notebook (e.g. "multiseed_logs") resolves inside
# QDax/examples. Re-running this cell in the same kernel repeats the clone
# and the `%cd` starting from that new directory.
!git clone https://github.com/briancf1/QDax.git
%cd QDax/examples

## Setup and Imports

In [None]:
import os
import json
import time
from datetime import datetime
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import functools

import jax
import jax.numpy as jnp

from qdax.core.dns_ga import DominatedNoveltySearchGA
from qdax.core.dns import DominatedNoveltySearch
import qdax.tasks.brax as environments
from qdax.tasks.brax.env_creators import scoring_function_brax_envs as scoring_function
from qdax.core.neuroevolution.buffers.buffer import QDTransition
from qdax.core.neuroevolution.networks.networks import MLP
from qdax.core.emitters.mutation_operators import isoline_variation
from qdax.core.emitters.standard_emitters import MixingEmitter
from qdax.utils.metrics import CSVLogger, default_qd_metrics

# Plot styling: seaborn's white grid theme plus a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

# Every CSV log, JSON result file and figure is written under this directory.
Path("multiseed_logs").mkdir(exist_ok=True)

# Report where the notebook is running and which devices JAX can see.
print("\n".join([
    "Setup complete!",
    f"Current directory: {os.getcwd()}",
    f"JAX devices: {jax.devices()}",
]))

## Experiment Configuration

**Selected configurations** (based on initial promising results):
- **Baselines**: DNS without GA, run at iso_sigma = 0.005 and 0.01
- **Control Test**: g_n=∞ (never calls GA, should match baseline exactly) — described here for reference, but not part of `SELECTED_CONFIGS` below
- **Tier 1**: g300_gen2 (proven winner)
- **Tier 3**: g500_gen3, g1000_gen4 (best efficiency)
- **Tier 4**: g150_gen1 (best performance in initial run)

**Seeds**: 3 different random seeds for statistical robustness

**Total**: 5 configs × 2 iso_sigmas × 3 seeds = **30 experiments** (~2 hours, assuming about 4 minutes per run)

In [None]:
# Hyperparameters shared by every run in this notebook.
FIXED_PARAMS = dict(
    batch_size=100,
    env_name='walker2d_uni',
    episode_length=100,
    num_iterations=3000,
    policy_hidden_layer_sizes=(64, 64),
    population_size=1024,
    k=3,
    line_sigma=0.05,
)

# Each (config, iso_sigma) pair is repeated once per seed.
SEEDS = [42, 123, 456]

# iso_sigma values swept for every configuration.
ISO_SIGMAS = [0.005, 0.01]


def _dns_ga_config(name, g_n, num_ga_generations, num_ga_children=2):
    """Build one DNS-GA entry with the same keys as the baseline entry."""
    return {
        'type': 'dns-ga',
        'name': name,
        'g_n': g_n,
        'num_ga_children': num_ga_children,
        'num_ga_generations': num_ga_generations,
    }


# Only the configurations selected from the earlier exploratory run.
SELECTED_CONFIGS = [
    # Baseline (no GA): the GA-specific fields are left as None.
    {
        'type': 'baseline',
        'name': 'DNS_baseline',
        'g_n': None,
        'num_ga_children': None,
        'num_ga_generations': None,
    },
    # Proven winner
    _dns_ga_config('DNS-GA_g300_gen2', g_n=300, num_ga_generations=2),
    # Best efficiency configurations
    _dns_ga_config('DNS-GA_g500_gen3', g_n=500, num_ga_generations=3),
    _dns_ga_config('DNS-GA_g1000_gen4', g_n=1000, num_ga_generations=4),
    # Best performance (initial results)
    _dns_ga_config('DNS-GA_g150_gen1', g_n=150, num_ga_generations=1),
]

_counts = (len(SELECTED_CONFIGS), len(ISO_SIGMAS), len(SEEDS))
print(
    f"Total experiments: {_counts[0]} configs × {_counts[1]} iso_sigmas × "
    f"{_counts[2]} seeds = {_counts[0] * _counts[1] * _counts[2]}"
)

## Helper Functions

In [None]:
def calculate_ga_overhead_evals(g_n, num_iterations, population_size, num_ga_children, num_ga_generations):
    """Count the extra evaluations attributed to Competition-GA over a whole run.

    The GA is assumed to be invoked once every ``g_n`` iterations. Each call is
    charged ``population_size * (c + c**2 + ... + c**g)`` evaluations, where
    ``c = num_ga_children`` and ``g = num_ga_generations``.

    Returns:
        ``(total_ga_evals, num_ga_calls, evals_per_ga_call)``; all zeros when
        ``g_n`` is ``None`` (no GA configured).
    """
    # No GA at all: nothing to account for.
    if g_n is None:
        return 0, 0, 0

    calls = num_iterations // g_n

    if num_ga_children != 1:
        # Closed form of the geometric sum c + c^2 + ... + c^g.
        per_call = population_size * num_ga_children * (num_ga_children**num_ga_generations - 1) // (num_ga_children - 1)
    else:
        # c == 1 would divide by zero in the closed form; the sum is simply g.
        per_call = population_size * num_ga_generations

    return calls * per_call, calls, per_call


def setup_environment(env_name, episode_length, policy_hidden_layer_sizes, batch_size, seed):
    """Create the environment, the policy MLP and a batch of initial parameters.

    Args:
        env_name: Name passed to ``environments.create``.
        episode_length: Episode length passed to ``environments.create``.
        policy_hidden_layer_sizes: Tuple of hidden layer widths; the action
            size of the environment is appended as the output layer.
        batch_size: Number of independently initialised policies.
        seed: Integer used to create the JAX PRNG key.

    Returns:
        ``(env, policy_network, reset_fn, init_variables, key)`` where
        ``reset_fn`` is the jitted ``env.reset`` and ``key`` is the PRNG key
        left over after initialisation.
    """
    env = environments.create(env_name, episode_length=episode_length)

    # Output layer width equals the action size; tanh is the final activation.
    layer_sizes = policy_hidden_layer_sizes + (env.action_size,)
    policy_network = MLP(
        layer_sizes=layer_sizes,
        kernel_init=jax.nn.initializers.lecun_uniform(),
        final_activation=jnp.tanh,
    )

    # One half of the seed key is handed back to the caller, the other half is
    # fanned out into one key per policy.
    key, init_key = jax.random.split(jax.random.key(seed))
    per_policy_keys = jax.random.split(init_key, num=batch_size)

    # A zero observation batch is enough for the network to be initialised.
    dummy_obs = jnp.zeros(shape=(batch_size, env.observation_size))
    init_variables = jax.vmap(policy_network.init)(per_policy_keys, dummy_obs)

    reset_fn = jax.jit(env.reset)
    return env, policy_network, reset_fn, init_variables, key


def create_scoring_function(env, policy_network, reset_fn, episode_length, env_name):
    """Bind the Brax scoring function to this environment and policy network.

    Returns a ``functools.partial`` of ``scoring_function`` with the episode
    length, reset function, step function and the descriptor extractor for
    ``env_name`` already filled in.
    """
    def play_step_fn(env_state, policy_params, key):
        """Apply the policy for one environment step and record the transition."""
        actions = policy_network.apply(policy_params, env_state.obs)
        # Read the current descriptor before stepping.
        # NOTE(review): presumably env.step may update the info dict in place,
        # so this order should be kept - confirm against the env wrappers.
        descriptor_before = env_state.info["state_descriptor"]
        next_state = env.step(env_state, actions)

        transition = QDTransition(
            obs=env_state.obs,
            next_obs=next_state.obs,
            rewards=next_state.reward,
            dones=next_state.done,
            actions=actions,
            truncations=next_state.info["truncation"],
            state_desc=descriptor_before,
            next_state_desc=next_state.info["state_descriptor"],
        )
        # policy_params and key are passed through unchanged.
        return next_state, policy_params, key, transition

    return functools.partial(
        scoring_function,
        episode_length=episode_length,
        play_reset_fn=reset_fn,
        play_step_fn=play_step_fn,
        descriptor_extractor=environments.descriptor_extractor[env_name],
    )


def create_mutation_function(iso_sigma):
    """Build the Gaussian mutation operator used by Competition-GA.

    The returned function takes ``(genotype, key)`` and adds independent
    normal noise scaled by ``iso_sigma`` to every leaf of the genotype pytree,
    using a separate PRNG key per leaf.
    """
    def competition_ga_mutation_fn(genotype, key):
        leaves, structure = jax.tree_util.tree_flatten(genotype)
        # One sub-key per leaf, consumed in flattening order.
        leaf_keys = jax.random.split(key, len(leaves))
        perturbed = [
            leaf + jax.random.normal(leaf_key, shape=leaf.shape) * iso_sigma
            for leaf, leaf_key in zip(leaves, leaf_keys)
        ]
        return jax.tree_util.tree_unflatten(structure, perturbed)

    return competition_ga_mutation_fn

print("Helper functions loaded!")

## Run Experiments

In [None]:
def run_single_experiment(config, iso_sigma, seed, baseline_target_qd):
    """Run one DNS or DNS-GA experiment and return a summary of the run.

    Args:
        config: One entry of ``SELECTED_CONFIGS``. ``config['type'] ==
            'baseline'`` selects ``DominatedNoveltySearch``; any other type
            selects ``DominatedNoveltySearchGA`` configured from ``g_n``,
            ``num_ga_children`` and ``num_ga_generations``.
        iso_sigma: ``iso_sigma`` of the isoline variation and, for DNS-GA, the
            noise scale of the GA mutation function.
        seed: Integer seed for the JAX PRNG key.
        baseline_target_qd: QD score that counts as "converged", or ``None``
            to disable convergence tracking.

    Returns:
        A dict of plain Python values: final metrics, total wall-clock time,
        the convergence iteration and the evaluation counts derived from it.
        The convergence-related entries are ``None`` when the target was not
        reached (or not given).

    Side effects:
        Prints progress and writes one CSV row per log period to
        ``multiseed_logs/<exp_name>_logs.csv``.
    """
    exp_name = f"{config['name']}_iso{iso_sigma}_seed{seed}"
    print(f"\n{'='*80}")
    print(f"Running: {exp_name}")
    print(f"{'='*80}")

    # Setup environment
    env, policy_network, reset_fn, init_variables, key = setup_environment(
        FIXED_PARAMS['env_name'],
        FIXED_PARAMS['episode_length'],
        FIXED_PARAMS['policy_hidden_layer_sizes'],
        FIXED_PARAMS['batch_size'],
        seed
    )

    scoring_fn = create_scoring_function(env, policy_network, reset_fn,
                                        FIXED_PARAMS['episode_length'],
                                        FIXED_PARAMS['env_name'])

    reward_offset = environments.reward_offset[FIXED_PARAMS['env_name']]
    metrics_function = functools.partial(
        default_qd_metrics,
        qd_offset=reward_offset * FIXED_PARAMS['episode_length'],
    )

    # Create emitter: variation only (variation_percentage=1.0), no mutation.
    variation_fn = functools.partial(
        isoline_variation,
        iso_sigma=iso_sigma,
        line_sigma=FIXED_PARAMS['line_sigma']
    )

    mixing_emitter = MixingEmitter(
        mutation_fn=None,
        variation_fn=variation_fn,
        variation_percentage=1.0,
        batch_size=FIXED_PARAMS['batch_size']
    )

    # Create algorithm (DNS or DNS-GA)
    is_baseline = config['type'] == 'baseline'
    if is_baseline:
        algorithm = DominatedNoveltySearch(
            scoring_function=scoring_fn,
            emitter=mixing_emitter,
            metrics_function=metrics_function,
            population_size=FIXED_PARAMS['population_size'],
            k=FIXED_PARAMS['k'],
        )
        print(f"Config: DNS baseline, iso_sigma={iso_sigma}, seed={seed}")
    else:
        mutation_fn = create_mutation_function(iso_sigma)
        algorithm = DominatedNoveltySearchGA(
            scoring_function=scoring_fn,
            emitter=mixing_emitter,
            metrics_function=metrics_function,
            population_size=FIXED_PARAMS['population_size'],
            k=FIXED_PARAMS['k'],
            g_n=config['g_n'],
            num_ga_children=config['num_ga_children'],
            num_ga_generations=config['num_ga_generations'],
            mutation_fn=mutation_fn,
        )
        print(f"Config: g_n={config['g_n']}, gens={config['num_ga_generations']}, iso_sigma={iso_sigma}, seed={seed}")

    # Initialize
    key, subkey = jax.random.split(key)
    repertoire, emitter_state, init_metrics = algorithm.init(init_variables, subkey)

    # Setup logging. Iterations run in chunks of log_period; any remainder of
    # num_iterations that does not fill a whole chunk is not executed.
    log_period = 100
    num_loops = FIXED_PARAMS['num_iterations'] // log_period

    # Seed the metric history with the values at iteration 0.
    metrics = {name: jnp.array([]) for name in ["iteration", "qd_score", "coverage", "max_fitness", "time"]}
    init_metrics = jax.tree.map(lambda x: jnp.array([x]) if x.shape == () else x, init_metrics)
    init_metrics["iteration"] = jnp.array([0], dtype=jnp.int32)
    init_metrics["time"] = jnp.array([0.0])
    metrics = jax.tree.map(
        lambda metric, init_metric: jnp.concatenate([metric, init_metric], axis=0),
        metrics, init_metrics
    )

    log_filename = os.path.join("multiseed_logs", f"{exp_name}_logs.csv")
    csv_logger = CSVLogger(log_filename, header=list(metrics.keys()))
    csv_logger.log(jax.tree.map(lambda x: x[-1], metrics))

    # Main training loop. Both algorithms are driven through scan_update; the
    # DNS-GA carry has one extra entry, a generation counter starting at 1.
    algorithm_scan_update = algorithm.scan_update
    if is_baseline:
        scan_state = (repertoire, emitter_state, key)
    else:
        scan_state = (repertoire, emitter_state, key, 1)  # generation_counter

    start_time_total = time.time()
    convergence_iter = None

    for i in range(num_loops):
        start_time = time.time()

        scan_state, current_metrics = jax.lax.scan(
            algorithm_scan_update,
            scan_state,
            (),
            length=log_period,
        )
        # JAX dispatches work asynchronously: wait for the results so the
        # elapsed time covers the computation, not just the dispatch.
        current_metrics = jax.block_until_ready(current_metrics)

        timelapse = time.time() - start_time
        last_iteration = log_period * (i + 1)

        current_metrics["iteration"] = jnp.arange(
            1 + log_period * i, 1 + last_iteration, dtype=jnp.int32
        )
        # The same chunk duration is recorded for every iteration of the chunk.
        current_metrics["time"] = jnp.repeat(timelapse, log_period)
        metrics = jax.tree.map(
            lambda metric, current_metric: jnp.concatenate([metric, current_metric], axis=0),
            metrics, current_metrics
        )

        csv_logger.log(jax.tree.map(lambda x: x[-1], metrics))

        # Track convergence. Only the last value of each chunk is checked, so
        # convergence_iter is always a multiple of log_period.
        if convergence_iter is None and baseline_target_qd is not None:
            if float(metrics['qd_score'][-1]) >= baseline_target_qd:
                convergence_iter = int(metrics['iteration'][-1])

        if (i + 1) % 10 == 0:
            print(f"Iter {last_iteration}/{FIXED_PARAMS['num_iterations']} - "
                  f"QD: {metrics['qd_score'][-1]:.2f}, "
                  f"MaxFit: {metrics['max_fitness'][-1]:.2f}, "
                  f"Cov: {metrics['coverage'][-1]:.2f}%")

    total_time = time.time() - start_time_total

    print(f"Completed in {total_time:.2f}s - Final QD: {metrics['qd_score'][-1]:.2f}")

    # Calculate metrics
    ga_total_evals, ga_num_calls, ga_evals_per_call = calculate_ga_overhead_evals(
        config.get('g_n'), FIXED_PARAMS['num_iterations'], FIXED_PARAMS['population_size'],
        config.get('num_ga_children'), config.get('num_ga_generations')
    )

    baseline_total_evals = FIXED_PARAMS['num_iterations'] * FIXED_PARAMS['batch_size']
    eval_savings_pct = None
    net_eval_savings_pct = None
    dns_ga_total_evals = None

    if convergence_iter is not None:
        # Evaluations spent by the main loop up to convergence, plus the GA
        # calls that happened before that point for DNS-GA runs.
        dns_main_evals = convergence_iter * FIXED_PARAMS['batch_size']
        if config['type'] == 'dns-ga':
            ga_calls_until_convergence = convergence_iter // config['g_n']
            ga_evals_until_convergence = ga_calls_until_convergence * ga_evals_per_call
            dns_ga_total_evals = dns_main_evals + ga_evals_until_convergence
        else:
            dns_ga_total_evals = dns_main_evals

        # eval_savings_pct ignores GA overhead; net_eval_savings_pct includes it.
        eval_savings_pct = (FIXED_PARAMS['num_iterations'] - convergence_iter) / FIXED_PARAMS['num_iterations'] * 100
        net_eval_savings_pct = (baseline_total_evals - dns_ga_total_evals) / baseline_total_evals * 100

    return {
        'config_name': config['name'],
        'config_type': config['type'],
        'iso_sigma': iso_sigma,
        'seed': seed,
        'g_n': config.get('g_n'),
        'num_ga_generations': config.get('num_ga_generations'),
        'final_qd_score': float(metrics['qd_score'][-1]),
        'final_max_fitness': float(metrics['max_fitness'][-1]),
        'final_coverage': float(metrics['coverage'][-1]),
        'total_time': total_time,
        'convergence_iter': convergence_iter,
        'eval_savings_pct': eval_savings_pct,
        'net_eval_savings_pct': net_eval_savings_pct,
        'dns_ga_total_evals': dns_ga_total_evals,
        'baseline_total_evals': baseline_total_evals,
        'ga_overhead_evals': ga_total_evals,
        'log_file': log_filename,
    }

print("Experiment runner ready!")

In [None]:
# Run all experiments
import traceback  # only needed to report per-experiment failures below

# The convergence target is defined by ONE reference baseline run. Naming its
# iso_sigma and seed once keeps the reference run and the skip check in the
# sweep below in sync.
BASELINE_TARGET_ISO_SIGMA = 0.01
BASELINE_TARGET_SEED = 42


def _dump_results(path):
    """Write every result collected so far, plus the target QD score, as JSON."""
    with open(path, 'w') as f:
        json.dump({'results': all_results, 'baseline_target': BASELINE_TARGET_QD}, f, indent=2)


timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
total_experiments = len(SELECTED_CONFIGS) * len(ISO_SIGMAS) * len(SEEDS)

print(f"\n{'='*80}")
print(f"MULTI-SEED EXPERIMENTS - {timestamp}")
print(f"{'='*80}")
print(f"Total experiments: {total_experiments}")
# Rough estimate assuming about 4 minutes per experiment.
print(f"Estimated time: ~{total_experiments * 4 / 60:.1f} hours")

all_results = []
failed_experiments = []  # names of runs that raised, reported at the end
experiment_count = 0

# First, run the reference baseline to get the target QD score
print("\n" + "#"*80)
print("# ESTABLISHING BASELINE TARGET")
print("#"*80)
# Look the baseline up by type instead of relying on its list position.
baseline_config = next(c for c in SELECTED_CONFIGS if c['type'] == 'baseline')
baseline_target_result = run_single_experiment(
    baseline_config, BASELINE_TARGET_ISO_SIGMA, BASELINE_TARGET_SEED, None
)
BASELINE_TARGET_QD = baseline_target_result['final_qd_score']
all_results.append(baseline_target_result)
experiment_count += 1

print(f"\n>>> BASELINE TARGET QD: {BASELINE_TARGET_QD:.2f} <<<\n")

# Now run all experiments
for config in SELECTED_CONFIGS:
    for iso_sigma in ISO_SIGMAS:
        for seed in SEEDS:
            # Skip the reference baseline run, which is already in all_results
            is_reference_run = (
                config['name'] == baseline_config['name']
                and iso_sigma == BASELINE_TARGET_ISO_SIGMA
                and seed == BASELINE_TARGET_SEED
            )
            if is_reference_run:
                continue

            experiment_count += 1
            print(f"\n{'#'*80}")
            print(f"# Experiment {experiment_count}/{total_experiments}")
            print(f"{'#'*80}")

            try:
                result = run_single_experiment(config, iso_sigma, seed, BASELINE_TARGET_QD)
                all_results.append(result)

                # Save intermediate results
                if experiment_count % 5 == 0:
                    interim_file = f"multiseed_logs/interim_results_{timestamp}.json"
                    _dump_results(interim_file)
                    print(f"\n>>> Saved interim results: {len(all_results)} experiments <<<")

            except Exception as e:
                # Best effort: one failing run must not abort the whole sweep,
                # but it is recorded so the gap is visible in the final report.
                print(f"ERROR: {e}")
                traceback.print_exc()
                failed_experiments.append(f"{config['name']}_iso{iso_sigma}_seed{seed}")

# Save final results
results_file = f"multiseed_logs/multiseed_results_{timestamp}.json"
_dump_results(results_file)

print(f"\n{'='*80}")
print(f"ALL EXPERIMENTS COMPLETE!")
print(f"{'='*80}")
print(f"Total experiments: {len(all_results)}")
print(f"Results saved to: {results_file}")
if failed_experiments:
    print(f"Failed experiments ({len(failed_experiments)}): {', '.join(failed_experiments)}")

## Statistical Analysis

In [None]:
# Load the in-memory results into a DataFrame (one row per experiment)
df = pd.DataFrame(all_results)

# Runs that never reached the target store None in these fields. If that
# happens for every run, the column has object dtype and the aggregations
# below fail, so force a numeric dtype with NaN for missing values.
for _col in ('convergence_iter', 'eval_savings_pct', 'net_eval_savings_pct', 'dns_ga_total_evals'):
    if _col in df.columns:
        df[_col] = pd.to_numeric(df[_col], errors='coerce')

print("="*80)
print("RESULTS SUMMARY")
print("="*80)
print(f"\nTotal experiments: {len(df)}")
print(f"Configurations: {df['config_name'].nunique()}")
print(f"ISO_SIGMA values: {df['iso_sigma'].nunique()}")
print(f"Seeds: {df['seed'].nunique()}")

# Calculate statistics by config and iso_sigma. Missing values (runs that did
# not converge) are skipped by the aggregations.
stats_df = df.groupby(['config_name', 'iso_sigma']).agg({
    'final_qd_score': ['mean', 'std', 'min', 'max'],
    'final_max_fitness': ['mean', 'std'],
    'convergence_iter': ['mean', 'std'],
    'net_eval_savings_pct': ['mean', 'std'],
}).round(2)

print("\n" + "="*80)
print("STATISTICS BY CONFIGURATION AND ISO_SIGMA")
print("="*80)
print(stats_df)

# Save statistics
stats_df.to_csv(f"multiseed_logs/statistics_{timestamp}.csv")
print(f"\nStatistics saved to: multiseed_logs/statistics_{timestamp}.csv")

## Visualization: QD Score Comparison

In [None]:
# Bar plots of the final QD score per configuration, one panel per iso_sigma
def _plot_final_qd_bars(ax, iso_sigma, bar_color):
    """Draw mean final QD score (std as error bars) per config for one iso_sigma.

    Returns ``(runs, summary)``: the rows of ``df`` for this iso_sigma and the
    per-config mean/std table sorted by descending mean.
    """
    runs = df[df['iso_sigma'] == iso_sigma].copy()
    summary = (
        runs.groupby('config_name')['final_qd_score']
        .agg(['mean', 'std'])
        .reset_index()
        .sort_values('mean', ascending=False)
    )

    positions = range(len(summary))
    ax.bar(positions, summary['mean'],
           yerr=summary['std'], capsize=5, alpha=0.7, color=bar_color)
    ax.set_xticks(positions)
    ax.set_xticklabels(summary['config_name'], rotation=45, ha='right')
    ax.set_ylabel('Final QD Score', fontsize=12)
    ax.set_title(f'QD Score Comparison (iso_sigma={iso_sigma})', fontsize=14, fontweight='bold')
    # The reference line is the single-run target, not a per-sigma baseline mean.
    ax.axhline(y=BASELINE_TARGET_QD, color='red', linestyle='--', linewidth=2,
               label=f'Baseline Target ({BASELINE_TARGET_QD:.0f})')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    return runs, summary


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: ISO_SIGMA = 0.005
df_005, summary_005 = _plot_final_qd_bars(ax1, 0.005, 'steelblue')

# Plot 2: ISO_SIGMA = 0.01
df_01, summary_01 = _plot_final_qd_bars(ax2, 0.01, 'darkgreen')

plt.tight_layout()
plt.savefig(f'multiseed_logs/qd_score_comparison_{timestamp}.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 3 configurations by QD score (iso=0.01):")
print(summary_01.head(3).to_string(index=False))

## Visualization: Efficiency vs Performance

In [None]:
# Scatter plot: Net evaluation savings vs QD score, one panel per iso_sigma.
# The panels follow ISO_SIGMAS so the figure stays in sync with the sweep;
# squeeze=False keeps `axes` two-dimensional even for a single iso_sigma.
fig, axes = plt.subplots(1, len(ISO_SIGMAS), figsize=(8 * len(ISO_SIGMAS), 6), squeeze=False)

for ax, iso_val in zip(axes[0], ISO_SIGMAS):
    df_iso = df[df['iso_sigma'] == iso_val].copy()

    # Remove baseline for clarity (no net savings to plot)
    df_iso_ga = df_iso[df_iso['config_type'] == 'dns-ga'].copy()

    if len(df_iso_ga) > 0:
        # Plot each seed as a point. Runs without a net-savings value (target
        # never reached) have no x coordinate to plot.
        for config_name, config_data in df_iso_ga.groupby('config_name', sort=False):
            ax.scatter(config_data['net_eval_savings_pct'],
                       config_data['final_qd_score'],
                       alpha=0.6, s=100, label=config_name)

        # Add baseline reference line (mean over the baseline seeds at this iso_sigma)
        baseline_mean = df_iso[df_iso['config_name'] == 'DNS_baseline']['final_qd_score'].mean()
        ax.axhline(y=baseline_mean, color='red', linestyle='--', linewidth=2,
                   label=f'Baseline Mean ({baseline_mean:.0f})')
        # x = 0 separates net savings from net overhead.
        ax.axvline(x=0, color='gray', linestyle=':', linewidth=1, alpha=0.5)

        ax.set_xlabel('Net Evaluation Savings (%)', fontsize=12)
        ax.set_ylabel('Final QD Score', fontsize=12)
        ax.set_title(f'Efficiency vs Performance (iso_sigma={iso_val})',
                     fontsize=14, fontweight='bold')
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'multiseed_logs/efficiency_vs_performance_{timestamp}.png', dpi=300, bbox_inches='tight')
plt.show()

## Statistical Significance Testing

In [None]:
# T-tests comparing each DNS-GA config against baseline (same iso_sigma)
# NOTE(review): the p-values below are not corrected for multiple comparisons.
print("="*80)
print("STATISTICAL SIGNIFICANCE TESTS")
print("="*80)
print("\nComparing each DNS-GA configuration against DNS baseline")
print("(using Welch's t-test for unequal variances)\n")

results_tests = []

for iso_val in ISO_SIGMAS:
    print(f"\n{'='*60}")
    print(f"ISO_SIGMA = {iso_val}")
    print(f"{'='*60}")

    baseline_scores = df[(df['config_name'] == 'DNS_baseline') &
                         (df['iso_sigma'] == iso_val)]['final_qd_score'].values

    for config_name in df[df['config_type'] == 'dns-ga']['config_name'].unique():
        ga_scores = df[(df['config_name'] == config_name) &
                       (df['iso_sigma'] == iso_val)]['final_qd_score'].values

        if len(ga_scores) > 0 and len(baseline_scores) > 0:
            # Welch's t-test (unequal variances)
            t_stat, p_value = stats.ttest_ind(ga_scores, baseline_scores, equal_var=False)

            ga_mean = np.mean(ga_scores)
            base_mean = np.mean(baseline_scores)
            # Sample standard deviation (ddof=1) everywhere, so the reported
            # spread matches the one used for Cohen's d and the pandas-based
            # statistics table.
            ga_std = np.std(ga_scores, ddof=1)
            base_std = np.std(baseline_scores, ddof=1)

            mean_diff = ga_mean - base_mean
            pct_diff = (mean_diff / base_mean) * 100

            # Cohen's d effect size, using the root mean of the two variances
            pooled_std = np.sqrt((ga_std**2 + base_std**2) / 2)
            cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0

            significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"

            print(f"\n{config_name}:")
            print(f"  Mean DNS-GA: {ga_mean:.2f} ± {ga_std:.2f}")
            print(f"  Mean Baseline: {base_mean:.2f} ± {base_std:.2f}")
            print(f"  Difference: {mean_diff:.2f} ({pct_diff:+.2f}%)")
            print(f"  t-statistic: {t_stat:.3f}, p-value: {p_value:.4f} {significance}")
            print(f"  Effect size (Cohen's d): {cohens_d:.3f}")

            results_tests.append({
                'iso_sigma': iso_val,
                'config_name': config_name,
                'mean_ga': ga_mean,
                'std_ga': ga_std,
                'mean_baseline': base_mean,
                'std_baseline': base_std,
                'mean_diff': mean_diff,
                'pct_diff': pct_diff,
                't_stat': t_stat,
                'p_value': p_value,
                'cohens_d': cohens_d,
                'significant': significance,
            })

# Save test results
tests_df = pd.DataFrame(results_tests)
tests_df.to_csv(f'multiseed_logs/significance_tests_{timestamp}.csv', index=False)
print(f"\n\nSignificance tests saved to: multiseed_logs/significance_tests_{timestamp}.csv")

## Final Summary and Conclusions

In [None]:
print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)

# Best configuration by QD score (single best run)
best_overall = df.loc[df['final_qd_score'].idxmax()]
print(f"\nBEST OVERALL QD SCORE:")
print(f"  Config: {best_overall['config_name']}")
print(f"  ISO_SIGMA: {best_overall['iso_sigma']}")
print(f"  Seed: {best_overall['seed']}")
print(f"  QD Score: {best_overall['final_qd_score']:.2f}")

# Best average performance (mean over seeds)
best_avg = df.groupby(['config_name', 'iso_sigma'])['final_qd_score'].mean().reset_index()
best_avg_row = best_avg.loc[best_avg['final_qd_score'].idxmax()]
print(f"\nBEST AVERAGE QD SCORE:")
print(f"  Config: {best_avg_row['config_name']}")
print(f"  ISO_SIGMA: {best_avg_row['iso_sigma']}")
print(f"  Mean QD Score: {best_avg_row['final_qd_score']:.2f}")

# Most efficient with positive net savings.
# The mean is taken over all seeds of a (config, iso_sigma) group first and
# only then filtered for > 0. Filtering individual runs before averaging
# would drop the unfavourable seeds and overstate the savings.
df_ga = df[df['config_type'] == 'dns-ga'].copy()
# Runs that never converged carry no value here; make sure they are NaN so
# the comparison below works even if no run converged at all.
df_ga['net_eval_savings_pct'] = pd.to_numeric(df_ga['net_eval_savings_pct'], errors='coerce')
mean_net_savings = df_ga.groupby(['config_name', 'iso_sigma'])['net_eval_savings_pct'].mean().reset_index()
best_efficiency = mean_net_savings[mean_net_savings['net_eval_savings_pct'] > 0]
if len(best_efficiency) > 0:
    best_eff_row = best_efficiency.loc[best_efficiency['net_eval_savings_pct'].idxmax()]
    print(f"\nMOST EFFICIENT (Positive Net Savings):")
    print(f"  Config: {best_eff_row['config_name']}")
    print(f"  ISO_SIGMA: {best_eff_row['iso_sigma']}")
    print(f"  Mean Net Savings: {best_eff_row['net_eval_savings_pct']:.2f}%")
else:
    print(f"\nNO CONFIGURATIONS WITH POSITIVE NET SAVINGS")

# Key findings
print(f"\n" + "="*80)
print("KEY FINDINGS:")
print("="*80)

baseline_mean_005 = df[(df['config_name'] == 'DNS_baseline') & (df['iso_sigma'] == 0.005)]['final_qd_score'].mean()
baseline_mean_01 = df[(df['config_name'] == 'DNS_baseline') & (df['iso_sigma'] == 0.01)]['final_qd_score'].mean()

print(f"\n1. Mutation Strength Impact:")
print(f"   - Baseline with iso=0.01: {baseline_mean_01:.2f}")
print(f"   - Baseline with iso=0.005: {baseline_mean_005:.2f}")
print(f"   - Improvement: {((baseline_mean_01 - baseline_mean_005) / baseline_mean_005 * 100):+.2f}%")

# Check if any DNS-GA beats baseline at iso=0.01 (comparison of means only)
df_01_ga = df[(df['iso_sigma'] == 0.01) & (df['config_type'] == 'dns-ga')]
if len(df_01_ga) > 0:
    best_ga_01 = df_01_ga.groupby('config_name')['final_qd_score'].mean().max()
    print(f"\n2. Competition-GA Performance:")
    print(f"   - Best DNS-GA (iso=0.01): {best_ga_01:.2f}")
    print(f"   - Baseline (iso=0.01): {baseline_mean_01:.2f}")
    # Each margin is expressed relative to the losing side.
    if best_ga_01 > baseline_mean_01:
        print(f"   ✓ Competition-GA WINS by {((best_ga_01 - baseline_mean_01) / baseline_mean_01 * 100):+.2f}%")
    else:
        print(f"   ✗ Baseline wins by {((baseline_mean_01 - best_ga_01) / best_ga_01 * 100):+.2f}%")

# Count statistically significant improvements (p < 0.05 and a positive difference)
sig_tests_01 = [t for t in results_tests if t['iso_sigma'] == 0.01 and t['p_value'] < 0.05 and t['mean_diff'] > 0]
print(f"\n3. Statistical Significance:")
print(f"   - Configs with significant improvement over baseline (iso=0.01): {len(sig_tests_01)}")
if sig_tests_01:
    for test in sig_tests_01:
        print(f"     • {test['config_name']}: +{test['pct_diff']:.2f}% (p={test['p_value']:.4f})")

print("\n" + "="*80)
print("EXPERIMENT COMPLETE!")
print("="*80)

## Next Steps

Based on these results:

1. **If Competition-GA wins statistically**: Document the winning configuration and analyze what makes it effective
2. **If baseline wins**: Competition-GA is primarily an efficiency technique, not a performance enhancement
3. **Analyze tradeoffs**: Quantify evaluation savings vs QD score differences
4. **Publication**: Use these multi-seed results for robust scientific claims

All results, statistics, and visualizations are saved in `multiseed_logs/`