## Arm Manipulation Task - Full 31-Seed Study

**Environment**: arm (manipulation, 100-DoF, end-effector xy position)
**Purpose**: Test Competition-GA generalization beyond locomotion to manipulation domain
**Timeline**: Expected completion: ~30 minutes on GPU (analytical task, no physics sim); the recorded CPU run below projected roughly 2 hours

In [2]:
import os
import json
import time
from datetime import datetime
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import functools
import warnings
warnings.filterwarnings('ignore')

import jax
import jax.numpy as jnp

from qdax.core.dns_ga import DominatedNoveltySearchGA
from qdax.core.dns import DominatedNoveltySearch
from qdax.tasks.arm import arm_scoring_function
from qdax.core.emitters.mutation_operators import isoline_variation
from qdax.core.emitters.standard_emitters import MixingEmitter
from qdax.utils.metrics import CSVLogger, default_qd_metrics

# Plot defaults shared by every figure produced later in the notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)

# Directory that receives the per-run CSV logs, repertoires and result JSON.
Path("seed_variability_logs_arm").mkdir(parents=True, exist_ok=True)

# One-shot environment report so the recorded output documents where/when it ran.
print(
    "Setup complete!",
    f"Current directory: {os.getcwd()}",
    f"JAX devices: {jax.devices()}",
    f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    sep="\n",
)

Setup complete!
Current directory: /Users/briancf/Desktop/source/EvoAlgsAndSwarm/lib-qdax/QDax/examples
JAX devices: [CpuDevice(id=0)]
Start time: 2025-11-16 09:12:08


## Generate Random Seeds (SAME as ant_omni for consistency)

In [3]:
# Regenerate the 31 experiment seeds from a fixed generation seed (the notebook
# states these match the ant_omni study). This relies on NumPy's legacy global
# RNG; swapping in a different generator would produce a different seed list.
np.random.seed(2024)
RANDOM_SEEDS = [int(s) for s in np.random.randint(1, 100000, size=31)]

seed_banner = "=" * 80
for banner_line in (
    seed_banner,
    "USING SAME 31 RANDOM SEEDS AS ANT_OMNI",
    seed_banner,
    f"Seeds: {RANDOM_SEEDS[:10]}... (showing first 10)",
    f"Total: {len(RANDOM_SEEDS)} seeds",
    seed_banner,
):
    print(banner_line)

# Persist the seeds next to the logs so the run can be audited later.
seed_payload = {'seeds': RANDOM_SEEDS, 'generation_seed': 2024}
with open('seed_variability_logs_arm/random_seeds.json', 'w') as seeds_fh:
    seeds_fh.write(json.dumps(seed_payload, indent=2))

USING SAME 31 RANDOM SEEDS AS ANT_OMNI
Seeds: [7817, 52731, 51809, 35457, 47644, 95781, 68031, 49336, 7978, 61378]... (showing first 10)
Total: 31 seeds


## Experiment Configuration - Arm Task

In [4]:
# Hyper-parameters shared by every configuration and seed in this study.
FIXED_PARAMS = {
    'batch_size': 256,  # GPU-optimized batch size
    'num_param_dimensions': 100,  # Arm joints (100 DoF for complex manipulation)
    'num_iterations': 10000,  # Good convergence for arm task (reduced from 20k to save time)
    'population_size': 2048,  # Repertoire size (8x batch_size, not equal to it)
    'k': 3,  # Increased k for better novelty estimation
    'line_sigma': 0.1,  # Increased for more exploration
    'iso_sigma': 0.05,  # Increased significantly for arm task
    'min_param': 0.0,  # Normalized joint angles [0, 1]
    'max_param': 1.0,
    'min_desc': 0.0,   # End-effector position [0, 1]^2
    'max_desc': 1.0,
}

# Algorithm variants under comparison. 'type' selects the algorithm class in
# run_single_experiment; the GA fields are None for the baseline.
MAIN_CONFIGS = [
    # Baseline
    {
        'type': 'baseline',
        'name': 'DNS_baseline',
        'g_n': None,
        'num_ga_children': None,
        'num_ga_generations': None,
    },
    # Frequent GA
    {
        'type': 'dns-ga',
        'name': 'DNS-GA_g300_gen2',
        'g_n': 300,
        'num_ga_children': 2,
        'num_ga_generations': 2,
    },
    # Rare but deep GA
    {
        'type': 'dns-ga',
        'name': 'DNS-GA_g1000_gen4',
        'g_n': 1000,
        'num_ga_children': 2,
        'num_ga_generations': 4,
    },
]

print("="*80)
print("ARM MANIPULATION TASK CONFIGURATION (GPU-OPTIMIZED)")
print("="*80)
print(f"\nEnvironment: Arm (manipulation)")
print(f"  Type: Robotic arm reaching task")
print(f"  DoF: {FIXED_PARAMS['num_param_dimensions']} (joint angles)")
print(f"  Descriptor: End-effector xy position (2D)")
print(f"  Descriptor bounds: [0, 1]^2 (normalized)")
print(f"  Iterations: {FIXED_PARAMS['num_iterations']}")
print(f"  Population: {FIXED_PARAMS['population_size']}")
print(f"  Batch size: {FIXED_PARAMS['batch_size']} (GPU-optimized)")
print(f"  k (novelty): {FIXED_PARAMS['k']}")
print(f"  Sigma (iso/line): {FIXED_PARAMS['iso_sigma']}/{FIXED_PARAMS['line_sigma']}")
print(f"  Seeds: {len(RANDOM_SEEDS)}")

print(f"\nConfigurations:")
for config in MAIN_CONFIGS:
    if config['type'] == 'baseline':
        print(f"  • {config['name']}: No GA")
    else:
        # Number of GA invocations assuming one every g_n iterations
        # (integer division, so a trailing partial period triggers none).
        ga_calls = FIXED_PARAMS['num_iterations'] // config['g_n']
        print(f"  • {config['name']}: {ga_calls} GA calls")

total_exp = len(MAIN_CONFIGS) * len(RANDOM_SEEDS)
print(f"\nTotal Experiments: {total_exp}")
print(f"Estimated time (2-parallel): ~30-45 minutes with GPU")
print("="*80)

ARM MANIPULATION TASK CONFIGURATION (GPU-OPTIMIZED)

Environment: Arm (manipulation)
  Type: Robotic arm reaching task
  DoF: 100 (joint angles)
  Descriptor: End-effector xy position (2D)
  Descriptor bounds: [0, 1]^2 (normalized)
  Iterations: 10000
  Population: 2048
  Batch size: 256 (GPU-optimized)
  k (novelty): 3
  Sigma (iso/line): 0.05/0.1
  Seeds: 31

Configurations:
  • DNS_baseline: No GA
  • DNS-GA_g300_gen2: 33 GA calls
  • DNS-GA_g1000_gen4: 10 GA calls

Total Experiments: 93
Estimated time (2-parallel): ~30-45 minutes with GPU


## Helper Functions (same as ant_omni)

In [5]:
def calculate_ga_overhead_evals(g_n, num_iterations, population_size, num_ga_children, num_ga_generations):
    """Estimate the extra evaluations attributed to Competition-GA over a run.

    Args:
        g_n: Interval (in iterations) between GA calls; None disables GA.
        num_iterations: Total number of main-loop iterations.
        population_size: Number of individuals the GA starts from per call.
        num_ga_children: Children produced per parent per GA generation.
        num_ga_generations: Number of GA generations per call.

    Returns:
        Tuple (total_ga_evals, num_ga_calls, evals_per_ga_call). All zeros
        when GA is disabled or g_n is not smaller than num_iterations.
    """
    ga_disabled = g_n is None or g_n >= num_iterations
    if ga_disabled:
        return 0, 0, 0

    num_ga_calls, _ = divmod(num_iterations, g_n)

    if num_ga_children != 1:
        # Geometric series population_size * (c + c^2 + ... + c^G) in closed form.
        branching = num_ga_children
        numerator = population_size * branching * (branching**num_ga_generations - 1)
        evals_per_ga_call = numerator // (branching - 1)
    else:
        # With a single child per parent the population never grows.
        evals_per_ga_call = population_size * num_ga_generations

    return num_ga_calls * evals_per_ga_call, num_ga_calls, evals_per_ga_call


def create_mutation_function(iso_sigma, min_param=0.0, max_param=1.0):
    """Build the Gaussian mutation operator handed to Competition-GA.

    Args:
        iso_sigma: Standard deviation of the additive Gaussian noise.
        min_param: Lower clipping bound applied after the noise is added.
        max_param: Upper clipping bound applied after the noise is added.

    Returns:
        A function ``(genotype, key) -> genotype`` that perturbs every leaf of
        the genotype pytree with independent noise and clips the result.
    """
    def _perturb_leaf(leaf, leaf_key):
        noise = iso_sigma * jax.random.normal(leaf_key, shape=leaf.shape)
        return jnp.clip(leaf + noise, min_param, max_param)

    def competition_ga_mutation_fn(genotype, key):
        # One PRNG key per leaf, arranged in the same tree structure as the
        # genotype so tree_map can pair each leaf with its own key.
        leaves, structure = jax.tree_util.tree_flatten(genotype)
        per_leaf_keys = jax.random.split(key, len(leaves))
        key_tree = jax.tree_util.tree_unflatten(structure, per_leaf_keys)
        return jax.tree_util.tree_map(_perturb_leaf, genotype, key_tree)

    return competition_ga_mutation_fn

print("Helper functions loaded!")

Helper functions loaded!


## Single Experiment Runner

In [6]:
def run_single_experiment(config, seed, fixed_params, log_dir="seed_variability_logs_arm", log_period=100):
    """Run one DNS or DNS-GA experiment on the arm task and persist its outputs.

    Args:
        config: Dict with 'name' and 'type'. 'baseline' selects
            DominatedNoveltySearch; any other type selects
            DominatedNoveltySearchGA and additionally requires 'g_n',
            'num_ga_children' and 'num_ga_generations'.
        seed: Integer seed for ``jax.random.key``; also embedded in file names.
        fixed_params: Shared hyper-parameter dict (see FIXED_PARAMS).
        log_dir: Directory receiving ``<name>_seed<seed>_logs.csv`` and
            ``<name>_seed<seed>_repertoire.npz``. Expected to exist already
            (the setup cell creates the default directory).
        log_period: Iterations executed per ``jax.lax.scan`` call; one CSV row
            is written per period. Iterations beyond the last full period
            (``num_iterations % log_period``) are not executed.

    Returns:
        Dict with the config identity, seed, final qd_score / max_fitness /
        coverage, total wall-clock time, the analytical GA overhead estimate,
        and the paths of the log and repertoire files.
    """
    import gc  # local import: only needed for the end-of-run cleanup

    exp_name = f"{config['name']}_seed{seed}"
    is_baseline = config['type'] == 'baseline'

    # Initialize JAX key
    key = jax.random.key(seed)

    # Create initial population - use batch_size for initial sampling
    key, subkey = jax.random.split(key)
    init_variables = jax.random.uniform(
        subkey,
        shape=(fixed_params['batch_size'], fixed_params['num_param_dimensions']),
        minval=fixed_params['min_param'],
        maxval=fixed_params['max_param'],
    )

    scoring_fn = arm_scoring_function

    # Metrics function (no offset for arm task)
    metrics_function = functools.partial(
        default_qd_metrics,
        qd_offset=0.0,
    )

    # Iso+line variation; minval/maxval are forwarded so the operator can keep
    # offspring inside the parameter bounds.
    variation_fn = functools.partial(
        isoline_variation,
        iso_sigma=fixed_params['iso_sigma'],
        line_sigma=fixed_params['line_sigma'],
        minval=fixed_params['min_param'],
        maxval=fixed_params['max_param'],
    )

    # variation_percentage=1.0 with an identity mutation_fn: the emitter is
    # configured to rely on the variation operator only.
    mixing_emitter = MixingEmitter(
        mutation_fn=lambda x, y: (x, y),
        variation_fn=variation_fn,
        variation_percentage=1.0,
        batch_size=fixed_params['batch_size']
    )

    if is_baseline:
        algorithm = DominatedNoveltySearch(
            scoring_function=scoring_fn,
            emitter=mixing_emitter,
            metrics_function=metrics_function,
            population_size=fixed_params['population_size'],
            k=fixed_params['k'],
        )
    else:
        mutation_fn = create_mutation_function(
            fixed_params['iso_sigma'],
            fixed_params['min_param'],
            fixed_params['max_param']
        )
        algorithm = DominatedNoveltySearchGA(
            scoring_function=scoring_fn,
            emitter=mixing_emitter,
            metrics_function=metrics_function,
            population_size=fixed_params['population_size'],
            k=fixed_params['k'],
            g_n=config['g_n'],
            num_ga_children=config['num_ga_children'],
            num_ga_generations=config['num_ga_generations'],
            mutation_fn=mutation_fn,
        )

    key, subkey = jax.random.split(key)
    repertoire, emitter_state, init_metrics = algorithm.init(init_variables, subkey)

    num_loops = fixed_params['num_iterations'] // log_period

    # Column order is fixed so every experiment's CSV has the same layout.
    log_filename = os.path.join(log_dir, f"{exp_name}_logs.csv")
    csv_logger = CSVLogger(log_filename, header=["coverage", "iteration", "max_fitness", "qd_score", "time"])

    # Log the initial metrics as iteration 0 (scalars are lifted to 1-element
    # arrays first so they can be indexed like the per-period metrics below).
    init_metrics_formatted = jax.tree.map(lambda x: jnp.array([x]) if x.shape == () else x, init_metrics)
    init_metrics_formatted["iteration"] = jnp.array([0], dtype=jnp.int32)
    init_metrics_formatted["time"] = jnp.array([0.0])
    # final_metrics always holds the most recently logged row, so the result
    # dict is well-defined even when num_loops == 0.
    final_metrics = jax.tree.map(lambda x: x[-1] if len(x.shape) > 0 else x, init_metrics_formatted)
    csv_logger.log(final_metrics)

    if is_baseline:
        scan_state = (repertoire, emitter_state, key)
    else:
        # The GA variant carries an extra integer initialised to 1 --
        # presumably its generation counter; confirm against
        # DominatedNoveltySearchGA.scan_update.
        scan_state = (repertoire, emitter_state, key, 1)

    start_time_total = time.time()

    for i in range(num_loops):
        start_time = time.time()

        scan_state, current_metrics = jax.lax.scan(
            algorithm.scan_update,
            scan_state,
            (),
            length=log_period,
        )
        # JAX dispatches asynchronously: wait for the results before reading
        # the clock, otherwise only the dispatch time would be measured.
        jax.block_until_ready(current_metrics)

        timelapse = time.time() - start_time

        current_metrics["iteration"] = jnp.arange(
            1 + log_period * i, 1 + log_period * (i + 1), dtype=jnp.int32
        )
        current_metrics["time"] = jnp.repeat(timelapse, log_period)

        # Only log the last value of the period, don't accumulate full history
        final_metrics = jax.tree.map(lambda x: x[-1], current_metrics)
        csv_logger.log(final_metrics)
        del current_metrics

    total_time = time.time() - start_time_total

    ga_total_evals, ga_num_calls, ga_evals_per_call = calculate_ga_overhead_evals(
        config.get('g_n'), fixed_params['num_iterations'], fixed_params['population_size'],
        config.get('num_ga_children'), config.get('num_ga_generations')
    )

    # Save final repertoire for behavior space visualization (the repertoire is
    # the first element of the scan state for both algorithm variants).
    final_repertoire = scan_state[0]

    repertoire_file = os.path.join(log_dir, f"{exp_name}_repertoire.npz")
    jnp.savez(repertoire_file,
        descriptors=final_repertoire.descriptors,
        fitnesses=final_repertoire.fitnesses
    )

    result = {
        'config_name': config['name'],
        'config_type': config['type'],
        'seed': seed,
        'g_n': config.get('g_n'),
        'num_ga_generations': config.get('num_ga_generations'),
        'final_qd_score': float(final_metrics['qd_score']),
        'final_max_fitness': float(final_metrics['max_fitness']),
        'final_coverage': float(final_metrics['coverage']),
        'total_time': total_time,
        'ga_overhead_evals': ga_total_evals,
        'log_file': log_filename,
        'repertoire_file': repertoire_file,
    }

    # Drop references to device-backed objects before collecting garbage.
    del repertoire, emitter_state, final_repertoire, scan_state, final_metrics
    del algorithm, mixing_emitter, scoring_fn, init_variables
    gc.collect()

    # Clears JAX's compilation caches after every experiment; the next
    # experiment therefore recompiles its scan.
    jax.clear_caches()

    return result

print("Experiment runner ready!")

Experiment runner ready!


## Build Queue and Run Experiments (Sequential)

In [7]:
# Timestamp identifies this batch; it is reused in the results file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

print("="*80)
print(f"BUILDING EXPERIMENT QUEUE - {timestamp}")
print("="*80)

# Each queue entry is (1-based experiment index, total experiment count,
# config, seed). Configs are the outer loop, so all seeds of one config run
# back to back.
num_queued_experiments = len(MAIN_CONFIGS) * len(RANDOM_SEEDS)
experiment_queue = []
exp_num = 0

for config in MAIN_CONFIGS:
    for seed in RANDOM_SEEDS:
        exp_num += 1
        experiment_queue.append((exp_num, num_queued_experiments, config, seed))

print(f"\nTotal experiments: {len(experiment_queue)}")
print(f"Execution: sequential (single process)")
print(f"Estimated time: ~45-60 minutes ({FIXED_PARAMS['num_iterations']} iterations)")
print("="*80)

BUILDING EXPERIMENT QUEUE - 20251116_091211

Total experiments: 93
Execution: 2-parallel with ipyparallel
Estimated time: ~45-60 minutes (5000 iterations)


In [8]:
import traceback  # in-cell import: used only to record failure details below

print("\n" + "="*80)
print("RUNNING ARM TASK EXPERIMENTS (SEQUENTIAL)")
print("="*80)
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Expected completion: ~45-60 minutes")
print("="*80)

start_time_all = time.time()

all_results = []
errors = []
last_update = time.time()

print(f"\nRunning {len(experiment_queue)} experiments sequentially...\n")

# The index/total stored in each queue entry are unpacked into throwaway names
# so the notebook-level `total_exp` is not overwritten by the loop.
for exp_num, (_queued_idx, _queued_total, config, seed) in enumerate(experiment_queue, 1):
    try:
        result = run_single_experiment(config, seed, FIXED_PARAMS)
        result['exp_num'] = exp_num
        all_results.append(result)
        print(f"  ✓ Completed: {result['config_name']}, seed={result['seed']}, QD={result['final_qd_score']:.1f}")
    except Exception as e:
        # Best-effort by design: one failed run must not abort the batch.
        # The traceback is kept so the failure can be diagnosed afterwards.
        errors.append({
            'config_name': config['name'],
            'seed': seed,
            'error': str(e),
            'traceback': traceback.format_exc(),
        })
        print(f"  ✗ Failed: {config['name']}, seed={seed}, error: {str(e)}")

    # Progress update, throttled to at most one line every 10 seconds
    if time.time() - last_update > 10:
        elapsed = time.time() - start_time_all
        pct = exp_num / len(experiment_queue) * 100
        avg_time = elapsed / exp_num
        remaining_time = (len(experiment_queue) - exp_num) * avg_time / 60
        print(f"📊 Progress: {exp_num}/{len(experiment_queue)} ({pct:.1f}%) | Elapsed: {elapsed/60:.1f}m | Remaining: ~{remaining_time:.1f}m")
        last_update = time.time()

total_time = time.time() - start_time_all

print("\n" + "="*80)
print("ARM TASK EXPERIMENTS COMPLETE!")
print("="*80)
print(f"End time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Total time: {total_time / 60:.1f} minutes")
print(f"Successful: {len(all_results)}/{len(experiment_queue)}")
print(f"Failed: {len(errors)}")

if errors:
    print("\nErrors:")
    for error in errors:
        print(f"  • {error['config_name']}, seed={error['seed']}")

# Persist everything (including failures) under the batch timestamp.
results_file = f"seed_variability_logs_arm/all_results_{timestamp}.json"
with open(results_file, 'w') as f:
    json.dump({
        'results': all_results,
        'errors': errors,
        'total_time': total_time,
        'timestamp': timestamp,
        'environment': 'arm',
    }, f, indent=2)

print(f"\nResults saved to: {results_file}")
print("="*80)


RUNNING ARM TASK EXPERIMENTS (SEQUENTIAL)
Start time: 2025-11-16 09:12:12
Expected completion: ~45-60 minutes

Running 93 experiments sequentially...

  ✓ Completed: DNS_baseline, seed=7817, QD=-127.5
📊 Progress: 1/93 (1.1%) | Elapsed: 1.4m | Remaining: ~127.9m
  ✓ Completed: DNS_baseline, seed=7817, QD=-127.5
📊 Progress: 1/93 (1.1%) | Elapsed: 1.4m | Remaining: ~127.9m
  ✓ Completed: DNS_baseline, seed=52731, QD=-127.1
📊 Progress: 2/93 (2.2%) | Elapsed: 2.8m | Remaining: ~127.8m
  ✓ Completed: DNS_baseline, seed=52731, QD=-127.1
📊 Progress: 2/93 (2.2%) | Elapsed: 2.8m | Remaining: ~127.8m
  ✓ Completed: DNS_baseline, seed=51809, QD=-127.0
📊 Progress: 3/93 (3.2%) | Elapsed: 4.2m | Remaining: ~126.7m
  ✓ Completed: DNS_baseline, seed=51809, QD=-127.0
📊 Progress: 3/93 (3.2%) | Elapsed: 4.2m | Remaining: ~126.7m
  ✓ Completed: DNS_baseline, seed=35457, QD=-127.4
📊 Progress: 4/93 (4.3%) | Elapsed: 5.6m | Remaining: ~124.5m
  ✓ Completed: DNS_baseline, 

## Quick Results Summary

In [13]:
# Quick per-configuration summary of the final QD scores gathered above.
if len(all_results) > 0:
    df = pd.DataFrame(all_results)
    print("="*80)
    print("ARM TASK RESULTS SUMMARY")
    print("="*80)
    print(f"\nTotal experiments: {len(df)}")
    print("\nFinal QD Scores by Configuration:")
    print(df.groupby('config_name')['final_qd_score'].agg(['mean', 'std', 'min', 'max']).round(2))
    print("\n" + "="*80)
    print("✓ Results ready for integration with ant_omni, walker2d, and humanoid_omni")
    print("✓ ALL 4 ENVIRONMENTS COMPLETE!")
    print("  - 3 locomotion (ant_omni, walker2d_uni, humanoid_omni)")
    print("  - 1 manipulation (arm)")
    print("="*80)
else:
    # Make the empty case visible instead of silently printing nothing.
    print("No successful experiments to summarize.")

ARM TASK RESULTS SUMMARY

Total experiments: 93

Final QD Scores by Configuration:
                     mean   std     min     max
config_name                                    
DNS-GA_g1000_gen4 -128.22  0.32 -129.10 -127.59
DNS-GA_g300_gen2  -128.06  0.29 -128.62 -127.60
DNS_baseline      -127.32  0.25 -127.91 -126.92

✓ Results ready for integration with ant_omni, walker2d, and humanoid_omni
✓ ALL 4 ENVIRONMENTS COMPLETE!
  - 3 locomotion (ant_omni, walker2d_uni, humanoid_omni)
  - 1 manipulation (arm)


## Load Results and Generate Analysis Report

In [20]:
import glob
import json
import pandas as pd
import os

# Load the most recent results file. File names embed a YYYYmmdd_HHMMSS
# timestamp, so lexicographic order is chronological order.
log_dir = "seed_variability_logs_arm"
results_files = sorted(glob.glob(f"{log_dir}/all_results_*.json"))
if not results_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No all_results_*.json found in {log_dir!r}; run the experiment cells first."
    )
results_file = results_files[-1]

print(f"Loading results from: {results_file}")

with open(results_file, 'r') as f:
    data = json.load(f)
    all_results = data['results']

# Create DataFrame (one row per successful experiment)
df = pd.DataFrame(all_results)

print(f"\nLoaded {len(df)} experiment results")
print(f"Configurations: {df['config_name'].unique()}")
print(f"Seeds per config: {df.groupby('config_name').size().to_dict()}")

Loading results from: seed_variability_logs_arm/all_results_20251116_091211.json

Loaded 93 experiment results
Configurations: ['DNS_baseline' 'DNS-GA_g300_gen2' 'DNS-GA_g1000_gen4']
Seeds per config: {'DNS-GA_g1000_gen4': 31, 'DNS-GA_g300_gen2': 31, 'DNS_baseline': 31}


In [21]:
# Verify log files exist and rebuild their paths if needed
log_files = [f for f in os.listdir(log_dir) if f.endswith('_logs.csv')]
print(f"Found {len(log_files)} log files in {log_dir}")

# Rebuild each log_file path from config name and seed, relative to log_dir
# (NOT absolute), so results load correctly from the current working directory
# regardless of where the experiments were originally run.
df['log_file'] = [
    os.path.join(log_dir, f"{config_name}_seed{seed}_logs.csv")
    for config_name, seed in zip(df['config_name'], df['seed'])
]

# Verify all log files exist
missing_files = [f for f in df['log_file'] if not os.path.exists(f)]
if missing_files:
    print(f"WARNING: {len(missing_files)} log files not found")
    # Name a few so the problem can be located without re-running the check.
    for missing_path in missing_files[:5]:
        print(f"  missing: {missing_path}")
else:
    print("✓ All log files found")

Found 93 log files in seed_variability_logs_arm
✓ All log files found
✓ All log files found


## Convergence Analysis (Same as other environments)

In [None]:
def find_convergence_iteration(log_file, baseline_final_qd, tolerance=0.99, min_iteration=100):
    """Return the first logged iteration whose QD score meets the baseline target.

    The target is derived from ``baseline_final_qd`` so that a tolerance below
    1 always makes it easier to reach:

    - non-negative baseline: ``target = baseline_final_qd * tolerance``
    - negative baseline:     ``target = baseline_final_qd / tolerance``
      (e.g. -127 / 0.99 = -128.28, i.e. slightly more negative)

    Args:
        log_file: Path of a CSV with 'iteration' and 'qd_score' columns.
        baseline_final_qd: Reference QD score to reach.
        tolerance: Fraction of the baseline performance that counts as reached.
        min_iteration: Rows with a smaller iteration are ignored, to avoid
            spurious early convergence.

    Returns:
        The iteration (int) of the first qualifying row in file order, or None
        when no row qualifies or the file cannot be processed (the error is
        printed, not raised).

    NOTE(review): this is a first-crossing test. If the QD score is not
    monotonically improving over a run, an early row can satisfy the target
    even though the final score does not -- confirm this is the intended
    definition of convergence.
    """
    try:
        history = pd.read_csv(log_file)

        eligible = history.loc[history['iteration'] >= min_iteration]
        if eligible.empty:
            return None

        if baseline_final_qd < 0:
            target_qd = baseline_final_qd / tolerance
        else:
            target_qd = baseline_final_qd * tolerance

        reached = eligible.loc[eligible['qd_score'] >= target_qd, 'iteration']
        return None if reached.empty else int(reached.iloc[0])
    except Exception as e:
        print(f"Error reading {log_file}: {e}")
        return None

# Calculate baseline statistics
baseline_df = df[df['config_name'] == 'DNS_baseline'].copy()
baseline_mean_final_qd = baseline_df['final_qd_score'].mean()
# By construction the baseline is treated as converging at the final iteration.
baseline_convergence_iteration = FIXED_PARAMS['num_iterations']

print(f"Baseline mean final QD score: {baseline_mean_final_qd:.2f}")
print(f"Baseline convergence iteration: {baseline_convergence_iteration}")

# Calculate convergence for DNS-GA configs - compare each seed to its own baseline
dns_ga_df = df[df['config_name'] != 'DNS_baseline'].copy()

# Seed -> baseline final QD, built once instead of filtering the baseline frame
# for every row (the first baseline row wins if a seed appears more than once).
baseline_qd_by_seed = (
    baseline_df.drop_duplicates(subset='seed')
    .set_index('seed')['final_qd_score']
    .to_dict()
)

def find_convergence_for_seed(row):
    """Return the iteration at which this DNS-GA run first reaches the final
    QD score of the baseline run with the same seed (None if never / no baseline)."""
    baseline_qd = baseline_qd_by_seed.get(row['seed'])
    if baseline_qd is None:
        return None
    # tolerance=1.0: the run must match the baseline's final score exactly or better.
    return find_convergence_iteration(row['log_file'], baseline_qd, tolerance=1.0, min_iteration=100)

# Stored as float so "did not converge" is uniformly NaN (never a mix of None
# and numbers), which also keeps the arithmetic below vectorized.
dns_ga_df['convergence_iteration'] = pd.Series(
    [find_convergence_for_seed(row) for _, row in dns_ga_df.iterrows()],
    index=dns_ga_df.index,
    dtype='float64',
)

# Calculate savings relative to the baseline. Values are measured in
# iterations; NaN propagates for runs that did not converge.
# NOTE(review): the 'evaluation' naming is kept for downstream compatibility,
# and GA overhead (ga_overhead_evals) is not included -- confirm intended.
dns_ga_df['evaluation_savings'] = baseline_convergence_iteration - dns_ga_df['convergence_iteration']
dns_ga_df['evaluation_savings_pct'] = dns_ga_df['evaluation_savings'] / baseline_convergence_iteration * 100

convergence_df = dns_ga_df[dns_ga_df['convergence_iteration'].notna()].copy()

num_dns_ga_runs = len(dns_ga_df)
num_converged_runs = len(convergence_df)
converged_pct = num_converged_runs / num_dns_ga_runs * 100 if num_dns_ga_runs else 0.0

print(f"\nConvergence Analysis:")
print(f"  Total DNS-GA experiments: {num_dns_ga_runs}")
print(f"  Converged: {num_converged_runs} ({converged_pct:.1f}%)")
print(f"  Did not converge: {num_dns_ga_runs - num_converged_runs}")

print(f"\nConvergence by Configuration:")
for config_name in dns_ga_df['config_name'].unique():
    config_df = dns_ga_df[dns_ga_df['config_name'] == config_name]
    config_converged = config_df['convergence_iteration'].notna().sum()
    print(f"  {config_name}: {config_converged}/{len(config_df)} ({config_converged/len(config_df)*100:.1f}%)")

Baseline mean final QD score: -127.32
Baseline convergence iteration: 10000

Convergence Analysis:
  Total DNS-GA experiments: 62
  Converged: 62 (100.0%)
  Did not converge: 0

Convergence by Configuration:
  DNS-GA_g300_gen2: 31/31 (100.0%)
  DNS-GA_g1000_gen4: 31/31 (100.0%)


## Save Comprehensive Analysis to Text File

In [23]:
import contextlib
import sys
from io import StringIO

# Buffer that receives the whole text report before it is written to disk.
output_buffer = StringIO()


def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


def _print_best_and_worst(config_rows):
    """Print the seeds with the largest and smallest evaluation_savings_pct."""
    pct_series = config_rows['evaluation_savings_pct']
    best_idx = pct_series.idxmax()
    worst_idx = pct_series.idxmin()
    print(f"  Best performer:  Seed {config_rows.loc[best_idx, 'seed']} ({pct_series.loc[best_idx]:+.2f}%)")
    print(f"  Worst performer: Seed {config_rows.loc[worst_idx, 'seed']} ({pct_series.loc[worst_idx]:+.2f}%)")
    print()


# Report every DNS-GA configuration, including any with zero converged seeds
# (taking the names from convergence_df would silently omit those).
dns_ga_config_names = [name for name in dns_ga_df['config_name'].unique() if name != 'DNS_baseline']
total_seeds = len(RANDOM_SEEDS)
rule = "=" * 80
sub_rule = "  " + "=" * 60

# NOTE(review): convergence_iteration counts algorithm iterations; the "evals"
# wording is kept for report compatibility, and the GA overhead stored as
# ga_overhead_evals is not part of these savings -- confirm intended.

# redirect_stdout restores sys.stdout even if a statement below raises, so a
# failure cannot leave the notebook with silenced output.
with contextlib.redirect_stdout(output_buffer):
    print(rule)
    print("ARM MANIPULATION - SEED VARIABILITY ANALYSIS RESULTS")
    print(rule)
    print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(rule)
    print()

    # ========================================================================
    # SECTION 1: DATASET SUMMARY
    # ========================================================================
    print(rule)
    print("1. DATASET SUMMARY")
    print(rule)
    print(f"Total experiments: {len(df)}")
    print(f"Configurations: {df['config_name'].unique()}")
    print(f"Seeds per config: {df.groupby('config_name').size().to_dict()}")
    print()
    print("Basic statistics:")
    print(df.groupby('config_name')['final_qd_score'].agg(['mean', 'std', 'min', 'max']))
    print()
    print()

    # ========================================================================
    # SECTION 2: CONVERGENCE EFFICIENCY STATISTICS
    # ========================================================================
    print(rule)
    print(f"2. CONVERGENCE EFFICIENCY STATISTICS ({len(RANDOM_SEEDS)} SEEDS)")
    print(rule)
    print()

    for config_name in dns_ga_config_names:
        config_data = convergence_df[convergence_df['config_name'] == config_name].copy()
        converged_seeds = len(config_data)

        print(f"{config_name}:")
        print(sub_rule)

        # Success rate
        print(f"  Success rate: {converged_seeds}/{total_seeds} seeds ({_pct(converged_seeds, total_seeds):.1f}%)")
        print()

        if converged_seeds == 0:
            print("  No seeds converged; savings statistics unavailable.")
            print()
            continue

        # Evaluation savings statistics
        savings_pct = config_data['evaluation_savings_pct']
        savings_abs = config_data['evaluation_savings']

        print(f"  Evaluation Savings:")
        for stat_label, stat_name in (("Mean:", "mean"), ("Median:", "median"),
                                      ("Std:", "std"), ("Min:", "min"), ("Max:", "max")):
            abs_value = getattr(savings_abs, stat_name)()
            pct_value = getattr(savings_pct, stat_name)()
            print(f"    {stat_label:<14}{abs_value:>10,.0f} evals ({pct_value:>6.2f}%)")
        print()

        # Positive savings count
        positive_savings = (savings_pct > 0).sum()
        print(f"  Seeds with positive savings: {positive_savings}/{converged_seeds} ({_pct(positive_savings, converged_seeds):.1f}%)")
        print()

        # Mean evaluations to convergence
        mean_dns_ga_evals = config_data['convergence_iteration'].mean()
        print(f"  Mean evaluations to convergence:")
        print(f"    Baseline:             {baseline_convergence_iteration:>10,} evals (always)")
        print(f"    DNS-GA (mean):        {mean_dns_ga_evals:>10,.0f} evals")
        print()

        # Best and worst performers
        _print_best_and_worst(config_data)

    print()

    # ========================================================================
    # SECTION 3: SUCCESS RATE BREAKDOWN
    # ========================================================================
    print(rule)
    print("3. SUCCESS RATE BREAKDOWN")
    print(rule)
    print()
    print("Defining 'success' as: DNS-GA reaches baseline convergence with fewer evaluations")
    print(rule)
    print()

    for config_name in dns_ga_config_names:
        config_converged = convergence_df[convergence_df['config_name'] == config_name].copy()
        config_positive = config_converged[config_converged['evaluation_savings'] > 0].copy()

        print(f"{config_name}:")
        print(sub_rule)
        print()

        print(f"  1. Convergence Rate:")
        print(f"     {len(config_converged)}/{total_seeds} seeds ({_pct(len(config_converged), total_seeds):.1f}%)")
        print(f"     → Reached baseline's final QD score within {FIXED_PARAMS['num_iterations']} iterations")
        print()

        print(f"  2. Efficiency Success Rate (converged seeds only):")
        print(f"     {len(config_positive)}/{len(config_converged)} seeds ({_pct(len(config_positive), len(config_converged)):.1f}%)")
        print(f"     → Converged faster than baseline")
        print()

        print(f"  3. Overall Success Rate (all {total_seeds} seeds):")
        print(f"     {len(config_positive)}/{total_seeds} seeds ({_pct(len(config_positive), total_seeds):.1f}%)")
        print(f"     → Both converged AND faster than baseline")
        print()

        if len(config_converged) == 0:
            print("  No seeds converged; savings statistics unavailable.")
            print()
            continue

        print(f"  Evaluation Savings (converged seeds):")
        print(f"     Mean:    {config_converged['evaluation_savings_pct'].mean():.2f}%")
        print(f"     Median:  {config_converged['evaluation_savings_pct'].median():.2f}%")
        print(f"     Std:     {config_converged['evaluation_savings_pct'].std():.2f}%")
        print()

        # Best and worst
        _print_best_and_worst(config_converged)

    print()

    # ========================================================================
    # SECTION 4: SUMMARY TABLE
    # ========================================================================
    print(rule)
    print("4. SUMMARY TABLE: Success Rate Comparison")
    print(rule)
    print()

    summary_data = []
    for config_name in dns_ga_config_names:
        config_converged = convergence_df[convergence_df['config_name'] == config_name].copy()
        config_positive = config_converged[config_converged['evaluation_savings'] > 0].copy()

        summary_data.append({
            'config': config_name,
            'convergence_rate': _pct(len(config_converged), total_seeds),
            'efficiency_rate': _pct(len(config_positive), len(config_converged)),
            'overall_success': _pct(len(config_positive), total_seeds),
            'mean_savings': config_converged['evaluation_savings_pct'].mean() if len(config_converged) > 0 else 0,
        })

    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))
    print()
    print()

    # ========================================================================
    # SECTION 5: STATISTICAL COMPARISONS
    # ========================================================================
    print(rule)
    print("5. STATISTICAL COMPARISONS")
    print(rule)
    print()

    print("--- DNS-GA vs Baseline (All Converged Seeds) ---")
    baseline_mean_evals = baseline_convergence_iteration
    dns_ga_mean_evals = convergence_df['convergence_iteration'].mean()
    mean_difference = baseline_mean_evals - dns_ga_mean_evals

    print(f"Baseline mean: {baseline_mean_evals:,} evals")
    print(f"DNS-GA mean: {dns_ga_mean_evals:,.0f} evals")
    print(f"Mean difference: {mean_difference:,.0f} evals")

    # T-test.
    # NOTE(review): the baseline sample is a constant array (zero variance by
    # construction); a one-sample test against that constant may be the more
    # appropriate choice -- confirm before publishing the p-value.
    baseline_evals = np.full(len(baseline_df), baseline_convergence_iteration)
    dns_ga_evals = convergence_df['convergence_iteration'].values
    if len(dns_ga_evals) >= 2 and len(baseline_evals) >= 2:
        t_stat, p_value = stats.ttest_ind(dns_ga_evals, baseline_evals)
    else:
        # Too few samples for a t-test; report NaN rather than raising.
        t_stat, p_value = float('nan'), float('nan')

    print(f"t-statistic: {t_stat:.3f}")
    print(f"p-value: {p_value:.4f}")
    # The sign of t gives the direction: negative means the DNS-GA sample mean
    # is below the baseline's, i.e. fewer evaluations.
    if p_value < 0.001 and t_stat < 0:
        print("✓ SIGNIFICANT: DNS-GA reaches convergence with FEWER evaluations")
    elif p_value < 0.001:
        print("✗ SIGNIFICANT: DNS-GA needs MORE evaluations than baseline")
    elif p_value < 0.05:
        print("✓ SIGNIFICANT: DNS-GA shows reliable difference")
    else:
        print("~ NOT SIGNIFICANT: No reliable difference")
    print()

    print("--- g300_gen2 vs g1000_gen4 Comparison ---")
    for config_name in dns_ga_config_names:
        config_converged = convergence_df[convergence_df['config_name'] == config_name]
        print(f"{config_name} success: {len(config_converged)}/{total_seeds} ({_pct(len(config_converged), total_seeds):.1f}%)")

    print()
    print("Evaluation savings (converged seeds only):")
    for config_name in dns_ga_config_names:
        config_converged = convergence_df[convergence_df['config_name'] == config_name]
        if len(config_converged) > 0:
            mean_savings = config_converged['evaluation_savings_pct'].mean()
            std_savings = config_converged['evaluation_savings_pct'].std()
            print(f"  {config_name}: {mean_savings:.2f}% ± {std_savings:.2f}%")

    print()
    print()

    # ========================================================================
    # SECTION 6: DISTRIBUTION ANALYSIS
    # ========================================================================
    print(rule)
    print("6. DISTRIBUTION ANALYSIS")
    print(rule)
    print()

    all_savings_pct = convergence_df['evaluation_savings_pct']
    positive_savings_count = (all_savings_pct > 0).sum()

    print(f"All DNS-GA configs combined (n={len(convergence_df)} converged seeds):")
    if len(convergence_df) == 0:
        print("  No converged seeds; distribution statistics unavailable.")
    else:
        skewness = all_savings_pct.skew()
        kurtosis = all_savings_pct.kurtosis()
        if skewness > 0:
            skew_label = 'right-skewed, more low savings'
        elif skewness < 0:
            skew_label = 'left-skewed, more high savings'
        else:
            skew_label = 'fairly symmetric'
        if kurtosis > 0:
            kurtosis_label = 'heavy tails, more outliers'
        elif kurtosis < 0:
            kurtosis_label = 'light tails, fewer outliers'
        else:
            kurtosis_label = 'normal-like tails'

        print(f"  Mean: {all_savings_pct.mean():.2f}%")
        print(f"  Median: {all_savings_pct.median():.2f}%")
        print(f"  Std: {all_savings_pct.std():.2f}%")
        print(f"  Min: {all_savings_pct.min():.2f}%")
        print(f"  Max: {all_savings_pct.max():.2f}%")
        print(f"  Q1: {all_savings_pct.quantile(0.25):.2f}%")
        print(f"  Q3: {all_savings_pct.quantile(0.75):.2f}%")
        print()
        print(f"  Positive savings: {positive_savings_count}/{len(convergence_df)} ({_pct(positive_savings_count, len(convergence_df)):.1f}%)")
        print()
        print(f"  Skewness: {skewness:.3f} ({skew_label})")
        print(f"  Kurtosis: {kurtosis:.3f} ({kurtosis_label})")
    print()
    print()

    # ========================================================================
    # SECTION 7: CONCLUSION
    # ========================================================================
    print(rule)
    print("CONCLUSION")
    print(rule)
    print()
    print("Report 'Overall Success Rate' as the true success rate:")
    print("  → Percentage of all seeds that both converged AND saved evaluations")
    print("This is the most honest metric for publication.")
    print(rule)
    print()

# stdout is restored here; persist the captured report.
analysis_text = output_buffer.getvalue()

output_file = "arm_analysis_results.txt"
# Explicit UTF-8: the report contains non-ASCII symbols (→, ✓, ±) that a
# platform-default encoding may not be able to write.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(analysis_text)

print(f"✓ Analysis saved to: {output_file}")
print(f"  File size: {len(analysis_text)} characters")
print(f"  Sections: 7 (Dataset, Convergence Stats, Success Breakdown, Summary Table, Statistical Tests, Distribution, Conclusion)")

✓ Analysis saved to: arm_analysis_results.txt
  File size: 6097 characters
  Sections: 7 (Dataset, Convergence Stats, Success Breakdown, Summary Table, Statistical Tests, Distribution, Conclusion)
