
# Statistical Testing for Deep RL

In [1]:
import sys
import os
import re
import glob
import time
import json
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind, mannwhitneyu, rankdata
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Root directory from which BASE_DIR (and the paths built on it) are derived;
# taken from the current working directory of the running kernel.
BASE = os.getcwd() 

In [3]:
# Statistical testing configuration
# Sample sizes per group for the FPR/power analysis.
# Alternative, larger grid kept for reference: [5, 10, 20, 30, 50, 100, 150, 200]
SEED_GRID = [2, 3, 5, 10, 20, 30]
ALPHAS = [0.05, 0.01]  # Significance levels
EPSILONS = [0.5, 1.0, 2.0]  # Cohen's d effect sizes for the power analysis
N_RESAMPLES = 1000  # Number of bootstrap/permutation resamples (reduce for faster iteration)

In [4]:
# Environments covered by the analysis; the loops in later cells iterate in this order.
TASKS = [
    "Hopper-v5",
    "Walker2d-v5",
    "HalfCheetah-v5",
    "Ant-v5",
    "Humanoid-v5",
]

# Algorithms compared pairwise within each task.
ALGORITHMS = ["SAC", "TD3", "DDPG", "PPO"]

# Presumably the number of evaluation episodes per evaluation point
# (not used in the visible cells) -- TODO confirm.
EVAL_EPISODES = 20

# Presumably the training budget in environment steps for each task
# (not used in the visible cells) -- TODO confirm.
TIMESTEPS_PER_TASK = {
    "Hopper-v5":      1_000_000,
    "Walker2d-v5":    1_000_000,
    "HalfCheetah-v5": 3_000_000,
    "Ant-v5":         3_000_000,
    "Humanoid-v5":   10_000_000,
}

# Presumably the fallback budget for tasks missing from TIMESTEPS_PER_TASK -- TODO confirm.
DEFAULT_TOTAL_TIMESTEPS = 5_000_000

# Experiment directory layout, all rooted at BASE/rl_experiments.
BASE_DIR = os.path.join(BASE, "rl_experiments")
RUNS_DIR = os.path.join(BASE_DIR, "runs")
MODELS_DIR = os.path.join(BASE_DIR, "models")
RESULTS_CSV = os.path.join(BASE_DIR, "final_eval_returns.csv")
LEARNING_CURVES_CSV = os.path.join(BASE_DIR, "learning_curves.csv")

# Side effect: creates the run/model directories (and BASE_DIR) if they are missing.
os.makedirs(RUNS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

# Seed NumPy's legacy global RNG so code drawing from np.random is reproducible.
GLOBAL_RNG_SEED = 31415
np.random.seed(GLOBAL_RNG_SEED)

In [5]:
# ============================================================================
# Data Loading Functions
# ============================================================================

def load_learning_curves(learning_curves_csv: str = LEARNING_CURVES_CSV, 
                        learning_curves_dir: str = None) -> pd.DataFrame:
    """
    Load learning curves from BOTH sources and merge them.

    Sources:
    1. A single CSV file at ``learning_curves_csv`` (malformed lines are skipped).
    2. Individual per-run CSV files in ``learning_curves_dir`` named
       ``<Task>-v5_<algo>_seed<N>.csv``; task, algorithm and seed are taken
       from the file name when the file itself lacks those columns.

    Args:
        learning_curves_csv: Path to the combined CSV; ignored if it does not exist.
        learning_curves_dir: Directory of per-run CSVs; defaults to
            ``BASE_DIR/learning_curves``. Ignored if it does not exist.

    Returns:
        One DataFrame with rows missing any of (task, algorithm, seed,
        env_steps) removed and one row per such key (the last occurrence wins,
        so per-run files override the combined CSV). An empty DataFrame is
        returned when neither source yields data.
    """
    if learning_curves_dir is None:
        learning_curves_dir = os.path.join(BASE_DIR, "learning_curves")

    key_cols = ["task", "algorithm", "seed", "env_steps"]
    all_curves = []

    # Load from single CSV file first
    if os.path.exists(learning_curves_csv):
        print(f"Loading from single CSV: {learning_curves_csv}")
        df_single = pd.read_csv(learning_curves_csv, on_bad_lines="skip")
        print(f"  Loaded {len(df_single)} entries")
        all_curves.append(df_single)

    # Load from individual CSV files
    if os.path.exists(learning_curves_dir):
        csv_files = glob.glob(os.path.join(learning_curves_dir, "*.csv"))
        if csv_files:
            print(f"Loading from {len(csv_files)} individual files in {learning_curves_dir}")
            # Compiled once: the same pattern is applied to every file name.
            name_pattern = re.compile(r"(.+?)-v5_(.+?)_seed(\d+)\.csv")
            individual_curves = []
            for csv_file in csv_files:
                match = name_pattern.match(os.path.basename(csv_file))
                if not match:
                    continue
                task = match.group(1) + "-v5"
                algo = match.group(2)
                seed = int(match.group(3))
                try:
                    df = pd.read_csv(csv_file)
                except (OSError, ValueError) as exc:
                    # Best effort: one unreadable/empty/corrupt file must not
                    # abort the whole load, but it should not vanish silently.
                    # pandas parser and decoding errors are ValueError subclasses.
                    print(f"  Warning: skipping unreadable file {csv_file}: {exc}")
                    continue
                if not {"env_steps", "eval_return_mean"}.issubset(df.columns):
                    continue
                # Only fill in metadata the file does not already carry.
                if "task" not in df.columns:
                    df["task"] = task
                if "algorithm" not in df.columns:
                    df["algorithm"] = algo
                if "seed" not in df.columns:
                    df["seed"] = seed
                individual_curves.append(df)
            if individual_curves:
                df_individual = pd.concat(individual_curves, ignore_index=True)
                print(f"  Loaded {len(df_individual)} entries from individual files")
                all_curves.append(df_individual)

    if all_curves:
        combined = pd.concat(all_curves, ignore_index=True)
        # Drop rows with NaN in key columns, then keep the last row per key.
        # Resetting the index last leaves it contiguous.
        combined = (
            combined.dropna(subset=key_cols)
            .drop_duplicates(subset=key_cols, keep="last")
            .reset_index(drop=True)
        )
        print(f"Total: {len(combined)} entries after cleaning")
        return combined

    return pd.DataFrame()


def curves_to_rl_stats_format(curves_df: pd.DataFrame, task: str, algo: str):
    """
    Convert learning curves to rl_stats format: (n_steps, n_seeds).

    Args:
        curves_df: Learning-curve rows with columns ``task``, ``algorithm``,
            ``seed``, ``env_steps`` and ``eval_return_mean``.
        task: Task name to select.
        algo: Algorithm name to select.

    Returns:
        (data_array, steps_array, seeds_array) where ``data_array[i, j]`` is the
        ``eval_return_mean`` of ``seeds_array[j]`` at ``steps_array[i]`` (both
        sorted ascending) and NaN where that seed has no row at that step. If a
        (step, seed) pair occurs more than once, the first row is used. Rows
        whose seed or step is NaN are ignored. When nothing matches, an empty
        (0, 0) array and two empty arrays are returned.
    """
    selected = (curves_df['task'] == task) & (curves_df['algorithm'] == algo)
    # Rows without a seed or a step cannot be placed in the grid, and NaN keys
    # would make the sort order below undefined.
    df = curves_df[selected].dropna(subset=['seed', 'env_steps'])
    if df.empty:
        return np.array([]).reshape(0, 0), np.array([]), np.array([])

    seeds = np.array(sorted(df['seed'].unique()))
    steps = np.array(sorted(df['env_steps'].unique()))

    # One row per grid cell, then scatter all values in a single vectorised
    # assignment instead of re-filtering the frame for every (step, seed) pair.
    first = df.drop_duplicates(subset=['env_steps', 'seed'], keep='first')
    row_idx = np.searchsorted(steps, first['env_steps'].to_numpy())
    col_idx = np.searchsorted(seeds, first['seed'].to_numpy())

    data = np.full((len(steps), len(seeds)), np.nan)
    data[row_idx, col_idx] = first['eval_return_mean'].to_numpy(dtype=float)

    return data, steps, seeds


# ============================================================================
# Load learning curves from both sources
# ============================================================================
# Load the merged learning curves and report which tasks/algorithms they cover.
curves = load_learning_curves(LEARNING_CURVES_CSV)
print(f"\nLoaded learning curves shape: {curves.shape}")
if len(curves) > 0:
    print(f"Unique tasks: {sorted(curves.task.dropna().unique())}")
    print(f"Unique algorithms: {sorted(curves.algorithm.dropna().unique())}")
    print(f"Total (task, algo, seed) combinations: {curves.groupby(['task', 'algorithm', 'seed']).ngroups}")

    # Check for duplicate (task, algorithm, seed, env_steps) entries in curves
    print("\n" + "="*60)
    print("Checking for duplicate (task, algorithm, seed, env_steps) entries in curves...")
    curve_keys = ['task', 'algorithm', 'seed', 'env_steps']
    # Rows with a missing key are not considered (groupby would skip them too).
    # duplicated() flags every member of a repeated key without grouping the
    # whole frame.
    keyed_curves = curves.dropna(subset=curve_keys)
    duplicates_curves = keyed_curves[keyed_curves.duplicated(subset=curve_keys, keep=False)]
    if len(duplicates_curves) > 0:
        print(f"Found {len(duplicates_curves)} rows with duplicate (task, algorithm, seed, env_steps)")

        different_curves = []
        same_curves_to_drop = []

        # Only the duplicated rows need grouping; every group here has > 1 row.
        for _, group in duplicates_curves.groupby(curve_keys):
            # dropna=False: a NaN next to a real value counts as a conflict
            # instead of being ignored and collapsed as "identical".
            if group['eval_return_mean'].nunique(dropna=False) == 1:
                # Same eval_return_mean - keep first, mark rest for dropping
                same_curves_to_drop.extend(group.index[1:].tolist())
            else:
                # Different eval_return_mean - report them, keep all rows
                different_curves.append(group)

        if different_curves:
            print(f"\n*** WARNING: {len(different_curves)} (task, algo, seed, env_steps) groups have DIFFERENT eval_return_mean values: ***")
            diff_df = pd.concat(different_curves)
            display_cols = ['task', 'algorithm', 'seed', 'env_steps', 'eval_return_mean']
            if 'timestamp' in diff_df.columns:
                display_cols.append('timestamp')
            print(diff_df[display_cols].to_string())

        if same_curves_to_drop:
            print(f"\nDropping {len(same_curves_to_drop)} duplicate rows with same eval_return_mean")
            curves = curves.drop(same_curves_to_drop).reset_index(drop=True)
            print(f"Curves after deduplication: {len(curves)} entries")
    else:
        print("No duplicate (task, algorithm, seed, env_steps) entries found in curves")

# ============================================================================
# Load final evaluation returns (for statistical tests)
# ============================================================================
print("\n" + "="*60)
print("Loading final evaluation returns...")
final_returns = pd.read_csv(RESULTS_CSV)
print(f"Loaded {len(final_returns)} entries")

# Check for duplicate (task, algorithm, seed) triples
run_keys = ['task', 'algorithm', 'seed']
# Rows with a missing key are not considered (groupby would skip them too).
# duplicated() flags every member of a repeated triple without grouping the
# whole frame.
keyed_returns = final_returns.dropna(subset=run_keys)
duplicates = keyed_returns[keyed_returns.duplicated(subset=run_keys, keep=False)]
if len(duplicates) > 0:
    print(f"\nFound {len(duplicates)} rows with duplicate (task, algorithm, seed) triples")

    # Check if duplicates have same or different final_return_mean
    different_returns = []
    same_returns_to_drop = []

    # Only the duplicated rows need grouping; every group here has > 1 row.
    for _, group in duplicates.groupby(run_keys):
        # dropna=False: a NaN next to a real value counts as a conflict
        # instead of being ignored and collapsed as "identical".
        if group['final_return_mean'].nunique(dropna=False) == 1:
            # Same return values - keep first, mark rest for dropping
            same_returns_to_drop.extend(group.index[1:].tolist())
        else:
            # Different return values - report them
            different_returns.append(group)

    # NOTE(review): conflicting duplicates are only reported, never resolved.
    # All of their rows stay in final_returns, so the effect-size cell counts
    # each of them as a separate sample for that algorithm. Confirm whether one
    # row per (task, algorithm, seed) should be selected instead.
    if different_returns:
        print(f"\n*** WARNING: {len(different_returns)} (task, algo, seed) groups have DIFFERENT final_return_mean values: ***")
        diff_df = pd.concat(different_returns)
        display_cols = ['task', 'algorithm', 'seed', 'final_return_mean']
        if 'timestamp' in diff_df.columns:
            display_cols.append('timestamp')
        print(diff_df[display_cols].to_string())

    if same_returns_to_drop:
        print(f"\nDropping {len(same_returns_to_drop)} duplicate rows with same final_return_mean")
        final_returns = final_returns.drop(same_returns_to_drop).reset_index(drop=True)
        print(f"Final returns after deduplication: {len(final_returns)} entries")
else:
    print("No duplicate (task, algorithm, seed) triples found")

print(f"\nUnique tasks in final_returns: {sorted(final_returns.task.dropna().unique())}")
print(f"Unique algorithms in final_returns: {sorted(final_returns.algorithm.dropna().unique())}")


Loading from single CSV: /n/home09/annabelma/rl_final_proj/rl_experiments/learning_curves.csv
  Loaded 46119 entries
Loading from 3681 individual files in /n/home09/annabelma/rl_final_proj/rl_experiments/learning_curves
  Loaded 368364 entries from individual files
Total: 400384 entries after cleaning

Loaded learning curves shape: (400384, 8)
Unique tasks: ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'Walker2d-v5']
Unique algorithms: ['DDPG', 'PPO', 'SAC', 'TD3']
Total (task, algo, seed) combinations: 4020

Checking for duplicate (task, algorithm, seed, env_steps) entries in curves...
No duplicate (task, algorithm, seed, env_steps) entries found in curves

Loading final evaluation returns...
Loaded 5475 entries

Found 2341 rows with duplicate (task, algorithm, seed) triples

                task algorithm  seed  final_return_mean     timestamp
803           Ant-v5      DDPG     8        4005.700276  1.764646e+09
864           Ant-v5      DDPG     8          -6.718302  1.76

## Empirical effect sizes

In [6]:
# Empirical standardized effect size (|difference in means| / pooled std) of
# the final returns for every pair of algorithms within each task.
empirical_effect_sizes = []

for task in TASKS:
    task_df = final_returns[final_returns['task'] == task]
    algos_present = set(task_df['algorithm'].unique())
    available_algos = [a for a in ALGORITHMS if a in algos_present]

    # A comparison needs at least two algorithms with data on this task.
    if len(available_algos) < 2:
        continue

    print(f"\n{task}:")

    # Pull each algorithm's returns once instead of once per pair.
    returns_by_algo = {
        a: task_df[task_df['algorithm'] == a]['final_return_mean'].values
        for a in available_algos
    }

    # Visit every unordered pair, in ALGORITHMS order.
    for i, algo1 in enumerate(available_algos):
        sample1 = returns_by_algo[algo1]
        for algo2 in available_algos[i+1:]:
            sample2 = returns_by_algo[algo2]
            n1, n2 = len(sample1), len(sample2)

            # A sample standard deviation needs at least two observations.
            if n1 < 2 or n2 < 2:
                continue

            mean1, mean2 = np.mean(sample1), np.mean(sample2)
            sd1, sd2 = np.std(sample1, ddof=1), np.std(sample2, ddof=1)

            # Pooled standard deviation weighted by degrees of freedom.
            weighted_ss = (n1-1)*sd1**2 + (n2-1)*sd2**2
            sigma_pool = np.sqrt(weighted_ss / (n1 + n2 - 2))

            # Empirical effect size: ε = |Δμ| / σ_pool
            delta_mu = abs(mean1 - mean2)
            # NOTE(review): ε falls back to 0 whenever σ_pool is not positive
            # (including NaN), even if Δμ is non-zero -- confirm this is intended.
            if sigma_pool > 0:
                epsilon_empirical = delta_mu / sigma_pool
            else:
                epsilon_empirical = 0

            empirical_effect_sizes.append({
                'task': task,
                'algo1': algo1,
                'algo2': algo2,
                'mean1': mean1,
                'mean2': mean2,
                'delta_mu': delta_mu,
                'sigma_pool': sigma_pool,
                'epsilon_empirical': epsilon_empirical,
                'n1': n1,
                'n2': n2
            })

            print(f"  {algo1} vs {algo2}: ε = {epsilon_empirical:.3f} (Δμ = {delta_mu:.2f}, σ_pool = {sigma_pool:.2f})")

empirical_effects_df = pd.DataFrame(empirical_effect_sizes)

if len(empirical_effects_df) == 0:
    print("No effect sizes computed. Make sure final_returns is loaded.")
else:
    eps_col = empirical_effects_df['epsilon_empirical']
    print("\n" + "="*60)
    print("Summary of Empirical Effect Sizes")
    print("="*60)
    print(f"Mean empirical effect size: {eps_col.mean():.3f}")
    print(f"Median empirical effect size: {eps_col.median():.3f}")
    print(f"Min: {eps_col.min():.3f}, Max: {eps_col.max():.3f}")
    print("\nEffect sizes by task:")
    for task in TASKS:
        task_effects = empirical_effects_df[empirical_effects_df['task'] == task]
        if len(task_effects) > 0:
            print(f"  {task}: mean ε = {task_effects['epsilon_empirical'].mean():.3f}")

    # display() is provided by the IPython/Jupyter runtime.
    display(empirical_effects_df)


Hopper-v5:
  SAC vs TD3: ε = 0.185 (Δμ = 152.69, σ_pool = 825.58)
  SAC vs DDPG: ε = 1.613 (Δμ = 1321.27, σ_pool = 819.06)
  SAC vs PPO: ε = 0.379 (Δμ = 313.69, σ_pool = 828.18)
  TD3 vs DDPG: ε = 1.618 (Δμ = 1473.97, σ_pool = 910.96)
  TD3 vs PPO: ε = 0.507 (Δμ = 466.39, σ_pool = 919.63)
  DDPG vs PPO: ε = 1.103 (Δμ = 1007.58, σ_pool = 913.32)

Walker2d-v5:
  SAC vs TD3: ε = 0.177 (Δμ = 117.61, σ_pool = 664.03)
  SAC vs DDPG: ε = 2.193 (Δμ = 1855.16, σ_pool = 845.87)
  SAC vs PPO: ε = 2.851 (Δμ = 2098.01, σ_pool = 735.89)
  TD3 vs DDPG: ε = 1.842 (Δμ = 1737.55, σ_pool = 943.38)
  TD3 vs PPO: ε = 2.338 (Δμ = 1980.40, σ_pool = 846.95)
  DDPG vs PPO: ε = 0.244 (Δμ = 242.85, σ_pool = 994.90)

HalfCheetah-v5:
  SAC vs TD3: ε = 0.561 (Δμ = 2368.51, σ_pool = 4218.94)
  SAC vs DDPG: ε = 0.761 (Δμ = 2775.11, σ_pool = 3644.77)
  SAC vs PPO: ε = 2.613 (Δμ = 8356.77, σ_pool = 3197.58)
  TD3 vs DDPG: ε = 0.120 (Δμ = 406.60, σ_pool = 3382.05)
  TD3 vs PPO: ε = 2.070 (Δμ = 5988.26, σ_pool = 2892.95

Unnamed: 0,task,algo1,algo2,mean1,mean2,delta_mu,sigma_pool,epsilon_empirical,n1,n2
0,Hopper-v5,SAC,TD3,2931.936823,3084.631793,152.69497,825.580198,0.184955,202,201
1,Hopper-v5,SAC,DDPG,2931.936823,1610.66278,1321.274043,819.060788,1.613157,202,203
2,Hopper-v5,SAC,PPO,2931.936823,2618.242535,313.694288,828.184472,0.378773,202,201
3,Hopper-v5,TD3,DDPG,3084.631793,1610.66278,1473.969013,910.960723,1.618038,201,203
4,Hopper-v5,TD3,PPO,3084.631793,2618.242535,466.389258,919.625414,0.507151,201,201
5,Hopper-v5,DDPG,PPO,1610.66278,2618.242535,1007.579755,913.315708,1.103211,203,201
6,Walker2d-v5,SAC,TD3,4426.519844,4308.910441,117.609403,664.032289,0.177114,202,201
7,Walker2d-v5,SAC,DDPG,4426.519844,2571.364218,1855.155626,845.87124,2.193189,202,205
8,Walker2d-v5,SAC,PPO,4426.519844,2328.510149,2098.009695,735.894322,2.850966,202,201
9,Walker2d-v5,TD3,DDPG,4308.910441,2571.364218,1737.546223,943.37908,1.841832,201,205


## Aggregated power plots

In [7]:
# Directory holding the power-experiment results. Built from BASE like the
# other data paths, so the notebook is not tied to one user's home directory.
power_path = os.path.join(BASE, "power_experiments", "results", "12_10_final")

# ============================================================================
# Load Power Analysis Results from All Tasks
# ============================================================================
print("="*60)
print("Loading power analysis results from all tasks")
print("="*60)

power_dataframes = []
tasks_dir = os.path.join(power_path, "tasks")

for task in TASKS:
    # Convert task name to directory format (e.g., "Ant-v5" -> "Ant_v5")
    task_dir_name = task.replace('-', '_')
    task_csv = os.path.join(tasks_dir, task_dir_name, f"{task_dir_name}_power_empirical_df.csv")

    # A missing per-task file is reported and skipped so that the remaining
    # tasks can still be loaded.
    if os.path.exists(task_csv):
        print(f"Loading {task}...")
        df = pd.read_csv(task_csv)
        power_dataframes.append(df)
        print(f"  Loaded {len(df)} rows")
    else:
        print(f"  Warning: {task_csv} not found")

if power_dataframes:
    # Stack the per-task tables into one frame and summarise its contents.
    power_empirical_df = pd.concat(power_dataframes, ignore_index=True)
    print(f"\nTotal power results: {len(power_empirical_df)} rows")
    print(f"Tasks: {sorted(power_empirical_df['task'].unique())}")
    print(f"Tests: {sorted(power_empirical_df['test'].unique())}")
    print(f"Alphas: {sorted(power_empirical_df['alpha'].unique())}")
    print(f"Epsilons: {sorted(power_empirical_df['epsilon'].unique())}")
else:
    print("No power data found!")
    # Empty frame so that later cells can test len(power_empirical_df).
    power_empirical_df = pd.DataFrame()



Loading power analysis results from all tasks
Loading Hopper-v5...
  Loaded 1728 rows
Loading Walker2d-v5...
  Loaded 1728 rows
Loading HalfCheetah-v5...
  Loaded 1728 rows
Loading Ant-v5...
  Loaded 1728 rows
Loading Humanoid-v5...
  Loaded 1728 rows

Total power results: 8640 rows
Tasks: ['Ant-v5', 'HalfCheetah-v5', 'Hopper-v5', 'Humanoid-v5', 'Walker2d-v5']
Tests: ['Mann-Whitney', 'Ranked t-test', 'Welch t-test', 'bootstrap', 'permutation', 't-test']
Alphas: [0.01, 0.05]
Epsilons: [0.5, 1.0, 2.0]


In [8]:
# ============================================================================
# Generate Aggregated Power Plots (across all tasks)
# Format: 3 rows (epsilon) × 2 columns (alpha)
# ============================================================================

# Draw one panel per (epsilon, alpha) combination showing mean power against
# sample size for each test, save the figure next to the power results, and
# print a per-test summary table.
if len(power_empirical_df) > 0:
    print("="*60)
    print("Generating aggregated power plots (across all tasks)")
    print("="*60)

    # Color map for tests (matching paper style)
    test_colors = {
        't-test': '#1f77b4',           # blue
        'Welch t-test': '#ff7f0e',     # orange
        'Mann-Whitney': '#2ca02c',     # green
        'Ranked t-test': '#9467bd',    # purple
        'bootstrap': '#17becf',        # cyan
        'permutation': '#bcbd22'       # yellow-green
    }

    # Drawing (and legend) order of the tests.
    tests_list = ['t-test', "Welch t-test", 'Mann-Whitney', 'Ranked t-test', 'bootstrap', 'permutation']

    # Alpha and epsilon order
    alpha_order = [0.01, 0.05]  # Columns: left to right
    epsilon_order = [0.5, 1.0, 2.0]  # Rows: top to bottom

    # Aggregate across all tasks and algorithm pairs
    # NOTE(review): 'se' is the mean of the per-row standard errors, which is
    # not the standard error of the averaged power; the error bars are
    # indicative only -- confirm this is the intended quantity.
    print("\nAggregating power results across all tasks and algorithm pairs...")
    aggregated_data = power_empirical_df.groupby(['test', 'alpha', 'epsilon', 'target_n']).agg({
        'power': 'mean',
        'se': 'mean'
    }).reset_index()

    print(f"Aggregated to {len(aggregated_data)} unique (test, alpha, epsilon, target_n) combinations")

    # Create figure with subplots: 3 rows × 2 columns
    fig, axes = plt.subplots(len(epsilon_order), len(alpha_order), figsize=(14, 12))
    fig.suptitle('Aggregated Power vs Sample Size (across all tasks)\nAverage across all algorithm pairs', 
                 fontsize=16, fontweight='bold', y=0.995)

    # Iterate over rows (epsilon) and columns (alpha)
    for row_idx, epsilon in enumerate(epsilon_order):
        for col_idx, alpha in enumerate(alpha_order):
            ax = axes[row_idx, col_idx]

            # Rows for this panel. aggregated_data already holds exactly one
            # row per (test, alpha, epsilon, target_n), so fixing alpha and
            # epsilon leaves one row per (test, target_n) and no further
            # aggregation is needed.
            plot_data = aggregated_data[
                (aggregated_data['alpha'] == alpha) &
                (aggregated_data['epsilon'] == epsilon)
            ]

            if len(plot_data) == 0:
                ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
                ax.set_title(f'α = {alpha:.2f}, ε = {epsilon:.1f}', fontsize=11)
                continue

            # Plot each test
            for test_name in tests_list:
                test_data = plot_data[plot_data['test'] == test_name].sort_values('target_n')
                if len(test_data) > 0:
                    color = test_colors.get(test_name, '#000000')
                    ax.plot(test_data['target_n'], test_data['power'], 
                           marker='o', label=test_name, linewidth=1.5, color=color, markersize=4)
                    # Error bars are drawn separately so they can be fainter
                    # than the line itself.
                    ax.errorbar(test_data['target_n'], test_data['power'], 
                               yerr=test_data['se'], 
                               fmt='none', color=color, alpha=0.3, capsize=2)

            # Add reference line at 0.8 power (target)
            ax.axhline(y=0.8, color='red', linestyle='--', linewidth=1.5, 
                      alpha=0.7, zorder=0)

            # Axis labels only on the outer edge of the grid.
            if row_idx == len(epsilon_order) - 1:  # Bottom row
                ax.set_xlabel('Sample size N (log scale)', fontsize=10)
            if col_idx == 0:  # Left column
                ax.set_ylabel('Power (1 - β*)', fontsize=10)

            ax.set_title(f'α = {alpha:.2f}, ε = {epsilon:.1f}', fontsize=11, fontweight='bold')
            ax.set_xscale('log')

            # Tick at the sample sizes that actually occur, labelled as integers.
            available_n = sorted(plot_data['target_n'].unique())
            ax.set_xticks(available_n)
            ax.set_xticklabels([str(int(n)) for n in available_n], fontsize=9)

            # Y-axis: show from 0 to 1
            ax.set_ylim([0, 1.05])
            ax.set_yticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
            ax.tick_params(axis='y', labelsize=9)

            ax.grid(True, alpha=0.3, which='both')

            # Add legend only to top-right subplot
            if row_idx == 0 and col_idx == 1:
                ax.legend(loc='upper left', frameon=True, fontsize=8, ncol=2)

    plt.tight_layout(rect=[0, 0, 1, 0.98])  # Leave space for suptitle

    # Save plot
    output_dir = power_path
    os.makedirs(output_dir, exist_ok=True)
    plot_filename = "power_aggregated_all_tasks.png"
    plot_path = os.path.join(output_dir, plot_filename)
    plt.savefig(plot_path, bbox_inches='tight', dpi=150)
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)

    print(f"\nAggregated power plot saved to: {plot_path}")
    print("="*60)

    # Display summary statistics
    print("\nAverage Power by test, effect size, and alpha (aggregated across all tasks):")
    summary = power_empirical_df.groupby(['test', 'epsilon', 'alpha']).agg({
        'power': 'mean',
        'se': 'mean'
    })
    for (test, epsilon, alpha), row in summary.iterrows():
        power_val = row['power']
        se_val = row['se']
        print(f"  {test:20s} (ε={epsilon:.1f}, α={alpha:.3f}): Power = {power_val:.4f} ± {se_val:.4f}")

else:
    print("No power data available for plotting.")


Generating aggregated power plots (across all tasks)

Aggregating power results across all tasks and algorithm pairs...
Aggregated to 288 unique (test, alpha, epsilon, target_n) combinations

Aggregated power plot saved to: /n/home09/annabelma/rl_final_proj/power_experiments/results/12_10_final/power_aggregated_all_tasks.png

Average Power by test, effect size, and alpha (aggregated across all tasks):
  Mann-Whitney         (ε=0.5, α=0.010): Power = 0.2423 ± 0.0073
  Mann-Whitney         (ε=0.5, α=0.050): Power = 0.3476 ± 0.0083
  Mann-Whitney         (ε=1.0, α=0.010): Power = 0.4526 ± 0.0065
  Mann-Whitney         (ε=1.0, α=0.050): Power = 0.5395 ± 0.0062
  Mann-Whitney         (ε=2.0, α=0.010): Power = 0.6281 ± 0.0040
  Mann-Whitney         (ε=2.0, α=0.050): Power = 0.6788 ± 0.0031
  Ranked t-test        (ε=0.5, α=0.010): Power = 0.2556 ± 0.0077
  Ranked t-test        (ε=0.5, α=0.050): Power = 0.3803 ± 0.0101
  Ranked t-test        (ε=1.0, α=0.010): Power = 0.4722 ± 0.0068
  Ranked t