# Aggregated CSV Generation (`raw_data/`)



This notebook regenerates the aggregated CSV inputs used by the analysis notebooks in this folder.



Inputs: raw experiment logs originally stored under `whisker_cluster_experiments/results/` (baseline + surrogate-model).

For the replication package, **copy the contents of that folder into `../temp_results/`** at the repository root.



Outputs: aggregated CSVs under `raw_data/` (relative to this `results/` folder).



Run it from the `results/` folder (default VS Code notebook behavior).


In [1]:
# Imports and configuration
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from scipy import stats

# This notebook lives in temp/results_analysis_thesis/
BASE_PATH = Path('.')
RESULTS_DIR = Path('/Users/stefan/Workspace/bauers-ma/thesis/temp/results/')
BASELINE_PATH = RESULTS_DIR / 'baseline'
SURROGATE_PATH = RESULTS_DIR / 'surrogate-model'

# Folder where all generated CSV files will be stored
RAW_DATA_PATH = BASE_PATH / 'raw_data'
RAW_DATA_PATH.mkdir(exist_ok=True)

CONFIG_MAPPING = {
    'genProg-surrogate-model-fi-00': 'FI-00',
    'genProg-surrogate-model-fi-05': 'FI-50',
    'genProg-surrogate-model-fi-07': 'FI-70',
    'genProg-surrogate-model-to-05': 'TO-50',
    'genProg-surrogate-model-to-07': 'TO-70',
}

CONFIG_ORDER = ['FI-00', 'FI-50', 'FI-70', 'TO-50', 'TO-70']

if not BASELINE_PATH.exists():
    raise FileNotFoundError(f'Baseline folder not found: {BASELINE_PATH}')
if not SURROGATE_PATH.exists():
    raise FileNotFoundError(f'Surrogate folder not found: {SURROGATE_PATH}')

print(f'Baseline path: {BASELINE_PATH.resolve()}')
print(f'Surrogate path: {SURROGATE_PATH.resolve()}')
print(f'Raw data path: {RAW_DATA_PATH.resolve()}')

Baseline path: /Users/stefan/Workspace/bauers-ma/thesis/temp/results/baseline
Surrogate path: /Users/stefan/Workspace/bauers-ma/thesis/temp/results/surrogate-model
Raw data path: /Users/stefan/Workspace/bauers-ma-replication-package/results/raw_data


In [2]:
# Load task metadata for baseline and surrogate-model runs
baseline_tasks = pd.read_csv(BASELINE_PATH / 'tasks.csv')
surrogate_tasks = pd.read_csv(SURROGATE_PATH / 'tasks.csv')

baseline_tasks['config_label'] = baseline_tasks['config'].map(CONFIG_MAPPING)
surrogate_tasks['config_label'] = surrogate_tasks['config'].map(CONFIG_MAPPING)

all_tasks = pd.concat([baseline_tasks, surrogate_tasks], ignore_index=True)

print(f'Total tasks: {len(all_tasks)}')
print('Tasks per configuration:')
print(all_tasks['config_label'].value_counts().sort_index())

Total tasks: 410
Tasks per configuration:
config_label
FI-00    82
FI-50    82
FI-70    82
TO-50    82
TO-70    82
Name: count, dtype: int64


In [3]:
# Helper functions to load and summarize a single run

def cohens_d(group1, group2):
    """Calculate Cohen's d effect size for two 1D numpy/pandas arrays."""
    group1 = pd.Series(group1).dropna()
    group2 = pd.Series(group2).dropna()
    n1, n2 = len(group1), len(group2)
    if n1 == 0 or n2 == 0:
        return np.nan
    var1, var2 = group1.var(), group2.var()
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    if pooled_std <= 0:
        return 0.0
    return (group1.mean() - group2.mean()) / pooled_std

def load_output_csv(job_id, base_path):
    output_file = base_path / str(job_id) / 'output.csv'
    if output_file.exists():
        try:
            return pd.read_csv(output_file)
        except Exception as exc:
            print(f'Error loading {output_file}: {exc}')
    return None

def load_timing_data(job_id, base_path):
    started_file = base_path / str(job_id) / 'started.txt'
    finished_file = base_path / str(job_id) / 'finished.txt'
    timing = {}
    try:
        if started_file.exists():
            started = started_file.read_text().strip()
            timing['started'] = datetime.fromisoformat(started)
        if finished_file.exists():
            finished = finished_file.read_text().strip()
            timing['finished'] = datetime.fromisoformat(finished)
        if 'started' in timing and 'finished' in timing:
            timing['duration_seconds'] = (timing['finished'] - timing['started']).total_seconds()
    except Exception as exc:
        print(f'Error loading timing for job {job_id}: {exc}')
    return timing

def analyze_single_run(job_id, config_label, project, base_path):
    output_df = load_output_csv(job_id, base_path)
    timing = load_timing_data(job_id, base_path)
    if output_df is None or output_df.empty:
        return None

    initial_data = output_df[output_df['iteration'] == 0]
    initial_numPass = int(initial_data.iloc[0]['numPass']) if len(initial_data) > 0 else 0

    final_iteration = int(output_df['iteration'].max())
    final_row = output_df[output_df['iteration'] == final_iteration].iloc[0]

    total_evaluations = len(output_df)
    actual_executions = int((output_df['fitnessEvalType'] == 'fitnessEvaluation').sum())
    predictions = int((output_df['fitnessEvalType'] == 'fitnessPrediction').sum())

    max_numPass = int(output_df['numPass'].max())
    is_success = max_numPass == 28

    optimal_iteration = None
    if is_success:
        opt_rows = output_df[output_df['numPass'] == 28]
        if len(opt_rows) > 0:
            optimal_iteration = int(opt_rows['iteration'].min())

    first_improvement_iteration = None
    improvement_rows = output_df[(output_df['numPass'] > initial_numPass) & (output_df['iteration'] > 0)]
    if len(improvement_rows) > 0:
        first_improvement_iteration = int(improvement_rows['iteration'].min())

    final_fitness = float(output_df['fitness'].max())
    target_fitness = 0.9 * final_fitness if final_fitness is not None else None
    convergence_90_iteration = None
    if target_fitness is not None and not np.isnan(target_fitness):
        conv_rows = output_df[output_df['fitness'] >= target_fitness]
        if len(conv_rows) > 0:
            convergence_90_iteration = int(conv_rows['iteration'].min())

    result = {
        'job_id': int(job_id),
        'config': config_label,
        'project': project,
        'initial_numPass': initial_numPass,
        'is_success': bool(is_success),
        'max_numPass': max_numPass,
        'final_fitness': final_fitness,
        'final_iteration': final_iteration,
        'optimal_iteration': optimal_iteration,
        'first_improvement_iteration': first_improvement_iteration,
        'convergence_90_iteration': convergence_90_iteration,
        'total_evaluations': int(total_evaluations),
        'actual_executions': actual_executions,
        'predictions': predictions,
        'duration_seconds': timing.get('duration_seconds', np.nan),
    }
    return result

In [4]:
# Process all runs and build per_run_results.csv
results = []
for idx, task in all_tasks.iterrows():
    job_id = int(task['job_id'])
    config_label = task['config_label']
    project = task['project']
    base_path = BASELINE_PATH if config_label == 'FI-00' else SURROGATE_PATH
    row = analyze_single_run(job_id, config_label, project, base_path)
    if row is not None:
        results.append(row)
    if (idx + 1) % 50 == 0:
        print(f'Processed {idx + 1}/{len(all_tasks)} tasks...')

results_df = pd.DataFrame(results)
print(f'Total analyzed runs: {len(results_df)}')

# Derived columns used in downstream summaries
results_df['prediction_rate'] = (results_df['predictions'] / results_df['total_evaluations'] * 100).fillna(0.0)

# Approximate time to first improvement in seconds,
# assuming uniform progress over the run duration.
results_df['time_to_first_improvement_sec'] = np.where(
    results_df['first_improvement_iteration'].notna() & results_df['duration_seconds'].notna(),
    results_df['duration_seconds'] * (results_df['first_improvement_iteration'] / results_df['final_iteration'].replace(0, np.nan)),
    np.nan,
)

results_df.to_csv(RAW_DATA_PATH / 'per_run_results.csv', index=False)
print('Saved: raw_data/per_run_results.csv')

Processed 50/410 tasks...


  return pd.read_csv(output_file)


Processed 100/410 tasks...


  return pd.read_csv(output_file)
  return pd.read_csv(output_file)


Processed 150/410 tasks...
Processed 200/410 tasks...
Processed 250/410 tasks...


  return pd.read_csv(output_file)


Processed 300/410 tasks...
Processed 350/410 tasks...
Processed 400/410 tasks...
Total analyzed runs: 410
Saved: raw_data/per_run_results.csv


In [5]:
# Core aggregated summaries by configuration

config_group = results_df.groupby('config')

# Repair success summary
success_summary = config_group.agg({
    'is_success': ['sum', 'count', 'mean'],
    'max_numPass': ['mean', 'median', 'std'],
    'final_fitness': ['mean', 'median', 'std'],
}).round(4)
success_summary.columns = ['_'.join(col).strip() for col in success_summary.columns.values]
success_summary = success_summary.rename(columns={
    'is_success_sum': 'successful_repairs',
    'is_success_count': 'total_runs',
    'is_success_mean': 'success_rate',
    'max_numPass_mean': 'avg_tests_passed',
    'max_numPass_median': 'median_tests_passed',
    'max_numPass_std': 'std_tests_passed',
    'final_fitness_mean': 'avg_final_fitness',
    'final_fitness_median': 'median_final_fitness',
    'final_fitness_std': 'std_final_fitness',
})
success_summary = success_summary.reindex(CONFIG_ORDER)
success_summary.to_csv(RAW_DATA_PATH / 'repair_success_summary.csv')
print('Saved: raw_data/repair_success_summary.csv')

# Time efficiency (all runs and successful-only)
time_summary = results_df[results_df['duration_seconds'].notna()].groupby('config').agg({
    'duration_seconds': ['mean', 'median', 'std', 'min', 'max', 'count'],
}).round(2)
time_summary.columns = ['_'.join(col).strip() for col in time_summary.columns.values]
time_summary = time_summary.rename(columns={
    'duration_seconds_mean': 'mean_time_sec',
    'duration_seconds_median': 'median_time_sec',
    'duration_seconds_std': 'std_time_sec',
    'duration_seconds_min': 'min_time_sec',
    'duration_seconds_max': 'max_time_sec',
    'duration_seconds_count': 'num_runs',
})
time_summary = time_summary.reindex(CONFIG_ORDER)

baseline_mean = time_summary.loc['FI-00', 'mean_time_sec']
time_summary['speedup_vs_baseline'] = baseline_mean / time_summary['mean_time_sec']
time_summary['time_saved_pct'] = (1 - time_summary['mean_time_sec'] / baseline_mean) * 100

time_summary.to_csv(RAW_DATA_PATH / 'time_efficiency_all_runs.csv')
print('Saved: raw_data/time_efficiency_all_runs.csv')

success_only = results_df[results_df['is_success'] & results_df['duration_seconds'].notna()]
time_success = success_only.groupby('config').agg({
    'duration_seconds': ['mean', 'median', 'std', 'min', 'max', 'count'],
}).round(2)
time_success.columns = ['_'.join(col).strip() for col in time_success.columns.values]
time_success = time_success.rename(columns={
    'duration_seconds_mean': 'mean_time_sec',
    'duration_seconds_median': 'median_time_sec',
    'duration_seconds_std': 'std_time_sec',
    'duration_seconds_min': 'min_time_sec',
    'duration_seconds_max': 'max_time_sec',
    'duration_seconds_count': 'num_runs',
})
time_success = time_success.reindex(CONFIG_ORDER)

baseline_mean_success = time_success.loc['FI-00', 'mean_time_sec']
time_success['speedup_vs_baseline'] = baseline_mean_success / time_success['mean_time_sec']
time_success['time_saved_pct'] = (1 - time_success['mean_time_sec'] / baseline_mean_success) * 100

time_success.to_csv(RAW_DATA_PATH / 'time_efficiency_successful_only.csv')
print('Saved: raw_data/time_efficiency_successful_only.csv')

# Additional aggregated summaries (first improvement, test executions, etc.)
first_improvement = results_df.groupby('config').agg({
    'first_improvement_iteration': ['mean', 'median', 'std', 'min', 'max', 'count'],
    'time_to_first_improvement_sec': ['mean', 'median'],
}).round(2)
first_improvement.columns = ['_'.join(col).strip() for col in first_improvement.columns.values]
first_improvement = first_improvement.rename(columns={
    'first_improvement_iteration_mean': 'mean_first_improvement_iter',
    'first_improvement_iteration_median': 'median_first_improvement_iter',
    'first_improvement_iteration_std': 'std_first_improvement_iter',
    'first_improvement_iteration_min': 'min_first_improvement_iter',
    'first_improvement_iteration_max': 'max_first_improvement_iter',
    'first_improvement_iteration_count': 'num_runs_with_improvement',
    'time_to_first_improvement_sec_mean': 'mean_time_to_first_improvement_sec',
    'time_to_first_improvement_sec_median': 'median_time_to_first_improvement_sec',
})
first_improvement = first_improvement.reindex(CONFIG_ORDER)
first_improvement.to_csv(RAW_DATA_PATH / 'time_to_first_partial_fix.csv')
print('Saved: raw_data/time_to_first_partial_fix.csv')

# Test execution summary
test_exec_summary = results_df.groupby('config').agg({
    'actual_executions': ['mean', 'median', 'std', 'min', 'max', 'count'],
}).round(2)
test_exec_summary.columns = ['_'.join(col).strip() for col in test_exec_summary.columns.values]
test_exec_summary = test_exec_summary.rename(columns={
    'actual_executions_mean': 'mean_test_executions',
    'actual_executions_median': 'median_test_executions',
    'actual_executions_std': 'std_test_executions',
    'actual_executions_min': 'min_test_executions',
    'actual_executions_max': 'max_test_executions',
    'actual_executions_count': 'num_runs',
})
test_exec_summary = test_exec_summary.reindex(CONFIG_ORDER)
test_exec_summary.to_csv(RAW_DATA_PATH / 'test_execution_summary.csv')
print('Saved: raw_data/test_execution_summary.csv')

Saved: raw_data/repair_success_summary.csv
Saved: raw_data/time_efficiency_all_runs.csv
Saved: raw_data/time_efficiency_successful_only.csv
Saved: raw_data/time_to_first_partial_fix.csv
Saved: raw_data/test_execution_summary.csv


In [6]:
# Partial fixes summaries and quality metrics

# Runs with any improvement (partial or full)
improved_runs = results_df[results_df['first_improvement_iteration'].notna()].copy()

# Project-level best improvement
project_best = improved_runs.sort_values(
    ['project', 'config', 'max_numPass'], ascending=[True, True, False]
).drop_duplicates(subset=['project', 'config'])

projects_with_improvement = project_best.groupby('config')['project'].nunique()
projects_total = results_df.groupby('config')['project'].nunique()
partial_fixes_summary = project_best.groupby('config').agg({
    'first_improvement_iteration': ['mean', 'median'],
    'max_numPass': ['mean', 'median'],
}).round(2)
partial_fixes_summary.columns = ['_'.join(col).strip() for col in partial_fixes_summary.columns.values]
partial_fixes_summary = partial_fixes_summary.rename(columns={
    'first_improvement_iteration_mean': 'mean_iterations_to_fix',
    'first_improvement_iteration_median': 'median_iterations_to_fix',
    'max_numPass_mean': 'avg_tests_passed',
    'max_numPass_median': 'median_tests_passed',
})
partial_fixes_summary['projects_with_improvement'] = projects_with_improvement
partial_fixes_summary['total_projects'] = projects_total
partial_fixes_summary['improvement_rate'] = (
    partial_fixes_summary['projects_with_improvement'] / partial_fixes_summary['total_projects']
)
partial_fixes_summary = partial_fixes_summary.reindex(CONFIG_ORDER)
partial_fixes_summary.to_csv(RAW_DATA_PATH / 'partial_fixes_summary.csv')
print('Saved: raw_data/partial_fixes_summary.csv')

# Speed comparison for partial fixes
if not improved_runs.empty:
    speed_comparison = partial_fixes_summary[[
        'median_iterations_to_fix',
        'mean_iterations_to_fix',
        'avg_tests_passed',
    ]].copy()
    speed_comparison['count_improvements'] = improved_runs.groupby('config').size()
    speed_comparison = speed_comparison.sort_values('median_iterations_to_fix')
    speed_comparison.to_csv(RAW_DATA_PATH / 'partial_fixes_speed_comparison.csv')
    print('Saved: raw_data/partial_fixes_speed_comparison.csv')
else:
    print('No improved runs; skipping partial_fixes_speed_comparison.csv')

# Partial fix quality distribution
def categorize_improvement(num_pass, total_tests=28):
    if num_pass == total_tests:
        return 'Full Fix'
    elif num_pass >= 0.75 * total_tests:
        return 'High Quality (75%+)'
    elif num_pass >= 0.5 * total_tests:
        return 'Medium Quality (50-75%)'
    elif num_pass >= 0.25 * total_tests:
        return 'Low Quality (25-50%)'
    else:
        return 'Minimal (<25%)'

if not improved_runs.empty:
    improved_runs['quality_category'] = improved_runs['max_numPass'].apply(categorize_improvement)
    quality_distribution = improved_runs.groupby(['config', 'quality_category']).size().unstack(fill_value=0)
    categories = [
        'Minimal (<25%)',
        'Low Quality (25-50%)',
        'Medium Quality (50-75%)',
        'High Quality (75%+)',
        'Full Fix',
    ]
    for cat in categories:
        if cat not in quality_distribution.columns:
            quality_distribution[cat] = 0
    quality_distribution = quality_distribution[categories]
    quality_distribution['Total'] = quality_distribution.sum(axis=1)
    quality_distribution = quality_distribution.reindex(CONFIG_ORDER, fill_value=0)
    quality_percentages = quality_distribution.div(quality_distribution['Total'], axis=0) * 100
    quality_percentages = quality_percentages.drop('Total', axis=1).round(1)
    quality_distribution.to_csv(RAW_DATA_PATH / 'partial_fixes_quality_distribution.csv')
    quality_percentages.to_csv(RAW_DATA_PATH / 'partial_fixes_quality_percentages.csv')
    print('Saved: raw_data/partial_fixes_quality_distribution.csv')
    print('Saved: raw_data/partial_fixes_quality_percentages.csv')
    # Combined time and quality metrics (simple join of summaries)
    combined_metrics = partial_fixes_summary.join(quality_percentages, how='left')
    combined_metrics.to_csv(RAW_DATA_PATH / 'partial_fixes_time_quality_combined.csv')
    print('Saved: raw_data/partial_fixes_time_quality_combined.csv')
else:
    print('No improved runs; skipping quality distribution CSVs')

Saved: raw_data/partial_fixes_summary.csv
Saved: raw_data/partial_fixes_speed_comparison.csv
Saved: raw_data/partial_fixes_quality_distribution.csv
Saved: raw_data/partial_fixes_quality_percentages.csv
Saved: raw_data/partial_fixes_time_quality_combined.csv


In [7]:
# Statistical tests between configurations

comparisons = [
    ('FI-00', 'FI-50', 'Baseline vs FI-50%'),
    ('FI-00', 'FI-70', 'Baseline vs FI-70%'),
    ('FI-00', 'TO-50', 'Baseline vs TO-50%'),
    ('FI-00', 'TO-70', 'Baseline vs TO-70%'),
    ('FI-50', 'FI-70', 'FI-50% vs FI-70%'),
    ('TO-50', 'TO-70', 'TO-50% vs TO-70%'),
    ('FI-50', 'TO-50', 'FI vs TO (50%)'),
    ('FI-70', 'TO-70', 'FI vs TO (70%)'),
]

# Partial fix statistical tests
partial_stats_results = []
for config1, config2, label in comparisons:
    data1 = results_df[results_df['config'] == config1]
    data2 = results_df[results_df['config'] == config2]

    improved1 = data1['first_improvement_iteration'].notna().sum()
    total1 = len(data1)
    improved2 = data2['first_improvement_iteration'].notna().sum()
    total2 = len(data2)

    contingency = np.array([[improved1, total1 - improved1],
                            [improved2, total2 - improved2]])
    try:
        chi2_impr, p_impr = stats.chi2_contingency(contingency)[:2]
    except ValueError:
        chi2_impr, p_impr = np.nan, np.nan

    iter1 = data1[data1['first_improvement_iteration'].notna()]['first_improvement_iteration']
    iter2 = data2[data2['first_improvement_iteration'].notna()]['first_improvement_iteration']
    if len(iter1) > 0 and len(iter2) > 0:
        u_iter, p_iter = stats.mannwhitneyu(iter1, iter2, alternative='two-sided')
        eff_iter = cohens_d(iter1, iter2)
    else:
        u_iter, p_iter, eff_iter = np.nan, np.nan, np.nan

    tests1 = data1['max_numPass']
    tests2 = data2['max_numPass']
    if len(tests1) > 0 and len(tests2) > 0:
        u_tests, p_tests = stats.mannwhitneyu(tests1, tests2, alternative='two-sided')
        eff_tests = cohens_d(tests1, tests2)
    else:
        u_tests, p_tests, eff_tests = np.nan, np.nan, np.nan

    partial_stats_results.append({
        'comparison': label,
        'config1': config1,
        'config2': config2,
        'chi2_improvement': chi2_impr,
        'p_value_improvement_rate': p_impr,
        'improvement_rate_1': improved1 / total1 if total1 > 0 else np.nan,
        'improvement_rate_2': improved2 / total2 if total2 > 0 else np.nan,
        'p_value_iterations': p_iter,
        'effect_size_iterations': eff_iter,
        'mean_iterations_1': iter1.mean() if len(iter1) > 0 else np.nan,
        'mean_iterations_2': iter2.mean() if len(iter2) > 0 else np.nan,
        'p_value_tests_passed': p_tests,
        'effect_size_tests_passed': eff_tests,
        'mean_tests_passed_1': tests1.mean() if len(tests1) > 0 else np.nan,
        'mean_tests_passed_2': tests2.mean() if len(tests2) > 0 else np.nan,
    })

partial_stats_df = pd.DataFrame(partial_stats_results).round(4)
partial_stats_df.to_csv(RAW_DATA_PATH / 'partial_fixes_statistical_tests.csv', index=False)
print('Saved: raw_data/partial_fixes_statistical_tests.csv')

# Main statistical tests for time and test executions (vs baseline and others)
statistical_results = []
for config1, config2, label in comparisons:
    data1 = results_df[results_df['config'] == config1]
    data2 = results_df[results_df['config'] == config2]

    success1 = data1['is_success'].sum()
    total1 = len(data1)
    success2 = data2['is_success'].sum()
    total2 = len(data2)
    if success1 + success2 >= 5:
        contingency_success = np.array([[success1, total1 - success1],
                                        [success2, total2 - success2]])
        try:
            chi2_succ, p_succ = stats.chi2_contingency(contingency_success)[:2]
        except ValueError:
            chi2_succ, p_succ = np.nan, np.nan
    else:
        chi2_succ, p_succ = np.nan, np.nan

    time1 = data1[data1['duration_seconds'].notna()]['duration_seconds']
    time2 = data2[data2['duration_seconds'].notna()]['duration_seconds']
    if len(time1) > 0 and len(time2) > 0:
        u_time, p_time = stats.mannwhitneyu(time1, time2, alternative='two-sided')
        eff_time = cohens_d(time1, time2)
    else:
        u_time, p_time, eff_time = np.nan, np.nan, np.nan

    exec1 = data1['actual_executions']
    exec2 = data2['actual_executions']
    if len(exec1) > 0 and len(exec2) > 0:
        u_exec, p_exec = stats.mannwhitneyu(exec1, exec2, alternative='two-sided')
        eff_exec = cohens_d(exec1, exec2)
    else:
        u_exec, p_exec, eff_exec = np.nan, np.nan, np.nan

    statistical_results.append({
        'comparison': label,
        'config1': config1,
        'config2': config2,
        'chi2_success': chi2_succ,
        'p_value_success': p_succ,
        'success_rate_1': success1 / total1 if total1 > 0 else np.nan,
        'success_rate_2': success2 / total2 if total2 > 0 else np.nan,
        'p_value_time': p_time,
        'effect_size_time': eff_time,
        'mean_time_1': time1.mean() if len(time1) > 0 else np.nan,
        'mean_time_2': time2.mean() if len(time2) > 0 else np.nan,
        'p_value_executions': p_exec,
        'effect_size_executions': eff_exec,
        'mean_exec_1': exec1.mean() if len(exec1) > 0 else np.nan,
        'mean_exec_2': exec2.mean() if len(exec2) > 0 else np.nan,
    })

stats_df = pd.DataFrame(statistical_results).round(4)
stats_df.to_csv(RAW_DATA_PATH / 'statistical_tests.csv', index=False)
print('Saved: raw_data/statistical_tests.csv')

# Additional statistical tests for time to first improvement and generations
additional_stats = []
if 'time_to_first_partial_fix.csv' in [p.name for p in RAW_DATA_PATH.glob('*.csv')]:
    time_first_df = pd.read_csv(RAW_DATA_PATH / 'time_to_first_partial_fix.csv')
else:
    time_first_df = None

for config1, config2, label in comparisons:
    data1 = results_df[results_df['config'] == config1]
    data2 = results_df[results_df['config'] == config2]

    if time_first_df is not None:
        row1 = time_first_df[time_first_df['config'] == config1]
        row2 = time_first_df[time_first_df['config'] == config2]
        median_time1 = row1['median_time_to_first_improvement_sec'].values[0] if len(row1) > 0 else np.nan
        median_time2 = row2['median_time_to_first_improvement_sec'].values[0] if len(row2) > 0 else np.nan
        mean_time1 = row1['mean_time_to_first_improvement_sec'].values[0] if len(row1) > 0 else np.nan
        mean_time2 = row2['mean_time_to_first_improvement_sec'].values[0] if len(row2) > 0 else np.nan
        mean_time1 = row1['mean_time_to_first_improvement_sec'].values[0] if len(row1) > 0 else np.nan
        mean_time2 = row2['mean_time_to_first_improvement_sec'].values[0] if len(row2) > 0 else np.nan
    else:
        median_time1 = median_time2 = mean_time1 = mean_time2 = np.nan

    improved1 = data1[data1['first_improvement_iteration'].notna()]['first_improvement_iteration']
    improved2 = data2[data2['first_improvement_iteration'].notna()]['first_improvement_iteration']
    if len(improved1) > 5 and len(improved2) > 5:
        u_time_first, p_time_first = stats.mannwhitneyu(improved1, improved2, alternative='two-sided')
        eff_time_first = cohens_d(improved1, improved2)
    else:
        u_time_first, p_time_first, eff_time_first = np.nan, np.nan, np.nan

    gen1 = data1['final_iteration']
    gen2 = data2['final_iteration']
    if len(gen1) > 0 and len(gen2) > 0:
        u_gen, p_gen = stats.mannwhitneyu(gen1, gen2, alternative='two-sided')
        eff_gen = cohens_d(gen1, gen2)
        mean_gen1 = gen1.mean()
        mean_gen2 = gen2.mean()
        median_gen1 = gen1.median()
        median_gen2 = gen2.median()
    else:
        u_gen, p_gen, eff_gen = np.nan, np.nan, np.nan
        mean_gen1 = mean_gen2 = median_gen1 = median_gen2 = np.nan

    additional_stats.append({
        'comparison': label,
        'config1': config1,
        'config2': config2,
        'p_value_time_first_improvement': p_time_first,
        'effect_size_time_first_improvement': eff_time_first,
        'median_time_first_imp_sec_1': median_time1,
        'median_time_first_imp_sec_2': median_time2,
        'mean_time_first_imp_sec_1': mean_time1,
        'mean_time_first_imp_sec_2': mean_time2,
        'p_value_generations': p_gen,
        'effect_size_generations': eff_gen,
        'mean_generations_1': mean_gen1,
        'mean_generations_2': mean_gen2,
        'median_generations_1': median_gen1,
        'median_generations_2': median_gen2,
    })

additional_stats_df = pd.DataFrame(additional_stats).round(4)
additional_stats_df.to_csv(RAW_DATA_PATH / 'statistical_tests_additional.csv', index=False)
print('Saved: raw_data/statistical_tests_additional.csv')

Saved: raw_data/partial_fixes_statistical_tests.csv
Saved: raw_data/statistical_tests.csv
Saved: raw_data/statistical_tests_additional.csv
