# AURAC Tables for April 2025 Data

This notebook computes AURAC for the April 2025 experimental data and generates publication-ready LaTeX tables.

**Prerequisites:** 
- April 2025 CSV files with experimental results
- AURAC computation (this notebook will compute it)

**Output:** 
- Two properly formatted LaTeX tables (Full and Mini datasets)

In [7]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind_from_stats, norm

sys.path.append('../src/')
from plotting import get_coverage_threshold_and_size, get_auc, compute_aurac

# Base path setup
base_path = Path('Ensembling_Finetuned_LLMs')

def read_file(file_path, base_path=base_path) -> pd.DataFrame:
    """Read a file and return a DataFrame."""
    path = base_path / 'llm_experiments_data' / file_path
    if not path.exists():
        raise FileNotFoundError(f"File {path} does not exist.")
    return pd.read_csv(path)

def combine_and_clean_dataframes(df1, df2=None) -> pd.DataFrame:
    """Combine and clean the dataframes."""
    parts = [df1]
    if df2 is not None:
        parts.append(df2)
    combined_df = pd.concat(parts, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=['dataset', 'seed', 'method', 'ensemble_type'], keep='first')
    combined_df.reset_index(drop=True, inplace=True)
    return combined_df

def calc_ci_for_df(df) -> pd.DataFrame:
    """Calculate confidence intervals for DataFrame (AURAC-focused version)."""
    # Only use columns that exist in AURAC-enriched data (removed threshold, set_size, auc)
    available_columns = ['ensemble_size', 'ensemble_unique_size', 'nll_test', 'c1', 'c2', 'epi_scalar', 'aurac', 'aorac']
    
    df = df.groupby(['dataset', 'method', 'ensemble_type'])[available_columns].agg(['mean', 'std', 'count']).reset_index()
    
    for col in available_columns:
        df[(col, 'CI')] = 1.96 * (df[(col, 'std')] / np.sqrt(df[(col, 'count')].replace(0, np.nan)))
        df[(col, 'mean±CI')] = (df[(col, 'mean')].round(4).astype(str) + 
                              " ± " + df[(col, 'CI')].round(4).astype(str))
    return df

def flatten_subset_df(df, subset=['dataset', 'method', 'ensemble_type', 'aurac_mean±CI']) -> pd.DataFrame:
    """Flatten the dataframe (AURAC-focused version)."""
    df.columns = ['_'.join(col).strip('_') if col[1] else col[0] for col in df.columns.values]
    return df[subset]

def compare_to_best(df: pd.DataFrame, metric: str, ensemble_types: list = None, 
                   calibration_method: str = 'pure_logits', alpha: float = 0.05) -> pd.DataFrame:
    """Compare all ensemble types to the best one for each dataset."""
    z = norm.ppf(1 - alpha/2)
    
    if ensemble_types is None:
        ensemble_types = df['ensemble_type'].unique().tolist()
    
    results = []
    for ds in df['dataset'].unique():
        sub = df[(df['dataset'] == ds) & (df['method'] == calibration_method) & 
                (df['ensemble_type'].isin(ensemble_types))].copy()
        
        if sub.empty:
            continue
            
        sub[f'{metric}_ci_lo'] = sub[f'{metric}_mean'] - z * sub[f'{metric}_std'] / np.sqrt(sub[f'{metric}_count'])
        sub[f'{metric}_ci_hi'] = sub[f'{metric}_mean'] + z * sub[f'{metric}_std'] / np.sqrt(sub[f'{metric}_count'])
        
        if metric in ['auc', 'aurac']:
            best_idx = sub[f'{metric}_mean'].idxmax()
        else:
            best_idx = sub[f'{metric}_mean'].idxmin()
        best = sub.loc[best_idx]
        
        for _, row in sub.iterrows():
            m1, s1, n1 = row[f'{metric}_mean'], row[f'{metric}_std'], row[f'{metric}_count']
            m0, s0, n0 = best[f'{metric}_mean'], best[f'{metric}_std'], best[f'{metric}_count']
            
            t_stat, p_val = ttest_ind_from_stats(mean1=m1, std1=s1, nobs1=n1,
                                               mean2=m0, std2=s0, nobs2=n0, equal_var=False)
            
            results.append({
                'dataset': ds, 'ensemble_type': row['ensemble_type'],
                'best_ensemble_type': best['ensemble_type'], 'mean': m1,
                'mean_best': m0, 'significant': (p_val < alpha)
            })
    
    return pd.DataFrame(results)

def create_report_view_df(df: pd.DataFrame, value_vars: list, 
                         calibration_method: str = 'pure_logits', custom_rows: list = None) -> pd.DataFrame:
    """Create wide-format DataFrame for reporting."""
    df = df.loc[df['method'] == calibration_method]
    
    dfm = df.melt(id_vars=['dataset', 'ensemble_type', 'method'], 
                 value_vars=value_vars, var_name='metric', value_name='value')
    
    metric_name_map = {'aurac_mean±CI': 'AURAC'}
    dfm['metric'] = dfm['metric'].map(metric_name_map)
    dfm['dataset'] = dfm['dataset'].map({
        'SetFit/mnli': 'SetFit', 'ag_news': 'News', 'dbpedia_14': 'DBpedia',
        'imdb': 'IMDB', 'mteb/tweet_sentiment_extraction': 'Tweet', 'stanfordnlp/sst2': 'SST-2'
    })
    
    pivoted = dfm.pivot_table(index=['ensemble_type', 'method'], columns=['metric', 'dataset'],
                             values='value', aggfunc='first')
    
    metrics_order = [metric_name_map[v] for v in value_vars]
    datasets_order = ['DBpedia', 'News', 'SST-2', 'SetFit', 'Tweet', 'IMDB']
    new_cols = pd.MultiIndex.from_product([metrics_order, datasets_order], names=['metric', 'dataset'])
    
    wide = pivoted.reindex(columns=new_cols).reset_index()
    
    if custom_rows is None:
        custom_rows = sorted(df['ensemble_type'].unique())
    
    wide['ensemble_type'] = pd.Categorical(wide['ensemble_type'], categories=custom_rows, ordered=True)
    wide = wide.sort_values(['ensemble_type', 'method']).reset_index(drop=True)
    
    return wide

print("Functions loaded successfully")

Functions loaded successfully


In [8]:
# Compute ONLY AURAC for April 2025 data (other metrics already exist)

def compute_aurac_only(row, base_path=base_path) -> float:
    """Compute only AURAC for a single row."""
    npfile = base_path / row['path']
    data = np.load(npfile, allow_pickle=True)
    probs, labels = data['ensemble_probs'], data['labels']
    aurac = compute_aurac(labels, probs)
    return aurac

def add_aurac_to_existing_data(df, base_path=base_path, output_path=None) -> None:
    """Add AURAC column to existing DataFrame with other metrics."""
    df = df.copy()
    print(f"Computing AURAC for {len(df)} rows...")
    tqdm.pandas(desc="Computing AURAC only")
    df['aurac'] = df.progress_apply(compute_aurac_only, axis=1, base_path=base_path)
    df['aorac'] = 1 - df['aurac']  # Also compute AORAC since it's trivial
    df.to_csv(output_path, index=False)
    print(f"AURAC added and saved to {output_path}")

# Load original data files (these should already have other metrics)
mini_path_1_orig = 'llm_experimental_results_mini_neurips_2025-04-20.csv'
mini_path_2_orig = 'llm_experimental_results_mini_neurips_2025-04-24.csv'
ftc_path_orig = 'llm_experimental_results_ftc_neurips_2025-04-25.csv'

print("Adding AURAC to existing April 2025 data...")

# Generate output paths consistently (outside try/except block)
from datetime import datetime
date_suffix = datetime.now().strftime("%m_%d")

output_path_mini_1_aurac_99 = base_path / 'llm_experiments_data' / 'metrics' / f'mini_1_with_aurac_cov_0p99_{date_suffix}.csv'
output_path_mini_2_aurac_99 = base_path / 'llm_experiments_data' / 'metrics' / f'mini_2_with_aurac_cov_0p99_{date_suffix}.csv'
output_path_ftc_aurac_99 = base_path / 'llm_experiments_data' / 'metrics' / f'ftc_with_aurac_cov_0p99_{date_suffix}.csv'

try:
    df_mini_1_orig = read_file(mini_path_1_orig)
    df_mini_2_orig = read_file(mini_path_2_orig)
    df_ftc_orig = read_file(ftc_path_orig)

    print(f"Mini 1 shape: {df_mini_1_orig.shape}")
    print(f"Mini 2 shape: {df_mini_2_orig.shape}")
    print(f"FTC shape: {df_ftc_orig.shape}")
    
    # Check if AURAC files already exist
    if output_path_ftc_aurac_99.exists():
        print("AURAC files already exist. Skipping computation.")
    else:
        # Only compute AURAC (much faster!)
        add_aurac_to_existing_data(df_mini_1_orig, base_path=base_path, output_path=output_path_mini_1_aurac_99)
        add_aurac_to_existing_data(df_mini_2_orig, base_path=base_path, output_path=output_path_mini_2_aurac_99)
        add_aurac_to_existing_data(df_ftc_orig, base_path=base_path, output_path=output_path_ftc_aurac_99)

        print("AURAC computation completed (only AURAC computed, other metrics preserved)")
        
    print("Output files:")
    print(f"  - {output_path_mini_1_aurac_99}")
    print(f"  - {output_path_mini_2_aurac_99}")
    print(f"  - {output_path_ftc_aurac_99}")
    
except FileNotFoundError as e:
    print(f"Original April 2025 files not found: {e}")
    print("This is normal if you only need tables from the newer data.")

Adding AURAC to existing April 2025 data...
Mini 1 shape: (312, 11)
Mini 2 shape: (160, 11)
FTC shape: (480, 11)
AURAC files already exist. Skipping computation.
Output files:
  - Ensembling_Finetuned_LLMs/llm_experiments_data/metrics/mini_1_with_aurac_cov_0p99_09_24.csv
  - Ensembling_Finetuned_LLMs/llm_experiments_data/metrics/mini_2_with_aurac_cov_0p99_09_24.csv
  - Ensembling_Finetuned_LLMs/llm_experiments_data/metrics/ftc_with_aurac_cov_0p99_09_24.csv


In [9]:
# Process FTC data and compute significance
df_ftc_aurac = pd.read_csv(output_path_ftc_aurac_99)
print(f"FTC data loaded: {df_ftc_aurac.shape}")
print(f"Unique ensemble types: {sorted(df_ftc_aurac['ensemble_type'].unique())}")

# Clean and calculate CIs
df_ftc_aurac_clean = combine_and_clean_dataframes(df_ftc_aurac)
df_ftc_aurac_ci = calc_ci_for_df(df_ftc_aurac_clean)
df_ftc_aurac_subset = flatten_subset_df(df_ftc_aurac_ci, subset=['dataset', 'method', 'ensemble_type', 'aurac_mean±CI'])

print(f"Processed FTC data: {df_ftc_aurac_subset.shape}")

# Process Mini data (combine mini_1 and mini_2)
df_mini_1_aurac = pd.read_csv(output_path_mini_1_aurac_99)
df_mini_2_aurac = pd.read_csv(output_path_mini_2_aurac_99)

print(f"Mini 1 data loaded: {df_mini_1_aurac.shape}")
print(f"Mini 2 data loaded: {df_mini_2_aurac.shape}")

# Combine mini datasets
df_mini_aurac_combined = combine_and_clean_dataframes(df_mini_1_aurac, df_mini_2_aurac)
print(f"Combined Mini data: {df_mini_aurac_combined.shape}")

# Calculate CIs
df_mini_aurac_ci = calc_ci_for_df(df_mini_aurac_combined)
df_mini_aurac_subset = flatten_subset_df(df_mini_aurac_ci, subset=['dataset', 'method', 'ensemble_type', 'aurac_mean±CI'])

print(f"Processed Mini data: {df_mini_aurac_subset.shape}")

# Run significance tests
print("Running significance tests for AURAC metric...")

df_ftc_significance_aurac = compare_to_best(df_ftc_aurac_ci, metric='aurac', calibration_method='pure_logits', alpha=0.05)
df_mini_significance_aurac = compare_to_best(df_mini_aurac_ci, metric='aurac', calibration_method='pure_logits', alpha=0.05)

print(f"FTC AURAC significance results: {df_ftc_significance_aurac.shape}")
print(f"Mini AURAC significance results: {df_mini_significance_aurac.shape}")

print("Data processing completed")

FTC data loaded: (480, 13)
Unique ensemble types: ['greedy_50_baseline', 'greedy_50_calib_every_step', 'greedy_50_calib_once', 'greedy_50_post_calib', 'greedy_50_temp_baseline', 'greedy_unique_5_baseline', 'greedy_unique_5_post_calib', 'greedy_unique_5_temp_baseline']
Processed FTC data: (96, 4)
Mini 1 data loaded: (312, 13)
Mini 2 data loaded: (160, 13)
Combined Mini data: (472, 13)
Processed Mini data: (96, 4)
Running significance tests for AURAC metric...
FTC AURAC significance results: (48, 6)
Mini AURAC significance results: (48, 6)
Data processing completed


In [10]:
# Perform significance tests for AURAC metric

# For FTC dataset
print("=== FTC AURAC Significance Tests ===")
df_ftc_significance_aurac = compare_to_best(
    df_ftc_aurac_ci, 
    metric='aurac', 
    calibration_method='pure_logits', 
    alpha=0.05
)

print("FTC AURAC significance results:")
print(df_ftc_significance_aurac[['dataset', 'ensemble_type', 'best_ensemble_type', 'mean', 'mean_best', 'significant']])

# For Mini dataset  
print("\n=== Mini AURAC Significance Tests ===")
df_mini_significance_aurac = compare_to_best(
    df_mini_aurac_ci,
    metric='aurac',
    calibration_method='pure_logits', 
    alpha=0.05
)

print("Mini AURAC significance results:")
print(df_mini_significance_aurac[['dataset', 'ensemble_type', 'best_ensemble_type', 'mean', 'mean_best', 'significant']])

print("\nSignificance testing completed successfully!")

=== FTC AURAC Significance Tests ===
FTC AURAC significance results:
                            dataset                  ensemble_type  \
0                       SetFit/mnli             greedy_50_baseline   
1                       SetFit/mnli     greedy_50_calib_every_step   
2                       SetFit/mnli           greedy_50_calib_once   
3                       SetFit/mnli           greedy_50_post_calib   
4                       SetFit/mnli        greedy_50_temp_baseline   
5                       SetFit/mnli       greedy_unique_5_baseline   
6                       SetFit/mnli     greedy_unique_5_post_calib   
7                       SetFit/mnli  greedy_unique_5_temp_baseline   
8                           ag_news             greedy_50_baseline   
9                           ag_news     greedy_50_calib_every_step   
10                          ag_news           greedy_50_calib_once   
11                          ag_news           greedy_50_post_calib   
12                   

In [11]:
# Add LaTeX table formatting functions (copied from main tables notebook)

def parse_mean_ci(value_str: str) -> float:
    """
    Extract mean value from 'mean ± CI' string.
    
    Args:
        value_str: String in format "0.1234 ± 0.0056"
    
    Returns:
        float: The mean value
    """
    if pd.isna(value_str) or value_str == 'nan':
        return float('nan')
    return float(value_str.split(' ± ')[0])

def format_cell_value(value_str: str, is_best: bool, is_not_significant: bool) -> str:
    """
    Apply LaTeX formatting to a cell value.
    
    Args:
        value_str: Original value string "mean ± CI"
        is_best: True if this is the best value in the column
        is_not_significant: True if not significantly different from best
    
    Returns:
        str: LaTeX formatted string
    """
    if pd.isna(value_str) or value_str == 'nan':
        return ''
    
    # Start with the original value
    formatted = value_str
    
    # Apply bold if it's the best value
    if is_best:
        formatted = f"\\textbf{{{formatted}}}"
    
    # Apply shading if it's not significantly different from best OR if it is the best
    if is_not_significant or is_best:
        formatted = f"\\cellcolor{{gray!20}}{formatted}"
    
    return formatted

def get_significance_for_dataset(df_significance: pd.DataFrame, 
                                dataset: str, 
                                ensemble_type: str) -> tuple[bool, bool]:
    """
    Get significance information for a specific dataset-ensemble combination.
    
    Args:
        df_significance: DataFrame from compare_to_best function
        dataset: Dataset name (e.g., 'DBpedia', 'News')
        ensemble_type: Ensemble type name (ORIGINAL name, not display name)
    
    Returns:
        tuple: (is_best, is_not_significant)
    """
    # Map short dataset names back to full names for lookup
    dataset_map_reverse = {
        'DBpedia': 'dbpedia_14',
        'News': 'ag_news', 
        'SST-2': 'stanfordnlp/sst2',
        'SetFit': 'SetFit/mnli',
        'Tweet': 'mteb/tweet_sentiment_extraction',
        'IMDB': 'imdb'
    }
    
    full_dataset_name = dataset_map_reverse.get(dataset, dataset)
    
    # Find the row for this dataset and ensemble type
    mask = (df_significance['dataset'] == full_dataset_name) & \
           (df_significance['ensemble_type'] == ensemble_type)
    
    if not mask.any():
        return False, False
    
    row = df_significance[mask].iloc[0]
    
    # Check if this ensemble type is the best
    is_best = (row['ensemble_type'] == row['best_ensemble_type'])
    
    # Check if it's not significantly different from best
    is_not_significant = not row['significant']
    
    return is_best, is_not_significant

def create_latex_table_with_significance(
    df_wide: pd.DataFrame,           # Wide-format data from create_report_view_df
    df_significance: pd.DataFrame,   # Significance test results
    ensemble_type_mapping: dict,     # Original -> Display name mapping
    metric_name: str,                # 'NLL', 'AUC', 'AURAC', 'Set Size'
    dataset_name: str,               # 'FTC-metadataset', 'Extended', 'Mini'
    caption: str,                    # Full caption text
    label: str,                      # e.g., 'tab:nll_pure_logits'
    maximize: bool = False           # True for AUC/AURAC, False for NLL/Set Size
) -> str:
    """
    Create a publication-ready LaTeX table with statistical significance formatting.
    
    Args:
        df_wide: Wide-format DataFrame with ensemble_type as rows, datasets as columns
        df_significance: Results from compare_to_best() function
        ensemble_type_mapping: Mapping from original names to display names
        metric_name: Name of the metric for the table
        dataset_name: Dataset name for caption
        caption: Full LaTeX caption
        label: LaTeX label for referencing
        maximize: True if higher values are better (AUC, AURAC), False otherwise
    
    Returns:
        str: Complete LaTeX table code
    """
    # Create a copy to avoid modifying the original
    df_formatted = df_wide.copy()
    
    # Handle metric name mapping - 'AUROC' should map to 'AUC' in the data
    metric_lookup = {'AUROC': 'AUC', 'Set_Size': 'Set Size'}
    data_metric_name = metric_lookup.get(metric_name, metric_name)
    
    # Get the metric column names (should be tuples like (metric, dataset))
    metric_cols = [col for col in df_formatted.columns if isinstance(col, tuple) and col[0] == data_metric_name]
    
    if not metric_cols:
        raise ValueError(f"No columns found for metric '{data_metric_name}' (original: '{metric_name}') in the DataFrame")
    
    # Create reverse mapping from display names to original names for significance lookup
    reverse_mapping = {v: k for k, v in ensemble_type_mapping.items()}
    
    # Process each dataset column
    for metric, dataset in metric_cols:
        col_name = (metric, dataset)
        
        # Apply formatting to each cell in this column
        for row_idx in range(len(df_formatted)):
            # Handle the ensemble_type column access properly with MultiIndex
            ensemble_display_name = df_formatted.iloc[row_idx]['ensemble_type']
            if isinstance(ensemble_display_name, pd.Series):
                ensemble_display_name = ensemble_display_name.iloc[0]
            
            ensemble_original_name = reverse_mapping.get(ensemble_display_name, ensemble_display_name)
            
            # Get the original value using iloc for MultiIndex
            original_value = df_formatted.iloc[row_idx][col_name]
            if isinstance(original_value, pd.Series):
                original_value = original_value.iloc[0]
            
            if pd.isna(original_value) or str(original_value) == 'nan':
                # Set empty string using loc for MultiIndex
                df_formatted.loc[row_idx, col_name] = ''
                continue
            
            # Get significance information  
            is_best, is_not_significant = get_significance_for_dataset(
                df_significance, dataset, ensemble_original_name
            )
            
            # Apply formatting
            formatted_value = format_cell_value(str(original_value), is_best, is_not_significant)
            # Set value using loc for MultiIndex
            df_formatted.loc[row_idx, col_name] = formatted_value
    
    # Build the LaTeX table structure
    datasets = [col[1] for col in metric_cols]  # Extract dataset names
    n_datasets = len(datasets)
    
    # Column specification: l for ensemble type + c for each dataset
    col_spec = 'l' + 'c' * n_datasets
    
    # Build header row
    header = ' & '.join(['Ensemble Type'] + datasets) + ' \\\\'
    
    # Build data rows
    data_rows = []
    for row_idx in range(len(df_formatted)):
        ensemble_name = df_formatted.iloc[row_idx]['ensemble_type']
        if isinstance(ensemble_name, pd.Series):
            ensemble_name = ensemble_name.iloc[0]
        
        values = []
        for col in metric_cols:
            value = df_formatted.iloc[row_idx][col]
            if isinstance(value, pd.Series):
                value = value.iloc[0]
            values.append(str(value))
        
        row_str = ' & '.join([str(ensemble_name)] + values) + ' \\\\'
        data_rows.append(row_str)
    
    # Create separators for different ensemble groups if needed
    formatted_rows = []
    for i, row_str in enumerate(data_rows):
        if i > 0 and ('G50' in row_str and 'G5' in data_rows[i-1] and 'G50' not in data_rows[i-1]):
            # Add midrule before first G50 row
            formatted_rows.append('\\midrule')
        formatted_rows.append(row_str)
    
    # Combine everything into the full LaTeX table
    latex_lines = [
        '\\begin{table}[h]',
        f'  \\caption{{{caption}}}',
        f'  \\label{{{label}}}',
        '  \\centering',
        '  \\resizebox{\\textwidth}{!}{%',
        f'  \\begin{{tabular}}{{{col_spec}}}',
        '    \\toprule',
        f'    {header}',
        '    \\midrule'
    ]
    
    # Add data rows
    for row in formatted_rows:
        latex_lines.append(f'    {row}')
    
    # Close the table
    latex_lines.extend([
        '    \\bottomrule',
        '  \\end{tabular}',
        '  }',
        '\\end{table}'
    ])
    
    # Return with proper newlines
    return '\n'.join(latex_lines)

## AURAC for Full Dataset


In [12]:
# Generate properly formatted FTC AURAC table with statistical significance

# Define ensemble type mapping for FTC data
etype_map_ftc = {
    'greedy_unique_5_baseline': 'G5',
    'greedy_unique_5_temp_baseline': 'G5 p.t.c.',
    'greedy_unique_5_post_calib': 'G5 JUCAL',
    'greedy_50_baseline': 'G50',
    'greedy_50_temp_baseline': 'G50 p.t.c.',
    'greedy_50_post_calib': 'G50 JUCAL',
    'greedy_50_calib_once': 'G50 r.c.o. JUCAL',
    'greedy_50_calib_every_step': 'G50 r.c. JUCAL'
}

# Define proper ensemble ordering for FTC data
ensemble_order_ftc = [
    'G5', 'G5 p.t.c.', 'G5 JUCAL',
    'G50', 'G50 p.t.c.', 'G50 JUCAL', 
    'G50 r.c.o. JUCAL', 'G50 r.c. JUCAL'
]

# Apply ensemble mapping to data
df_ftc_aurac_subset_mapped = df_ftc_aurac_subset.copy()
for original, mapped in etype_map_ftc.items():
    df_ftc_aurac_subset_mapped['ensemble_type'] = df_ftc_aurac_subset_mapped['ensemble_type'].replace(original, mapped)

# Create FTC AURAC wide-format table
df_ftc_report_aurac = create_report_view_df(
    df_ftc_aurac_subset_mapped,
    value_vars=['aurac_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=ensemble_order_ftc
)

# Generate caption
caption = ("Full dataset: Area Under the Rejection-Accuracy Curve (AURAC) over data splits; "
          "mean ± 95\\% confidence interval half-width) on the full dataset (100\\%). "
          "The best mean is shown in bold, and methods not significantly different from the best "
          "(paired test, \\( \\alpha = 0.05 \\)) are shaded.")

# Generate LaTeX table with significance formatting
latex_table_ftc = create_latex_table_with_significance(
    df_wide=df_ftc_report_aurac,
    df_significance=df_ftc_significance_aurac,
    ensemble_type_mapping=etype_map_ftc,
    metric_name='AURAC',
    dataset_name='Full',
    caption=caption,
    label='tab:aurac_ftc_pure_logits',
    maximize=True
)

# Save to file
latex_path = Path('LATEX/llm')
latex_path.mkdir(parents=True, exist_ok=True)
filename_ftc = 'ftc_aurac_formatted.tex'
filepath_ftc = latex_path / filename_ftc

with open(filepath_ftc, 'w') as f:
    f.write(latex_table_ftc)

print("=== FTC (Full Dataset) AURAC TABLE ===")
print(latex_table_ftc)
print(f"\n✅ FTC AURAC table saved to: {filepath_ftc}")
print(f"Table shape: {df_ftc_report_aurac.shape}")

=== FTC (Full Dataset) AURAC TABLE ===
\begin{table}[h]
  \caption{Full dataset: Area Under the Rejection-Accuracy Curve (AURAC) over data splits; mean ± 95\% confidence interval half-width) on the full dataset (100\%). The best mean is shown in bold, and methods not significantly different from the best (paired test, \( \alpha = 0.05 \)) are shaded.}
  \label{tab:aurac_ftc_pure_logits}
  \centering
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{lcccccc}
    \toprule
    Ensemble Type & DBpedia & News & SST-2 & SetFit & Tweet & IMDB \\
    \midrule
    G5 & 0.9895 ± 0.0 & 0.981 ± 0.0011 & 0.984 ± 0.0005 & 0.8915 ± 0.0008 & 0.9103 ± 0.0028 & \cellcolor{gray!20}\textbf{0.9859 ± 0.0002} \\
    G5 p.t.c. & 0.9895 ± 0.0 & 0.981 ± 0.0011 & 0.984 ± 0.0005 & 0.8915 ± 0.0008 & 0.9103 ± 0.0027 & \cellcolor{gray!20}0.9859 ± 0.0002 \\
    G5 JUCAL & \cellcolor{gray!20}0.9897 ± 0.0 & \cellcolor{gray!20}0.9829 ± 0.0005 & 0.9842 ± 0.0005 & 0.924 ± 0.0006 & 0.9211 ± 0.0006 & \cellcolor{gray!20}0.9858

## Aurac for Mini Dataset

In [13]:
# Generate properly formatted Mini AURAC table with statistical significance

# Apply ensemble mapping to Mini data (same mapping as FTC)
df_mini_aurac_subset_mapped = df_mini_aurac_subset.copy()
for original, mapped in etype_map_ftc.items():
    df_mini_aurac_subset_mapped['ensemble_type'] = df_mini_aurac_subset_mapped['ensemble_type'].replace(original, mapped)

# Create Mini AURAC wide-format table
df_mini_report_aurac = create_report_view_df(
    df_mini_aurac_subset_mapped,
    value_vars=['aurac_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=ensemble_order_ftc
)

# Generate caption for Mini dataset
caption_mini = ("Mini dataset: Area Under the Rejection-Accuracy Curve (AURAC) over data splits; "
               "mean ± 95\\% confidence interval half-width) on the full dataset (100\\%). "
               "The best mean is shown in bold, and methods not significantly different from the best "
               "(paired test, \\( \\alpha = 0.05 \\)) are shaded.")

# Generate LaTeX table with significance formatting
latex_table_mini = create_latex_table_with_significance(
    df_wide=df_mini_report_aurac,
    df_significance=df_mini_significance_aurac,
    ensemble_type_mapping=etype_map_ftc,
    metric_name='AURAC',
    dataset_name='Mini',
    caption=caption_mini,
    label='tab:aurac_mini_pure_logits',
    maximize=True
)

# Save to file
filename_mini = 'mini_aurac_formatted.tex'
filepath_mini = latex_path / filename_mini

with open(filepath_mini, 'w') as f:
    f.write(latex_table_mini)

print("=== MINI DATASET AURAC TABLE ===")
print(latex_table_mini)
print(f"\n✅ Mini AURAC table saved to: {filepath_mini}")
print(f"Table shape: {df_mini_report_aurac.shape}")

print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print("✅ Both AURAC tables generated with proper LaTeX formatting!")
print("✅ Statistical significance formatting applied (bold + shading)")
print("✅ Tables ready for your ICLR paper")
print(f"\nGenerated files:")
print(f"  - {filepath_ftc}")
print(f"  - {filepath_mini}")
print(f"\nFeatures:")
print("  • Bold formatting for best values")
print("  • Gray shading for non-significantly different values")
print("  • Proper LaTeX table structure with captions and labels")
print("  • Single backslashes (correct LaTeX formatting)")
print("  • 'Full dataset' and 'Mini dataset' in captions")

=== MINI DATASET AURAC TABLE ===
\begin{table}[h]
  \caption{Mini dataset: Area Under the Rejection-Accuracy Curve (AURAC) over data splits; mean ± 95\% confidence interval half-width) on the full dataset (100\%). The best mean is shown in bold, and methods not significantly different from the best (paired test, \( \alpha = 0.05 \)) are shaded.}
  \label{tab:aurac_mini_pure_logits}
  \centering
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{lcccccc}
    \toprule
    Ensemble Type & DBpedia & News & SST-2 & SetFit & Tweet & IMDB \\
    \midrule
    G5 & \cellcolor{gray!20}0.9895 ± 0.0001 & 0.9769 ± 0.0002 & 0.979 ± 0.0008 & 0.9406 ± 0.0005 & 0.8982 ± 0.0026 & 0.9809 ± 0.0006 \\
    G5 p.t.c. & \cellcolor{gray!20}0.9895 ± 0.0001 & 0.9769 ± 0.0003 & 0.979 ± 0.0008 & 0.9407 ± 0.0005 & 0.8981 ± 0.0026 & 0.9809 ± 0.0006 \\
    G5 JUCAL & 0.9895 ± 0.0001 & \cellcolor{gray!20}0.9779 ± 0.0003 & 0.9817 ± 0.0004 & 0.95 ± 0.0005 & 0.9025 ± 0.0018 & \cellcolor{gray!20}0.9819 ± 0.0005 \\
    \midru