# LLM Analysis Tables

This notebook generates LaTeX tables from pre-computed metrics for the LLM uncertainty analysis.

**Prerequisites:** Run `iclr_llm_analysis_plots_v2.ipynb` first to compute metrics.

**Contents:**
1. Load pre-computed metrics from plots_v2 notebook
2. Generate significance tests 
3. Create LaTeX tables for all metrics (NLL, AUC, AURAC, Set Size)
4. Special case: Compute AURAC for older April 2025 data (not in plots_v2)

In [16]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind_from_stats, norm

sys.path.append('../src/')
from plotting import get_coverage_threshold_and_size, get_auc, compute_aurac

## Helper Functions for Table Generation

In [17]:
base_path = Path('Ensembling_Finetuned_LLMs')

def read_file(file_path, base_path=base_path) -> pd.DataFrame:
    """
    Read a file and return a DataFrame.
    """
    path  = base_path / 'llm_experiments_data' / file_path
    if not path.exists():
        raise FileNotFoundError(f"File {path} does not exist.")
    return pd.read_csv(path)

def combine_and_clean_dataframes(df1, df2 = None) -> pd.DataFrame:
    """
    Combine and clean the dataframes.
    """
    parts = [df1]
    if df2 is not None:
        parts.append(df2)
    # Combine the dataframes
    combined_df = pd.concat(parts, ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=['dataset', 'seed', 'method', 'ensemble_type'], keep='first')
    combined_df.reset_index(drop=True, inplace=True)
    return combined_df

def calc_ci_for_df(df) -> pd.DataFrame:
    """
    Code assumes the right columns are present in the dataframe
    """
    #group by dataset, method, ensemble_type and calculate mean, std, count
    df = df.groupby(['dataset', 'method', 'ensemble_type'])[
    ['ensemble_size', 'ensemble_unique_size', 'nll_test', 'c1', 'c2', 'epi_scalar', 'threshold', 'set_size', 'auc', 'aurac', 'aorac' ]
    ].agg(
        ['mean', 'std', 'count']
        ).reset_index()
    # note we will get a double index
    columns = ['ensemble_size', 'ensemble_unique_size', 'nll_test', 'c1', 'c2', 'epi_scalar', 'threshold', 'set_size', 'auc', 'aurac', 'aorac' ]
    for col in columns:
        df[(col, 'CI')] = 1.96 * (df[(col, 'std')] / np.sqrt(df[(col, 'count')].replace(0, np.nan)))
        df[(col,'mean±CI')] = (df[(col, 'mean')].round(4).astype(str) + 
                              " ± " + df[(col, 'CI')].round(4).astype(str))
    return df

def flatten_subset_df(df, subset = ['dataset', 'method', 'ensemble_type', 'nll_test_mean±CI',
                                     'threshold_mean±CI', 'set_size_mean±CI', 'auc_mean±CI', 'aurac_mean±CI', 'aorac_mean±CI']) -> pd.DataFrame:
    """
    Flatten the dataframe
    """
    # Flatten the multi-index columns
    df.columns = [
                '_'.join(col).strip('_') if col[1] else col[0] 
                  for col in df.columns.values]
    #return subset of columns
    return df[subset]

## Functions for determining statistical significance

In [18]:
def compare_to_best(
    df: pd.DataFrame,
    metric: str,
    ensemble_types: list = None,  # Auto-detect if None
    calibration_method: str = 'pure_logits',
    alpha: float = 0.05
) -> pd.DataFrame:
    """
    For each dataset, find the best ensemble_type on `metric`,
    then compare *all* ensemble_types to that best one.

    Args:
      df                 : DataFrame with columns
                           ['dataset','ensemble_type','method',
                            f'{metric}_mean', f'{metric}_std', f'{metric}_count', …]
      ensemble_types     : list of ensemble_type values to compare (auto-detect if None)
      calibration_method : the `method` value to filter on (e.g. 'pure_logits')
      metric             : one of ['nll_test','threshold','set_size','auc','aurac','aorac']
      alpha              : for 95% CI and p<alpha test

    Returns:
      DataFrame with one row per (dataset, ensemble_type) containing:
        • best_ensemble_type  
        • mean & CI for that row & for the best  
        • ci_no_overlap (bool)  
        • significant (bool, p<alpha)  
        • p_value, t_stat
    """
    z = norm.ppf(1 - alpha/2)  # ≈1.96 for alpha=0.05
    print(f"Using z={z:.2f} for alpha={alpha:.2f}")
    
    # Auto-detect ensemble types if not provided
    if ensemble_types is None:
        ensemble_types = df['ensemble_type'].unique().tolist()
        print(f"Auto-detected ensemble types: {ensemble_types}")
    
    results = []

    for ds in df['dataset'].unique():
        # restrict to this dataset, this calibration method, and only those ensemble_types
        sub = df[
            (df['dataset'] == ds) &
            (df['method']  == calibration_method) &
            (df['ensemble_type'].isin(ensemble_types))
        ].copy()

        if sub.empty:
            print(f"Warning: No data for dataset {ds} with method {calibration_method} and ensemble types {ensemble_types}")
            continue

        # compute 95% CI boundaries for every row
        sub[f'{metric}_ci_lo'] = sub[f'{metric}_mean'] - z * sub[f'{metric}_std'] / np.sqrt(sub[f'{metric}_count'])
        sub[f'{metric}_ci_hi'] = sub[f'{metric}_mean'] + z * sub[f'{metric}_std'] / np.sqrt(sub[f'{metric}_count'])

        # pick the best row - for AURAC we want max, for AORAC we want min
        if metric in ['auc', 'aurac']:
            best_idx = sub[f'{metric}_mean'].idxmax()
        else:
            best_idx = sub[f'{metric}_mean'].idxmin()
        best = sub.loc[best_idx]
        print(f"{best['ensemble_type']} is the best ensemble type for {ds} with {metric}={best[metric+'_mean']:.4f} ")
        
        # compare every row to best
        for _, row in sub.iterrows():
            m1, s1, n1 = row[f'{metric}_mean'], row[f'{metric}_std'], row[f'{metric}_count']
            lo1, hi1 = row[f'{metric}_ci_lo'], row[f'{metric}_ci_hi']

            m0, s0, n0 = best[f'{metric}_mean'], best[f'{metric}_std'], best[f'{metric}_count']
            lo0, hi0 = best[f'{metric}_ci_lo'], best[f'{metric}_ci_hi']

            # CI non-overlap?
            ci_no_overlap = (hi1 < lo0) or (hi0 < lo1)

            # Welch's t-test
            t_stat, p_val = ttest_ind_from_stats(
                mean1=m1, std1=s1, nobs1=n1,
                mean2=m0, std2=s0, nobs2=n0,
                equal_var=False
            )
            significant = (p_val < alpha)

            results.append({
                'calibration_method': calibration_method,
                'metric':              metric,
                'ci_no_overlap':       ci_no_overlap,
                'hi':                  hi1,
                'lo':                  lo1,
                'best_ensemble_type':  best['ensemble_type'],
                'dataset':             ds,
                'mean':                m1,
                'mean_best':           m0,
                'ensemble_type':       row['ensemble_type'],
                'significant':         significant
            })

    return pd.DataFrame(results)

def compare_to_two(
    df: pd.DataFrame,
    metric: str,
    ensemble_type_1: str,
    ensemble_type_2: str,
    calibration_method: str = 'pure_logits',
    alpha: float = 0.05
) -> pd.DataFrame:
    """
    Compare two ensemble types (ensemble_type_1 vs. ensemble_type_2)
    on a single metric, for every dataset.

    Returns columns:
      - dataset
      - metric
      - ensemble_type_1, ensemble_type_2
      - mean_1, ci_lo_1, ci_hi_1
      - mean_2, ci_lo_2, ci_hi_2
      - ci_no_overlap  (True if CIs are disjoint)
      - t_stat, p_value, significant
    """
    # z-value for two-sided (1-alpha) CI
    z = norm.ppf(1 - alpha/2)
    print(f"Using z = {z:.2f} for {100*(1-alpha):.0f}% CI")

    results = []
    for ds in df['dataset'].unique():
        # filter to this dataset, this method, these two ensemble types
        sub = df[
            (df['dataset'] == ds) &
            (df['method']  == calibration_method) &
            (df['ensemble_type'].isin([ensemble_type_1, ensemble_type_2]))
        ]
        if len(sub) < 2:
            # skip if we don't have both
            continue

        # pull them out
        row1 = sub[sub['ensemble_type'] == ensemble_type_1].iloc[0]
        row2 = sub[sub['ensemble_type'] == ensemble_type_2].iloc[0]

        # summary stats
        m1, s1, n1 = row1[f"{metric}_mean"], row1[f"{metric}_std"], row1[f"{metric}_count"]
        m2, s2, n2 = row2[f"{metric}_mean"], row2[f"{metric}_std"], row2[f"{metric}_count"]

        # 95% CIs
        lo1, hi1 = m1 - z*s1/np.sqrt(n1), m1 + z*s1/np.sqrt(n1)
        lo2, hi2 = m2 - z*s2/np.sqrt(n2), m2 + z*s2/np.sqrt(n2)

        # disjoint?
        ci_no_overlap = (hi1 < lo2) or (hi2 < lo1)

        # Welch's t-test
        t_stat, p_val = ttest_ind_from_stats(
            mean1=m1, std1=s1, nobs1=n1,
            mean2=m2, std2=s2, nobs2=n2,
            equal_var=False
        )
        significant = (p_val < alpha)
        # NOTE we again report no overlap => True stronger than significant
        results.append({
            'dataset'                : ds,
            'metric'                 : metric,
            f'mean_{ensemble_type_1}': m1,
            f'mean_{ensemble_type_2}': m2,
            'ci_no_overlap'          : ci_no_overlap,
            'significant'            : significant,
            f'ci_lo_{ensemble_type_1}': lo1,
            f'ci_hi_{ensemble_type_1}': hi1,
            f'ci_lo_{ensemble_type_2}': lo2,
            f'ci_hi_{ensemble_type_2}': hi2,
        })

    return pd.DataFrame(results)

### Create the table, view, for the report

In [19]:
def create_report_view_df(df: pd.DataFrame,
                          value_vars: list, 
                          calibration_method: str = 'pure_logits',
                          custom_rows: list = None) -> pd.DataFrame:
    """
    Melt & pivot a stats‐DataFrame so that 'metric'×'dataset' becomes
    a two‐level column index, but only for the metrics in value_vars.
    
    Args:
      df         : DataFrame with columns
                   ['dataset','ensemble_type','method', *value_vars*]
      value_vars : list of the exact column names in `df` to pivot,
                   e.g. ['nll_test_mean±CI', 'auc_mean±CI', 'aurac_mean±CI', 'aorac_mean±CI']
      custom_rows: list of ensemble_type values in desired order, or None to auto-detect
    Returns:
      Wide‐form DataFrame indexed by (ensemble_type, method), with
      columns MultiIndex (metric_short, dataset_short).
    """
    calibration_methods = df['method'].unique()
    if calibration_method not in calibration_methods:
        raise ValueError(f"calibration_method {calibration_method} not in {calibration_methods}")
    #filter based on calibration method
    df = df.loc[df['method'] == calibration_method]
    # 'nll_test_mean±CI', 'threshold_mean±CI', 'set_size_mean±CI', 'auc_mean±CI', 'aurac_mean±CI', 'aorac_mean±CI'
    dfm = df.melt(
        id_vars    = ['dataset','ensemble_type','method'],
        value_vars = value_vars,
        var_name   = 'metric',
        value_name = 'value'
    )

    metric_name_map = {
        'nll_test_mean±CI'   : 'NLL',
        'threshold_mean±CI'  : 'Threshold',
        'set_size_mean±CI'   : 'Set Size',
        'auc_mean±CI'        : 'AUC',
        'aurac_mean±CI'      : 'AURAC',
        'aorac_mean±CI'      : 'AORAC'
    }
    # 3) map both metric and dataset to their short names
    dfm['metric']  = dfm['metric'].map(metric_name_map)
    dfm['dataset'] = dfm['dataset'].map({
        'SetFit/mnli'                     : 'SetFit',
        'ag_news'                         : 'News',
        'dbpedia_14'                      : 'DBpedia',
        'imdb'                            : 'IMDB',
        'mteb/tweet_sentiment_extraction' : 'Tweet',
        'stanfordnlp/sst2'                : 'SST-2'
    })
    
    # 4) pivot to a two‐level column index
    pivoted = dfm.pivot_table(
        index   = ['ensemble_type','method'],
        columns = ['metric','dataset'],
        values  = 'value',
        aggfunc = 'first'
    )
    
    # 5) decide on the exact metric‐order you want (in the same order as value_vars)
    metrics_order = [ metric_name_map[v] for v in value_vars ]
    datasets_order = ['DBpedia', 'News', 'SST-2', 'SetFit', 'Tweet', 'IMDB']
    
    # 6) build the MultiIndex of all (metric, dataset) pairs you need
    new_cols = pd.MultiIndex.from_product(
        [metrics_order, datasets_order],
        names=['metric','dataset']
    )
    
    # 7) re‐index the pivoted DataFrame to force that exact column order
    wide = pivoted.reindex(columns=new_cols)
    
    # 8) bring ensemble_type & method back as columns, sort rows, drop any unwanted
    wide = wide.reset_index()
    
    # 9) sort rows by a categorical order of ensemble_type
    if custom_rows is None:
        # Auto-detect from data if not provided
        custom_rows = sorted(df['ensemble_type'].unique())
        print(f"Auto-detected ensemble types: {custom_rows}")
    
    wide['ensemble_type'] = pd.Categorical(
        wide['ensemble_type'],
        categories=custom_rows,
        ordered=True
    )
    wide = wide.sort_values(['ensemble_type','method']).reset_index(drop=True)
    
    return wide

### LaTeX Table Generation with Statistical Significance Formatting

Functions to create publication-ready LaTeX tables with:
- **Bold** formatting for best values
- **Shading** for values not significantly different from best
- Proper LaTeX structure with captions and labels

In [20]:
def parse_mean_ci(value_str: str) -> float:
    """
    Extract mean value from 'mean ± CI' string.
    
    Args:
        value_str: String in format "0.1234 ± 0.0056"
    
    Returns:
        float: The mean value
    """
    if pd.isna(value_str) or value_str == 'nan':
        return float('nan')
    return float(value_str.split(' ± ')[0])

def format_cell_value(value_str: str, is_best: bool, is_not_significant: bool) -> str:
    """
    Apply LaTeX formatting to a cell value.
    
    Args:
        value_str: Original value string "mean ± CI"
        is_best: True if this is the best value in the column
        is_not_significant: True if not significantly different from best
    
    Returns:
        str: LaTeX formatted string
    """
    if pd.isna(value_str) or value_str == 'nan':
        return ''
    
    # Start with the original value
    formatted = value_str
    
    # Apply bold if it's the best value
    if is_best:
        formatted = f"\\textbf{{{formatted}}}"
    
    # Apply shading if it's not significantly different from best OR if it is the best
    if is_not_significant or is_best:
        formatted = f"\\cellcolor{{gray!20}}{formatted}"
    
    return formatted

def get_significance_for_dataset(df_significance: pd.DataFrame, 
                                dataset: str, 
                                ensemble_type: str) -> tuple[bool, bool]:
    """
    Get significance information for a specific dataset-ensemble combination.
    
    Args:
        df_significance: DataFrame from compare_to_best function
        dataset: Dataset name (e.g., 'DBpedia', 'News')
        ensemble_type: Ensemble type name (ORIGINAL name, not display name)
    
    Returns:
        tuple: (is_best, is_not_significant)
    """
    # Map short dataset names back to full names for lookup
    dataset_map_reverse = {
        'DBpedia': 'dbpedia_14',
        'News': 'ag_news', 
        'SST-2': 'stanfordnlp/sst2',
        'SetFit': 'SetFit/mnli',
        'Tweet': 'mteb/tweet_sentiment_extraction',
        'IMDB': 'imdb'
    }
    
    full_dataset_name = dataset_map_reverse.get(dataset, dataset)
    
    # Find the row for this dataset and ensemble type
    mask = (df_significance['dataset'] == full_dataset_name) & \
           (df_significance['ensemble_type'] == ensemble_type)
    
    if not mask.any():
        return False, False
    
    row = df_significance[mask].iloc[0]
    
    # Check if this ensemble type is the best
    is_best = (row['ensemble_type'] == row['best_ensemble_type'])
    
    # Check if it's not significantly different from best
    is_not_significant = not row['significant']
    
    return is_best, is_not_significant

def create_latex_table_with_significance(
    df_wide: pd.DataFrame,           # Wide-format data from create_report_view_df
    df_significance: pd.DataFrame,   # Significance test results
    ensemble_type_mapping: dict,     # Original -> Display name mapping
    metric_name: str,                # 'NLL', 'AUC', 'AURAC', 'Set Size'
    dataset_name: str,               # 'FTC-metadataset', 'Extended', 'Mini'
    caption: str,                    # Full caption text
    label: str,                      # e.g., 'tab:nll_pure_logits'
    maximize: bool = False           # True for AUC/AURAC, False for NLL/Set Size
) -> str:
    """
    Create a publication-ready LaTeX table with statistical significance formatting.
    
    Args:
        df_wide: Wide-format DataFrame with ensemble_type as rows, datasets as columns
        df_significance: Results from compare_to_best() function
        ensemble_type_mapping: Mapping from original names to display names
        metric_name: Name of the metric for the table
        dataset_name: Dataset name for caption
        caption: Full LaTeX caption
        label: LaTeX label for referencing
        maximize: True if higher values are better (AUC, AURAC), False otherwise
    
    Returns:
        str: Complete LaTeX table code
    """
    # Create a copy to avoid modifying the original
    df_formatted = df_wide.copy()
    
    # Handle metric name mapping - 'AUROC' should map to 'AUC' in the data
    metric_lookup = {'AUROC': 'AUC', 'Set_Size': 'Set Size'}
    data_metric_name = metric_lookup.get(metric_name, metric_name)
    
    # Get the metric column names (should be tuples like (metric, dataset))
    metric_cols = [col for col in df_formatted.columns if isinstance(col, tuple) and col[0] == data_metric_name]
    
    if not metric_cols:
        raise ValueError(f"No columns found for metric '{data_metric_name}' (original: '{metric_name}') in the DataFrame")
    
    # Create reverse mapping from display names to original names for significance lookup
    reverse_mapping = {v: k for k, v in ensemble_type_mapping.items()}
    
    # Process each dataset column
    for metric, dataset in metric_cols:
        col_name = (metric, dataset)
        
        # Apply formatting to each cell in this column
        for row_idx in range(len(df_formatted)):
            # Handle the ensemble_type column access properly with MultiIndex (FIXED)
            # Use iloc to access by integer position and then get scalar value
            ensemble_display_name = df_formatted.iloc[row_idx]['ensemble_type']
            if isinstance(ensemble_display_name, pd.Series):
                ensemble_display_name = ensemble_display_name.iloc[0]
            
            ensemble_original_name = reverse_mapping.get(ensemble_display_name, ensemble_display_name)
            
            # Get the original value using iloc for MultiIndex (FIXED)
            original_value = df_formatted.iloc[row_idx][col_name]
            if isinstance(original_value, pd.Series):
                original_value = original_value.iloc[0]
            
            if pd.isna(original_value) or str(original_value) == 'nan':
                # Set empty string using loc for MultiIndex
                df_formatted.loc[row_idx, col_name] = ''
                continue
            
            # Get significance information  
            is_best, is_not_significant = get_significance_for_dataset(
                df_significance, dataset, ensemble_original_name
            )
            
            # Apply formatting
            formatted_value = format_cell_value(str(original_value), is_best, is_not_significant)
            # Set value using loc for MultiIndex
            df_formatted.loc[row_idx, col_name] = formatted_value
    
    # Build the LaTeX table structure (also fixing data access here)
    datasets = [col[1] for col in metric_cols]  # Extract dataset names
    n_datasets = len(datasets)
    
    # Column specification: l for ensemble type + c for each dataset
    col_spec = 'l' + 'c' * n_datasets
    
    # Build header row - FIXED: single backslashes
    header = ' & '.join(['Ensemble Type'] + datasets) + ' \\\\'
    
    # Build data rows (FIXED for MultiIndex)
    data_rows = []
    for row_idx in range(len(df_formatted)):
        ensemble_name = df_formatted.iloc[row_idx]['ensemble_type']
        if isinstance(ensemble_name, pd.Series):
            ensemble_name = ensemble_name.iloc[0]
        
        values = []
        for col in metric_cols:
            value = df_formatted.iloc[row_idx][col]
            if isinstance(value, pd.Series):
                value = value.iloc[0]
            values.append(str(value))
        
        row_str = ' & '.join([str(ensemble_name)] + values) + ' \\\\'
        data_rows.append(row_str)
    
    # Create separators for different ensemble groups if needed
    formatted_rows = []
    for i, row_str in enumerate(data_rows):
        if i > 0 and 'Greedy-50' in row_str:
            # Add midrule before first Greedy-50 row
            prev_row = data_rows[i-1]
            if 'Greedy-5' in prev_row and 'Greedy-50' not in prev_row:
                formatted_rows.append('\\midrule')
        formatted_rows.append(row_str)
    
    # Combine everything into the full LaTeX table - FIXED: single backslashes
    latex_lines = [
        '\\begin{table}[h]',
        f'  \\caption{{{caption}}}',
        f'  \\label{{{label}}}',
        '  \\centering',
        '  \\resizebox{\\textwidth}{!}{%',
        f'  \\begin{{tabular}}{{{col_spec}}}',
        '    \\toprule',
        f'    {header}',
        '    \\midrule'
    ]
    
    # Add data rows
    for row in formatted_rows:
        latex_lines.append(f'    {row}')
    
    # Close the table
    latex_lines.extend([
        '    \\bottomrule',
        '  \\end{tabular}',
        '  }',
        '\\end{table}'
    ])
    
    # Return with proper newlines
    return '\n'.join(latex_lines)

## Load Pre-computed Metrics

Load metrics that have already been computed in iclr_llm_analysis_plots_v2.ipynb

In [21]:
# Load pre-computed metrics from iclr_llm_analysis_plots_v2.ipynb
# These files contain all computed metrics (NLL, AUC, AURAC, set_size, etc.)

# Load metrics files computed with different dates
try:
    # Try latest files first (from plots_v2 notebook)
    df_extended_99 = read_file('metrics/extended_with_metrics_cov_0p99_2025-09-21.csv')
    df_extended_999 = read_file('metrics/extended_with_metrics_cov_0p999_2025-09-21.csv')
    df_mini_99 = read_file('metrics/mini_with_metrics_cov_0p99_2025-09-21.csv')
    df_mini_999 = read_file('metrics/mini_with_metrics_cov_0p999_2025-09-21.csv')
    
    print("✓ Loaded latest metrics from plots_v2 (2025-09-21)")
    print(f"Extended 99%: {df_extended_99.shape}")
    print(f"Mini 99%: {df_mini_99.shape}")
    print(f"Columns available: {df_extended_99.columns.tolist()}")
    
except FileNotFoundError:
    print("Latest metrics files not found, trying fallback files...")
    # Try older files as fallback  
    try:
        df_extended_99 = read_file('metrics/ftc_with_metrics_cov_0p99_09_10.csv')
        df_extended_999 = read_file('metrics/ftc_with_metrics_cov_0p999_09_10.csv')
        df_mini_1_99 = read_file('metrics/mini_1_with_metrics_cov_0p99_09_10.csv')
        df_mini_2_99 = read_file('metrics/mini_2_with_metrics_cov_0p99_09_10.csv')
        df_mini_1_999 = read_file('metrics/mini_1_with_metrics_cov_0p999_09_10.csv')
        df_mini_2_999 = read_file('metrics/mini_2_with_metrics_cov_0p999_09_10.csv')
        
        # Combine mini datasets
        df_mini_99 = combine_and_clean_dataframes(df_mini_1_99, df_mini_2_99)
        df_mini_999 = combine_and_clean_dataframes(df_mini_1_999, df_mini_2_999)
        
        print("✓ Loaded fallback metrics (09_10)")
        print(f"Extended 99%: {df_extended_99.shape}")
        print(f"Mini 99%: {df_mini_99.shape}")
        
    except FileNotFoundError:
        print(" No pre-computed metrics found!")
        print("Please run iclr_llm_analysis_plots_v2.ipynb first to compute metrics.")
        raise

✓ Loaded latest metrics from plots_v2 (2025-09-21)
Extended 99%: (240, 18)
Mini 99%: (240, 18)
Columns available: ['dataset', 'seed', 'method', 'ensemble_type', 'ensemble_size', 'ensemble_unique_size', 'nll_test', 'c1', 'c2', 'epi_scalar', 'ensemble_time', 'calibration_time', 'path', 'threshold', 'set_size', 'auc', 'aurac', 'aorac']


## Generate LaTeX Tables from Pre-computed Metrics

Create tables for all available metrics using the loaded data.

In [22]:
# Process loaded metrics for table generation
method_of_interest = 'pure_logits'

# Filter for pure_logits method only
df_extended_99_filtered = df_extended_99.loc[df_extended_99['method'] == method_of_interest]
df_extended_999_filtered = df_extended_999.loc[df_extended_999['method'] == method_of_interest]
df_mini_99_filtered = df_mini_99.loc[df_mini_99['method'] == method_of_interest]
df_mini_999_filtered = df_mini_999.loc[df_mini_999['method'] == method_of_interest]

print("Filtered data for pure_logits method:")
print(f"Extended 99%: {df_extended_99_filtered.shape}")
print(f"Mini 99%: {df_mini_99_filtered.shape}")
print(f"Available ensemble types: {sorted(df_extended_99_filtered['ensemble_type'].unique())}")
print("Available metrics columns:")
metric_cols = [col for col in df_extended_99_filtered.columns if col in ['nll_test', 'auc', 'aurac', 'set_size', 'threshold']]
print(metric_cols)

# Calculate confidence intervals for all datasets
df_extended_99_ci = calc_ci_for_df(df_extended_99_filtered.copy())
df_extended_999_ci = calc_ci_for_df(df_extended_999_filtered.copy())
df_mini_99_ci = calc_ci_for_df(df_mini_99_filtered.copy())
df_mini_999_ci = calc_ci_for_df(df_mini_999_filtered.copy())

# Create subset dataframes with the metrics we need for tables
subset_cols = ['dataset', 'method', 'ensemble_type', 'nll_test_mean±CI', 'auc_mean±CI', 'set_size_mean±CI']
if 'aurac' in df_extended_99_filtered.columns:
    subset_cols.append('aurac_mean±CI')

print(f"Table columns to include: {subset_cols}")

df_extended_99_subset = flatten_subset_df(df_extended_99_ci, subset=subset_cols)
df_extended_999_subset = flatten_subset_df(df_extended_999_ci, subset=subset_cols)
df_mini_99_subset = flatten_subset_df(df_mini_99_ci, subset=subset_cols)
df_mini_999_subset = flatten_subset_df(df_mini_999_ci, subset=subset_cols)

print("✓ Processed metrics for table generation")

Filtered data for pure_logits method:
Extended 99%: (240, 18)
Mini 99%: (240, 18)
Available ensemble types: ['greedy_50_baseline', 'greedy_50_post_calib', 'greedy_50_temp_calibrate_then_pool', 'greedy_50_temp_pool_then_calibrate', 'greedy_5_baseline', 'greedy_5_post_calib', 'greedy_5_temp_calibrate_then_pool', 'greedy_5_temp_pool_then_calibrate']
Available metrics columns:
['nll_test', 'threshold', 'set_size', 'auc', 'aurac']
Table columns to include: ['dataset', 'method', 'ensemble_type', 'nll_test_mean±CI', 'auc_mean±CI', 'set_size_mean±CI', 'aurac_mean±CI']
✓ Processed metrics for table generation


## Discover Available Ensemble Types

Check what ensemble types are actually in the loaded data to create proper mappings.

In [23]:
# Discover what ensemble types are in the loaded data
print("=== ENSEMBLE TYPES IN LOADED DATA ===")
print("Use this information to update the mapping dictionaries below!\n")

print(" Extended dataset ensemble types:")
for i, et in enumerate(sorted(df_extended_99_filtered['ensemble_type'].unique()), 1):
    print(f"  {i}. '{et}'")

print("\nMini dataset ensemble types:")
for i, et in enumerate(sorted(df_mini_99_filtered['ensemble_type'].unique()), 1):
    print(f"  {i}. '{et}'")

print("\n" + "="*60)
print("COPY THE TYPES ABOVE TO UPDATE THE MAPPINGS BELOW")
print("="*60)

=== ENSEMBLE TYPES IN LOADED DATA ===
Use this information to update the mapping dictionaries below!

 Extended dataset ensemble types:
  1. 'greedy_50_baseline'
  2. 'greedy_50_post_calib'
  3. 'greedy_50_temp_calibrate_then_pool'
  4. 'greedy_50_temp_pool_then_calibrate'
  5. 'greedy_5_baseline'
  6. 'greedy_5_post_calib'
  7. 'greedy_5_temp_calibrate_then_pool'
  8. 'greedy_5_temp_pool_then_calibrate'

Mini dataset ensemble types:
  1. 'greedy_50_baseline'
  2. 'greedy_50_post_calib'
  3. 'greedy_50_temp_calibrate_then_pool'
  4. 'greedy_50_temp_pool_then_calibrate'
  5. 'greedy_5_baseline'
  6. 'greedy_5_post_calib'
  7. 'greedy_5_temp_calibrate_then_pool'
  8. 'greedy_5_temp_pool_then_calibrate'

COPY THE TYPES ABOVE TO UPDATE THE MAPPINGS BELOW


## Define Ensemble Type Mappings

Update these mappings based on the output above to get proper table labels.

In [24]:
# Mapping for Extended/Mini datasets (from plots_v2)
etype_map_extended = {
    'greedy_5_baseline': 'Greedy-5',
    'greedy_5_temp_pool_then_calibrate': 'Greedy-5 p.t.c.',
    'greedy_5_temp_calibrate_then_pool': 'Greedy-5 c.t.p.',
    'greedy_5_post_calib': 'Greedy-5 JUCAL',
    'greedy_50_baseline': 'Greedy-50',
    'greedy_50_temp_pool_then_calibrate': 'Greedy-50 p.t.c.',
    'greedy_50_temp_calibrate_then_pool': 'Greedy-50 c.t.p.',
    'greedy_50_post_calib': 'Greedy-50 JUCAL'
}

# Mapping for FTC dataset (April 2025 data)
etype_map_ftc = {
    'greedy_unique_5_baseline': 'G5',
    'greedy_unique_5_temp_baseline': 'G5 p.t.c.',
    'greedy_unique_5_post_calib': 'G5 JUCAL',
    'greedy_50_baseline': 'G50',
    'greedy_50_temp_baseline': 'G50 p.t.c.',
    'greedy_50_post_calib': 'G50 JUCAL',
    'greedy_50_calib_once': 'G50 r.c.o. JUCAL',
    'greedy_50_calib_every_step': 'G50 r.c. JUCAL'
}

# Define the correct ordering patterns for each dataset type
ensemble_order_extended = [
    'Greedy-5',
    'Greedy-5 p.t.c.',
    'Greedy-5 c.t.p.',
    'Greedy-5 JUCAL',
    'Greedy-50',
    'Greedy-50 p.t.c.',
    'Greedy-50 c.t.p.',
    'Greedy-50 JUCAL'
]

ensemble_order_ftc = [
    'G5',
    'G5 p.t.c.',
    'G5 JUCAL',
    'G50',
    'G50 p.t.c.',
    'G50 JUCAL',
    'G50 r.c.o. JUCAL',
    'G50 r.c. JUCAL'
]

def get_ensemble_order_for_dataset(available_types, dataset_type='extended'):
    """
    Get ensemble types in the correct logical order based on dataset type.
    
    Args:
        available_types: List of ensemble types available in the data
        dataset_type: 'extended', 'mini', or 'ftc'
    
    Returns:
        List of ensemble types in correct order
    """
    if dataset_type in ['extended', 'mini']:
        preferred_order = ensemble_order_extended
    elif dataset_type == 'ftc':
        preferred_order = ensemble_order_ftc
    else:
        # Fallback to alphabetical if unknown type
        return sorted(available_types)
    
    # Filter to only include available types in the correct order
    ordered_types = [t for t in preferred_order if t in available_types]
    
    # Add any remaining types that don't match patterns (fallback)
    for t in available_types:
        if t not in ordered_types:
            ordered_types.append(t)
            
    return ordered_types

print("✓ Mapping dictionaries and ordering patterns defined")
print(f"✓ Extended mapping has {len(etype_map_extended)} entries")
print(f"✓ FTC mapping has {len(etype_map_ftc)} entries")
print(f"✓ Extended order: {ensemble_order_extended}")
print(f"✓ FTC order: {ensemble_order_ftc}")

✓ Mapping dictionaries and ordering patterns defined
✓ Extended mapping has 8 entries
✓ FTC mapping has 8 entries
✓ Extended order: ['Greedy-5', 'Greedy-5 p.t.c.', 'Greedy-5 c.t.p.', 'Greedy-5 JUCAL', 'Greedy-50', 'Greedy-50 p.t.c.', 'Greedy-50 c.t.p.', 'Greedy-50 JUCAL']
✓ FTC order: ['G5', 'G5 p.t.c.', 'G5 JUCAL', 'G50', 'G50 p.t.c.', 'G50 JUCAL', 'G50 r.c.o. JUCAL', 'G50 r.c. JUCAL']


### Running significance tests

In [25]:
# Generate significance tests for all metrics
print("Running significance tests...")

# Extended dataset significance tests
print("=== Extended Dataset Significance Tests ===")
if 'aurac' in df_extended_99_filtered.columns:
    df_extended_sig_aurac_99 = compare_to_best(df_extended_99_ci, metric='aurac', calibration_method='pure_logits', alpha=0.05)
    print(f"✓ Extended AURAC significance: {df_extended_sig_aurac_99.shape}")

df_extended_sig_nll_99 = compare_to_best(df_extended_99_ci, metric='nll_test', calibration_method='pure_logits', alpha=0.05)
df_extended_sig_auc_99 = compare_to_best(df_extended_99_ci, metric='auc', calibration_method='pure_logits', alpha=0.05)
df_extended_sig_setsize_99 = compare_to_best(df_extended_99_ci, metric='set_size', calibration_method='pure_logits', alpha=0.05)

print(f"✓ Extended NLL significance: {df_extended_sig_nll_99.shape}")
print(f"✓ Extended AUC significance: {df_extended_sig_auc_99.shape}")
print(f"✓ Extended Set Size significance: {df_extended_sig_setsize_99.shape}")

# Mini dataset significance tests
print("\n=== Mini Dataset Significance Tests ===")
if 'aurac' in df_mini_99_filtered.columns:
    df_mini_sig_aurac_99 = compare_to_best(df_mini_99_ci, metric='aurac', calibration_method='pure_logits', alpha=0.05)
    print(f"✓ Mini AURAC significance: {df_mini_sig_aurac_99.shape}")

df_mini_sig_nll_99 = compare_to_best(df_mini_99_ci, metric='nll_test', calibration_method='pure_logits', alpha=0.05)
df_mini_sig_auc_99 = compare_to_best(df_mini_99_ci, metric='auc', calibration_method='pure_logits', alpha=0.05)
df_mini_sig_setsize_99 = compare_to_best(df_mini_99_ci, metric='set_size', calibration_method='pure_logits', alpha=0.05)

print(f"✓ Mini NLL significance: {df_mini_sig_nll_99.shape}")
print(f"✓ Mini AUC significance: {df_mini_sig_auc_99.shape}")
print(f"✓ Mini Set Size significance: {df_mini_sig_setsize_99.shape}")

print("\n✓ All significance tests completed")

Running significance tests...
=== Extended Dataset Significance Tests ===
Using z=1.96 for alpha=0.05
Auto-detected ensemble types: ['greedy_50_baseline', 'greedy_50_post_calib', 'greedy_50_temp_calibrate_then_pool', 'greedy_50_temp_pool_then_calibrate', 'greedy_5_baseline', 'greedy_5_post_calib', 'greedy_5_temp_calibrate_then_pool', 'greedy_5_temp_pool_then_calibrate']
greedy_5_post_calib is the best ensemble type for SetFit/mnli with aurac=0.9240 
greedy_50_post_calib is the best ensemble type for ag_news with aurac=0.9835 
greedy_50_post_calib is the best ensemble type for dbpedia_14 with aurac=0.9897 
greedy_5_baseline is the best ensemble type for imdb with aurac=0.9859 
greedy_50_temp_calibrate_then_pool is the best ensemble type for mteb/tweet_sentiment_extraction with aurac=0.9246 
greedy_50_temp_calibrate_then_pool is the best ensemble type for stanfordnlp/sst2 with aurac=0.9849 
✓ Extended AURAC significance: (48, 11)
Using z=1.96 for alpha=0.05
Auto-detected ensemble types: 

greedy_50_post_calib is the best ensemble type for ag_news with nll_test=0.1423 
greedy_50_post_calib is the best ensemble type for dbpedia_14 with nll_test=0.0288 
greedy_50_post_calib is the best ensemble type for imdb with nll_test=0.0983 
greedy_50_post_calib is the best ensemble type for mteb/tweet_sentiment_extraction with nll_test=0.4680 
greedy_50_post_calib is the best ensemble type for stanfordnlp/sst2 with nll_test=0.1090 
Using z=1.96 for alpha=0.05
Auto-detected ensemble types: ['greedy_50_baseline', 'greedy_50_post_calib', 'greedy_50_temp_calibrate_then_pool', 'greedy_50_temp_pool_then_calibrate', 'greedy_5_baseline', 'greedy_5_post_calib', 'greedy_5_temp_calibrate_then_pool', 'greedy_5_temp_pool_then_calibrate']
greedy_5_post_calib is the best ensemble type for SetFit/mnli with auc=0.9377 
greedy_50_post_calib is the best ensemble type for ag_news with auc=0.9948 
greedy_50_post_calib is the best ensemble type for dbpedia_14 with auc=0.9999 
greedy_5_temp_calibrate_then_

### Generate LaTeX tables for all metrics (NLL, AUC, AURAC, Set Size)

In [26]:
# Generate LaTeX tables for all metrics with proper ensemble mappings
print("Generating LaTeX tables...")

# Ensure latex directory exists
latex_path = Path('LATEX/llm')
latex_path.mkdir(parents=True, exist_ok=True)

# Apply ensemble mappings to Extended/Mini data BEFORE creating tables
df_extended_99_subset_mapped = df_extended_99_subset.copy()
df_mini_99_subset_mapped = df_mini_99_subset.copy()

# Apply Extended mapping
for original, mapped in etype_map_extended.items():
    df_extended_99_subset_mapped['ensemble_type'] = df_extended_99_subset_mapped['ensemble_type'].replace(original, mapped)
    df_mini_99_subset_mapped['ensemble_type'] = df_mini_99_subset_mapped['ensemble_type'].replace(original, mapped)

# Get ensemble types with proper ordering using the predefined patterns
available_extended = list(df_extended_99_subset_mapped['ensemble_type'].unique())
available_mini = list(df_mini_99_subset_mapped['ensemble_type'].unique())

extended_ensemble_order = get_ensemble_order_for_dataset(available_extended, 'extended')
mini_ensemble_order = get_ensemble_order_for_dataset(available_mini, 'mini')

print(f"Extended ensemble types (proper order): {extended_ensemble_order}")
print(f"Mini ensemble types (proper order): {mini_ensemble_order}")

# Extended dataset tables
print("\n=== Extended Dataset Tables ===")
if 'aurac_mean±CI' in df_extended_99_subset_mapped.columns:
    df_extended_report_aurac = create_report_view_df(
        df_extended_99_subset_mapped, 
        value_vars=['aurac_mean±CI'], 
        calibration_method='pure_logits',
        custom_rows=extended_ensemble_order
    )
    print("\n EXTENDED AURAC TABLE:")
    print(df_extended_report_aurac.to_latex(None, index=False, escape=False))
    print(f"✓ Extended AURAC table: {df_extended_report_aurac.shape}")

df_extended_report_nll = create_report_view_df(
    df_extended_99_subset_mapped, 
    value_vars=['nll_test_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=extended_ensemble_order
)
df_extended_report_auc = create_report_view_df(
    df_extended_99_subset_mapped, 
    value_vars=['auc_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=extended_ensemble_order
)
df_extended_report_setsize = create_report_view_df(
    df_extended_99_subset_mapped, 
    value_vars=['set_size_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=extended_ensemble_order
)

print("\nEXTENDED NLL TABLE:")
print(df_extended_report_nll.to_latex(None, index=False, escape=False))
print("\nEXTENDED AUC TABLE:")
print(df_extended_report_auc.to_latex(None, index=False, escape=False))
print("\nEXTENDED SET SIZE TABLE:")
print(df_extended_report_setsize.to_latex(None, index=False, escape=False))

print(f"✓ Extended NLL table: {df_extended_report_nll.shape}")
print(f"✓ Extended AUC table: {df_extended_report_auc.shape}")
print(f"✓ Extended Set Size table: {df_extended_report_setsize.shape}")

# Mini dataset tables
print("\n=== Mini Dataset Tables ===")
if 'aurac_mean±CI' in df_mini_99_subset_mapped.columns:
    df_mini_report_aurac = create_report_view_df(
        df_mini_99_subset_mapped, 
        value_vars=['aurac_mean±CI'], 
        calibration_method='pure_logits',
        custom_rows=mini_ensemble_order
    )
    print("\nMINI AURAC TABLE:")
    print(df_mini_report_aurac.to_latex(None, index=False, escape=False))
    print(f"✓ Mini AURAC table: {df_mini_report_aurac.shape}")

df_mini_report_nll = create_report_view_df(
    df_mini_99_subset_mapped, 
    value_vars=['nll_test_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=mini_ensemble_order
)
df_mini_report_auc = create_report_view_df(
    df_mini_99_subset_mapped, 
    value_vars=['auc_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=mini_ensemble_order
)
df_mini_report_setsize = create_report_view_df(
    df_mini_99_subset_mapped, 
    value_vars=['set_size_mean±CI'], 
    calibration_method='pure_logits',
    custom_rows=mini_ensemble_order
)

print("\nMINI NLL TABLE:")
print(df_mini_report_nll.to_latex(None, index=False, escape=False))
print("\nMINI AUC TABLE:")
print(df_mini_report_auc.to_latex(None, index=False, escape=False))
print("\nMINI SET SIZE TABLE:")
print(df_mini_report_setsize.to_latex(None, index=False, escape=False))

print(f"✓ Mini NLL table: {df_mini_report_nll.shape}")
print(f"✓ Mini AUC table: {df_mini_report_auc.shape}")
print(f"✓ Mini Set Size table: {df_mini_report_setsize.shape}")

print("\nNext steps:")
print("1. Use the new table generator functions for statistical significance formatting")
print("2. For FTC data, use etype_map_ftc and dataset_type='ftc'")

Generating LaTeX tables...
Extended ensemble types (proper order): ['Greedy-5', 'Greedy-5 p.t.c.', 'Greedy-5 c.t.p.', 'Greedy-5 JUCAL', 'Greedy-50', 'Greedy-50 p.t.c.', 'Greedy-50 c.t.p.', 'Greedy-50 JUCAL']
Mini ensemble types (proper order): ['Greedy-5', 'Greedy-5 p.t.c.', 'Greedy-5 c.t.p.', 'Greedy-5 JUCAL', 'Greedy-50', 'Greedy-50 p.t.c.', 'Greedy-50 c.t.p.', 'Greedy-50 JUCAL']

=== Extended Dataset Tables ===

 EXTENDED AURAC TABLE:
\begin{tabular}{llllllll}
\toprule
ensemble_type & method & \multicolumn{6}{r}{AURAC} \\
 &  & DBpedia & News & SST-2 & SetFit & Tweet & IMDB \\
\midrule
Greedy-5 & pure_logits & 0.9895 ± 0.0 & 0.981 ± 0.0011 & 0.984 ± 0.0005 & 0.8915 ± 0.0008 & 0.9103 ± 0.0028 & 0.9859 ± 0.0002 \\
Greedy-5 p.t.c. & pure_logits & 0.9895 ± 0.0 & 0.981 ± 0.0011 & 0.984 ± 0.0005 & 0.8915 ± 0.0008 & 0.9103 ± 0.0027 & 0.9859 ± 0.0002 \\
Greedy-5 c.t.p. & pure_logits & 0.9896 ± 0.0 & 0.9819 ± 0.0008 & 0.9842 ± 0.0005 & 0.9186 ± 0.0007 & 0.9187 ± 0.0011 & 0.9859 ± 0.0003 \\
G

# 📊 Generate ALL Publication-Ready LaTeX Tables with Statistical Significance

**This section consolidates generation of ALL formatted tables for your ICLR paper.**

- **8 tables total**: 4 Extended + 4 Mini (each with NLL, AUROC, AURAC, Set Size)
- **All with statistical significance formatting**: Bold for best, shading for non-significant
- **Proper LaTeX structure**: Captions, labels, booktabs formatting
- **Consistent file naming**: `{dataset}_{metric}_formatted.tex`
- **Correct naming**: AUC → AUROC (Area under the ROC)

In [27]:
# === GLOBAL CONFIGURATION ===
print(" GENERATING ALL PUBLICATION-READY LATEX TABLES")
print("=" * 60)

# Define paths and settings once
latex_path = Path('LATEX/llm/iclr')
latex_path.mkdir(parents=True, exist_ok=True)
print(f" All LaTeX tables will be saved to: {latex_path}")

# Define table configurations
# Format: metric_name: (value_column, significance_df, maximize, description)
table_configs = {
    'extended': {
        'NLL': ('nll_test_mean±CI', df_extended_sig_nll_99, False, "Negative log-likelihood"),
        'AUROC': ('auc_mean±CI', df_extended_sig_auc_99, True, "Area under the ROC"),
        'AURAC': ('aurac_mean±CI', df_extended_sig_aurac_99, True, "Area Under the Rejection-Accuracy Curve"),
        'Set_Size': ('set_size_mean±CI', df_extended_sig_setsize_99, False, "Predictive Set Size")
    },
    'mini': {
        'NLL': ('nll_test_mean±CI', df_mini_sig_nll_99, False, "Negative log-likelihood"),
        'AUROC': ('auc_mean±CI', df_mini_sig_auc_99, True, "Area under the ROC"),
        'AURAC': ('aurac_mean±CI', df_mini_sig_aurac_99, True, "Area Under the Rejection-Accuracy Curve"),
        'Set_Size': ('set_size_mean±CI', df_mini_sig_setsize_99, False, "Predictive Set Size")
    }
}

# Show summary of what will be generated
total_tables = sum(len(configs) for configs in table_configs.values())
print(f" Will generate {total_tables} publication-ready tables:")
for dataset_name, configs in table_configs.items():
    print(f"  • {dataset_name.title()}: {list(configs.keys())}")
print()

 GENERATING ALL PUBLICATION-READY LATEX TABLES
 All LaTeX tables will be saved to: LATEX/llm/iclr
 Will generate 8 publication-ready tables:
  • Extended: ['NLL', 'AUROC', 'AURAC', 'Set_Size']
  • Mini: ['NLL', 'AUROC', 'AURAC', 'Set_Size']



In [28]:
# === DATA PREPARATION ===
print("PREPARING DATA FOR ALL TABLES")
print("-" * 40)

# Apply ensemble mappings to Extended/Mini data
df_extended_mapped = df_extended_99_subset.copy()
df_mini_mapped = df_mini_99_subset.copy()

for original, mapped in etype_map_extended.items():
    df_extended_mapped['ensemble_type'] = df_extended_mapped['ensemble_type'].replace(original, mapped)
    df_mini_mapped['ensemble_type'] = df_mini_mapped['ensemble_type'].replace(original, mapped)

# Get proper ordering using predefined patterns
available_extended = list(df_extended_mapped['ensemble_type'].unique())
available_mini = list(df_mini_mapped['ensemble_type'].unique())

extended_order = get_ensemble_order_for_dataset(available_extended, 'extended')
mini_order = get_ensemble_order_for_dataset(available_mini, 'mini')

print(f"✓ Extended ensemble order: {extended_order}")
print(f"✓ Mini ensemble order: {mini_order}")

# Prepare data dictionary for easy access
datasets_prepared = {
    'extended': {
        'data': df_extended_mapped,
        'order': extended_order,
        'mapping': etype_map_extended
    },
    'mini': {
        'data': df_mini_mapped,
        'order': mini_order,
        'mapping': etype_map_extended
    }
}

print("✓ Data preparation completed")
print()

PREPARING DATA FOR ALL TABLES
----------------------------------------
✓ Extended ensemble order: ['Greedy-5', 'Greedy-5 p.t.c.', 'Greedy-5 c.t.p.', 'Greedy-5 JUCAL', 'Greedy-50', 'Greedy-50 p.t.c.', 'Greedy-50 c.t.p.', 'Greedy-50 JUCAL']
✓ Mini ensemble order: ['Greedy-5', 'Greedy-5 p.t.c.', 'Greedy-5 c.t.p.', 'Greedy-5 JUCAL', 'Greedy-50', 'Greedy-50 p.t.c.', 'Greedy-50 c.t.p.', 'Greedy-50 JUCAL']
✓ Data preparation completed



In [None]:
# === GENERATE ALL PUBLICATION-READY TABLES ===
# IMPORTANT: Restart kernel before running this cell to load updated functions
print("GENERATING ALL FORMATTED LATEX TABLES")  
print("=" * 60)
print("NOTE: Make sure you've restarted the kernel to load updated functions!")
print()

generated_files = []
failed_tables = []

# Process each dataset and each metric
for dataset_name, dataset_info in datasets_prepared.items():
    print(f"\nGenerating {dataset_name.upper()} dataset tables...")
    
    for metric_name, (value_col, sig_df, maximize, description) in table_configs[dataset_name].items():
        try:
            print(f"Processing {metric_name}...")
            
            # Skip if column doesn't exist (e.g., AURAC might not be in older data)
            if value_col not in dataset_info['data'].columns:
                print(f"  Skipping {metric_name}: column '{value_col}' not found")
                failed_tables.append(f"{dataset_name}_{metric_name}")
                continue
            
            # Create wide-format table
            df_wide = create_report_view_df(
                dataset_info['data'],
                value_vars=[value_col],
                calibration_method='pure_logits',
                custom_rows=dataset_info['order']
            )
            
            # Generate caption with correct metric names
            if metric_name == 'NLL':
                metric_formula = "\\\\( \\\\text{NLL}_{\\\\text{mean}} \\\\)"
            elif metric_name == 'AUROC':
                metric_formula = "AUROC"
            elif metric_name == 'AURAC':
                metric_formula = "AURAC"
            else:
                metric_formula = metric_name.replace('_', ' ')
            
            # Fix dataset name for caption: Extended -> Full (FIXED)
            caption_dataset_name = "Full" if dataset_name == "extended" else dataset_name.title()
            
            caption = (f"{caption_dataset_name} dataset: {description} ({metric_formula}) over data splits; "
                      "mean ± 95\\\\% confidence interval half-width) on the full dataset (100\\\\%). "
                      "The best mean is shown in bold, and methods not significantly different from the best "
                      "(paired test, \\\\( \\\\alpha = 0.05 \\\\)) are shaded.")
            
            # Generate LaTeX table with significance formatting
            latex_table = create_latex_table_with_significance(
                df_wide=df_wide,
                df_significance=sig_df,
                ensemble_type_mapping=dataset_info['mapping'],
                metric_name=metric_name,  # This will be 'AUROC' now instead of 'AUC'
                dataset_name=dataset_name.title(),
                caption=caption,
                label=f'tab:{metric_name.lower()}_{dataset_name}_pure_logits',
                maximize=maximize
            )
            
            # Save to file with AUROC in filename
            filename = f'{dataset_name}_{metric_name.lower()}_formatted.tex'
            filepath = latex_path / filename
            
            with open(filepath, 'w') as f:
                f.write(latex_table)
            
            generated_files.append(str(filepath))
            print(f"  Saved: {filename}")
            
            # DEMO: Print first table properly formatted (FIXED)
            if len(generated_files) == 1:
                print("\n" + "="*60)
                print("SAMPLE TABLE (properly formatted):")
                print("="*60)
                print(latex_table)  # This will now display with proper newlines
                print("="*60)
            
        except Exception as e:
            import traceback
            print(f"  ERROR: Failed to generate {dataset_name} {metric_name}: {str(e)}")
            traceback.print_exc()
            failed_tables.append(f"{dataset_name}_{metric_name}")

print(f"\n" + "=" * 60)
print("GENERATION SUMMARY")
print("=" * 60)
print(f"Successfully generated: {len(generated_files)} tables")
print(f"Failed to generate: {len(failed_tables)} tables")

if generated_files:
    print(f"\nGenerated files:")
    for file in generated_files:
        print(f"  - {file}")

if failed_tables:
    print(f"\nFailed tables:")
    for table in failed_tables:
        print(f"  - {table}")

print(f"\nAll publication-ready LaTeX tables saved to: {latex_path}")
print("Tables include statistical significance formatting (bold + shading)")
print("Ready for your ICLR paper!")
print("\nNote: AUC tables are now correctly named as AUROC tables")

GENERATING ALL FORMATTED LATEX TABLES
NOTE: Make sure you've restarted the kernel to load updated functions!


Generating EXTENDED dataset tables...
Processing NLL...
  Saved: extended_nll_formatted.tex

SAMPLE TABLE (properly formatted):
\begin{table}[h]
  \caption{Full dataset: Negative log-likelihood (\\( \\text{NLL}_{\\text{mean}} \\)) over data splits; mean ± 95\\% confidence interval half-width) on the full dataset (100\\%). The best mean is shown in bold, and methods not significantly different from the best (paired test, \\( \\alpha = 0.05 \\)) are shaded.}
  \label{tab:nll_extended_pure_logits}
  \centering
  \resizebox{\textwidth}{!}{%
  \begin{tabular}{lcccccc}
    \toprule
    Ensemble Type & DBpedia & News & SST-2 & SetFit & Tweet & IMDB \\
    \midrule
    Greedy-5 & 0.0376 ± 0.0005 & 0.1682 ± 0.0048 & 0.1359 ± 0.0051 & 0.5465 ± 0.0033 & 0.5095 ± 0.0089 & 0.1171 ± 0.0028 \\
    Greedy-5 p.t.c. & 0.0348 ± 0.0007 & 0.1618 ± 0.0052 & 0.1208 ± 0.004 & 0.5431 ± 0.0019 & 0.5012