# Alt-Test: How to Justify Replacing Humans by LLMs

**This notebook has been edited from the original Alt-Test version to apply it to our data -- ProxAnn authors**

To run the alt-test, you need two dictionaries: one with human annotations and another with LLM predictions. Then, you should call the `alt_test` function: <br>
```python
winning_rate, advantage_prob = alt_test(llm_annotations, humans_annotations, scoring_function, epsilon)
```

The `winning_rate` is the proportion of human annotators against whom the LLM "wins"; if `winning_rate >= 0.5`, the LLM passes the test. <br>
The `advantage_prob` estimates the probability that the LLM annotations are as good as or better than a randomly selected human annotator. It should be used to compare LLMs (higher is better). <br>


### Parameters

- **`humans_annotations`**:
  A dictionary of dictionaries where:
  - Outer keys represent annotators (annotator ids).
  - Inner dictionaries with keys representing instances (instance ids) and values representing annotations.
  **Example:**
  ```python
  {
      'annotator1': {'instance1': 'A', 'instance2': 'B'},
      'annotator2': {'instance1': 'A', 'instance2': 'C', 'instance3': 'A'}
  }
  ```

- **`llm_annotations`**:
  A dictionary where the keys represent instances (instance ids) and the values represent LLM predictions.
  **Example:**
  ```python
  {'instance1': 'A', 'instance2': 'B', 'instance3': 'A'}
  ```

- **`scoring_function`**:
  Specifies how predictions are evaluated. Can be:
  - A string: `'accuracy'`, `'neg_rmse'`, `'neg_rmse_topic'`, or `'tau'`.
  - A custom function: Takes a prediction and a list of annotations as inputs, returning a score.

- **`epsilon`**:
  A float representing the cost-benefit penalty for the null hypothesis. Suggested values:
  - **0.2**: if annotators are experts.
  - **0.15**: if annotators are skilled.
  - **0.1**: if annotators are crowd-workers.

In [None]:
import json
import os
import numpy as np
from collections import defaultdict
from pathlib import Path
from scipy.stats import ttest_1samp, wilcoxon, kendalltau
from typing import List, Dict, Any, Callable, Union

import pandas as pd
from tqdm.auto import tqdm

from sklearn.metrics import root_mean_squared_error
from proxann.llm_annotations.utils import process_responses, collect_fit_rank_data

In [2]:
def open_json(file_path: str) -> Any:
    """Parse the JSON file at ``file_path`` and return its content.

    The file is decoded explicitly as UTF-8 instead of the platform's locale
    encoding, so the same file loads identically on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed document; a dict or a list depending on the file's
        top-level JSON value.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def by_procedure(p_values: List[float], q: float) -> List[int]:
    """Select rejected hypotheses with the BY step-up rule.

    The p-values are sorted ascending and the i-th smallest (1-based) is
    compared against ``(i / m) * (q / H_m)``, where ``m`` is the number of
    p-values and ``H_m = 1 + 1/2 + ... + 1/m``. Every hypothesis up to and
    including the largest rank that meets its threshold is rejected.

    Args:
        p_values: One p-value per hypothesis.
        q: Target level used to scale the thresholds.

    Returns:
        Positions (into ``p_values``) of the rejected hypotheses, ordered by
        ascending p-value; an empty list when nothing is rejected.
    """
    pvals = np.array(p_values, dtype=float)
    n_tests = len(pvals)
    order = np.argsort(pvals)

    ranks = np.arange(1, n_tests + 1)
    harmonic_sum = np.sum(1.0 / ranks)
    thresholds = (ranks / n_tests) * (q / harmonic_sum)

    # Step-up: only the LAST rank that passes matters; earlier ranks are
    # rejected even if they individually miss their own threshold.
    passing_ranks = np.flatnonzero(pvals[order] <= thresholds)
    if passing_ranks.size == 0:
        return []
    return list(order[:passing_ranks[-1] + 1])


def accuracy(pred: Any, annotations: List[Any]) -> float:
    """Return the fraction of ``annotations`` that are equal to ``pred``."""
    matches = []
    for reference in annotations:
        matches.append(pred == reference)
    return float(np.mean(matches))


def neg_rmse(pred: Union[int, float], annotations: List[Union[int, float]]) -> float:
    """Return minus the root-mean-squared difference between ``pred`` and each annotation.

    Negated so that, like the other scoring functions, larger means better.
    """
    squared_errors = []
    for reference in annotations:
        squared_errors.append((pred - reference) ** 2)
    rmse = float(np.sqrt(np.mean(squared_errors)))
    return -1 * rmse


def neg_rmse_topic(pred: list, annotations: List[List[Union[int, float]]]) -> float:
    """Return minus the mean RMSE between ``pred`` and each annotator's list of values.

    One RMSE is computed per annotation list; the per-annotator values are
    then averaged and negated so that larger means better.
    """
    per_annotator_rmse = []
    for reference in annotations:
        per_annotator_rmse.append(root_mean_squared_error(pred, reference))
    mean_rmse = float(np.mean(per_annotator_rmse))
    return -1 * mean_rmse


def tau(pred: list, annotations: List[List[Union[int, float]]]) -> float:
    """Return the mean Kendall's tau between ``pred`` and each annotation list."""
    correlations = []
    for reference in annotations:
        # kendalltau returns (statistic, p-value); only the statistic is used.
        statistic = kendalltau(pred, reference)[0]
        correlations.append(statistic)
    return np.mean(correlations)


def sim(pred: str, annotations: List[str], similarity_func: Callable) -> float:
    """Return the mean of ``similarity_func(pred, annotation)`` over all annotations."""
    similarities = []
    for reference in annotations:
        similarities.append(similarity_func(pred, reference))
    return float(np.mean(similarities))


def ttest(indicators, epsilon: float) -> float:
    """Return the p-value of a one-sided one-sample t-test of ``indicators`` against ``epsilon``.

    The alternative hypothesis is that the mean of ``indicators`` is less than
    ``epsilon``. Samples made up entirely of 1s or entirely of -1s are answered
    directly (1.0 and 0.0 respectively) instead of being passed to SciPy,
    which avoids the warning raised for a standard deviation of zero.
    """
    values = np.array(indicators, dtype=float)

    # (constant sample value, p-value returned for it)
    for constant, p_value in ((1, 1.0), (-1, 0.0)):
        if np.all(values == constant):
            return p_value

    result = ttest_1samp(values, epsilon, alternative='less')
    return result.pvalue


def wilcoxon_test(indicators, epsilon: float) -> float:
    """Return the p-value of a one-sided exact Wilcoxon signed-rank test.

    ``epsilon`` is subtracted from every value first, so the test asks whether
    the shifted values tend to lie below zero (``alternative='less'``).
    """
    shifted_differences = []
    for value in indicators:
        shifted_differences.append(value - epsilon)
    outcome = wilcoxon(shifted_differences, alternative='less', mode="exact")
    return outcome.pvalue


def alt_test(
    llm_annotations: Dict[Union[int, str], Any],
    humans_annotations: Dict[Union[int, str], Dict[Union[int, str], Any]],
    scoring_function: Union[str, Callable] = 'accuracy',
    epsilon: float = 0.2,
    q_fdr: float = 0.05,
    min_humans_per_instance: int = 2,
    min_instances_per_human: int = 30,
    test: str = 'ttest',
    verbose: bool = True,
) -> tuple[float, float]:
    """Run the alt-test comparing LLM annotations against each human annotator.

    For every human annotator with enough instances, the annotator is held out
    and, on each of their instances, both the held-out human and the LLM are
    scored against the remaining humans' annotations. Per instance this yields
    ``human_indicator - llm_indicator`` (each indicator is 1 when that side's
    score is at least the other's, so the difference is -1, 0 or 1). These
    differences and ``epsilon`` are passed to ``ttest`` or ``wilcoxon_test``
    to obtain one p-value per annotator, and ``by_procedure`` decides at level
    ``q_fdr`` which annotators the LLM "wins" against.

    Note the argument order: the LLM annotations come first.

    Args:
        llm_annotations: Maps instance id to the LLM prediction.
        humans_annotations: Maps annotator id to ``{instance id: annotation}``.
        scoring_function: ``'accuracy'``, ``'neg_rmse'``, ``'neg_rmse_topic'``,
            ``'tau'``, or a callable ``f(prediction, annotations) -> score``
            where larger is better.
        epsilon: Value the mean indicator difference is tested against.
        q_fdr: Level handed to ``by_procedure``.
        min_humans_per_instance: Instances annotated by fewer humans are dropped.
        min_instances_per_human: Annotators left with fewer instances are skipped.
        test: ``'ttest'`` or ``'wilcoxon'``.
        verbose: Print which instances / annotators were dropped.

    Returns:
        ``(winning_rate, advantage_prob)``: the fraction of evaluated
        annotators rejected by ``by_procedure``, and the mean over evaluated
        annotators of the share of instances where the LLM's score was at
        least the held-out human's.

    Raises:
        NotImplementedError: If ``test`` is ``'permutation'``.
        ValueError: If ``test`` or a string ``scoring_function`` is not
            recognised, or if no annotator has enough instances to be
            evaluated.
    """
    # NOTE(review): `current_data` looks like a debugging hook (it exposes the
    # indicator differences of the most recent t-test). It is kept because
    # other cells may read it - confirm and remove if unused.
    global current_data

    # Validate the test name before any work: inside the annotator loop an
    # unrecognised name would leave the p-value unassigned.
    if test == 'permutation':
        raise NotImplementedError("Permutation test not implemented.")
    if test not in ('ttest', 'wilcoxon'):
        raise ValueError(f"Unknown test {test!r}; expected 'ttest' or 'wilcoxon'.")

    # prepare alignment scoring function
    if isinstance(scoring_function, str):
        named_scorers = {
            'accuracy': accuracy,
            'neg_rmse': neg_rmse,
            'neg_rmse_topic': neg_rmse_topic,
            'tau': tau,
        }
        if scoring_function not in named_scorers:
            raise ValueError("Unknown scoring function")
        scoring_function = named_scorers[scoring_function]

    # prepare sets - i_set has humans as keys, h_set has instances as keys
    i_set, h_set = {}, {}
    for h, anns in humans_annotations.items():
        i_set[h] = list(anns)
        for i in anns:
            h_set.setdefault(i, []).append(h)

    # keep instances with enough human annotators AND an LLM annotation
    instances_to_keep = {
        i for i, annotators in h_set.items()
        if len(annotators) >= min_humans_per_instance and i in llm_annotations
    }

    if len(instances_to_keep) < len(h_set) and verbose:
        print(
            f"Dropped {len(h_set) - len(instances_to_keep)} instances with less than "
            f"{min_humans_per_instance} annotators or without an LLM annotation."
        )
    i_set = {h: [i for i in i_set[h] if i in instances_to_keep] for h in i_set}
    h_set = {i: h_set[i] for i in h_set if i in instances_to_keep}

    p_values, advantage_probs, humans = [], [], []
    for excluded_h in humans_annotations:
        # already restricted to instances that have an LLM annotation
        instances = i_set[excluded_h]
        if len(instances) < min_instances_per_human:
            if verbose:
                print(f"Skipping annotator {excluded_h} with only {len(instances)} instances < {min_instances_per_human}.")
            continue

        llm_indicators = []
        excluded_indicators = []
        for i in instances:
            human_ann = humans_annotations[excluded_h][i]
            llm_ann = llm_annotations[i]
            remaining_anns = [humans_annotations[h][i] for h in h_set[i] if h != excluded_h]
            human_score = scoring_function(human_ann, remaining_anns)
            llm_score = scoring_function(llm_ann, remaining_anns)
            # ties count for both sides, so they contribute a difference of 0
            llm_indicators.append(1 if llm_score >= human_score else 0)
            excluded_indicators.append(1 if human_score >= llm_score else 0)
        diff_indicators = [exc_ind - llm_ind for exc_ind, llm_ind in zip(excluded_indicators, llm_indicators)]

        if test == 'ttest':
            current_data = diff_indicators
            p = ttest(diff_indicators, epsilon)
        else:
            # TODO: consider testing the raw score differences
            # (human_score - llm_score) instead of the indicator differences.
            p = wilcoxon_test(diff_indicators, epsilon)
        p_values.append(p)
        advantage_probs.append(float(np.mean(llm_indicators)))
        humans.append(excluded_h)

    # Without this guard the winning rate below divides by zero.
    if not humans:
        raise ValueError(
            f"No annotator has at least {min_instances_per_human} usable instances; "
            "cannot compute the winning rate."
        )

    rejected_indices = by_procedure(p_values, q_fdr)
    advantage_prob = float(np.mean(advantage_probs))
    winning_rate = len(rejected_indices) / len(humans)
    return winning_rate, advantage_prob

### TM Evaluation

In [3]:
# Config JSONs handed to `process_responses` together with each response CSV.
data_jsons = [
    "../data/json_out/config_wiki_part1.json",
    "../data/json_out/config_wiki_part2.json",
    "../data/json_out/config_bills_part1.json",
    "../data/json_out/config_bills_part2.json",
]
# Exported human-annotation CSVs (one for bills, one for wiki - judging by the file names).
response_csvs = [
    "../data/human_annotations/Cluster+Evaluation+-+Sort+and+Rank+-+Bills_December+14,+2024_13.20.csv",
    "../data/human_annotations/Cluster+Evaluation+-+Sort+and+Rank_December+12,+2024_05.19.csv",
]
# Forwarded as `start_date` to `process_responses`
# (presumably responses before this timestamp are ignored - TODO confirm).
start_date = "2024-12-06 09:00:00"

# topic id -> responses, merged over all CSVs. Topics with no (remaining)
# responses are skipped; a topic id present in several CSVs keeps the entry
# from the CSV processed last.
responses = {}
for csv in response_csvs:
    for topic_id, topic_responses in process_responses(csv, data_jsons, start_date=start_date, path_save=None, removal_condition="loose").items():
        if topic_responses:
            responses[topic_id] = topic_responses

# Only the fourth value returned by `collect_fit_rank_data` is used here; it is
# sorted by topic id so later cells iterate topics in a fixed order.
_, _, _, corr_data = collect_fit_rank_data(responses)
corr_data = sorted(corr_data, key=lambda x: x["id"])
corr_ids = [x["id"] for x in corr_data]

#%% Locate the model output data
# Root directory of the LLM outputs; laid out as <dataset>/<model directory>/<run>.
base_path = "../data/camera_ready_llm_out/mean"
# short model name -> dataset -> list of entries found in that model's output
# directory. The loading loop below treats each entry as one seed's run
# directory. `glob` yields nothing (an empty list) when the directory is absent.
llm_data_patterns = {
    "gpt-4o": {
        "wiki": list(Path(base_path, "wiki/gpt-4o-2024-08-06/").glob("*")),
        "bills": list(Path(base_path, "bills/gpt-4o-2024-08-06/").glob("*")),
    },
    "llama-8b": {
        "wiki": list(Path(base_path, "wiki/Meta-Llama-3.1-8B-Instruct").glob("*")),
        "bills": list(Path(base_path, "bills/Meta-Llama-3.1-8B-Instruct").glob("*")),
    },
    "llama-70b": {
        "wiki": list(Path(base_path, "wiki/llama-3.3-70b-instruct-awq/").glob("*")),
        "bills": list(Path(base_path, "bills/llama-3.3-70b-instruct-awq/").glob("*")),
    },
    "qwen-2.5-72b": {
        "wiki": list(Path(base_path, "wiki/Qwen2.5-72B-Instruct-AWQ/").glob("*")),
        "bills": list(Path(base_path, "bills/Qwen2.5-72B-Instruct-AWQ/").glob("*")),
    },
    "qwen-3-8b": {
        "wiki": list(Path(base_path, "wiki/Qwen3-8B/").glob("*")),
        "bills": list(Path(base_path, "bills/Qwen3-8B/").glob("*")),
    },
    "qwen-3-32b": {
        "wiki": list(Path(base_path, "wiki/Qwen3-32B/").glob("*")),
        "bills": list(Path(base_path, "bills/Qwen3-32B/").glob("*")),
    },
    "qwen-3-30b-moe": {
        "wiki": list(Path(base_path, "wiki/Qwen3-30B-A3B/").glob("*")),
        "bills": list(Path(base_path, "bills/Qwen3-30B-A3B/").glob("*")),
    },
}
# Filled by the loading loop below: model name -> dataset -> per-topic records.
llm_fits, llm_ranks = {}, {}


#%% Load the model output data
# For every model and dataset: read the per-seed result files, regroup them by
# topic, and store the element-wise mean over seeds of the first fit/rank
# vector of each topic in `llm_fits` / `llm_ranks`.
for llm, paths_by_ds in llm_data_patterns.items():
    llm_fits[llm] = defaultdict(list)
    llm_ranks[llm] = defaultdict(list)

    for dataset, paths in paths_by_ds.items():
        fits_, ranks_ = [], []
        # iterate over all seeds
        for seed, path in enumerate(paths):
            fits_seed = open_json(f"{path}/llm_results_q2.json")
            ranks_seed = open_json(f"{path}/llm_results_q3.json")

            # point is to move from "seed by topic"
            # [[topic_0_seed_0, topic_1_seed_0, ...], [topic_0_seed_1, topic_1_seed_1, ...]]
            # to "topic by seed"
            # [[topic_0_seed_0, topic_0_seed_1, ...], [topic_1_seed_0, topic_1_seed_1, ...]]
            # NOTE(review): topics are matched across seeds by position only;
            # this assumes every seed lists the same topics in the same order.
            for i, (fit_item, rank_item) in enumerate(zip(fits_seed, ranks_seed)):
                assert fit_item["id"] == rank_item["id"]
                if seed == 0:
                    fits_.append([fit_item])
                    ranks_.append([rank_item])
                else:
                    fits_[i].append(fit_item)
                    ranks_[i].append(rank_item)

        # then we can average over all seeds
        for fit_item, rank_item in zip(fits_, ranks_):
            # named `topic_id` (not `id`) so the builtin `id` stays usable in
            # later notebook cells
            topic_id = fit_item[0]["id"]
            llm_fits[llm][dataset].append({
                "id": topic_id,
                "annotators": [llm],
                "fit_data": [np.mean([x["fit_data"][0] for x in fit_item], axis=0).tolist()],
            })
            llm_ranks[llm][dataset].append({
                "id": topic_id,
                "annotators": [llm],
                "rank_data": [np.mean([x["rank_data"][0] for x in rank_item], axis=0).tolist()],
            })

Total responses: 121
Total responses: 121
Removed: 20
Total responses: 142
Total responses: 142
Removed: 25


In [4]:
def construct_annotation_data(corr_data, combine_method=None, document_level=True, seed=42):
    """
    Build the human annotation dictionaries (fit and rank) consumed by `alt_test`.

    For every topic the annotators are visited in a random order. Depending on
    `combine_method`, the annotator at position p of that order is merged with
    the position-p annotators of other topics into one pseudo-annotator.

    Parameters:
    - corr_data: iterable of per-topic dicts with keys "id" (str),
      "n_annotators" (int), "fit_data" and "rank_data" (2-D arrays indexed
      [annotator, document]).
    - combine_method: "model" (pseudo-annotator key is the topic id without
      its last "/"-separated component), "dataset" (key is the dataset name),
      or None (no merging: one key per topic and original annotator index).
    - document_level: if True every document is its own instance
      ("<topic_id>/doc_<n>"); otherwise each topic is one instance whose
      annotation is the list of per-document values.
    - seed: anything accepted by `np.random.default_rng`.

    Returns:
    - (fit_annotation_data, rank_annotation_data), each shaped
      {"wiki" | "bills": {annotator_key: {instance_id: annotation}}}.

    Raises:
    - ValueError: if `combine_method` is not supported, or if one annotator's
      ranks contain both 0 and 1.
    """
    # Fail early: further down an unsupported value would leave `annotator_key`
    # unassigned (or silently reuse the key of the previous iteration).
    if combine_method not in ("model", "dataset", None):
        raise ValueError(
            f"Unknown combine_method {combine_method!r}; expected 'model', 'dataset' or None."
        )

    # TODO: determine if more legitimate to combine over dataset or over model
    fit_annotation_data = {"wiki": {}, "bills": {}}
    rank_annotation_data = {"wiki": {}, "bills": {}}

    rng = np.random.default_rng(seed)

    for ds in ["wiki", "bills"]:
        for topic_data in corr_data:
            topic_id = topic_data["id"]
            # a topic belongs to a dataset when the dataset name occurs in its id
            if ds not in topic_id:
                continue

            # combine annotators (this is approved by the paper authors)
            model_id = "/".join(topic_id.split("/")[:-1])
            annotator_idxs = rng.permutation(topic_data["n_annotators"])
            for id_, i in enumerate(annotator_idxs):
                # how to combine multiple annotators
                if combine_method == "model":
                    annotator_key = f"{model_id}/ann_{id_}"  # combine annotators per model
                elif combine_method == "dataset":
                    annotator_key = f"{ds}/ann_{id_}"  # combine annotators per dataset
                else:
                    annotator_key = f"{topic_id}/ann_{i}"  # original setup

                if annotator_key not in fit_annotation_data[ds]:
                    fit_annotation_data[ds][annotator_key] = {}
                    rank_annotation_data[ds][annotator_key] = {}
                n_docs = topic_data["fit_data"].shape[1]
                if 1 in topic_data["rank_data"][i] and 0 in topic_data["rank_data"][i]:
                    raise ValueError("Rank data should not have both 0 and 1.")
                if document_level:
                    for doc in range(n_docs):
                        fit_annotation_data[ds][annotator_key][f"{topic_id}/doc_{doc}"] = topic_data["fit_data"][i, doc]
                        # ranks below 1 (i.e. 0) are raised to 1; there is no upper bound
                        rank_annotation_data[ds][annotator_key][f"{topic_id}/doc_{doc}"] = max(1, topic_data["rank_data"][i, doc])
                else:
                    fit_annotation_data[ds][annotator_key][topic_id] = [topic_data["fit_data"][i, doc] for doc in range(n_docs)]
                    rank_annotation_data[ds][annotator_key][topic_id] = [max(1, topic_data["rank_data"][i, doc]) for doc in range(n_docs)]

    return fit_annotation_data, rank_annotation_data


def construct_annotation_data_random(corr_data, topics_to_combine=5, document_level=True, seed=42):
    """
    Build pseudo-annotators by pooling randomly drawn human responses.

    Per dataset, all (topic, annotator) responses are placed in a pool keyed by
    topic id. While more than `topics_to_combine` topics still have unassigned
    responses, that many distinct topics are drawn at random, one random
    response is removed from each, and the removed responses together form one
    pseudo-annotator named "<dataset>/pseudo_<n>". Responses left in the pool
    when the loop ends are not used.

    Returns (fit_annotation_data, rank_annotation_data), each shaped
    {"wiki" | "bills": {pseudo_annotator_key: {instance_id: annotation}}}.
    With `document_level=True` every document is an instance
    ("<topic_id>/doc_<n>"); otherwise each topic is one instance holding the
    list of per-document values.

    Raises ValueError if a response's ranks contain both 0 and 1.
    """
    # alternative construction
    fit_annotation_data = {"wiki": {}, "bills": {}}
    rank_annotation_data = {"wiki": {}, "bills": {}}

    rng = np.random.default_rng(seed)
    # should be around 5, so 35 items per annotator
    k = topics_to_combine

    for ds in ["wiki", "bills"]:
        # topic id -> responses (one per annotator) not yet assigned to a group
        pool = defaultdict(list)
        for entry in corr_data:
            entry_id = entry["id"]
            if ds not in entry_id:
                continue
            for annotator_idx in range(entry["n_annotators"]):
                pool[entry_id].append({
                    "annotator_id": annotator_idx,
                    "fit_data": entry["fit_data"][annotator_idx],
                    "rank_data": entry["rank_data"][annotator_idx],
                })

        n_groups = 0
        # NOTE(review): the condition is strict, so a pool of exactly k topics
        # is left ungrouped - confirm this is intended.
        while len(pool) > k:
            # Sample k distinct topics
            drawn_topics = rng.choice(list(pool.keys()), size=k, replace=False)

            # containers for this pseudo-annotator
            annotator_key = f"{ds}/pseudo_{n_groups}"
            fit_entries = {}
            rank_entries = {}
            fit_annotation_data[ds][annotator_key] = fit_entries
            rank_annotation_data[ds][annotator_key] = rank_entries

            for topic_id in drawn_topics:
                # take one random not-yet-used response for this topic
                remaining = pool[topic_id]
                picked = rng.choice(range(len(remaining)))
                response = remaining.pop(picked)
                fits = response["fit_data"]
                ranks = response["rank_data"]
                n_docs = len(fits)

                if 1 in ranks and 0 in ranks:
                    raise ValueError("Rank data should not have both 0 and 1.")

                if document_level:
                    for doc in range(n_docs):
                        doc_key = f"{topic_id}/doc_{doc}"
                        fit_entries[doc_key] = fits[doc]
                        # ranks below 1 are raised to 1
                        rank_entries[doc_key] = max(1, ranks[doc])
                else:
                    fit_entries[topic_id] = [fits[doc] for doc in range(n_docs)]
                    rank_entries[topic_id] = [max(1, ranks[doc]) for doc in range(n_docs)]

                # a topic with no responses left leaves the pool
                if not remaining:
                    del pool[topic_id]

            n_groups += 1
    return fit_annotation_data, rank_annotation_data


def construct_llm_data(llm_fits, llm_ranks, llm, document_level=True):
    """
    Flatten one model's fit and rank outputs into instance -> prediction dicts.

    For each dataset ("wiki", "bills") the model's fit and rank records are
    walked in parallel; a record is used only if the dataset name occurs in its
    id. Only the first vector of "fit_data" / "rank_data" is read. With
    `document_level=True` every document becomes an instance
    ("<topic_id>/doc_<n>"); otherwise each topic is one instance holding the
    list of per-document values.

    Returns (fit_llm_data, rank_llm_data), each shaped
    {"wiki" | "bills": {instance_id: prediction}}.
    """
    fit_llm_data = {"wiki": {}, "bills": {}}
    rank_llm_data = {"wiki": {}, "bills": {}}

    for ds in ["wiki", "bills"]:
        fit_out = fit_llm_data[ds]
        rank_out = rank_llm_data[ds]
        for fit_record, rank_record in zip(llm_fits[llm][ds], llm_ranks[llm][ds]):
            # both lists must describe the same topic at the same position
            assert fit_record["id"] == rank_record["id"]
            topic_id = fit_record["id"]
            if ds not in topic_id:
                continue

            fit_values = fit_record["fit_data"][0]
            rank_values = rank_record["rank_data"][0]
            doc_indices = range(len(fit_values))
            if not document_level:
                fit_out[topic_id] = [fit_values[doc] for doc in doc_indices]
                rank_out[topic_id] = [rank_values[doc] for doc in doc_indices]
                continue
            for doc in doc_indices:
                instance_id = f"{topic_id}/doc_{doc}"
                fit_out[instance_id] = fit_values[doc]
                rank_out[instance_id] = rank_values[doc]
    return fit_llm_data, rank_llm_data

# Make the table

In [5]:
# Run the alt-test for every combination of resample, granularity (document /
# topic level), pseudo-annotator construction, model, dataset, statistical test
# and task, collecting one result row per `alt_test` call.
results = []
epsilon = 0.1  # the notebook intro suggests 0.1 when annotators are crowd-workers
# Expected data dimensions; used only to derive `min_instances_per_human`.
n_models = 3
n_topics_per_model = 8
n_docs = 7
n_resamples = 10
topics_to_combine = 8  # for random construction

# One generator shared by all constructions below. It is passed as `seed`;
# `np.random.default_rng` hands an existing Generator back unchanged, so every
# call continues the same random stream instead of restarting it.
rng = np.random.default_rng(42)

for i in tqdm(range(n_resamples), desc="Running tests"):
    for doc_level in [True, False]:
        # n_items = number of instances a single topic contributes
        if doc_level:
            metric = "neg_rmse"
            n_items = n_docs
        else:
            metric = "neg_rmse_topic"
            n_items = 1
        
        for combine_method in ["model", "dataset", "random"]:
            # Require a pseudo-annotator to cover every instance its
            # construction is expected to give it.
            if combine_method == "model":
                min_instances_per_human = n_topics_per_model * n_items
            elif combine_method == "dataset":
                min_instances_per_human = n_models * n_topics_per_model * n_items
            elif combine_method == "random":
                min_instances_per_human =  topics_to_combine * n_items

            for llm in llm_data_patterns:
                # NOTE(review): the human annotation data do not depend on `llm`
                # but are rebuilt here with the advancing `rng`, so within one
                # resample each model is compared against a different random
                # grouping of annotators - confirm this is intended.
                if combine_method != "random":
                    fit_annotation_data, rank_annotation_data = construct_annotation_data(
                        corr_data, combine_method=combine_method, document_level=doc_level, seed=rng,
                    )
                else:
                    fit_annotation_data, rank_annotation_data = construct_annotation_data_random(
                        corr_data, topics_to_combine=topics_to_combine, document_level=doc_level, seed=rng,
                    )
                fit_llm_data, rank_llm_data = construct_llm_data(
                    llm_fits, llm_ranks, llm, document_level=doc_level
                )

                for ds in ["wiki", "bills"]:
                    for test in ["ttest", "wilcoxon"]:
                        # the same metric is applied to both the fit and the rank task
                        task_data = [
                            ("fit", fit_llm_data[ds], fit_annotation_data[ds]),
                            ("rank", rank_llm_data[ds], rank_annotation_data[ds]),
                        ]
                        for (task_name, task_llm_data, task_annotation_data) in task_data:
                            # alt_test takes the LLM annotations first, then the humans'
                            wr, ap = alt_test(
                                task_llm_data,
                                task_annotation_data,
                                metric,
                                epsilon=epsilon,
                                test=test,
                                min_instances_per_human=min_instances_per_human,
                                verbose=False,
                            )
                            results.append({
                                "llm": llm,
                                "dataset": ds,
                                "task": task_name,
                                "combine_method": combine_method,
                                "doc_level": doc_level,
                                "test": test,
                                "iter": i,
                                "winning_rate": wr,
                                "advantage_probability": ap,
                            })

Running tests:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# Turn the list of per-call result dicts into a DataFrame (one row per alt_test call).
results = pd.DataFrame(results)

In [7]:
def create_latex_summary_table(results, combine_method="dataset", format_decimals=3, as_decimal=True, show_std=True, multirow=True):
    """
    Create a LaTeX summary table with dataset as outer grouping and both doc/topic levels.

    Each cell shows the mean advantage probability of one model for one task
    (fit / rank) at one granularity (document / topic level), optionally
    followed by its standard deviation in parentheses. A superscript ``*`` is
    added when the mean t-test winning rate exceeds 0.5 and a dagger when the
    mean Wilcoxon winning rate does.

    Parameters:
    - results: DataFrame with the analysis results; must contain the columns
      llm, dataset, task, combine_method, doc_level, test, winning_rate and
      advantage_probability
    - combine_method: str, value of the ``combine_method`` column to report
    - format_decimals: int, number of decimal places (used when ``as_decimal``)
    - as_decimal: bool, print values as decimals; if False, print them as
      whole-number percentages
    - show_std: bool, append the standard deviation in parentheses
    - multirow: unused; kept so existing calls keep working

    Returns:
    - str, the LaTeX ``tabular`` environment

    Raises:
    - ValueError: if no row of ``results`` matches ``combine_method`` and the
      retained models

    Required LaTeX packages: booktabs. The model names are emitted as macros
    (e.g. ``\\gptFourO{}``) that the document has to define.
    """
    # Models to report -> LaTeX macro. The insertion order is also the row
    # order of the table.
    models_to_retain = {
        "llama-8b": r"\llamaThreeOneEightB{}",
        "qwen-3-8b": r"\qwenThreeEightB{}",
        "qwen-3-32b": r"\qwenThreeThirtyTwoB{}",
        "llama-70b": r"\llamaThreeThreeSeventyB{}",
        "qwen-2.5-72b": r"\qwenTwoFiveSeventyTwoB{}",
        "gpt-4o": r"\gptFourO{}",
    }

    selected = (
        results["llm"].isin(list(models_to_retain))
        & (results["combine_method"] == combine_method)
    )
    # explicit copy so the column assignment below never writes into `results`
    summary = results[selected].copy()
    if summary.empty:
        raise ValueError(
            f"No results found for combine_method={combine_method!r} and the retained models."
        )
    summary["llm"] = summary["llm"].map(models_to_retain)

    # Group and calculate statistics
    summary_grouped = []

    for (llm, dataset, doc_level), group in summary.groupby(['llm', 'dataset', 'doc_level']):
        row_data = {
            'llm': llm,
            'dataset': dataset,
            'doc_level': doc_level
        }

        for task in ['fit', 'rank']:
            task_rows = group[group['task'] == task]
            ttest_rows = task_rows[task_rows['test'] == 'ttest']
            wilcoxon_rows = task_rows[task_rows['test'] == 'wilcoxon']

            # The advantage probability is the same for both tests, so only the
            # t-test rows are used: counting every value twice would shrink the
            # standard deviation. Fall back to all rows if there are none.
            adv_rows = ttest_rows if len(ttest_rows) > 0 else task_rows
            adv_prob = adv_rows['advantage_probability']
            row_data[f'{task}_adv_mean'] = adv_prob.mean()
            row_data[f'{task}_adv_std'] = adv_prob.std()

            # Get winning rates for both tests
            ttest_data = ttest_rows['winning_rate']
            wilcoxon_data = wilcoxon_rows['winning_rate']

            row_data[f'{task}_ttest_mean'] = ttest_data.mean() if len(ttest_data) > 0 else 0
            row_data[f'{task}_wilcoxon_mean'] = wilcoxon_data.mean() if len(wilcoxon_data) > 0 else 0

        summary_grouped.append(row_data)

    summary_final = pd.DataFrame(summary_grouped)

    # Model order: `llm` already holds the LaTeX macros, so the order has to be
    # keyed on the macros (keying on display names would match nothing and
    # leave the rows in alphabetical order).
    model_order = {macro: pos for pos, macro in enumerate(models_to_retain.values())}
    summary_final['model_order'] = summary_final['llm'].map(model_order)
    # Define dataset order
    dataset_order = ["wiki", "bills"]
    summary_final['dataset_order'] = summary_final['dataset'].map({d: i for i, d in enumerate(dataset_order)})

    # Sort by dataset and model order
    summary_final = summary_final.sort_values(['dataset_order', 'model_order'])

    # Create LaTeX table
    latex_lines = []
    latex_lines.append(r"\begin{tabular}{lllll}")
    latex_lines.append(r"\toprule")
    latex_lines.append(r"  & \multicolumn{2}{c}{Document-Level $\rho$} & \multicolumn{2}{c}{Topic-Level $\rho$} \\")
    latex_lines.append(r"\cmidrule(lr){2-3} \cmidrule(lr){4-5}")
    latex_lines.append(r"   & Fit & Rank & Fit & Rank \\")
    latex_lines.append(r"\midrule")

    datasets = summary_final['dataset'].unique()
    for dataset_idx, dataset in enumerate(datasets):
        dataset_data = summary_final[summary_final['dataset'] == dataset]

        # Add dataset header
        latex_lines.append(rf"& \multicolumn{{4}}{{c}}{{\texttt{{{dataset}}}}}\\")
        latex_lines.append(r"\cmidrule(lr){2-5}")

        # One table row per model present for this dataset
        for model in dataset_data['llm'].unique():
            line_parts = [model]

            # Process document-level (True) and topic-level (False) in order
            for doc_level in [True, False]:
                model_doc_data = dataset_data[(dataset_data['llm'] == model) & (dataset_data['doc_level'] == doc_level)]

                for task in ['fit', 'rank']:
                    if len(model_doc_data) == 0:
                        # Handle missing data
                        line_parts.append("--")
                        continue

                    row = model_doc_data.iloc[0]
                    adv_mean = row[f'{task}_adv_mean']
                    adv_std = row[f'{task}_adv_std']
                    ttest_mean = row[f'{task}_ttest_mean']
                    wilcoxon_mean = row[f'{task}_wilcoxon_mean']

                    if as_decimal:
                        adv_str = f"${adv_mean:.{format_decimals}f}"
                        if show_std:
                            adv_str += rf"\ ({adv_std:.{format_decimals}f})"
                    else:
                        adv_str = rf"${adv_mean*100:0.0f}"
                        if show_std:
                            adv_str += rf"\ ({adv_std*100:0.0f})"

                    # superscript markers for the tests whose mean winning rate exceeds 0.5
                    markers = ""
                    if ttest_mean > 0.5 or wilcoxon_mean > 0.5:
                        markers = "^{"
                        if ttest_mean > 0.5:
                            markers += "{*}"
                        if wilcoxon_mean > 0.5:
                            markers += r"\dagger"
                        markers += "}"
                    markers += "$"
                    line_parts.append(f"{adv_str}{markers}")

            line = " & ".join(line_parts) + r" \\"
            latex_lines.append(line)

        if dataset_idx < len(datasets) - 1:
            latex_lines.append(r"\midrule")

    latex_lines.append(r"\bottomrule")
    latex_lines.append(r"\end{tabular}")

    return "\n".join(latex_lines)

In [8]:
# pseudo-annotators by dataset
print(create_latex_summary_table(
    results,
    combine_method="dataset",
    format_decimals=2,
    as_decimal=True,
    show_std=False,
))

\begin{tabular}{lllll}
\toprule
  & \multicolumn{2}{c}{Document-Level $\rho$} & \multicolumn{2}{c}{Topic-Level $\rho$} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
   & Fit & Rank & Fit & Rank \\
\midrule
& \multicolumn{4}{c}{\texttt{wiki}}\\
\cmidrule(lr){2-5}
\gptFourO{} & $0.56^{{*}\dagger}$ & $0.68^{{*}\dagger}$ & $0.66^{\dagger}$ & $0.55^{\dagger}$ \\
\llamaThreeOneEightB{} & $0.22$ & $0.36$ & $0.05$ & $0.11$ \\
\llamaThreeThreeSeventyB{} & $0.57^{{*}\dagger}$ & $0.67^{{*}\dagger}$ & $0.58^{\dagger}$ & $0.50^{\dagger}$ \\
\qwenThreeEightB{} & $0.56^{{*}\dagger}$ & $0.58^{\dagger}$ & $0.46$ & $0.39$ \\
\qwenThreeThirtyTwoB{} & $0.55^{{*}\dagger}$ & $0.63^{\dagger}$ & $0.47$ & $0.42$ \\
\qwenTwoFiveSeventyTwoB{} & $0.52^{\dagger}$ & $0.68^{{*}\dagger}$ & $0.66^{\dagger}$ & $0.46$ \\
\midrule
& \multicolumn{4}{c}{\texttt{bills}}\\
\cmidrule(lr){2-5}
\gptFourO{} & $0.65^{{*}\dagger}$ & $0.71^{{*}\dagger}$ & $0.77^{{*}\dagger}$ & $0.75^{{*}\dagger}$ \\
\llamaThreeOneEightB{} & $0.30$ & $0.

In [9]:
# pseudo-annotators by model
print(create_latex_summary_table(
    results,
    combine_method="model",
    format_decimals=2,
    as_decimal=True,
    show_std=False,
))

\begin{tabular}{lllll}
\toprule
  & \multicolumn{2}{c}{Document-Level $\rho$} & \multicolumn{2}{c}{Topic-Level $\rho$} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5}
   & Fit & Rank & Fit & Rank \\
\midrule
& \multicolumn{4}{c}{\texttt{wiki}}\\
\cmidrule(lr){2-5}
\gptFourO{} & $0.58^{\dagger}$ & $0.68^{\dagger}$ & $0.67$ & $0.55$ \\
\llamaThreeOneEightB{} & $0.22$ & $0.37$ & $0.04$ & $0.12$ \\
\llamaThreeThreeSeventyB{} & $0.58^{\dagger}$ & $0.67^{\dagger}$ & $0.58$ & $0.48$ \\
\qwenThreeEightB{} & $0.58^{\dagger}$ & $0.58$ & $0.46$ & $0.38$ \\
\qwenThreeThirtyTwoB{} & $0.57^{\dagger}$ & $0.63$ & $0.51$ & $0.42$ \\
\qwenTwoFiveSeventyTwoB{} & $0.53^{\dagger}$ & $0.68^{\dagger}$ & $0.67$ & $0.46$ \\
\midrule
& \multicolumn{4}{c}{\texttt{bills}}\\
\cmidrule(lr){2-5}
\gptFourO{} & $0.65^{{*}\dagger}$ & $0.71^{{*}\dagger}$ & $0.77$ & $0.76$ \\
\llamaThreeOneEightB{} & $0.30$ & $0.53^{\dagger}$ & $0.14$ & $0.44$ \\
\llamaThreeThreeSeventyB{} & $0.66^{{*}\dagger}$ & $0.67^{\dagger}$ & $0.69$ & $0.