# Compare the model likelihood of the same sentence under two models.

> :warning: **Environment**: ipython has a lot of dependencies and is not part of the main training environment. Hence, this notebook needs to run in a separate environment.


In [None]:
import sys
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from omegaconf import OmegaConf
import numpy as np


# Put the parent of the current working directory on the import path before
# the project-local import below (presumably the notebook is started from
# inside the `notebooks/` folder — confirm).
sys.path.append(os.path.dirname(os.getcwd()))

from notebooks.utils import styles

# Base directory for the CSV and figure outputs written by this notebook;
# replace the placeholder with a real path.
SAVE_BASE_PATH = "/path/to/your/figures/benchmarks"


## Load data

Load the data from the pickle files and store it in a pandas DataFrame.

In [None]:
# Fill in the 8 digit ID of your saved experiment data
results_likelihoods_names = ["1234-abcd"]
relative_path_benchmark = "pubmedqa/meta-llama_Meta-Llama-3-8B/results-categories-medical_qa-0-shot.pkl"
# Used for the output folder, file names, metric column names and as the key
# into the chance-performance table below.
BENCHMARK_NAME = "PubMedQA"
# Human-readable benchmark name used in plot legends.
BENCHMARK_NAME_PRETTY = "PubMedQA"

##### LOAD BY RUN NAMES
run_names = results_likelihoods_names
paths_likelihoods = [
    f"/path/to/your/artifacts/model_likelihood/{run_name}/model_likelihood.pkl" for run_name in run_names
]

# Every likelihood run is paired with the same benchmark result file.
paths_benchmarks = [os.path.join("/path/to/your/artifacts/Domain-Certification/benchmarks/", relative_path_benchmark) for _ in run_names]

print(paths_likelihoods)
print(f"Loaded {len(paths_likelihoods)} likelihood outputs.")

# ID_NAME = "task_data_int_sort"
DOMAIN = "MedicalQA"
# Data config names that count as in-domain (target domain) samples.
ID_NAMES = ["pubmedqa", "pubmedqa_generated"]

##### LOAD RESULTS FROM LIKELIHOODS
# load table with $k$s for certificate. !!!! This table is generated by the notebook `results_likelihoods.ipynb`
ood_bound_table_path = "/path/to/your/code/src/notebooks/k_given_eps_table_<DATASET>_DC_quantile_1.00.csv"


##### PROCESS RESULTS
ood_bound_table = pd.read_csv(ood_bound_table_path, sep="\t")
# Use the `k_mid` column as the threshold `k` for each epsilon.
ood_bound_table["k"] = ood_bound_table["k_mid"]
print(f"Loaded bound table for log10 epsilons from min={ood_bound_table['log10_eps'].min()} to max={ood_bound_table['log10_eps'].max()}")

# Raises KeyError for benchmarks without a registered chance level.
benchmark_chance_performance = {"MMLU": 0.25, "PubMedQA": 0.33}[BENCHMARK_NAME]

assert len(paths_likelihoods) == len(paths_benchmarks)

# Append the domain/benchmark sub-folder only once, so that re-running this
# cell does not nest the output directory one level deeper each time.
_benchmark_subdir = os.path.join(DOMAIN, BENCHMARK_NAME)
if not os.path.normpath(SAVE_BASE_PATH).endswith(os.sep + _benchmark_subdir):
    SAVE_BASE_PATH = os.path.join(SAVE_BASE_PATH, _benchmark_subdir)
os.makedirs(SAVE_BASE_PATH, exist_ok=True)

In [None]:
# Load each (likelihood, benchmark) result pair; pairs that cannot be loaded
# are skipped with a message instead of aborting the notebook.
experiments = []

for path_ll, path_benchmark in zip(paths_likelihoods, paths_benchmarks):
    # NOTE: unpickling can execute arbitrary code; only point these paths at
    # artifacts you produced yourself.
    try:
        with open(path_ll, "rb") as f:
            ll_results = pickle.load(f)
        with open(path_benchmark, "rb") as f:
            benchmark_results = pickle.load(f)
    except Exception as err:
        # Best effort: keep going, but report why the pair was skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f"Could not load: {path_ll} or {path_benchmark} ({type(err).__name__}: {err})")
    else:
        experiments.append((ll_results, benchmark_results))


# compute the log likelihoods
# For every loaded run, build one DataFrame with a row per sample that holds
# the likelihood statistics, the benchmark outcome and the (repeated) run config.
records = []
for i, (experiment, benchmark_scores) in enumerate(experiments):
    data = experiment['data']

    # When one of these keys is missing, fall back to its "<key>_generator"
    # variant so the rest of the cell can rely on a single name.
    for key in ("x_text", "y_text", "n_token_prompt", "sequence_length"):
        if key not in data:
            data[key] = data[f"{key}_generator"]

    # check lengths
    num_samples_ll = len(data["x_text"])
    num_samples_benchmark = len(benchmark_scores["results"])
    if num_samples_benchmark > num_samples_ll:
        # Keep only the first `num_samples_ll` benchmark rows. This assumes both
        # files list the samples in the same order — TODO confirm.
        print(
            f"Experiment {i}: truncating benchmark results from "
            f"{num_samples_benchmark} to {num_samples_ll} samples."
        )
        benchmark_scores["results"] = benchmark_scores["results"][:num_samples_ll]
        num_samples_benchmark = len(benchmark_scores["results"])
    assert num_samples_ll == num_samples_benchmark, (
        f"Experiment {i}: {num_samples_ll} likelihood samples vs. "
        f"{num_samples_benchmark} benchmark results"
    )

    N_samples = data["n_token_prompt"].shape[0]
    config = OmegaConf.to_container(experiment['config'])

    # Sum over the last axis (presumably the token axis — confirm) to get one
    # value per sample, then rescale by ln(2) / ln(10); this yields base-2 /
    # base-10 logs if the stored values are natural logs (TODO confirm).
    ll_model = data["log_likelihoods_model"].sum(-1)
    ll_generator = data["log_likelihoods_generator"].sum(-1)
    ll2_model = ll_model / np.log(2)
    ll2_generator = ll_generator / np.log(2)
    ll10_model = ll_model / np.log(10)
    ll10_generator = ll_generator / np.log(10)
    n_token_response = data["sequence_length"] - data["n_token_prompt"].squeeze()

    # Model-vs-generator log ratio, raw and divided by the response length.
    log2_ratio = ll2_model - ll2_generator
    norm_log2_ratio = log2_ratio / n_token_response

    benchmark_correct = [int(x["correct"]) for x in benchmark_scores["results"]]
    subject = [x["subject"] for x in benchmark_scores["results"]]

    # get response string length
    n_char_correct_response_benchmark = [len(row["choices"][row["correct_answer_id"]]) for row in benchmark_scores["results"]]
    n_char_judged_response_benchmark = [len(x) for x in data["y_text"]]

    # Flatten the nested config into a single-row frame with dotted column names.
    config_columns = pd.json_normalize(config)
    dist_F = config_columns['model.target_distribution'][0]
    dist_G = config_columns['generator.target_distribution'][0]

    entropy_model = data["entropy_model"].sum(-1)
    entropy_generator = data["entropy_generator"].sum(-1)

    config_columns["distributions"] = f"F({dist_F})||G({dist_G})"

    # "dataset" means the prompt length varies per sample; otherwise the
    # configured value is used for every sample.
    if config["inference"]["prompt_length"] == "dataset":
        prompt_length = data["n_token_prompt"].squeeze()
    else:
        prompt_length = np.full((N_samples,), int(config["inference"]["prompt_length"]))


    data_columns = pd.DataFrame({
        "ll2_model": ll2_model,
        "ll2_generator": ll2_generator,
        "ll10_model": ll10_model,
        "ll10_generator": ll10_generator,
        "ll2_model_norm": ll2_model / n_token_response,
        "ll2_generator_norm": ll2_generator / n_token_response,
        "ll10_model_norm": ll10_model / n_token_response,
        "ll10_generator_norm": ll10_generator / n_token_response,
        "log2_ratio": log2_ratio,
        "log2_ratio_norm": norm_log2_ratio,
        "entropy_model": entropy_model,
        "entropy_generator": entropy_generator,
        "x": data["x_text"],
        "y": data["y_text"],
        "sequence_length": data["sequence_length"],
        "n_token_prompt": data["n_token_prompt"].squeeze(),
        "prompt_length": prompt_length,
        "n_token_response": n_token_response,
        "benchmark_correct": benchmark_correct,
        "n_char_correct_response_benchmark": n_char_correct_response_benchmark,
        "n_char_judged_response_benchmark": n_char_judged_response_benchmark,
        "category": subject,
    })

    # Every sample must have at least one response token, otherwise the
    # length-normalised columns above are meaningless.
    assert data_columns["n_token_response"].min() > 0, (
        f"Experiment {i}: found a sample without response tokens"
    )

    # combine and repeat config_columns to match data_columns
    # (repeat the single config row by index instead of concatenating one
    # frame per sample)
    config_columns = config_columns.loc[
        config_columns.index.repeat(len(data_columns))
    ].reset_index(drop=True)
    combined = pd.concat([config_columns, data_columns], axis=1)

    records.append(combined)

if not records:
    raise ValueError(
        "No experiment could be processed; check the likelihood and benchmark paths."
    )

# Stack the per-run frames. Each of them is indexed from 0, so give the
# combined frame a fresh unique index instead of repeated labels.
df = pd.concat(records, ignore_index=True).copy()

# A sample is out-of-domain (OOD) when its data_config_name is not in ID_NAMES
df["OOD"] = (~df["data_config_name"].isin(ID_NAMES)).astype(int)
df["Dataset"] = df["OOD"].map({0: r"Target Domain $\mathcal{D}_{T}$", 1: r"Other $\mathcal{D}_{F}$"})

# clean performance: mean benchmark correctness over all samples, before any
# threshold-based rejection is applied
CLEAN_PERFORMANCE = df["benchmark_correct"].mean().item()
print(f"{BENCHMARK_NAME} clean performance: {CLEAN_PERFORMANCE:.2%}")

# print model
print(f"Model:       {df['model.name_or_path'].unique()[0]}")
print(f"Generator:   {df['generator.name_or_path'].unique()[0]}")

# Save df

In [None]:
# save data
# Write the combined per-sample table as a tab-separated file (no index
# column); the file name joins all run names with "_".
all_run_names = "_".join(run_names)
save_path = os.path.join(SAVE_BASE_PATH, f"df_{all_run_names}.csv")
df.to_csv(save_path, sep="\t", index=False)
print(f"Saved to {save_path}")

# General Functions

In [None]:
def get_log_upper_bound_m(log_g: float, N: int, T: int, k: float) -> float:
    """Return ``log_g + log2(T) + k * N``.

    Judging by the name, this is a log-domain upper bound derived from a
    generator log-likelihood; the exact meaning of the inputs is not visible
    here, so the descriptions below are assumptions to confirm.

    Args:
        log_g: Log value the bound starts from (presumably base 2, to match
            the ``log2(T)`` term).
        N: Integer multiplier of ``k`` (presumably a token count).
        T: Positive integer whose base-2 logarithm is added.
        k: Per-unit threshold multiplied by ``N``.

    Returns:
        The sum of the three terms.
    """
    length_term = np.log2(T)
    threshold_term = k * N
    return log_g + length_term + threshold_term


# FRR vs Epsilon-Certificate

In [None]:
# for old files with outdated naming
if "log10_cr_10" not in ood_bound_table.columns:
    ood_bound_table["log10_cr_10"] = ood_bound_table["cr_10"]
    ood_bound_table["log10_cr_median"] = ood_bound_table["cr_median"]
    ood_bound_table["log10_cr_90"] = ood_bound_table["cr_90"]

# convert ood_bound_table to dict: log10_eps -> k
ood_bound_dict = dict(zip(ood_bound_table["log10_eps"], ood_bound_table["k"]))

# k -> the three log10 CR columns of the same table row. Built column-wise:
# `iterrows()` does not preserve dtypes, so calling `.item()` on its values
# breaks as soon as the table contains a non-numeric column.
ood_k_cr_dict = {
    float(k_value): {
        "log10_cr_10": float(cr_10),
        "log10_cr_median": float(cr_median),
        "log10_cr_90": float(cr_90),
    }
    for k_value, cr_10, cr_median, cr_90 in zip(
        ood_bound_table["k"],
        ood_bound_table["log10_cr_10"],
        ood_bound_table["log10_cr_median"],
        ood_bound_table["log10_cr_90"],
    )
}


metrics = []

# Not used in this cell; kept because other cells may rely on it.
df_ood = df.loc[df["OOD"] == 1]

# Loop-invariant pieces, computed once instead of once per threshold.
benchmark_score = df["benchmark_correct"].astype(bool)
category_masks = {category: df["category"] == category for category in df["category"].unique()}

for e_log, k in ood_bound_dict.items():
    # A sample is rejected when its length-normalised log2 ratio exceeds k.
    preds = df['log2_ratio_norm'] > k
    sample_accepted = ~preds
    # Counted as correct only if the answer is right AND the sample was accepted.
    correct_at_k = (benchmark_score & sample_accepted)

    correct_at_k_per_category = {
        category: correct_at_k[mask].mean() for category, mask in category_masks.items()
    }

    cr_values = ood_k_cr_dict[k]

    frr = preds.mean() # in the benchmark test, we only have ID samples
    accuracy_at_k = correct_at_k.mean()

    metrics.append({
        "e_log": e_log,
        "FRR": frr,
        BENCHMARK_NAME: accuracy_at_k,
        "k": k,
        "log10_cr_median": cr_values["log10_cr_median"],
        "log10_cr_10": cr_values["log10_cr_10"],
        "log10_cr_90": cr_values["log10_cr_90"],
        **{f"{BENCHMARK_NAME}_{category}": c for category, c in correct_at_k_per_category.items()}
    })

metrics = pd.DataFrame(metrics)

# Long format: one row per (epsilon, metric) pair for FRR and overall accuracy.
metrics_long = pd.melt(metrics, id_vars=["e_log", "log10_cr_median"], value_vars=["FRR", BENCHMARK_NAME], var_name="Metric", value_name="Value")

In [None]:
# Benchmark-accuracy rows with log10 epsilon in [-20, 2].
subset = metrics_long.loc[(metrics_long["e_log"] >= -20) & (metrics_long["e_log"] <= 2) & (metrics_long["Metric"] == BENCHMARK_NAME)] # subset for plotting the most interesting range
# The cell displays the full long-format table, not `subset`.
metrics_long

In [None]:
# Plot FRR and benchmark accuracy against log10(epsilon) and save the figure as PDF.
plt.clf()
sns.set_theme(style="whitegrid")
plt.rcParams.update(styles.third)

x_axis = "e_log"
# (column to plot, legend label) for each curve, drawn on the same axes.
curves = (
    ("FRR", "FRR"),
    (BENCHMARK_NAME, rf"{BENCHMARK_NAME_PRETTY}@$\epsilon$"),
)
for y_column, curve_label in curves:
    g = sns.lineplot(data=metrics, x=x_axis, y=y_column, label=curve_label)

g.set(
    ylabel="",
    yticks=[0, 0.25, 0.5, 0.75, 1],
    xlabel=r"$\text{log}_{10}$ $\epsilon$-DC",
)

# PubMedQA: legend above the axes, x-range limited to the plotted epsilon window
g.legend(loc="upper center", bbox_to_anchor=(0.5, 1.25), ncol=2, frameon=False, columnspacing=0.5)
g.set(xlim=(-20, 2))
g.set(xticks=[-20, -15, -10, -5, 0])

save_path = os.path.join(SAVE_BASE_PATH, f"benchmark_epsilon_dc_{DOMAIN}_{BENCHMARK_NAME}.pdf")
plt.savefig(save_path, bbox_inches="tight")
print(f"Saved figure to {save_path}")

In [None]:
# Plot FRR and benchmark accuracy against the `log10_cr_median` column of the
# metrics table and save the figure as PDF.
plt.clf()
sns.set_theme(style="whitegrid")
plt.rcParams.update(styles.third)

x_axis = "log10_cr_median"
g = sns.lineplot(data=metrics, x=x_axis, y="FRR", label="FRR")
g = sns.lineplot(data=metrics, x=x_axis, y=BENCHMARK_NAME, label=rf"{BENCHMARK_NAME_PRETTY}@$\epsilon$")
g.set(ylabel="")
g.set(xlabel=r"$\log_{10} CR_k$ (Median)")
g.set(yticks=[0, 0.25, 0.5, 0.75, 1])

# Legend above the axes, same layout as the epsilon plot.
g.legend(loc="upper center", bbox_to_anchor=(0.5, 1.25), ncol=2, frameon=False, columnspacing=0.5)
g.set(xlim=(-1, 42))

save_path = os.path.join(SAVE_BASE_PATH, f"benchmark_cr_{DOMAIN}_{BENCHMARK_NAME}.pdf")
# Actually write the file: the save call used to be commented out while the
# message below still reported the figure as saved.
plt.savefig(save_path, bbox_inches="tight")
print(f"Saved figure to {save_path}")