In [None]:
import pandas as pd

# Load the evaluation results table (plain string: the path has no placeholders).
results = pd.read_csv("results/eval_mmlu-7b-sub_200k-ut.csv")

# Raw `query_model` identifiers -> LaTeX labels stored in `train_mode`.
model_map = {
    "Llama2-7b-all_200k_c-ut-new-hqg9f60c": r"$\mathrm{UT}$",
    "Llama-2-7b-sub_all_200k-it-xcmpque2": r"$\mathrm{IT}$",
}

# Fail loudly (KeyError, as before) if the CSV contains a model that has no
# label, but say which identifiers are missing instead of a bare key.
unmapped_models = set(results["query_model"].unique()) - set(model_map)
if unmapped_models:
    raise KeyError(
        f"query_model values missing from model_map: {sorted(unmapped_models, key=str)}"
    )
results["train_mode"] = results["query_model"].map(model_map)

# Keep only the test split. `.copy()` makes the filtered frame independent of
# the full table so later cells can add columns without SettingWithCopyWarning.
results = results[results["split"] == "test"].copy()

In [None]:
# 1-based task IDs, numbered in order of first appearance of each dataset
# name in `results`.
dname_map = {d: idx for idx, d in enumerate(results.dataset.unique(), start=1)}

# Build the plotting frame as a *new* DataFrame. Assigning columns onto a
# boolean-mask slice of `results` triggers SettingWithCopyWarning; `.assign`
# returns a fresh frame and derives every column from the filtered rows
# themselves rather than relying on index alignment with `results`.
plt_results = results[results.split == "test"].assign(
    task_idx=lambda df: df["dataset"].map(dname_map),
    # Rescale the [0, 1]-style metrics by 100 for percentage axes.
    qa_unc_ece_100=lambda df: df["qa_unc_ece"] * 100.,
    unc_acc_100=lambda df: df["unc_acc"] * 100.,
)
plt_results

## Relative Improvement

In [None]:
# NOTE: this whole cell is disabled (commented out) and currently produces no output.
# What the code below would do if re-enabled:
#   * pick one column of `plt_results` (`metric`) and split it into the UT and IT rows;
#   * use `flip = -1` for the ECE-type metrics so the sign of the difference is reversed;
#   * print how many entries satisfy `flip * ut <= flip * it`;
#   * draw a histogram of `(ut - it) * flip / it * 100`, i.e. the relative change in percent.
# The element-wise arithmetic assumes the UT and IT rows come out in the same
# task order -- TODO confirm before re-enabling.
# If re-enabled, prefer raw strings (r"$\mathrm{UT}$") for the label comparisons,
# since "\m" is not a valid escape sequence in a normal string literal.
# import seaborn as sns
# import matplotlib.pyplot as plt
# import numpy as np

# sns.set(font_scale=2., style='whitegrid')

# metric = "qa_unc_ece_100"
# flip = -1 if metric in ["ece", "qa_unc_ece", "qa_unc_ece_100"] else 1

# ut_metric = plt_results[plt_results.train_mode == "$\mathrm{UT}$"][[metric]].values.flatten()
# it_metric = plt_results[plt_results.train_mode == "$\mathrm{IT}$"][[metric]].values.flatten()

# print(sum(flip * ut_metric <= flip * it_metric))

# fig, ax = plt.subplots(figsize=(5,5))

# ax.set_yticks(np.arange(0, 15, 5))

# ax.hist((((ut_metric - it_metric) * flip / it_metric)) * 100.)

# fig.show()

## Task Breakdown

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Per-task bar chart of `unc_acc_100`, one bar per `train_mode` for each task.
# The canvas is extremely wide (200 x 40 in), so all font sizes are scaled up
# to remain legible.
sns.set(font_scale=6., style='whitegrid')

fig, ax = plt.subplots(figsize=(200, 40))

sns.barplot(ax=ax, data=plt_results,
            x="task_idx", y="unc_acc_100", hue="train_mode",
            width=0.68,
            palette=sns.color_palette("tab20")[8:10])

ax.set_ylabel('UQ Acc. (%)', fontsize=250, labelpad=100)
ax.set_ylim(30, 80)
# The small epsilon makes the upper bound (80) an included tick.
ax.set_yticks(np.arange(30, 80 + 1e-3, 10))

ax.set_xlabel('MMLU Task ID', fontsize=250, labelpad=100)

# Resize tick labels through tick_params instead of round-tripping them via
# set_*ticklabels(get_*ticklabels()), which matplotlib discourages and which
# freezes the label text at whatever it was when the call ran.
ax.tick_params(axis='both', labelsize=150)

# Rebuild the legend seaborn created so its title is blank and its font
# size and anchor box can be set explicitly.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, loc='best',
          title='', title_fontsize=200,
          prop=dict(size=180), bbox_to_anchor=(.91, .7, .1, .1))

fig.tight_layout()
fig.show()
# fig.savefig("mmlu_correctness_comparison.pdf", bbox_inches="tight")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Per-task bar chart of `qa_unc_ece_100`, one bar per `train_mode` for each
# task, saved to PDF. The canvas is extremely wide (200 x 40 in), so all font
# sizes are scaled up to remain legible.
sns.set(font_scale=6., style='whitegrid')

fig, ax = plt.subplots(figsize=(200, 40))

sns.barplot(ax=ax, data=plt_results,
            x="task_idx", y="qa_unc_ece_100", hue="train_mode",
            width=0.68,
            palette=sns.color_palette("tab20")[8:10])

ax.set_ylabel(r'$\mathrm{ECE} (\%)$', fontsize=250, labelpad=100)
# The small epsilon makes the upper bound (30) an included tick.
ax.set_yticks(np.arange(0, 30 + 1e-3, 10))

ax.set_xlabel('MMLU Task ID', fontsize=250, labelpad=100)

# Resize tick labels through tick_params instead of round-tripping them via
# set_*ticklabels(get_*ticklabels()), which matplotlib discourages and which
# freezes the label text at whatever it was when the call ran.
ax.tick_params(axis='both', labelsize=150)

# Rebuild the legend seaborn created so its title is blank and its font
# size can be set explicitly.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, loc='upper left',
          title='', title_fontsize=200,
          prop=dict(size=180))

fig.tight_layout()
fig.show()
fig.savefig("mmlu_ece_comparison.pdf", bbox_inches="tight")