## Make nice table and plots

This notebook just serves to collect results from the different run folders and log files and arrange the values in a tabulare format that can be directly pasted into LaTeX.

In [3]:
import os
os.chdir("../")

In [4]:
import json

## Load Final Eval files (only Auto-GDA lines in the table)

In [None]:
Obtaining the results for this experiment first required hyperparameter-tuning via ```src\scripts\hyper_opt4.py``` and performing the actual evaluation runs via ```run_multiple_seeds.sh```

In [58]:
import json
import numpy as np
import os
## Use line according to which teacher model is used.
datasets = [("ragtruth", "-Summary", "tasksource"), ("ragtruth", "-QA", "tasksource"), ("lfqa", "_alignscore", "vectara_v2"), ("summall", "_alignscore", "bart-large-tasksource")] #default (best performing model as teacher, main paper)
#datasets = [("rtsumm", "_gptq", "vectara_v2-tasksource"), ("rtqa", "_gptq", "bart-large-tasksource"), ("lfqa", "_gpt", "tasksource"), ("summall", "_gptq", "tasksource-tasksource")] # learning from gpt (table in Appendix) 
datasets = [("rtsumm", "_tasksource_self2", "tasksource-tasksource"), ("rtqa", "_tasksource_self2", "tasksource-tasksource"), ("lfqa", "_tasksource_self2", "tasksource-tasksource"), ("summall", "_tasksource", "tasksource-tasksource")] # self-supervised setup (table in Appendix)
models = ["flan-t5-base", "bart-large", "tasksource"]
metric = "roc"
res_mat = np.ones((len(datasets), len(models), 5))*float("nan")
for seed in range(1,6):
    for idx, (dset, dgroup, pinit) in enumerate(datasets):
        log_path = f"eval_run-{dset}{dgroup}/seed_{seed}-{pinit}/eval_out_unweighted_test_list.json"
        if os.path.exists(log_path):
            initial_scores = json.load(open(log_path))
            for midx, mkey in enumerate(models):
                if dset == "lfqa" or dset =="summall":
                    if mkey in initial_scores["1"]:
                        res_mat[idx, midx, seed-1] = initial_scores["1"][mkey][metric]
                else:
                    if "1" in initial_scores and mkey in initial_scores["1"]:
                        res_mat[idx, midx, seed-1] = initial_scores["1"][mkey][metric]
                    if "2" in initial_scores and mkey in initial_scores["2"]:
                        res_mat[idx, midx, seed-1] = initial_scores["2"][mkey][metric]

In [59]:
res_mat

array([[[0.70045348, 0.68086312, 0.6963646 , 0.69294838, 0.69539717],
        [0.82492631, 0.80384703, 0.82612803, 0.81558461, 0.80399063],
        [0.81735319, 0.83697377, 0.82175195, 0.83367848, 0.84098708]],

       [[0.74045868, 0.7189382 , 0.73875175, 0.73876926, 0.73095238],
        [0.79333859, 0.79120273, 0.7858806 , 0.76870623, 0.79292279],
        [0.80276611, 0.80879289, 0.81872812, 0.81408438, 0.78932511]],

       [[0.79450758, 0.82575758, 0.73579545, 0.74621212, 0.78219697],
        [0.9157197 , 0.94981061, 0.91098485, 0.94128788, 0.92424242],
        [0.92424242, 0.94128788, 0.92518939, 0.90435606, 0.92045455]],

       [[0.80777277, 0.81823726, 0.80337692, 0.82633442, 0.82151508],
        [0.84676367, 0.85450087, 0.88014331, 0.85082496, 0.86053563],
        [0.89174699, 0.88885877, 0.90040316, 0.89098894, 0.87976217]]])

In [60]:
means = np.nanmean(res_mat, axis=2)
stds = np.nanstd(res_mat, axis=2)
print_names = {"flan-t5-base": "Flan-T5 (Ours)", "bart-large": "BART (Ours)", "tasksource": "DeBERTaV2 (Ours)"}
for midx, mkey in enumerate(models):
    str_print = print_names[mkey]
    row_list=[]
    for idx, (dset, dgroup, _) in enumerate(datasets):
        str_print += r" & " + f"{means[idx, midx]:.3f}" + r" & \fstd{" + f"{stds[idx, midx]:.3f}" +"}"
        row_list.append(means[idx, midx])
    row_list = np.array(row_list)
    str_print += f" & {row_list.mean():.3f}  "
    print(str_print + r"\\")

Flan-T5 (Ours) & 0.693 & \fstd{0.007} & 0.734 & \fstd{0.008} & 0.777 & \fstd{0.033} & 0.815 & \fstd{0.009} & 0.755  \\
BART (Ours) & 0.815 & \fstd{0.010} & 0.786 & \fstd{0.009} & 0.928 & \fstd{0.015} & 0.859 & \fstd{0.012} & 0.847  \\
DeBERTaV2 (Ours) & 0.830 & \fstd{0.009} & 0.807 & \fstd{0.010} & 0.923 & \fstd{0.012} & 0.890 & \fstd{0.007} & 0.863  \\


## Baselines (remaining lines of the table)

In [61]:
import json
import matplotlib.pyplot as plt
import numpy as np

dsetlist = [("ragtruth", "Summary"), ("ragtruth", "QA"), ("lfqa-veri", "all"), ("summedits", "all")]

baselines = {"flan-t5-base": "FLAN-T5", "tasksource": "DeVERTaV2",  "bart-large": "BART-large", "minicheck-t5": "MiniCheck-T5", "alignscore": "AlignScore", "vectara_v2": "Vectara-2.1",
                "gpt-4o": "GPT-4o", "gpt-4o-mini": "GPT-4o-mini", "gpt-3.5-turbo": "GPT-3.5"}

## Load logfiles
res_dict = {}
for dset, group in dsetlist:
    if dset not in res_dict:
        res_dict[dset] = json.load(open(f"results/eval_baselines_{dset}_nofinetune.json"))
        
metric ="roc"

for baseline, pname in baselines.items():
    row_str = pname +" & "
    row_list = []
    for dset, group in dsetlist:
        if baseline in res_dict[dset][group]:
            row_str += f"{res_dict[dset][group][baseline][metric]:.3f} & & "
            row_list.append(res_dict[dset][group][baseline][metric])
        else:
            row_str += " - & & "
    row_list = np.array(row_list)
    row_str += f"{row_list.mean():.3f} "
    print(row_str+ r"\\")

FLAN-T5 & 0.734 & & 0.708 & & 0.655 & & 0.700 & & 0.699 \\
DeVERTaV2 & 0.782 & & 0.530 & & 0.645 & & 0.876 & & 0.708 \\
BART-large & 0.696 & & 0.670 & & 0.821 & & 0.769 & & 0.739 \\
MiniCheck-T5 & 0.754 & & 0.640 & & 0.741 & & 0.791 & & 0.732 \\
AlignScore & 0.729 & & 0.822 & & 0.904 & & 0.894 & & 0.837 \\
Vectara-2.1 & 0.805 & & 0.854 & & 0.648 & & 0.590 & & 0.725 \\
GPT-4o & 0.892 & & 0.865 & & 0.896 & & 0.880 & & 0.883 \\
GPT-4o-mini & 0.884 & & 0.833 & & 0.812 & & 0.878 & & 0.852 \\
GPT-3.5 & 0.706 & & 0.648 & & 0.749 & & 0.814 & & 0.729 \\


## Ablation study

Collect the results needed for the ablation study table in the paper.

In [64]:
import json
import numpy as np
import os
datasets = [("ragtruth", "-Summary", "tasksource"), ("ragtruth", "-QA", "tasksource"), ("lfqa", "_alignscore", "vectara_v2"), ("summall", "_alignscore", "bart-large-tasksource")]
datasets_names = [("ragtruth", "Summary"), ("ragtruth", "QA"),  ("lfqa-veri", "all"), ("summedits", "all")] 

# Unfortunately, we introduced new naming for the runs before performing the random selection experiment.
datasets2 = [("rtsumm", "_tasksource_final2", "tasksource-tasksource"), ("rtqa", "_tasksource_final2", "tasksource-tasksource"), ("lfqa", "_alignscore", "vectara_v2-tasksource"), ("summall", "_alignscore", "bart-large-tasksource")]
metric = "roc"
mkey = "tasksource"
res_mat_initial_sync = np.ones((len(datasets), 5))*float("nan")
res_mat_augmented_sync = np.ones((len(datasets), 5))*float("nan")
res_mat_randsel = np.ones((len(datasets), 5))*float("nan")
res_mat_nofinetune = np.ones((len(datasets)))*float("nan")
res_mat_finetune = np.ones((len(datasets)))*float("nan")
## Load logfiles for baselines

res_dict_no_ft = {}
res_dict_ft = {}
for dset, dgroup in datasets_names:
    res = json.load(open(f"results/eval_baselines_{dset}_nofinetune.json"))
    res_dict_no_ft[f"{dset}-{dgroup}"] = res[dgroup][mkey][metric]
    res = json.load(open(f"results/eval_baselines_{dset}_finetune.json"))
    res_dict_ft[f"{dset}-{dgroup}"] = res[dgroup][mkey][metric]


for seed in range(1,6):
    for idx, (dset, dgroup, pinit) in enumerate(datasets):
        log_path = f"eval_run-{dset}{dgroup}/seed_{seed}-{pinit}/eval_out_unweighted_test_list.json"
        if os.path.exists(log_path):
            initial_scores = json.load(open(log_path))
            if "0" in initial_scores and  mkey in initial_scores["0"]:
                    res_mat_initial_sync[idx, seed-1] = initial_scores["0"][mkey][metric]
            if dset == "lfqa" or dset =="summall":
                if mkey in initial_scores["1"]:
                    res_mat_augmented_sync[idx, seed-1] = initial_scores["1"][mkey][metric]
            else:
                if mkey in initial_scores["2"]:
                    res_mat_augmented_sync[idx, seed-1] = initial_scores["2"][mkey][metric]
    for idx, (dset, dgroup, pinit) in enumerate(datasets2):
        log_path_randsel = f"eval_run_randomsel-{dset}{dgroup}/seed_{seed}-{pinit}/eval_out_final.json"
        if os.path.exists(log_path_randsel):
            log_randsel = json.load(open(log_path_randsel))
            #print(log_randsel.keys())
            res_mat_randsel[idx, seed-1]  = log_randsel[mkey]["roc"]

In [65]:
dict_initial = {}
dict_augmented = {}
dict_randomsel = {}
for idx, (dset, dgroup) in enumerate(datasets_names):
    dict_initial[f"{dset}-{dgroup}"] = res_mat_initial_sync[idx, ~np.isnan(res_mat_initial_sync[idx])].mean()
    dict_augmented[f"{dset}-{dgroup}"] = res_mat_augmented_sync[idx, ~np.isnan(res_mat_initial_sync[idx])].mean()
    dict_randomsel[f"{dset}-{dgroup}"] = res_mat_randsel[idx, ~np.isnan(res_mat_randsel[idx])].mean()

In [66]:
def get_row_str(name, values_dict, std_dict, v_min=0.708, v_max=0.8848):
    rowstr = name
    values = []
    for dset, dgroup in datasets_names:
        key = f'{dset}-{dgroup}'
        rowstr += f" & {values_dict[key]:.3f}"
        values.append(values_dict[key])
    mean = np.array(values).mean()
    rowstr += f" & {mean:.3f} ({int(100*(mean-v_min)/(v_max-v_min))}" + r"\%)"
    rowstr += r" \\"
    return rowstr
    
lines = [("Non-Fintuned", res_dict_no_ft), ("Few-Shot Prompt", dict_initial), 
         ("Augmented-RandomSel", dict_randomsel), ("Augmented", dict_augmented), ("Finetuned-Labeled", res_dict_ft)]
for name, dicct1 in lines:
    print(get_row_str(name, dicct1, None))

Non-Fintuned & 0.782 & 0.530 & 0.645 & 0.876 & 0.708 (0\%) \\
Few-Shot Prompt & 0.799 & 0.826 & 0.934 & 0.872 & 0.858 (84\%) \\
Augmented-RandomSel & 0.777 & 0.783 & 0.919 & 0.862 & 0.835 (71\%) \\
Augmented & 0.837 & 0.867 & 0.925 & 0.883 & 0.878 (96\%) \\
Finetuned-Labeled & 0.842 & 0.890 & 0.909 & 0.898 & 0.885 (100\%) \\


## Ablation study on mutation strategy

Collect results obtained using only one mutation strategy.

TODO: Run more seeds...

In [82]:
import json
import pandas as pd
models = ["tasksource", "bart-large", "flan-t5-base"]
mutation_list = ["LLMFillInTheGapsMutation", "RephraseMutation", "DropSentenceMutation", "All"]
dict_list = []
n_runs = 5
results_matrix = np.ones((5, n_runs, 3))*float("nan")
for seed in range(1, 6):
    res = json.load(open(f"eval_run-ragtruth-QA/seed_{seed}-tasksource/eval_out_unweighted_test_list.json"))
    results_matrix[0, seed-1] = np.array([(res["0"][m]["roc"] if m in res["0"] else float("nan")) for m in models])  
    for idx, mutation in enumerate(mutation_list[:3]):
        res = json.load(open(f"abl_mutation-ragtruth-QA/{mutation}_seed{seed}/eval_out_unweighted_test_list.json"))
        results_matrix[idx+1, seed-1] = np.array([res["2"][m]["roc"] for m in models])
    res = json.load(open(f"eval_run-ragtruth-QA/seed_{seed}-tasksource/eval_out_unweighted_test_list.json"))
    results_matrix[4, seed-1] = np.array([res["2"][m]["roc"] for m in models])       

In [86]:
mlist_ext =["Few-Shot Only"] + mutation_list
for r, vals in enumerate(results_matrix):
    rowstr = mlist_ext[r]
    #print(rowstr)
    for idx, model in enumerate(models + ["mean"]):
        if model =="mean":
            rowstr += f" & {vals[0, :].mean():.3f} "
        else:
            rowstr += f" & {vals[0, idx].mean():.3f} "
    print(rowstr  +r"\\")

Few-Shot Only & 0.836  & 0.845  & 0.772  & 0.818 \\
LLMFillInTheGapsMutation & 0.869  & 0.890  & 0.767  & 0.842 \\
RephraseMutation & 0.845  & 0.863  & 0.711  & 0.806 \\
DropSentenceMutation & 0.868  & 0.872  & 0.758  & 0.833 \\
All & 0.872  & 0.886  & 0.806  & 0.855 \\


## Baseline: Label train dataset

In [87]:
model="tasksource"
#model="bart-large"
metric ="roc"
dsetlist = [("ragtruth", "Summary"), ("ragtruth", "QA"), ("lfqa-veri", "all"), ("summedits", "all")]
baseline_labeler = {"alignscore": "AlignScore", "vectara_v2": "Vectara-2.1", "gpt-4o": "GPT-4o"}


baseline_vals = {}
for baseline, pname in baseline_labeler.items():
    baseline_vals[baseline] = []
    row_str = pname + " & "
    for dset, group in dsetlist:
        filename = f"results/eval_baselines_{dset}_finetune_label_{baseline}.json"
        if os.path.exists(filename):
            res_dict = json.load(open(filename))
            if group in res_dict and model in res_dict[group]:
                row_str += f"{res_dict[group][model][metric]:.3f} & "
                #baseline_vals[baseline].append(res_dict[dset][group][baseline][metric])
            else:
                row_str += " - & "
        else:
            row_str += " - & "
    print(row_str[:-2] + r"\\")


AlignScore & 0.737 & 0.836 & 0.870 & 0.874 \\
Vectara-2.1 & 0.814 & 0.879 & 0.879 & 0.805 \\
GPT-4o & 0.828 & 0.866 & 0.876 & 0.878 \\


In [None]:
DebertaV2 (Ours) 0.837 ± 0.007 0.867 ± 0.007 0.925 ± 0.009 0.890 ± nan

## Runtimes

In [88]:
perfs = {"flan-t5-base": 0.699, "tasksource": 0.739, "bart-large": 0.739, "minicheck-t5": 0.732, "alignscore": 0.837, "vectara_v2": 0.725, "gpt-4o": 0.883} # Add performances hardcoded from main table...

In [89]:
import numpy as np
dsetlist = [("ragtruth", "Summary"), ("ragtruth", "QA"), ("lfqa-veri", "all"), ("summedits", "all")]
baseline_labeler = {"vectara_v2": "Vectara", "flan-t5-base": "FLAN-T5", "tasksource": "DeBERTaV2", "minicheck-t5": "MiniCheck-T5", "bart-large": "BART-large",
                    "alignscore": "AlignScore", "gpt-4o": "GPT-4o", }
all_results = {}
for dset, group in dsetlist:
    filename = f"results/timing_log_50_{dset}_{group}.json"
    if os.path.exists(filename):
        all_results[(dset, group)] = json.load(open(filename))

for baseline, name in baseline_labeler.items():
    str_print = name
    mean_list_all = []
    for dset, group in all_results.keys():
        runtimes = np.array(all_results[(dset, group)][baseline])
        rt_mean = runtimes.mean()
        rt_std = runtimes.std()/2
        str_print += r" & \wstd{" + f"{rt_mean:.2f}" + r"}{" + f"{rt_std:.2f}" +"}"
        #str_print += "& "
        mean_list_all.append(rt_mean)
    str_print += (f" & {np.array(mean_list_all).mean():.2f} ({int(np.array(mean_list_all).mean()/0.02117)}\\%) & {perfs[baseline]*100:.1f}" + r"\\")
    print(str_print)

Vectara & \wstd{1.57}{0.02} & \wstd{1.13}{0.03} & \wstd{1.35}{0.03} & \wstd{1.03}{0.01} & 1.27 (59\%) & 72.5\\
FLAN-T5 & \wstd{1.71}{0.07} & \wstd{1.71}{0.07} & \wstd{1.72}{0.07} & \wstd{1.71}{0.07} & 1.71 (80\%) & 69.9\\
DeBERTaV2 & \wstd{2.56}{0.03} & \wstd{1.88}{0.04} & \wstd{2.15}{0.06} & \wstd{1.88}{0.09} & 2.12 (100\%) & 73.9\\
MiniCheck-T5 & \wstd{4.50}{0.20} & \wstd{3.16}{0.06} & \wstd{3.90}{0.14} & \wstd{3.22}{0.10} & 3.69 (174\%) & 73.2\\
BART-large & \wstd{4.33}{0.01} & \wstd{3.62}{0.06} & \wstd{3.95}{0.09} & \wstd{3.76}{0.20} & 3.92 (184\%) & 73.9\\
AlignScore & \wstd{5.88}{0.12} & \wstd{7.55}{0.28} & \wstd{7.55}{0.35} & \wstd{1.81}{0.06} & 5.70 (269\%) & 83.7\\
GPT-4o & \wstd{19.80}{0.51} & \wstd{19.11}{0.44} & \wstd{21.09}{2.97} & \wstd{21.89}{1.26} & 20.47 (967\%) & 88.3\\
