In [3]:
import json
import os

import numpy as np

# Root directory holding the evaluation results. The EFFICIENTCOT_RESULTS_DIR
# environment variable overrides the default so the notebook is not tied to a
# single machine's absolute path.
res_dir = os.environ.get("EFFICIENTCOT_RESULTS_DIR", "/home/nee7ne/EfficientCoT/results")
methods = ['coconut', 'codi', 'icot_si', 'pause', 'softcot', 'effi_cot']
datasets = ["coin_flip", "commonsense_qa", "gsm8k", "multiarith", "svamp"]
models = ["small", "mistral"]


def results_path(method, model, dataset):
    """Return the evaluation_results.jsonl path for one (method, model, dataset)."""
    if method == "effi_cot":
        # effi_cot results sit one directory deeper, under "vanilla".
        return f"{res_dir}/{method}/vanilla/{model}/{dataset}/evaluation_results.jsonl"
    return f"{res_dir}/{method}/{model}/{dataset}/evaluation_results.jsonl"


def load_last_run(path):
    """Parse the JSON records that follow the last blank line of a JSONL file.

    Blank (empty or whitespace-only) lines act as separators between groups of
    records. Only the group after the last separator is returned. When the file
    has no separator, or when its very last line is a separator, every
    non-blank line of the file is parsed instead.

    Returns a list of decoded JSON objects; the list is empty for a file with
    no records.
    """
    with open(path) as f:
        raw_lines = f.readlines()
    blank_idx = [i for i, line in enumerate(raw_lines) if not line.strip()]
    if not blank_idx or blank_idx[-1] == len(raw_lines) - 1:
        # NOTE(review): a trailing separator makes the whole file count, not
        # just the last group -- kept as in the original; confirm it is intended.
        start = 0
    else:
        start = blank_idx[-1] + 1
    return [json.loads(line) for line in raw_lines[start:] if line.strip()]


def summarize_best_temp(records):
    """Group records by 'eval_temp' and keep the group with the best mean accuracy.

    Each record must provide 'eval_temp', 'numerical_accuracy' and
    'ave_sample_time'. Accuracies are multiplied by 100 before being stored.

    Returns {'acc': [...], 'time': [...]} for the temperature whose mean
    accuracy is highest (the first such temperature wins ties), or {} when
    `records` is empty so that callers can treat the entry as missing.
    """
    by_temp = {}
    for rec in records:
        bucket = by_temp.setdefault(rec['eval_temp'], {'acc': [], 'time': []})
        bucket['acc'].append(rec['numerical_accuracy'] * 100)
        bucket['time'].append(rec['ave_sample_time'])
    if not by_temp:
        # An existing-but-empty results file used to raise KeyError here.
        return {}
    # max() returns the first maximal key, matching a strict ">" scan.
    best_temp = max(by_temp, key=lambda temp: np.mean(by_temp[temp]['acc']))
    return {'acc': by_temp[best_temp]['acc'], 'time': by_temp[best_temp]['time']}


# res[model][method][dataset] -> {'acc': [...], 'time': [...]}, or {} when the
# results file is missing or holds no records.
res = {}
for mo in models:
    res[mo] = {}
    for me in methods:
        res[mo][me] = {}
        for d in datasets:
            file = results_path(me, mo, d)
            if os.path.exists(file):
                res[mo][me][d] = summarize_best_temp(load_last_run(file))
            else:
                res[mo][me][d] = {}


In [4]:
def format_mean_std(values):
    """Format numbers as "mean ± std" (two decimals each) followed by a tab."""
    return f"{np.mean(values):.2f} ± {np.std(values):.2f}\t"


# For every model, and for every dataset in alphabetical order, print two
# tab-separated rows: accuracy first, then time. Columns follow the order of
# `methods`; a method without results for that dataset is shown as "N/A".
for mo in models:
    for d in sorted(datasets):
        acc_line = ""
        time_line = ""
        for me in methods:
            entry = res[mo][me][d]
            if len(entry) > 0:
                acc_line += format_mean_std(entry['acc'])
                time_line += format_mean_std(entry['time'])
            else:
                acc_line += "N/A\t"
                time_line += "N/A\t"
        print(acc_line)
        print(time_line)

30.00 ± 0.00	66.33 ± 11.05	25.33 ± 0.85	50.50 ± 0.00	73.17 ± 1.25	79.00 ± 16.13	
2.21 ± 0.00	4.08 ± 0.12	2.22 ± 0.01	3.14 ± 0.00	1.31 ± 0.00	1.32 ± 0.00	
13.00 ± 0.41	74.50 ± 12.29	20.00 ± 1.41	50.00 ± 0.00	63.50 ± 0.41	66.14 ± 7.69	
1.62 ± 0.01	3.89 ± 0.01	2.24 ± 0.00	2.67 ± 0.01	1.27 ± 0.07	1.35 ± 0.14	
6.50 ± 1.22	8.17 ± 5.07	9.83 ± 0.24	4.67 ± 0.62	2.17 ± 0.47	10.00 ± 6.01	
1.58 ± 0.01	2.82 ± 1.61	1.93 ± 0.23	3.40 ± 0.03	1.35 ± 0.01	1.45 ± 0.15	
2.23 ± 0.45	8.17 ± 7.34	2.80 ± 0.90	0.57 ± 0.45	4.43 ± 1.20	12.43 ± 1.04	
1.57 ± 0.01	4.15 ± 0.37	2.26 ± 0.00	3.56 ± 0.11	1.43 ± 0.06	1.29 ± 0.01	
8.50 ± 1.22	11.33 ± 4.52	19.67 ± 5.57	16.17 ± 4.73	6.17 ± 0.94	37.50 ± 10.61	
1.58 ± 0.01	3.19 ± 0.53	0.43 ± 0.04	3.32 ± 0.04	1.31 ± 0.00	1.31 ± 0.00	
2.17 ± 1.25	87.17 ± 18.15	33.83 ± 2.87	50.00 ± 0.00	41.00 ± 2.55	87.83 ± 6.54	
1.68 ± 0.00	4.32 ± 0.04	2.30 ± 0.00	3.74 ± 0.02	1.39 ± 0.00	1.37 ± 0.01	
14.50 ± 0.00	80.17 ± 16.41	21.17 ± 2.05	63.50 ± 0.00	62.67 ± 1.18	N/A	
1.67 ± 0.00	4.14 ± 0.00	2