In [4]:
import json
import os

import numpy as np

# Root directory that holds one sub-tree of evaluation results per method.
res_dir = "/home/nee7ne/EfficientCoT/results"
methods = ['coconut', 'codi', 'icot_si', 'pause', 'softcot', 'effi_cot']
datasets = ["coin_flip", "commonsense_qa", "gsm8k", "multiarith", "svamp"]
models = ["small", "mistral"]
# Only the first MAX_RESULT_LINES raw lines of every results file are read.
# NOTE(review): presumably 5 temperatures x 3 runs -- confirm against the runner.
MAX_RESULT_LINES = 15


def summarize_best_temperature(records):
    """Group evaluation records by temperature and keep the best-scoring group.

    Args:
        records: iterable of dicts with the keys ``eval_temp``,
            ``numerical_accuracy`` and ``ave_sample_time``.

    Returns:
        ``{'acc': [...], 'time': [...]}`` for the temperature whose mean
        accuracy is highest (the first such temperature wins ties), where
        every accuracy has been multiplied by 100. An empty dict is returned
        when ``records`` is empty, so callers can treat "no data" uniformly.
    """
    by_temp = {}
    for record in records:
        bucket = by_temp.setdefault(record['eval_temp'], {'acc': [], 'time': []})
        bucket['acc'].append(record['numerical_accuracy'] * 100)
        bucket['time'].append(record['ave_sample_time'])

    best_temp, best_mean = None, None
    for temp, bucket in by_temp.items():
        mean_acc = np.mean(bucket['acc'])
        # Strict ">" keeps the earliest temperature when several tie.
        if best_mean is None or mean_acc > best_mean:
            best_temp, best_mean = temp, mean_acc

    if best_temp is None:
        return {}
    return {'acc': by_temp[best_temp]['acc'], 'time': by_temp[best_temp]['time']}


# res[model][method][dataset] -> {'acc': [...], 'time': [...]}, or {} when the
# results file is missing or contains no records.
res = {}

for mo in models:
    res[mo] = {}
    for me in methods:
        res[mo][me] = {}
        for d in datasets:
            res[mo][me][d] = {}
            if me == "effi_cot":
                # effi_cot results sit one level deeper, under the "vanilla" variation.
                result_path = f"{res_dir}/{me}/vanilla/{mo}/{d}/evaluation_results.jsonl"
            else:
                result_path = f"{res_dir}/{me}/{mo}/{d}/evaluation_results.jsonl"
            if not os.path.exists(result_path):
                continue
            with open(result_path) as f:
                raw_lines = f.readlines()[:MAX_RESULT_LINES]
            # strip() also rejects whitespace-only lines, which json.loads cannot parse.
            records = [json.loads(raw) for raw in raw_lines if raw.strip()]
            res[mo][me][d] = summarize_best_temperature(records)


In [11]:
# For every model and every dataset (alphabetical order) print two tab-separated
# rows of LaTeX cells -- accuracy first, then time -- with one cell per method,
# in the order given by `methods`. Each cell is "mean \scriptsize{± std}".
for mo in models:
    for d in sorted(datasets):
        acc_line = ""
        time_line = ""
        for me in methods:
            entry = res[mo][me][d]
            if len(entry) > 0:
                # "\\" yields a literal backslash; a bare "\s" is an invalid escape
                # sequence. "{{" / "}}" yield the literal braces LaTeX needs.
                acc_line += f"{np.mean(entry['acc']):.2f} \\scriptsize{{± {np.std(entry['acc']):.2f}}}\t"
                time_line += f"{np.mean(entry['time']):.2f} \\scriptsize{{± {np.std(entry['time']):.2f}}}\t"
            else:
                # An empty entry means nothing is stored for this combination.
                acc_line += "N/A\t"
                time_line += "N/A\t"
        print(acc_line)
        print(time_line)

32.67 \scriptsize{± 1.93}	66.33 \scriptsize{± 11.05}	25.33 \scriptsize{± 0.85}	50.50 \scriptsize{± 0.00}	73.17 \scriptsize{± 1.25}	90.00 \scriptsize{± 9.91}	
1.99 \scriptsize{± 0.12}	4.08 \scriptsize{± 0.12}	2.22 \scriptsize{± 0.01}	3.14 \scriptsize{± 0.00}	1.31 \scriptsize{± 0.00}	1.32 \scriptsize{± 0.00}	
13.00 \scriptsize{± 0.41}	74.50 \scriptsize{± 12.29}	20.00 \scriptsize{± 1.41}	50.00 \scriptsize{± 0.00}	63.50 \scriptsize{± 0.41}	76.83 \scriptsize{± 10.02}	
1.62 \scriptsize{± 0.01}	3.89 \scriptsize{± 0.01}	2.24 \scriptsize{± 0.00}	2.67 \scriptsize{± 0.01}	1.27 \scriptsize{± 0.07}	1.26 \scriptsize{± 0.02}	
6.50 \scriptsize{± 1.22}	8.17 \scriptsize{± 5.07}	9.83 \scriptsize{± 0.24}	4.67 \scriptsize{± 0.62}	2.17 \scriptsize{± 0.47}	11.00 \scriptsize{± 6.72}	
1.58 \scriptsize{± 0.01}	2.82 \scriptsize{± 1.61}	1.93 \scriptsize{± 0.23}	3.40 \scriptsize{± 0.03}	1.35 \scriptsize{± 0.01}	1.31 \scriptsize{± 0.00}	
2.23 \scriptsize{± 0.45}	8.17 \scriptsize{± 7.34}	2.80 \scriptsize{± 0.90}	0.5

In [5]:
# Evaluation records pasted inline, one JSON object per line.
lines = """
{"numerical_accuracy": 0.15, "close_match_rate": 0.16, "mean_relative_error": 2.825956314567759, "median_relative_error": 0.5, "exp_num": 0, "dataset": "svamp", "eval_temp": 0.1, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.3920869755744934}
{"numerical_accuracy": 0.155, "close_match_rate": 0.16, "mean_relative_error": 2.914523613480803, "median_relative_error": 0.4850980392156863, "exp_num": 0, "dataset": "svamp", "eval_temp": 0.3, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.3932299447059631}
{"numerical_accuracy": 0.12, "close_match_rate": 0.13, "mean_relative_error": 2.5612209492440154, "median_relative_error": 0.5, "exp_num": 0, "dataset": "svamp", "eval_temp": 0.5, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.39934741020202635}
{"numerical_accuracy": 0.115, "close_match_rate": 0.12, "mean_relative_error": 3.3848128990146513, "median_relative_error": 0.5, "exp_num": 0, "dataset": "svamp", "eval_temp": 0.7, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.4530379295349121}
{"numerical_accuracy": 0.105, "close_match_rate": 0.11, "mean_relative_error": 3.978413661754349, "median_relative_error": 0.5652173913043478, "exp_num": 0, "dataset": "svamp", "eval_temp": 0.9, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.5246122753620148}
{"numerical_accuracy": 0.165, "close_match_rate": 0.17, "mean_relative_error": 1.926373532544781, "median_relative_error": 0.4, "exp_num": 1, "dataset": "svamp", "eval_temp": 0.1, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.4148860025405884}
{"numerical_accuracy": 0.145, "close_match_rate": 0.15, "mean_relative_error": 2.3702632672969766, "median_relative_error": 0.4469914040114613, "exp_num": 1, "dataset": "svamp", "eval_temp": 0.3, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.43266808152198794}
{"numerical_accuracy": 0.155, "close_match_rate": 0.16, "mean_relative_error": 1.9922007326443447, "median_relative_error": 0.47409523809523807, "exp_num": 1, "dataset": "svamp", "eval_temp": 0.5, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.45586092948913576}
{"numerical_accuracy": 0.135, "close_match_rate": 0.14, "mean_relative_error": 2.765192993955823, "median_relative_error": 0.6125, "exp_num": 1, "dataset": "svamp", "eval_temp": 0.7, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.496463885307312}
{"numerical_accuracy": 0.1, "close_match_rate": 0.11, "mean_relative_error": 3.472068154724791, "median_relative_error": 0.6666666666666666, "exp_num": 1, "dataset": "svamp", "eval_temp": 0.9, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.5225280523300171}
{"numerical_accuracy": 0.275, "close_match_rate": 0.28, "mean_relative_error": 2.1516417579580875, "median_relative_error": 0.37484993997599036, "exp_num": 2, "dataset": "svamp", "eval_temp": 0.1, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.490375292301178}
{"numerical_accuracy": 0.275, "close_match_rate": 0.28, "mean_relative_error": 1.9310918786099907, "median_relative_error": 0.37037037037037035, "exp_num": 2, "dataset": "svamp", "eval_temp": 0.3, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.4728516352176666}
{"numerical_accuracy": 0.275, "close_match_rate": 0.285, "mean_relative_error": 2.116831502069541, "median_relative_error": 0.37717171717171716, "exp_num": 2, "dataset": "svamp", "eval_temp": 0.5, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.48019825100898744}
{"numerical_accuracy": 0.255, "close_match_rate": 0.265, "mean_relative_error": 4.181612202777049, "median_relative_error": 0.3763888888888889, "exp_num": 2, "dataset": "svamp", "eval_temp": 0.7, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.5109249222278595}
{"numerical_accuracy": 0.165, "close_match_rate": 0.18, "mean_relative_error": 2.8756297224063387, "median_relative_error": 0.5, "exp_num": 2, "dataset": "svamp", "eval_temp": 0.9, "eval_max_contemp_tokens": 1, "ave_sample_time": 0.501106048822403}
"""


def records_from_last_run(raw_lines):
    """Parse the JSON records that follow the last blank separator line.

    Args:
        raw_lines: list of strings, each either a JSON object or a blank
            (empty / whitespace-only) separator line.

    Returns:
        A list of parsed dicts. When there is no blank line, or the last
        blank line is also the final line, every non-blank line is parsed.
        Otherwise only the lines after the last blank line are parsed.
    """
    # After str.split("\n") a separator is "" rather than "\n", so blank lines
    # are detected with strip() instead of an equality test against "\n".
    blank_positions = [i for i, raw in enumerate(raw_lines) if raw.strip() == ""]
    if not blank_positions or blank_positions[-1] == len(raw_lines) - 1:
        start = 0
    else:
        start = blank_positions[-1] + 1
    return [json.loads(raw) for raw in raw_lines[start:] if raw.strip() != ""]


# [1:-1] drops the empty strings produced by the newline right after the opening
# triple quote and the one right before the closing triple quote.
lines = records_from_last_run(lines.split("\n")[1:-1])

# summary[temperature] -> accuracies (scaled by 100) and per-sample times.
summary = {}
for l in lines:
    bucket = summary.setdefault(l['eval_temp'], {'acc': [], 'time': []})
    bucket['acc'].append(l['numerical_accuracy'] * 100)
    bucket['time'].append(l['ave_sample_time'])

# Keep the temperature with the highest mean accuracy (first one wins ties).
best_temp, max_acc = 0, -1
for temp in summary:
    mean_acc = np.mean(summary[temp]['acc'])
    if mean_acc > max_acc:
        best_temp = temp
        max_acc = mean_acc

print(f"{np.mean(summary[best_temp]['acc']):.2f} ± {np.std(summary[best_temp]['acc']):.2f}\t")
print(f"{np.mean(summary[best_temp]['time']):.2f} ± {np.std(summary[best_temp]['time']):.2f}\t")

19.67 ± 5.57	
0.43 ± 0.04	


In [7]:
import json
res_path = "/home/nee7ne/EfficientCoT/results/effi_cot/vanilla/mistral/multiarith/evaluation_results.jsonl"
# Temperatures a sweep is expected to list, in this order, on consecutive lines.
EVAL_TEMPS = [0.1, 0.3, 0.5, 0.7, 0.9]
# Record fields that together identify one hyperparameter configuration.
HPARAM_KEYS = (
    'st_linear_lr', 'st_linear_wd', 'st_linear_epochs',
    'st_llm_lr', 'st_llm_wd', 'st_llm_epochs',
    'cg_linear_lr', 'cg_linear_wd', 'cg_linear_epochs',
    'cg_llm_lr', 'cg_llm_wd', 'cg_llm_epochs',
)

# cur_res[hyperparameter tuple] -> best numerical_accuracy over the sweep that
# starts at an exp_num == 0 record (a later sweep with the same tuple overwrites).
cur_res = {}
with open(res_path, "r") as f:
    lines = f.readlines()

idx = 0
while idx < len(lines):
    if lines[idx].strip() == "":
        idx += 1
        continue
    sweep_head = json.loads(lines[idx])
    if sweep_head["exp_num"] != 0:
        idx += 1
        continue
    acc = []
    for temp in EVAL_TEMPS:
        # Stop cleanly when the file ends, or a separator appears, mid-sweep.
        if idx >= len(lines) or lines[idx].strip() == "":
            break
        cur_line = json.loads(lines[idx])
        if cur_line['eval_temp'] != temp:
            break
        acc.append(cur_line['numerical_accuracy'])
        idx += 1
    if not acc:
        # The record does not start a sweep at the first temperature; step over
        # it so the loop advances instead of calling max() on an empty list.
        idx += 1
        continue
    # Read the parameters from the record that opened the sweep: after an early
    # break, cur_line is the mismatching record, which belongs to another sweep.
    cur_params = tuple(sweep_head[key] for key in HPARAM_KEYS)
    cur_res[cur_params] = max(acc)

In [8]:
# Configurations whose stored accuracy is at least 0.185, listed from the
# lowest to the highest accuracy.
sorted(
    ((params, best_acc) for params, best_acc in cur_res.items() if best_acc >= 0.185),
    key=lambda item: item[1],
)

[((0.0001, 0.001, 3, 1e-05, 0.001, 2, 0.0001, 0.01, 3, 1e-05, 0.001, 1),
  0.189),
 ((0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.01, 0.0001, 5, 1e-05, 0.001, 1), 0.189),
 ((0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.01, 0.01, 5, 1e-07, 0.001, 1), 0.189),
 ((0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.01, 0.0001, 5, 1e-07, 0.001, 2), 0.189),
 ((0.01, 0.001, 1, 1e-05, 1e-05, 1, 0.0001, 0.001, 1, 1e-07, 0.001, 2), 0.189),
 ((0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.01, 0.01, 3, 1e-07, 1e-05, 1), 0.2),
 ((0.001, 0.0001, 3, 1e-07, 1e-05, 1, 0.01, 0.001, 5, 1e-05, 0.001, 1), 0.2),
 ((0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.001, 0.0001, 1, 1e-05, 0.001, 2),
  0.217)]

In [7]:
# One evaluation record written out by hand; the expression below displays its
# twelve hyperparameter values as a tuple.
cur_line = {
    "numerical_accuracy": 0.211,
    "close_match_rate": 0.211,
    "mean_relative_error": 0.4133522429329931,
    "median_relative_error": 0.2,
    "exp_num": 0,
    "dataset": "multiarith",
    "eval_temp": 0.9,
    "ave_sample_time": 1.3804727421866523,
    "student": "optimum/mistral-1.1b-testing",
    "teacher": "mistralai/Mistral-7B-Instruct-v0.2",
    "st_linear_lr": 0.0001,
    "st_linear_wd": 0.01,
    "st_linear_epochs": 1,
    "st_llm_lr": 1e-07,
    "st_llm_wd": 0.001,
    "st_llm_epochs": 2,
    "cg_linear_lr": 0.001,
    "cg_linear_wd": 0.0001,
    "cg_linear_epochs": 5,
    "cg_llm_lr": 1e-07,
    "cg_llm_wd": 0.001,
    "cg_llm_epochs": 1,
    "train_max_contemp_tokens": 5,
    "eval_max_contemp_tokens": 1,
}

tuple(
    cur_line[key]
    for key in (
        'st_linear_lr', 'st_linear_wd', 'st_linear_epochs',
        'st_llm_lr', 'st_llm_wd', 'st_llm_epochs',
        'cg_linear_lr', 'cg_linear_wd', 'cg_linear_epochs',
        'cg_llm_lr', 'cg_llm_wd', 'cg_llm_epochs',
    )
)

(0.0001, 0.01, 1, 1e-07, 0.001, 2, 0.001, 0.0001, 5, 1e-07, 0.001, 1)

In [8]:
# Print the main.py command line that corresponds to the record in cur_line.
# The config is 'small' when the teacher name contains 'llama', else 'mistral'.
print(
    f"python main.py --config {'small' if 'llama' in cur_line['teacher'] else 'mistral'}"
    f" --mode effi_cot --dataset {cur_line['dataset']} --device 3 --variation vanilla"
    f" -stllr {cur_line['st_linear_lr']}"
    f" -stlwd {cur_line['st_linear_wd']}"
    f" -stle {cur_line['st_linear_epochs']}"
    f" -stllmlr {cur_line['st_llm_lr']}"
    f" -stllmwd {cur_line['st_llm_wd']}"
    f" -stllme {cur_line['st_llm_epochs']}"
    f" -cgllr {cur_line['cg_linear_lr']}"
    f" -cglwd {cur_line['cg_linear_wd']}"
    f" -cgle {cur_line['cg_linear_epochs']}"
    f" -cgllmlr {cur_line['cg_llm_lr']}"
    f" -cgllmwd {cur_line['cg_llm_wd']}"
    f" -cgllme {cur_line['cg_llm_epochs']}"
)

python main.py --config mistral --mode effi_cot --dataset multiarith --device 3 --variation vanilla -stllr 0.0001 -stlwd 0.01 -stle 1 -stllmlr 1e-07 -stllmwd 0.001 -stllme 2 -cgllr 0.001 -cglwd 0.0001 -cgle 5 -cgllmlr 1e-07 -cgllmwd 0.001 -cgllme 1


In [None]:
# Shell command, not Python: executing this cell as Python raises a SyntaxError.
# Run it in a terminal, or prefix it with "!" to run it from a notebook cell.
python main.py --config mistral --mode effi_cot --dataset multiarith --device 1 --variation vanilla -stllr 0.0001 -stlwd 0.01 -stle 1 -stllmlr 1e-07 -stllmwd 0.001 -stllme 2 -cgllr 0.001 -cglwd 0.0001 -cgle 5 -cgllmlr 1e-07 -cgllmwd 0.001 -cgllme 1


SyntaxError: invalid syntax (1663213852.py, line 1)

In [None]:
# Shell command chain, not Python: four effi_cot runs on multiarith (device 2)
# with different hyperparameters. "&&" starts each run only if the previous one
# exited successfully. Run in a terminal, or prefix with "!" in a notebook cell.
python main.py --config mistral --mode effi_cot --dataset multiarith --device 2 --variation vanilla -stllr 0.0001 -stlwd 0.001 -stle 3 -stllmlr 1e-05 -stllmwd 0.001 -stllme 2 -cgllr 0.0001 -cglwd 0.01 -cgle 3 -cgllmlr 1e-05 -cgllmwd 0.001 -cgllme 1 && python main.py --config mistral --mode effi_cot --dataset multiarith --device 2 --variation vanilla -stllr 0.0001 -stlwd 0.01 -stle 1 -stllmlr 1e-07 -stllmwd 0.001 -stllme 2 -cgllr 0.01 -cglwd 0.0001 -cgle 5 -cgllmlr 1e-05 -cgllmwd 0.001 -cgllme 1 && python main.py --config mistral --mode effi_cot --dataset multiarith --device 2 --variation vanilla -stllr 0.0001 -stlwd 0.01 -stle 1 -stllmlr 1e-07 -stllmwd 0.001 -stllme 2 -cgllr 0.01 -cglwd 0.0001 -cgle 5 -cgllmlr 1e-07 -cgllmwd 0.001 -cgllme 2 && python main.py --config mistral --mode effi_cot --dataset multiarith --device 2 --variation vanilla -stllr 0.0001 -stlwd 0.01 -stle 1 -stllmlr 1e-07 -stllmwd 0.001 -stllme 2 -cgllr 0.01 -cglwd 0.01 -cgle 5 -cgllmlr 1e-07 -cgllmwd 0.001 -cgllme 1

In [None]:
                                                     &                                                        &          & {\large Coconut}                                  & {\large CODI}                                              & {\large ICoT-SI}                                           & {\large Pause}            & {\large SoftCoT}                                          & {\large \textbf{SemCoT} (ours)}            \\ \hline
                                                     &                                                        & Acc (\%) & 32.67 \scriptsize{± 1.93}                         & 66.33 \scriptsize{± 11.05}                                 & 25.33 \scriptsize{± 0.85}                                  & 50.50 \scriptsize{± 0.00} & \cellcolor[HTML]{D1E6F6}73.17 \scriptsize{± 1.25}         & \cellcolor[HTML]{E3EFDA}\textbf{90.00 \scriptsize{± 9.91}}  \\
                                                     & \multirow{-2}{*}{{\large CoinFlip}}   & Time (s) & 1.99 \scriptsize{± 0.12}                          & 4.08 \scriptsize{± 0.12}                                   & 2.22 \scriptsize{± 0.01}                                   & 3.14 \scriptsize{± 0.00}  & \cellcolor[HTML]{E3EFDA}\textbf{1.31 \scriptsize{± 0.00}} & \cellcolor[HTML]{D1E6F6}1.32 \scriptsize{± 0.00}            \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 13.00 \scriptsize{± 0.41}                         & \cellcolor[HTML]{D1E6F6}74.50 \scriptsize{± 12.29}         & 20.00 \scriptsize{± 1.41}                                  & 50.00 \scriptsize{± 0.00} & 63.50 \scriptsize{± 0.41}                                 & \cellcolor[HTML]{E3EFDA}\textbf{76.83 \scriptsize{± 10.02}} \\
                                                     & \multirow{-2}{*}{{\large Common}}     & Time (s) & 1.62 \scriptsize{± 0.01}                          & 3.89 \scriptsize{± 0.01}                                   & 2.24 \scriptsize{± 0.00}                                   & 2.67 \scriptsize{± 0.01}  & \cellcolor[HTML]{D1E6F6}1.27 \scriptsize{± 0.07}          & \cellcolor[HTML]{E3EFDA}\textbf{1.26 \scriptsize{± 0.02}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 6.50 \scriptsize{± 1.22}                          & 8.17 \scriptsize{± 5.07}                                   & \cellcolor[HTML]{D1E6F6}9.83 \scriptsize{± 0.24}           & 4.67 \scriptsize{± 0.62}  & 2.17 \scriptsize{± 0.47}                                  & \cellcolor[HTML]{E3EFDA}\textbf{11.00 \scriptsize{± 6.72}}  \\
                                                     & \multirow{-2}{*}{{\large GSM8K}}      & Time (s) & 1.58 \scriptsize{± 0.01}                          & 2.82 \scriptsize{± 1.61}                                   & 1.93 \scriptsize{± 0.23}                                   & 3.40 \scriptsize{± 0.03}  & \cellcolor[HTML]{D1E6F6}1.35 \scriptsize{± 0.01}          & \cellcolor[HTML]{E3EFDA}\textbf{1.31 \scriptsize{± 0.00}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 2.23 \scriptsize{± 0.45}                          & \cellcolor[HTML]{D1E6F6}8.17 \scriptsize{± 7.34}           & 2.80 \scriptsize{± 0.90}                                   & 0.57 \scriptsize{± 0.45}  & 4.43 \scriptsize{± 1.20}                                  & \cellcolor[HTML]{E3EFDA}\textbf{10.73 \scriptsize{± 0.52}}  \\
                                                     & \multirow{-2}{*}{{\large MultiArith}} & Time (s) & \cellcolor[HTML]{D1E6F6}1.57 \scriptsize{± 0.01}  & 4.15 \scriptsize{± 0.37}                                   & 2.26 \scriptsize{± 0.00}                                   & 3.56 \scriptsize{± 0.11}  & 1.43 \scriptsize{± 0.06}                                  & \cellcolor[HTML]{E3EFDA}\textbf{1.28 \scriptsize{± 0.03}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 8.50 \scriptsize{± 1.22}                          & 11.33 \scriptsize{± 4.52}                                  & \cellcolor[HTML]{D1E6F6}19.67 \scriptsize{± 5.57}          & 16.17 \scriptsize{± 4.73} & 6.17 \scriptsize{± 0.94}                                  & \cellcolor[HTML]{E3EFDA}\textbf{43.67 \scriptsize{± 2.09}}  \\
\multirow{-10}{*}{{\large Llama}}   & \multirow{-2}{*}{{\large SVAMP}}      & Time (s) & 1.58 \scriptsize{± 0.01}                          & 3.19 \scriptsize{± 0.53}                                   & \cellcolor[HTML]{E3EFDA}\textbf{0.43 \scriptsize{± 0.04}}  & 3.32 \scriptsize{± 0.04}  & 1.31 \scriptsize{± 0.00}                                  & \cellcolor[HTML]{D1E6F6}1.30 \scriptsize{± 0.02}            \\ \hline
                                                     &                                                        & Acc (\%) & 2.17 \scriptsize{± 1.25}                          & \cellcolor[HTML]{D1E6F6}87.17 \scriptsize{± 18.15}         & 33.83 \scriptsize{± 2.87}                                  & 50.00 \scriptsize{± 0.00} & 41.00 \scriptsize{± 2.55}                                 & \cellcolor[HTML]{E3EFDA}\textbf{87.83 \scriptsize{± 6.54}}  \\
                                                     & \multirow{-2}{*}{{\large CoinFlip}}   & Time (s) & 1.68 \scriptsize{± 0.00}                          & 4.32 \scriptsize{± 0.04}                                   & 2.30 \scriptsize{± 0.00}                                   & 3.74 \scriptsize{± 0.02}  & \cellcolor[HTML]{D1E6F6}1.39 \scriptsize{± 0.00}          & \cellcolor[HTML]{E3EFDA}\textbf{1.37 \scriptsize{± 0.01}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 14.50 \scriptsize{± 0.00}                         & \cellcolor[HTML]{D1E6F6}80.17 \scriptsize{± 16.41}         & 21.17 \scriptsize{± 2.05}                                  & 63.50 \scriptsize{± 0.00} & 62.67 \scriptsize{± 1.18}                                 & \cellcolor[HTML]{E3EFDA}\textbf{81.67 \scriptsize{± 2.78}}  \\
                                                     & \multirow{-2}{*}{{\large Common}}     & Time (s) & 1.67 \scriptsize{± 0.00}                          & 4.14 \scriptsize{± 0.00}                                   & 2.29 \scriptsize{± 0.00}                                   & 3.71 \scriptsize{± 0.03}  & \cellcolor[HTML]{D1E6F6}1.37 \scriptsize{± 0.00}          & \cellcolor[HTML]{E3EFDA}\textbf{1.35 \scriptsize{± 0.03}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 9.67 \scriptsize{± 0.24}                          & 4.33 \scriptsize{± 3.47}                                   & \cellcolor[HTML]{E3EFDA}\textbf{14.67 \scriptsize{± 2.09}} & 2.33 \scriptsize{± 1.65}  & 2.50 \scriptsize{± 0.41}                                  & \cellcolor[HTML]{D1E6F6}14.33 \scriptsize{± 0.62}           \\
                                                     & \multirow{-2}{*}{{\large GSM8K}}      & Time (s) & \cellcolor[HTML]{D1E6F6}1.68 \scriptsize{± 0.01}  & 4.96 \scriptsize{± 0.03}                                   & 1.72 \scriptsize{± 0.08}                                   & 3.46 \scriptsize{± 0.02}  & \cellcolor[HTML]{E3EFDA}\textbf{1.39 \scriptsize{± 0.00}} & \cellcolor[HTML]{E3EFDA}\textbf{1.39 \scriptsize{± 0.00}}   \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & 2.43 \scriptsize{± 0.52}                          & \cellcolor[HTML]{E3EFDA}\textbf{19.47 \scriptsize{± 9.32}} & 3.67 \scriptsize{± 0.52}                                   & 1.47 \scriptsize{± 1.04}  & 3.33 \scriptsize{± 0.45}                                  & \cellcolor[HTML]{D1E6F6}19.07 \scriptsize{± 2.23}           \\
                                                     & \multirow{-2}{*}{{\large MultiArith}} & Time (s) & 1.68 \scriptsize{± 0.01}                          & 4.45 \scriptsize{± 0.62}                                   & 2.32 \scriptsize{± 0.01}                                   & 3.64 \scriptsize{± 0.13}  & \cellcolor[HTML]{E3EFDA}\textbf{1.37 \scriptsize{± 0.00}} & \cellcolor[HTML]{D1E6F6}1.50 \scriptsize{± 0.16}            \\ \cline{2-9} 
                                                     &                                                        & Acc (\%) & \cellcolor[HTML]{D1E6F6}34.00 \scriptsize{± 0.82} & 14.83 \scriptsize{± 9.10}                                  & 32.50 \scriptsize{± 3.54}                                  & 7.50 \scriptsize{± 5.31}  & 10.67 \scriptsize{± 1.03}                                 & \cellcolor[HTML]{E3EFDA}\textbf{46.17 \scriptsize{± 1.25}}  \\
\multirow{-10}{*}{{\large Mistral}} & \multirow{-2}{*}{{\large SVAMP}}      & Time (s) & 1.69 \scriptsize{± 0.00}                          & 4.08 \scriptsize{± 0.03}                                   & \cellcolor[HTML]{E3EFDA}\textbf{0.90 \scriptsize{± 0.19}}  & 3.44 \scriptsize{± 0.01}  & 1.38 \scriptsize{± 0.00}                                  & \cellcolor[HTML]{D1E6F6}1.37 \scriptsize{± 0.01}            \\