## Language

In [3]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np
from preprocessing import loadDataset, createPrompts
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

pd.set_option('display.max_columns', None)
random.seed(42)

args = {
    'dataset': 'GERestaurant',
    'data_path': '../data',
    'lang': 'en',
    'split': 0
}

args = SimpleNamespace(**args)

RESULTS_PATH = '../results/ft_llm/'
N_SAMPLES = 1000

col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

def fetchFolders(args):

    if args.lr_setting == 0:
        lr_setting = 'full'
    else:
        lr_setting = str(args.lr_setting)
    
    results_sub = results_all[np.logical_and.reduce([results_all['dataset'] == args.dataset, 
                                                     results_all['task'] == args.task,
                                                     results_all['split'] == str(args.split),
                                                     results_all['lr_setting'] == lr_setting,
                                                     results_all['prompt'] == args.prompt_style])].sort_values(by = ['f1-micro'], ascending = False)
    
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]
    print(len(results_per_epoch))
    
    display(results_per_epoch.sort_values(by = ['f1-micro'], ascending = False).head(50))
    
    return results_per_epoch.apply(lambda x: x['path'], axis = 1)

def read_predictions(model_config):
    with open(f'../results/ft_llm/{model_config}/predictions.txt', 'r') as file:
        lines = file.readlines()
    
    # Funktion zur Dekodierung der escaped UTF-8 Zeichen
    def decode_escaped_utf8(text):
        # Entferne doppelte Backslashes und dekodiere den Text als UTF-8
        text = text.encode('latin1').decode('unicode_escape').encode('latin1').decode('utf-8')
        return text
    
    # Dekodieren der Zeilen
    decoded_lines = [decode_escaped_utf8(line.strip()) for line in lines]
    return decoded_lines
    
def bootstrap_sampling(gold, pred, n_samples=N_SAMPLES):
    bootstrap_samples = []
    for i in range(n_samples):
        # Ziehe eine Bootstrap-Stichprobe mit Zurücklegen aus dem Test-Set
        indices = np.random.choice(len(gold), len(gold), replace=True)
        bootstrap_samples.append([[gold[index] for index in indices], [pred[index] for index in indices]])
    return bootstrap_samples

def compute_f1_scores(ground_truth, predictions, args, n_samples=N_SAMPLES):

    predictions = [extractAspects(pred, args.task, args.prompt_style == 'cot', True) for pred in predictions]
    pred_labels, false_predictions = convertLabels(predictions, args.task, args.LABEL_SPACE)
    ground_truth = [extractAspects(gt, args.task, args.prompt_style == 'cot', False) for gt in ground_truth]
    gold_labels, _ = convertLabels(ground_truth, args.task, args.LABEL_SPACE)

    # combined = list(zip(gold_labels, pred_labels))
    bootstrap_samples = bootstrap_sampling(gold_labels, pred_labels, n_samples)
    f1_scores = []
    
    results_asp, results_asp_pol, results_pairs, results_pol, results_phrases = createResults(pred_labels, gold_labels, args.LABEL_SPACE, args.task)
    
    if args.task == 'acd':
        print(results_asp['Micro-AVG']['f1'])
    elif args.task == 'acsa':
        print(results_asp_pol['Micro-AVG']['f1'])
    elif args.task == 'e2e' or args.task == 'e2e-e':
        print(results_pol['Micro-AVG']['f1'])
    else:
        print(results_phrases['Micro-AVG']['f1'])
    
    for sample in bootstrap_samples:
        # gt_sample, pred_sample = zip(*sample)
        results_asp, results_asp_pol, results_pairs, results_pol, results_phrases = createResults(sample[1], sample[0], args.LABEL_SPACE, args.task)
        if args.task == 'acd':
            f1_scores.append(results_asp['Micro-AVG']['f1'])
        elif args.task == 'acsa':
            f1_scores.append(results_asp_pol['Micro-AVG']['f1'])
        elif args.task == 'e2e' or args.task == 'e2e-e':
            f1_scores.append(results_pol['Micro-AVG']['f1'])
        else:
            f1_scores.append(results_phrases['Micro-AVG']['f1'])
    
    return f1_scores

def computeStatistics(args):
    model_folders = fetchFolders(args)

    df_train, df_test, args.LABEL_SPACE = loadDataset(args.data_path, args.dataset, args.lr_setting, args.task, args.split)
    prompts_train, prompts_test, ground_truth = createPrompts(df_train, df_test, args)
    
    f1_scores = {}
    df_f1_scores = []
    
    for config in model_folders:
    
        predictions = read_predictions(config)
        
        scores = compute_f1_scores(ground_truth, predictions, args)
        f1_scores[config] = scores
        # print(pg.normality(scores))
        df_f1_scores.append(pd.DataFrame({'f1':scores, 'config': '_'.join(config.split('_')[3:6])}))

    df_f1_scores = pd.concat(df_f1_scores)
    
    # Kruskal-Wallis-Test durchführen
    results_kruskal = kruskal(data=df_f1_scores, dv='f1', between='config')
    print(results_kruskal)
    
    f1_scores = pd.DataFrame(f1_scores)
    
    # Wenn der Kruskal-Wallis-Test signifikant ist, führen wir die paarweisen Vergleiche durch
    if results_kruskal['p-unc'].iloc[0] < 0.05:

        comb = combinations(model_folders, 2)
        # Paarweise Vergleiche
        pairwise_comparisons = []
        
        for config_i, config_j in comb:
            # Bootstrapping-Vergleich
            ci_lower_1 = np.percentile(f1_scores[config_i], 2.5)
            ci_upper_1 = np.percentile(f1_scores[config_i], 97.5)
            ci_lower_2 = np.percentile(f1_scores[config_j], 2.5)
            ci_upper_2 = np.percentile(f1_scores[config_j], 97.5)

            ci_overlap = str(not (ci_lower_1 > ci_upper_2))
            
            # Mann-Whitney-U-Test
            mwu_gr = pg.mwu(f1_scores[config_i], f1_scores[config_j], alternative='two-sided')
            
            # Ergebnisse speichern
            pairwise_comparisons.append({
                'Model 1': '_'.join(config_i.split('_')[3:6]),
                'Model 2': '_'.join(config_j.split('_')[3:6]),
                "Mean Model 1": np.mean(f1_scores[config_i]),
                "Mean Model 2": np.mean(f1_scores[config_j]),
                'Model 1 CI Lower': ci_lower_1,
                'Model 1 CI Upper': ci_upper_1,
                'Model 2 CI Lower': ci_lower_2,
                'Model 2 CI Upper': ci_upper_2,
                'CI Overlap': ci_overlap,
                'U Statistic (Model1 > Model2)': mwu_gr['U-val']['MWU'],
                'P-Value (Model1 > Model2)': mwu_gr['p-val']['MWU'].round(6)
            })
    
        # Ergebnisse in einem DataFrame anzeigen
        pairwise_comparisons_df = pd.DataFrame(pairwise_comparisons)
        
        # Holm-Korrektur für p-Werte
        p_values = pairwise_comparisons_df['P-Value (Model1 > Model2)']
    
        corrected_p_values = pg.multicomp(p_values, method='holm')
    
        # Korrigierte p-Werte in den DataFrame einfügen
        pairwise_comparisons_df['Corrected P-Value (Model1 > Model2)'] = corrected_p_values[1]
        pairwise_comparisons_df['Significant (Model1 > Model2)'] = corrected_p_values[0]
    
        # Ergebnisse anzeigen
        print("Paarweise Vergleiche mit Holm-Korrektur:")
        display(pairwise_comparisons_df)
        pairwise_comparisons_df.to_csv(f'statistics/{args.task}_{args.dataset}_{args.prompt_style}_{args.lr_setting}.tsv', sep = '\t')
        
    else:
        print("Kruskal-Wallis-Test war nicht signifikant, keine weiteren Tests durchgeführt.")



# GERestaurant

In [13]:
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e' or cond_parameters[0] == 'e2e-e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all

## ACD

In [68]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'acd',
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
89.05, 0.97
('1000', 'context')
89.31, 1.09
('500', 'basic')
88.34, 1.05
('500', 'context')
88.37, 1.24
('full', 'basic')
88.49, 1.82
('full', 'context')
87.80, 0.55
('orig', 'basic')
88.43, 0.00
('orig', 'context')
87.67, 0.00


### Full Dataset
### basic Prompt

In [234]:
args.lr_setting = 0
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1213,en,GERestaurant,acd,short,3e-05,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__short_acd_3e-05_32_64_0.05_4_...,en_GERestaurant__short_acd_3e-05_32_64_0.05_4_...,0.9009,0.8931,0.8196
493,en,GERestaurant,acd,short,3e-05,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__short_acd_3e-05_32_32_0.05_4_...,en_GERestaurant__short_acd_3e-05_32_32_0.05_4_...,0.8983,0.8934,0.8154
2679,en,GERestaurant,acd,short,3e-05,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__short_acd_3e-05_8_16_0.05_4_f...,en_GERestaurant__short_acd_3e-05_8_16_0.05_4_0...,0.8976,0.8917,0.8142
1380,en,GERestaurant,acd,short,3e-05,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__short_acd_3e-05_8_8_0.05_4_fu...,en_GERestaurant__short_acd_3e-05_8_8_0.05_4_0_...,0.8942,0.8949,0.8087
4019,en,GERestaurant,acd,short,0.0003,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__short_acd_0.0003_32_32_0.05_4...,en_GERestaurant__short_acd_0.0003_32_32_0.05_4...,0.8913,0.8743,0.8039
2963,en,GERestaurant,acd,short,0.0003,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__short_acd_0.0003_8_8_0.05_4_f...,en_GERestaurant__short_acd_0.0003_8_8_0.05_4_0...,0.8861,0.8785,0.7954
1098,en,GERestaurant,acd,short,0.0003,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__short_acd_0.0003_8_16_0.05_4_...,en_GERestaurant__short_acd_0.0003_8_16_0.05_4_...,0.8728,0.8643,0.7743
1020,en,GERestaurant,acd,short,0.0003,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__short_acd_0.0003_32_64_0.05_4...,en_GERestaurant__short_acd_0.0003_32_64_0.05_4...,0.8416,0.8396,0.7265


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.8913
0.8416
0.8728
0.8861
0.8983
0.9009
0.8976
0.8942
         Source  ddof1            H  p-unc
Kruskal  config      7  3551.862874    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.891052,0.842051,0.8661,0.913907,0.803188,0.875408,True,989217.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.891052,0.873573,0.8661,0.913907,0.84058,0.9041,True,794970.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.891052,0.886092,0.8661,0.913907,0.845897,0.9179,True,572629.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.891052,0.898206,0.8661,0.913907,0.858585,0.928902,True,357032.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.891052,0.901061,0.8661,0.913907,0.878698,0.9231,True,278513.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.891052,0.897904,0.8661,0.913907,0.861588,0.927902,True,363287.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.891052,0.894947,0.8661,0.913907,0.8574,0.925,True,416598.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.842051,0.873573,0.803188,0.875408,0.84058,0.9041,True,101971.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.842051,0.886092,0.803188,0.875408,0.845897,0.9179,True,47949.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.842051,0.898206,0.803188,0.875408,0.858585,0.928902,True,16794.0,0.0,0.0,True


### Full Dataset
### context Prompt

In [70]:
args.lr_setting = 0
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1512,GERestaurant,acd,context,0.0003,8,16,0.05,0,full,10,acd_GERestaurant_context_0.0003_8_16_0.05_full,acd_GERestaurant_context_0.0003_8_16_0.05_0_fu...,0.8828,0.8698,0.7901
1938,GERestaurant,acd,context,0.0003,32,32,0.05,0,full,10,acd_GERestaurant_context_0.0003_32_32_0.05_full,acd_GERestaurant_context_0.0003_32_32_0.05_0_f...,0.882,0.872,0.7889
5812,GERestaurant,acd,context,3e-05,32,32,0.05,0,full,10,acd_GERestaurant_context_3e-05_32_32_0.05_full,acd_GERestaurant_context_3e-05_32_32_0.05_0_fu...,0.8817,0.8798,0.7885
1190,GERestaurant,acd,context,3e-05,8,8,0.05,0,full,5,acd_GERestaurant_context_3e-05_8_8_0.05_full,acd_GERestaurant_context_3e-05_8_8_0.05_0_full_5,0.8805,0.8769,0.7865
2374,GERestaurant,acd,context,0.0003,8,8,0.05,0,full,8,acd_GERestaurant_context_0.0003_8_8_0.05_full,acd_GERestaurant_context_0.0003_8_8_0.05_0_full_8,0.8789,0.8695,0.784
1606,GERestaurant,acd,context,3e-05,8,16,0.05,0,full,7,acd_GERestaurant_context_3e-05_8_16_0.05_full,acd_GERestaurant_context_3e-05_8_16_0.05_0_full_7,0.8781,0.8682,0.7827
6107,GERestaurant,acd,context,3e-05,32,64,0.05,0,full,4,acd_GERestaurant_context_3e-05_32_64_0.05_full,acd_GERestaurant_context_3e-05_32_64_0.05_0_fu...,0.8746,0.8727,0.7772
585,GERestaurant,acd,context,0.0003,32,64,0.05,0,full,8,acd_GERestaurant_context_0.0003_32_64_0.05_full,acd_GERestaurant_context_0.0003_32_64_0.05_0_f...,0.865,0.8603,0.7621


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.882
0.865
0.8828
0.8789
0.8817
0.8746
0.8781
0.8805
         Source  ddof1           H          p-unc
Kruskal  config      7  831.368751  3.151491e-175
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.881711,0.864654,0.855897,0.905203,0.836097,0.891402,True,816031.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.881711,0.882642,0.855897,0.905203,0.843045,0.9141,True,465823.5,0.00813,0.0813,False
2,0.0003_32_32,0.0003_8_8,0.881711,0.879669,0.855897,0.905203,0.841985,0.9108,True,522482.0,0.081687,0.412636,False
3,0.0003_32_32,3e-05_32_32,0.881711,0.881063,0.855897,0.905203,0.840595,0.910403,True,492587.0,0.565948,1.0,False
4,0.0003_32_32,3e-05_32_64,0.881711,0.873463,0.855897,0.905203,0.834288,0.906903,True,630262.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.881711,0.877992,0.855897,0.905203,0.839298,0.909403,True,557901.5,7e-06,9.8e-05,True
6,0.0003_32_32,3e-05_8_8,0.881711,0.880826,0.855897,0.905203,0.844785,0.914212,True,507667.5,0.552688,1.0,False
7,0.0003_32_64,0.0003_8_16,0.864654,0.882642,0.836097,0.891402,0.843045,0.9141,True,211667.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.864654,0.879669,0.836097,0.891402,0.841985,0.9108,True,251544.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.864654,0.881063,0.836097,0.891402,0.840595,0.910403,True,230166.5,0.0,0.0,True


### 1000
### basic Prompt

In [75]:
args.lr_setting = 1000
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
4159,GERestaurant,acd,basic,3e-05,32,64,0.05,0,1000,9,acd_GERestaurant_basic_3e-05_32_64_0.05_1000,acd_GERestaurant_basic_3e-05_32_64_0.05_0_1000_9,0.9013,0.9096,0.8204
6236,GERestaurant,acd,basic,3e-05,32,32,0.05,0,1000,4,acd_GERestaurant_basic_3e-05_32_32_0.05_1000,acd_GERestaurant_basic_3e-05_32_32_0.05_0_1000_4,0.9,0.8972,0.8182
5469,GERestaurant,acd,basic,3e-05,8,16,0.05,0,1000,10,acd_GERestaurant_basic_3e-05_8_16_0.05_1000,acd_GERestaurant_basic_3e-05_8_16_0.05_0_1000_10,0.897,0.8819,0.8132
3235,GERestaurant,acd,basic,0.0003,8,8,0.05,0,1000,5,acd_GERestaurant_basic_0.0003_8_8_0.05_1000,acd_GERestaurant_basic_0.0003_8_8_0.05_0_1000_5,0.8936,0.8837,0.8077
5733,GERestaurant,acd,basic,0.0003,8,16,0.05,0,1000,6,acd_GERestaurant_basic_0.0003_8_16_0.05_1000,acd_GERestaurant_basic_0.0003_8_16_0.05_0_1000_6,0.8928,0.8723,0.8063
1082,GERestaurant,acd,basic,3e-05,8,8,0.05,0,1000,8,acd_GERestaurant_basic_3e-05_8_8_0.05_1000,acd_GERestaurant_basic_3e-05_8_8_0.05_0_1000_8,0.8889,0.892,0.8
102,GERestaurant,acd,basic,0.0003,32,32,0.05,0,1000,4,acd_GERestaurant_basic_0.0003_32_32_0.05_1000,acd_GERestaurant_basic_0.0003_32_32_0.05_0_1000_4,0.8759,0.8713,0.7792
5601,GERestaurant,acd,basic,0.0003,32,64,0.05,0,1000,7,acd_GERestaurant_basic_0.0003_32_64_0.05_1000,acd_GERestaurant_basic_0.0003_32_64_0.05_0_1000_7,0.8742,0.8539,0.7765


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8759
0.8742
0.8928
0.8936
0.9
0.9013
0.897
0.8889
         Source  ddof1            H  p-unc
Kruskal  config      7  1859.489748    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.876452,0.874038,0.840583,0.911605,0.835888,0.9107,True,534784.0,0.007067,0.014134,True
1,0.0003_32_32,0.0003_8_16,0.876452,0.892389,0.840583,0.911605,0.8584,0.925403,True,266769.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.876452,0.893642,0.840583,0.911605,0.859393,0.925303,True,246445.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.876452,0.90017,0.840583,0.911605,0.86979,0.928105,True,161213.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.876452,0.902342,0.840583,0.911605,0.864483,0.934312,True,153575.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.876452,0.89675,0.840583,0.911605,0.8638,0.930403,True,206312.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.876452,0.889727,0.840583,0.911605,0.856485,0.920203,True,296418.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.874038,0.892389,0.835888,0.9107,0.8584,0.925403,True,236631.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.874038,0.893642,0.835888,0.9107,0.859393,0.925303,True,217082.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.874038,0.90017,0.835888,0.9107,0.86979,0.928105,True,137212.5,0.0,0.0,True


### 1000
### context Prompt

In [76]:
args.lr_setting = 1000
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6069,GERestaurant,acd,context,0.0003,8,8,0.05,0,1000,4,acd_GERestaurant_context_0.0003_8_8_0.05_1000,acd_GERestaurant_context_0.0003_8_8_0.05_0_1000_4,0.9095,0.9012,0.834
2025,GERestaurant,acd,context,3e-05,32,64,0.05,0,1000,8,acd_GERestaurant_context_3e-05_32_64_0.05_1000,acd_GERestaurant_context_3e-05_32_64_0.05_0_10...,0.9004,0.8936,0.8189
1137,GERestaurant,acd,context,3e-05,8,8,0.05,0,1000,5,acd_GERestaurant_context_3e-05_8_8_0.05_1000,acd_GERestaurant_context_3e-05_8_8_0.05_0_1000_5,0.8967,0.8909,0.8127
1662,GERestaurant,acd,context,3e-05,32,32,0.05,0,1000,9,acd_GERestaurant_context_3e-05_32_32_0.05_1000,acd_GERestaurant_context_3e-05_32_32_0.05_0_10...,0.8957,0.8881,0.811
2559,GERestaurant,acd,context,3e-05,8,16,0.05,0,1000,10,acd_GERestaurant_context_3e-05_8_16_0.05_1000,acd_GERestaurant_context_3e-05_8_16_0.05_0_100...,0.8952,0.8755,0.8103
3915,GERestaurant,acd,context,0.0003,32,32,0.05,0,1000,7,acd_GERestaurant_context_0.0003_32_32_0.05_1000,acd_GERestaurant_context_0.0003_32_32_0.05_0_1...,0.8936,0.8684,0.8077
6822,GERestaurant,acd,context,0.0003,8,16,0.05,0,1000,4,acd_GERestaurant_context_0.0003_8_16_0.05_1000,acd_GERestaurant_context_0.0003_8_16_0.05_0_10...,0.8834,0.8747,0.7912
3543,GERestaurant,acd,context,0.0003,32,64,0.05,0,1000,7,acd_GERestaurant_context_0.0003_32_64_0.05_1000,acd_GERestaurant_context_0.0003_32_64_0.05_0_1...,0.8706,0.8575,0.7708


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8936
0.8705
0.8834
0.9095
0.8957
0.9004
0.8952
0.8967
         Source  ddof1            H  p-unc
Kruskal  config      7  2231.868632    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.893608,0.870613,0.858398,0.928105,0.833292,0.905415,True,817260.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.893608,0.884237,0.858398,0.928105,0.849293,0.917405,True,652337.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.893608,0.910563,0.858398,0.928105,0.881295,0.93751,True,229278.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.893608,0.895537,0.858398,0.928105,0.8596,0.9268,True,462542.0,0.003723,0.014892,True
4,0.0003_32_32,3e-05_32_64,0.893608,0.900488,0.858398,0.928105,0.867797,0.932207,True,385055.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.893608,0.895193,0.858398,0.928105,0.86269,0.927203,True,472284.5,0.031851,0.063702,False
6,0.0003_32_32,3e-05_8_8,0.893608,0.897888,0.858398,0.928105,0.867988,0.926805,True,428226.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.870613,0.884237,0.833292,0.905415,0.849293,0.917405,True,297228.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.870613,0.910563,0.833292,0.905415,0.881295,0.93751,True,47164.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.870613,0.895537,0.833292,0.905415,0.8596,0.9268,True,165833.5,0.0,0.0,True


### 500
### basic Prompt

In [73]:
args.lr_setting = 500
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
217,GERestaurant,acd,basic,3e-05,32,32,0.05,0,500,7,acd_GERestaurant_basic_3e-05_32_32_0.05_500,acd_GERestaurant_basic_3e-05_32_32_0.05_0_500_7,0.8991,0.903,0.8168
2627,GERestaurant,acd,basic,0.0003,8,8,0.05,0,500,10,acd_GERestaurant_basic_0.0003_8_8_0.05_500,acd_GERestaurant_basic_0.0003_8_8_0.05_0_500_10,0.8907,0.8801,0.803
1393,GERestaurant,acd,basic,3e-05,32,64,0.05,0,500,9,acd_GERestaurant_basic_3e-05_32_64_0.05_500,acd_GERestaurant_basic_3e-05_32_64_0.05_0_500_9,0.8898,0.8827,0.8015
6254,GERestaurant,acd,basic,3e-05,8,16,0.05,0,500,8,acd_GERestaurant_basic_3e-05_8_16_0.05_500,acd_GERestaurant_basic_3e-05_8_16_0.05_0_500_8,0.8851,0.8774,0.7939
652,GERestaurant,acd,basic,0.0003,8,16,0.05,0,500,9,acd_GERestaurant_basic_0.0003_8_16_0.05_500,acd_GERestaurant_basic_0.0003_8_16_0.05_0_500_9,0.8813,0.8678,0.7879
1295,GERestaurant,acd,basic,3e-05,8,8,0.05,0,500,8,acd_GERestaurant_basic_3e-05_8_8_0.05_500,acd_GERestaurant_basic_3e-05_8_8_0.05_0_500_8,0.8813,0.8582,0.7879
547,GERestaurant,acd,basic,0.0003,32,64,0.05,0,500,9,acd_GERestaurant_basic_0.0003_32_64_0.05_500,acd_GERestaurant_basic_0.0003_32_64_0.05_0_500_9,0.8787,0.8446,0.7836
6217,GERestaurant,acd,basic,0.0003,32,32,0.05,0,500,8,acd_GERestaurant_basic_0.0003_32_32_0.05_500,acd_GERestaurant_basic_0.0003_32_32_0.05_0_500_8,0.861,0.8357,0.7559


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.861
0.8787
0.8814
0.8908
0.8992
0.8898
0.8851
0.8814
         Source  ddof1            H          p-unc
Kruskal  config      7  1095.813481  2.367549e-232
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.861719,0.880746,0.811993,0.9091,0.835188,0.923702,True,293090.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.861719,0.880827,0.811993,0.9091,0.831997,0.927025,True,297391.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.861719,0.890347,0.811993,0.9091,0.842588,0.9378,True,210870.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.861719,0.899353,0.811993,0.9091,0.848988,0.945518,True,150252.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.861719,0.891875,0.811993,0.9091,0.841588,0.938113,True,200471.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.861719,0.885518,0.811993,0.9091,0.831882,0.9372,True,261649.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.861719,0.882858,0.811993,0.9091,0.829295,0.931515,True,275869.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.880746,0.880827,0.835188,0.923702,0.831997,0.927025,True,496307.5,0.774945,0.774945,False
8,0.0003_32_64,0.0003_8_8,0.880746,0.890347,0.835188,0.923702,0.842588,0.9378,True,386501.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.880746,0.899353,0.835188,0.923702,0.848988,0.945518,True,292224.5,0.0,0.0,True


### 500
### context Prompt

In [77]:
args.lr_setting = 500
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
67,GERestaurant,acd,context,3e-05,32,32,0.05,0,500,2,acd_GERestaurant_context_3e-05_32_32_0.05_500,acd_GERestaurant_context_3e-05_32_32_0.05_0_500_2,0.9,0.8983,0.8182
3980,GERestaurant,acd,context,3e-05,32,64,0.05,0,500,6,acd_GERestaurant_context_3e-05_32_64_0.05_500,acd_GERestaurant_context_3e-05_32_64_0.05_0_500_6,0.8974,0.899,0.814
1060,GERestaurant,acd,context,3e-05,8,16,0.05,0,500,4,acd_GERestaurant_context_3e-05_8_16_0.05_500,acd_GERestaurant_context_3e-05_8_16_0.05_0_500_4,0.8889,0.8759,0.8
6189,GERestaurant,acd,context,3e-05,8,8,0.05,0,500,7,acd_GERestaurant_context_3e-05_8_8_0.05_500,acd_GERestaurant_context_3e-05_8_8_0.05_0_500_7,0.8841,0.8751,0.7923
4065,GERestaurant,acd,context,0.0003,8,16,0.05,0,500,6,acd_GERestaurant_context_0.0003_8_16_0.05_500,acd_GERestaurant_context_0.0003_8_16_0.05_0_500_6,0.8839,0.8776,0.792
2185,GERestaurant,acd,context,0.0003,8,8,0.05,0,500,5,acd_GERestaurant_context_0.0003_8_8_0.05_500,acd_GERestaurant_context_0.0003_8_8_0.05_0_500_5,0.8839,0.8862,0.792
35,GERestaurant,acd,context,0.0003,32,32,0.05,0,500,7,acd_GERestaurant_context_0.0003_32_32_0.05_500,acd_GERestaurant_context_0.0003_32_32_0.05_0_5...,0.8728,0.8745,0.7744
724,GERestaurant,acd,context,0.0003,32,64,0.05,0,500,3,acd_GERestaurant_context_0.0003_32_64_0.05_500,acd_GERestaurant_context_0.0003_32_64_0.05_0_5...,0.8585,0.8428,0.752


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8729
0.8584
0.8839
0.8839
0.9
0.8974
0.8889
0.8841
         Source  ddof1            H          p-unc
Kruskal  config      7  1260.915486  4.731160e-268
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.872548,0.857439,0.815078,0.925607,0.776763,0.917907,True,618793.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.872548,0.882795,0.815078,0.925607,0.83189,0.932715,True,398717.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.872548,0.884431,0.815078,0.925607,0.82729,0.938107,True,381606.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.872548,0.900112,0.815078,0.925607,0.855855,0.941213,True,221119.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.872548,0.897363,0.815078,0.925607,0.852187,0.9406,True,245634.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.872548,0.888994,0.815078,0.925607,0.8326,0.9367,True,333425.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.872548,0.88592,0.815078,0.925607,0.831882,0.937207,True,366861.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.857439,0.882795,0.776763,0.917907,0.83189,0.932715,True,294813.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.857439,0.884431,0.776763,0.917907,0.82729,0.938107,True,283314.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.857439,0.900112,0.776763,0.917907,0.855855,0.941213,True,155533.5,0.0,0.0,True


## ACSA

In [78]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'acsa',
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
85.98, 3.59
('1000', 'context')
87.43, 1.55
('1000', 'cot')
86.26, 1.97
('500', 'basic')
87.90, 0.83
('500', 'context')
87.16, 1.59
('500', 'cot')
85.73, 1.99
('full', 'basic')
85.42, 2.23
('full', 'context')
85.64, 1.71
('full', 'cot')
83.85, 1.89
('orig', 'basic')
84.43, 0.00
('orig', 'context')
84.70, 0.00
('orig', 'cot')
76.66, 0.00


### Full Dataset
### basic Prompt

In [241]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1289,en,GERestaurant,acsa,short,3e-05,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__short_acsa_3e-05_8_16_0.05_4_...,en_GERestaurant__short_acsa_3e-05_8_16_0.05_4_...,0.8942,0.8778,0.8087
1673,en,GERestaurant,acsa,short,0.0003,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__short_acsa_0.0003_8_8_0.05_4_...,en_GERestaurant__short_acsa_0.0003_8_8_0.05_4_...,0.8814,0.8627,0.7879
1933,en,GERestaurant,acsa,short,3e-05,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__short_acsa_3e-05_32_64_0.05_4...,en_GERestaurant__short_acsa_3e-05_32_64_0.05_4...,0.879,0.8764,0.7841
650,en,GERestaurant,acsa,short,3e-05,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__short_acsa_3e-05_32_32_0.05_4...,en_GERestaurant__short_acsa_3e-05_32_32_0.05_4...,0.87,0.8575,0.7699
3049,en,GERestaurant,acsa,short,0.0003,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,4,en_GERestaurant__short_acsa_0.0003_8_16_0.05_4...,en_GERestaurant__short_acsa_0.0003_8_16_0.05_4...,0.8552,0.8522,0.747
892,en,GERestaurant,acsa,short,3e-05,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__short_acsa_3e-05_8_8_0.05_4_f...,en_GERestaurant__short_acsa_3e-05_8_8_0.05_4_0...,0.8412,0.8307,0.726
1498,en,GERestaurant,acsa,short,0.0003,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__short_acsa_0.0003_32_32_0.05_...,en_GERestaurant__short_acsa_0.0003_32_32_0.05_...,0.834,0.8339,0.7152
1920,en,GERestaurant,acsa,short,0.0003,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__short_acsa_0.0003_32_64_0.05_...,en_GERestaurant__short_acsa_0.0003_32_64_0.05_...,0.8123,0.79,0.684


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.834
0.8123
0.8552
0.8814
0.87
0.879
0.8942
0.8412
         Source  ddof1            H  p-unc
Kruskal  config      7  5212.568678    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.834031,0.81279,0.786687,0.8744,0.783095,0.841703,True,781197.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.834031,0.856544,0.786687,0.8744,0.810397,0.895602,True,239740.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.834031,0.881411,0.786687,0.8744,0.856877,0.906605,True,25354.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.834031,0.870319,0.786687,0.8744,0.842288,0.893903,True,80139.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.834031,0.878851,0.786687,0.8744,0.853797,0.903702,True,34395.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.834031,0.893666,0.786687,0.8744,0.867695,0.918303,True,5892.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.834031,0.843198,0.786687,0.8744,0.780377,0.8969,True,403629.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.81279,0.856544,0.783095,0.841703,0.810397,0.895602,True,56637.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.81279,0.881411,0.783095,0.841703,0.856877,0.906605,True,236.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.81279,0.870319,0.783095,0.841703,0.842288,0.893903,True,2230.0,0.0,0.0,True


### Full Dataset
### context Prompt

In [80]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5131,GERestaurant,acsa,context,3e-05,32,32,0.05,0,full,9,acsa_GERestaurant_context_3e-05_32_32_0.05_full,acsa_GERestaurant_context_3e-05_32_32_0.05_0_f...,0.873,0.8623,0.7746
3926,GERestaurant,acsa,context,3e-05,32,64,0.05,0,full,10,acsa_GERestaurant_context_3e-05_32_64_0.05_full,acsa_GERestaurant_context_3e-05_32_64_0.05_0_f...,0.8729,0.8716,0.7745
3810,GERestaurant,acsa,context,0.0003,8,16,0.05,0,full,7,acsa_GERestaurant_context_0.0003_8_16_0.05_full,acsa_GERestaurant_context_0.0003_8_16_0.05_0_f...,0.8626,0.8408,0.7583
4823,GERestaurant,acsa,context,0.0003,32,32,0.05,0,full,5,acsa_GERestaurant_context_0.0003_32_32_0.05_full,acsa_GERestaurant_context_0.0003_32_32_0.05_0_...,0.8615,0.8558,0.7567
408,GERestaurant,acsa,context,0.0003,8,8,0.05,0,full,10,acsa_GERestaurant_context_0.0003_8_8_0.05_full,acsa_GERestaurant_context_0.0003_8_8_0.05_0_fu...,0.8599,0.8592,0.7542
5386,GERestaurant,acsa,context,3e-05,8,16,0.05,0,full,5,acsa_GERestaurant_context_3e-05_8_16_0.05_full,acsa_GERestaurant_context_3e-05_8_16_0.05_0_fu...,0.8552,0.8602,0.7471
1947,GERestaurant,acsa,context,3e-05,8,8,0.05,0,full,10,acsa_GERestaurant_context_3e-05_8_8_0.05_full,acsa_GERestaurant_context_3e-05_8_8_0.05_0_ful...,0.8504,0.8547,0.7398
2853,GERestaurant,acsa,context,0.0003,32,64,0.05,0,full,2,acsa_GERestaurant_context_0.0003_32_64_0.05_full,acsa_GERestaurant_context_0.0003_32_64_0.05_0_...,0.8153,0.797,0.6883


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.8615
0.8154
0.8625
0.8599
0.873
0.8729
0.8552
0.8504
         Source  ddof1            H  p-unc
Kruskal  config      7  3049.223387    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.86196,0.815228,0.824392,0.8943,0.781395,0.848117,True,968114.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.86196,0.863935,0.824392,0.8943,0.834288,0.8927,True,475597.0,0.058792,0.117584,False
2,0.0003_32_32,0.0003_8_8,0.86196,0.860468,0.824392,0.8943,0.815392,0.9007,True,513847.5,0.283577,0.283577,False
3,0.0003_32_32,3e-05_32_32,0.86196,0.873793,0.824392,0.8943,0.8484,0.8989,True,305229.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.86196,0.872217,0.824392,0.8943,0.844097,0.899505,True,337013.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.86196,0.856786,0.824392,0.8943,0.813065,0.895107,True,577098.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.86196,0.851131,0.824392,0.8943,0.806285,0.887415,True,648832.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.815228,0.863935,0.781395,0.848117,0.834288,0.8927,True,13965.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.815228,0.860468,0.781395,0.848117,0.815392,0.9007,True,62346.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.815228,0.873793,0.781395,0.848117,0.8484,0.8989,True,2468.0,0.0,0.0,True


### Full Dataset
### CoT Prompt

In [82]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2893,GERestaurant,acsa,cot,0.0003,8,16,0.05,0,full,7,acsa_GERestaurant_cot_0.0003_8_16_0.05_full,acsa_GERestaurant_cot_0.0003_8_16_0.05_0_full_7,0.8569,0.8439,0.7496
5879,GERestaurant,acsa,cot,0.0003,8,8,0.05,0,full,4,acsa_GERestaurant_cot_0.0003_8_8_0.05_full,acsa_GERestaurant_cot_0.0003_8_8_0.05_0_full_4,0.8543,0.8464,0.7456
3234,GERestaurant,acsa,cot,0.0003,32,32,0.05,0,full,5,acsa_GERestaurant_cot_0.0003_32_32_0.05_full,acsa_GERestaurant_cot_0.0003_32_32_0.05_0_full_5,0.8497,0.8505,0.7387
1481,GERestaurant,acsa,cot,3e-05,8,8,0.05,0,full,5,acsa_GERestaurant_cot_3e-05_8_8_0.05_full,acsa_GERestaurant_cot_3e-05_8_8_0.05_0_full_5,0.8427,0.8419,0.7282
3156,GERestaurant,acsa,cot,3e-05,32,64,0.05,0,full,7,acsa_GERestaurant_cot_3e-05_32_64_0.05_full,acsa_GERestaurant_cot_3e-05_32_64_0.05_0_full_7,0.8419,0.8485,0.727
1172,GERestaurant,acsa,cot,3e-05,8,16,0.05,0,full,7,acsa_GERestaurant_cot_3e-05_8_16_0.05_full,acsa_GERestaurant_cot_3e-05_8_16_0.05_0_full_7,0.8385,0.838,0.7219
3409,GERestaurant,acsa,cot,3e-05,32,32,0.05,0,full,5,acsa_GERestaurant_cot_3e-05_32_32_0.05_full,acsa_GERestaurant_cot_3e-05_32_32_0.05_0_full_5,0.8314,0.834,0.7113
1633,GERestaurant,acsa,cot,0.0003,32,64,0.05,0,full,6,acsa_GERestaurant_cot_0.0003_32_64_0.05_full,acsa_GERestaurant_cot_0.0003_32_64_0.05_0_full_6,0.7929,0.7943,0.6568


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.8497
0.7929
0.8569
0.8543
0.8313
0.8419
0.8385
0.8427
         Source  ddof1           H  p-unc
Kruskal  config      7  3073.39269    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.849566,0.793822,0.803588,0.891105,0.751997,0.829727,True,968810.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.849566,0.857367,0.803588,0.891105,0.819195,0.888912,True,391112.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.849566,0.855032,0.803588,0.891105,0.8175,0.8907,True,426381.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.849566,0.831867,0.803588,0.891105,0.789892,0.869005,True,724068.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.849566,0.842608,0.803588,0.891105,0.80259,0.8774,True,597662.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.849566,0.837648,0.803588,0.891105,0.797675,0.874205,True,660883.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.849566,0.842956,0.803588,0.891105,0.800682,0.878803,True,593183.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.793822,0.857367,0.751997,0.829727,0.819195,0.888912,True,9805.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.793822,0.855032,0.751997,0.829727,0.8175,0.8907,True,14140.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.793822,0.831867,0.751997,0.829727,0.789892,0.869005,True,96828.0,0.0,0.0,True


### 1000
### basic Prompt

In [88]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5670,GERestaurant,acsa,basic,0.0003,8,8,0.05,0,1000,7,acsa_GERestaurant_basic_0.0003_8_8_0.05_1000,acsa_GERestaurant_basic_0.0003_8_8_0.05_0_1000_7,0.8955,0.8728,0.8107
56,GERestaurant,acsa,basic,0.0003,8,16,0.05,0,1000,4,acsa_GERestaurant_basic_0.0003_8_16_0.05_1000,acsa_GERestaurant_basic_0.0003_8_16_0.05_0_1000_4,0.8952,0.8739,0.8103
283,GERestaurant,acsa,basic,3e-05,32,64,0.05,0,1000,9,acsa_GERestaurant_basic_3e-05_32_64_0.05_1000,acsa_GERestaurant_basic_3e-05_32_64_0.05_0_1000_9,0.8903,0.8828,0.8024
6561,GERestaurant,acsa,basic,3e-05,32,32,0.05,0,1000,9,acsa_GERestaurant_basic_3e-05_32_32_0.05_1000,acsa_GERestaurant_basic_3e-05_32_32_0.05_0_1000_9,0.8834,0.8762,0.7912
5440,GERestaurant,acsa,basic,3e-05,8,16,0.05,0,1000,9,acsa_GERestaurant_basic_3e-05_8_16_0.05_1000,acsa_GERestaurant_basic_3e-05_8_16_0.05_0_1000_9,0.8674,0.8614,0.7659
4433,GERestaurant,acsa,basic,0.0003,32,32,0.05,0,1000,1,acsa_GERestaurant_basic_0.0003_32_32_0.05_1000,acsa_GERestaurant_basic_0.0003_32_32_0.05_0_10...,0.8277,0.8142,0.7061
3068,GERestaurant,acsa,basic,3e-05,8,8,0.05,0,1000,10,acsa_GERestaurant_basic_3e-05_8_8_0.05_1000,acsa_GERestaurant_basic_3e-05_8_8_0.05_0_1000_10,0.8182,0.8173,0.6923
2631,GERestaurant,acsa,basic,0.0003,32,64,0.05,0,1000,3,acsa_GERestaurant_basic_0.0003_32_64_0.05_1000,acsa_GERestaurant_basic_0.0003_32_64_0.05_0_10...,0.8009,0.767,0.668


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8278
0.8009
0.8952
0.8955
0.8834
0.8904
0.8674
0.8182
         Source  ddof1            H  p-unc
Kruskal  config      7  5236.170637    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.828291,0.801254,0.782195,0.873807,0.75478,0.849615,True,780142.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.828291,0.896239,0.782195,0.873807,0.860185,0.931802,True,10115.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.828291,0.895946,0.782195,0.873807,0.8578,0.930312,True,12818.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.828291,0.883535,0.782195,0.873807,0.845073,0.917,True,32649.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.828291,0.890827,0.782195,0.873807,0.858855,0.922018,True,14447.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.828291,0.867663,0.782195,0.873807,0.827895,0.905003,True,104326.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.828291,0.822144,0.782195,0.873807,0.708583,0.897608,True,513613.5,0.291793,0.583586,False
7,0.0003_32_64,0.0003_8_16,0.801254,0.896239,0.75478,0.849615,0.860185,0.931802,True,580.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.801254,0.895946,0.75478,0.849615,0.8578,0.930312,True,1025.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.801254,0.883535,0.75478,0.849615,0.845073,0.917,True,3907.5,0.0,0.0,True


### 1000
### context Prompt

In [84]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1978,GERestaurant,acsa,context,0.0003,8,8,0.05,0,1000,6,acsa_GERestaurant_context_0.0003_8_8_0.05_1000,acsa_GERestaurant_context_0.0003_8_8_0.05_0_10...,0.8985,0.8731,0.8157
3061,GERestaurant,acsa,context,0.0003,8,16,0.05,0,1000,5,acsa_GERestaurant_context_0.0003_8_16_0.05_1000,acsa_GERestaurant_context_0.0003_8_16_0.05_0_1...,0.8889,0.8656,0.8
4085,GERestaurant,acsa,context,3e-05,32,64,0.05,0,1000,10,acsa_GERestaurant_context_3e-05_32_64_0.05_1000,acsa_GERestaurant_context_3e-05_32_64_0.05_0_1...,0.8821,0.8725,0.7891
6276,GERestaurant,acsa,context,3e-05,8,16,0.05,0,1000,10,acsa_GERestaurant_context_3e-05_8_16_0.05_1000,acsa_GERestaurant_context_3e-05_8_16_0.05_0_10...,0.8776,0.8555,0.782
576,GERestaurant,acsa,context,3e-05,32,32,0.05,0,1000,9,acsa_GERestaurant_context_3e-05_32_32_0.05_1000,acsa_GERestaurant_context_3e-05_32_32_0.05_0_1...,0.8755,0.8672,0.7787
1166,GERestaurant,acsa,context,0.0003,32,32,0.05,0,1000,6,acsa_GERestaurant_context_0.0003_32_32_0.05_1000,acsa_GERestaurant_context_0.0003_32_32_0.05_0_...,0.8667,0.8475,0.7647
2504,GERestaurant,acsa,context,0.0003,32,64,0.05,0,1000,4,acsa_GERestaurant_context_0.0003_32_64_0.05_1000,acsa_GERestaurant_context_0.0003_32_64_0.05_0_...,0.8578,0.8418,0.751
394,GERestaurant,acsa,context,3e-05,8,8,0.05,0,1000,9,acsa_GERestaurant_context_3e-05_8_8_0.05_1000,acsa_GERestaurant_context_3e-05_8_8_0.05_0_1000_9,0.8474,0.8391,0.7352


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8667
0.8578
0.8889
0.8985
0.8756
0.8821
0.8776
0.8474
         Source  ddof1            H  p-unc
Kruskal  config      7  3313.371655    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.866326,0.856827,0.823,0.907403,0.816098,0.895913,True,627739.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.866326,0.888773,0.823,0.907403,0.855897,0.921602,True,208122.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.866326,0.899763,0.823,0.907403,0.8671,0.931303,True,106561.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.866326,0.876122,0.823,0.907403,0.8405,0.9095,True,363110.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.866326,0.882101,0.823,0.907403,0.8462,0.9163,True,290660.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.866326,0.877926,0.823,0.907403,0.838298,0.9136,True,345690.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.866326,0.845828,0.823,0.907403,0.803582,0.888425,True,752767.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.856827,0.888773,0.816098,0.895913,0.855897,0.921602,True,115218.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.856827,0.899763,0.816098,0.895913,0.8671,0.931303,True,49617.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.856827,0.876122,0.816098,0.895913,0.8405,0.9095,True,236945.0,0.0,0.0,True


### 1000
### CoT Prompt

In [85]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
403,GERestaurant,acsa,cot,3e-05,32,32,0.05,0,1000,9,acsa_GERestaurant_cot_3e-05_32_32_0.05_1000,acsa_GERestaurant_cot_3e-05_32_32_0.05_0_1000_9,0.8898,0.891,0.8016
44,GERestaurant,acsa,cot,3e-05,8,8,0.05,0,1000,8,acsa_GERestaurant_cot_3e-05_8_8_0.05_1000,acsa_GERestaurant_cot_3e-05_8_8_0.05_0_1000_8,0.8702,0.8587,0.7702
6726,GERestaurant,acsa,cot,3e-05,32,64,0.05,0,1000,7,acsa_GERestaurant_cot_3e-05_32_64_0.05_1000,acsa_GERestaurant_cot_3e-05_32_64_0.05_0_1000_7,0.8694,0.8463,0.7689
3947,GERestaurant,acsa,cot,3e-05,8,16,0.05,0,1000,6,acsa_GERestaurant_cot_3e-05_8_16_0.05_1000,acsa_GERestaurant_cot_3e-05_8_16_0.05_0_1000_6,0.8686,0.8663,0.7677
5413,GERestaurant,acsa,cot,0.0003,8,16,0.05,0,1000,5,acsa_GERestaurant_cot_0.0003_8_16_0.05_1000,acsa_GERestaurant_cot_0.0003_8_16_0.05_0_1000_5,0.8676,0.8524,0.7661
5609,GERestaurant,acsa,cot,0.0003,8,8,0.05,0,1000,9,acsa_GERestaurant_cot_0.0003_8_8_0.05_1000,acsa_GERestaurant_cot_0.0003_8_8_0.05_0_1000_9,0.8665,0.8646,0.7645
2693,GERestaurant,acsa,cot,0.0003,32,32,0.05,0,1000,6,acsa_GERestaurant_cot_0.0003_32_32_0.05_1000,acsa_GERestaurant_cot_0.0003_32_32_0.05_0_1000_6,0.8518,0.8328,0.7419
2112,GERestaurant,acsa,cot,0.0003,32,64,0.05,0,1000,7,acsa_GERestaurant_cot_0.0003_32_64_0.05_1000,acsa_GERestaurant_cot_0.0003_32_64_0.05_0_1000_7,0.8171,0.823,0.6908


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8519
0.8172
0.8676
0.8665
0.8899
0.8694
0.8686
0.8702
         Source  ddof1            H  p-unc
Kruskal  config      7  3025.239111    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.851284,0.819089,0.80889,0.8904,0.7659,0.8714,True,833197.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.851284,0.867559,0.80889,0.8904,0.821677,0.913502,True,302915.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.851284,0.866164,0.80889,0.8904,0.826595,0.907702,True,307285.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.851284,0.890771,0.80889,0.8904,0.852297,0.924403,True,77794.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.851284,0.86903,0.80889,0.8904,0.817995,0.9112,True,284385.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.851284,0.86945,0.80889,0.8904,0.814185,0.91121,True,277675.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.851284,0.871458,0.80889,0.8904,0.834392,0.906512,True,237935.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.819089,0.867559,0.7659,0.8714,0.821677,0.913502,True,85360.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.819089,0.866164,0.7659,0.8714,0.826595,0.907702,True,75414.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.819089,0.890771,0.7659,0.8714,0.852297,0.924403,True,12274.5,0.0,0.0,True


### 500
### basic Prompt

In [89]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3776,GERestaurant,acsa,basic,3e-05,8,8,0.05,0,500,8,acsa_GERestaurant_basic_3e-05_8_8_0.05_500,acsa_GERestaurant_basic_3e-05_8_8_0.05_0_500_8,0.8927,0.8741,0.8062
153,GERestaurant,acsa,basic,3e-05,8,16,0.05,0,500,8,acsa_GERestaurant_basic_3e-05_8_16_0.05_500,acsa_GERestaurant_basic_3e-05_8_16_0.05_0_500_8,0.887,0.8695,0.7969
5855,GERestaurant,acsa,basic,3e-05,32,64,0.05,0,500,5,acsa_GERestaurant_basic_3e-05_32_64_0.05_500,acsa_GERestaurant_basic_3e-05_32_64_0.05_0_500_5,0.8841,0.8476,0.7923
5802,GERestaurant,acsa,basic,3e-05,32,32,0.05,0,500,8,acsa_GERestaurant_basic_3e-05_32_32_0.05_500,acsa_GERestaurant_basic_3e-05_32_32_0.05_0_500_8,0.8793,0.865,0.7846
6818,GERestaurant,acsa,basic,0.0003,8,8,0.05,0,500,9,acsa_GERestaurant_basic_0.0003_8_8_0.05_500,acsa_GERestaurant_basic_0.0003_8_8_0.05_0_500_9,0.8772,0.8735,0.7812
5235,GERestaurant,acsa,basic,0.0003,8,16,0.05,0,500,7,acsa_GERestaurant_basic_0.0003_8_16_0.05_500,acsa_GERestaurant_basic_0.0003_8_16_0.05_0_500_7,0.8766,0.8726,0.7803
1783,GERestaurant,acsa,basic,0.0003,32,32,0.05,0,500,7,acsa_GERestaurant_basic_0.0003_32_32_0.05_500,acsa_GERestaurant_basic_0.0003_32_32_0.05_0_500_7,0.8696,0.8418,0.7692
2641,GERestaurant,acsa,basic,0.0003,32,64,0.05,0,500,6,acsa_GERestaurant_basic_0.0003_32_64_0.05_500,acsa_GERestaurant_basic_0.0003_32_64_0.05_0_500_6,0.8655,0.8327,0.763


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8696
0.8655
0.8766
0.8772
0.8793
0.8841
0.887
0.8927
         Source  ddof1           H          p-unc
Kruskal  config      7  687.552694  3.327392e-144
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.869684,0.865498,0.814492,0.918905,0.806477,0.916745,True,540680.5,0.001631,0.008994,True
1,0.0003_32_32,0.0003_8_16,0.869684,0.877008,0.814492,0.918905,0.8333,0.920503,True,421953.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.869684,0.877586,0.814492,0.918905,0.823987,0.926805,True,414947.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.869684,0.87925,0.814492,0.918905,0.824538,0.932715,True,402083.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.869684,0.885967,0.814492,0.918905,0.820695,0.943407,True,342845.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.869684,0.888505,0.814492,0.918905,0.838395,0.938612,True,308533.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.869684,0.892301,0.814492,0.918905,0.839495,0.944707,True,278887.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.865498,0.877008,0.806477,0.916745,0.8333,0.920503,True,382803.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.865498,0.877586,0.806477,0.916745,0.823987,0.926805,True,378953.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.865498,0.87925,0.806477,0.916745,0.824538,0.932715,True,366786.5,0.0,0.0,True


### 500
### context Prompt

In [90]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2315,GERestaurant,acsa,context,0.0003,8,8,0.05,0,500,5,acsa_GERestaurant_context_0.0003_8_8_0.05_500,acsa_GERestaurant_context_0.0003_8_8_0.05_0_500_5,0.8938,0.8712,0.808
6774,GERestaurant,acsa,context,0.0003,8,16,0.05,0,500,9,acsa_GERestaurant_context_0.0003_8_16_0.05_500,acsa_GERestaurant_context_0.0003_8_16_0.05_0_5...,0.8851,0.8809,0.7939
5502,GERestaurant,acsa,context,3e-05,32,64,0.05,0,500,5,acsa_GERestaurant_context_3e-05_32_64_0.05_500,acsa_GERestaurant_context_3e-05_32_64_0.05_0_5...,0.8851,0.8793,0.7939
4421,GERestaurant,acsa,context,3e-05,8,16,0.05,0,500,10,acsa_GERestaurant_context_3e-05_8_16_0.05_500,acsa_GERestaurant_context_3e-05_8_16_0.05_0_50...,0.8755,0.8581,0.7786
5316,GERestaurant,acsa,context,3e-05,8,8,0.05,0,500,7,acsa_GERestaurant_context_3e-05_8_8_0.05_500,acsa_GERestaurant_context_3e-05_8_8_0.05_0_500_7,0.8755,0.865,0.7786
6928,GERestaurant,acsa,context,3e-05,32,32,0.05,0,500,10,acsa_GERestaurant_context_3e-05_32_32_0.05_500,acsa_GERestaurant_context_3e-05_32_32_0.05_0_5...,0.8559,0.8619,0.7481
1510,GERestaurant,acsa,context,0.0003,32,32,0.05,0,500,8,acsa_GERestaurant_context_0.0003_32_32_0.05_500,acsa_GERestaurant_context_0.0003_32_32_0.05_0_...,0.8547,0.8433,0.7464
5504,GERestaurant,acsa,context,0.0003,32,64,0.05,0,500,6,acsa_GERestaurant_context_0.0003_32_64_0.05_500,acsa_GERestaurant_context_0.0003_32_64_0.05_0_...,0.8468,0.8418,0.7344


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8548
0.8468
0.8851
0.8938
0.8559
0.8851
0.8755
0.8755
         Source  ddof1            H  p-unc
Kruskal  config      7  2227.332413    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.855774,0.846966,0.804662,0.904605,0.79339,0.8972,True,594600.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.855774,0.886883,0.804662,0.904605,0.838395,0.9321,True,193049.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.855774,0.894,0.804662,0.904605,0.850882,0.938442,True,134273.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.855774,0.856431,0.804662,0.904605,0.7932,0.913805,True,488298.5,0.364863,1.0,False
4,0.0003_32_32,3e-05_32_64,0.855774,0.886955,0.804662,0.904605,0.838087,0.935203,True,197170.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.855774,0.87603,0.804662,0.904605,0.821377,0.92921,True,298389.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.855774,0.875718,0.804662,0.904605,0.824877,0.9231,True,289207.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.846966,0.886883,0.79339,0.8972,0.838395,0.9321,True,133885.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.846966,0.894,0.79339,0.8972,0.850882,0.938442,True,86366.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.846966,0.856431,0.79339,0.8972,0.7932,0.913805,True,403783.5,0.0,0.0,True


### 500
### CoT Prompt

In [91]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
568,GERestaurant,acsa,cot,0.0003,8,8,0.05,0,500,10,acsa_GERestaurant_cot_0.0003_8_8_0.05_500,acsa_GERestaurant_cot_0.0003_8_8_0.05_0_500_10,0.8929,0.871,0.8065
3288,GERestaurant,acsa,cot,0.0003,8,16,0.05,0,500,10,acsa_GERestaurant_cot_0.0003_8_16_0.05_500,acsa_GERestaurant_cot_0.0003_8_16_0.05_0_500_10,0.8688,0.8484,0.7681
4144,GERestaurant,acsa,cot,3e-05,32,64,0.05,0,500,7,acsa_GERestaurant_cot_3e-05_32_64_0.05_500,acsa_GERestaurant_cot_3e-05_32_64_0.05_0_500_7,0.8651,0.8628,0.7622
5798,GERestaurant,acsa,cot,0.0003,32,32,0.05,0,500,10,acsa_GERestaurant_cot_0.0003_32_32_0.05_500,acsa_GERestaurant_cot_0.0003_32_32_0.05_0_500_10,0.861,0.8298,0.7559
1454,GERestaurant,acsa,cot,3e-05,32,32,0.05,0,500,10,acsa_GERestaurant_cot_3e-05_32_32_0.05_500,acsa_GERestaurant_cot_3e-05_32_32_0.05_0_500_10,0.8572,0.8299,0.75
2687,GERestaurant,acsa,cot,3e-05,8,16,0.05,0,500,9,acsa_GERestaurant_cot_3e-05_8_16_0.05_500,acsa_GERestaurant_cot_3e-05_8_16_0.05_0_500_9,0.8572,0.8173,0.75
5638,GERestaurant,acsa,cot,3e-05,8,8,0.05,0,500,4,acsa_GERestaurant_cot_3e-05_8_8_0.05_500,acsa_GERestaurant_cot_3e-05_8_8_0.05_0_500_4,0.8297,0.7955,0.709
5133,GERestaurant,acsa,cot,0.0003,32,64,0.05,0,500,4,acsa_GERestaurant_cot_0.0003_32_64_0.05_500,acsa_GERestaurant_cot_0.0003_32_64_0.05_0_500_4,0.8267,0.7575,0.7045


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.861
0.8267
0.8689
0.8929
0.8571
0.8651
0.8571
0.8297
         Source  ddof1            H  p-unc
Kruskal  config      7  2770.278935    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.862376,0.826915,0.8089,0.9132,0.7654,0.887803,True,805660.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.862376,0.8683,0.8089,0.9132,0.815992,0.915725,True,439759.5,3e-06,1.8e-05,True
2,0.0003_32_32,0.0003_8_8,0.862376,0.893506,0.8089,0.9132,0.8475,0.9366,True,191855.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.862376,0.856881,0.8089,0.9132,0.805397,0.9076,True,561973.5,2e-06,1.4e-05,True
4,0.0003_32_32,3e-05_32_64,0.862376,0.866832,0.8089,0.9132,0.8231,0.9114,True,455447.5,0.00056,0.0028,True
5,0.0003_32_32,3e-05_8_16,0.862376,0.858722,0.8089,0.9132,0.810065,0.90502,True,542904.0,0.000892,0.003568,True
6,0.0003_32_32,3e-05_8_8,0.862376,0.830257,0.8089,0.9132,0.767182,0.891022,True,777577.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.826915,0.8683,0.7654,0.887803,0.815992,0.915725,True,147350.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.826915,0.893506,0.7654,0.887803,0.8475,0.9366,True,41739.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.826915,0.856881,0.7654,0.887803,0.805397,0.9076,True,229800.0,0.0,0.0,True


## E2E

In [92]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'e2e',
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
78.77, 1.50
('1000', 'context')
78.44, 2.70
('1000', 'cot')
73.69, 3.82
('500', 'basic')
77.53, 2.67
('500', 'context')
77.45, 3.42
('500', 'cot')
73.23, 2.43
('full', 'basic')
80.31, 1.74
('full', 'context')
80.23, 1.80
('full', 'cot')
75.56, 2.68
('orig', 'basic')
75.44, 0.00
('orig', 'context')
77.44, 0.00
('orig', 'cot')
67.48, 0.00


In [97]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1500,GERestaurant,e2e,basic,3e-05,32,64,0.05,0,full,10,e2e_GERestaurant_basic_3e-05_32_64_0.05_full,e2e_GERestaurant_basic_3e-05_32_64_0.05_0_full_10,0.8187,0.8257,0.6931
3450,GERestaurant,e2e,basic,3e-05,32,32,0.05,0,full,8,e2e_GERestaurant_basic_3e-05_32_32_0.05_full,e2e_GERestaurant_basic_3e-05_32_32_0.05_0_full_8,0.8179,0.7964,0.6919
387,GERestaurant,e2e,basic,0.0003,8,8,0.05,0,full,4,e2e_GERestaurant_basic_0.0003_8_8_0.05_full,e2e_GERestaurant_basic_0.0003_8_8_0.05_0_full_4,0.8167,0.7932,0.6902
6709,GERestaurant,e2e,basic,3e-05,8,8,0.05,0,full,10,e2e_GERestaurant_basic_3e-05_8_8_0.05_full,e2e_GERestaurant_basic_3e-05_8_8_0.05_0_full_10,0.8109,0.7932,0.682
5416,GERestaurant,e2e,basic,0.0003,32,32,0.05,0,full,9,e2e_GERestaurant_basic_0.0003_32_32_0.05_full,e2e_GERestaurant_basic_0.0003_32_32_0.05_0_full_9,0.8008,0.6904,0.6678
1743,GERestaurant,e2e,basic,3e-05,8,16,0.05,0,full,4,e2e_GERestaurant_basic_3e-05_8_16_0.05_full,e2e_GERestaurant_basic_3e-05_8_16_0.05_0_full_4,0.8008,0.745,0.6677
5786,GERestaurant,e2e,basic,0.0003,8,16,0.05,0,full,2,e2e_GERestaurant_basic_0.0003_8_16_0.05_full,e2e_GERestaurant_basic_0.0003_8_16_0.05_0_full_2,0.7972,0.7649,0.6628
6620,GERestaurant,e2e,basic,0.0003,32,64,0.05,0,full,4,e2e_GERestaurant_basic_0.0003_32_64_0.05_full,e2e_GERestaurant_basic_0.0003_32_64_0.05_0_full_4,0.7621,0.7244,0.6156


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.8008
0.7621
0.7972
0.8167
0.8179
0.8187
0.8008
0.8109
         Source  ddof1            H  p-unc
Kruskal  config      7  3136.899301    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.800934,0.762649,0.763385,0.834905,0.721995,0.802003,True,908897.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.800934,0.797757,0.763385,0.834905,0.760593,0.83374,True,548116.5,0.000194,0.001164,True
2,0.0003_32_32,0.0003_8_8,0.800934,0.81728,0.763385,0.834905,0.781295,0.8522,True,270126.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.800934,0.817692,0.763385,0.834905,0.778998,0.851603,True,261097.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.800934,0.818873,0.763385,0.834905,0.785692,0.84931,True,243240.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.800934,0.800065,0.763385,0.834905,0.7634,0.835805,True,513787.5,0.285668,0.571336,False
6,0.0003_32_32,3e-05_8_8,0.800934,0.810862,0.763385,0.834905,0.774397,0.844903,True,351276.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.762649,0.797757,0.721995,0.802003,0.760593,0.83374,True,107383.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.762649,0.81728,0.721995,0.802003,0.781295,0.8522,True,23236.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.762649,0.817692,0.721995,0.802003,0.778998,0.851603,True,23338.0,0.0,0.0,True


In [94]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2976,GERestaurant,e2e,context,0.0003,8,8,0.05,0,full,5,e2e_GERestaurant_context_0.0003_8_8_0.05_full,e2e_GERestaurant_context_0.0003_8_8_0.05_0_full_5,0.8216,0.81,0.6972
3214,GERestaurant,e2e,context,0.0003,8,16,0.05,0,full,4,e2e_GERestaurant_context_0.0003_8_16_0.05_full,e2e_GERestaurant_context_0.0003_8_16_0.05_0_fu...,0.816,0.8191,0.6891
712,GERestaurant,e2e,context,3e-05,8,8,0.05,0,full,9,e2e_GERestaurant_context_3e-05_8_8_0.05_full,e2e_GERestaurant_context_3e-05_8_8_0.05_0_full_9,0.8139,0.7921,0.6862
1876,GERestaurant,e2e,context,3e-05,32,64,0.05,0,full,2,e2e_GERestaurant_context_3e-05_32_64_0.05_full,e2e_GERestaurant_context_3e-05_32_64_0.05_0_fu...,0.8047,0.7503,0.6733
3266,GERestaurant,e2e,context,3e-05,32,32,0.05,0,full,4,e2e_GERestaurant_context_3e-05_32_32_0.05_full,e2e_GERestaurant_context_3e-05_32_32_0.05_0_fu...,0.803,0.7816,0.6709
2421,GERestaurant,e2e,context,0.0003,32,32,0.05,0,full,5,e2e_GERestaurant_context_0.0003_32_32_0.05_full,e2e_GERestaurant_context_0.0003_32_32_0.05_0_f...,0.8019,0.7862,0.6693
2282,GERestaurant,e2e,context,3e-05,8,16,0.05,0,full,8,e2e_GERestaurant_context_3e-05_8_16_0.05_full,e2e_GERestaurant_context_3e-05_8_16_0.05_0_full_8,0.7984,0.7585,0.6645
2811,GERestaurant,e2e,context,0.0003,32,64,0.05,0,full,5,e2e_GERestaurant_context_0.0003_32_64_0.05_full,e2e_GERestaurant_context_0.0003_32_64_0.05_0_f...,0.759,0.7574,0.6116


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.8019
0.759
0.816
0.8216
0.803
0.8047
0.7984
0.8139
         Source  ddof1            H  p-unc
Kruskal  config      7  3199.830005    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.802276,0.758133,0.765492,0.838807,0.716498,0.795612,True,948359.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.802276,0.816176,0.765492,0.838807,0.779798,0.8532,True,295488.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.802276,0.822169,0.765492,0.838807,0.787898,0.85261,True,211542.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.802276,0.802386,0.765492,0.838807,0.76438,0.836105,True,491981.5,0.534653,0.534653,False
4,0.0003_32_32,3e-05_32_64,0.802276,0.805488,0.765492,0.838807,0.7719,0.837307,True,447721.5,5.2e-05,0.000208,True
5,0.0003_32_32,3e-05_8_16,0.802276,0.798289,0.765492,0.838807,0.757497,0.835005,True,554015.0,2.9e-05,0.000145,True
6,0.0003_32_32,3e-05_8_8,0.802276,0.814173,0.765492,0.838807,0.778085,0.849313,True,323279.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.758133,0.816176,0.716498,0.795612,0.779798,0.8532,True,17239.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.758133,0.822169,0.716498,0.795612,0.787898,0.85261,True,6865.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.758133,0.802386,0.716498,0.795612,0.76438,0.836105,True,50654.0,0.0,0.0,True


In [95]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6124,GERestaurant,e2e,cot,0.0003,8,16,0.05,0,full,7,e2e_GERestaurant_cot_0.0003_8_16_0.05_full,e2e_GERestaurant_cot_0.0003_8_16_0.05_0_full_7,0.7849,0.7768,0.646
6187,GERestaurant,e2e,cot,0.0003,8,8,0.05,0,full,7,e2e_GERestaurant_cot_0.0003_8_8_0.05_full,e2e_GERestaurant_cot_0.0003_8_8_0.05_0_full_7,0.7737,0.7485,0.6309
1119,GERestaurant,e2e,cot,3e-05,32,32,0.05,0,full,8,e2e_GERestaurant_cot_3e-05_32_32_0.05_full,e2e_GERestaurant_cot_3e-05_32_32_0.05_0_full_8,0.7726,0.7757,0.6294
2724,GERestaurant,e2e,cot,3e-05,32,64,0.05,0,full,9,e2e_GERestaurant_cot_3e-05_32_64_0.05_full,e2e_GERestaurant_cot_3e-05_32_64_0.05_0_full_9,0.7707,0.7568,0.627
1614,GERestaurant,e2e,cot,3e-05,8,16,0.05,0,full,8,e2e_GERestaurant_cot_3e-05_8_16_0.05_full,e2e_GERestaurant_cot_3e-05_8_16_0.05_0_full_8,0.7686,0.7529,0.6242
3518,GERestaurant,e2e,cot,0.0003,32,32,0.05,0,full,5,e2e_GERestaurant_cot_0.0003_32_32_0.05_full,e2e_GERestaurant_cot_0.0003_32_32_0.05_0_full_5,0.7505,0.7595,0.6006
5754,GERestaurant,e2e,cot,3e-05,8,8,0.05,0,full,9,e2e_GERestaurant_cot_3e-05_8_8_0.05_full,e2e_GERestaurant_cot_3e-05_8_8_0.05_0_full_9,0.7154,0.738,0.5569
4868,GERestaurant,e2e,cot,0.0003,32,64,0.05,0,full,7,e2e_GERestaurant_cot_0.0003_32_64_0.05_full,e2e_GERestaurant_cot_0.0003_32_64_0.05_0_full_7,0.7084,0.6902,0.5485


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.7505
0.7084
0.7849
0.7737
0.7726
0.7707
0.7686
0.7154
         Source  ddof1            H  p-unc
Kruskal  config      7  4514.139105    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.751367,0.708752,0.70719,0.79353,0.663188,0.75302,True,903591.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.751367,0.786077,0.70719,0.79353,0.74228,0.825908,True,134108.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.751367,0.774009,0.70719,0.79353,0.729697,0.815107,True,235704.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.751367,0.773588,0.70719,0.79353,0.732592,0.815805,True,241822.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.751367,0.770172,0.70719,0.79353,0.7281,0.811415,True,278718.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.751367,0.769091,0.70719,0.79353,0.726197,0.805832,True,284908.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.751367,0.715815,0.70719,0.79353,0.672695,0.755107,True,870986.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.708752,0.786077,0.663188,0.75302,0.74228,0.825908,True,7560.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.708752,0.774009,0.663188,0.75302,0.729697,0.815107,True,19871.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.708752,0.773588,0.663188,0.75302,0.732592,0.815805,True,19864.0,0.0,0.0,True


In [98]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5421,GERestaurant,e2e,basic,0.0003,8,8,0.05,0,1000,9,e2e_GERestaurant_basic_0.0003_8_8_0.05_1000,e2e_GERestaurant_basic_0.0003_8_8_0.05_0_1000_9,0.8053,0.7277,0.6741
5769,GERestaurant,e2e,basic,0.0003,8,16,0.05,0,1000,7,e2e_GERestaurant_basic_0.0003_8_16_0.05_1000,e2e_GERestaurant_basic_0.0003_8_16_0.05_0_1000_7,0.8009,0.746,0.6679
2336,GERestaurant,e2e,basic,3e-05,32,64,0.05,0,1000,4,e2e_GERestaurant_basic_3e-05_32_64_0.05_1000,e2e_GERestaurant_basic_3e-05_32_64_0.05_0_1000_4,0.8,0.7021,0.6667
4765,GERestaurant,e2e,basic,0.0003,32,32,0.05,0,1000,9,e2e_GERestaurant_basic_0.0003_32_32_0.05_1000,e2e_GERestaurant_basic_0.0003_32_32_0.05_0_1000_9,0.7937,0.6814,0.658
5155,GERestaurant,e2e,basic,3e-05,32,32,0.05,0,1000,4,e2e_GERestaurant_basic_3e-05_32_32_0.05_1000,e2e_GERestaurant_basic_3e-05_32_32_0.05_0_1000_4,0.7903,0.7047,0.6533
6064,GERestaurant,e2e,basic,3e-05,8,16,0.05,0,1000,6,e2e_GERestaurant_basic_3e-05_8_16_0.05_1000,e2e_GERestaurant_basic_3e-05_8_16_0.05_0_1000_6,0.7778,0.6844,0.6364
660,GERestaurant,e2e,basic,3e-05,8,8,0.05,0,1000,7,e2e_GERestaurant_basic_3e-05_8_8_0.05_1000,e2e_GERestaurant_basic_3e-05_8_8_0.05_0_1000_7,0.7747,0.6922,0.6323
1309,GERestaurant,e2e,basic,0.0003,32,64,0.05,0,1000,5,e2e_GERestaurant_basic_0.0003_32_64_0.05_1000,e2e_GERestaurant_basic_0.0003_32_64_0.05_0_1000_5,0.7586,0.6766,0.6111


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7937
0.7586
0.8009
0.8053
0.7903
0.8
0.7778
0.7747
         Source  ddof1            H  p-unc
Kruskal  config      7  1713.807453    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.794851,0.759243,0.744573,0.846008,0.697195,0.81941,True,809589.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.794851,0.80132,0.744573,0.846008,0.753495,0.8486,True,427341.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.794851,0.806689,0.744573,0.846008,0.754495,0.859003,True,376417.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.794851,0.790303,0.744573,0.846008,0.735,0.844705,True,547195.5,0.000257,0.000956,True
4,0.0003_32_32,3e-05_32_64,0.794851,0.799191,0.744573,0.846008,0.7495,0.846707,True,452561.0,0.000239,0.000956,True
5,0.0003_32_32,3e-05_8_16,0.794851,0.778591,0.744573,0.846008,0.71259,0.833008,True,655061.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.794851,0.774864,0.744573,0.846008,0.7175,0.826213,True,693595.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.759243,0.80132,0.697195,0.81941,0.753495,0.8486,True,144378.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.759243,0.806689,0.697195,0.81941,0.754495,0.859003,True,126495.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.759243,0.790303,0.697195,0.81941,0.735,0.844705,True,226941.5,0.0,0.0,True


In [99]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5525,GERestaurant,e2e,context,0.0003,8,8,0.05,0,1000,9,e2e_GERestaurant_context_0.0003_8_8_0.05_1000,e2e_GERestaurant_context_0.0003_8_8_0.05_0_1000_9,0.814,0.7341,0.6863
3083,GERestaurant,e2e,context,3e-05,32,64,0.05,0,1000,8,e2e_GERestaurant_context_3e-05_32_64_0.05_1000,e2e_GERestaurant_context_3e-05_32_64_0.05_0_10...,0.8107,0.7271,0.6816
2071,GERestaurant,e2e,context,0.0003,32,32,0.05,0,1000,5,e2e_GERestaurant_context_0.0003_32_32_0.05_1000,e2e_GERestaurant_context_0.0003_32_32_0.05_0_1...,0.7964,0.7419,0.6617
556,GERestaurant,e2e,context,0.0003,8,16,0.05,0,1000,6,e2e_GERestaurant_context_0.0003_8_16_0.05_1000,e2e_GERestaurant_context_0.0003_8_16_0.05_0_10...,0.7903,0.6926,0.6533
2642,GERestaurant,e2e,context,3e-05,8,16,0.05,0,1000,9,e2e_GERestaurant_context_3e-05_8_16_0.05_1000,e2e_GERestaurant_context_3e-05_8_16_0.05_0_1000_9,0.7902,0.6852,0.6531
5118,GERestaurant,e2e,context,3e-05,32,32,0.05,0,1000,8,e2e_GERestaurant_context_3e-05_32_32_0.05_1000,e2e_GERestaurant_context_3e-05_32_32_0.05_0_10...,0.7841,0.6908,0.6449
5665,GERestaurant,e2e,context,3e-05,8,8,0.05,0,1000,6,e2e_GERestaurant_context_3e-05_8_8_0.05_1000,e2e_GERestaurant_context_3e-05_8_8_0.05_0_1000_6,0.7659,0.6682,0.6206
4027,GERestaurant,e2e,context,0.0003,32,64,0.05,0,1000,3,e2e_GERestaurant_context_0.0003_32_64_0.05_1000,e2e_GERestaurant_context_0.0003_32_64_0.05_0_1...,0.7234,0.6669,0.5667


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7964
0.7234
0.7903
0.814
0.7841
0.8107
0.7902
0.7659
         Source  ddof1            H  p-unc
Kruskal  config      7  3307.508498    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.796027,0.724594,0.7366,0.846917,0.664998,0.782015,True,950292.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.796027,0.789546,0.7366,0.846917,0.739888,0.842612,True,575699.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.796027,0.813866,0.7366,0.846917,0.761297,0.8641,True,331306.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.796027,0.78392,0.7366,0.846917,0.730598,0.83621,True,625194.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.796027,0.809669,0.7366,0.846917,0.760777,0.855915,True,368435.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.796027,0.789543,0.7366,0.846917,0.73539,0.839705,True,571413.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.796027,0.767187,0.7366,0.846917,0.71248,0.821207,True,759417.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.724594,0.789546,0.664998,0.782015,0.739888,0.842612,True,53065.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.724594,0.813866,0.664998,0.782015,0.761297,0.8641,True,14541.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.724594,0.78392,0.664998,0.782015,0.730598,0.83621,True,75564.0,0.0,0.0,True


In [100]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2172,GERestaurant,e2e,cot,0.0003,8,16,0.05,0,1000,9,e2e_GERestaurant_cot_0.0003_8_16_0.05_1000,e2e_GERestaurant_cot_0.0003_8_16_0.05_0_1000_9,0.7832,0.7116,0.6437
3124,GERestaurant,e2e,cot,0.0003,32,32,0.05,0,1000,9,e2e_GERestaurant_cot_0.0003_32_32_0.05_1000,e2e_GERestaurant_cot_0.0003_32_32_0.05_0_1000_9,0.7783,0.7099,0.6371
3160,GERestaurant,e2e,cot,0.0003,8,8,0.05,0,1000,10,e2e_GERestaurant_cot_0.0003_8_8_0.05_1000,e2e_GERestaurant_cot_0.0003_8_8_0.05_0_1000_10,0.7758,0.6963,0.6337
3318,GERestaurant,e2e,cot,3e-05,32,32,0.05,0,1000,10,e2e_GERestaurant_cot_3e-05_32_32_0.05_1000,e2e_GERestaurant_cot_3e-05_32_32_0.05_0_1000_10,0.758,0.6571,0.6103
1319,GERestaurant,e2e,cot,3e-05,8,16,0.05,0,1000,6,e2e_GERestaurant_cot_3e-05_8_16_0.05_1000,e2e_GERestaurant_cot_3e-05_8_16_0.05_0_1000_6,0.7115,0.6135,0.5522
5776,GERestaurant,e2e,cot,0.0003,32,64,0.05,0,1000,10,e2e_GERestaurant_cot_0.0003_32_64_0.05_1000,e2e_GERestaurant_cot_0.0003_32_64_0.05_0_1000_10,0.7075,0.6585,0.5474
4332,GERestaurant,e2e,cot,3e-05,8,8,0.05,0,1000,9,e2e_GERestaurant_cot_3e-05_8_8_0.05_1000,e2e_GERestaurant_cot_3e-05_8_8_0.05_0_1000_9,0.6911,0.6116,0.528
4194,GERestaurant,e2e,cot,3e-05,32,64,0.05,0,1000,4,e2e_GERestaurant_cot_3e-05_32_64_0.05_1000,e2e_GERestaurant_cot_3e-05_32_64_0.05_0_1000_4,0.6897,0.6134,0.5263


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7783
0.7075
0.7832
0.7758
0.758
0.6897
0.7115
0.6911
         Source  ddof1            H  p-unc
Kruskal  config      7  5149.340723    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.77803,0.707924,0.721197,0.83171,0.63929,0.76851,True,945654.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.77803,0.783614,0.721197,0.83171,0.731875,0.83451,True,442341.0,8e-06,3.2e-05,True
2,0.0003_32_32,0.0003_8_8,0.77803,0.776639,0.721197,0.83171,0.719097,0.83131,True,511603.5,0.368896,0.737792,False
3,0.0003_32_32,3e-05_32_32,0.77803,0.757916,0.721197,0.83171,0.698797,0.813227,True,687732.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.77803,0.691238,0.721197,0.83171,0.632317,0.748315,True,982957.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.77803,0.711738,0.721197,0.83171,0.653095,0.771015,True,944046.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.77803,0.692203,0.721197,0.83171,0.632788,0.754207,True,979027.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.707924,0.783614,0.63929,0.76851,0.731875,0.83451,True,38791.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.707924,0.776639,0.63929,0.76851,0.719097,0.83131,True,59010.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.707924,0.757916,0.63929,0.76851,0.698797,0.813227,True,131385.0,0.0,0.0,True


In [101]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3623,GERestaurant,e2e,basic,0.0003,32,64,0.05,0,500,4,e2e_GERestaurant_basic_0.0003_32_64_0.05_500,e2e_GERestaurant_basic_0.0003_32_64_0.05_0_500_4,0.8089,0.7905,0.6791
6897,GERestaurant,e2e,basic,0.0003,8,16,0.05,0,500,3,e2e_GERestaurant_basic_0.0003_8_16_0.05_500,e2e_GERestaurant_basic_0.0003_8_16_0.05_0_500_3,0.807,0.7312,0.6765
4979,GERestaurant,e2e,basic,0.0003,32,32,0.05,0,500,4,e2e_GERestaurant_basic_0.0003_32_32_0.05_500,e2e_GERestaurant_basic_0.0003_32_32_0.05_0_500_4,0.7892,0.7113,0.6519
6315,GERestaurant,e2e,basic,0.0003,8,8,0.05,0,500,7,e2e_GERestaurant_basic_0.0003_8_8_0.05_500,e2e_GERestaurant_basic_0.0003_8_8_0.05_0_500_7,0.7815,0.7423,0.6414
4899,GERestaurant,e2e,basic,3e-05,32,64,0.05,0,500,6,e2e_GERestaurant_basic_3e-05_32_64_0.05_500,e2e_GERestaurant_basic_3e-05_32_64_0.05_0_500_6,0.7719,0.7334,0.6286
4162,GERestaurant,e2e,basic,3e-05,8,16,0.05,0,500,8,e2e_GERestaurant_basic_3e-05_8_16_0.05_500,e2e_GERestaurant_basic_3e-05_8_16_0.05_0_500_8,0.7632,0.7104,0.617
5117,GERestaurant,e2e,basic,3e-05,32,32,0.05,0,500,8,e2e_GERestaurant_basic_3e-05_32_32_0.05_500,e2e_GERestaurant_basic_3e-05_32_32_0.05_0_500_8,0.7598,0.6219,0.6127
4817,GERestaurant,e2e,basic,3e-05,8,8,0.05,0,500,8,e2e_GERestaurant_basic_3e-05_8_8_0.05_500,e2e_GERestaurant_basic_3e-05_8_8_0.05_0_500_8,0.721,0.676,0.5638


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7892
0.8089
0.807
0.7815
0.7598
0.7719
0.7632
0.721
         Source  ddof1            H  p-unc
Kruskal  config      7  2779.327866    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.791381,0.809658,0.7273,0.851513,0.739067,0.87323,True,349053.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.791381,0.805905,0.7273,0.851513,0.746262,0.864112,True,375913.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.791381,0.782642,0.7273,0.851513,0.714885,0.852337,True,575270.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.791381,0.757935,0.7273,0.851513,0.6911,0.821802,True,755749.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.791381,0.773111,0.7273,0.851513,0.704175,0.84061,True,650267.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.791381,0.764979,0.7273,0.851513,0.687662,0.839825,True,695879.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.791381,0.720264,0.7273,0.851513,0.641177,0.805303,True,905736.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.809658,0.805905,0.739067,0.87323,0.746262,0.864112,True,536550.0,0.004649,0.004649,True
8,0.0003_32_64,0.0003_8_8,0.809658,0.782642,0.739067,0.87323,0.714885,0.852337,True,715976.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.809658,0.757935,0.739067,0.87323,0.6911,0.821802,True,858274.5,0.0,0.0,True


In [102]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2233,GERestaurant,e2e,context,0.0003,32,64,0.05,0,500,7,e2e_GERestaurant_context_0.0003_32_64_0.05_500,e2e_GERestaurant_context_0.0003_32_64_0.05_0_5...,0.8194,0.7724,0.694
2625,GERestaurant,e2e,context,0.0003,32,32,0.05,0,500,4,e2e_GERestaurant_context_0.0003_32_32_0.05_500,e2e_GERestaurant_context_0.0003_32_32_0.05_0_5...,0.8091,0.7602,0.6794
1102,GERestaurant,e2e,context,0.0003,8,16,0.05,0,500,8,e2e_GERestaurant_context_0.0003_8_16_0.05_500,e2e_GERestaurant_context_0.0003_8_16_0.05_0_500_8,0.8034,0.7605,0.6714
6893,GERestaurant,e2e,context,0.0003,8,8,0.05,0,500,4,e2e_GERestaurant_context_0.0003_8_8_0.05_500,e2e_GERestaurant_context_0.0003_8_8_0.05_0_500_4,0.7879,0.7443,0.65
6554,GERestaurant,e2e,context,3e-05,32,32,0.05,0,500,8,e2e_GERestaurant_context_3e-05_32_32_0.05_500,e2e_GERestaurant_context_3e-05_32_32_0.05_0_500_8,0.766,0.6894,0.6207
4226,GERestaurant,e2e,context,3e-05,32,64,0.05,0,500,9,e2e_GERestaurant_context_3e-05_32_64_0.05_500,e2e_GERestaurant_context_3e-05_32_64_0.05_0_500_9,0.7586,0.6582,0.6111
4077,GERestaurant,e2e,context,3e-05,8,16,0.05,0,500,5,e2e_GERestaurant_context_3e-05_8_16_0.05_500,e2e_GERestaurant_context_3e-05_8_16_0.05_0_500_5,0.7333,0.5762,0.5789
5212,GERestaurant,e2e,context,3e-05,8,8,0.05,0,500,8,e2e_GERestaurant_context_3e-05_8_8_0.05_500,e2e_GERestaurant_context_3e-05_8_8_0.05_0_500_8,0.7186,0.6056,0.5608


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8091
0.8194
0.8034
0.7879
0.766
0.7586
0.7333
0.7186
         Source  ddof1            H  p-unc
Kruskal  config      7  3667.704257    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.80779,0.818981,0.745077,0.870007,0.7424,0.889803,True,403532.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.80779,0.802648,0.745077,0.870007,0.730388,0.870417,True,542633.0,0.000962,0.001902,True
2,0.0003_32_32,0.0003_8_8,0.80779,0.789832,0.745077,0.870007,0.722177,0.8571,True,648882.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.80779,0.763946,0.745077,0.870007,0.683978,0.8381,True,801677.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.80779,0.758361,0.745077,0.870007,0.689985,0.82794,True,845610.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.80779,0.733906,0.745077,0.870007,0.660652,0.80871,True,927468.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.80779,0.719866,0.745077,0.870007,0.637482,0.8,True,952597.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.818981,0.802648,0.7424,0.889803,0.730388,0.870417,True,629621.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.818981,0.789832,0.7424,0.889803,0.722177,0.8571,True,720458.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.818981,0.763946,0.7424,0.889803,0.683978,0.8381,True,841155.5,0.0,0.0,True


In [103]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
4329,GERestaurant,e2e,cot,0.0003,8,8,0.05,0,500,6,e2e_GERestaurant_cot_0.0003_8_8_0.05_500,e2e_GERestaurant_cot_0.0003_8_8_0.05_0_500_6,0.7671,0.7341,0.6222
3419,GERestaurant,e2e,cot,0.0003,32,32,0.05,0,500,7,e2e_GERestaurant_cot_0.0003_32_32_0.05_500,e2e_GERestaurant_cot_0.0003_32_32_0.05_0_500_7,0.765,0.7749,0.6194
5240,GERestaurant,e2e,cot,0.0003,8,16,0.05,0,500,3,e2e_GERestaurant_cot_0.0003_8_16_0.05_500,e2e_GERestaurant_cot_0.0003_8_16_0.05_0_500_3,0.7448,0.6897,0.5933
488,GERestaurant,e2e,cot,3e-05,32,32,0.05,0,500,10,e2e_GERestaurant_cot_3e-05_32_32_0.05_500,e2e_GERestaurant_cot_3e-05_32_32_0.05_0_500_10,0.7319,0.6425,0.5772
6435,GERestaurant,e2e,cot,3e-05,8,16,0.05,0,500,10,e2e_GERestaurant_cot_3e-05_8_16_0.05_500,e2e_GERestaurant_cot_3e-05_8_16_0.05_0_500_10,0.7217,0.6341,0.5646
4024,GERestaurant,e2e,cot,0.0003,32,64,0.05,0,500,5,e2e_GERestaurant_cot_0.0003_32_64_0.05_500,e2e_GERestaurant_cot_0.0003_32_64_0.05_0_500_5,0.7193,0.6476,0.5616
5968,GERestaurant,e2e,cot,3e-05,32,64,0.05,0,500,8,e2e_GERestaurant_cot_3e-05_32_64_0.05_500,e2e_GERestaurant_cot_3e-05_32_64_0.05_0_500_8,0.7186,0.6951,0.5608
5766,GERestaurant,e2e,cot,3e-05,8,8,0.05,0,500,8,e2e_GERestaurant_cot_3e-05_8_8_0.05_500,e2e_GERestaurant_cot_3e-05_8_8_0.05_0_500_8,0.6897,0.6126,0.5263


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.765
0.7193
0.7448
0.7671
0.7319
0.7186
0.7217
0.6897
         Source  ddof1            H  p-unc
Kruskal  config      7  2108.771027    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.767995,0.719491,0.689383,0.847607,0.644895,0.796305,True,805850.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.767995,0.745852,0.689383,0.847607,0.6797,0.81133,True,660041.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.767995,0.76744,0.689383,0.847607,0.69229,0.83412,True,501226.5,0.92436,0.988396,False
3,0.0003_32_32,3e-05_32_32,0.767995,0.732743,0.689383,0.847607,0.6607,0.8155,True,738689.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.767995,0.720808,0.689383,0.847607,0.6422,0.801605,True,795612.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.767995,0.723442,0.689383,0.847607,0.648075,0.8,True,787558.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.767995,0.691735,0.689383,0.847607,0.601792,0.7798,True,898197.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.719491,0.745852,0.644895,0.796305,0.6797,0.81133,True,309166.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.719491,0.76744,0.644895,0.796305,0.69229,0.83412,True,191706.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.719491,0.732743,0.644895,0.796305,0.6607,0.8155,True,408105.0,0.0,0.0,True


## ACSD

In [15]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'acsd',
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


### Full Dataset
### basic Prompt

In [17]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2136,GERestaurant,tasd,basic,0.0003,8,8,0.05,0,full,4,tasd_GERestaurant_basic_0.0003_8_8_0.05_full,tasd_GERestaurant_basic_0.0003_8_8_0.05_0_full_4,0.8,0.7553,0.6667
386,GERestaurant,tasd,basic,3e-05,32,32,0.05,0,full,9,tasd_GERestaurant_basic_3e-05_32_32_0.05_full,tasd_GERestaurant_basic_3e-05_32_32_0.05_0_full_9,0.7939,0.7536,0.6582
3287,GERestaurant,tasd,basic,3e-05,32,64,0.05,0,full,5,tasd_GERestaurant_basic_3e-05_32_64_0.05_full,tasd_GERestaurant_basic_3e-05_32_64_0.05_0_full_5,0.7927,0.7488,0.6565
432,GERestaurant,tasd,basic,0.0003,32,32,0.05,0,full,10,tasd_GERestaurant_basic_0.0003_32_32_0.05_full,tasd_GERestaurant_basic_0.0003_32_32_0.05_0_fu...,0.7926,0.747,0.6565
4504,GERestaurant,tasd,basic,3e-05,8,16,0.05,0,full,9,tasd_GERestaurant_basic_3e-05_8_16_0.05_full,tasd_GERestaurant_basic_3e-05_8_16_0.05_0_full_9,0.7827,0.7387,0.6431
6166,GERestaurant,tasd,basic,3e-05,8,8,0.05,0,full,8,tasd_GERestaurant_basic_3e-05_8_8_0.05_full,tasd_GERestaurant_basic_3e-05_8_8_0.05_0_full_8,0.7826,0.7373,0.6429
28,GERestaurant,tasd,basic,0.0003,8,16,0.05,0,full,2,tasd_GERestaurant_basic_0.0003_8_16_0.05_full,tasd_GERestaurant_basic_0.0003_8_16_0.05_0_full_2,0.7816,0.757,0.6415
1167,GERestaurant,tasd,basic,0.0003,32,64,0.05,0,full,3,tasd_GERestaurant_basic_0.0003_32_64_0.05_full,tasd_GERestaurant_basic_0.0003_32_64_0.05_0_fu...,0.7345,0.6932,0.5803


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.7919
0.7337
0.78
0.7992
0.7916
0.7904
0.7813
0.7803
         Source  ddof1            H  p-unc
Kruskal  config      7  2966.969491    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.792392,0.732612,0.750992,0.829102,0.689385,0.7718,True,980759.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.792392,0.780302,0.750992,0.829102,0.7426,0.817007,True,672857.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.792392,0.798567,0.750992,0.829102,0.76449,0.830303,True,411640.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.792392,0.792058,0.750992,0.829102,0.756898,0.825615,True,508677.0,0.501639,1.0,False
4,0.0003_32_32,3e-05_32_64,0.792392,0.790389,0.750992,0.829102,0.753897,0.823817,True,531825.0,0.01372,0.08232,False
5,0.0003_32_32,3e-05_8_16,0.792392,0.781258,0.750992,0.829102,0.744097,0.8161,True,661632.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.792392,0.781471,0.750992,0.829102,0.74368,0.81931,True,654272.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.732612,0.780302,0.689385,0.7718,0.7426,0.817007,True,43336.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.732612,0.798567,0.689385,0.7718,0.76449,0.830303,True,6920.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.732612,0.792058,0.689385,0.7718,0.756898,0.825615,True,14092.0,0.0,0.0,True


### Full Dataset
### context Prompt

In [252]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1467,en,GERestaurant,acsd,long,3e-05,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__long_acsd_3e-05_32_32_0.05_4_...,en_GERestaurant__long_acsd_3e-05_32_32_0.05_4_...,0.8019,0.7675,0.6693
1000,en,GERestaurant,acsd,long,3e-05,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__long_acsd_3e-05_8_16_0.05_4_f...,en_GERestaurant__long_acsd_3e-05_8_16_0.05_4_0...,0.7954,0.7558,0.6603
1089,en,GERestaurant,acsd,long,0.0003,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,4,en_GERestaurant__long_acsd_0.0003_8_16_0.05_4_...,en_GERestaurant__long_acsd_0.0003_8_16_0.05_4_...,0.794,0.7676,0.6583
883,en,GERestaurant,acsd,long,3e-05,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__long_acsd_3e-05_32_64_0.05_4_...,en_GERestaurant__long_acsd_3e-05_32_64_0.05_4_...,0.789,0.746,0.6516
470,en,GERestaurant,acsd,long,3e-05,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__long_acsd_3e-05_8_8_0.05_4_fu...,en_GERestaurant__long_acsd_3e-05_8_8_0.05_4_0_...,0.789,0.7524,0.6516
1074,en,GERestaurant,acsd,long,0.0003,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,3,en_GERestaurant__long_acsd_0.0003_8_8_0.05_4_f...,en_GERestaurant__long_acsd_0.0003_8_8_0.05_4_0...,0.7873,0.7383,0.6493
625,en,GERestaurant,acsd,long,0.0003,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__long_acsd_0.0003_32_32_0.05_4...,en_GERestaurant__long_acsd_0.0003_32_32_0.05_4...,0.78,0.7349,0.6393
752,en,GERestaurant,acsd,long,0.0003,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,1,en_GERestaurant__long_acsd_0.0003_32_64_0.05_4...,en_GERestaurant__long_acsd_0.0003_32_64_0.05_4...,0.7046,0.6753,0.5439


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.78
0.7046
0.794
0.7873
0.8019
0.789
0.7954
0.789
         Source  ddof1            H  p-unc
Kruskal  config      7  3093.273543    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.780544,0.704262,0.741982,0.822902,0.660397,0.746007,True,995777.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.780544,0.79386,0.741982,0.822902,0.7577,0.829533,True,310327.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.780544,0.787366,0.741982,0.822902,0.749285,0.821905,True,397011.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.780544,0.80091,0.741982,0.822902,0.764695,0.836503,True,224648.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.780544,0.789419,0.741982,0.822902,0.751988,0.825205,True,369895.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.780544,0.795543,0.741982,0.822902,0.758297,0.830705,True,283478.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.780544,0.7889,0.741982,0.822902,0.750877,0.8229,True,373868.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.704262,0.79386,0.660397,0.746007,0.7577,0.829533,True,654.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.704262,0.787366,0.660397,0.746007,0.749285,0.821905,True,2131.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.704262,0.80091,0.660397,0.746007,0.764695,0.836503,True,302.0,0.0,0.0,True


### Full Dataset
### CoT Prompt

In [253]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1228,en,GERestaurant,acsd,cot,0.0003,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_fu...,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_0_...,0.7794,0.7629,0.6384
1314,en,GERestaurant,acsd,cot,3e-05,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_f...,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_0...,0.764,0.7445,0.6181
1272,en,GERestaurant,acsd,cot,0.0003,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_f...,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_0...,0.755,0.724,0.6065
112,en,GERestaurant,acsd,cot,0.0003,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,0.7485,0.7054,0.5981
155,en,GERestaurant,acsd,cot,3e-05,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,5,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_f...,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_0...,0.745,0.7217,0.5937
60,en,GERestaurant,acsd,cot,3e-05,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_fu...,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_0_...,0.7416,0.715,0.5893
492,en,GERestaurant,acsd,cot,3e-05,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_ful...,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_0_f...,0.721,0.6938,0.5637
633,en,GERestaurant,acsd,cot,0.0003,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,0.6737,0.6551,0.5079


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1795
Eval Length:  359
Split: 0
0.7485
0.6737
0.755
0.7794
0.745
0.764
0.7416
0.721
         Source  ddof1            H  p-unc
Kruskal  config      7  4349.463604    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.748505,0.674619,0.699298,0.788502,0.624992,0.72051,True,986227.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.748505,0.755509,0.699298,0.788502,0.71119,0.7952,True,415533.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.748505,0.778798,0.699298,0.788502,0.73037,0.820712,True,174136.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.748505,0.743724,0.699298,0.788502,0.691867,0.787403,True,561229.0,2e-06,4e-06,True
4,0.0003_32_32,3e-05_32_64,0.748505,0.763472,0.699298,0.788502,0.718095,0.8025,True,320372.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.748505,0.741547,0.699298,0.788502,0.692595,0.7833,True,585090.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.748505,0.720515,0.699298,0.788502,0.6761,0.766803,True,806587.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.674619,0.755509,0.624992,0.72051,0.71119,0.7952,True,5793.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.674619,0.778798,0.624992,0.72051,0.73037,0.820712,True,1139.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.674619,0.743724,0.624992,0.72051,0.691867,0.787403,True,20820.0,0.0,0.0,True


### 1000
### basic Prompt

In [21]:
args.lr_setting = 1000
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
820,GERestaurant,tasd,basic,0.0003,8,8,0.05,0,1000,5,tasd_GERestaurant_basic_0.0003_8_8_0.05_1000,tasd_GERestaurant_basic_0.0003_8_8_0.05_0_1000_5,0.808,0.7663,0.6779
118,GERestaurant,tasd,basic,0.0003,8,16,0.05,0,1000,6,tasd_GERestaurant_basic_0.0003_8_16_0.05_1000,tasd_GERestaurant_basic_0.0003_8_16_0.05_0_1000_6,0.8044,0.7727,0.6729
5629,GERestaurant,tasd,basic,3e-05,32,64,0.05,0,1000,6,tasd_GERestaurant_basic_3e-05_32_64_0.05_1000,tasd_GERestaurant_basic_3e-05_32_64_0.05_0_1000_6,0.8017,0.7585,0.6691
254,GERestaurant,tasd,basic,3e-05,32,32,0.05,0,1000,10,tasd_GERestaurant_basic_3e-05_32_32_0.05_1000,tasd_GERestaurant_basic_3e-05_32_32_0.05_0_100...,0.7875,0.7532,0.6494
6135,GERestaurant,tasd,basic,0.0003,32,32,0.05,0,1000,8,tasd_GERestaurant_basic_0.0003_32_32_0.05_1000,tasd_GERestaurant_basic_0.0003_32_32_0.05_0_10...,0.7793,0.7578,0.6385
1926,GERestaurant,tasd,basic,3e-05,8,16,0.05,0,1000,5,tasd_GERestaurant_basic_3e-05_8_16_0.05_1000,tasd_GERestaurant_basic_3e-05_8_16_0.05_0_1000_5,0.7749,0.7326,0.6325
679,GERestaurant,tasd,basic,3e-05,8,8,0.05,0,1000,5,tasd_GERestaurant_basic_3e-05_8_8_0.05_1000,tasd_GERestaurant_basic_3e-05_8_8_0.05_0_1000_5,0.7682,0.7476,0.6237
2462,GERestaurant,tasd,basic,0.0003,32,64,0.05,0,1000,8,tasd_GERestaurant_basic_0.0003_32_64_0.05_1000,tasd_GERestaurant_basic_0.0003_32_64_0.05_0_10...,0.7111,0.6926,0.5517


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7739
0.7111
0.8044
0.8062
0.784
0.8
0.7716
0.765
         Source  ddof1            H  p-unc
Kruskal  config      7  3594.846549    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.773224,0.711416,0.719777,0.826605,0.645992,0.772907,True,930403.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.773224,0.803489,0.719777,0.826605,0.753395,0.8509,True,210430.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.773224,0.806643,0.719777,0.826605,0.756758,0.8558,True,187604.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.773224,0.782895,0.719777,0.826605,0.730185,0.835203,True,398387.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.773224,0.799094,0.719777,0.826605,0.7473,0.846507,True,250328.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.773224,0.772455,0.719777,0.826605,0.714892,0.82701,True,504218.0,0.743966,0.743966,False
6,0.0003_32_32,3e-05_8_8,0.773224,0.766289,0.719777,0.826605,0.703092,0.820705,True,560218.0,3e-06,1.5e-05,True
7,0.0003_32_64,0.0003_8_16,0.711416,0.803489,0.645992,0.772907,0.753395,0.8509,True,10945.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.711416,0.806643,0.645992,0.772907,0.756758,0.8558,True,8591.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.711416,0.782895,0.645992,0.772907,0.730185,0.835203,True,38793.0,0.0,0.0,True


### 1000
### context Prompt

In [22]:
args.lr_setting = 1000
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3255,GERestaurant,tasd,context,3e-05,32,64,0.05,0,1000,8,tasd_GERestaurant_context_3e-05_32_64_0.05_1000,tasd_GERestaurant_context_3e-05_32_64_0.05_0_1...,0.7941,0.7283,0.6585
4049,GERestaurant,tasd,context,0.0003,8,8,0.05,0,1000,8,tasd_GERestaurant_context_0.0003_8_8_0.05_1000,tasd_GERestaurant_context_0.0003_8_8_0.05_0_10...,0.7835,0.7526,0.6441
3073,GERestaurant,tasd,context,0.0003,8,16,0.05,0,1000,8,tasd_GERestaurant_context_0.0003_8_16_0.05_1000,tasd_GERestaurant_context_0.0003_8_16_0.05_0_1...,0.7832,0.7469,0.6436
5827,GERestaurant,tasd,context,3e-05,32,32,0.05,0,1000,5,tasd_GERestaurant_context_3e-05_32_32_0.05_1000,tasd_GERestaurant_context_3e-05_32_32_0.05_0_1...,0.7792,0.7176,0.6383
2285,GERestaurant,tasd,context,0.0003,32,32,0.05,0,1000,5,tasd_GERestaurant_context_0.0003_32_32_0.05_1000,tasd_GERestaurant_context_0.0003_32_32_0.05_0_...,0.7773,0.7395,0.6357
796,GERestaurant,tasd,context,3e-05,8,8,0.05,0,1000,8,tasd_GERestaurant_context_3e-05_8_8_0.05_1000,tasd_GERestaurant_context_3e-05_8_8_0.05_0_1000_8,0.7722,0.7188,0.6289
3348,GERestaurant,tasd,context,3e-05,8,16,0.05,0,1000,6,tasd_GERestaurant_context_3e-05_8_16_0.05_1000,tasd_GERestaurant_context_3e-05_8_16_0.05_0_10...,0.7672,0.6884,0.6224
4945,GERestaurant,tasd,context,0.0003,32,64,0.05,0,1000,8,tasd_GERestaurant_context_0.0003_32_64_0.05_1000,tasd_GERestaurant_context_0.0003_32_64_0.05_0_...,0.7253,0.706,0.5689


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7738
0.7252
0.7832
0.7819
0.7759
0.7924
0.7656
0.7705
         Source  ddof1            H  p-unc
Kruskal  config      7  2169.382775    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.77413,0.725569,0.711785,0.833017,0.6606,0.7864,True,857032.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.77413,0.783633,0.711785,0.833017,0.731795,0.833008,True,413587.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.77413,0.781629,0.711785,0.833017,0.72689,0.8337,True,431798.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.77413,0.777549,0.711785,0.833017,0.72969,0.822337,True,471901.5,0.029561,0.059122,False
4,0.0003_32_32,3e-05_32_64,0.77413,0.794244,0.711785,0.833017,0.74839,0.841002,True,310047.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.77413,0.765431,0.711785,0.833017,0.711583,0.818205,True,587623.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.77413,0.77114,0.711785,0.833017,0.718797,0.819512,True,535011.5,0.006703,0.020109,True
7,0.0003_32_64,0.0003_8_16,0.725569,0.783633,0.6606,0.7864,0.731795,0.833008,True,86492.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.725569,0.781629,0.6606,0.7864,0.72689,0.8337,True,93371.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.725569,0.777549,0.6606,0.7864,0.72969,0.822337,True,99185.0,0.0,0.0,True


### 1000
### CoT Prompt

In [256]:
args.lr_setting = 1000
args.task = 'tasd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1150,en,GERestaurant,acsd,cot,3e-05,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_1...,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_0...,0.7645,0.7223,0.6187
1297,en,GERestaurant,acsd,cot,3e-05,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_1...,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_0...,0.7512,0.7119,0.6015
632,en,GERestaurant,acsd,cot,0.0003,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,0.7494,0.69,0.5993
1413,en,GERestaurant,acsd,cot,0.0003,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_10...,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_0_...,0.7427,0.6752,0.5907
21,en,GERestaurant,acsd,cot,0.0003,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_1...,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_0...,0.7366,0.6731,0.583
972,en,GERestaurant,acsd,cot,3e-05,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_10...,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_0_...,0.7364,0.6887,0.5828
479,en,GERestaurant,acsd,cot,3e-05,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_100...,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_0_1...,0.7114,0.6609,0.5521
5,en,GERestaurant,acsd,cot,0.0003,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,0.707,0.6751,0.5468


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7494
0.707
0.7366
0.7427
0.7512
0.7645
0.7364
0.7114
         Source  ddof1            H  p-unc
Kruskal  config      7  2093.640967    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.749143,0.707361,0.690362,0.807708,0.646295,0.7633,True,833427.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.749143,0.736241,0.690362,0.807708,0.675697,0.79422,True,614670.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.749143,0.743764,0.690362,0.807708,0.679988,0.803725,True,546485.0,0.000318,0.001272,True
3,0.0003_32_32,3e-05_32_32,0.749143,0.751882,0.690362,0.807708,0.696685,0.81081,True,474047.0,0.044455,0.08891,False
4,0.0003_32_32,3e-05_32_64,0.749143,0.763323,0.690362,0.807708,0.7091,0.817403,True,366017.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.749143,0.736361,0.690362,0.807708,0.6727,0.797345,True,612867.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.749143,0.710386,0.690362,0.807708,0.645597,0.767803,True,811264.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.707361,0.736241,0.646295,0.7633,0.675697,0.79422,True,256563.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.707361,0.743764,0.646295,0.7633,0.679988,0.803725,True,206414.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.707361,0.751882,0.646295,0.7633,0.696685,0.81081,True,143385.5,0.0,0.0,True


### 500
### basic Prompt

In [24]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1338,GERestaurant,tasd,basic,0.0003,32,32,0.05,0,500,3,tasd_GERestaurant_basic_0.0003_32_32_0.05_500,tasd_GERestaurant_basic_0.0003_32_32_0.05_0_500_3,0.8117,0.7568,0.6831
2474,GERestaurant,tasd,basic,0.0003,8,16,0.05,0,500,3,tasd_GERestaurant_basic_0.0003_8_16_0.05_500,tasd_GERestaurant_basic_0.0003_8_16_0.05_0_500_3,0.7917,0.7518,0.6552
6906,GERestaurant,tasd,basic,3e-05,32,64,0.05,0,500,10,tasd_GERestaurant_basic_3e-05_32_64_0.05_500,tasd_GERestaurant_basic_3e-05_32_64_0.05_0_500_10,0.7842,0.7362,0.6449
4832,GERestaurant,tasd,basic,0.0003,8,8,0.05,0,500,8,tasd_GERestaurant_basic_0.0003_8_8_0.05_500,tasd_GERestaurant_basic_0.0003_8_8_0.05_0_500_8,0.775,0.7496,0.6327
308,GERestaurant,tasd,basic,3e-05,8,16,0.05,0,500,9,tasd_GERestaurant_basic_3e-05_8_16_0.05_500,tasd_GERestaurant_basic_3e-05_8_16_0.05_0_500_9,0.761,0.7024,0.6143
5125,GERestaurant,tasd,basic,3e-05,32,32,0.05,0,500,6,tasd_GERestaurant_basic_3e-05_32_32_0.05_500,tasd_GERestaurant_basic_3e-05_32_32_0.05_0_500_6,0.7533,0.6809,0.6042
6263,GERestaurant,tasd,basic,3e-05,8,8,0.05,0,500,9,tasd_GERestaurant_basic_3e-05_8_8_0.05_500,tasd_GERestaurant_basic_3e-05_8_8_0.05_0_500_9,0.7423,0.6902,0.5903
315,GERestaurant,tasd,basic,0.0003,32,64,0.05,0,500,7,tasd_GERestaurant_basic_0.0003_32_64_0.05_500,tasd_GERestaurant_basic_0.0003_32_64_0.05_0_500_7,0.7203,0.6651,0.5629


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8083
0.7173
0.7917
0.775
0.75
0.7841
0.7611
0.7424
         Source  ddof1            H  p-unc
Kruskal  config      7  3005.031629    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.808694,0.717399,0.744375,0.865815,0.644085,0.788805,True,969376.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.808694,0.793158,0.744375,0.865815,0.722888,0.85843,True,634135.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.808694,0.773913,0.744375,0.865815,0.698095,0.848515,True,765169.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.808694,0.750265,0.744375,0.865815,0.676852,0.828622,True,879855.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.808694,0.783141,0.744375,0.865815,0.715975,0.84934,True,710313.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.808694,0.76107,0.744375,0.865815,0.689335,0.836512,True,836589.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.808694,0.741484,0.744375,0.865815,0.663795,0.817007,True,911647.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.717399,0.793158,0.644085,0.788805,0.722888,0.85843,True,65992.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.717399,0.773913,0.644085,0.788805,0.698095,0.848515,True,138499.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.717399,0.750265,0.644085,0.788805,0.676852,0.828622,True,269028.5,0.0,0.0,True


### 500
### context Prompt

In [27]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
173,GERestaurant,tasd,context,0.0003,8,8,0.05,0,500,8,tasd_GERestaurant_context_0.0003_8_8_0.05_500,tasd_GERestaurant_context_0.0003_8_8_0.05_0_500_8,0.8,0.7678,0.6667
6511,GERestaurant,tasd,context,3e-05,32,64,0.05,0,500,10,tasd_GERestaurant_context_3e-05_32_64_0.05_500,tasd_GERestaurant_context_3e-05_32_64_0.05_0_5...,0.7983,0.7657,0.6643
5449,GERestaurant,tasd,context,0.0003,32,32,0.05,0,500,4,tasd_GERestaurant_context_0.0003_32_32_0.05_500,tasd_GERestaurant_context_0.0003_32_32_0.05_0_...,0.7822,0.712,0.6423
6778,GERestaurant,tasd,context,0.0003,8,16,0.05,0,500,9,tasd_GERestaurant_context_0.0003_8_16_0.05_500,tasd_GERestaurant_context_0.0003_8_16_0.05_0_5...,0.7818,0.7253,0.6418
4040,GERestaurant,tasd,context,0.0003,32,64,0.05,0,500,5,tasd_GERestaurant_context_0.0003_32_64_0.05_500,tasd_GERestaurant_context_0.0003_32_64_0.05_0_...,0.7544,0.7132,0.6056
2100,GERestaurant,tasd,context,3e-05,32,32,0.05,0,500,4,tasd_GERestaurant_context_3e-05_32_32_0.05_500,tasd_GERestaurant_context_3e-05_32_32_0.05_0_5...,0.7533,0.6776,0.6042
912,GERestaurant,tasd,context,3e-05,8,16,0.05,0,500,4,tasd_GERestaurant_context_3e-05_8_16_0.05_500,tasd_GERestaurant_context_3e-05_8_16_0.05_0_500_4,0.7257,0.6287,0.5695
4804,GERestaurant,tasd,context,3e-05,8,8,0.05,0,500,8,tasd_GERestaurant_context_3e-05_8_8_0.05_500,tasd_GERestaurant_context_3e-05_8_8_0.05_0_500_8,0.7178,0.631,0.5597


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7753
0.7511
0.7818
0.8
0.7532
0.7983
0.7257
0.7177
         Source  ddof1            H  p-unc
Kruskal  config      7  3168.416095    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.776559,0.750598,0.69954,0.848807,0.672098,0.830455,True,685264.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.776559,0.782154,0.69954,0.848807,0.710495,0.852222,True,461054.5,0.002562,0.007686,True
2,0.0003_32_32,0.0003_8_8,0.776559,0.799567,0.69954,0.848807,0.733,0.861925,True,330813.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.776559,0.752738,0.69954,0.848807,0.678092,0.819547,True,672114.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.776559,0.799077,0.69954,0.848807,0.721495,0.8707,True,341505.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.776559,0.725468,0.69954,0.848807,0.6529,0.796637,True,830731.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.776559,0.716973,0.69954,0.848807,0.63568,0.790135,True,859549.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.750598,0.782154,0.672098,0.830455,0.710495,0.852222,True,272464.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.750598,0.799567,0.672098,0.830455,0.733,0.861925,True,167656.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.750598,0.752738,0.672098,0.830455,0.678092,0.819547,True,478573.0,0.097059,0.194118,False


### 500
### CoT Prompt

In [259]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1424,en,GERestaurant,acsd,cot,0.0003,32,32,0.05,0,500,meta-llama-Meta-Llama-3-8B,7,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_32_0.05_4_...,0.7489,0.6614,0.5986
280,en,GERestaurant,acsd,cot,0.0003,8,16,0.05,0,500,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_5...,en_GERestaurant__cot_acsd_0.0003_8_16_0.05_4_0...,0.7327,0.6577,0.5782
588,en,GERestaurant,acsd,cot,0.0003,8,8,0.05,0,500,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_50...,en_GERestaurant__cot_acsd_0.0003_8_8_0.05_4_0_...,0.7234,0.6606,0.5667
467,en,GERestaurant,acsd,cot,3e-05,8,16,0.05,0,500,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_50...,en_GERestaurant__cot_acsd_3e-05_8_16_0.05_4_0_...,0.7168,0.6643,0.5586
1142,en,GERestaurant,acsd,cot,0.0003,32,64,0.05,0,500,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,en_GERestaurant__cot_acsd_0.0003_32_64_0.05_4_...,0.7064,0.6408,0.5461
576,en,GERestaurant,acsd,cot,3e-05,32,64,0.05,0,500,meta-llama-Meta-Llama-3-8B,9,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_5...,en_GERestaurant__cot_acsd_3e-05_32_64_0.05_4_0...,0.7059,0.6129,0.5455
71,en,GERestaurant,acsd,cot,3e-05,32,32,0.05,0,500,meta-llama-Meta-Llama-3-8B,8,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_5...,en_GERestaurant__cot_acsd_3e-05_32_32_0.05_4_0...,0.6845,0.5966,0.5203
202,en,GERestaurant,acsd,cot,3e-05,8,8,0.05,0,500,meta-llama-Meta-Llama-3-8B,6,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_500...,en_GERestaurant__cot_acsd_3e-05_8_8_0.05_4_0_5...,0.6441,0.5603,0.475


Loading dataset ...
Dataset name: GERestaurant
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7489
0.7064
0.7327
0.7234
0.6845
0.7059
0.7168
0.6441
         Source  ddof1            H  p-unc
Kruskal  config      7  2667.454931    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.751131,0.704768,0.672397,0.827315,0.616685,0.792342,True,780354.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.751131,0.732332,0.672397,0.827315,0.6531,0.810307,True,632573.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.751131,0.725339,0.672397,0.827315,0.645693,0.80851,True,675630.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.751131,0.684764,0.672397,0.827315,0.600812,0.765508,True,867382.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.751131,0.707668,0.672397,0.827315,0.634583,0.786052,True,784689.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.751131,0.716548,0.672397,0.827315,0.639973,0.7893,True,731069.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.751131,0.64334,0.672397,0.827315,0.56268,0.721925,True,970463.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.704768,0.732332,0.616685,0.792342,0.6531,0.810307,True,323626.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.704768,0.725339,0.616685,0.792342,0.645693,0.80851,True,373322.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.704768,0.684764,0.616685,0.792342,0.600812,0.765508,True,623779.0,0.0,0.0,True


## Evaluate Averages per Hyperparameter Combination

In [4]:
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e' or cond_parameters[0] == 'e2e-e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all
# Means
results_acd = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'acd', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_acsa = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'acsa',
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_e2e = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'e2e', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_tasd = results_all[np.logical_and.reduce([results_all['dataset'] == 'GERestaurant', 
                                                     results_all['task'] == 'tasd', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

results = pd.concat([results_acd, results_acsa, results_e2e, results_tasd])

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

results = results.reset_index()

idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

In [5]:
for comb, group in results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha']):
    print(comb)
    print(len(group))
    print(f"{np.mean(group['f1-micro'])*100:.2f}")

('0.0003', '32', '32')
33
81.84
('0.0003', '32', '64')
33
78.27
('0.0003', '8', '16')
33
82.55
('0.0003', '8', '8')
33
83.05
('3e-05', '32', '32')
33
81.86
('3e-05', '32', '64')
33
82.33
('3e-05', '8', '16')
33
81.20
('3e-05', '8', '8')
33
79.83
