In [1]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np
from preprocessing import loadDataset, createPrompts
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

pd.set_option('display.max_columns', None)
random.seed(42)

args = {
    'dataset': 'rest-16',
    'data_path': '../data',
    'lang': 'en',
    'split': 0
}

args = SimpleNamespace(**args)

RESULTS_PATH = '../results/ft_llm/'
N_SAMPLES = 1000

col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

def fetchFolders(args):

    if args.lr_setting == 0:
        lr_setting = 'full'
    else:
        lr_setting = str(args.lr_setting)
    
    results_sub = results_all[np.logical_and.reduce([results_all['dataset'] == args.dataset, 
                                                     results_all['task'] == args.task,
                                                     results_all['split'] == str(args.split),
                                                     results_all['lr_setting'] == lr_setting,
                                                     results_all['prompt'] == args.prompt_style])].sort_values(by = ['f1-micro'], ascending = False)
    
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]
    print(len(results_per_epoch))
    
    display(results_per_epoch.sort_values(by = ['f1-micro'], ascending = False).head(50))
    
    return results_per_epoch.apply(lambda x: x['path'], axis = 1)

def read_predictions(model_config):
    with open(f'../results/ft_llm/{model_config}/predictions.txt', 'r') as file:
        lines = file.readlines()
    
    # Funktion zur Dekodierung der escaped UTF-8 Zeichen
    def decode_escaped_utf8(text):
        # Entferne doppelte Backslashes und dekodiere den Text als UTF-8
        text = text.encode('latin1').decode('unicode_escape').encode('latin1').decode('utf-8')
        return text
    
    # Dekodieren der Zeilen
    decoded_lines = [decode_escaped_utf8(line.strip()) for line in lines]
    return decoded_lines
    
def bootstrap_sampling(gold, pred, n_samples=N_SAMPLES):
    bootstrap_samples = []
    for i in range(n_samples):
        # Ziehe eine Bootstrap-Stichprobe mit Zurücklegen aus dem Test-Set
        indices = np.random.choice(len(gold), len(gold), replace=True)
        bootstrap_samples.append([[gold[index] for index in indices], [pred[index] for index in indices]])
    return bootstrap_samples

def compute_f1_scores(ground_truth, predictions, args, n_samples=N_SAMPLES):

    predictions = [extractAspects(pred, args.task, args.prompt_style == 'cot', True) for pred in predictions]
    pred_labels, false_predictions = convertLabels(predictions, args.task, args.LABEL_SPACE)
    ground_truth = [extractAspects(gt, args.task, args.prompt_style == 'cot', False) for gt in ground_truth]
    gold_labels, _ = convertLabels(ground_truth, args.task, args.LABEL_SPACE)

    # combined = list(zip(gold_labels, pred_labels))
    bootstrap_samples = bootstrap_sampling(gold_labels, pred_labels, n_samples)
    f1_scores = []
    
    results_asp, results_asp_pol, results_pairs, results_pol, results_phrases = createResults(pred_labels, gold_labels, args.LABEL_SPACE, args.task)
    
    if args.task == 'acd':
        print(results_asp['Micro-AVG']['f1'])
    elif args.task == 'acsa':
        print(results_asp_pol['Micro-AVG']['f1'])
    elif args.task == 'e2e' or args.task == 'e2e-e':
        print(results_pol['Micro-AVG']['f1'])
    else:
        print(results_phrases['Micro-AVG']['f1'])
    
    for sample in bootstrap_samples:
        # gt_sample, pred_sample = zip(*sample)
        results_asp, results_asp_pol, results_pairs, results_pol, results_phrases = createResults(sample[1], sample[0], args.LABEL_SPACE, args.task)
        if args.task == 'acd':
            f1_scores.append(results_asp['Micro-AVG']['f1'])
        elif args.task == 'acsa':
            f1_scores.append(results_asp_pol['Micro-AVG']['f1'])
        elif args.task == 'e2e' or args.task == 'e2e-e':
            f1_scores.append(results_pol['Micro-AVG']['f1'])
        else:
            f1_scores.append(results_phrases['Micro-AVG']['f1'])
    
    return f1_scores

def computeStatistics(args):
    model_folders = fetchFolders(args)

    df_train, df_test, args.LABEL_SPACE = loadDataset(args.data_path, args.dataset, args.lr_setting, args.task, args.split)
    prompts_train, prompts_test, ground_truth = createPrompts(df_train, df_test, args)
    
    f1_scores = {}
    df_f1_scores = []
    
    for config in model_folders:
    
        predictions = read_predictions(config)
        
        scores = compute_f1_scores(ground_truth, predictions, args)
        f1_scores[config] = scores
        # print(pg.normality(scores))
        df_f1_scores.append(pd.DataFrame({'f1':scores, 'config': '_'.join(config.split('_')[3:6])}))

    df_f1_scores = pd.concat(df_f1_scores)
    
    # Kruskal-Wallis-Test durchführen
    results_kruskal = kruskal(data=df_f1_scores, dv='f1', between='config')
    print(results_kruskal)
    
    f1_scores = pd.DataFrame(f1_scores)
    
    # Wenn der Kruskal-Wallis-Test signifikant ist, führen wir die paarweisen Vergleiche durch
    if results_kruskal['p-unc'].iloc[0] < 0.05:

        comb = combinations(model_folders, 2)
        # Paarweise Vergleiche
        pairwise_comparisons = []
        
        for config_i, config_j in comb:
            # Bootstrapping-Vergleich
            ci_lower_1 = np.percentile(f1_scores[config_i], 2.5)
            ci_upper_1 = np.percentile(f1_scores[config_i], 97.5)
            ci_lower_2 = np.percentile(f1_scores[config_j], 2.5)
            ci_upper_2 = np.percentile(f1_scores[config_j], 97.5)

            ci_overlap = str(not (ci_lower_1 > ci_upper_2))
            
            # Mann-Whitney-U-Test
            mwu_gr = pg.mwu(f1_scores[config_i], f1_scores[config_j], alternative='two-sided')
            
            # Ergebnisse speichern
            pairwise_comparisons.append({
                'Model 1': '_'.join(config_i.split('_')[3:6]),
                'Model 2': '_'.join(config_j.split('_')[3:6]),
                "Mean Model 1": np.mean(f1_scores[config_i]),
                "Mean Model 2": np.mean(f1_scores[config_j]),
                'Model 1 CI Lower': ci_lower_1,
                'Model 1 CI Upper': ci_upper_1,
                'Model 2 CI Lower': ci_lower_2,
                'Model 2 CI Upper': ci_upper_2,
                'CI Overlap': ci_overlap,
                'U Statistic (Model1 > Model2)': mwu_gr['U-val']['MWU'],
                'P-Value (Model1 > Model2)': mwu_gr['p-val']['MWU'].round(6)
            })
    
        # Ergebnisse in einem DataFrame anzeigen
        pairwise_comparisons_df = pd.DataFrame(pairwise_comparisons)
        
        # Holm-Korrektur für p-Werte
        p_values = pairwise_comparisons_df['P-Value (Model1 > Model2)']
    
        corrected_p_values = pg.multicomp(p_values, method='holm')
    
        # Korrigierte p-Werte in den DataFrame einfügen
        pairwise_comparisons_df['Corrected P-Value (Model1 > Model2)'] = corrected_p_values[1]
        pairwise_comparisons_df['Significant (Model1 > Model2)'] = corrected_p_values[0]
    
        # Ergebnisse anzeigen
        print("Paarweise Vergleiche mit Holm-Korrektur:")
        display(pairwise_comparisons_df)
        pairwise_comparisons_df.to_csv(f'statistics/{args.task}_{args.dataset}_{args.prompt_style}_{args.lr_setting}.tsv', sep = '\t')
        
    else:
        print("Kruskal-Wallis-Test war nicht signifikant, keine weiteren Tests durchgeführt.")



In [2]:
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e' or cond_parameters[0] == 'e2e-e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all

In [6]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'acd',
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
83.72, 1.72
('1000', 'context')
84.34, 1.22
('500', 'basic')
81.24, 1.80
('500', 'context')
81.89, 1.74
('full', 'basic')
85.03, 1.91
('full', 'context')
85.98, 1.48
('orig', 'basic')
83.33, 0.00
('orig', 'context')
81.09, 0.00


## ACD

### Full Dataset
### Short Prompt

In [7]:
args.lr_setting = 0
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
38,rest-16,acd,basic,0.0003,8,16,0.05,0,full,8,acd_rest-16_basic_0.0003_8_16_0.05_full,acd_rest-16_basic_0.0003_8_16_0.05_0_full_8,0.8785,0.8174,0.7833
4628,rest-16,acd,basic,0.0003,32,32,0.05,0,full,10,acd_rest-16_basic_0.0003_32_32_0.05_full,acd_rest-16_basic_0.0003_32_32_0.05_0_full_10,0.8611,0.8085,0.7559
5824,rest-16,acd,basic,0.0003,8,8,0.05,0,full,7,acd_rest-16_basic_0.0003_8_8_0.05_full,acd_rest-16_basic_0.0003_8_8_0.05_0_full_7,0.8589,0.7869,0.7526
786,rest-16,acd,basic,3e-05,8,16,0.05,0,full,9,acd_rest-16_basic_3e-05_8_16_0.05_full,acd_rest-16_basic_3e-05_8_16_0.05_0_full_9,0.8535,0.8032,0.7444
6381,rest-16,acd,basic,3e-05,32,32,0.05,0,full,10,acd_rest-16_basic_3e-05_32_32_0.05_full,acd_rest-16_basic_3e-05_32_32_0.05_0_full_10,0.8497,0.8048,0.7387
3262,rest-16,acd,basic,3e-05,32,64,0.05,0,full,6,acd_rest-16_basic_3e-05_32_64_0.05_full,acd_rest-16_basic_3e-05_32_64_0.05_0_full_6,0.8471,0.8047,0.7347
1785,rest-16,acd,basic,3e-05,8,8,0.05,0,full,6,acd_rest-16_basic_3e-05_8_8_0.05_full,acd_rest-16_basic_3e-05_8_8_0.05_0_full_6,0.8465,0.793,0.7339
2003,rest-16,acd,basic,0.0003,32,64,0.05,0,full,7,acd_rest-16_basic_0.0003_32_64_0.05_full,acd_rest-16_basic_0.0003_32_64_0.05_0_full_7,0.8068,0.6832,0.6762


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.861
0.8068
0.8785
0.8589
0.8497
0.8471
0.8535
0.8465
         Source  ddof1            H  p-unc
Kruskal  config      7  3866.035164    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.860809,0.80747,0.83309,0.889205,0.770497,0.841615,True,989734.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.860809,0.877737,0.83309,0.889205,0.846195,0.905552,True,210579.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.860809,0.858907,0.83309,0.889205,0.822975,0.892302,True,526010.5,0.043985,0.08797,False
3,0.0003_32_32,3e-05_32_32,0.860809,0.849465,0.83309,0.889205,0.81738,0.879703,True,692905.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.860809,0.846854,0.83309,0.889205,0.816295,0.877803,True,738689.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.860809,0.853765,0.83309,0.889205,0.824997,0.882818,True,629295.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.860809,0.846147,0.83309,0.889205,0.813392,0.875707,True,747009.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.80747,0.877737,0.770497,0.841615,0.846195,0.905552,True,2112.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.80747,0.858907,0.770497,0.841615,0.822975,0.892302,True,21580.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.80747,0.849465,0.770497,0.841615,0.81738,0.879703,True,43155.5,0.0,0.0,True


### Full Dataset
### Long Prompt

In [8]:
args.lr_setting = 0
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6788,rest-16,acd,context,0.0003,8,8,0.05,0,full,6,acd_rest-16_context_0.0003_8_8_0.05_full,acd_rest-16_context_0.0003_8_8_0.05_0_full_6,0.8721,0.8406,0.7732
5993,rest-16,acd,context,0.0003,8,16,0.05,0,full,9,acd_rest-16_context_0.0003_8_16_0.05_full,acd_rest-16_context_0.0003_8_16_0.05_0_full_9,0.872,0.8324,0.773
3671,rest-16,acd,context,3e-05,8,8,0.05,0,full,9,acd_rest-16_context_3e-05_8_8_0.05_full,acd_rest-16_context_3e-05_8_8_0.05_0_full_9,0.8667,0.8299,0.7647
436,rest-16,acd,context,3e-05,32,32,0.05,0,full,6,acd_rest-16_context_3e-05_32_32_0.05_full,acd_rest-16_context_3e-05_32_32_0.05_0_full_6,0.8662,0.8201,0.7639
6772,rest-16,acd,context,0.0003,32,32,0.05,0,full,6,acd_rest-16_context_0.0003_32_32_0.05_full,acd_rest-16_context_0.0003_32_32_0.05_0_full_6,0.8645,0.8174,0.7613
264,rest-16,acd,context,3e-05,32,64,0.05,0,full,7,acd_rest-16_context_3e-05_32_64_0.05_full,acd_rest-16_context_3e-05_32_64_0.05_0_full_7,0.8603,0.8196,0.7547
3227,rest-16,acd,context,3e-05,8,16,0.05,0,full,7,acd_rest-16_context_3e-05_8_16_0.05_full,acd_rest-16_context_3e-05_8_16_0.05_0_full_7,0.8531,0.8026,0.7438
921,rest-16,acd,context,0.0003,32,64,0.05,0,full,8,acd_rest-16_context_0.0003_32_64_0.05_full,acd_rest-16_context_0.0003_32_64_0.05_0_full_8,0.8237,0.773,0.7002


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.8645
0.8237
0.872
0.8721
0.8662
0.8602
0.853
0.8667
         Source  ddof1            H  p-unc
Kruskal  config      7  2997.110708    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.864288,0.823925,0.833295,0.8923,0.788073,0.858903,True,958552.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.864288,0.871968,0.833295,0.8923,0.843498,0.898718,True,359755.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.864288,0.872677,0.833295,0.8923,0.842695,0.9005,True,346348.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.864288,0.865627,0.833295,0.8923,0.837797,0.89271,True,474548.5,0.04873,0.14619,False
4,0.0003_32_32,3e-05_32_64,0.864288,0.86069,0.833295,0.8923,0.830795,0.890402,True,565242.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.864288,0.853413,0.833295,0.8923,0.8223,0.883705,True,690199.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.864288,0.866569,0.833295,0.8923,0.839298,0.893502,True,459832.5,0.001867,0.007468,True
7,0.0003_32_64,0.0003_8_16,0.823925,0.871968,0.788073,0.858903,0.843498,0.898718,True,17056.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.823925,0.872677,0.788073,0.858903,0.842695,0.9005,True,17784.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.823925,0.865627,0.788073,0.858903,0.837797,0.89271,True,32238.0,0.0,0.0,True


### 1000
### Short Prompt

In [9]:
args.lr_setting = 1000
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3993,rest-16,acd,basic,0.0003,32,32,0.05,0,1000,8,acd_rest-16_basic_0.0003_32_32_0.05_1000,acd_rest-16_basic_0.0003_32_32_0.05_0_1000_8,0.8612,0.7768,0.7563
667,rest-16,acd,basic,3e-05,32,64,0.05,0,1000,10,acd_rest-16_basic_3e-05_32_64_0.05_1000,acd_rest-16_basic_3e-05_32_64_0.05_0_1000_10,0.8572,0.8079,0.75
4786,rest-16,acd,basic,3e-05,32,32,0.05,0,1000,4,acd_rest-16_basic_3e-05_32_32_0.05_1000,acd_rest-16_basic_3e-05_32_32_0.05_0_1000_4,0.8474,0.8096,0.7352
223,rest-16,acd,basic,3e-05,8,16,0.05,0,1000,4,acd_rest-16_basic_3e-05_8_16_0.05_1000,acd_rest-16_basic_3e-05_8_16_0.05_0_1000_4,0.8404,0.7822,0.7247
4623,rest-16,acd,basic,0.0003,8,8,0.05,0,1000,5,acd_rest-16_basic_0.0003_8_8_0.05_1000,acd_rest-16_basic_0.0003_8_8_0.05_0_1000_5,0.8337,0.7847,0.7148
1143,rest-16,acd,basic,0.0003,8,16,0.05,0,1000,9,acd_rest-16_basic_0.0003_8_16_0.05_1000,acd_rest-16_basic_0.0003_8_16_0.05_0_1000_9,0.8289,0.755,0.7077
2226,rest-16,acd,basic,3e-05,8,8,0.05,0,1000,8,acd_rest-16_basic_3e-05_8_8_0.05_1000,acd_rest-16_basic_3e-05_8_8_0.05_0_1000_8,0.823,0.7717,0.6993
2453,rest-16,acd,basic,0.0003,32,64,0.05,0,1000,4,acd_rest-16_basic_0.0003_32_64_0.05_1000,acd_rest-16_basic_0.0003_32_64_0.05_0_1000_4,0.8058,0.7187,0.6747


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8612
0.8058
0.8289
0.8337
0.8474
0.8571
0.8404
0.8231
         Source  ddof1            H  p-unc
Kruskal  config      7  3252.931738    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.86165,0.803914,0.825992,0.895402,0.755795,0.847407,True,974962.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.86165,0.829097,0.825992,0.895402,0.786798,0.871905,True,874497.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.86165,0.834653,0.825992,0.895402,0.792585,0.877603,True,831476.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.86165,0.84755,0.825992,0.895402,0.805677,0.8846,True,703146.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.86165,0.857123,0.825992,0.895402,0.817995,0.893707,True,564978.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.86165,0.840692,0.825992,0.895402,0.797292,0.880305,True,781618.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.86165,0.823943,0.825992,0.895402,0.780895,0.864703,True,913810.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.803914,0.829097,0.755795,0.847407,0.786798,0.871905,True,220285.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.803914,0.834653,0.755795,0.847407,0.792585,0.877603,True,172066.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.803914,0.84755,0.755795,0.847407,0.805677,0.8846,True,79527.5,0.0,0.0,True


### 1000
### Long Prompt

In [10]:
args.lr_setting = 1000
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1857,rest-16,acd,context,0.0003,32,32,0.05,0,1000,9,acd_rest-16_context_0.0003_32_32_0.05_1000,acd_rest-16_context_0.0003_32_32_0.05_0_1000_9,0.8583,0.7985,0.7518
3603,rest-16,acd,context,3e-05,32,32,0.05,0,1000,9,acd_rest-16_context_3e-05_32_32_0.05_1000,acd_rest-16_context_3e-05_32_32_0.05_0_1000_9,0.8542,0.8049,0.7455
1797,rest-16,acd,context,0.0003,8,16,0.05,0,1000,6,acd_rest-16_context_0.0003_8_16_0.05_1000,acd_rest-16_context_0.0003_8_16_0.05_0_1000_6,0.8502,0.8244,0.7394
4058,rest-16,acd,context,0.0003,8,8,0.05,0,1000,7,acd_rest-16_context_0.0003_8_8_0.05_1000,acd_rest-16_context_0.0003_8_8_0.05_0_1000_7,0.8495,0.7753,0.7384
6350,rest-16,acd,context,3e-05,32,64,0.05,0,1000,10,acd_rest-16_context_3e-05_32_64_0.05_1000,acd_rest-16_context_3e-05_32_64_0.05_0_1000_10,0.8429,0.7917,0.7285
4310,rest-16,acd,context,3e-05,8,8,0.05,0,1000,8,acd_rest-16_context_3e-05_8_8_0.05_1000,acd_rest-16_context_3e-05_8_8_0.05_0_1000_8,0.8384,0.7821,0.7218
6036,rest-16,acd,context,3e-05,8,16,0.05,0,1000,8,acd_rest-16_context_3e-05_8_16_0.05_1000,acd_rest-16_context_3e-05_8_16_0.05_0_1000_8,0.8367,0.7908,0.7193
5066,rest-16,acd,context,0.0003,32,64,0.05,0,1000,7,acd_rest-16_context_0.0003_32_64_0.05_1000,acd_rest-16_context_0.0003_32_64_0.05_0_1000_7,0.817,0.7438,0.6906


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8583
0.817
0.8502
0.8495
0.8542
0.8429
0.8367
0.8384
         Source  ddof1            H  p-unc
Kruskal  config      7  1960.979183    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.858165,0.817182,0.817573,0.895703,0.77446,0.860915,True,914114.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.858165,0.851686,0.817573,0.895703,0.81139,0.8907,True,589308.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.858165,0.849615,0.817573,0.895703,0.810597,0.888022,True,617542.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.858165,0.854755,0.817573,0.895703,0.81236,0.89332,True,543383.5,0.000781,0.001562,True
4,0.0003_32_32,3e-05_32_64,0.858165,0.843631,0.817573,0.895703,0.8032,0.878402,True,697211.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.858165,0.835618,0.817573,0.895703,0.795685,0.873402,True,783684.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.858165,0.839362,0.817573,0.895703,0.798382,0.884505,True,734180.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.817182,0.851686,0.77446,0.860915,0.81139,0.8907,True,122972.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.817182,0.849615,0.77446,0.860915,0.810597,0.888022,True,140021.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.817182,0.854755,0.77446,0.860915,0.81236,0.89332,True,107010.5,0.0,0.0,True


### 500
### Short Prompt

In [11]:
args.lr_setting = 500
args.task = 'acd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2873,rest-16,acd,basic,0.0003,8,16,0.05,0,500,10,acd_rest-16_basic_0.0003_8_16_0.05_500,acd_rest-16_basic_0.0003_8_16_0.05_0_500_10,0.8413,0.7889,0.726
6127,rest-16,acd,basic,0.0003,8,8,0.05,0,500,10,acd_rest-16_basic_0.0003_8_8_0.05_500,acd_rest-16_basic_0.0003_8_8_0.05_0_500_10,0.8259,0.7551,0.7034
1913,rest-16,acd,basic,0.0003,32,32,0.05,0,500,5,acd_rest-16_basic_0.0003_32_32_0.05_500,acd_rest-16_basic_0.0003_32_32_0.05_0_500_5,0.8249,0.7165,0.702
1667,rest-16,acd,basic,0.0003,32,64,0.05,0,500,3,acd_rest-16_basic_0.0003_32_64_0.05_500,acd_rest-16_basic_0.0003_32_64_0.05_0_500_3,0.8112,0.7438,0.6824
2665,rest-16,acd,basic,3e-05,8,16,0.05,0,500,9,acd_rest-16_basic_3e-05_8_16_0.05_500,acd_rest-16_basic_3e-05_8_16_0.05_0_500_9,0.811,0.7012,0.6821
1473,rest-16,acd,basic,3e-05,32,32,0.05,0,500,9,acd_rest-16_basic_3e-05_32_32_0.05_500,acd_rest-16_basic_3e-05_32_32_0.05_0_500_9,0.8108,0.6933,0.6818
5810,rest-16,acd,basic,3e-05,32,64,0.05,0,500,10,acd_rest-16_basic_3e-05_32_64_0.05_500,acd_rest-16_basic_3e-05_32_64_0.05_0_500_10,0.7939,0.6819,0.6582
6838,rest-16,acd,basic,3e-05,8,8,0.05,0,500,5,acd_rest-16_basic_3e-05_8_8_0.05_500,acd_rest-16_basic_3e-05_8_8_0.05_0_500_5,0.7799,0.7063,0.6392


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8249
0.8112
0.8413
0.8259
0.8108
0.7939
0.811
0.7799
         Source  ddof1            H  p-unc
Kruskal  config      7  1757.701196    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.825071,0.809874,0.76389,0.882622,0.749975,0.871112,True,634842.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.825071,0.840005,0.76389,0.882622,0.7755,0.898022,True,368325.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.825071,0.824555,0.76389,0.882622,0.759792,0.883715,True,505047.5,0.695913,1.0,False
3,0.0003_32_32,3e-05_32_32,0.825071,0.811179,0.76389,0.882622,0.749977,0.868202,True,620866.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.825071,0.793658,0.76389,0.882622,0.728097,0.856,True,756727.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.825071,0.81183,0.76389,0.882622,0.746305,0.8696,True,611037.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.825071,0.780726,0.76389,0.882622,0.709195,0.843807,True,827917.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.809874,0.840005,0.749975,0.871112,0.7755,0.898022,True,246566.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.809874,0.824555,0.749975,0.871112,0.759792,0.883715,True,366888.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.809874,0.811179,0.749975,0.871112,0.749977,0.868202,True,486007.0,0.278546,0.835638,False


### 500
### Long Prompt

In [12]:
args.lr_setting = 500
args.task = 'acd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6942,rest-16,acd,context,0.0003,8,16,0.05,0,500,6,acd_rest-16_context_0.0003_8_16_0.05_500,acd_rest-16_context_0.0003_8_16_0.05_0_500_6,0.8413,0.7926,0.726
6692,rest-16,acd,context,0.0003,8,8,0.05,0,500,9,acd_rest-16_context_0.0003_8_8_0.05_500,acd_rest-16_context_0.0003_8_8_0.05_0_500_9,0.8367,0.7513,0.7192
3813,rest-16,acd,context,3e-05,32,64,0.05,0,500,5,acd_rest-16_context_3e-05_32_64_0.05_500,acd_rest-16_context_3e-05_32_64_0.05_0_500_5,0.8287,0.7548,0.7075
5018,rest-16,acd,context,0.0003,32,32,0.05,0,500,7,acd_rest-16_context_0.0003_32_32_0.05_500,acd_rest-16_context_0.0003_32_32_0.05_0_500_7,0.8273,0.7371,0.7055
1763,rest-16,acd,context,3e-05,8,16,0.05,0,500,10,acd_rest-16_context_3e-05_8_16_0.05_500,acd_rest-16_context_3e-05_8_16_0.05_0_500_10,0.8142,0.7001,0.6867
297,rest-16,acd,context,0.0003,32,64,0.05,0,500,8,acd_rest-16_context_0.0003_32_64_0.05_500,acd_rest-16_context_0.0003_32_64_0.05_0_500_8,0.811,0.6443,0.6821
3724,rest-16,acd,context,3e-05,32,32,0.05,0,500,7,acd_rest-16_context_3e-05_32_32_0.05_500,acd_rest-16_context_3e-05_32_32_0.05_0_500_7,0.808,0.7045,0.6779
4363,rest-16,acd,context,3e-05,8,8,0.05,0,500,4,acd_rest-16_context_3e-05_8_8_0.05_500,acd_rest-16_context_3e-05_8_8_0.05_0_500_4,0.7837,0.6817,0.6443


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.8273
0.811
0.8413
0.8367
0.808
0.8287
0.8142
0.7837
         Source  ddof1            H  p-unc
Kruskal  config      7  1652.487969    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.827727,0.812634,0.76229,0.887003,0.744883,0.87301,True,626111.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.827727,0.841894,0.76229,0.887003,0.776998,0.897605,True,371803.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.827727,0.836783,0.76229,0.887003,0.77146,0.89881,True,421276.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.827727,0.806567,0.76229,0.887003,0.728097,0.868208,True,666218.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.827727,0.828968,0.76229,0.887003,0.763345,0.892203,True,488373.0,0.367925,0.443986,False
5,0.0003_32_32,3e-05_8_16,0.827727,0.81475,0.76229,0.887003,0.751783,0.878,True,615145.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.827727,0.7843,0.76229,0.887003,0.716695,0.848012,True,821988.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.812634,0.841894,0.744883,0.87301,0.776998,0.897605,True,262657.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.812634,0.836783,0.744883,0.87301,0.77146,0.89881,True,306751.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.812634,0.806567,0.744883,0.87301,0.728097,0.868208,True,546051.0,0.000362,0.00132,True


# ACSA

In [14]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'acsa', 
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
78.98, 2.22
('1000', 'context')
80.76, 1.94
('1000', 'cot')
79.08, 2.13
('500', 'basic')
77.03, 2.72
('500', 'context')
76.96, 1.95
('500', 'cot')
76.76, 3.32
('full', 'basic')
81.10, 2.75
('full', 'context')
82.02, 3.18
('full', 'cot')
79.05, 5.86
('orig', 'basic')
78.00, 0.00
('orig', 'context')
81.61, 0.00
('orig', 'cot')
82.55, 0.00


### Full Dataset
### Short Prompt

In [16]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5270,rest-16,acsa,basic,3e-05,32,64,0.05,0,full,7,acsa_rest-16_basic_3e-05_32_64_0.05_full,acsa_rest-16_basic_3e-05_32_64_0.05_0_full_7,0.8363,0.7677,0.7187
4246,rest-16,acsa,basic,0.0003,8,8,0.05,0,full,5,acsa_rest-16_basic_0.0003_8_8_0.05_full,acsa_rest-16_basic_0.0003_8_8_0.05_0_full_5,0.8362,0.7645,0.7185
4092,rest-16,acsa,basic,3e-05,32,32,0.05,0,full,5,acsa_rest-16_basic_3e-05_32_32_0.05_full,acsa_rest-16_basic_3e-05_32_32_0.05_0_full_5,0.8307,0.7796,0.7105
5790,rest-16,acsa,basic,0.0003,8,16,0.05,0,full,9,acsa_rest-16_basic_0.0003_8_16_0.05_full,acsa_rest-16_basic_0.0003_8_16_0.05_0_full_9,0.8209,0.7323,0.6962
3681,rest-16,acsa,basic,3e-05,8,8,0.05,0,full,10,acsa_rest-16_basic_3e-05_8_8_0.05_full,acsa_rest-16_basic_3e-05_8_8_0.05_0_full_10,0.8175,0.728,0.6914
3125,rest-16,acsa,basic,3e-05,8,16,0.05,0,full,6,acsa_rest-16_basic_3e-05_8_16_0.05_full,acsa_rest-16_basic_3e-05_8_16_0.05_0_full_6,0.8138,0.7487,0.6861
230,rest-16,acsa,basic,0.0003,32,32,0.05,0,full,3,acsa_rest-16_basic_0.0003_32_32_0.05_full,acsa_rest-16_basic_0.0003_32_32_0.05_0_full_3,0.7778,0.6618,0.6364
4327,rest-16,acsa,basic,0.0003,32,64,0.05,0,full,3,acsa_rest-16_basic_0.0003_32_64_0.05_full,acsa_rest-16_basic_0.0003_32_64_0.05_0_full_3,0.7549,0.6301,0.6062


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.7778
0.7549
0.8209
0.8362
0.8307
0.8363
0.8138
0.8175
         Source  ddof1            H  p-unc
Kruskal  config      7  4919.451942    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.777688,0.754831,0.737677,0.814213,0.716788,0.793407,True,796784.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.777688,0.820884,0.737677,0.814213,0.786,0.85321,True,45609.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.777688,0.83543,0.737677,0.814213,0.80289,0.868905,True,11671.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.777688,0.830229,0.737677,0.814213,0.795992,0.861503,True,19593.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.777688,0.835906,0.737677,0.814213,0.800975,0.8669,True,11220.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.777688,0.813427,0.737677,0.814213,0.777977,0.84722,True,87042.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.777688,0.817557,0.737677,0.814213,0.7811,0.8507,True,64354.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.754831,0.820884,0.716788,0.793407,0.786,0.85321,True,5983.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.754831,0.83543,0.716788,0.793407,0.80289,0.868905,True,1110.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.754831,0.830229,0.716788,0.793407,0.795992,0.861503,True,1923.5,0.0,0.0,True


### Full Dataset
### Long Prompt

In [17]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6621,rest-16,acsa,context,3e-05,32,32,0.05,0,full,7,acsa_rest-16_context_3e-05_32_32_0.05_full,acsa_rest-16_context_3e-05_32_32_0.05_0_full_7,0.8513,0.8076,0.7411
6287,rest-16,acsa,context,3e-05,8,16,0.05,0,full,6,acsa_rest-16_context_3e-05_8_16_0.05_full,acsa_rest-16_context_3e-05_8_16_0.05_0_full_6,0.8402,0.7767,0.7244
4295,rest-16,acsa,context,3e-05,32,64,0.05,0,full,10,acsa_rest-16_context_3e-05_32_64_0.05_full,acsa_rest-16_context_3e-05_32_64_0.05_0_full_10,0.8373,0.7537,0.7201
3418,rest-16,acsa,context,0.0003,8,8,0.05,0,full,10,acsa_rest-16_context_0.0003_8_8_0.05_full,acsa_rest-16_context_0.0003_8_8_0.05_0_full_10,0.8323,0.7601,0.7128
1969,rest-16,acsa,context,0.0003,8,16,0.05,0,full,9,acsa_rest-16_context_0.0003_8_16_0.05_full,acsa_rest-16_context_0.0003_8_16_0.05_0_full_9,0.8255,0.7624,0.7029
1534,rest-16,acsa,context,3e-05,8,8,0.05,0,full,7,acsa_rest-16_context_3e-05_8_8_0.05_full,acsa_rest-16_context_3e-05_8_8_0.05_0_full_7,0.8245,0.7325,0.7015
4929,rest-16,acsa,context,0.0003,32,32,0.05,0,full,5,acsa_rest-16_context_0.0003_32_32_0.05_full,acsa_rest-16_context_0.0003_32_32_0.05_0_full_5,0.8086,0.7364,0.6787
6572,rest-16,acsa,context,0.0003,32,64,0.05,0,full,7,acsa_rest-16_context_0.0003_32_64_0.05_full,acsa_rest-16_context_0.0003_32_64_0.05_0_full_7,0.7421,0.6445,0.59


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.8086
0.7421
0.8256
0.8323
0.8513
0.8373
0.8402
0.8245
         Source  ddof1            H  p-unc
Kruskal  config      7  4549.991132    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.808467,0.742383,0.771195,0.8422,0.701797,0.781907,True,991229.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.808467,0.826731,0.771195,0.8422,0.788785,0.8635,True,235815.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.808467,0.831522,0.771195,0.8422,0.7956,0.866707,True,176808.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.808467,0.851968,0.771195,0.8422,0.821398,0.88061,True,32249.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.808467,0.838053,0.771195,0.8422,0.80639,0.869915,True,112005.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.808467,0.840522,0.771195,0.8422,0.810098,0.870503,True,83035.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.808467,0.824227,0.771195,0.8422,0.791687,0.854905,True,257298.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.742383,0.826731,0.701797,0.781907,0.788785,0.8635,True,1232.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.742383,0.831522,0.701797,0.781907,0.7956,0.866707,True,521.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.742383,0.851968,0.701797,0.781907,0.821398,0.88061,True,20.5,0.0,0.0,True


### Full Dataset
### CoT Prompt

In [18]:
args.lr_setting = 0
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3999,rest-16,acsa,cot,3e-05,32,64,0.05,0,full,5,acsa_rest-16_cot_3e-05_32_64_0.05_full,acsa_rest-16_cot_3e-05_32_64_0.05_0_full_5,0.8391,0.8138,0.7228
630,rest-16,acsa,cot,0.0003,8,8,0.05,0,full,8,acsa_rest-16_cot_0.0003_8_8_0.05_full,acsa_rest-16_cot_0.0003_8_8_0.05_0_full_8,0.8259,0.784,0.7034
1422,rest-16,acsa,cot,3e-05,32,32,0.05,0,full,9,acsa_rest-16_cot_3e-05_32_32_0.05_full,acsa_rest-16_cot_3e-05_32_32_0.05_0_full_9,0.8233,0.7847,0.6996
1335,rest-16,acsa,cot,3e-05,8,16,0.05,0,full,9,acsa_rest-16_cot_3e-05_8_16_0.05_full,acsa_rest-16_cot_3e-05_8_16_0.05_0_full_9,0.8135,0.7483,0.6856
3633,rest-16,acsa,cot,0.0003,32,32,0.05,0,full,10,acsa_rest-16_cot_0.0003_32_32_0.05_full,acsa_rest-16_cot_0.0003_32_32_0.05_0_full_10,0.8117,0.7816,0.683
5356,rest-16,acsa,cot,3e-05,8,8,0.05,0,full,10,acsa_rest-16_cot_3e-05_8_8_0.05_full,acsa_rest-16_cot_3e-05_8_8_0.05_0_full_10,0.79,0.7204,0.6529
4111,rest-16,acsa,cot,0.0003,32,64,0.05,0,full,8,acsa_rest-16_cot_0.0003_32_64_0.05_full,acsa_rest-16_cot_0.0003_32_64_0.05_0_full_8,0.7771,0.7067,0.6354
6881,rest-16,acsa,cot,0.0003,8,16,0.05,0,full,6,acsa_rest-16_cot_0.0003_8_16_0.05_full,acsa_rest-16_cot_0.0003_8_16_0.05_0_full_6,0.6436,0.6245,0.4745


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.8117
0.7771
0.6436
0.8259
0.8233
0.8391
0.8135
0.79
         Source  ddof1            H  p-unc
Kruskal  config      7  5560.525235    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.812783,0.777262,0.76999,0.852005,0.738977,0.812605,True,894521.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.812783,0.643629,0.76999,0.852005,0.6037,0.67751,False,1000000.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.812783,0.825773,0.76999,0.852005,0.791797,0.858825,True,319913.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.812783,0.823474,0.76999,0.852005,0.790995,0.854407,True,345542.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.812783,0.838845,0.76999,0.852005,0.804988,0.870305,True,166250.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.812783,0.813088,0.76999,0.852005,0.77859,0.84661,True,498565.5,0.911577,0.911577,False
6,0.0003_32_32,3e-05_8_8,0.812783,0.79027,0.76999,0.852005,0.756297,0.824205,True,794409.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.777262,0.643629,0.738977,0.812605,0.6037,0.67751,False,1000000.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.777262,0.825773,0.738977,0.812605,0.791797,0.858825,True,26616.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.777262,0.823474,0.738977,0.812605,0.790995,0.854407,True,29685.5,0.0,0.0,True


### 1000
### Short Prompt

In [19]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1707,rest-16,acsa,basic,3e-05,32,64,0.05,0,1000,9,acsa_rest-16_basic_3e-05_32_64_0.05_1000,acsa_rest-16_basic_3e-05_32_64_0.05_0_1000_9,0.8323,0.7697,0.7128
1318,rest-16,acsa,basic,0.0003,8,8,0.05,0,1000,10,acsa_rest-16_basic_0.0003_8_8_0.05_1000,acsa_rest-16_basic_0.0003_8_8_0.05_0_1000_10,0.8059,0.7438,0.6749
182,rest-16,acsa,basic,0.0003,32,32,0.05,0,1000,5,acsa_rest-16_basic_0.0003_32_32_0.05_1000,acsa_rest-16_basic_0.0003_32_32_0.05_0_1000_5,0.7974,0.77,0.6631
2101,rest-16,acsa,basic,3e-05,8,16,0.05,0,1000,5,acsa_rest-16_basic_3e-05_8_16_0.05_1000,acsa_rest-16_basic_3e-05_8_16_0.05_0_1000_5,0.7909,0.6888,0.6541
1257,rest-16,acsa,basic,3e-05,32,32,0.05,0,1000,4,acsa_rest-16_basic_3e-05_32_32_0.05_1000,acsa_rest-16_basic_3e-05_32_32_0.05_0_1000_4,0.7871,0.7202,0.649
1157,rest-16,acsa,basic,0.0003,8,16,0.05,0,1000,10,acsa_rest-16_basic_0.0003_8_16_0.05_1000,acsa_rest-16_basic_0.0003_8_16_0.05_0_1000_10,0.7852,0.7114,0.6465
4456,rest-16,acsa,basic,3e-05,8,8,0.05,0,1000,8,acsa_rest-16_basic_3e-05_8_8_0.05_1000,acsa_rest-16_basic_3e-05_8_8_0.05_0_1000_8,0.7604,0.6582,0.6134
5292,rest-16,acsa,basic,0.0003,32,64,0.05,0,1000,9,acsa_rest-16_basic_0.0003_32_64_0.05_1000,acsa_rest-16_basic_0.0003_32_64_0.05_0_1000_9,0.7594,0.5881,0.6121


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7974
0.7594
0.7853
0.8059
0.7871
0.8323
0.7909
0.7604
         Source  ddof1            H  p-unc
Kruskal  config      7  3436.391286    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.79764,0.759208,0.746195,0.84431,0.709398,0.810005,True,856133.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.79764,0.785517,0.746195,0.84431,0.730888,0.834407,True,626540.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.79764,0.80628,0.746195,0.84431,0.757992,0.850905,True,402688.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.79764,0.786993,0.746195,0.84431,0.745668,0.82911,True,626640.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.79764,0.832562,0.746195,0.84431,0.7883,0.875202,True,143967.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.79764,0.79097,0.746195,0.84431,0.745695,0.833718,True,581612.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.79764,0.761413,0.746195,0.84431,0.69508,0.81861,True,815433.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.759208,0.785517,0.709398,0.810005,0.730888,0.834407,True,236908.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.759208,0.80628,0.709398,0.810005,0.757992,0.850905,True,94624.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.759208,0.786993,0.709398,0.810005,0.745668,0.82911,True,206841.5,0.0,0.0,True


### 1000
### Long Prompt

In [20]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3548,rest-16,acsa,context,0.0003,8,8,0.05,0,1000,8,acsa_rest-16_context_0.0003_8_8_0.05_1000,acsa_rest-16_context_0.0003_8_8_0.05_0_1000_8,0.8295,0.7672,0.7086
3224,rest-16,acsa,context,3e-05,32,64,0.05,0,1000,10,acsa_rest-16_context_3e-05_32_64_0.05_1000,acsa_rest-16_context_3e-05_32_64_0.05_0_1000_10,0.8238,0.754,0.7003
4342,rest-16,acsa,context,3e-05,8,16,0.05,0,1000,8,acsa_rest-16_context_3e-05_8_16_0.05_1000,acsa_rest-16_context_3e-05_8_16_0.05_0_1000_8,0.8231,0.7376,0.6993
3471,rest-16,acsa,context,3e-05,32,32,0.05,0,1000,9,acsa_rest-16_context_3e-05_32_32_0.05_1000,acsa_rest-16_context_3e-05_32_32_0.05_0_1000_9,0.8124,0.7022,0.684
4161,rest-16,acsa,context,0.0003,32,32,0.05,0,1000,8,acsa_rest-16_context_0.0003_32_32_0.05_1000,acsa_rest-16_context_0.0003_32_32_0.05_0_1000_8,0.8082,0.7721,0.6781
1882,rest-16,acsa,context,0.0003,8,16,0.05,0,1000,6,acsa_rest-16_context_0.0003_8_16_0.05_1000,acsa_rest-16_context_0.0003_8_16_0.05_0_1000_6,0.8059,0.7201,0.6749
2079,rest-16,acsa,context,3e-05,8,8,0.05,0,1000,9,acsa_rest-16_context_3e-05_8_8_0.05_1000,acsa_rest-16_context_3e-05_8_8_0.05_0_1000_9,0.7925,0.6892,0.6564
1409,rest-16,acsa,context,0.0003,32,64,0.05,0,1000,10,acsa_rest-16_context_0.0003_32_64_0.05_1000,acsa_rest-16_context_0.0003_32_64_0.05_0_1000_10,0.7653,0.6574,0.6199


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.8082
0.7653
0.8059
0.8295
0.8124
0.8238
0.823
0.7925
         Source  ddof1           H  p-unc
Kruskal  config      7  2957.53139    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.808179,0.766091,0.762685,0.852825,0.720268,0.814503,True,897447.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.808179,0.807113,0.762685,0.852825,0.7599,0.850415,True,509525.0,0.460767,0.560366,False
2,0.0003_32_32,0.0003_8_8,0.808179,0.827777,0.762685,0.852825,0.781665,0.870308,True,269712.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.808179,0.812101,0.762685,0.852825,0.761495,0.8552,True,444700.0,1.8e-05,9e-05,True
4,0.0003_32_32,3e-05_32_64,0.808179,0.825053,0.762685,0.852825,0.781497,0.866925,True,296356.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.808179,0.823609,0.762685,0.852825,0.774275,0.867903,True,315290.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.808179,0.793794,0.762685,0.852825,0.748097,0.835803,True,672434.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.766091,0.807113,0.720268,0.814503,0.7599,0.850415,True,110237.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.766091,0.827777,0.720268,0.814503,0.781665,0.870308,True,33756.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.766091,0.812101,0.720268,0.814503,0.761495,0.8552,True,91198.5,0.0,0.0,True


### 1000
### CoT Prompt

In [21]:
args.lr_setting = 1000
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
573,rest-16,acsa,cot,0.0003,8,8,0.05,0,1000,10,acsa_rest-16_cot_0.0003_8_8_0.05_1000,acsa_rest-16_cot_0.0003_8_8_0.05_0_1000_10,0.8197,0.7756,0.6944
3281,rest-16,acsa,cot,3e-05,32,64,0.05,0,1000,7,acsa_rest-16_cot_3e-05_32_64_0.05_1000,acsa_rest-16_cot_3e-05_32_64_0.05_0_1000_7,0.812,0.7596,0.6835
1537,rest-16,acsa,cot,0.0003,8,16,0.05,0,1000,9,acsa_rest-16_cot_0.0003_8_16_0.05_1000,acsa_rest-16_cot_0.0003_8_16_0.05_0_1000_9,0.8,0.7025,0.6667
3346,rest-16,acsa,cot,3e-05,32,32,0.05,0,1000,6,acsa_rest-16_cot_3e-05_32_32_0.05_1000,acsa_rest-16_cot_3e-05_32_32_0.05_0_1000_6,0.7948,0.7196,0.6596
565,rest-16,acsa,cot,0.0003,32,32,0.05,0,1000,10,acsa_rest-16_cot_0.0003_32_32_0.05_1000,acsa_rest-16_cot_0.0003_32_32_0.05_0_1000_10,0.7923,0.6828,0.656
557,rest-16,acsa,cot,3e-05,8,8,0.05,0,1000,9,acsa_rest-16_cot_3e-05_8_8_0.05_1000,acsa_rest-16_cot_3e-05_8_8_0.05_0_1000_9,0.7832,0.7135,0.6436
2989,rest-16,acsa,cot,3e-05,8,16,0.05,0,1000,9,acsa_rest-16_cot_3e-05_8_16_0.05_1000,acsa_rest-16_cot_3e-05_8_16_0.05_0_1000_9,0.7784,0.681,0.6373
3440,rest-16,acsa,cot,0.0003,32,64,0.05,0,1000,10,acsa_rest-16_cot_0.0003_32_64_0.05_1000,acsa_rest-16_cot_0.0003_32_64_0.05_0_1000_10,0.7456,0.6147,0.5944


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7923
0.7456
0.8
0.8197
0.7949
0.812
0.7785
0.7832
         Source  ddof1            H  p-unc
Kruskal  config      7  3253.778876    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.791182,0.746098,0.745395,0.839022,0.69699,0.79742,True,894663.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.791182,0.799886,0.745395,0.839022,0.754997,0.843005,True,395409.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.791182,0.82036,0.745395,0.839022,0.77729,0.861102,True,182291.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.791182,0.797088,0.745395,0.839022,0.749485,0.841,True,426059.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.791182,0.811608,0.745395,0.839022,0.766082,0.85593,True,267215.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.791182,0.780043,0.745395,0.839022,0.7284,0.8272,True,624599.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.791182,0.782106,0.745395,0.839022,0.731695,0.8253,True,600976.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.746098,0.799886,0.69699,0.79742,0.754997,0.843005,True,67159.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.746098,0.82036,0.69699,0.79742,0.77729,0.861102,True,15610.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.746098,0.797088,0.69699,0.79742,0.749485,0.841,True,79255.5,0.0,0.0,True


### 500
### Short Prompt

In [27]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
168,rest-16,acsa,basic,3e-05,32,64,0.05,0,500,5,acsa_rest-16_basic_3e-05_32_64_0.05_500,acsa_rest-16_basic_3e-05_32_64_0.05_0_500_5,0.8016,0.7693,0.6688
512,rest-16,acsa,basic,0.0003,8,8,0.05,0,500,7,acsa_rest-16_basic_0.0003_8_8_0.05_500,acsa_rest-16_basic_0.0003_8_8_0.05_0_500_7,0.7984,0.766,0.6645
3436,rest-16,acsa,basic,0.0003,32,32,0.05,0,500,5,acsa_rest-16_basic_0.0003_32_32_0.05_500,acsa_rest-16_basic_0.0003_32_32_0.05_0_500_5,0.7967,0.7161,0.6621
363,rest-16,acsa,basic,0.0003,8,16,0.05,0,500,7,acsa_rest-16_basic_0.0003_8_16_0.05_500,acsa_rest-16_basic_0.0003_8_16_0.05_0_500_7,0.786,0.7132,0.6474
3948,rest-16,acsa,basic,3e-05,32,32,0.05,0,500,10,acsa_rest-16_basic_3e-05_32_32_0.05_500,acsa_rest-16_basic_3e-05_32_32_0.05_0_500_10,0.7656,0.6441,0.6203
2415,rest-16,acsa,basic,0.0003,32,64,0.05,0,500,6,acsa_rest-16_basic_0.0003_32_64_0.05_500,acsa_rest-16_basic_0.0003_32_64_0.05_0_500_6,0.7449,0.6994,0.5935
3661,rest-16,acsa,basic,3e-05,8,8,0.05,0,500,8,acsa_rest-16_basic_3e-05_8_8_0.05_500,acsa_rest-16_basic_3e-05_8_8_0.05_0_500_8,0.7352,0.6659,0.5813
3775,rest-16,acsa,basic,3e-05,8,16,0.05,0,500,10,acsa_rest-16_basic_3e-05_8_16_0.05_500,acsa_rest-16_basic_3e-05_8_16_0.05_0_500_10,0.7339,0.6238,0.5796


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7967
0.7449
0.786
0.7984
0.7656
0.8015
0.7339
0.7352
         Source  ddof1            H  p-unc
Kruskal  config      7  2983.673115    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.796371,0.746098,0.72568,0.8621,0.661597,0.820175,True,822307.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.796371,0.78565,0.72568,0.8621,0.716,0.84942,True,589451.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.796371,0.798041,0.72568,0.8621,0.723685,0.86275,True,487110.5,0.318215,0.954645,False
3,0.0003_32_32,3e-05_32_32,0.796371,0.76667,0.72568,0.8621,0.69569,0.834017,True,724643.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.796371,0.799388,0.72568,0.8621,0.7347,0.858327,True,474422.5,0.047625,0.1905,False
5,0.0003_32_32,3e-05_8_16,0.796371,0.73433,0.72568,0.8621,0.6587,0.8,True,893886.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.796371,0.733742,0.72568,0.8621,0.6639,0.804708,True,890061.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.746098,0.78565,0.661597,0.820175,0.716,0.84942,True,232206.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.746098,0.798041,0.661597,0.820175,0.723685,0.86275,True,168881.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.746098,0.76667,0.661597,0.820175,0.69569,0.834017,True,359199.0,0.0,0.0,True


### 500
### Long Prompt

In [23]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2483,rest-16,acsa,context,0.0003,8,8,0.05,0,500,6,acsa_rest-16_context_0.0003_8_8_0.05_500,acsa_rest-16_context_0.0003_8_8_0.05_0_500_6,0.8074,0.7252,0.677
3668,rest-16,acsa,context,0.0003,8,16,0.05,0,500,4,acsa_rest-16_context_0.0003_8_16_0.05_500,acsa_rest-16_context_0.0003_8_16_0.05_0_500_4,0.7837,0.7124,0.6443
4895,rest-16,acsa,context,3e-05,32,64,0.05,0,500,7,acsa_rest-16_context_3e-05_32_64_0.05_500,acsa_rest-16_context_3e-05_32_64_0.05_0_500_7,0.7816,0.6827,0.6415
3205,rest-16,acsa,context,0.0003,32,32,0.05,0,500,7,acsa_rest-16_context_0.0003_32_32_0.05_500,acsa_rest-16_context_0.0003_32_32_0.05_0_500_7,0.7752,0.6487,0.6329
2915,rest-16,acsa,context,3e-05,32,32,0.05,0,500,3,acsa_rest-16_context_3e-05_32_32_0.05_500,acsa_rest-16_context_3e-05_32_32_0.05_0_500_3,0.7552,0.6459,0.6067
4378,rest-16,acsa,context,3e-05,8,16,0.05,0,500,7,acsa_rest-16_context_3e-05_8_16_0.05_500,acsa_rest-16_context_3e-05_8_16_0.05_0_500_7,0.755,0.6448,0.6065
1840,rest-16,acsa,context,0.0003,32,64,0.05,0,500,8,acsa_rest-16_context_0.0003_32_64_0.05_500,acsa_rest-16_context_0.0003_32_64_0.05_0_500_8,0.753,0.6475,0.6038
4078,rest-16,acsa,context,3e-05,8,8,0.05,0,500,9,acsa_rest-16_context_3e-05_8_8_0.05_500,acsa_rest-16_context_3e-05_8_8_0.05_0_500_9,0.746,0.6706,0.5949


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7752
0.7529
0.7837
0.8074
0.7552
0.7816
0.755
0.746
         Source  ddof1            H  p-unc
Kruskal  config      7  2028.164485    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.775464,0.754601,0.717595,0.834,0.6932,0.816312,True,677681.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.775464,0.782484,0.717595,0.834,0.71018,0.852337,True,441359.0,6e-06,3.6e-05,True
2,0.0003_32_32,0.0003_8_8,0.775464,0.809117,0.717595,0.834,0.7559,0.863007,True,213171.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.775464,0.754916,0.717595,0.834,0.68218,0.8205,True,666352.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.775464,0.781582,0.717595,0.834,0.722688,0.84213,True,446044.5,2.9e-05,0.000145,True
5,0.0003_32_32,3e-05_8_16,0.775464,0.757264,0.717595,0.834,0.67259,0.829512,True,630288.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.775464,0.744429,0.717595,0.834,0.672,0.8163,True,735111.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.754601,0.782484,0.6932,0.816312,0.71018,0.852337,True,282136.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.754601,0.809117,0.6932,0.816312,0.7559,0.863007,True,98975.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.754601,0.754916,0.6932,0.816312,0.68218,0.8205,True,493269.0,0.602217,0.904302,False


### 500
### CoT Prompt

In [24]:
args.lr_setting = 500
args.task = 'acsa'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
1117,rest-16,acsa,cot,0.0003,8,8,0.05,0,500,7,acsa_rest-16_cot_0.0003_8_8_0.05_500,acsa_rest-16_cot_0.0003_8_8_0.05_0_500_7,0.8279,0.7063,0.7063
3710,rest-16,acsa,cot,3e-05,32,64,0.05,0,500,9,acsa_rest-16_cot_3e-05_32_64_0.05_500,acsa_rest-16_cot_3e-05_32_64_0.05_0_500_9,0.7855,0.6902,0.6467
6675,rest-16,acsa,cot,0.0003,8,16,0.05,0,500,6,acsa_rest-16_cot_0.0003_8_16_0.05_500,acsa_rest-16_cot_0.0003_8_16_0.05_0_500_6,0.7846,0.6942,0.6456
842,rest-16,acsa,cot,3e-05,32,32,0.05,0,500,8,acsa_rest-16_cot_3e-05_32_32_0.05_500,acsa_rest-16_cot_3e-05_32_32_0.05_0_500_8,0.7739,0.7338,0.6312
758,rest-16,acsa,cot,0.0003,32,32,0.05,0,500,5,acsa_rest-16_cot_0.0003_32_32_0.05_500,acsa_rest-16_cot_0.0003_32_32_0.05_0_500_5,0.7712,0.6412,0.6276
6617,rest-16,acsa,cot,3e-05,8,16,0.05,0,500,10,acsa_rest-16_cot_3e-05_8_16_0.05_500,acsa_rest-16_cot_3e-05_8_16_0.05_0_500_10,0.7531,0.6845,0.6039
3769,rest-16,acsa,cot,3e-05,8,8,0.05,0,500,9,acsa_rest-16_cot_3e-05_8_8_0.05_500,acsa_rest-16_cot_3e-05_8_8_0.05_0_500_9,0.7313,0.5875,0.5765
3519,rest-16,acsa,cot,0.0003,32,64,0.05,0,500,6,acsa_rest-16_cot_0.0003_32_64_0.05_500,acsa_rest-16_cot_0.0003_32_64_0.05_0_500_6,0.7131,0.624,0.5541


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7712
0.713
0.7846
0.8279
0.7739
0.7854
0.753
0.7313
         Source  ddof1           H  p-unc
Kruskal  config      7  3610.50939    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.768877,0.714086,0.693298,0.84132,0.63559,0.7895,True,844998.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.768877,0.785031,0.693298,0.84132,0.713797,0.84942,True,373091.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.768877,0.828025,0.693298,0.84132,0.763773,0.888,True,113770.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.768877,0.772729,0.693298,0.84132,0.701192,0.8405,True,469447.0,0.017981,0.035962,True
4,0.0003_32_32,3e-05_32_64,0.768877,0.784208,0.693298,0.84132,0.718187,0.845208,True,375283.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.768877,0.754166,0.693298,0.84132,0.68499,0.821615,True,612682.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.768877,0.729732,0.693298,0.84132,0.651988,0.8014,True,769161.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.714086,0.785031,0.63559,0.7895,0.713797,0.84942,True,88938.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.714086,0.828025,0.63559,0.7895,0.763773,0.888,True,11947.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.714086,0.772729,0.63559,0.7895,0.701192,0.8405,True,135892.0,0.0,0.0,True


# E2E

In [19]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'e2e', 
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'cot')
71.38, 4.52
('1000', 'long')
78.39, 1.05
('1000', 'short')
78.90, 2.05
('500', 'cot')
67.20, 3.60
('500', 'long')
72.14, 1.77
('500', 'short')
71.85, 2.08
('full', 'cot')
77.81, 1.61
('full', 'long')
81.26, 2.36
('full', 'short')
81.71, 2.52


In [28]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2378,rest-16,e2e,basic,3e-05,32,64,0.05,0,full,8,e2e_rest-16_basic_3e-05_32_64_0.05_full,e2e_rest-16_basic_3e-05_32_64_0.05_0_full_8,0.8406,0.7373,0.7251
3081,rest-16,e2e,basic,0.0003,8,8,0.05,0,full,7,e2e_rest-16_basic_0.0003_8_8_0.05_full,e2e_rest-16_basic_0.0003_8_8_0.05_0_full_7,0.8392,0.7339,0.7229
3460,rest-16,e2e,basic,3e-05,8,16,0.05,0,full,4,e2e_rest-16_basic_3e-05_8_16_0.05_full,e2e_rest-16_basic_3e-05_8_16_0.05_0_full_4,0.826,0.7053,0.7035
3669,rest-16,e2e,basic,3e-05,32,32,0.05,0,full,5,e2e_rest-16_basic_3e-05_32_32_0.05_full,e2e_rest-16_basic_3e-05_32_32_0.05_0_full_5,0.8237,0.7241,0.7002
5895,rest-16,e2e,basic,0.0003,32,32,0.05,0,full,9,e2e_rest-16_basic_0.0003_32_32_0.05_full,e2e_rest-16_basic_0.0003_32_32_0.05_0_full_9,0.822,0.7233,0.6977
6014,rest-16,e2e,basic,0.0003,8,16,0.05,0,full,7,e2e_rest-16_basic_0.0003_8_16_0.05_full,e2e_rest-16_basic_0.0003_8_16_0.05_0_full_7,0.8166,0.6888,0.6901
5478,rest-16,e2e,basic,3e-05,8,8,0.05,0,full,10,e2e_rest-16_basic_3e-05_8_8_0.05_full,e2e_rest-16_basic_3e-05_8_8_0.05_0_full_10,0.8139,0.6765,0.6862
5103,rest-16,e2e,basic,0.0003,32,64,0.05,0,full,3,e2e_rest-16_basic_0.0003_32_64_0.05_full,e2e_rest-16_basic_0.0003_32_64_0.05_0_full_3,0.7547,0.6223,0.6061


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.822
0.7547
0.8166
0.8392
0.8237
0.8406
0.826
0.8139
         Source  ddof1           H  p-unc
Kruskal  config      7  3681.27118    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.82244,0.755473,0.784282,0.8571,0.71248,0.796205,True,991088.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.82244,0.817424,0.784282,0.8571,0.781045,0.854612,True,575660.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.82244,0.839354,0.784282,0.8571,0.802792,0.873603,True,262383.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.82244,0.822831,0.784282,0.8571,0.783095,0.859305,True,493036.0,0.589709,0.589709,False
4,0.0003_32_32,3e-05_32_64,0.82244,0.840968,0.784282,0.8571,0.806088,0.8747,True,240071.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.82244,0.825707,0.784282,0.8571,0.792397,0.861102,True,454509.0,0.000427,0.001708,True
6,0.0003_32_32,3e-05_8_8,0.82244,0.813522,0.784282,0.8571,0.776998,0.848503,True,628774.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.755473,0.817424,0.71248,0.796205,0.781045,0.854612,True,14869.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.755473,0.839354,0.71248,0.796205,0.802792,0.873603,True,1597.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.755473,0.822831,0.71248,0.796205,0.783095,0.859305,True,10583.0,0.0,0.0,True


In [29]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
3375,rest-16,e2e,context,3e-05,32,64,0.05,0,full,8,e2e_rest-16_context_3e-05_32_64_0.05_full,e2e_rest-16_context_3e-05_32_64_0.05_0_full_8,0.8287,0.6964,0.7076
2269,rest-16,e2e,context,3e-05,32,32,0.05,0,full,9,e2e_rest-16_context_3e-05_32_32_0.05_full,e2e_rest-16_context_3e-05_32_32_0.05_0_full_9,0.8249,0.719,0.702
195,rest-16,e2e,context,0.0003,8,8,0.05,0,full,8,e2e_rest-16_context_0.0003_8_8_0.05_full,e2e_rest-16_context_0.0003_8_8_0.05_0_full_8,0.8238,0.6918,0.7004
3086,rest-16,e2e,context,3e-05,8,16,0.05,0,full,9,e2e_rest-16_context_3e-05_8_16_0.05_full,e2e_rest-16_context_3e-05_8_16_0.05_0_full_9,0.8218,0.6576,0.6975
1646,rest-16,e2e,context,0.0003,8,16,0.05,0,full,9,e2e_rest-16_context_0.0003_8_16_0.05_full,e2e_rest-16_context_0.0003_8_16_0.05_0_full_9,0.8199,0.7246,0.6948
2577,rest-16,e2e,context,3e-05,8,8,0.05,0,full,9,e2e_rest-16_context_3e-05_8_8_0.05_full,e2e_rest-16_context_3e-05_8_8_0.05_0_full_9,0.8173,0.6321,0.6911
4981,rest-16,e2e,context,0.0003,32,32,0.05,0,full,7,e2e_rest-16_context_0.0003_32_32_0.05_full,e2e_rest-16_context_0.0003_32_32_0.05_0_full_7,0.8129,0.7151,0.6847
4038,rest-16,e2e,context,0.0003,32,64,0.05,0,full,3,e2e_rest-16_context_0.0003_32_64_0.05_full,e2e_rest-16_context_0.0003_32_64_0.05_0_full_3,0.7513,0.6382,0.6016


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.8129
0.7513
0.8199
0.8238
0.8249
0.8287
0.8218
0.8173
         Source  ddof1            H  p-unc
Kruskal  config      7  2919.200803    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.812296,0.750694,0.774083,0.8494,0.71,0.79152,True,985722.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.812296,0.820494,0.774083,0.8494,0.78349,0.8575,True,381800.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.812296,0.823822,0.774083,0.8494,0.784495,0.85851,True,334770.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.812296,0.82469,0.774083,0.8494,0.788492,0.8604,True,326213.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.812296,0.829354,0.774083,0.8494,0.795697,0.862503,True,257151.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.812296,0.821371,0.774083,0.8494,0.784,0.858102,True,367436.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.812296,0.817231,0.774083,0.8494,0.779798,0.852037,True,427668.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.750694,0.820494,0.71,0.79152,0.78349,0.8575,True,5937.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.750694,0.823822,0.71,0.79152,0.784495,0.85851,True,5019.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.750694,0.82469,0.71,0.79152,0.788492,0.8604,True,3734.5,0.0,0.0,True


In [30]:
args.lr_setting = 0
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
967,rest-16,e2e,cot,0.0003,8,8,0.05,0,full,8,e2e_rest-16_cot_0.0003_8_8_0.05_full,e2e_rest-16_cot_0.0003_8_8_0.05_0_full_8,0.799,0.6977,0.6653
17,rest-16,e2e,cot,0.0003,32,32,0.05,0,full,8,e2e_rest-16_cot_0.0003_32_32_0.05_full,e2e_rest-16_cot_0.0003_32_32_0.05_0_full_8,0.7957,0.6878,0.6607
1225,rest-16,e2e,cot,0.0003,8,16,0.05,0,full,10,e2e_rest-16_cot_0.0003_8_16_0.05_full,e2e_rest-16_cot_0.0003_8_16_0.05_0_full_10,0.7938,0.6992,0.6581
6113,rest-16,e2e,cot,3e-05,32,32,0.05,0,full,5,e2e_rest-16_cot_3e-05_32_32_0.05_full,e2e_rest-16_cot_3e-05_32_32_0.05_0_full_5,0.7819,0.6583,0.6419
467,rest-16,e2e,cot,3e-05,8,16,0.05,0,full,6,e2e_rest-16_cot_3e-05_8_16_0.05_full,e2e_rest-16_cot_3e-05_8_16_0.05_0_full_6,0.7704,0.627,0.6265
5911,rest-16,e2e,cot,3e-05,8,8,0.05,0,full,7,e2e_rest-16_cot_3e-05_8_8_0.05_full,e2e_rest-16_cot_3e-05_8_8_0.05_0_full_7,0.767,0.6758,0.622
3226,rest-16,e2e,cot,3e-05,32,64,0.05,0,full,4,e2e_rest-16_cot_3e-05_32_64_0.05_full,e2e_rest-16_cot_3e-05_32_64_0.05_0_full_4,0.7668,0.647,0.6218
6582,rest-16,e2e,cot,0.0003,32,64,0.05,0,full,9,e2e_rest-16_cot_0.0003_32_64_0.05_full,e2e_rest-16_cot_0.0003_32_64_0.05_0_full_9,0.7506,0.6946,0.6008


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.7957
0.7506
0.7938
0.799
0.7819
0.7668
0.7704
0.767
         Source  ddof1            H  p-unc
Kruskal  config      7  3222.924339    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.795473,0.751103,0.756888,0.8321,0.711198,0.7886,True,945011.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.795473,0.795073,0.756888,0.8321,0.756692,0.834115,True,506021.0,0.64105,1.0,False
2,0.0003_32_32,0.0003_8_8,0.795473,0.799737,0.756888,0.8321,0.760997,0.838217,True,437880.0,2e-06,1e-05,True
3,0.0003_32_32,3e-05_32_32,0.795473,0.783014,0.756888,0.8321,0.744192,0.81943,True,672328.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.795473,0.767785,0.756888,0.8321,0.72809,0.804303,True,839184.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.795473,0.770225,0.756888,0.8321,0.732195,0.805222,True,819980.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.795473,0.767711,0.756888,0.8321,0.727695,0.809702,True,833530.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.751103,0.795073,0.711198,0.7886,0.756692,0.834115,True,58129.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.751103,0.799737,0.711198,0.7886,0.760997,0.838217,True,41719.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.751103,0.783014,0.711198,0.7886,0.744192,0.81943,True,128343.0,0.0,0.0,True


In [31]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
4388,rest-16,e2e,basic,3e-05,32,64,0.05,0,1000,8,e2e_rest-16_basic_3e-05_32_64_0.05_1000,e2e_rest-16_basic_3e-05_32_64_0.05_0_1000_8,0.8129,0.688,0.6847
6749,rest-16,e2e,basic,0.0003,8,16,0.05,0,1000,8,e2e_rest-16_basic_0.0003_8_16_0.05_1000,e2e_rest-16_basic_0.0003_8_16_0.05_0_1000_8,0.8057,0.7052,0.6747
4316,rest-16,e2e,basic,0.0003,8,8,0.05,0,1000,6,e2e_rest-16_basic_0.0003_8_8_0.05_1000,e2e_rest-16_basic_0.0003_8_8_0.05_0_1000_6,0.7952,0.6988,0.6601
4519,rest-16,e2e,basic,3e-05,32,32,0.05,0,1000,8,e2e_rest-16_basic_3e-05_32_32_0.05_1000,e2e_rest-16_basic_3e-05_32_32_0.05_0_1000_8,0.7944,0.6208,0.6589
213,rest-16,e2e,basic,0.0003,32,32,0.05,0,1000,5,e2e_rest-16_basic_0.0003_32_32_0.05_1000,e2e_rest-16_basic_0.0003_32_32_0.05_0_1000_5,0.7914,0.7035,0.6548
1469,rest-16,e2e,basic,3e-05,8,8,0.05,0,1000,9,e2e_rest-16_basic_3e-05_8_8_0.05_1000,e2e_rest-16_basic_3e-05_8_8_0.05_0_1000_9,0.7895,0.6405,0.6522
5850,rest-16,e2e,basic,3e-05,8,16,0.05,0,1000,8,e2e_rest-16_basic_3e-05_8_16_0.05_1000,e2e_rest-16_basic_3e-05_8_16_0.05_0_1000_8,0.7833,0.6075,0.6438
2298,rest-16,e2e,basic,0.0003,32,64,0.05,0,1000,7,e2e_rest-16_basic_0.0003_32_64_0.05_1000,e2e_rest-16_basic_0.0003_32_64_0.05_0_1000_7,0.7398,0.6407,0.587


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7914
0.7398
0.8057
0.7952
0.7944
0.8129
0.7833
0.7895
         Source  ddof1            H  p-unc
Kruskal  config      7  2615.120049    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.791639,0.741256,0.738,0.84531,0.687597,0.79191,True,908926.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.791639,0.807499,0.738,0.84531,0.751097,0.856627,True,334745.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.791639,0.796135,0.738,0.84531,0.742873,0.851007,True,454886.5,0.000477,0.002385,True
3,0.0003_32_32,3e-05_32_32,0.791639,0.793712,0.738,0.84531,0.740558,0.84052,True,475087.5,0.053706,0.161118,False
4,0.0003_32_32,3e-05_32_64,0.791639,0.813445,0.738,0.84531,0.763297,0.8588,True,272165.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.791639,0.782127,0.738,0.84531,0.73058,0.836002,True,602833.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.791639,0.789948,0.738,0.84531,0.737082,0.838415,True,514992.0,0.245662,0.245662,False
7,0.0003_32_64,0.0003_8_16,0.741256,0.807499,0.687597,0.79191,0.751097,0.856627,True,42177.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.741256,0.796135,0.687597,0.79191,0.742873,0.851007,True,76245.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.741256,0.793712,0.687597,0.79191,0.740558,0.84052,True,75454.0,0.0,0.0,True


In [33]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2548,rest-16,e2e,context,0.0003,8,16,0.05,0,1000,7,e2e_rest-16_context_0.0003_8_16_0.05_1000,e2e_rest-16_context_0.0003_8_16_0.05_0_1000_7,0.7958,0.738,0.6608
2365,rest-16,e2e,context,3e-05,32,64,0.05,0,1000,4,e2e_rest-16_context_3e-05_32_64_0.05_1000,e2e_rest-16_context_3e-05_32_64_0.05_0_1000_4,0.7935,0.6309,0.6577
1063,rest-16,e2e,context,3e-05,8,16,0.05,0,1000,5,e2e_rest-16_context_3e-05_8_16_0.05_1000,e2e_rest-16_context_3e-05_8_16_0.05_0_1000_5,0.7885,0.6641,0.6508
3134,rest-16,e2e,context,3e-05,32,32,0.05,0,1000,9,e2e_rest-16_context_3e-05_32_32_0.05_1000,e2e_rest-16_context_3e-05_32_32_0.05_0_1000_9,0.7879,0.6729,0.65
1587,rest-16,e2e,context,0.0003,32,32,0.05,0,1000,9,e2e_rest-16_context_0.0003_32_32_0.05_1000,e2e_rest-16_context_0.0003_32_32_0.05_0_1000_9,0.7846,0.6785,0.6455
2844,rest-16,e2e,context,0.0003,8,8,0.05,0,1000,2,e2e_rest-16_context_0.0003_8_8_0.05_1000,e2e_rest-16_context_0.0003_8_8_0.05_0_1000_2,0.7813,0.6467,0.6411
3199,rest-16,e2e,context,3e-05,8,8,0.05,0,1000,7,e2e_rest-16_context_3e-05_8_8_0.05_1000,e2e_rest-16_context_3e-05_8_8_0.05_0_1000_7,0.7805,0.6686,0.64
4960,rest-16,e2e,context,0.0003,32,64,0.05,0,1000,7,e2e_rest-16_context_0.0003_32_64_0.05_1000,e2e_rest-16_context_0.0003_32_64_0.05_0_1000_7,0.7595,0.692,0.6122


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7846
0.7595
0.7958
0.7813
0.7879
0.7935
0.7885
0.7805
         Source  ddof1           H          p-unc
Kruskal  config      7  943.626843  1.816296e-199
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.783851,0.760252,0.73238,0.8347,0.7061,0.81612,True,733546.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.783851,0.796508,0.73238,0.8347,0.738895,0.848707,True,370807.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.783851,0.780626,0.73238,0.8347,0.728,0.8347,True,531877.5,0.013565,0.05426,False
3,0.0003_32_32,3e-05_32_32,0.783851,0.788127,0.73238,0.8347,0.737885,0.837225,True,453795.5,0.000346,0.002422,True
4,0.0003_32_32,3e-05_32_64,0.783851,0.792672,0.73238,0.8347,0.73999,0.840625,True,405623.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.783851,0.789012,0.73238,0.8347,0.735995,0.841008,True,446240.5,3.1e-05,0.000279,True
6,0.0003_32_32,3e-05_8_8,0.783851,0.781266,0.73238,0.8347,0.728498,0.832715,True,527342.0,0.034231,0.102693,False
7,0.0003_32_64,0.0003_8_16,0.760252,0.796508,0.7061,0.81612,0.738895,0.848707,True,181392.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.760252,0.780626,0.7061,0.81612,0.728,0.8347,True,300063.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.760252,0.788127,0.7061,0.81612,0.737885,0.837225,True,231008.5,0.0,0.0,True


In [34]:
args.lr_setting = 1000
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2841,rest-16,e2e,cot,0.0003,8,16,0.05,0,1000,9,e2e_rest-16_cot_0.0003_8_16_0.05_1000,e2e_rest-16_cot_0.0003_8_16_0.05_0_1000_9,0.7654,0.674,0.62
4738,rest-16,e2e,cot,0.0003,32,32,0.05,0,1000,5,e2e_rest-16_cot_0.0003_32_32_0.05_1000,e2e_rest-16_cot_0.0003_32_32_0.05_0_1000_5,0.7516,0.6301,0.6021
5039,rest-16,e2e,cot,3e-05,32,64,0.05,0,1000,7,e2e_rest-16_cot_3e-05_32_64_0.05_1000,e2e_rest-16_cot_3e-05_32_64_0.05_0_1000_7,0.7393,0.611,0.5864
4148,rest-16,e2e,cot,0.0003,8,8,0.05,0,1000,8,e2e_rest-16_cot_0.0003_8_8_0.05_1000,e2e_rest-16_cot_0.0003_8_8_0.05_0_1000_8,0.7356,0.6174,0.5818
6571,rest-16,e2e,cot,3e-05,32,32,0.05,0,1000,5,e2e_rest-16_cot_3e-05_32_32_0.05_1000,e2e_rest-16_cot_3e-05_32_32_0.05_0_1000_5,0.7202,0.5466,0.5627
4525,rest-16,e2e,cot,3e-05,8,16,0.05,0,1000,10,e2e_rest-16_cot_3e-05_8_16_0.05_1000,e2e_rest-16_cot_3e-05_8_16_0.05_0_1000_10,0.6957,0.5333,0.5333
2762,rest-16,e2e,cot,0.0003,32,64,0.05,0,1000,9,e2e_rest-16_cot_0.0003_32_64_0.05_1000,e2e_rest-16_cot_0.0003_32_64_0.05_0_1000_9,0.6898,0.5907,0.5265
1866,rest-16,e2e,cot,3e-05,8,8,0.05,0,1000,10,e2e_rest-16_cot_3e-05_8_8_0.05_1000,e2e_rest-16_cot_3e-05_8_8_0.05_0_1000_10,0.613,0.4749,0.442


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7516
0.6898
0.7654
0.7356
0.7202
0.7393
0.6957
0.613
         Source  ddof1            H  p-unc
Kruskal  config      7  5089.591489    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.750999,0.6899,0.6955,0.8,0.6342,0.744405,True,939625.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.750999,0.764591,0.6955,0.8,0.707562,0.817603,True,362914.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.750999,0.736355,0.6955,0.8,0.678895,0.789725,True,643946.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.750999,0.719221,0.6955,0.8,0.6667,0.77374,True,791620.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.750999,0.739431,0.6955,0.8,0.680795,0.79552,True,612968.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.750999,0.693177,0.6955,0.8,0.633795,0.7495,True,927895.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.750999,0.61152,0.6955,0.8,0.55209,0.67361,False,999784.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.6899,0.764591,0.6342,0.744405,0.707562,0.817603,True,31969.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.6899,0.736355,0.6342,0.744405,0.678895,0.789725,True,123784.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.6899,0.719221,0.6342,0.744405,0.6667,0.77374,True,232610.0,0.0,0.0,True


In [35]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
4282,rest-16,e2e,basic,0.0003,8,16,0.05,0,500,2,e2e_rest-16_basic_0.0003_8_16_0.05_500,e2e_rest-16_basic_0.0003_8_16_0.05_0_500_2,0.7611,0.6368,0.6144
1914,rest-16,e2e,basic,3e-05,32,64,0.05,0,500,6,e2e_rest-16_basic_3e-05_32_64_0.05_500,e2e_rest-16_basic_3e-05_32_64_0.05_0_500_6,0.7344,0.5748,0.5802
2352,rest-16,e2e,basic,0.0003,32,32,0.05,0,500,9,e2e_rest-16_basic_0.0003_32_32_0.05_500,e2e_rest-16_basic_0.0003_32_32_0.05_0_500_9,0.7315,0.6235,0.5767
839,rest-16,e2e,basic,0.0003,8,8,0.05,0,500,5,e2e_rest-16_basic_0.0003_8_8_0.05_500,e2e_rest-16_basic_0.0003_8_8_0.05_0_500_5,0.7132,0.5921,0.5542
4798,rest-16,e2e,basic,0.0003,32,64,0.05,0,500,5,e2e_rest-16_basic_0.0003_32_64_0.05_500,e2e_rest-16_basic_0.0003_32_64_0.05_0_500_5,0.7109,0.6024,0.5515
3954,rest-16,e2e,basic,3e-05,8,16,0.05,0,500,10,e2e_rest-16_basic_3e-05_8_16_0.05_500,e2e_rest-16_basic_3e-05_8_16_0.05_0_500_10,0.6992,0.5505,0.5375
4845,rest-16,e2e,basic,3e-05,32,32,0.05,0,500,5,e2e_rest-16_basic_3e-05_32_32_0.05_500,e2e_rest-16_basic_3e-05_32_32_0.05_0_500_5,0.6988,0.6034,0.537
59,rest-16,e2e,basic,3e-05,8,8,0.05,0,500,10,e2e_rest-16_basic_3e-05_8_8_0.05_500,e2e_rest-16_basic_3e-05_8_8_0.05_0_500_10,0.6988,0.5417,0.537


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7315
0.7109
0.7611
0.7132
0.6988
0.7344
0.6992
0.6988
         Source  ddof1            H  p-unc
Kruskal  config      7  1542.824443    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.731282,0.712534,0.661295,0.798537,0.631088,0.788407,True,633485.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.731282,0.762614,0.661295,0.798537,0.6864,0.839505,True,284250.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.731282,0.71392,0.661295,0.798537,0.63079,0.78931,True,621514.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.731282,0.700984,0.661295,0.798537,0.61774,0.780527,True,703123.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.731282,0.732533,0.661295,0.798537,0.65669,0.806522,True,492880.5,0.581428,1.0,False
5,0.0003_32_32,3e-05_8_16,0.731282,0.699323,0.661295,0.798537,0.617185,0.780505,True,716245.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.731282,0.700252,0.661295,0.798537,0.608648,0.7833,True,705914.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.712534,0.762614,0.631088,0.788407,0.6864,0.839505,True,190540.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.712534,0.71392,0.631088,0.788407,0.63079,0.78931,True,486792.5,0.306422,1.0,False
9,0.0003_32_64,3e-05_32_32,0.712534,0.700984,0.631088,0.788407,0.61774,0.780527,True,577510.0,0.0,0.0,True


In [36]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5402,rest-16,e2e,context,0.0003,8,16,0.05,0,500,2,e2e_rest-16_context_0.0003_8_16_0.05_500,e2e_rest-16_context_0.0003_8_16_0.05_0_500_2,0.739,0.6458,0.586
3090,rest-16,e2e,context,3e-05,32,32,0.05,0,500,5,e2e_rest-16_context_3e-05_32_32_0.05_500,e2e_rest-16_context_3e-05_32_32_0.05_0_500_5,0.739,0.6232,0.586
963,rest-16,e2e,context,0.0003,8,8,0.05,0,500,6,e2e_rest-16_context_0.0003_8_8_0.05_500,e2e_rest-16_context_0.0003_8_8_0.05_0_500_6,0.7373,0.6565,0.5839
40,rest-16,e2e,context,3e-05,8,16,0.05,0,500,7,e2e_rest-16_context_3e-05_8_16_0.05_500,e2e_rest-16_context_3e-05_8_16_0.05_0_500_7,0.7333,0.6326,0.5789
2206,rest-16,e2e,context,3e-05,32,64,0.05,0,500,8,e2e_rest-16_context_3e-05_32_64_0.05_500,e2e_rest-16_context_3e-05_32_64_0.05_0_500_8,0.7244,0.5956,0.5679
6094,rest-16,e2e,context,0.0003,32,32,0.05,0,500,10,e2e_rest-16_context_0.0003_32_32_0.05_500,e2e_rest-16_context_0.0003_32_32_0.05_0_500_10,0.7015,0.6012,0.5402
1878,rest-16,e2e,context,3e-05,8,8,0.05,0,500,8,e2e_rest-16_context_3e-05_8_8_0.05_500,e2e_rest-16_context_3e-05_8_8_0.05_0_500_8,0.6992,0.6223,0.5375
3503,rest-16,e2e,context,0.0003,32,64,0.05,0,500,4,e2e_rest-16_context_0.0003_32_64_0.05_500,e2e_rest-16_context_0.0003_32_64_0.05_0_500_4,0.6971,0.5754,0.535


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7015
0.6971
0.739
0.7373
0.739
0.7244
0.7333
0.6992
         Source  ddof1            H          p-unc
Kruskal  config      7  1187.669657  3.275125e-252
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.701565,0.699176,0.617585,0.7798,0.6135,0.785407,True,518645.5,0.148771,0.892626,False
1,0.0003_32_32,0.0003_8_16,0.701565,0.739968,0.617585,0.7798,0.649273,0.818607,True,263827.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.701565,0.736326,0.617585,0.7798,0.658592,0.807812,True,270109.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.701565,0.737978,0.617585,0.7798,0.6534,0.815107,True,268439.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.701565,0.724207,0.617585,0.7798,0.65038,0.793903,True,345331.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.701565,0.734444,0.617585,0.7798,0.652447,0.8148,True,289797.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.701565,0.699271,0.617585,0.7798,0.612473,0.777835,True,517028.0,0.187293,0.936465,False
7,0.0003_32_64,0.0003_8_16,0.699176,0.739968,0.6135,0.785407,0.649273,0.818607,True,252881.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.699176,0.736326,0.6135,0.785407,0.658592,0.807812,True,258437.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.699176,0.737978,0.6135,0.785407,0.6534,0.815107,True,257625.0,0.0,0.0,True


In [37]:
args.lr_setting = 500
args.task = 'e2e'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
4510,rest-16,e2e,cot,0.0003,32,64,0.05,0,500,10,e2e_rest-16_cot_0.0003_32_64_0.05_500,e2e_rest-16_cot_0.0003_32_64_0.05_0_500_10,0.7273,0.594,0.5714
3622,rest-16,e2e,cot,0.0003,32,32,0.05,0,500,10,e2e_rest-16_cot_0.0003_32_32_0.05_500,e2e_rest-16_cot_0.0003_32_32_0.05_0_500_10,0.7078,0.6206,0.5478
5720,rest-16,e2e,cot,0.0003,8,8,0.05,0,500,9,e2e_rest-16_cot_0.0003_8_8_0.05_500,e2e_rest-16_cot_0.0003_8_8_0.05_0_500_9,0.6992,0.5737,0.5375
3070,rest-16,e2e,cot,0.0003,8,16,0.05,0,500,8,e2e_rest-16_cot_0.0003_8_16_0.05_500,e2e_rest-16_cot_0.0003_8_16_0.05_0_500_8,0.6792,0.5595,0.5143
5809,rest-16,e2e,cot,3e-05,32,32,0.05,0,500,9,e2e_rest-16_cot_3e-05_32_32_0.05_500,e2e_rest-16_cot_3e-05_32_32_0.05_0_500_9,0.6667,0.5264,0.5
2224,rest-16,e2e,cot,3e-05,32,64,0.05,0,500,9,e2e_rest-16_cot_3e-05_32_64_0.05_500,e2e_rest-16_cot_3e-05_32_64_0.05_0_500_9,0.6429,0.4196,0.4737
2009,rest-16,e2e,cot,3e-05,8,16,0.05,0,500,7,e2e_rest-16_cot_3e-05_8_16_0.05_500,e2e_rest-16_cot_3e-05_8_16_0.05_0_500_7,0.6371,0.5119,0.4675
3897,rest-16,e2e,cot,3e-05,8,8,0.05,0,500,9,e2e_rest-16_cot_3e-05_8_8_0.05_500,e2e_rest-16_cot_3e-05_8_8_0.05_0_500_9,0.616,0.4979,0.4451


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7078
0.7273
0.6792
0.6992
0.6667
0.6429
0.6371
0.616
         Source  ddof1            H  p-unc
Kruskal  config      7  3468.480371    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.709437,0.726525,0.620345,0.8017,0.640892,0.804802,True,384329.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.709437,0.677855,0.620345,0.8017,0.59789,0.757103,True,695936.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.709437,0.699454,0.620345,0.8017,0.620385,0.7778,True,563219.0,1e-06,2e-06,True
3,0.0003_32_32,3e-05_32_32,0.709437,0.667592,0.620345,0.8017,0.5804,0.747905,True,748927.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.709437,0.641084,0.620345,0.8017,0.5603,0.724472,True,865123.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.709437,0.637659,0.620345,0.8017,0.556965,0.717108,True,882684.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.709437,0.616641,0.620345,0.8017,0.532773,0.698305,True,931980.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.726525,0.677855,0.640892,0.804802,0.59789,0.757103,True,800515.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.726525,0.699454,0.640892,0.804802,0.620385,0.7778,True,683080.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.726525,0.667592,0.640892,0.804802,0.5804,0.747905,True,843163.0,0.0,0.0,True


# TASD

In [38]:
# Means
results = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'tasd', 
                                                     results_all['split'] == str(0)])]

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    
idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

for comb, group in results_per_epoch.groupby(['lr_setting', 'prompt']):
    print(comb)
    print(f"{np.mean(group['f1-micro'])*100:.2f}, {np.std(group['f1-micro'])*100:.2f}")


('1000', 'basic')
73.46, 3.64
('1000', 'context')
71.38, 3.06
('1000', 'cot')
67.28, 2.70
('500', 'basic')
67.18, 3.15
('500', 'context')
68.01, 3.18
('500', 'cot')
61.62, 3.10
('full', 'basic')
76.80, 2.96
('full', 'context')
76.75, 1.87
('full', 'cot')
71.05, 2.44
('orig', 'basic')
72.48, 0.00
('orig', 'context')
76.72, 0.00
('orig', 'cot')
70.28, 0.00


### Full Dataset
### Short Prompt

In [39]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5194,rest-16,tasd,basic,0.0003,8,8,0.05,0,full,9,tasd_rest-16_basic_0.0003_8_8_0.05_full,tasd_rest-16_basic_0.0003_8_8_0.05_0_full_9,0.812,0.7678,0.6835
6585,rest-16,tasd,basic,3e-05,32,64,0.05,0,full,5,tasd_rest-16_basic_3e-05_32_64_0.05_full,tasd_rest-16_basic_3e-05_32_64_0.05_0_full_5,0.7824,0.7264,0.6426
6948,rest-16,tasd,basic,3e-05,32,32,0.05,0,full,9,tasd_rest-16_basic_3e-05_32_32_0.05_full,tasd_rest-16_basic_3e-05_32_32_0.05_0_full_9,0.7791,0.7244,0.6381
5915,rest-16,tasd,basic,3e-05,8,16,0.05,0,full,6,tasd_rest-16_basic_3e-05_8_16_0.05_full,tasd_rest-16_basic_3e-05_8_16_0.05_0_full_6,0.7743,0.6762,0.6318
6562,rest-16,tasd,basic,0.0003,8,16,0.05,0,full,10,tasd_rest-16_basic_0.0003_8_16_0.05_full,tasd_rest-16_basic_0.0003_8_16_0.05_0_full_10,0.7716,0.6681,0.6282
2502,rest-16,tasd,basic,3e-05,8,8,0.05,0,full,6,tasd_rest-16_basic_3e-05_8_8_0.05_full,tasd_rest-16_basic_3e-05_8_8_0.05_0_full_6,0.7665,0.6816,0.6214
2007,rest-16,tasd,basic,0.0003,32,32,0.05,0,full,6,tasd_rest-16_basic_0.0003_32_32_0.05_full,tasd_rest-16_basic_0.0003_32_32_0.05_0_full_6,0.7575,0.6651,0.6097
6950,rest-16,tasd,basic,0.0003,32,64,0.05,0,full,5,tasd_rest-16_basic_0.0003_32_64_0.05_full,tasd_rest-16_basic_0.0003_32_64_0.05_0_full_5,0.7002,0.5607,0.5388


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.7575
0.7003
0.7716
0.812
0.7791
0.7824
0.7743
0.7665
         Source  ddof1            H  p-unc
Kruskal  config      7  4358.048554    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.757223,0.700092,0.715097,0.799013,0.658288,0.741815,True,970170.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.757223,0.772443,0.715097,0.799013,0.728785,0.813705,True,303376.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.757223,0.812388,0.715097,0.799013,0.773788,0.849907,True,29184.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.757223,0.77856,0.715097,0.799013,0.737287,0.8184,True,237037.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.757223,0.78282,0.715097,0.799013,0.7432,0.8207,True,189549.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.757223,0.774266,0.715097,0.799013,0.732888,0.814305,True,282900.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.757223,0.767953,0.715097,0.799013,0.725598,0.80801,True,357918.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.700092,0.772443,0.658288,0.741815,0.728785,0.813705,True,8293.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.700092,0.812388,0.658288,0.741815,0.773788,0.849907,True,46.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.700092,0.77856,0.658288,0.741815,0.737287,0.8184,True,4332.5,0.0,0.0,True


### Full Dataset
### Long Prompt

In [3]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
5912,rest-16,tasd,context,0.0003,8,8,0.05,0,full,7,tasd_rest-16_context_0.0003_8_8_0.05_full,tasd_rest-16_context_0.0003_8_8_0.05_0_full_7,0.7928,0.7251,0.6567
4471,rest-16,tasd,context,0.0003,32,32,0.05,0,full,9,tasd_rest-16_context_0.0003_32_32_0.05_full,tasd_rest-16_context_0.0003_32_32_0.05_0_full_9,0.7813,0.6506,0.641
310,rest-16,tasd,context,3e-05,8,16,0.05,0,full,6,tasd_rest-16_context_3e-05_8_16_0.05_full,tasd_rest-16_context_3e-05_8_16_0.05_0_full_6,0.7749,0.707,0.6325
3582,rest-16,tasd,context,3e-05,32,64,0.05,0,full,9,tasd_rest-16_context_3e-05_32_64_0.05_full,tasd_rest-16_context_3e-05_32_64_0.05_0_full_9,0.7728,0.6924,0.6298
6289,rest-16,tasd,context,0.0003,8,16,0.05,0,full,9,tasd_rest-16_context_0.0003_8_16_0.05_full,tasd_rest-16_context_0.0003_8_16_0.05_0_full_9,0.7708,0.601,0.627
6708,rest-16,tasd,context,3e-05,32,32,0.05,0,full,6,tasd_rest-16_context_3e-05_32_32_0.05_full,tasd_rest-16_context_3e-05_32_32_0.05_0_full_6,0.7665,0.712,0.6214
5272,rest-16,tasd,context,3e-05,8,8,0.05,0,full,6,tasd_rest-16_context_3e-05_8_8_0.05_full,tasd_rest-16_context_3e-05_8_8_0.05_0_full_6,0.7545,0.6629,0.6058
6049,rest-16,tasd,context,0.0003,32,64,0.05,0,full,7,tasd_rest-16_context_0.0003_32_64_0.05_full,tasd_rest-16_context_0.0003_32_64_0.05_0_full_7,0.7262,0.6117,0.5701


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.7812
0.7262
0.7707
0.7928
0.7665
0.7728
0.7749
0.7545
         Source  ddof1            H  p-unc
Kruskal  config      7  3158.856393    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.781824,0.725739,0.741088,0.821315,0.6806,0.76672,True,970047.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.781824,0.770729,0.741088,0.821315,0.728885,0.810522,True,649296.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.781824,0.793728,0.741088,0.821315,0.7503,0.834102,True,344757.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.781824,0.766792,0.741088,0.821315,0.721795,0.807312,True,689384.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.781824,0.771302,0.741088,0.821315,0.725988,0.814605,True,636747.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.781824,0.774757,0.741088,0.821315,0.734797,0.813303,True,596816.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.781824,0.755162,0.741088,0.821315,0.712377,0.796602,True,817119.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.725739,0.770729,0.6806,0.76672,0.728885,0.810522,True,67616.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.725739,0.793728,0.6806,0.76672,0.7503,0.834102,True,13415.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.725739,0.766792,0.6806,0.76672,0.721795,0.807312,True,93867.5,0.0,0.0,True


### Full Dataset
### CoT Prompt

In [28]:
args.lr_setting = 0
args.task = 'tasd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
930,en,rest-16,acsd,cot,0.0003,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,7,en_rest-16__cot_acsd_0.0003_8_8_0.05_4_full_me...,en_rest-16__cot_acsd_0.0003_8_8_0.05_4_0_full_...,0.7448,0.645,0.5933
1489,en,rest-16,acsd,cot,0.0003,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_0.0003_32_32_0.05_4_full_...,en_rest-16__cot_acsd_0.0003_32_32_0.05_4_0_ful...,0.7393,0.5818,0.5864
791,en,rest-16,acsd,cot,0.0003,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,7,en_rest-16__cot_acsd_0.0003_8_16_0.05_4_full_m...,en_rest-16__cot_acsd_0.0003_8_16_0.05_4_0_full...,0.7135,0.6337,0.5547
74,en,rest-16,acsd,cot,3e-05,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_3e-05_32_64_0.05_4_full_m...,en_rest-16__cot_acsd_3e-05_32_64_0.05_4_0_full...,0.7134,0.6494,0.5545
920,en,rest-16,acsd,cot,3e-05,8,16,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_3e-05_8_16_0.05_4_full_me...,en_rest-16__cot_acsd_3e-05_8_16_0.05_4_0_full_...,0.7105,0.5996,0.5509
978,en,rest-16,acsd,cot,3e-05,32,32,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_3e-05_32_32_0.05_4_full_m...,en_rest-16__cot_acsd_3e-05_32_32_0.05_4_0_full...,0.7026,0.5548,0.5415
1520,en,rest-16,acsd,cot,3e-05,8,8,0.05,0,full,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_3e-05_8_8_0.05_4_full_met...,en_rest-16__cot_acsd_3e-05_8_8_0.05_4_0_full_m...,0.7003,0.5955,0.5387
245,en,rest-16,acsd,cot,0.0003,32,64,0.05,0,full,meta-llama-Meta-Llama-3-8B,8,en_rest-16__cot_acsd_0.0003_32_64_0.05_4_full_...,en_rest-16__cot_acsd_0.0003_32_64_0.05_4_0_ful...,0.6598,0.5218,0.4923


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  0
Train Length:  1423
Eval Length:  285
Split: 0
0.7393
0.6598
0.7135
0.7448
0.7026
0.7134
0.7105
0.7003
         Source  ddof1            H  p-unc
Kruskal  config      7  4000.851299    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.738844,0.660309,0.692885,0.782015,0.61639,0.70461,True,991563.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.738844,0.712103,0.692885,0.782015,0.664883,0.755812,True,797031.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.738844,0.745302,0.692885,0.782015,0.7012,0.786407,True,420058.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.738844,0.701698,0.692885,0.782015,0.655898,0.746003,True,871896.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.738844,0.713716,0.692885,0.782015,0.6659,0.758905,True,777238.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.738844,0.70909,0.692885,0.782015,0.6643,0.752502,True,819666.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.738844,0.700462,0.692885,0.782015,0.656392,0.743805,True,883364.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.660309,0.712103,0.61639,0.70461,0.664883,0.755812,True,57505.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.660309,0.745302,0.61639,0.70461,0.7012,0.786407,True,3278.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.660309,0.701698,0.61639,0.70461,0.655898,0.746003,True,107871.5,0.0,0.0,True


### 1000
### Short Prompt

In [5]:
args.lr_setting = 1000
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
918,rest-16,tasd,basic,0.0003,8,16,0.05,0,1000,8,tasd_rest-16_basic_0.0003_8_16_0.05_1000,tasd_rest-16_basic_0.0003_8_16_0.05_0_1000_8,0.8016,0.7341,0.6689
3794,rest-16,tasd,basic,0.0003,8,8,0.05,0,1000,8,tasd_rest-16_basic_0.0003_8_8_0.05_1000,tasd_rest-16_basic_0.0003_8_8_0.05_0_1000_8,0.7673,0.7021,0.6225
5348,rest-16,tasd,basic,3e-05,8,8,0.05,0,1000,9,tasd_rest-16_basic_3e-05_8_8_0.05_1000,tasd_rest-16_basic_3e-05_8_8_0.05_0_1000_9,0.7475,0.669,0.5968
1016,rest-16,tasd,basic,3e-05,8,16,0.05,0,1000,9,tasd_rest-16_basic_3e-05_8_16_0.05_1000,tasd_rest-16_basic_3e-05_8_16_0.05_0_1000_9,0.7328,0.6014,0.5783
4640,rest-16,tasd,basic,3e-05,32,64,0.05,0,1000,4,tasd_rest-16_basic_3e-05_32_64_0.05_1000,tasd_rest-16_basic_3e-05_32_64_0.05_0_1000_4,0.7222,0.6275,0.5652
5144,rest-16,tasd,basic,0.0003,32,32,0.05,0,1000,7,tasd_rest-16_basic_0.0003_32_32_0.05_1000,tasd_rest-16_basic_0.0003_32_32_0.05_0_1000_7,0.7217,0.6347,0.5645
6377,rest-16,tasd,basic,3e-05,32,32,0.05,0,1000,6,tasd_rest-16_basic_3e-05_32_32_0.05_1000,tasd_rest-16_basic_3e-05_32_32_0.05_0_1000_6,0.7129,0.6004,0.5538
1450,rest-16,tasd,basic,0.0003,32,64,0.05,0,1000,6,tasd_rest-16_basic_0.0003_32_64_0.05_1000,tasd_rest-16_basic_0.0003_32_64_0.05_0_1000_6,0.6709,0.5642,0.5048


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7216
0.6709
0.8016
0.7673
0.7129
0.7222
0.7328
0.7475
         Source  ddof1           H  p-unc
Kruskal  config      7  4699.24763    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.72116,0.672082,0.655998,0.785608,0.606967,0.735912,True,856242.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.72116,0.802281,0.655998,0.785608,0.752365,0.850307,True,23298.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.72116,0.767244,0.655998,0.785608,0.712492,0.819118,True,134914.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.72116,0.711572,0.655998,0.785608,0.653095,0.771412,True,587488.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.72116,0.721235,0.655998,0.785608,0.663968,0.776,True,497569.0,0.850705,0.850705,False
5,0.0003_32_32,3e-05_8_16,0.72116,0.733214,0.655998,0.785608,0.673198,0.788935,True,388473.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.72116,0.748207,0.655998,0.785608,0.690492,0.8024,True,257169.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.672082,0.802281,0.606967,0.735912,0.752365,0.850307,True,1513.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.672082,0.767244,0.606967,0.735912,0.712492,0.819118,True,15516.0,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.672082,0.711572,0.606967,0.735912,0.653095,0.771412,True,191905.0,0.0,0.0,True


### 1000
### Long Prompt

In [4]:
args.lr_setting = 1000
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
1481,en,rest-16,acsd,long,0.0003,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,6,en_rest-16__long_acsd_0.0003_8_8_0.05_4_1000_m...,en_rest-16__long_acsd_0.0003_8_8_0.05_4_0_1000...,0.7465,0.6553,0.5955
607,en,rest-16,acsd,long,3e-05,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,9,en_rest-16__long_acsd_3e-05_8_16_0.05_4_1000_m...,en_rest-16__long_acsd_3e-05_8_16_0.05_4_0_1000...,0.7364,0.6424,0.5828
401,en,rest-16,acsd,long,3e-05,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,9,en_rest-16__long_acsd_3e-05_32_64_0.05_4_1000_...,en_rest-16__long_acsd_3e-05_32_64_0.05_4_0_100...,0.7336,0.6307,0.5793
1462,en,rest-16,acsd,long,0.0003,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,8,en_rest-16__long_acsd_0.0003_8_16_0.05_4_1000_...,en_rest-16__long_acsd_0.0003_8_16_0.05_4_0_100...,0.7291,0.5438,0.5737
1367,en,rest-16,acsd,long,3e-05,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,5,en_rest-16__long_acsd_3e-05_32_32_0.05_4_1000_...,en_rest-16__long_acsd_3e-05_32_32_0.05_4_0_100...,0.7198,0.6196,0.5623
57,en,rest-16,acsd,long,0.0003,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,3,en_rest-16__long_acsd_0.0003_32_32_0.05_4_1000...,en_rest-16__long_acsd_0.0003_32_32_0.05_4_0_10...,0.7155,0.6,0.557
606,en,rest-16,acsd,long,3e-05,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,3,en_rest-16__long_acsd_3e-05_8_8_0.05_4_1000_me...,en_rest-16__long_acsd_3e-05_8_8_0.05_4_0_1000_...,0.68,0.5865,0.5152
1080,en,rest-16,acsd,long,0.0003,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,8,en_rest-16__long_acsd_0.0003_32_64_0.05_4_1000...,en_rest-16__long_acsd_0.0003_32_64_0.05_4_0_10...,0.6497,0.5588,0.4812


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.7155
0.6497
0.7291
0.7465
0.7198
0.7336
0.7364
0.68
         Source  ddof1           H  p-unc
Kruskal  config      7  3451.81315    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.715562,0.652045,0.6552,0.772247,0.571483,0.7265,True,894632.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.715562,0.727532,0.6552,0.772247,0.6679,0.786407,True,390830.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.715562,0.746802,0.6552,0.772247,0.691558,0.806022,True,233988.0,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.715562,0.720798,0.6552,0.772247,0.666665,0.77682,True,453652.5,0.000332,0.000664,True
4,0.0003_32_32,3e-05_32_64,0.715562,0.732992,0.6552,0.772247,0.668098,0.79361,True,345197.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.715562,0.736064,0.6552,0.772247,0.678582,0.79124,True,319727.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.715562,0.68025,0.6552,0.772247,0.618367,0.74343,True,792122.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.652045,0.727532,0.571483,0.7265,0.6679,0.786407,True,63563.0,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.652045,0.746802,0.571483,0.7265,0.691558,0.806022,True,25126.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.652045,0.720798,0.571483,0.7265,0.666665,0.77682,True,77476.0,0.0,0.0,True


### 1000
### CoT Prompt

In [5]:
args.lr_setting = 1000
args.task = 'acsd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,lang,dataset,task,prompt,lr,lora_r,lora_alpha,lora_dropout,split,lr_setting,model_name,epoch,model_config,path,f1-micro,f1-macro,accuracy
802,en,rest-16,acsd,cot,0.0003,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,8,en_rest-16__cot_acsd_0.0003_8_16_0.05_4_1000_m...,en_rest-16__cot_acsd_0.0003_8_16_0.05_4_0_1000...,0.7078,0.5193,0.5478
372,en,rest-16,acsd,cot,0.0003,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_0.0003_8_8_0.05_4_1000_me...,en_rest-16__cot_acsd_0.0003_8_8_0.05_4_0_1000_...,0.6985,0.5978,0.5367
1360,en,rest-16,acsd,cot,0.0003,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_rest-16__cot_acsd_0.0003_32_32_0.05_4_1000_...,en_rest-16__cot_acsd_0.0003_32_32_0.05_4_0_100...,0.68,0.5216,0.5152
450,en,rest-16,acsd,cot,3e-05,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,9,en_rest-16__cot_acsd_3e-05_32_64_0.05_4_1000_m...,en_rest-16__cot_acsd_3e-05_32_64_0.05_4_0_1000...,0.679,0.6346,0.5139
105,en,rest-16,acsd,cot,0.0003,32,64,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_rest-16__cot_acsd_0.0003_32_64_0.05_4_1000_...,en_rest-16__cot_acsd_0.0003_32_64_0.05_4_0_100...,0.6749,0.5373,0.5093
843,en,rest-16,acsd,cot,3e-05,8,16,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_rest-16__cot_acsd_3e-05_8_16_0.05_4_1000_me...,en_rest-16__cot_acsd_3e-05_8_16_0.05_4_0_1000_...,0.6708,0.5965,0.5046
653,en,rest-16,acsd,cot,3e-05,32,32,0.05,0,1000,meta-llama-Meta-Llama-3-8B,6,en_rest-16__cot_acsd_3e-05_32_32_0.05_4_1000_m...,en_rest-16__cot_acsd_3e-05_32_32_0.05_4_0_1000...,0.6593,0.559,0.4917
1055,en,rest-16,acsd,cot,3e-05,8,8,0.05,0,1000,meta-llama-Meta-Llama-3-8B,10,en_rest-16__cot_acsd_3e-05_8_8_0.05_4_1000_met...,en_rest-16__cot_acsd_3e-05_8_8_0.05_4_0_1000_m...,0.6123,0.5194,0.4413


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  1000
Train Length:  833
Eval Length:  167
Split: 0
0.68
0.6749
0.7078
0.6985
0.6593
0.679
0.6708
0.6123
         Source  ddof1           H  p-unc
Kruskal  config      7  3029.42933    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.678985,0.675154,0.614088,0.743803,0.609297,0.7316,True,534071.0,0.008329,0.024987,True
1,0.0003_32_32,0.0003_8_16,0.678985,0.705857,0.614088,0.743803,0.643292,0.76571,True,276331.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.678985,0.69853,0.614088,0.743803,0.631395,0.76151,True,335170.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.678985,0.658163,0.614088,0.743803,0.593243,0.721852,True,676020.5,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.678985,0.678791,0.614088,0.743803,0.623397,0.73521,True,503536.0,0.784245,0.784245,False
5,0.0003_32_32,3e-05_8_16,0.678985,0.670952,0.614088,0.743803,0.616292,0.7296,True,575587.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.678985,0.611139,0.614088,0.743803,0.551297,0.67211,True,931061.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.675154,0.705857,0.609297,0.7316,0.643292,0.76571,True,242203.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.675154,0.69853,0.609297,0.7316,0.631395,0.76151,True,300292.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.675154,0.658163,0.609297,0.7316,0.593243,0.721852,True,648848.5,0.0,0.0,True


### 500
### Short Prompt

In [9]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'basic'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
6700,rest-16,tasd,basic,0.0003,8,8,0.05,0,500,7,tasd_rest-16_basic_0.0003_8_8_0.05_500,tasd_rest-16_basic_0.0003_8_8_0.05_0_500_7,0.7137,0.6283,0.5549
594,rest-16,tasd,basic,3e-05,32,64,0.05,0,500,5,tasd_rest-16_basic_3e-05_32_64_0.05_500,tasd_rest-16_basic_3e-05_32_64_0.05_0_500_5,0.7027,0.5853,0.5417
6718,rest-16,tasd,basic,0.0003,32,32,0.05,0,500,10,tasd_rest-16_basic_0.0003_32_32_0.05_500,tasd_rest-16_basic_0.0003_32_32_0.05_0_500_10,0.6935,0.4994,0.5309
4785,rest-16,tasd,basic,0.0003,8,16,0.05,0,500,4,tasd_rest-16_basic_0.0003_8_16_0.05_500,tasd_rest-16_basic_0.0003_8_16_0.05_0_500_4,0.6905,0.5009,0.5273
4714,rest-16,tasd,basic,3e-05,32,32,0.05,0,500,10,tasd_rest-16_basic_3e-05_32_32_0.05_500,tasd_rest-16_basic_3e-05_32_32_0.05_0_500_10,0.6667,0.5592,0.5
2056,rest-16,tasd,basic,0.0003,32,64,0.05,0,500,5,tasd_rest-16_basic_0.0003_32_64_0.05_500,tasd_rest-16_basic_0.0003_32_64_0.05_0_500_5,0.648,0.5748,0.4793
76,rest-16,tasd,basic,3e-05,8,16,0.05,0,500,7,tasd_rest-16_basic_3e-05_8_16_0.05_500,tasd_rest-16_basic_3e-05_8_16_0.05_0_500_7,0.642,0.5555,0.4728
5592,rest-16,tasd,basic,3e-05,8,8,0.05,0,500,6,tasd_rest-16_basic_3e-05_8_8_0.05_500,tasd_rest-16_basic_3e-05_8_8_0.05_0_500_6,0.6176,0.5094,0.4468


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.6935
0.648
0.6905
0.7137
0.6667
0.7027
0.6421
0.6176
         Source  ddof1           H  p-unc
Kruskal  config      7  2564.82116    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.693523,0.648947,0.6129,0.772522,0.549375,0.741777,True,761903.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.693523,0.687313,0.6129,0.772522,0.606287,0.7679,True,544825.5,0.000518,0.001036,True
2,0.0003_32_32,0.0003_8_8,0.693523,0.714669,0.6129,0.772522,0.625487,0.795203,True,362571.5,0.0,0.0,True
3,0.0003_32_32,3e-05_32_32,0.693523,0.664207,0.6129,0.772522,0.561975,0.7608,True,675619.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.693523,0.702361,0.6129,0.772522,0.61537,0.791497,True,446205.5,3.1e-05,9.3e-05,True
5,0.0003_32_32,3e-05_8_16,0.693523,0.643497,0.6129,0.772522,0.557697,0.733905,True,791202.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.693523,0.619886,0.6129,0.772522,0.530273,0.70991,True,877317.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.648947,0.687313,0.549375,0.741777,0.606287,0.7679,True,269313.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.648947,0.714669,0.549375,0.741777,0.625487,0.795203,True,150856.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.648947,0.664207,0.549375,0.741777,0.561975,0.7608,True,406383.0,0.0,0.0,True


### 500
### Long Prompt

In [10]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'context'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
2193,rest-16,tasd,context,0.0003,32,32,0.05,0,500,4,tasd_rest-16_context_0.0003_32_32_0.05_500,tasd_rest-16_context_0.0003_32_32_0.05_0_500_4,0.7183,0.5435,0.5605
1756,rest-16,tasd,context,0.0003,8,8,0.05,0,500,6,tasd_rest-16_context_0.0003_8_8_0.05_500,tasd_rest-16_context_0.0003_8_8_0.05_0_500_6,0.7097,0.5746,0.55
2594,rest-16,tasd,context,0.0003,8,16,0.05,0,500,5,tasd_rest-16_context_0.0003_8_16_0.05_500,tasd_rest-16_context_0.0003_8_16_0.05_0_500_5,0.6973,0.6246,0.5353
516,rest-16,tasd,context,0.0003,32,64,0.05,0,500,3,tasd_rest-16_context_0.0003_32_64_0.05_500,tasd_rest-16_context_0.0003_32_64_0.05_0_500_3,0.6942,0.5791,0.5316
2973,rest-16,tasd,context,3e-05,32,64,0.05,0,500,6,tasd_rest-16_context_3e-05_32_64_0.05_500,tasd_rest-16_context_3e-05_32_64_0.05_0_500_6,0.6899,0.584,0.5266
6095,rest-16,tasd,context,3e-05,8,16,0.05,0,500,8,tasd_rest-16_context_3e-05_8_16_0.05_500,tasd_rest-16_context_3e-05_8_16_0.05_0_500_8,0.6667,0.5047,0.5
2207,rest-16,tasd,context,3e-05,32,32,0.05,0,500,6,tasd_rest-16_context_3e-05_32_32_0.05_500,tasd_rest-16_context_3e-05_32_32_0.05_0_500_6,0.6464,0.5199,0.4775
1634,rest-16,tasd,context,3e-05,8,8,0.05,0,500,6,tasd_rest-16_context_3e-05_8_8_0.05_500,tasd_rest-16_context_3e-05_8_8_0.05_0_500_6,0.6182,0.4809,0.4474


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.7184
0.6942
0.6973
0.7097
0.6464
0.6899
0.6667
0.6182
         Source  ddof1            H  p-unc
Kruskal  config      7  2562.540062    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.717216,0.694392,0.6333,0.800037,0.608285,0.778225,True,640909.5,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.717216,0.697394,0.6333,0.800037,0.609595,0.779735,True,625252.0,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.717216,0.709934,0.6333,0.800037,0.6204,0.792318,True,543981.0,0.00066,0.00198,True
3,0.0003_32_32,3e-05_32_32,0.717216,0.646937,0.6333,0.800037,0.560948,0.739803,True,866821.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.717216,0.690385,0.6333,0.800037,0.6,0.775803,True,666759.0,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.717216,0.665823,0.6333,0.800037,0.57679,0.752942,True,797183.5,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.717216,0.620814,0.6333,0.800037,0.538192,0.703307,True,944717.5,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.694392,0.697394,0.608285,0.778225,0.609595,0.779735,True,479199.0,0.107223,0.107223,False
8,0.0003_32_64,0.0003_8_8,0.694392,0.709934,0.608285,0.778225,0.6204,0.792318,True,402486.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.694392,0.646937,0.608285,0.778225,0.560948,0.739803,True,765086.0,0.0,0.0,True


### 500
### CoT Prompt

In [11]:
args.lr_setting = 500
args.task = 'tasd'
args.prompt_style = 'cot'

computeStatistics(args)

8


Unnamed: 0,dataset,task,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
835,rest-16,tasd,cot,0.0003,8,16,0.05,0,500,6,tasd_rest-16_cot_0.0003_8_16_0.05_500,tasd_rest-16_cot_0.0003_8_16_0.05_0_500_6,0.661,0.5427,0.4937
4608,rest-16,tasd,cot,0.0003,8,8,0.05,0,500,10,tasd_rest-16_cot_0.0003_8_8_0.05_500,tasd_rest-16_cot_0.0003_8_8_0.05_0_500_10,0.6502,0.5592,0.4817
3047,rest-16,tasd,cot,0.0003,32,32,0.05,0,500,10,tasd_rest-16_cot_0.0003_32_32_0.05_500,tasd_rest-16_cot_0.0003_32_32_0.05_0_500_10,0.6476,0.5174,0.4788
4042,rest-16,tasd,cot,3e-05,32,32,0.05,0,500,8,tasd_rest-16_cot_3e-05_32_32_0.05_500,tasd_rest-16_cot_3e-05_32_32_0.05_0_500_8,0.6122,0.5296,0.4412
3950,rest-16,tasd,cot,0.0003,32,64,0.05,0,500,8,tasd_rest-16_cot_0.0003_32_64_0.05_500,tasd_rest-16_cot_0.0003_32_64_0.05_0_500_8,0.6058,0.5192,0.4345
482,rest-16,tasd,cot,3e-05,32,64,0.05,0,500,10,tasd_rest-16_cot_3e-05_32_64_0.05_500,tasd_rest-16_cot_3e-05_32_64_0.05_0_500_10,0.5969,0.5415,0.4254
2256,rest-16,tasd,cot,3e-05,8,8,0.05,0,500,8,tasd_rest-16_cot_3e-05_8_8_0.05_500,tasd_rest-16_cot_3e-05_8_8_0.05_0_500_8,0.5853,0.5314,0.4138
5069,rest-16,tasd,cot,3e-05,8,16,0.05,0,500,8,tasd_rest-16_cot_3e-05_8_16_0.05_500,tasd_rest-16_cot_3e-05_8_16_0.05_0_500_8,0.5703,0.4423,0.3989


Loading dataset ...
Dataset name: rest-16
Split setting:  Custom
Eval Mode:  Validation
Low Resource Setting:  500
Train Length:  416
Eval Length:  84
Split: 0
0.6475
0.6058
0.661
0.6502
0.6122
0.5969
0.5703
0.5854
         Source  ddof1            H  p-unc
Kruskal  config      7  2444.395083    0.0
Paarweise Vergleiche mit Holm-Korrektur:


Unnamed: 0,Model 1,Model 2,Mean Model 1,Mean Model 2,Model 1 CI Lower,Model 1 CI Upper,Model 2 CI Lower,Model 2 CI Upper,CI Overlap,U Statistic (Model1 > Model2),P-Value (Model1 > Model2),Corrected P-Value (Model1 > Model2),Significant (Model1 > Model2)
0,0.0003_32_32,0.0003_32_64,0.646672,0.606403,0.55909,0.7303,0.51449,0.69609,True,736236.0,0.0,0.0,True
1,0.0003_32_32,0.0003_8_16,0.646672,0.66154,0.55909,0.7303,0.565368,0.752315,True,410210.5,0.0,0.0,True
2,0.0003_32_32,0.0003_8_8,0.646672,0.649696,0.55909,0.7303,0.5633,0.735217,True,479135.5,0.106154,0.209542,False
3,0.0003_32_32,3e-05_32_32,0.646672,0.609763,0.55909,0.7303,0.506588,0.704522,True,711687.0,0.0,0.0,True
4,0.0003_32_32,3e-05_32_64,0.646672,0.593503,0.55909,0.7303,0.5,0.685908,True,792588.5,0.0,0.0,True
5,0.0003_32_32,3e-05_8_16,0.646672,0.571143,0.55909,0.7303,0.476573,0.664067,True,875151.0,0.0,0.0,True
6,0.0003_32_32,3e-05_8_8,0.646672,0.583822,0.55909,0.7303,0.480293,0.677275,True,827204.0,0.0,0.0,True
7,0.0003_32_64,0.0003_8_16,0.606403,0.66154,0.51449,0.69609,0.565368,0.752315,True,210281.5,0.0,0.0,True
8,0.0003_32_64,0.0003_8_8,0.606403,0.649696,0.51449,0.69609,0.5633,0.735217,True,248781.5,0.0,0.0,True
9,0.0003_32_64,3e-05_32_32,0.606403,0.609763,0.51449,0.69609,0.506588,0.704522,True,479052.5,0.104771,0.209542,False


In [14]:
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        filename = ''
        
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e' or cond_parameters[0] == 'e2e-e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
            
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()
        
        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)
        
        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except:
        pass

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all
# Means
results_acd = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'acd', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_acsa = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'acsa',
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_e2e = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'e2e', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

# Means
results_tasd = results_all[np.logical_and.reduce([results_all['dataset'] == 'rest-16', 
                                                     results_all['task'] == 'tasd', 
                                                     results_all['split'] == str(0),
                                                     results_all['lr_setting'] != 'orig'])]

results = pd.concat([results_acd, results_acsa, results_e2e, results_tasd])

results = results[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

results = results.reset_index()

idx_max = results.groupby(['model_config', 'split'])['f1-micro'].idxmax()
results_per_epoch = results.loc[idx_max]

In [15]:
for comb, group in results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha']):
    print(comb)
    print(len(group))
    print(f"{np.mean(group['f1-micro'])*100:.2f}")

('0.0003', '32', '32')
33
77.55
('0.0003', '32', '64')
33
73.46
('0.0003', '8', '16')
33
77.78
('0.0003', '8', '8')
33
78.85
('3e-05', '32', '32')
33
76.77
('3e-05', '32', '64')
33
77.75
('3e-05', '8', '16')
33
76.20
('3e-05', '8', '8')
33
74.53
