## Language

In [134]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Notebook-wide configuration: never truncate wide result tables and make
# anything that draws from `random` reproducible.
random.seed(42)
pd.set_option('display.max_columns', None)

# Shared argument container; the cells below attach `task`, `lr_setting`,
# `results` and `results_baseline` to it before calling the statistics helpers.
args = SimpleNamespace(dataset='GERestaurant')

# One independent result store per task (stats_acd is keyed by lr_setting below;
# presumably the others follow the same scheme in later cells).
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

def computePromptStatistics(args):
    """Compare the best fine-tuned-LLM prompt against the task's baselines.

    For the dataset, task and low-resource setting held in ``args`` the
    function collects the micro-F1 of every prompt variant and every baseline
    method on splits 1-5, keeps only the prompt with the highest mean
    micro-F1, and then runs an omnibus test followed by Holm-corrected
    pairwise tests over the remaining columns.

    Intermediate tables (baseline rows, per-split scores, normality results,
    omnibus test) are shown via ``display``/``print`` as a side effect.

    Args:
        args: Namespace-like object providing ``results`` (fine-tuned-LLM
            runs), ``results_baseline`` (baseline runs), ``dataset``,
            ``task`` and ``lr_setting``. ``lr_setting == 0`` selects the
            ``'full'`` setting; any other value is matched as a string.

    Returns:
        pandas.DataFrame with one row per compared pair and the columns
        ``test``, ``comparison``, ``mean 1``, ``std 1``, ``mean 2``,
        ``std 2``, ``statistic``, ``p_value``, ``corrected_p_value`` and
        ``significant``.

    Raises:
        IndexError: if a baseline method has no row for one of the splits 1-5.
        ValueError: if none of the task's prompts has results for all splits.
    """
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    # Split '0' is excluded from both tables; only splits 1-5 are compared.
    llm_runs = args.results
    llm_mask = (
        (llm_runs['dataset'] == args.dataset)
        & (llm_runs['task'] == args.task)
        & (llm_runs['split'] != '0')
        & (llm_runs['lr_setting'] == lr_setting)
    )
    results_sub = llm_runs[llm_mask].sort_values(by=['f1-micro'], ascending=False)

    baseline_runs = args.results_baseline
    baseline_mask = (
        (baseline_runs['lr_setting'] == lr_setting)
        & (baseline_runs['dataset'] == args.dataset)
        & (baseline_runs['task'] == args.task)
        & (baseline_runs['split'] != '0')
    )
    results_sub_baseline = baseline_runs[baseline_mask].sort_values(by=['f1-micro'], ascending=False)

    display(results_sub_baseline)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    results_sub_baseline = results_sub_baseline[['task', 'method', 'dataset', 'learning_rate', 'batch_size', 'lr_setting', 'split', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep, for every (model_config, split) pair, the row with the best micro-F1.
    # NOTE(review): if several model configs exist for one prompt and split,
    # the lookup below takes the first one in group-key order - confirm that
    # at most one config per prompt is expected here.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    prompts = ['basic', 'context'] if args.task == 'acd' else ['basic', 'context', 'cot']
    if args.task in ('acd', 'acsa'):
        baselines = ['prompting', 'hier_gcn', 'bert_clf']
    elif args.task in ('e2e', 'e2e-e'):
        baselines = ['prompting', 'instructABSA', 'tas_bert']
    else:
        baselines = ['prompting', 'para', 'mvp']

    def f1_by_split(frame, column, value):
        # Micro-F1 of the first matching row for each of the splits 1-5.
        # Raises IndexError as soon as a split has no matching row.
        scores = {}
        for split in range(1, 6):
            rows = frame[(frame['split'] == str(split)) & (frame[column] == value)]
            scores[split] = rows['f1-micro'].iloc[0]
        return scores

    f1_prompts = {}

    for prompt in prompts:
        try:
            f1_prompts[prompt] = f1_by_split(results_per_epoch, 'prompt', prompt)
        except IndexError:
            # Prompt without results for every split: leave it out of the comparison.
            continue

    # Baselines are mandatory: a missing split propagates as IndexError.
    for method in baselines:
        f1_prompts[method] = f1_by_split(results_sub_baseline, 'method', method)

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    # Only use the best performing FT-LLM prompt
    available_prompts = [prompt for prompt in prompts if prompt in df_prompts.columns]
    if not available_prompts:
        raise ValueError(
            f"No prompt of task '{args.task}' has results for all splits "
            f"(dataset '{args.dataset}', lr_setting '{lr_setting}')"
        )

    best_prompt = df_prompts[available_prompts].mean().idxmax()

    # Drop only prompts that actually became columns; skipped prompts are not
    # in the frame and would make DataFrame.drop raise KeyError.
    prompts_to_drop = [prompt for prompt in available_prompts if prompt != best_prompt]
    df_prompts = df_prompts.drop(columns=prompts_to_drop)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (split, prompt, f1) as required by the repeated measures ANOVA.
    long_format = (
        df_prompts
        .melt(var_name='prompt', value_name='f1', ignore_index=False)
        .reset_index()
        .rename(columns={'index': 'split'})
    )
    print(long_format)

    if all_normal:
        # All columns pass the normality check: repeated measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_format)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column fails the normality check: Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when every column passed the normality check.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise fall back to the Wilcoxon test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in one table
    results_df = pd.DataFrame(results)

    # Holm (Bonferroni-Holm) correction over all pairwise p-values
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df

def computeLowResourceStatistics(args):
    """Compare micro-F1 across the low-resource settings '1000', '500' and 'full'.

    Two analyses are printed/displayed for the dataset and task held in
    ``args``:

    1. one comparison of the settings per prompt variant, and
    2. one comparison where each setting is represented by the prompt with
       the highest mean micro-F1 over splits 1-5.

    Each analysis shows the per-split scores, the normality results, an
    omnibus test and a Holm-corrected table of pairwise tests. An analysis
    with fewer than two complete settings is skipped with a message.

    Args:
        args: Namespace-like object providing ``results`` (fine-tuned-LLM
            runs), ``dataset`` and ``task``.

    Returns:
        None. All results are emitted through ``print`` and ``display``.
    """
    runs = args.results
    selection = (
        (runs['dataset'] == args.dataset)
        & (runs['task'] == args.task)
        & (runs['split'] != '0')
    )
    # Sorted best-first so that the first matching row of any lookup below is
    # the run with the highest micro-F1.
    results_sub = runs[selection].sort_values(by=['f1-micro'], ascending=False)

    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    prompts = ['basic', 'context'] if args.task == 'acd' else ['basic', 'context', 'cot']
    lr_settings = ['1000', '500', 'full']

    # --- Analysis 1: settings compared separately for every prompt ---
    for prompt in prompts:
        f1_splits = {}

        for lr_setting in lr_settings:
            try:
                f1_splits[lr_setting] = _lr_f1_by_split(results_sub, lr_setting, prompt)
            except IndexError:
                # Setting without results for every split: leave it out.
                continue

        df_splits = pd.DataFrame(f1_splits)

        display(df_splits)

        results_df = _lr_pairwise_report(df_splits)
        if results_df is None:
            print(f'Skipping LR-Comparison of {prompt}: fewer than two settings have results for all splits')
            continue

        print('Results for LR-Comparison of : ', prompt)
        display(results_df)

    ####
    # Compute based on best performing prompt per low-resource setting
    ####

    f1_splits = {}

    for prompt in prompts:
        for lr_setting in lr_settings:
            try:
                f1 = _lr_f1_by_split(results_sub, lr_setting, prompt)
            except IndexError:
                continue

            # Keep the prompt with the highest mean micro-F1 for this setting.
            if lr_setting not in f1_splits or np.mean(list(f1.values())) > np.mean(list(f1_splits[lr_setting].values())):
                f1_splits[lr_setting] = f1

    df_splits = pd.DataFrame(f1_splits)

    display(df_splits)

    results_df = _lr_pairwise_report(df_splits)
    if results_df is None:
        print('Skipping LR-Comparison of best Prompt per LR-Setting: fewer than two settings have results for all splits')
        return

    print('Results for LR-Comparison of best Prompt per LR-Setting')
    display(results_df)


def _lr_f1_by_split(frame, lr_setting, prompt):
    """Return {split: micro-F1} for splits 1-5 of one (lr_setting, prompt) pair.

    The first matching row of ``frame`` is used for each split. Raises
    IndexError as soon as a split has no matching row.
    """
    scores = {}
    for split in range(1, 6):
        rows = frame[
            (frame['lr_setting'] == lr_setting)
            & (frame['split'] == str(split))
            & (frame['prompt'] == prompt)
        ]
        scores[split] = rows['f1-micro'].iloc[0]
    return scores


def _lr_pairwise_report(df_splits):
    """Run normality check, omnibus test and Holm-corrected pairwise tests.

    ``df_splits`` holds one column per compared condition and one row per
    split. Normality results, the long-format table and the omnibus test are
    displayed/printed. Returns the pairwise result table, or None when fewer
    than two columns are available (nothing to compare).
    """
    if df_splits.shape[1] < 2:
        return None

    normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (split, prompt, f1); the within-factor column keeps the
    # name 'prompt' although it holds the column labels of df_splits.
    long_format = (
        df_splits
        .melt(var_name='prompt', value_name='f1', ignore_index=False)
        .reset_index()
        .rename(columns={'index': 'split'})
    )
    print(long_format)

    if all_normal:
        # All columns pass the normality check: repeated measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_format)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column fails the normality check: Friedman test.
        friedman = pg.friedman(df_splits)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_splits.columns, 2):
        if all_normal:
            # Paired t-test when every column passed the normality check.
            test = 't-test'
            test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise fall back to the Wilcoxon test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_splits[col1]),
            'std 1': np.std(df_splits[col1]),
            'mean 2': np.mean(df_splits[col2]),
            'std 2': np.std(df_splits[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in one table
    results_df = pd.DataFrame(results)

    # Holm (Bonferroni-Holm) correction over all pairwise p-values
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df



## ACD

In [135]:
# LLM-based Method
# Every sub-folder of RESULTS_PATH is one run; its underscore-separated name
# supplies the first ten columns of `col_names`.

RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')

        # The metrics file read for a run depends on the task prefix.
        if cond_parameters[0] == 'acd':
            filename = 'metrics_asp.tsv'
        elif cond_parameters[0] == 'acsa':
            filename = 'metrics_asp_pol.tsv'
        elif cond_parameters[0] == 'e2e':
            filename = 'metrics_pol.tsv'
        elif cond_parameters[0] == 'tasd':
            filename = 'metrics_phrases.tsv'
        else:
            # Unknown task prefix: there is no metrics file to read, skip the folder.
            continue

        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        model_config = cond_parameters.copy()

        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception:
        # Best effort: folders with missing or malformed metrics are skipped.
        # `Exception` (not a bare except) keeps the cell interruptible.
        continue

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
RESULTS_PATH = '../results/'

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    try:
        cond_parameters = folder_name.split('_')
        if cond_parameters[0] == 'acd':
            df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
            df = df.set_index(df.columns[0])
            # Rearrange the folder-name fields into the `col_names` order; the
            # hyper-parameter columns do not apply to prompting and are set to '-'.
            cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
            cond_parameters.pop(5)
            cond_parameters.pop(4)
            cond_parameters[1:1] = [METHOD]
            cond_parameters[5:5] = ['-']
            cond_parameters[6:6] = ['-']
            cond_parameters[7:7] = ['-']
            cond_parameters.append(df.loc['Micro-AVG', 'f1'])
            cond_parameters.append(df.loc['Macro-AVG', 'f1'])
            cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
            runs.append(cond_parameters)
    except Exception:
        # Best effort: skip folders with missing or malformed metrics.
        continue



# Multi-label Classification
METHOD = 'bert_clf'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        # An lr_setting of '0' in the folder name denotes the full dataset.
        if cond_parameters[3] == '0':
            cond_parameters[3] = 'full'
        runs.append(cond_parameters)
    except Exception:
        # Best effort: skip folders with missing or malformed metrics.
        continue

# Hier-GCN
METHOD = 'hier_gcn'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    cond_params = cond_parameters.copy()
    if cond_params[0] == 'acd':
        # The score is taken from the fourth line of the evaluation file,
        # which has the form '<name> = <value>'.
        with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'cate_eval_results.txt'), 'r') as f:
            f1 = f.readlines()[3].split(' = ')[1]

        cond_params[1:1] = [METHOD]
        if cond_params[3] == '0':
            cond_params[3] = 'full'
        # Only one score is read for Hier-GCN; macro-F1 and accuracy stay empty.
        cond_params.extend([round(float(f1), 4), None, None])
        runs.append(cond_params)


results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

In [126]:
# Show the fine-tuned-LLM runs whose lr_setting field is 'orig'.
results_all[results_all['lr_setting'] == 'orig']

Unnamed: 0,task,dataset,prompt,learning_rate,lora_r,lora_alpha,lora_dropout,split,lr_setting,epoch,model_config,path,f1-micro,f1-macro,accuracy
99,tasd,GERestaurant,cot,0.0003,8,8,0.05,0,orig,7,tasd_GERestaurant_cot_0.0003_8_8_0.05_orig,tasd_GERestaurant_cot_0.0003_8_8_0.05_0_orig_7,0.7353,0.7078,0.5814
283,tasd,rest-16,cot,0.0003,8,8,0.05,0,orig,7,tasd_rest-16_cot_0.0003_8_8_0.05_orig,tasd_rest-16_cot_0.0003_8_8_0.05_0_orig_7,0.702,0.5563,0.5408
355,acd,GERestaurant,context,0.0003,8,16,0.05,0,orig,10,acd_GERestaurant_context_0.0003_8_16_0.05_orig,acd_GERestaurant_context_0.0003_8_16_0.05_0_or...,0.8767,0.8744,0.7804
863,tasd,GERestaurant,context,3e-05,32,32,0.05,0,orig,9,tasd_GERestaurant_context_3e-05_32_32_0.05_orig,tasd_GERestaurant_context_3e-05_32_32_0.05_0_o...,0.7297,0.6923,0.5744
986,acsa,rest-16,cot,3e-05,32,64,0.05,0,orig,5,acsa_rest-16_cot_3e-05_32_64_0.05_orig,acsa_rest-16_cot_3e-05_32_64_0.05_0_orig_5,0.8248,0.766,0.7018
1235,acsa,GERestaurant,cot,0.0003,8,16,0.05,0,orig,7,acsa_GERestaurant_cot_0.0003_8_16_0.05_orig,acsa_GERestaurant_cot_0.0003_8_16_0.05_0_orig_7,0.7624,0.7507,0.616
1246,acd,rest-16,context,0.0003,8,8,0.05,0,orig,6,acd_rest-16_context_0.0003_8_8_0.05_orig,acd_rest-16_context_0.0003_8_8_0.05_0_orig_6,0.8109,0.6903,0.682
1895,acsa,rest-16,context,3e-05,32,32,0.05,0,orig,7,acsa_rest-16_context_3e-05_32_32_0.05_orig,acsa_rest-16_context_3e-05_32_32_0.05_0_orig_7,0.8161,0.7548,0.6893
2225,acsa,GERestaurant,basic,3e-05,8,16,0.05,0,orig,8,acsa_GERestaurant_basic_3e-05_8_16_0.05_orig,acsa_GERestaurant_basic_3e-05_8_16_0.05_0_orig_8,0.8545,0.8508,0.746
2487,tasd,rest-16,basic,0.0003,8,8,0.05,0,orig,9,tasd_rest-16_basic_0.0003_8_8_0.05_orig,tasd_rest-16_basic_0.0003_8_8_0.05_0_orig_9,0.725,0.6031,0.5686


### Full Dataset

In [96]:
# Full-dataset condition: computePromptStatistics maps lr_setting 0 to 'full'.
args.lr_setting = 0
args.task = 'acd'

# Keep the pairwise comparison table under key '0' and show it as cell output.
stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
172,acd,bert_clf,GERestaurant,full,2,2e-05,16,3,0.9326,0.9189,0.8737
109,acd,bert_clf,GERestaurant,full,4,2e-05,16,3,0.9264,0.9183,0.8628
197,acd,bert_clf,GERestaurant,full,3,2e-05,16,3,0.9224,0.9094,0.8559
61,acd,bert_clf,GERestaurant,full,5,2e-05,16,3,0.9186,0.8959,0.8495
175,acd,bert_clf,GERestaurant,full,1,2e-05,16,3,0.9147,0.9076,0.8429
218,acd,hier_gcn,GERestaurant,full,2,5e-05,8,20.0,0.9107,,
229,acd,hier_gcn,GERestaurant,full,5,5e-05,8,20.0,0.9011,,
234,acd,hier_gcn,GERestaurant,full,3,5e-05,8,20.0,0.8932,,
225,acd,hier_gcn,GERestaurant,full,4,5e-05,8,20.0,0.892,,
226,acd,hier_gcn,GERestaurant,full,1,5e-05,8,20.0,0.8886,,


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.8747,0.8801,0.8338,0.8886,0.9147
2,0.8663,0.8698,0.7778,0.9107,0.9326
3,0.8757,0.8674,0.7962,0.8932,0.9224
4,0.8943,0.8896,0.8417,0.892,0.9264
5,0.8827,0.8846,0.834,0.9011,0.9186


Unnamed: 0,W,pval,normal
basic,0.96305,0.829044,True


Unnamed: 0,W,pval,normal
prompting,0.845013,0.179263,True


Unnamed: 0,W,pval,normal
hier_gcn,0.906726,0.448167,True


Unnamed: 0,W,pval,normal
bert_clf,0.987348,0.969604,True


    split     prompt      f1
0       1      basic  0.8747
1       2      basic  0.8663
2       3      basic  0.8757
3       4      basic  0.8943
4       5      basic  0.8827
5       1  prompting  0.8338
6       2  prompting  0.7778
7       3  prompting  0.7962
8       4  prompting  0.8417
9       5  prompting  0.8340
10      1   hier_gcn  0.8886
11      2   hier_gcn  0.9107
12      3   hier_gcn  0.8932
13      4   hier_gcn  0.8920
14      5   hier_gcn  0.9011
15      1   bert_clf  0.9147
16      2   bert_clf  0.9326
17      3   bert_clf  0.9224
18      4   bert_clf  0.9264
19      5   bert_clf  0.9186
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  38.249525  0.000002  0.882316  0.373114


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.87874,0.009361,0.8167,0.0251,6.691984,0.002594,0.009227,True
1,t-test,basic vs hier_gcn,0.87874,0.009361,0.89712,0.00793,-2.45007,0.07044,0.07044,False
2,t-test,basic vs bert_clf,0.87874,0.009361,0.92294,0.006202,-7.328092,0.001845,0.009227,True
3,t-test,prompting vs hier_gcn,0.8167,0.0251,0.89712,0.00793,-5.20735,0.006483,0.012967,True
4,t-test,prompting vs bert_clf,0.8167,0.0251,0.92294,0.006202,-7.219654,0.001952,0.009227,True
5,t-test,hier_gcn vs bert_clf,0.89712,0.00793,0.92294,0.006202,-8.863803,0.000895,0.005368,True


### 1000

In [97]:
# Low-resource condition '1000' (the value is matched as the string '1000').
args.lr_setting = 1000
args.task = 'acd'

# Keep the pairwise comparison table under key '1000' and show it as cell output.
stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
104,acd,bert_clf,GERestaurant,1000,4,2e-05,16,6,0.9254,0.9198,0.8612
86,acd,bert_clf,GERestaurant,1000,5,2e-05,16,6,0.9202,0.8965,0.8522
192,acd,bert_clf,GERestaurant,1000,2,2e-05,16,6,0.9062,0.8848,0.8285
103,acd,bert_clf,GERestaurant,1000,3,2e-05,16,6,0.9053,0.8866,0.827
191,acd,bert_clf,GERestaurant,1000,1,2e-05,16,6,0.904,0.8944,0.8249
236,acd,hier_gcn,GERestaurant,1000,3,5e-05,8,43.0,0.8904,,
227,acd,hier_gcn,GERestaurant,1000,2,5e-05,8,43.0,0.8785,,
241,acd,hier_gcn,GERestaurant,1000,5,5e-05,8,43.0,0.872,,
228,acd,hier_gcn,GERestaurant,1000,4,5e-05,8,43.0,0.8682,,
238,acd,hier_gcn,GERestaurant,1000,1,5e-05,8,43.0,0.8614,,


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.8798,0.8698,0.8095,0.8614,0.904
2,0.8423,0.8509,0.813,0.8785,0.9062
3,0.8625,0.8555,0.8335,0.8904,0.9053
4,0.8952,0.8993,0.8384,0.8682,0.9254
5,0.8527,0.8469,0.8258,0.872,0.9202


Unnamed: 0,W,pval,normal
basic,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
prompting,0.923141,0.550403,True


Unnamed: 0,W,pval,normal
hier_gcn,0.975207,0.907495,True


Unnamed: 0,W,pval,normal
bert_clf,0.816104,0.108917,True


    split     prompt      f1
0       1      basic  0.8798
1       2      basic  0.8423
2       3      basic  0.8625
3       4      basic  0.8952
4       5      basic  0.8527
5       1  prompting  0.8095
6       2  prompting  0.8130
7       3  prompting  0.8335
8       4  prompting  0.8384
9       5  prompting  0.8258
10      1   hier_gcn  0.8614
11      2   hier_gcn  0.8785
12      3   hier_gcn  0.8904
13      4   hier_gcn  0.8682
14      5   hier_gcn  0.8720
15      1   bert_clf  0.9040
16      2   bert_clf  0.9062
17      3   bert_clf  0.9053
18      4   bert_clf  0.9254
19      5   bert_clf  0.9202
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  36.667919  0.000003  0.855963  0.525171


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.8665,0.018935,0.82404,0.011244,4.781404,0.008767,0.025207,True
1,t-test,basic vs hier_gcn,0.8665,0.018935,0.8741,0.009855,-0.597103,0.582594,0.582594,False
2,t-test,basic vs bert_clf,0.8665,0.018935,0.91222,0.008821,-5.248956,0.006302,0.025207,True
3,t-test,prompting vs hier_gcn,0.82404,0.011244,0.8741,0.009855,-8.378715,0.00111,0.00555,True
4,t-test,prompting vs bert_clf,0.82404,0.011244,0.91222,0.008821,-20.404385,3.4e-05,0.000204,True
5,t-test,hier_gcn vs bert_clf,0.8741,0.009855,0.91222,0.008821,-5.063645,0.007162,0.025207,True


### 500

In [98]:
# Low-resource condition '500' (the value is matched as the string '500').
args.lr_setting = 500
args.task = 'acd'

# Keep the pairwise comparison table under key '500' and show it as cell output.
stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
195,acd,bert_clf,GERestaurant,500,4,2e-05,16,13,0.9085,0.9085,0.8323
207,acd,bert_clf,GERestaurant,500,5,2e-05,16,13,0.9061,0.8819,0.8284
159,acd,bert_clf,GERestaurant,500,3,2e-05,16,13,0.9031,0.8849,0.8233
189,acd,bert_clf,GERestaurant,500,1,2e-05,16,13,0.8969,0.8879,0.8131
127,acd,bert_clf,GERestaurant,500,2,2e-05,16,13,0.8917,0.8692,0.8046
214,acd,hier_gcn,GERestaurant,500,3,5e-05,8,86.0,0.8828,,
239,acd,hier_gcn,GERestaurant,500,2,5e-05,8,86.0,0.8614,,
243,acd,hier_gcn,GERestaurant,500,5,5e-05,8,86.0,0.8548,,
230,acd,hier_gcn,GERestaurant,500,4,5e-05,8,86.0,0.8427,,
231,acd,hier_gcn,GERestaurant,500,1,5e-05,8,86.0,0.8341,,


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.887,0.8369,0.8214,0.8341,0.8969
2,0.8265,0.8215,0.8274,0.8614,0.8917
3,0.8557,0.8258,0.7666,0.8828,0.9031
4,0.8674,0.8406,0.7864,0.8427,0.9085
5,0.8694,0.837,0.8327,0.8548,0.9061


Unnamed: 0,W,pval,normal
basic,0.944206,0.695806,True


Unnamed: 0,W,pval,normal
prompting,0.86739,0.256015,True


Unnamed: 0,W,pval,normal
hier_gcn,0.970341,0.877435,True


Unnamed: 0,W,pval,normal
bert_clf,0.944973,0.701274,True


    split     prompt      f1
0       1      basic  0.8870
1       2      basic  0.8265
2       3      basic  0.8557
3       4      basic  0.8674
4       5      basic  0.8694
5       1  prompting  0.8214
6       2  prompting  0.8274
7       3  prompting  0.7666
8       4  prompting  0.7864
9       5  prompting  0.8327
10      1   hier_gcn  0.8341
11      2   hier_gcn  0.8614
12      3   hier_gcn  0.8828
13      4   hier_gcn  0.8427
14      5   hier_gcn  0.8548
15      1   bert_clf  0.8969
16      2   bert_clf  0.8917
17      3   bert_clf  0.9031
18      4   bert_clf  0.9085
19      5   bert_clf  0.9061
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  14.304537  0.000288  0.763806  0.677667


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.8612,0.020028,0.8069,0.025848,3.302019,0.029877,0.089631,False
1,t-test,basic vs hier_gcn,0.8612,0.020028,0.85516,0.016744,0.367866,0.731612,0.731612,False
2,t-test,basic vs bert_clf,0.8612,0.020028,0.90126,0.006158,-4.467839,0.011093,0.044372,True
3,t-test,prompting vs hier_gcn,0.8069,0.025848,0.85516,0.016744,-2.610997,0.059358,0.118716,False
4,t-test,prompting vs bert_clf,0.8069,0.025848,0.90126,0.006158,-6.477684,0.002927,0.017563,True
5,t-test,hier_gcn vs bert_clf,0.85516,0.016744,0.90126,0.006158,-5.139222,0.006795,0.033975,True


In [136]:
# Compare the '1000', '500' and 'full' settings for the ACD task. The function
# prints/displays its tables and returns nothing, so the cell has no value output.
args.task = 'acd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8798,0.887,0.8747
2,0.8423,0.8265,0.8663
3,0.8625,0.8557,0.8757
4,0.8952,0.8674,0.8943
5,0.8527,0.8694,0.8827


Unnamed: 0,W,pval,normal
1000,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
500,0.944206,0.695806,True


Unnamed: 0,W,pval,normal
full,0.96305,0.829044,True


    split prompt      f1
0       1   1000  0.8798
1       2   1000  0.8423
2       3   1000  0.8625
3       4   1000  0.8952
4       5   1000  0.8527
5       1    500  0.8870
6       2    500  0.8265
7       3    500  0.8557
8       4    500  0.8674
9       5    500  0.8694
10      1   full  0.8747
11      2   full  0.8663
12      3   full  0.8757
13      4   full  0.8943
14      5   full  0.8827
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  2.635654  0.13204  0.160387  0.932553
Results for LR-Comparison of :  basic


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8665,0.018935,0.8612,0.020028,0.668002,0.5407,0.5407,False
1,t-test,1000 vs full,0.8665,0.018935,0.87874,0.009361,-1.796921,0.146763,0.337795,False
2,t-test,500 vs full,0.8612,0.020028,0.87874,0.009361,-2.026986,0.112598,0.337795,False


Unnamed: 0,1000,500,full
1,0.8698,0.8369,0.8801
2,0.8509,0.8215,0.8698
3,0.8555,0.8258,0.8674
4,0.8993,0.8406,0.8896
5,0.8469,0.837,0.8846


Unnamed: 0,W,pval,normal
1000,0.85488,0.210443,True


Unnamed: 0,W,pval,normal
500,0.874845,0.286601,True


Unnamed: 0,W,pval,normal
full,0.928675,0.587366,True


    split prompt      f1
0       1   1000  0.8698
1       2   1000  0.8509
2       3   1000  0.8555
3       4   1000  0.8993
4       5   1000  0.8469
5       1    500  0.8369
6       2    500  0.8215
7       3    500  0.8258
8       4    500  0.8406
9       5    500  0.8370
10      1   full  0.8801
11      2   full  0.8698
12      3   full  0.8674
13      4   full  0.8896
14      5   full  0.8846
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  27.50767  0.00026  0.694165  0.527657
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.86448,0.019052,0.83236,0.007362,4.122545,0.014582,0.029164,True
1,t-test,1000 vs full,0.86448,0.019052,0.8783,0.008505,-1.810502,0.144463,0.144463,False
2,t-test,500 vs full,0.83236,0.007362,0.8783,0.008505,-30.961485,6e-06,1.9e-05,True


Unnamed: 0,1000,500,full
1,0.8798,0.887,0.8747
2,0.8423,0.8265,0.8663
3,0.8625,0.8557,0.8757
4,0.8952,0.8674,0.8943
5,0.8527,0.8694,0.8827


Unnamed: 0,W,pval,normal
1000,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
500,0.944206,0.695806,True


Unnamed: 0,W,pval,normal
full,0.96305,0.829044,True


    split prompt      f1
0       1   1000  0.8798
1       2   1000  0.8423
2       3   1000  0.8625
3       4   1000  0.8952
4       5   1000  0.8527
5       1    500  0.8870
6       2    500  0.8265
7       3    500  0.8557
8       4    500  0.8674
9       5    500  0.8694
10      1   full  0.8747
11      2   full  0.8663
12      3   full  0.8757
13      4   full  0.8943
14      5   full  0.8827
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  2.635654  0.13204  0.160387  0.932553
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8665,0.018935,0.8612,0.020028,0.668002,0.5407,0.5407,False
1,t-test,1000 vs full,0.8665,0.018935,0.87874,0.009361,-1.796921,0.146763,0.337795,False
2,t-test,500 vs full,0.8612,0.020028,0.87874,0.009361,-2.026986,0.112598,0.337795,False


## ACSA

In [137]:
# Collect the metrics of every fine-tuned LLM run found under RESULTS_PATH
# into `results_all`, one row per run folder. The hyper-parameters of a run
# are encoded in its folder name as '_'-separated tokens (see col_names).
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file to read for each task (the task is the first folder-name token).
metric_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    filename = metric_files.get(cond_parameters[0], '')
    if not filename:
        # Unknown task prefix: there is no metrics file to read for it.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # Full configuration string (all folder-name tokens) -> 'path' column.
        model_config_full = cond_parameters.copy()

        # Configuration without the split (index 7) and the epoch (last
        # token) -> 'model_config' column, shared by all splits of a config.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except Exception:
        # Best effort: skip runs whose metrics are missing or malformed, but
        # (unlike a bare `except:`) let KeyboardInterrupt/SystemExit through.
        continue

    runs.append(cond_parameters)

results_all = pd.DataFrame(runs, columns = col_names)

###
# Baselines
##
# All baseline loaders below append rows in the `col_names` layout to `runs`.
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
RESULTS_PATH = '../results/'

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    if cond_parameters[0] != 'acsa':
        # Only ACSA prompting runs are relevant for this section.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Rearrange the folder-name tokens into the baseline column layout:
        # swap tokens 2 and 4, drop the two tokens that have no column, then
        # insert the method name after the task.
        cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
        cond_parameters.pop(5)
        cond_parameters.pop(4)
        cond_parameters[1:1] = [METHOD]

        # Prompting involves no training, so learning_rate, batch_size and
        # epochs are filled with a '-' placeholder.
        cond_parameters[5:5] = ['-']
        cond_parameters[6:6] = ['-']
        cond_parameters[7:7] = ['-']

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except Exception:
        # Best effort: skip runs with missing or malformed metrics, but do
        # not swallow KeyboardInterrupt/SystemExit like a bare `except:`.
        continue

    runs.append(cond_parameters)

# Multi-label Classification
METHOD = 'bert_clf'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])

        # Insert the method name after the task; the lr_setting token then
        # sits at index 3, where '0' is renamed to the 'full' label.
        cond_parameters[1:1] = [METHOD]
        if cond_parameters[3] == '0':
            cond_parameters[3] = 'full'
    except Exception:
        # Best effort: skip folders without usable 'metrics_asp_pol.tsv', but
        # do not swallow KeyboardInterrupt/SystemExit like a bare `except:`.
        continue

    runs.append(cond_parameters)

# Hier-GCN
METHOD = 'hier_gcn'

folder_names = [
    folder
    for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD))
    if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints'
]

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    cond_params = list(cond_parameters)

    # Only ACSA runs are collected here.
    if cond_params[0] != 'acsa':
        continue

    # The score is the value after ' = ' on the fourth line of the report.
    with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'eval_results.txt'), 'r') as f:
        f1 = f.readlines()[3].split(' = ')[1]

    # Method name goes right after the task; lr_setting '0' means 'full'.
    cond_params.insert(1, METHOD)
    if cond_params[3] == '0':
        cond_params[3] = 'full'

    # Only one F1 value is read from the report, stored under 'f1-micro';
    # 'f1-macro' and 'accuracy' stay empty.
    cond_params += [round(float(f1), 4), None, None]
    runs.append(cond_params)


# Hand both result tables to the statistics helpers through `args`.
results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [101]:
# ACSA on the full training set: computePromptStatistics maps lr_setting 0
# to the 'full' label before filtering the result tables.
args.lr_setting = 0
args.task = 'acsa'

stats_acsa['0'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_acsa['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
67,acsa,bert_clf,GERestaurant,full,5,2e-05,16,3,0.8543,0.7975,0.7457
147,acsa,hier_gcn,GERestaurant,full,4,5e-05,8,20.0,0.8333,,
142,acsa,hier_gcn,GERestaurant,full,5,5e-05,8,20.0,0.8322,,
94,acsa,bert_clf,GERestaurant,full,4,2e-05,16,3,0.8296,0.8076,0.7089
81,acsa,bert_clf,GERestaurant,full,1,2e-05,16,3,0.8261,0.7812,0.7038
65,acsa,bert_clf,GERestaurant,full,3,2e-05,16,3,0.8261,0.7755,0.7038
151,acsa,hier_gcn,GERestaurant,full,2,5e-05,8,20.0,0.8249,,
133,acsa,bert_clf,GERestaurant,full,2,2e-05,16,3,0.8225,0.7574,0.6985
160,acsa,hier_gcn,GERestaurant,full,3,5e-05,8,20.0,0.8205,,
149,acsa,hier_gcn,GERestaurant,full,1,5e-05,8,20.0,0.8136,,


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.8583,0.8438,0.851,0.7614,0.8136,0.8261
2,0.8255,0.8093,0.8086,0.7606,0.8249,0.8225
3,0.8226,0.8356,0.8163,0.7639,0.8205,0.8261
4,0.8659,0.865,0.8191,0.8024,0.8333,0.8296
5,0.8332,0.8665,0.8158,0.8071,0.8322,0.8543


Unnamed: 0,W,pval,normal
context,0.915966,0.504258,True


Unnamed: 0,W,pval,normal
prompting,0.752105,0.031126,False


Unnamed: 0,W,pval,normal
hier_gcn,0.935255,0.632617,True


Unnamed: 0,W,pval,normal
bert_clf,0.729227,0.018894,False


    split     prompt      f1
0       1    context  0.8438
1       2    context  0.8093
2       3    context  0.8356
3       4    context  0.8650
4       5    context  0.8665
5       1  prompting  0.7614
6       2  prompting  0.7606
7       3  prompting  0.7639
8       4  prompting  0.8024
9       5  prompting  0.8071
10      1   hier_gcn  0.8136
11      2   hier_gcn  0.8249
12      3   hier_gcn  0.8205
13      4   hier_gcn  0.8333
14      5   hier_gcn  0.8322
15      1   bert_clf  0.8261
16      2   bert_clf  0.8225
17      3   bert_clf  0.8261
18      4   bert_clf  0.8296
19      5   bert_clf  0.8543
Friedman Test Result:
          Source      W  ddof1      Q     p-unc
Friedman  Within  0.712      3  10.68  0.013588


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,context vs prompting,0.84404,0.02108,0.77908,0.02104,0.0,0.0625,0.375,False
1,wilcoxon,context vs hier_gcn,0.84404,0.02108,0.8249,0.007361,2.0,0.1875,0.5625,False
2,wilcoxon,context vs bert_clf,0.84404,0.02108,0.83172,0.011511,3.0,0.3125,0.625,False
3,wilcoxon,prompting vs hier_gcn,0.77908,0.02104,0.8249,0.007361,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs bert_clf,0.77908,0.02104,0.83172,0.011511,0.0,0.0625,0.375,False
5,wilcoxon,hier_gcn vs bert_clf,0.8249,0.007361,0.83172,0.011511,3.0,0.3125,0.625,False


### 1000

In [104]:
# ACSA in the low-resource setting '1000' (computePromptStatistics filters
# the result tables on str(args.lr_setting)).
args.lr_setting = 1000
args.task = 'acsa'

stats_acsa['1000'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_acsa['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
88,acsa,bert_clf,GERestaurant,1000,5,2e-05,16,6,0.868,0.8081,0.7667
123,acsa,bert_clf,GERestaurant,1000,2,2e-05,16,6,0.8499,0.8077,0.7389
131,acsa,bert_clf,GERestaurant,1000,4,2e-05,16,6,0.8444,0.8175,0.7308
75,acsa,bert_clf,GERestaurant,1000,3,2e-05,16,6,0.8352,0.8014,0.717
78,acsa,bert_clf,GERestaurant,1000,1,2e-05,16,6,0.8206,0.7892,0.6958
144,acsa,hier_gcn,GERestaurant,1000,3,5e-05,8,43.0,0.8143,,
148,acsa,hier_gcn,GERestaurant,1000,2,5e-05,8,43.0,0.7991,,
175,acsa,hier_gcn,GERestaurant,1000,5,5e-05,8,43.0,0.7982,,
7,acsa,prompting,GERestaurant,1000,5,-,-,-,0.7907,0.7638,0.6538
1,acsa,prompting,GERestaurant,1000,4,-,-,-,0.7839,0.7615,0.6446


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.8314,0.8204,0.7715,0.7523,0.7743,0.8206
2,0.7796,0.8031,0.7996,0.7316,0.7991,0.8499
3,0.8365,0.81,0.7984,0.7629,0.8143,0.8352
4,0.8479,0.7972,0.8268,0.7839,0.7765,0.8444
5,0.7664,0.7579,0.8138,0.7907,0.7982,0.868


Unnamed: 0,W,pval,normal
basic,0.854784,0.210119,True


Unnamed: 0,W,pval,normal
prompting,0.958157,0.795096,True


Unnamed: 0,W,pval,normal
hier_gcn,0.901674,0.419208,True


Unnamed: 0,W,pval,normal
bert_clf,0.994647,0.993171,True


    split     prompt      f1
0       1      basic  0.8314
1       2      basic  0.7796
2       3      basic  0.8365
3       4      basic  0.8479
4       5      basic  0.7664
5       1  prompting  0.7523
6       2  prompting  0.7316
7       3  prompting  0.7629
8       4  prompting  0.7839
9       5  prompting  0.7907
10      1   hier_gcn  0.7743
11      2   hier_gcn  0.7991
12      3   hier_gcn  0.8143
13      4   hier_gcn  0.7765
14      5   hier_gcn  0.7982
15      1   bert_clf  0.8206
16      2   bert_clf  0.8499
17      3   bert_clf  0.8352
18      4   bert_clf  0.8444
19      5   bert_clf  0.8680
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      3     12  8.03827  0.003336  0.624488  0.555584


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.81236,0.032845,0.76428,0.021431,2.550775,0.063253,0.253014,False
1,t-test,basic vs hier_gcn,0.81236,0.032845,0.79248,0.01509,0.977839,0.383521,0.48873,False
2,t-test,basic vs bert_clf,0.81236,0.032845,0.84362,0.015718,-1.363695,0.244365,0.48873,False
3,t-test,prompting vs hier_gcn,0.76428,0.021431,0.79248,0.01509,-2.041277,0.110783,0.332348,False
4,t-test,prompting vs bert_clf,0.76428,0.021431,0.84362,0.015718,-7.839645,0.00143,0.008579,True
5,t-test,hier_gcn vs bert_clf,0.79248,0.01509,0.84362,0.015718,-5.777347,0.004458,0.022289,True


### 500

In [105]:
# ACSA in the low-resource setting '500' (computePromptStatistics filters
# the result tables on str(args.lr_setting)).
args.lr_setting = 500
args.task = 'acsa'

stats_acsa['500'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_acsa['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
61,acsa,bert_clf,GERestaurant,500,5,2e-05,16,13,0.8426,0.7523,0.728
91,acsa,bert_clf,GERestaurant,500,3,2e-05,16,13,0.8154,0.767,0.6883
58,acsa,bert_clf,GERestaurant,500,4,2e-05,16,13,0.8148,0.779,0.6875
47,acsa,prompting,GERestaurant,500,5,-,-,-,0.8145,0.771,0.6871
64,acsa,bert_clf,GERestaurant,500,2,2e-05,16,13,0.8075,0.757,0.6771
80,acsa,bert_clf,GERestaurant,500,1,2e-05,16,13,0.8044,0.7525,0.6728
14,acsa,prompting,GERestaurant,500,1,-,-,-,0.7818,0.7901,0.6418
33,acsa,prompting,GERestaurant,500,3,-,-,-,0.7795,0.7671,0.6387
24,acsa,prompting,GERestaurant,500,4,-,-,-,0.7716,0.7599,0.6281
8,acsa,prompting,GERestaurant,500,2,-,-,-,0.7579,0.7446,0.6102


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.7956,0.825,0.815,0.7818,0.7143,0.8044
2,0.7872,0.7988,0.7771,0.7579,0.7267,0.8075
3,0.7952,0.8188,0.8266,0.7795,0.7491,0.8154
4,0.8316,0.8387,0.8373,0.7716,0.7576,0.8148
5,0.8094,0.7761,0.8129,0.8145,0.7436,0.8426


Unnamed: 0,W,pval,normal
cot,0.908588,0.459153,True


Unnamed: 0,W,pval,normal
prompting,0.919495,0.526689,True


Unnamed: 0,W,pval,normal
hier_gcn,0.951836,0.750298,True


Unnamed: 0,W,pval,normal
bert_clf,0.817185,0.111049,True


    split     prompt      f1
0       1        cot  0.8150
1       2        cot  0.7771
2       3        cot  0.8266
3       4        cot  0.8373
4       5        cot  0.8129
5       1  prompting  0.7818
6       2  prompting  0.7579
7       3  prompting  0.7795
8       4  prompting  0.7716
9       5  prompting  0.8145
10      1   hier_gcn  0.7143
11      2   hier_gcn  0.7267
12      3   hier_gcn  0.7491
13      4   hier_gcn  0.7576
14      5   hier_gcn  0.7436
15      1   bert_clf  0.8044
16      2   bert_clf  0.8075
17      3   bert_clf  0.8154
18      4   bert_clf  0.8148
19      5   bert_clf  0.8426
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  30.252616  0.000007  0.771688  0.582973


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs prompting,0.81378,0.020324,0.78106,0.018693,2.839668,0.046883,0.093767,False
1,t-test,cot vs hier_gcn,0.81378,0.020324,0.73826,0.01567,9.277865,0.000751,0.003753,True
2,t-test,cot vs bert_clf,0.81378,0.020324,0.81694,0.013504,-0.282618,0.791491,0.791491,False
3,t-test,prompting vs hier_gcn,0.78106,0.018693,0.73826,0.01567,3.81485,0.018861,0.056584,False
4,t-test,prompting vs bert_clf,0.78106,0.018693,0.81694,0.013504,-7.333851,0.00184,0.00736,True
5,t-test,hier_gcn vs bert_clf,0.73826,0.01567,0.81694,0.013504,-10.320217,0.000497,0.002984,True


In [138]:
# Low-resource comparison for ACSA. computeLowResourceStatistics is defined
# elsewhere in the notebook; judging by the output below it compares the
# 1000 / 500 / full settings -- TODO confirm against its definition.
args.task = 'acsa'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8314,0.7956,0.8583
2,0.7796,0.7872,0.8255
3,0.8365,0.7952,0.8226
4,0.8479,0.8316,0.8659
5,0.7664,0.8094,0.8332


Unnamed: 0,W,pval,normal
1000,0.854784,0.210119,True


Unnamed: 0,W,pval,normal
500,0.888535,0.349789,True


Unnamed: 0,W,pval,normal
full,0.863788,0.24217,True


    split prompt      f1
0       1   1000  0.8314
1       2   1000  0.7796
2       3   1000  0.8365
3       4   1000  0.8479
4       5   1000  0.7664
5       1    500  0.7956
6       2    500  0.7872
7       3    500  0.7952
8       4    500  0.8316
9       5    500  0.8094
10      1   full  0.8583
11      2   full  0.8255
12      3   full  0.8226
13      4   full  0.8659
14      5   full  0.8332
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  4.868974  0.041376  0.318362  0.651697
Results for LR-Comparison of :  basic


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.81236,0.032845,0.8038,0.015627,0.552925,0.60976,0.60976,False
1,t-test,1000 vs full,0.81236,0.032845,0.8411,0.017657,-2.11876,0.101484,0.202967,False
2,t-test,500 vs full,0.8038,0.015627,0.8411,0.017657,-5.454067,0.005492,0.016475,True


Unnamed: 0,1000,500,full
1,0.8204,0.825,0.8438
2,0.8031,0.7988,0.8093
3,0.81,0.8188,0.8356
4,0.7972,0.8387,0.865
5,0.7579,0.7761,0.8665


Unnamed: 0,W,pval,normal
1000,0.873577,0.281211,True


Unnamed: 0,W,pval,normal
500,0.961278,0.816856,True


Unnamed: 0,W,pval,normal
full,0.915966,0.504258,True


    split prompt      f1
0       1   1000  0.8204
1       2   1000  0.8031
2       3   1000  0.8100
3       4   1000  0.7972
4       5   1000  0.7579
5       1    500  0.8250
6       2    500  0.7988
7       3    500  0.8188
8       4    500  0.8387
9       5    500  0.7761
10      1   full  0.8438
11      2   full  0.8093
12      3   full  0.8356
13      4   full  0.8650
14      5   full  0.8665
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  5.459415  0.031973  0.450861  0.61449
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79772,0.021354,0.81148,0.021862,-1.758984,0.153399,0.201808,False
1,t-test,1000 vs full,0.79772,0.021354,0.84404,0.02108,-2.492982,0.067269,0.201808,False
2,t-test,500 vs full,0.81148,0.021862,0.84404,0.02108,-2.21825,0.090789,0.201808,False


Unnamed: 0,1000,500,full
1,0.7715,0.815,0.851
2,0.7996,0.7771,0.8086
3,0.7984,0.8266,0.8163
4,0.8268,0.8373,0.8191
5,0.8138,0.8129,0.8158


Unnamed: 0,W,pval,normal
1000,0.961658,0.819479,True


Unnamed: 0,W,pval,normal
500,0.908588,0.459153,True


Unnamed: 0,W,pval,normal
full,0.763502,0.039503,False


    split prompt      f1
0       1   1000  0.7715
1       2   1000  0.7996
2       3   1000  0.7984
3       4   1000  0.8268
4       5   1000  0.8138
5       1    500  0.8150
6       2    500  0.7771
7       3    500  0.8266
8       4    500  0.8373
9       5    500  0.8129
10      1   full  0.8510
11      2   full  0.8086
12      3   full  0.8163
13      4   full  0.8191
14      5   full  0.8158
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.16      2  1.6  0.449329
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.80202,0.018457,0.81378,0.020324,4.0,0.4375,0.875,False
1,wilcoxon,1000 vs full,0.80202,0.018457,0.82216,0.014831,2.0,0.1875,0.5625,False
2,wilcoxon,500 vs full,0.81378,0.020324,0.82216,0.014831,5.0,0.625,0.875,False


Unnamed: 0,1000,500,full
1,0.8314,0.815,0.8438
2,0.7796,0.7771,0.8093
3,0.8365,0.8266,0.8356
4,0.8479,0.8373,0.865
5,0.7664,0.8129,0.8665


Unnamed: 0,W,pval,normal
1000,0.854784,0.210119,True


Unnamed: 0,W,pval,normal
500,0.908588,0.459153,True


Unnamed: 0,W,pval,normal
full,0.915966,0.504258,True


    split prompt      f1
0       1   1000  0.8314
1       2   1000  0.7796
2       3   1000  0.8365
3       4   1000  0.8479
4       5   1000  0.7664
5       1    500  0.8150
6       2    500  0.7771
7       3    500  0.8266
8       4    500  0.8373
9       5    500  0.8129
10      1   full  0.8438
11      2   full  0.8093
12      3   full  0.8356
13      4   full  0.8650
14      5   full  0.8665
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.851121  0.067377  0.248554  0.527364
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.81236,0.032845,0.81378,0.020324,-0.123648,0.907558,0.907558,False
1,t-test,1000 vs full,0.81236,0.032845,0.84404,0.02108,-1.780595,0.149581,0.299161,False
2,t-test,500 vs full,0.81378,0.020324,0.84404,0.02108,-4.259877,0.013055,0.039165,True


## E2E

In [139]:
# Collect the metrics of every fine-tuned LLM run found under RESULTS_PATH
# into `results_all`, one row per run folder. The hyper-parameters of a run
# are encoded in its folder name as '_'-separated tokens (see col_names).
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file to read for each task (the task is the first folder-name token).
metric_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    filename = metric_files.get(cond_parameters[0], '')
    if not filename:
        # Unknown task prefix: there is no metrics file to read for it.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # Full configuration string (all folder-name tokens) -> 'path' column.
        model_config_full = cond_parameters.copy()

        # Configuration without the split (index 7) and the epoch (last
        # token) -> 'model_config' column, shared by all splits of a config.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except Exception:
        # Best effort: skip runs whose metrics are missing or malformed, but
        # (unlike a bare `except:`) let KeyboardInterrupt/SystemExit through.
        continue

    runs.append(cond_parameters)

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##
# All baseline loaders below append rows in the `col_names` layout to `runs`.

RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    if cond_parameters[0] != 'e2e':
        # Only E2E prompting runs are relevant for this section.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Rearrange the folder-name tokens into the baseline column layout:
        # swap tokens 2 and 4, drop the two tokens that have no column, then
        # insert the method name after the task.
        cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
        cond_parameters.pop(5)
        cond_parameters.pop(4)
        cond_parameters[1:1] = [METHOD]

        # Prompting involves no training, so learning_rate, batch_size and
        # epochs are filled with a '-' placeholder.
        cond_parameters[5:5] = ['-']
        cond_parameters[6:6] = ['-']
        cond_parameters[7:7] = ['-']

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except Exception:
        # Best effort: skip runs with missing or malformed metrics, but do
        # not swallow KeyboardInterrupt/SystemExit like a bare `except:`.
        continue

    runs.append(cond_parameters)

# InstructABSA
# Each result is a single file (not a folder) whose name, minus the '.tsv'
# suffix, carries the run parameters as '_'-separated tokens.
METHOD = 'instructABSA'

filenames = [file for file in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if file != '.ipynb_checkpoints']

for file in filenames:
    try:
        cond_name = file.split('.tsv')[0]
        cond_parameters = cond_name.split('_')

        # The F1 score is the second tab-separated field of the last line.
        with open(os.path.join(RESULTS_PATH, METHOD, file), 'r') as f:
            f1 = f.readlines()[-1].split('\t')[1]

        # Only one F1 value is available, stored under 'f1-micro';
        # 'f1-macro' and 'accuracy' stay empty.
        cond_parameters.extend([round(float(f1), 4), None, None])
        cond_parameters.insert(0, 'e2e')   # Task
        cond_parameters.insert(1, METHOD)  # Method
        cond_parameters.insert(6, 8)       # Batch Size (not part of the file name)

        # The lr_setting token is used exactly as it appears in the file
        # name; no '0' <-> 'full' mapping is applied here.
    except Exception:
        # Best effort: skip entries that cannot be parsed, but do not
        # swallow KeyboardInterrupt/SystemExit like a bare `except:`.
        continue

    runs.append(cond_parameters)


# TAS-BERT

METHOD = 'tas_bert'
RESULTS_PATH = '../results/'

# Every run lives in its own folder containing a 'results.txt'. Filter on
# directories (as the other folder-based loaders do) so that a stray file in
# the method directory does not abort the cell.
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'results.txt'), 'r') as file:
        lines = file.readlines()

        # The last line holds four tab-separated fields; only the F1 score
        # (the fourth) is used below.
        epoch, p, r, f1 = lines[-1].strip().split('\t')

    cond_parameters = folder_name.split('_')

    # Only one F1 value is available, stored under 'f1-micro';
    # 'f1-macro' and 'accuracy' stay empty.
    cond_parameters.extend([round(float(f1), 4), None, None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    # The lr_setting token is used exactly as it appears in the folder name;
    # no '0' <-> 'full' mapping is applied here.
    runs.append(cond_parameters)


# Hand both result tables to the statistics helpers through `args`.
results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [108]:
# E2E on the full training set: computePromptStatistics maps lr_setting 0
# to the 'full' label before filtering the result tables.
args.lr_setting = 0
args.task = 'e2e'

stats_e2e['0'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_e2e['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
90,e2e,instructABSA,GERestaurant,full,5,5e-05,8,4.0,0.7371,,
105,e2e,tas_bert,GERestaurant,full,3,2e-05,24,30.0,0.7271,,
110,e2e,tas_bert,GERestaurant,full,5,2e-05,24,30.0,0.7253,,
71,e2e,instructABSA,GERestaurant,full,4,5e-05,8,4.0,0.7217,,
99,e2e,tas_bert,GERestaurant,full,4,2e-05,24,30.0,0.7163,,
91,e2e,instructABSA,GERestaurant,full,2,5e-05,8,4.0,0.713,,
11,e2e,prompting,GERestaurant,full,4,-,-,-,0.7122,0.6353,0.5531
68,e2e,instructABSA,GERestaurant,full,1,5e-05,8,4.0,0.7071,,
94,e2e,instructABSA,GERestaurant,full,3,5e-05,8,4.0,0.696,,
126,e2e,tas_bert,GERestaurant,full,2,2e-05,24,30.0,0.6944,,


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7923,0.7876,0.7666,0.6366,0.7071,0.6896
2,0.7865,0.75,0.7546,0.6425,0.713,0.6944
3,0.8,0.7861,0.744,0.6861,0.696,0.7271
4,0.8335,0.8187,0.7875,0.7122,0.7217,0.7163
5,0.8166,0.8004,0.7634,0.6813,0.7371,0.7253


Unnamed: 0,W,pval,normal
basic,0.935021,0.630989,True


Unnamed: 0,W,pval,normal
prompting,0.917649,0.514885,True


Unnamed: 0,W,pval,normal
instructABSA,0.990039,0.979831,True


Unnamed: 0,W,pval,normal
tas_bert,0.856518,0.216014,True


    split        prompt      f1
0       1         basic  0.7923
1       2         basic  0.7865
2       3         basic  0.8000
3       4         basic  0.8335
4       5         basic  0.8166
5       1     prompting  0.6366
6       2     prompting  0.6425
7       3     prompting  0.6861
8       4     prompting  0.7122
9       5     prompting  0.6813
10      1  instructABSA  0.7071
11      2  instructABSA  0.7130
12      3  instructABSA  0.6960
13      4  instructABSA  0.7217
14      5  instructABSA  0.7371
15      1      tas_bert  0.6896
16      2      tas_bert  0.6944
17      3      tas_bert  0.7271
18      4      tas_bert  0.7163
19      5      tas_bert  0.7253
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc       ng2       eps
0  prompt      3     12  73.908228  5.252187e-08  0.862895  0.547476


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.80578,0.01716,0.67174,0.02837,17.770625,5.9e-05,0.000353,True
1,t-test,basic vs instructABSA,0.80578,0.01716,0.71498,0.013862,12.393205,0.000244,0.00099,True
2,t-test,basic vs tas_bert,0.80578,0.01716,0.71054,0.015648,13.06773,0.000198,0.00099,True
3,t-test,prompting vs instructABSA,0.67174,0.02837,0.71498,0.013862,-3.0989,0.03626,0.07252,False
4,t-test,prompting vs tas_bert,0.67174,0.02837,0.71054,0.015648,-4.325527,0.012393,0.03718,True
5,t-test,instructABSA vs tas_bert,0.71498,0.013862,0.71054,0.015648,0.483115,0.654266,0.654266,False


### 1000

In [111]:
# E2E in the low-resource setting '1000' (computePromptStatistics filters
# the result tables on str(args.lr_setting)).
args.lr_setting = 1000
args.task = 'e2e'

stats_e2e['1000'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_e2e['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
79,e2e,instructABSA,GERestaurant,1000,4,5e-05,8,9,0.7133,,
81,e2e,instructABSA,GERestaurant,1000,5,5e-05,8,9,0.7079,,
66,e2e,instructABSA,GERestaurant,1000,3,5e-05,8,9,0.7004,,
93,e2e,instructABSA,GERestaurant,1000,1,5e-05,8,9,0.6889,,
84,e2e,instructABSA,GERestaurant,1000,2,5e-05,8,9,0.6877,,
6,e2e,prompting,GERestaurant,1000,3,-,-,-,0.6766,0.5996,0.5112
120,e2e,tas_bert,GERestaurant,1000,4,2e-05,24,13.0,0.6725,,
7,e2e,prompting,GERestaurant,1000,4,-,-,-,0.6718,0.5797,0.5058
107,e2e,tas_bert,GERestaurant,1000,2,2e-05,24,13.0,0.6716,,
111,e2e,tas_bert,GERestaurant,1000,3,2e-05,24,13.0,0.6708,,


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7992,0.7785,0.7249,0.6473,0.6889,0.6555
2,0.7455,0.7703,0.7212,0.6411,0.6877,0.6716
3,0.7953,0.7758,0.7616,0.6766,0.7004,0.6708
4,0.8136,0.8069,0.758,0.6718,0.7133,0.6725
5,0.7681,0.7893,0.7609,0.6634,0.7079,0.6624


Unnamed: 0,W,pval,normal
basic,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
prompting,0.921544,0.53995,True


Unnamed: 0,W,pval,normal
instructABSA,0.908972,0.461436,True


Unnamed: 0,W,pval,normal
tas_bert,0.83729,0.157568,True


    split        prompt      f1
0       1         basic  0.7992
1       2         basic  0.7455
2       3         basic  0.7953
3       4         basic  0.8136
4       5         basic  0.7681
5       1     prompting  0.6473
6       2     prompting  0.6411
7       3     prompting  0.6766
8       4     prompting  0.6718
9       5     prompting  0.6634
10      1  instructABSA  0.6889
11      2  instructABSA  0.6877
12      3  instructABSA  0.7004
13      4  instructABSA  0.7133
14      5  instructABSA  0.7079
15      1      tas_bert  0.6555
16      2      tas_bert  0.6716
17      3      tas_bert  0.6708
18      4      tas_bert  0.6725
19      5      tas_bert  0.6624
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc       ng2       eps
0  prompt      3     12  85.192453  2.339554e-08  0.913402  0.488632


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.78434,0.02437,0.66004,0.013747,12.822135,0.000213,0.00128,True
1,t-test,basic vs instructABSA,0.78434,0.02437,0.69964,0.010132,7.853001,0.001421,0.004262,True
2,t-test,basic vs tas_bert,0.78434,0.02437,0.66656,0.006607,9.12474,0.0008,0.003201,True
3,t-test,prompting vs instructABSA,0.66004,0.013747,0.69964,0.010132,-9.745008,0.000621,0.003105,True
4,t-test,prompting vs tas_bert,0.66004,0.013747,0.66656,0.006607,-1.018211,0.366154,0.366154,False
5,t-test,instructABSA vs tas_bert,0.69964,0.010132,0.66656,0.006607,6.522815,0.002853,0.005705,True


### 500

In [112]:
# E2E in the low-resource setting '500' (computePromptStatistics filters
# the result tables on str(args.lr_setting)).
args.lr_setting = 500
args.task = 'e2e'

stats_e2e['500'] = computePromptStatistics(args)
# Trailing expression so the notebook renders the stored result.
stats_e2e['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
34,e2e,prompting,GERestaurant,500,5,-,-,-,0.6954,0.6877,0.533
69,e2e,instructABSA,GERestaurant,500,3,5e-05,8,17,0.6898,,
64,e2e,instructABSA,GERestaurant,500,4,5e-05,8,17,0.6834,,
82,e2e,instructABSA,GERestaurant,500,1,5e-05,8,17,0.6818,,
12,e2e,prompting,GERestaurant,500,3,-,-,-,0.6784,0.5653,0.5133
86,e2e,instructABSA,GERestaurant,500,5,5e-05,8,17,0.6722,,
62,e2e,instructABSA,GERestaurant,500,2,5e-05,8,17,0.6699,,
47,e2e,prompting,GERestaurant,500,2,-,-,-,0.6618,0.6027,0.4945
28,e2e,prompting,GERestaurant,500,1,-,-,-,0.6546,0.563,0.4865
15,e2e,prompting,GERestaurant,500,4,-,-,-,0.649,0.5372,0.4803


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7458,0.7271,0.67,0.6546,0.6818,0.6072
2,0.7606,0.7338,0.718,0.6618,0.6699,0.5978
3,0.6998,0.6775,0.6722,0.6784,0.6898,0.6164
4,0.7563,0.7069,0.7063,0.649,0.6834,0.6131
5,0.7301,0.7094,0.7091,0.6954,0.6722,0.6087


Unnamed: 0,W,pval,normal
basic,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
prompting,0.930845,0.602146,True


Unnamed: 0,W,pval,normal
instructABSA,0.932879,0.616134,True


Unnamed: 0,W,pval,normal
tas_bert,0.951778,0.749884,True


    split        prompt      f1
0       1         basic  0.7458
1       2         basic  0.7606
2       3         basic  0.6998
3       4         basic  0.7563
4       5         basic  0.7301
5       1     prompting  0.6546
6       2     prompting  0.6618
7       3     prompting  0.6784
8       4     prompting  0.6490
9       5     prompting  0.6954
10      1  instructABSA  0.6818
11      2  instructABSA  0.6699
12      3  instructABSA  0.6898
13      4  instructABSA  0.6834
14      5  instructABSA  0.6722
15      1      tas_bert  0.6072
16      2      tas_bert  0.5978
17      3      tas_bert  0.6164
18      4      tas_bert  0.6131
19      5      tas_bert  0.6087
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  40.000103  0.000002  0.907431  0.505115


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.73852,0.022031,0.66784,0.016958,3.990003,0.016265,0.035357,True
1,t-test,basic vs instructABSA,0.73852,0.022031,0.67942,0.007376,4.38968,0.011786,0.035357,True
2,t-test,basic vs tas_bert,0.73852,0.022031,0.60864,0.00632,9.721971,0.000627,0.003134,True
3,t-test,prompting vs instructABSA,0.66784,0.016958,0.67942,0.007376,-1.16192,0.309873,0.309873,False
4,t-test,prompting vs tas_bert,0.66784,0.016958,0.60864,0.00632,6.906967,0.002305,0.009219,True
5,t-test,instructABSA vs tas_bert,0.67942,0.007376,0.60864,0.00632,36.207564,3e-06,2.1e-05,True


In [140]:
# Compare the E2E results across the lr settings.
# NOTE(review): computeLowResourceStatistics is defined outside this excerpt;
# judging by the cell output it compares the 1000 / 500 / full settings per
# prompt and for the best prompt per setting — confirm against its definition.
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7992,0.7458,0.7923
2,0.7455,0.7606,0.7865
3,0.7953,0.6998,0.8
4,0.8136,0.7563,0.8335
5,0.7681,0.7301,0.8166


Unnamed: 0,W,pval,normal
1000,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
500,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
full,0.935021,0.630989,True


    split prompt      f1
0       1   1000  0.7992
1       2   1000  0.7455
2       3   1000  0.7953
3       4   1000  0.8136
4       5   1000  0.7681
5       1    500  0.7458
6       2    500  0.7606
7       3    500  0.6998
8       4    500  0.7563
9       5    500  0.7301
10      1   full  0.7923
11      2   full  0.7865
12      3   full  0.8000
13      4   full  0.8335
14      5   full  0.8166
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  11.492874  0.004443  0.632182  0.733527
Results for LR-Comparison of :  basic


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78434,0.02437,0.73852,0.022031,2.555484,0.062938,0.125877,False
1,t-test,1000 vs full,0.78434,0.02437,0.80578,0.01716,-2.043949,0.110447,0.125877,False
2,t-test,500 vs full,0.73852,0.022031,0.80578,0.01716,-4.947962,0.007772,0.023317,True


Unnamed: 0,1000,500,full
1,0.7785,0.7271,0.7876
2,0.7703,0.7338,0.75
3,0.7758,0.6775,0.7861
4,0.8069,0.7069,0.8187
5,0.7893,0.7094,0.8004


Unnamed: 0,W,pval,normal
1000,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
500,0.93137,0.605745,True


Unnamed: 0,W,pval,normal
full,0.948692,0.727844,True


    split prompt      f1
0       1   1000  0.7785
1       2   1000  0.7703
2       3   1000  0.7758
3       4   1000  0.8069
4       5   1000  0.7893
5       1    500  0.7271
6       2    500  0.7338
7       3    500  0.6775
8       4    500  0.7069
9       5    500  0.7094
10      1   full  0.7876
11      2   full  0.7500
12      3   full  0.7861
13      4   full  0.8187
14      5   full  0.8004
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  22.029879  0.000558  0.781953  0.530777
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78416,0.012943,0.71094,0.019599,5.774789,0.004465,0.013395,True
1,t-test,1000 vs full,0.78416,0.012943,0.78856,0.02255,-0.710676,0.516527,0.516527,False
2,t-test,500 vs full,0.71094,0.019599,0.78856,0.02255,-4.348419,0.012172,0.024344,True


Unnamed: 0,1000,500,full
1,0.7249,0.67,0.7666
2,0.7212,0.718,0.7546
3,0.7616,0.6722,0.744
4,0.758,0.7063,0.7875
5,0.7609,0.7091,0.7634


Unnamed: 0,W,pval,normal
1000,0.75758,0.034931,False


Unnamed: 0,W,pval,normal
500,0.83257,0.145409,True


Unnamed: 0,W,pval,normal
full,0.968478,0.865396,True


    split prompt      f1
0       1   1000  0.7249
1       2   1000  0.7212
2       3   1000  0.7616
3       4   1000  0.7580
4       5   1000  0.7609
5       1    500  0.6700
6       2    500  0.7180
7       3    500  0.6722
8       4    500  0.7063
9       5    500  0.7091
10      1   full  0.7666
11      2   full  0.7546
12      3   full  0.7440
13      4   full  0.7875
14      5   full  0.7634
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.74532,0.018261,0.69512,0.020001,0.0,0.0625,0.1875,False
1,wilcoxon,1000 vs full,0.74532,0.018261,0.76322,0.014455,2.0,0.1875,0.1875,False
2,wilcoxon,500 vs full,0.69512,0.020001,0.76322,0.014455,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7992,0.7458,0.7923
2,0.7455,0.7606,0.7865
3,0.7953,0.6998,0.8
4,0.8136,0.7563,0.8335
5,0.7681,0.7301,0.8166


Unnamed: 0,W,pval,normal
1000,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
500,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
full,0.935021,0.630989,True


    split prompt      f1
0       1   1000  0.7992
1       2   1000  0.7455
2       3   1000  0.7953
3       4   1000  0.8136
4       5   1000  0.7681
5       1    500  0.7458
6       2    500  0.7606
7       3    500  0.6998
8       4    500  0.7563
9       5    500  0.7301
10      1   full  0.7923
11      2   full  0.7865
12      3   full  0.8000
13      4   full  0.8335
14      5   full  0.8166
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  11.492874  0.004443  0.632182  0.733527
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78434,0.02437,0.73852,0.022031,2.555484,0.062938,0.125877,False
1,t-test,1000 vs full,0.78434,0.02437,0.80578,0.01716,-2.043949,0.110447,0.125877,False
2,t-test,500 vs full,0.73852,0.022031,0.80578,0.01716,-4.947962,0.007772,0.023317,True


## E2E - without Implicit

In [31]:
# Load the fine-tuned LLM runs stored under RESULTS_PATH into `results_all`.
# Every sub-folder is one run: its name is an underscore-separated list of
# hyperparameters and it is expected to contain a 'metrics_pol.tsv' file with
# 'Micro-AVG' / 'Macro-AVG' rows.
runs = []
RESULTS_PATH = '../results_final/filtered/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

# Runs that could not be read are collected here and reported below, so that
# incomplete or malformed runs do not silently vanish from the statistics.
skipped_folders = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        # 'path' keeps the complete folder name; 'model_config' is the same
        # string without the split (position 10) and the trailing epoch, so
        # that runs differing only in split/epoch share one config id.
        model_config_full = cond_parameters.copy()
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except (OSError, ValueError, KeyError, IndexError) as err:
        # Missing/unreadable metrics file, unparsable TSV, missing row/column
        # or a folder name with too few components: skip this run only.
        skipped_folders.append((folder_name, repr(err)))
    else:
        runs.append(cond_parameters)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} of {len(folder_names)} folders in {RESULTS_PATH}:')
    for name, reason in skipped_folders:
        print(f'  {name}: {reason}')

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
# Each '*.tsv' file in RESULTS_PATH holds one run as tab-separated
# "<metric name>\t<value>" lines; the file name (without '.tsv') is an
# underscore-separated list of run parameters.
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_filtered'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
runs = []

# Same selection as before: every entry whose name contains '.tsv'
# ('.ipynb_checkpoints' does not, so it needs no separate exclusion).
file_names = [entry for entry in os.listdir(RESULTS_PATH) if '.tsv' in entry]

for file_name in file_names:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r', encoding = 'utf-8') as metrics_file:
        for line in metrics_file:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on the two-value unpacking below.
            if not line:
                continue
            # Split the line on the tab separating metric name and value
            key, value = line.split('\t')
            # Convert the value to a float and store it in the dictionary
            metrics_dict[key.strip()] = float(value.strip())

    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')

    # Only the F1 score is available for this baseline; f1-macro and accuracy
    # stay empty.
    cond_parameters.append(metrics_dict['F1-Score'])
    cond_parameters.extend([None, None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size

    # NOTE(review): the full-data setting is stored as '0' here, while
    # computePromptStatistics at the top of this notebook looks for 'full'
    # in a column named 'lr_setting' — confirm which convention is intended.
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# TAS-BERT
# Every run is a sub-folder of RESULTS_PATH named with underscore-separated
# run parameters; the last line of its 'results.txt' holds four tab-separated
# fields of which the last one is used as the F1 score.
# The rows are appended to the `runs` list started by the preceding baseline.

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results_filtered/GERestaurant/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Only directories are runs; like the other loaders, ignore stray files and
# the notebook checkpoint folder instead of failing when opening them.
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r', encoding = 'utf-8') as results_file:
        lines = results_file.readlines()

    # The unpacking still requires exactly four fields on the last line.
    _epoch, _precision, _recall, f1 = lines[-1].strip().split('\t')

    cond_parameters = folder_name.split('_')

    # Only F1 is reported for this baseline; f1-macro and accuracy stay empty.
    cond_parameters.append(float(f1))
    cond_parameters.extend([None, None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# NOTE(review): this frame uses the column name 'lr-setting' and the value '0'
# for the full dataset, whereas computePromptStatistics at the top of this
# notebook filters on 'lr_setting' == 'full' — confirm the two are in sync
# before re-running this section.
results_baseline = pd.DataFrame(runs, columns = col_names)

args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [32]:
# Prompt/baseline comparison for the E2E task on the full dataset
# (lr_setting 0 is the notebook's marker for "no low-resource subset").
# NOTE(review): computePromptStatistics as defined at the top of this notebook
# maps 0 to 'full'; the baseline frame built for this section stores the full
# setting as '0' — confirm they match before re-running.
args.lr_setting = 0
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
54,e2e,tas-bert,GERestaurant,0,3,2e-05,24,25.0,0.7708,,
40,e2e,tas-bert,GERestaurant,0,5,2e-05,24,25.0,0.7427,,
51,e2e,tas-bert,GERestaurant,0,1,2e-05,24,25.0,0.7346,,
48,e2e,tas-bert,GERestaurant,0,2,2e-05,24,25.0,0.7211,,
57,e2e,tas-bert,GERestaurant,0,4,2e-05,24,25.0,0.6926,,
38,e2e,instructAbsa,GERestaurant,0,3,5e-05,8,4.0,0.623529,,
35,e2e,instructAbsa,GERestaurant,0,2,5e-05,8,4.0,0.6141,,
34,e2e,instructAbsa,GERestaurant,0,5,5e-05,8,4.0,0.601457,,
15,e2e,instructAbsa,GERestaurant,0,4,5e-05,8,4.0,0.592284,,
12,e2e,instructAbsa,GERestaurant,0,1,5e-05,8,4.0,0.570265,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8237,0.8284,0.8173,0.570265,0.7346
2,0.7497,0.6943,0.7918,0.6141,0.7211
3,0.8431,0.8365,0.852,0.623529,0.7708
4,0.7857,0.8154,0.7652,0.592284,0.6926
5,0.8039,0.7256,0.8055,0.601457,0.7427


Unnamed: 0,W,pval,normal
cot,0.991887,0.985867,True


Unnamed: 0,W,pval,normal
instructAbsa,0.974128,0.901028,True


Unnamed: 0,W,pval,normal
tas-bert,0.991894,0.98589,True


    split        prompt        f1
0       1           cot  0.817300
1       2           cot  0.791800
2       3           cot  0.852000
3       4           cot  0.765200
4       5           cot  0.805500
5       1  instructAbsa  0.570265
6       2  instructAbsa  0.614100
7       3  instructAbsa  0.623529
8       4  instructAbsa  0.592284
9       5  instructAbsa  0.601457
10      1      tas-bert  0.734600
11      2      tas-bert  0.721100
12      3      tas-bert  0.770800
13      4      tas-bert  0.692600
14      5      tas-bert  0.742700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      2      8  177.919903  2.337325e-07  0.922866  0.537248


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs instructAbsa,0.80636,0.028669,0.600327,0.018427,14.410956,0.000135,0.00027,True
1,t-test,cot vs tas-bert,0.80636,0.028669,0.73236,0.02568,20.297983,3.5e-05,0.000104,True
2,t-test,instructAbsa vs tas-bert,0.600327,0.018427,0.73236,0.02568,-10.79296,0.000418,0.000418,True


### 1000

In [33]:
# Prompt/baseline comparison for the E2E task with lr_setting 1000;
# the result is only displayed, not stored.
args.lr_setting = 1000
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
46,e2e,tas-bert,GERestaurant,1000,3,2e-05,24,25.0,0.7644,,
44,e2e,tas-bert,GERestaurant,1000,2,2e-05,24,25.0,0.7173,,
47,e2e,tas-bert,GERestaurant,1000,5,2e-05,24,25.0,0.7087,,
42,e2e,tas-bert,GERestaurant,1000,1,2e-05,24,25.0,0.7064,,
52,e2e,tas-bert,GERestaurant,1000,4,2e-05,24,25.0,0.6827,,
25,e2e,instructAbsa,GERestaurant,1000,5,5e-05,8,9.0,0.67364,,
28,e2e,instructAbsa,GERestaurant,1000,2,5e-05,8,9.0,0.656834,,
10,e2e,instructAbsa,GERestaurant,1000,3,5e-05,8,9.0,0.630435,,
37,e2e,instructAbsa,GERestaurant,1000,1,5e-05,8,9.0,0.623053,,
23,e2e,instructAbsa,GERestaurant,1000,4,5e-05,8,9.0,0.617464,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.804,0.8152,0.8085,0.623053,0.7064
2,0.8161,0.6942,0.8241,0.656834,0.7173
3,0.8259,0.8583,0.8108,0.630435,0.7644
4,0.7888,0.7506,0.7668,0.617464,0.6827
5,0.7688,0.7923,0.7867,0.67364,0.7087


Unnamed: 0,W,pval,normal
short,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
instructAbsa,0.899257,0.405792,True


Unnamed: 0,W,pval,normal
tas-bert,0.900709,0.413814,True


    split        prompt        f1
0       1         short  0.804000
1       2         short  0.816100
2       3         short  0.825900
3       4         short  0.788800
4       5         short  0.768800
5       1  instructAbsa  0.623053
6       2  instructAbsa  0.656834
7       3  instructAbsa  0.630435
8       4  instructAbsa  0.617464
9       5  instructAbsa  0.673640
10      1      tas-bert  0.706400
11      2      tas-bert  0.717300
12      3      tas-bert  0.764400
13      4      tas-bert  0.682700
14      5      tas-bert  0.708700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  57.508952  0.000018  0.890244  0.757303


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.80072,0.02021,0.640285,0.021453,9.240477,0.000762,0.002287,True
1,t-test,short vs tas-bert,0.80072,0.02021,0.7159,0.026827,8.553924,0.001025,0.002287,True
2,t-test,instructAbsa vs tas-bert,0.640285,0.021453,0.7159,0.026827,-4.58211,0.010167,0.010167,True


### 500

In [34]:
# Prompt/baseline comparison for the E2E task with lr_setting 500;
# the result is only displayed, not stored.
args.lr_setting = 500
args.task = 'e2e'

computePromptStatistics(args)

Unnamed: 0,task,method,dataset,lr-setting,split,learning-rate,batch_size,epochs,f1-micro,f1-macro,accuracy
53,e2e,tas-bert,GERestaurant,500,3,2e-05,24,19.0,0.7168,,
49,e2e,tas-bert,GERestaurant,500,1,2e-05,24,19.0,0.6741,,
55,e2e,tas-bert,GERestaurant,500,5,2e-05,24,19.0,0.6667,,
43,e2e,tas-bert,GERestaurant,500,2,2e-05,24,19.0,0.6628,,
45,e2e,tas-bert,GERestaurant,500,4,2e-05,24,19.0,0.6344,,
13,e2e,instructAbsa,GERestaurant,500,3,5e-05,8,17.0,0.618537,,
6,e2e,instructAbsa,GERestaurant,500,2,5e-05,8,17.0,0.616132,,
8,e2e,instructAbsa,GERestaurant,500,4,5e-05,8,17.0,0.609582,,
30,e2e,instructAbsa,GERestaurant,500,5,5e-05,8,17.0,0.593588,,
26,e2e,instructAbsa,GERestaurant,500,1,5e-05,8,17.0,0.567318,,


Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.7968,0.7556,0.7626,0.567318,0.6741
2,0.7248,0.7984,0.7737,0.616132,0.6628
3,0.8251,0.8159,0.7947,0.618537,0.7168
4,0.7984,0.7291,0.7694,0.609582,0.6344
5,0.7431,0.7102,0.7796,0.593588,0.6667


Unnamed: 0,W,pval,normal
short,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
instructAbsa,0.866267,0.251635,True


Unnamed: 0,W,pval,normal
tas-bert,0.935736,0.635974,True


    split        prompt        f1
0       1         short  0.796800
1       2         short  0.724800
2       3         short  0.825100
3       4         short  0.798400
4       5         short  0.743100
5       1  instructAbsa  0.567318
6       2  instructAbsa  0.616132
7       3  instructAbsa  0.618537
8       4  instructAbsa  0.609582
9       5  instructAbsa  0.593588
10      1      tas-bert  0.674100
11      2      tas-bert  0.662800
12      3      tas-bert  0.716800
13      4      tas-bert  0.634400
14      5      tas-bert  0.666700
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  46.536437  0.000039  0.864789  0.873377


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,short vs instructAbsa,0.77764,0.037512,0.601031,0.018972,8.237915,0.001184,0.003552,True
1,t-test,short vs tas-bert,0.77764,0.037512,0.67096,0.02658,5.93674,0.004036,0.008072,True
2,t-test,instructAbsa vs tas-bert,0.601031,0.018972,0.67096,0.02658,-4.538892,0.010506,0.010506,True


In [35]:
# Compare the E2E results of this section across the lr settings.
# NOTE(review): computeLowResourceStatistics is defined outside this excerpt;
# confirm its exact behaviour against its definition.
args.task = 'e2e'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8237
2,0.8161,0.7248,0.7497
3,0.8259,0.8251,0.8431
4,0.7888,0.7984,0.7857
5,0.7688,0.7431,0.8039


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.983488,0.952373,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8237
11      2   full  0.7497
12      3   full  0.8431
13      4   full  0.7857
14      5   full  0.8039


  W = np.prod(eig) / (eig.sum() / d) ** d


Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.400032  0.301061  0.113048  0.818557
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.537901,False
1,t-test,1000 vs full,0.80072,0.02021,0.80122,0.03213,-0.028098,0.97893,0.97893,False
2,t-test,500 vs full,0.77764,0.037512,0.80122,0.03213,-2.012568,0.114463,0.34339,False


Unnamed: 0,1000,500,full
1,0.8152,0.7556,0.8284
2,0.6942,0.7984,0.6943
3,0.8583,0.8159,0.8365
4,0.7506,0.7291,0.8154
5,0.7923,0.7102,0.7256


Unnamed: 0,W,pval,normal
1000,0.988245,0.973192,True


Unnamed: 0,W,pval,normal
500,0.941347,0.675478,True


Unnamed: 0,W,pval,normal
full,0.833036,0.146574,True


    split prompt      f1
0       1   1000  0.8152
1       2   1000  0.6942
2       3   1000  0.8583
3       4   1000  0.7506
4       5   1000  0.7923
5       1    500  0.7556
6       2    500  0.7984
7       3    500  0.8159
8       4    500  0.7291
9       5    500  0.7102
10      1   full  0.8284
11      2   full  0.6943
12      3   full  0.8365
13      4   full  0.8154
14      5   full  0.7256
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  0.280667  0.76242  0.029568  0.813498
Results for LR-Comparison of :  long


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.78212,0.05608,0.76184,0.040092,0.620648,0.568446,1.0,False
1,t-test,1000 vs full,0.78212,0.05608,0.78004,0.058467,0.096583,0.927703,1.0,False
2,t-test,500 vs full,0.76184,0.040092,0.78004,0.058467,-0.541541,0.616888,1.0,False


Unnamed: 0,1000,500,full
1,0.8085,0.7626,0.8173
2,0.8241,0.7737,0.7918
3,0.8108,0.7947,0.852
4,0.7668,0.7694,0.7652
5,0.7867,0.7796,0.8055


Unnamed: 0,W,pval,normal
1000,0.941549,0.676906,True


Unnamed: 0,W,pval,normal
500,0.958088,0.794608,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8085
1       2   1000  0.8241
2       3   1000  0.8108
3       4   1000  0.7668
4       5   1000  0.7867
5       1    500  0.7626
6       2    500  0.7737
7       3    500  0.7947
8       4    500  0.7694
9       5    500  0.7796
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  3.868317  0.06679  0.272581  0.975992
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79938,0.020233,0.776,0.010873,2.214037,0.091215,0.182431,False
1,t-test,1000 vs full,0.79938,0.020233,0.80636,0.028669,-0.576626,0.595087,0.595087,False
2,t-test,500 vs full,0.776,0.010873,0.80636,0.028669,-2.621339,0.058717,0.17615,False


Unnamed: 0,1000,500,full
1,0.804,0.7968,0.8173
2,0.8161,0.7248,0.7918
3,0.8259,0.8251,0.852
4,0.7888,0.7984,0.7652
5,0.7688,0.7431,0.8055


Unnamed: 0,W,pval,normal
1000,0.971359,0.883902,True


Unnamed: 0,W,pval,normal
500,0.909897,0.466973,True


Unnamed: 0,W,pval,normal
full,0.991887,0.985867,True


    split prompt      f1
0       1   1000  0.8040
1       2   1000  0.8161
2       3   1000  0.8259
3       4   1000  0.7888
4       5   1000  0.7688
5       1    500  0.7968
6       2    500  0.7248
7       3    500  0.8251
8       4    500  0.7984
9       5    500  0.7431
10      1   full  0.8173
11      2   full  0.7918
12      3   full  0.8520
13      4   full  0.7652
14      5   full  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.717831  0.239505  0.149361  0.857445
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80072,0.02021,0.77764,0.037512,1.282481,0.268951,0.559451,False
1,t-test,1000 vs full,0.80072,0.02021,0.80636,0.028669,-0.446342,0.678447,0.678447,False
2,t-test,500 vs full,0.77764,0.037512,0.80636,0.028669,-1.592527,0.186484,0.559451,False


## TASD

In [142]:
# Load the fine-tuned LLM runs stored under RESULTS_PATH into `results_all`.
# Every sub-folder is one run named
# "<task>_<dataset>_<prompt>_..._<split>_<lr_setting>_<epoch>" (see col_names);
# the metrics file that is read depends on the task.
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

# Metrics file evaluated for each task (first component of the folder name).
metric_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

# Runs that could not be read are collected here and reported below, so that
# incomplete or malformed runs do not silently vanish from the statistics.
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    try:
        # Folders of an unknown task raise KeyError and are skipped, as before.
        filename = metric_files[cond_parameters[0]]

        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # 'path' keeps the complete folder name; 'model_config' is the same
        # string without the split (position 7) and the trailing epoch, so
        # that runs differing only in split/epoch share one config id.
        model_config_full = cond_parameters.copy()
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except (OSError, ValueError, KeyError, IndexError) as err:
        # Unknown task, missing/unreadable metrics file, unparsable TSV,
        # missing row/column or too few name components: skip this run only.
        skipped_folders.append((folder_name, repr(err)))
    else:
        runs.append(cond_parameters)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} of {len(folder_names)} folders in {RESULTS_PATH}:')
    for name, reason in skipped_folders:
        print(f'  {name}: {reason}')

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
# Rows of all baselines of this section are collected in this one list.
runs = []

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

# Runs that could not be read are collected here and reported below instead
# of being dropped silently.
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    # Only the TASD prompting runs are used in this section.
    if cond_parameters[0] != 'tasd':
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Rearrange the folder-name components into the col_names layout:
        # swap positions 2 and 4, drop the two components that follow the
        # first four, then insert the method name.
        cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
        cond_parameters.pop(5)
        cond_parameters.pop(4)
        cond_parameters[1:1] = [METHOD]
        # Prompting involves no training, so learning rate, batch size and
        # epochs are filled with a '-' placeholder.
        cond_parameters[5:5] = ['-']
        cond_parameters[6:6] = ['-']
        cond_parameters[7:7] = ['-']
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except (OSError, ValueError, KeyError, IndexError) as err:
        # Missing/unreadable metrics file, unparsable TSV, missing row/column
        # or a folder name with too few components: skip this run only.
        skipped_folders.append((folder_name, repr(err)))
    else:
        runs.append(cond_parameters)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} {METHOD} folders:')
    for name, reason in skipped_folders:
        print(f'  {name}: {reason}')

# Paraphrase Generation
# Every sub-folder of RESULTS_PATH/para is one run; its underscore-separated
# name supplies the run parameters and 'metrics_phrases.tsv' the scores.
# The rows are appended to the `runs` list shared by the baselines.
METHOD = 'para'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

# Runs that could not be read are collected here and reported below instead
# of being dropped silently.
skipped_folders = []

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except (OSError, ValueError, KeyError, IndexError) as err:
        # Missing/unreadable metrics file, unparsable TSV or missing
        # row/column: skip this run only.
        skipped_folders.append((folder_name, repr(err)))
        continue

    cond_parameters[1:1] = [METHOD]
    # Normalise the folder-name conventions to the ones used in this section:
    # the full dataset is called 'full' and the task 'acsd' is called 'tasd'.
    if cond_parameters[3] == '0':
        cond_parameters[3] = 'full'
    if cond_parameters[0] == 'acsd':
        cond_parameters[0] = 'tasd'
    runs.append(cond_parameters)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} {METHOD} folders:')
    for name, reason in skipped_folders:
        print(f'  {name}: {reason}')

# MVP
# One sub-folder of RESULTS_PATH/mvp per run. The score is the second
# space-separated token on the last line of 'result.txt' and is divided by
# 100 below; the folder name supplies the remaining run parameters.
METHOD = 'mvp'

method_dir = os.path.join(RESULTS_PATH, METHOD)
folder_names = [
    folder
    for folder in os.listdir(method_dir)
    if folder != '.ipynb_checkpoints' and os.path.isdir(os.path.join(method_dir, folder))
]

for folder_name in folder_names:

    with open(os.path.join(method_dir, folder_name, 'result.txt'), 'r') as file:
        last_line = file.readlines()[-1]
    f1 = last_line.split(' ')[1]

    cond_name = folder_name.split('/')[-1]
    cond_parameters = cond_name.split('_')

    # Only F1 is available for this baseline; f1-macro and accuracy stay empty.
    cond_parameters += [round(float(f1) / 100, 4), None, None]
    cond_parameters.insert(1, METHOD)
    # Reorder the name components into the col_names layout.
    cond_parameters[0], cond_parameters[2] = cond_parameters[2], cond_parameters[0]
    cond_parameters[3], cond_parameters[4] = cond_parameters[4], cond_parameters[3]
    # Learning rate and batch size are not part of the folder name; they are
    # filled in here (batch size 16 for the full dataset, 8 otherwise).
    cond_parameters.insert(5, 1e-4)
    batch_size = 16 if cond_parameters[3] == 'full' else 8
    cond_parameters.insert(6, batch_size)

    runs.append(cond_parameters)


results_baseline = pd.DataFrame(runs, columns = col_names)

# Hand both result frames to the statistics functions via the shared namespace.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [115]:
# Prompt/baseline comparison for the TASD task on the full dataset:
# computePromptStatistics maps lr_setting 0 to the label 'full'.
args.lr_setting = 0
args.task = 'tasd'

# Keep the returned statistics under the key '0'.
stats_tasd['0'] = computePromptStatistics(args)
# Bare expression as the last statement of the cell so the notebook renders it.
stats_tasd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
67,tasd,para,GERestaurant,full,4,0.0003,16,20,0.7342,0.7304,0.58
116,tasd,mvp,GERestaurant,full,3,0.0001,16,20,0.7107,,
68,tasd,para,GERestaurant,full,5,0.0003,16,20,0.7043,0.6538,0.5436
125,tasd,mvp,GERestaurant,full,4,0.0001,16,20,0.7042,,
65,tasd,para,GERestaurant,full,3,0.0003,16,20,0.7028,0.6728,0.5417
120,tasd,mvp,GERestaurant,full,1,0.0001,16,20,0.7021,,
104,tasd,mvp,GERestaurant,full,5,0.0001,16,20,0.694,,
107,tasd,mvp,GERestaurant,full,2,0.0001,16,20,0.6918,,
95,tasd,para,GERestaurant,full,2,0.0003,16,20,0.6914,0.6669,0.5284
93,tasd,para,GERestaurant,full,1,0.0003,16,20,0.6867,0.6539,0.5229


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.7123,0.7433,0.7457,0.4731,0.6867,0.7021
2,0.7362,0.7347,0.7218,0.4845,0.6914,0.6918
3,0.7657,0.7655,0.7309,0.5069,0.7028,0.7107
4,0.7583,0.7617,0.7293,0.5513,0.7342,0.7042
5,0.7837,0.7755,0.6622,0.5969,0.7043,0.694


Unnamed: 0,W,pval,normal
context,0.94345,0.690422,True


Unnamed: 0,W,pval,normal
prompting,0.922904,0.548844,True


Unnamed: 0,W,pval,normal
para,0.877092,0.29634,True


Unnamed: 0,W,pval,normal
mvp,0.947505,0.719361,True


    split     prompt      f1
0       1    context  0.7433
1       2    context  0.7347
2       3    context  0.7655
3       4    context  0.7617
4       5    context  0.7755
5       1  prompting  0.4731
6       2  prompting  0.4845
7       3  prompting  0.5069
8       4  prompting  0.5513
9       5  prompting  0.5969
10      1       para  0.6867
11      2       para  0.6914
12      3       para  0.7028
13      4       para  0.7342
14      5       para  0.7043
15      1        mvp  0.7021
16      2        mvp  0.6918
17      3        mvp  0.7107
18      4        mvp  0.7042
19      5        mvp  0.6940
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc       ng2       eps
0  prompt      3     12  89.417738  1.773855e-08  0.921977  0.400307


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs prompting,0.75614,0.014952,0.52254,0.045809,13.713493,0.000164,0.000983,True
1,t-test,context vs para,0.75614,0.014952,0.70388,0.016562,6.803984,0.002438,0.006141,True
2,t-test,context vs mvp,0.75614,0.014952,0.70056,0.006902,7.693856,0.001535,0.006141,True
3,t-test,prompting vs para,0.52254,0.045809,0.70388,0.016562,-9.443017,0.000701,0.003507,True
4,t-test,prompting vs mvp,0.52254,0.045809,0.70056,0.006902,-7.489375,0.0017,0.006141,True
5,t-test,para vs mvp,0.70388,0.016562,0.70056,0.006902,0.419832,0.696178,0.696178,False


### 1000

In [116]:
# Prompt/baseline comparison for the TASD task with lr_setting 1000
# (filtered as the string '1000' inside computePromptStatistics).
args.lr_setting = 1000
args.task = 'tasd'

# Keep the returned statistics under the same key as the setting.
stats_tasd['1000'] = computePromptStatistics(args)
# Bare expression as the last statement of the cell so the notebook renders it.
stats_tasd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
91,tasd,para,GERestaurant,1000,3,0.0003,16,20,0.6759,0.6349,0.5104
79,tasd,para,GERestaurant,1000,5,0.0003,16,20,0.6756,0.6092,0.5101
124,tasd,mvp,GERestaurant,1000,5,0.0001,8,30,0.6701,,
90,tasd,para,GERestaurant,1000,2,0.0003,16,20,0.67,0.657,0.5037
59,tasd,para,GERestaurant,1000,4,0.0003,16,20,0.6646,0.6748,0.4977
121,tasd,mvp,GERestaurant,1000,4,0.0001,8,30,0.6632,,
126,tasd,mvp,GERestaurant,1000,1,0.0001,8,30,0.6619,,
112,tasd,mvp,GERestaurant,1000,3,0.0001,8,30,0.6536,,
102,tasd,mvp,GERestaurant,1000,2,0.0001,8,30,0.6484,,
86,tasd,para,GERestaurant,1000,1,0.0003,16,20,0.6431,0.6199,0.4739


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.7067,0.7317,0.6434,0.3923,0.6431,0.6619
2,0.7114,0.7187,0.6771,0.4443,0.67,0.6484
3,0.7398,0.7419,0.6919,0.4842,0.6759,0.6536
4,0.7859,0.7785,0.7066,0.42,0.6646,0.6632
5,0.7577,0.7166,0.6925,0.4333,0.6756,0.6701


Unnamed: 0,W,pval,normal
basic,0.936901,0.644119,True


Unnamed: 0,W,pval,normal
prompting,0.983225,0.951103,True


Unnamed: 0,W,pval,normal
para,0.816553,0.109799,True


Unnamed: 0,W,pval,normal
mvp,0.966897,0.854993,True


    split     prompt      f1
0       1      basic  0.7067
1       2      basic  0.7114
2       3      basic  0.7398
3       4      basic  0.7859
4       5      basic  0.7577
5       1  prompting  0.3923
6       2  prompting  0.4443
7       3  prompting  0.4842
8       4  prompting  0.4200
9       5  prompting  0.4333
10      1       para  0.6431
11      2       para  0.6700
12      3       para  0.6759
13      4       para  0.6646
14      5       para  0.6756
15      1        mvp  0.6619
16      2        mvp  0.6484
17      3        mvp  0.6536
18      4        mvp  0.6632
19      5        mvp  0.6701
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      3     12  170.991255  4.142095e-10  0.963438  0.578289


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.7403,0.029484,0.43482,0.030193,15.227297,0.000108,0.000542,True
1,t-test,basic vs para,0.7403,0.029484,0.66584,0.012103,5.56947,0.005092,0.010561,True
2,t-test,basic vs mvp,0.7403,0.029484,0.65944,0.007613,6.162086,0.00352,0.010561,True
3,t-test,prompting vs para,0.43482,0.030193,0.66584,0.012103,-21.64998,2.7e-05,0.000162,True
4,t-test,prompting vs mvp,0.43482,0.030193,0.65944,0.007613,-12.982418,0.000203,0.000812,True
5,t-test,para vs mvp,0.66584,0.012103,0.65944,0.007613,0.845959,0.445214,0.445214,False


### 500

In [117]:
# Configure the shared args namespace for the TASD task at lr_setting 500.
args.lr_setting = 500
args.task = 'tasd'

# Store the result of computePromptStatistics under the '500' key; the bare
# expression on the last line makes the notebook render it as the cell output.
stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
101,tasd,mvp,GERestaurant,500,3,0.0001,8,50,0.6497,,
123,tasd,mvp,GERestaurant,500,5,0.0001,8,50,0.6494,,
69,tasd,para,GERestaurant,500,4,0.0003,16,86,0.6486,0.6585,0.48
98,tasd,para,GERestaurant,500,1,0.0003,16,86,0.6379,0.5965,0.4683
110,tasd,mvp,GERestaurant,500,2,0.0001,8,50,0.6357,,
105,tasd,mvp,GERestaurant,500,4,0.0001,8,50,0.6282,,
87,tasd,para,GERestaurant,500,3,0.0003,16,86,0.6233,0.566,0.4527
94,tasd,para,GERestaurant,500,5,0.0003,16,86,0.6208,0.5703,0.4502
129,tasd,mvp,GERestaurant,500,1,0.0001,8,50,0.6181,,
84,tasd,para,GERestaurant,500,2,0.0003,16,86,0.6098,0.5943,0.4387


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.733,0.7346,0.6472,0.4375,0.6379,0.6181
2,0.7087,0.7284,0.6947,0.4747,0.6098,0.6357
3,0.6743,0.7221,0.6842,0.5304,0.6233,0.6497
4,0.7213,0.7487,0.6667,0.5225,0.6486,0.6282
5,0.6925,0.7092,0.6973,0.5987,0.6208,0.6494


Unnamed: 0,W,pval,normal
context,0.997773,0.998542,True


Unnamed: 0,W,pval,normal
prompting,0.975581,0.909712,True


Unnamed: 0,W,pval,normal
para,0.964868,0.841405,True


Unnamed: 0,W,pval,normal
mvp,0.911641,0.477515,True


    split     prompt      f1
0       1    context  0.7346
1       2    context  0.7284
2       3    context  0.7221
3       4    context  0.7487
4       5    context  0.7092
5       1  prompting  0.4375
6       2  prompting  0.4747
7       3  prompting  0.5304
8       4  prompting  0.5225
9       5  prompting  0.5987
10      1       para  0.6379
11      2       para  0.6098
12      3       para  0.6233
13      4       para  0.6486
14      5       para  0.6208
15      1        mvp  0.6181
16      2        mvp  0.6357
17      3        mvp  0.6497
18      4        mvp  0.6282
19      5        mvp  0.6494
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  35.374999  0.000003  0.870564  0.388115


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs prompting,0.7286,0.013105,0.51276,0.054597,6.855838,0.00237,0.009479,True
1,t-test,context vs para,0.7286,0.013105,0.62808,0.01362,20.276929,3.5e-05,0.00021,True
2,t-test,context vs mvp,0.7286,0.013105,0.63622,0.012234,7.761562,0.001485,0.007426,True
3,t-test,prompting vs para,0.51276,0.054597,0.62808,0.01362,-3.962596,0.016641,0.033283,True
4,t-test,prompting vs mvp,0.51276,0.054597,0.63622,0.012234,-5.440041,0.005543,0.016629,True
5,t-test,para vs mvp,0.62808,0.01362,0.63622,0.012234,-0.705479,0.519429,0.519429,False


In [143]:
# Select the TASD task on the shared args namespace.
args.task = 'tasd'

# NOTE(review): computeLowResourceStatistics is defined elsewhere in the notebook;
# judging by the rendered output it compares the 'full', '1000' and '500'
# lr_settings per prompt — confirm against its definition.
computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7067,0.733,0.7123
2,0.7114,0.7087,0.7362
3,0.7398,0.6743,0.7657
4,0.7859,0.7213,0.7583
5,0.7577,0.6925,0.7837


Unnamed: 0,W,pval,normal
1000,0.936901,0.644119,True


Unnamed: 0,W,pval,normal
500,0.979288,0.930776,True


Unnamed: 0,W,pval,normal
full,0.974914,0.905749,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7398
3       4   1000  0.7859
4       5   1000  0.7577
5       1    500  0.7330
6       2    500  0.7087
7       3    500  0.6743
8       4    500  0.7213
9       5    500  0.6925
10      1   full  0.7123
11      2   full  0.7362
12      3   full  0.7657
13      4   full  0.7583
14      5   full  0.7837
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.592104  0.077053  0.368664  0.692145
Results for LR-Comparison of :  basic


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.7403,0.029484,0.70596,0.020773,1.771274,0.151215,0.30243,False
1,t-test,1000 vs full,0.7403,0.029484,0.75124,0.024719,-1.053543,0.351524,0.351524,False
2,t-test,500 vs full,0.70596,0.020773,0.75124,0.024719,-2.13764,0.099351,0.298053,False


Unnamed: 0,1000,500,full
1,0.7317,0.7346,0.7433
2,0.7187,0.7284,0.7347
3,0.7419,0.7221,0.7655
4,0.7785,0.7487,0.7617
5,0.7166,0.7092,0.7755


Unnamed: 0,W,pval,normal
1000,0.865484,0.248613,True


Unnamed: 0,W,pval,normal
500,0.997773,0.998542,True


Unnamed: 0,W,pval,normal
full,0.94345,0.690422,True


    split prompt      f1
0       1   1000  0.7317
1       2   1000  0.7187
2       3   1000  0.7419
3       4   1000  0.7785
4       5   1000  0.7166
5       1    500  0.7346
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7487
9       5    500  0.7092
10      1   full  0.7433
11      2   full  0.7347
12      3   full  0.7655
13      4   full  0.7617
14      5   full  0.7755
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.500963  0.080867  0.305072  0.774193
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.73748,0.022469,0.7286,0.013105,1.229622,0.286219,0.399454,False
1,t-test,1000 vs full,0.73748,0.022469,0.75614,0.014952,-1.534363,0.199727,0.399454,False
2,t-test,500 vs full,0.7286,0.013105,0.75614,0.014952,-2.339546,0.079418,0.238253,False


Unnamed: 0,1000,500,full
1,0.6434,0.6472,0.7457
2,0.6771,0.6947,0.7218
3,0.6919,0.6842,0.7309
4,0.7066,0.6667,0.7293
5,0.6925,0.6973,0.6622


Unnamed: 0,W,pval,normal
1000,0.898396,0.401086,True


Unnamed: 0,W,pval,normal
500,0.908488,0.458556,True


Unnamed: 0,W,pval,normal
full,0.794509,0.073045,True


    split prompt      f1
0       1   1000  0.6434
1       2   1000  0.6771
2       3   1000  0.6919
3       4   1000  0.7066
4       5   1000  0.6925
5       1    500  0.6472
6       2    500  0.6947
7       3    500  0.6842
8       4    500  0.6667
9       5    500  0.6973
10      1   full  0.7457
11      2   full  0.7218
12      3   full  0.7309
13      4   full  0.7293
14      5   full  0.6622
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  2.784635  0.120819  0.36756  0.654357
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.6823,0.021573,0.67802,0.018794,0.438318,0.683788,0.683788,False
1,t-test,1000 vs full,0.6823,0.021573,0.71798,0.028946,-1.677092,0.16883,0.435332,False
2,t-test,500 vs full,0.67802,0.018794,0.71798,0.028946,-1.806655,0.145111,0.435332,False


Unnamed: 0,1000,500,full
1,0.7067,0.7346,0.7433
2,0.7114,0.7284,0.7347
3,0.7398,0.7221,0.7655
4,0.7859,0.7487,0.7617
5,0.7577,0.7092,0.7755


Unnamed: 0,W,pval,normal
1000,0.936901,0.644119,True


Unnamed: 0,W,pval,normal
500,0.997773,0.998542,True


Unnamed: 0,W,pval,normal
full,0.94345,0.690422,True


    split prompt      f1
0       1   1000  0.7067
1       2   1000  0.7114
2       3   1000  0.7398
3       4   1000  0.7859
4       5   1000  0.7577
5       1    500  0.7346
6       2    500  0.7284
7       3    500  0.7221
8       4    500  0.7487
9       5    500  0.7092
10      1   full  0.7433
11      2   full  0.7347
12      3   full  0.7655
13      4   full  0.7617
14      5   full  0.7755
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  2.440225  0.148811  0.23203  0.844593
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.7403,0.029484,0.7286,0.013105,0.785931,0.475852,0.475852,False
1,t-test,1000 vs full,0.7403,0.029484,0.75614,0.014952,-1.513421,0.204728,0.409456,False
2,t-test,500 vs full,0.7286,0.013105,0.75614,0.014952,-2.339546,0.079418,0.238253,False


## Performance Comparison of Extraction of ABSA-Tuple Elements over different ABSA Subtasks

In [119]:
# Additional Eval
#
# Collect the aspect-level metrics ('metrics_asp.tsv') of every run folder below
# RESULTS_PATH, average the micro-F1 over the splits of each model configuration
# and show it as a task x (lr_setting, prompt) table for GERestaurant.

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH)
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints'
]

for folder_name in folder_names:
    # The folder name holds the '_'-separated run parameters; together with the
    # five values appended below they form one row matching col_names.
    cond_parameters = folder_name.split('_')

    try:
        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp.tsv'), sep='\t')
        metrics = metrics.set_index(metrics.columns[0])

        # Configuration string without the split (position 7) and the epoch
        # (last position), so that all splits of one setup share the same key.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        scores = [
            metrics.loc['Micro-AVG', 'f1'],
            metrics.loc['Macro-AVG', 'f1'],
            metrics.loc['Micro-AVG', 'accuracy'],
        ]
    except (OSError, ValueError, KeyError, IndexError):
        # Best effort: skip runs without this metrics file, with an unreadable
        # file, or with a folder name that does not follow the naming scheme.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        continue

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(cond_parameters)] + scores)

results_all = pd.DataFrame(runs, columns=col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'basic'), ('full', 'context'), ('full', 'cot'),
    ('1000', 'basic'), ('1000', 'context'), ('1000', 'cot'),
    ('500', 'basic'), ('500', 'context'), ('500', 'cot')
])

# Define the row indices
index = ['acd', 'acsa', 'tasd']

# Result table; every cell starts as the 'N/A' placeholder and is overwritten
# with a formatted percentage when matching runs exist.
df = pd.DataFrame('N/A', index=index, columns=columns, dtype=object)

for task in ['acd', 'acsa', 'tasd']:
    for lr_setting in ['full', '1000', '500']:
        selection = results_all[
            (results_all['dataset'] == 'GERestaurant')
            & (results_all['task'] == task)
            & (results_all['split'] != str(0))
            & (results_all['lr_setting'] == lr_setting)
        ]
        # Grouping by a plain column name yields scalar group keys on every
        # pandas version (a one-element list yields 1-tuples on pandas >= 2 only).
        for config_key, config_runs in selection.groupby('model_config'):
            prompt_name = config_key.split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _mean_f1(table, task_name):
    """Return the mean of the populated cells in one task row, ignoring 'N/A'."""
    return np.mean([float(value) for value in table.loc[task_name] if value != 'N/A'])


print('Aspect Extraction')
display(df)

# NOTE(review): rows may have different numbers of populated cells (the recorded
# output has no 'cot' values for acd), so these averages are not necessarily
# taken over identical column sets.
print(f"Average difference ACSA to ACD: {(_mean_f1(df, 'acsa') - _mean_f1(df, 'acd')):.2f}")

print(f"Average difference TASD to ACSA: {(_mean_f1(df, 'tasd') - _mean_f1(df, 'acsa')):.2f}")

print(f"Average difference TASD to ACD: {(_mean_f1(df, 'tasd') - _mean_f1(df, 'acd')):.2f}")

Aspect Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,basic,context,cot,basic,context,cot,basic,context,cot
acd,87.88,87.83,,86.65,86.45,,86.12,83.24,
acsa,86.58,86.34,85.77,83.16,79.17,83.52,82.52,82.96,85.09
tasd,87.01,87.69,86.63,87.14,86.11,85.32,86.6,86.16,83.81


Average difference ACSA to ACD: -2.46
Average difference TASD to ACSA: 2.37
Average difference TASD to ACD: -0.09


In [121]:
# Additional Eval
#
# Same evaluation as for the aspect-only metrics, but based on the combined
# aspect + polarity metrics file ('metrics_asp_pol.tsv') of every run folder.

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH)
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints'
]

for folder_name in folder_names:
    # The folder name holds the '_'-separated run parameters; together with the
    # five values appended below they form one row matching col_names.
    cond_parameters = folder_name.split('_')

    try:
        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp_pol.tsv'), sep='\t')
        metrics = metrics.set_index(metrics.columns[0])

        # Configuration string without the split (position 7) and the epoch
        # (last position), so that all splits of one setup share the same key.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        scores = [
            metrics.loc['Micro-AVG', 'f1'],
            metrics.loc['Macro-AVG', 'f1'],
            metrics.loc['Micro-AVG', 'accuracy'],
        ]
    except (OSError, ValueError, KeyError, IndexError):
        # Best effort: skip runs without this metrics file, with an unreadable
        # file, or with a folder name that does not follow the naming scheme.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        continue

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(cond_parameters)] + scores)

results_all = pd.DataFrame(runs, columns=col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'basic'), ('full', 'context'), ('full', 'cot'),
    ('1000', 'basic'), ('1000', 'context'), ('1000', 'cot'),
    ('500', 'basic'), ('500', 'context'), ('500', 'cot')
])

# Define the row indices
index = ['acd', 'acsa', 'tasd']

# Result table; every cell starts as the 'N/A' placeholder and is overwritten
# with a formatted percentage when matching runs exist.
df = pd.DataFrame('N/A', index=index, columns=columns, dtype=object)

for task in ['acd', 'acsa', 'tasd']:
    for lr_setting in ['full', '1000', '500']:
        selection = results_all[
            (results_all['dataset'] == 'GERestaurant')
            & (results_all['task'] == task)
            & (results_all['split'] != str(0))
            & (results_all['lr_setting'] == lr_setting)
        ]
        # Grouping by a plain column name yields scalar group keys on every
        # pandas version (a one-element list yields 1-tuples on pandas >= 2 only).
        for config_key, config_runs in selection.groupby('model_config'):
            prompt_name = config_key.split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _mean_f1(table, task_name):
    """Return the mean of the populated cells in one task row, ignoring 'N/A'."""
    return np.mean([float(value) for value in table.loc[task_name] if value != 'N/A'])


print('Aspect + Polarity Extraction')
display(df)

# Kept as the last expression so the notebook renders the string as cell output.
f"Average difference: {(_mean_f1(df, 'tasd') - _mean_f1(df, 'acsa')):.2f}"


Aspect + Polarity Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,basic,context,cot,basic,context,cot,basic,context,cot
acd,,,,,,,,,
acsa,83.64,83.22,82.54,79.61,76.24,80.57,78.91,80.11,81.83
tasd,83.75,85.1,83.46,83.89,83.27,82.25,82.34,82.96,80.19


'Average difference: 2.28'

In [122]:
# Eval for best parameter combination over all tasks and dataset sizes
#
# Loads the task-specific metrics file of every run folder, keeps the runs with
# split '0' for DATASET, picks the best-scoring row per hyperparameter setup and
# averages the micro-F1 per (learning_rate, lora_r, lora_alpha) combination.

RESULTS_PATH = '../results/ft_llm/'
DATASET = 'GERestaurant'

# Metrics file that is read for each task (first component of the folder name).
METRIC_FILES = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [
    folder for folder in os.listdir(RESULTS_PATH)
    if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints'
]
runs = []

for folder_name in folder_names:
    # The folder name holds the '_'-separated run parameters; together with the
    # five values appended below they form one row matching col_names.
    cond_parameters = folder_name.split('_')

    try:
        # An unknown task raises KeyError here and the folder is skipped
        # explicitly instead of failing later on an empty file name.
        filename = METRIC_FILES[cond_parameters[0]]

        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep='\t')
        metrics = metrics.set_index(metrics.columns[0])

        # Configuration string without the split (position 7) and the epoch
        # (last position), so that all splits of one setup share the same key.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        scores = [
            metrics.loc['Micro-AVG', 'f1'],
            metrics.loc['Macro-AVG', 'f1'],
            metrics.loc['Micro-AVG', 'accuracy'],
        ]
    except (OSError, ValueError, KeyError, IndexError):
        # Best effort: skip runs of unknown tasks, without a readable metrics
        # file, or with a folder name that does not follow the naming scheme.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        continue

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(cond_parameters)] + scores)

results_all = pd.DataFrame(runs, columns=col_names)

# Restrict to the split-'0' runs of the chosen dataset, best micro-F1 first.
is_selected = (results_all['dataset'] == DATASET) & (results_all['split'] == '0')
results_sub = results_all[is_selected].sort_values(by=['f1-micro'], ascending=False)
results_sub = results_sub[results_sub['lr_setting'] != 'orig']
results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lr_setting', 'lora_r', 'lora_alpha', 'epoch', 'f1-micro', 'f1-macro']]
# Fresh 0..n-1 index so the labels returned by idxmax can be used with .loc below.
results_sub = results_sub.reset_index()

# Row with the highest micro-F1 within each hyperparameter setup (epoch is not
# part of the grouping key, so this selects the best epoch per setup).
idx_max = results_sub.groupby(['lr_setting', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].idxmax()
results_per_epoch = results_sub.loc[idx_max]

# Kept as the last expression so the notebook renders the Series as cell output.
results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].mean()

learning_rate  lora_r  lora_alpha
0.0003         32      32            0.818427
                       64            0.782661
               8       16            0.825533
                       8             0.830473
3e-05          32      32            0.818609
                       64            0.823300
               8       16            0.812015
                       8             0.798288
Name: f1-micro, dtype: float64