## Language

In [148]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Reproducible randomness and un-truncated DataFrame output for the notebook.
random.seed(42)
pd.set_option('display.max_columns', None)

# Shared run configuration; later cells attach task, lr_setting and the
# loaded result tables to this namespace.
args = SimpleNamespace(dataset='rest-16')

# One container per task for the pairwise-test tables, keyed by the
# low-resource setting.
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

def _promptSplitScores(frame, column, value, splits=range(1, 6)):
    """Return ``{split: f1-micro}`` for the rows of ``frame`` where ``column == value``.

    For every split the first matching row (in ``frame``'s current row order)
    is used.

    Raises:
        IndexError: if a split has no matching row.
    """
    scores = {}
    for split in splits:
        matches = frame[(frame['split'] == str(split)) & (frame[column] == value)]
        scores[split] = matches['f1-micro'].iloc[0]
    return scores


def computePromptStatistics(args):
    """Compare the best fine-tuned-LLM prompt with the baselines of one task.

    Reads from ``args``:
        results: DataFrame of fine-tuned LLM runs.
        results_baseline: DataFrame of baseline runs.
        dataset, task: values used to filter both tables.
        lr_setting: ``0`` selects the rows labelled ``'full'``; any other
            value is matched as ``str(args.lr_setting)``.

    Split ``'0'`` is excluded and the micro-F1 of splits 1-5 is collected per
    prompt and per baseline method. Only the prompt with the highest mean
    micro-F1 is kept. If every column passes ``pg.normality`` a repeated
    measures ANOVA and paired t-tests are used, otherwise a Friedman test and
    Wilcoxon tests. Pairwise p-values are Holm-corrected at alpha = 0.05.
    Intermediate tables are shown with ``display``/``print``.

    Returns:
        DataFrame with one row per pair of compared columns (``test``,
        ``comparison``, ``mean 1``, ``std 1``, ``mean 2``, ``std 2``,
        ``statistic``, ``p_value``, ``corrected_p_value``, ``significant``).

    Raises:
        IndexError: if a baseline method has no row for one of the splits.
        ValueError: if no prompt has results for all five splits.
    """
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    results = args.results
    results_sub = results[(results['dataset'] == args.dataset)
                          & (results['task'] == args.task)
                          & (results['split'] != str(0))
                          & (results['lr_setting'] == lr_setting)].sort_values(by=['f1-micro'], ascending=False)

    baseline = args.results_baseline
    results_sub_baseline = baseline[(baseline['lr_setting'] == lr_setting)
                                    & (baseline['dataset'] == args.dataset)
                                    & (baseline['task'] == args.task)
                                    & (baseline['split'] != str(0))].sort_values(by=['f1-micro'], ascending=False)

    display(results_sub_baseline)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]
    results_sub_baseline = results_sub_baseline[['task', 'method', 'dataset', 'learning_rate', 'batch_size', 'lr_setting', 'split', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep, for every (model_config, split) pair, the row with the highest micro-F1.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    if args.task == 'acd':
        prompts = ['basic', 'context']
    else:
        prompts = ['basic', 'context', 'cot']

    if args.task in ('acd', 'acsa'):
        baselines = ['prompting', 'hier_gcn', 'bert_clf']
    elif args.task in ('e2e', 'e2e-e'):
        baselines = ['prompting', 'instructABSA', 'tas_bert']
    else:
        baselines = ['prompting', 'para', 'mvp']

    f1_prompts = {}

    for prompt in prompts:
        try:
            f1_prompts[prompt] = _promptSplitScores(results_per_epoch, 'prompt', prompt)
        except IndexError:
            # At least one split has no run for this prompt: leave it out.
            continue

    # Baselines are mandatory: a missing split raises IndexError.
    for method in baselines:
        f1_prompts[method] = _promptSplitScores(results_sub_baseline, 'method', method)

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    # Only use the best performing FT-LLM prompt
    available_prompts = [prompt for prompt in prompts if prompt in df_prompts.columns]
    if not available_prompts:
        raise ValueError(
            f"no prompt has results for all splits (dataset={args.dataset!r}, "
            f"task={args.task!r}, lr_setting={lr_setting!r})"
        )

    best_prompt = df_prompts[available_prompts].mean().idxmax()

    # Drop only prompts that are actually present; dropping a prompt that was
    # skipped above would raise KeyError.
    prompts_to_drop = [prompt for prompt in available_prompts if prompt != best_prompt]
    df_prompts = df_prompts.drop(columns=prompts_to_drop)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (one row per split/column pair), shared by the printout and the ANOVA.
    long_form = df_prompts.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'})
    print(long_form)

    if all_normal:
        # All columns look normally distributed: repeated measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_form)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when all columns are normally distributed.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T'].iloc[0]
        else:
            # Otherwise Wilcoxon signed-rank test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val'].iloc[0]

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_prompts[col1]),
            # np.std defaults to ddof=0 (population standard deviation).
            'std 1': np.std(df_prompts[col1]),
            'mean 2': np.mean(df_prompts[col2]),
            'std 2': np.std(df_prompts[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame
    results_df = pd.DataFrame(results)

    # Holm (Bonferroni-Holm) correction of the pairwise p-values
    reject, corrected = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected
    results_df['significant'] = reject

    return results_df

def _lrSettingSplitScores(results_sub, prompt, lr_setting, splits=range(1, 6)):
    """Return ``{split: f1-micro}`` of ``prompt`` trained under ``lr_setting``.

    For every split the first matching row of ``results_sub`` (in its current
    row order) is used.

    Raises:
        IndexError: if a split has no matching row.
    """
    scores = {}
    for split in splits:
        matches = results_sub[(results_sub['lr_setting'] == lr_setting)
                              & (results_sub['split'] == str(split))
                              & (results_sub['prompt'] == prompt)]
        scores[split] = matches['f1-micro'].iloc[0]
    return scores


def _compareLrSettings(df_splits):
    """Test the columns of ``df_splits`` (one per setting, rows = splits) against each other.

    Shows the table and the per-column normality checks, prints the omnibus
    test (repeated measures ANOVA if all columns pass ``pg.normality``,
    Friedman otherwise) and returns the Holm-corrected pairwise test table.
    Returns ``None`` when fewer than two columns are available.
    """
    display(df_splits)

    if df_splits.shape[1] < 2:
        print('Skipping comparison: fewer than two settings have results for all splits.')
        return None

    normality_results = {col: pg.normality(df_splits[col]) for col in df_splits.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format for the printout and the ANOVA. The variable column keeps
    # the label 'prompt' although it holds the low-resource setting, so the
    # printed output stays as before.
    long_form = df_splits.melt(var_name='prompt', value_name='f1', ignore_index=False).reset_index().rename(columns={'index': 'split'})
    print(long_form)

    if all_normal:
        # All columns look normally distributed: repeated measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_form)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test.
        friedman = pg.friedman(df_splits)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []

    for col1, col2 in combinations(df_splits.columns, 2):
        if all_normal:
            # Paired t-test when all columns are normally distributed.
            test = 't-test'
            test_result = pg.ttest(df_splits[col1], df_splits[col2], paired=True, alternative='two-sided')
            statistic = test_result['T'].iloc[0]
        else:
            # Otherwise Wilcoxon signed-rank test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_splits[col1], df_splits[col2], alternative='two-sided')
            statistic = test_result['W-val'].iloc[0]

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': np.mean(df_splits[col1]),
            # np.std defaults to ddof=0 (population standard deviation).
            'std 1': np.std(df_splits[col1]),
            'mean 2': np.mean(df_splits[col2]),
            'std 2': np.std(df_splits[col2]),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame
    results_df = pd.DataFrame(results)

    # Holm (Bonferroni-Holm) correction of the pairwise p-values
    reject, corrected = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected
    results_df['significant'] = reject

    return results_df


def computeLowResourceStatistics(args):
    """Compare the low-resource settings '1000', '500' and 'full' for one task.

    Reads ``args.results``, ``args.dataset`` and ``args.task``. Split ``'0'``
    is excluded and rows are sorted by micro-F1 (descending), so the score
    taken per split is the highest one among the matching rows.

    Two analyses are printed/displayed:
      1. for every prompt separately, the settings are compared with each other;
      2. for every setting the prompt with the highest mean micro-F1 is
         selected and the settings are compared once more.

    A setting is left out when a prompt has no result for one of the splits
    1-5. The function returns ``None``; all results are shown via
    ``display``/``print``.
    """
    results = args.results
    results_sub = results[(results['dataset'] == args.dataset)
                          & (results['task'] == args.task)
                          & (results['split'] != str(0))].sort_values(by=['f1-micro'], ascending=False)

    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    if args.task == 'acd':
        prompts = ['basic', 'context']
    else:
        prompts = ['basic', 'context', 'cot']

    lr_settings = ['1000', '500', 'full']

    # Collect the scores once; both analyses below use the same numbers.
    scores = {}
    for prompt in prompts:
        for lr_setting in lr_settings:
            try:
                scores[(prompt, lr_setting)] = _lrSettingSplitScores(results_sub, prompt, lr_setting)
            except IndexError:
                # At least one split is missing for this combination: leave it out.
                continue

    ####
    # Compute per prompt
    ####
    for prompt in prompts:
        f1_splits = {
            lr_setting: scores[(prompt, lr_setting)]
            for lr_setting in lr_settings
            if (prompt, lr_setting) in scores
        }

        results_df = _compareLrSettings(pd.DataFrame(f1_splits))

        print('Results for LR-Comparison of : ', prompt)
        display(results_df)

    ####
    # Compute based on best performing prompt per low-resource setting
    ####
    f1_splits = {}

    for prompt in prompts:
        for lr_setting in lr_settings:
            f1 = scores.get((prompt, lr_setting))
            if f1 is None:
                continue
            if lr_setting not in f1_splits or np.mean(list(f1.values())) > np.mean(list(f1_splits[lr_setting].values())):
                f1_splits[lr_setting] = f1

    results_df = _compareLrSettings(pd.DataFrame(f1_splits))

    print('Results for LR-Comparison of best Prompt per LR-Setting')
    display(results_df)



The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


## ACD

In [150]:
# LLM-based Method
#
# Every sub-folder of RESULTS_PATH is one run; its name is split on '_' and
# the parts are used as the first ten columns of `col_names`.

RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file read for each task (first part of the folder name).
metric_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    filename = metric_files.get(cond_parameters[0])
    if filename is None:
        # Folder does not belong to one of the four known tasks.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # 'path' keeps the full configuration string ...
        model_config_full = cond_parameters.copy()

        # ... while 'model_config' drops the split (index 7) and the epoch
        # (last part), so runs of one configuration share the same key.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        metrics = [df.loc['Micro-AVG', 'f1'], df.loc['Macro-AVG', 'f1'], df.loc['Micro-AVG', 'accuracy']]
    except (OSError, KeyError, IndexError, ValueError):
        # Missing/unreadable metrics file, unexpected folder name or missing
        # metric row: skip this run but remember it for inspection.
        skipped_folders.append(folder_name)
        continue

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(model_config_full)] + metrics)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} run folder(s) without usable metrics (see skipped_folders).')

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
RESULTS_PATH = '../results/'

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    if cond_parameters[0] != 'acd':
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Reorder the folder-name parts to match `col_names`: keep parts
        # 0, 1, 4 and 3 (in that order) and discard parts 2 and 5.
        cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
        cond_parameters.pop(5)
        cond_parameters.pop(4)
        cond_parameters[1:1] = [METHOD]
        # No learning_rate / batch_size / epochs for prompting.
        cond_parameters[5:5] = ['-']
        cond_parameters[6:6] = ['-']
        cond_parameters[7:7] = ['-']
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
    except (OSError, KeyError, IndexError, ValueError):
        # Missing/unreadable metrics file, too few folder-name parts or
        # missing metric row: skip this run.
        continue

    runs.append(cond_parameters)



# Multi-label Classification
METHOD = 'bert_clf'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]
        # After inserting the method, index 3 is the lr_setting column;
        # '0' is stored as 'full'.
        if cond_parameters[3] == '0':
            cond_parameters[3] = 'full'
    except (OSError, KeyError, IndexError, ValueError):
        # Folders without a readable 'metrics_asp.tsv' (or without the
        # expected metric rows / name parts) are skipped.
        continue

    runs.append(cond_parameters)

# Hier-GCN
METHOD = 'hier_gcn'
folder_names = [
    folder
    for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD))
    if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints'
]

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')
    cond_params = list(cond_parameters)

    # Only ACD runs are collected in this cell.
    if cond_params[0] != 'acd':
        continue

    # The score is the right-hand side of the 4th line ("... = <value>").
    eval_file = os.path.join(RESULTS_PATH, METHOD, folder_name, 'cate_eval_results.txt')
    with open(eval_file, 'r') as f:
        f1 = f.readlines()[3].split(' = ')[1]

    cond_params[1:1] = [METHOD]
    # Index 3 is the lr_setting column once the method is inserted.
    if cond_params[3] == '0':
        cond_params[3] = 'full'

    # Only one score is read from the file; the f1-macro and accuracy
    # columns stay empty.
    cond_params.extend([round(float(f1), 4), None, None])
    runs.append(cond_params)


# All baseline rows (prompting, bert_clf, hier_gcn) in one table.
results_baseline = pd.DataFrame(runs, columns = col_names)

# Expose both result tables to the statistics functions.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [123]:
# Full training set: computePromptStatistics maps lr_setting == 0 to the 'full' label.
args.lr_setting = 0
args.task = 'acd'

# Keep the pairwise-test table under key '0'; the bare expression shows it as the cell result.
stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
235,acd,hier_gcn,rest-16,full,4,5e-05,8,20.0,0.8461,,
221,acd,hier_gcn,rest-16,full,1,5e-05,8,20.0,0.8315,,
233,acd,hier_gcn,rest-16,full,2,5e-05,8,20.0,0.8159,,
217,acd,hier_gcn,rest-16,full,5,5e-05,8,20.0,0.8143,,
216,acd,hier_gcn,rest-16,full,3,5e-05,8,20.0,0.8075,,
212,acd,bert_clf,rest-16,full,1,6e-05,16,3,0.7776,0.4001,0.6361
149,acd,bert_clf,rest-16,full,3,6e-05,16,3,0.775,0.4576,0.6326
88,acd,bert_clf,rest-16,full,2,6e-05,16,3,0.7615,0.3282,0.6148
130,acd,bert_clf,rest-16,full,4,6e-05,16,3,0.7605,0.3857,0.6136
19,acd,prompting,rest-16,full,2,-,-,-,0.7561,0.6708,0.6079


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.8299,0.8497,0.7439,0.8315,0.7776
2,0.8694,0.8407,0.7561,0.8159,0.7615
3,0.8243,0.8494,0.7224,0.8075,0.775
4,0.851,0.8606,0.7553,0.8461,0.7605
5,0.846,0.8603,0.6988,0.8143,0.7278


Unnamed: 0,W,pval,normal
context,0.89011,0.357658,True


Unnamed: 0,W,pval,normal
prompting,0.880679,0.312398,True


Unnamed: 0,W,pval,normal
hier_gcn,0.913838,0.490991,True


Unnamed: 0,W,pval,normal
bert_clf,0.851566,0.199519,True


    split     prompt      f1
0       1    context  0.8497
1       2    context  0.8407
2       3    context  0.8494
3       4    context  0.8606
4       5    context  0.8603
5       1  prompting  0.7439
6       2  prompting  0.7561
7       3  prompting  0.7224
8       4  prompting  0.7553
9       5  prompting  0.6988
10      1   hier_gcn  0.8315
11      2   hier_gcn  0.8159
12      3   hier_gcn  0.8075
13      4   hier_gcn  0.8461
14      5   hier_gcn  0.8143
15      1   bert_clf  0.7776
16      2   bert_clf  0.7615
17      3   bert_clf  0.7750
18      4   bert_clf  0.7605
19      5   bert_clf  0.7278
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F         p-unc       ng2       eps
0  prompt      3     12  58.03705  2.048138e-07  0.893579  0.653525


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs prompting,0.85214,0.007517,0.7353,0.021933,8.9715,0.000854,0.004271,True
1,t-test,context vs hier_gcn,0.85214,0.007517,0.82306,0.013944,4.598216,0.010044,0.020089,True
2,t-test,context vs bert_clf,0.85214,0.007517,0.76048,0.017737,8.078214,0.001276,0.005103,True
3,t-test,prompting vs hier_gcn,0.7353,0.021933,0.82306,0.013944,-9.912708,0.000581,0.003488,True
4,t-test,prompting vs bert_clf,0.7353,0.021933,0.76048,0.017737,-2.789621,0.049331,0.049331,True
5,t-test,hier_gcn vs bert_clf,0.82306,0.013944,0.76048,0.017737,6.036947,0.003796,0.011388,True


### 1000

In [124]:
# Low-resource setting matched as the string '1000' inside computePromptStatistics.
args.lr_setting = 1000
args.task = 'acd'

# Keep the pairwise-test table under key '1000'; the bare expression shows it as the cell result.
stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
215,acd,hier_gcn,rest-16,1000,1,5e-05,8,20.0,0.8324,,
219,acd,hier_gcn,rest-16,1000,4,5e-05,8,20.0,0.8119,,
242,acd,hier_gcn,rest-16,1000,3,5e-05,8,20.0,0.7935,,
237,acd,hier_gcn,rest-16,1000,5,5e-05,8,20.0,0.7893,,
232,acd,hier_gcn,rest-16,1000,2,5e-05,8,20.0,0.789,,
49,acd,prompting,rest-16,1000,2,-,-,-,0.7873,0.6734,0.6492
25,acd,prompting,rest-16,1000,4,-,-,-,0.7854,0.6977,0.6466
0,acd,prompting,rest-16,1000,1,-,-,-,0.7815,0.6405,0.6413
85,acd,bert_clf,rest-16,1000,5,5e-05,16,5,0.7801,0.5297,0.6394
64,acd,bert_clf,rest-16,1000,2,5e-05,16,5,0.7715,0.4044,0.628


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.8578,0.6897,0.7815,0.8324,0.7633
2,0.8452,0.8212,0.7873,0.789,0.7715
3,0.8031,0.8113,0.7537,0.7935,0.753
4,0.7668,0.8498,0.7854,0.8119,0.3673
5,0.8,0.8152,0.7338,0.7893,0.7801


Unnamed: 0,W,pval,normal
basic,0.938139,0.652813,True


Unnamed: 0,W,pval,normal
prompting,0.836845,0.156388,True


Unnamed: 0,W,pval,normal
hier_gcn,0.830115,0.139397,True


Unnamed: 0,W,pval,normal
bert_clf,0.606467,0.000763,False


    split     prompt      f1
0       1      basic  0.8578
1       2      basic  0.8452
2       3      basic  0.8031
3       4      basic  0.7668
4       5      basic  0.8000
5       1  prompting  0.7815
6       2  prompting  0.7873
7       3  prompting  0.7537
8       4  prompting  0.7854
9       5  prompting  0.7338
10      1   hier_gcn  0.8324
11      2   hier_gcn  0.7890
12      3   hier_gcn  0.7935
13      4   hier_gcn  0.8119
14      5   hier_gcn  0.7893
15      1   bert_clf  0.7633
16      2   bert_clf  0.7715
17      3   bert_clf  0.7530
18      4   bert_clf  0.3673
19      5   bert_clf  0.7801
Friedman Test Result:
          Source      W  ddof1      Q     p-unc
Friedman  Within  0.728      3  10.92  0.012166


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.81458,0.032963,0.76834,0.021124,1.0,0.125,0.375,False
1,wilcoxon,basic vs hier_gcn,0.81458,0.032963,0.80322,0.016836,4.0,0.4375,0.875,False
2,wilcoxon,basic vs bert_clf,0.81458,0.032963,0.68704,0.160121,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs hier_gcn,0.76834,0.021124,0.80322,0.016836,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs bert_clf,0.76834,0.021124,0.68704,0.160121,4.0,0.4375,0.875,False
5,wilcoxon,hier_gcn vs bert_clf,0.80322,0.016836,0.68704,0.160121,0.0,0.0625,0.375,False


### 500

In [125]:
# Low-resource setting matched as the string '500' inside computePromptStatistics.
args.lr_setting = 500
args.task = 'acd'

# Keep the pairwise-test table under key '500'; the bare expression shows it as the cell result.
stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
222,acd,hier_gcn,rest-16,500,1,5e-05,8,20.0,0.7694,,
240,acd,hier_gcn,rest-16,500,2,5e-05,8,20.0,0.7597,,
213,acd,bert_clf,rest-16,500,1,8e-05,16,10,0.7567,0.4092,0.6086
224,acd,hier_gcn,rest-16,500,4,5e-05,8,20.0,0.7461,,
220,acd,hier_gcn,rest-16,500,3,5e-05,8,20.0,0.7393,,
223,acd,hier_gcn,rest-16,500,5,5e-05,8,20.0,0.7372,,
65,acd,bert_clf,rest-16,500,3,8e-05,16,10,0.7359,0.362,0.5821
82,acd,bert_clf,rest-16,500,5,8e-05,16,10,0.7336,0.4074,0.5793
11,acd,prompting,rest-16,500,4,-,-,-,0.7292,0.6776,0.5738
55,acd,prompting,rest-16,500,2,-,-,-,0.7,0.5941,0.5385


Unnamed: 0,basic,context,prompting,hier_gcn,bert_clf
1,0.7658,0.7965,0.6658,0.7694,0.7567
2,0.8287,0.784,0.7,0.7597,0.0
3,0.8143,0.7809,0.6595,0.7393,0.7359
4,0.8492,0.761,0.7292,0.7461,0.6897
5,0.7995,0.8134,0.6651,0.7372,0.7336


Unnamed: 0,W,pval,normal
basic,0.986385,0.965571,True


Unnamed: 0,W,pval,normal
prompting,0.835473,0.152792,True


Unnamed: 0,W,pval,normal
hier_gcn,0.907783,0.454382,True


Unnamed: 0,W,pval,normal
bert_clf,0.618197,0.001083,False


    split     prompt      f1
0       1      basic  0.7658
1       2      basic  0.8287
2       3      basic  0.8143
3       4      basic  0.8492
4       5      basic  0.7995
5       1  prompting  0.6658
6       2  prompting  0.7000
7       3  prompting  0.6595
8       4  prompting  0.7292
9       5  prompting  0.6651
10      1   hier_gcn  0.7694
11      2   hier_gcn  0.7597
12      3   hier_gcn  0.7393
13      4   hier_gcn  0.7461
14      5   hier_gcn  0.7372
15      1   bert_clf  0.7567
16      2   bert_clf  0.0000
17      3   bert_clf  0.7359
18      4   bert_clf  0.6897
19      5   bert_clf  0.7336
Friedman Test Result:
          Source     W  ddof1     Q     p-unc
Friedman  Within  0.84      3  12.6  0.005587


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.8115,0.028133,0.68392,0.026787,0.0,0.0625,0.375,False
1,wilcoxon,basic vs hier_gcn,0.8115,0.028133,0.75034,0.012356,1.0,0.125,0.375,False
2,wilcoxon,basic vs bert_clf,0.8115,0.028133,0.58318,0.292405,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs hier_gcn,0.68392,0.026787,0.75034,0.012356,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs bert_clf,0.68392,0.026787,0.58318,0.292405,6.0,0.8125,0.8125,False
5,wilcoxon,hier_gcn vs bert_clf,0.75034,0.012356,0.58318,0.292405,0.0,0.0625,0.375,False


In [151]:
# computeLowResourceStatistics compares the '1000', '500' and 'full' settings
# itself and does not read args.lr_setting.
args.task = 'acd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8578,0.7658,0.8299
2,0.8452,0.8287,0.8694
3,0.8031,0.8143,0.8243
4,0.7668,0.8492,0.851
5,0.8,0.7995,0.846


Unnamed: 0,W,pval,normal
1000,0.938139,0.652813,True


Unnamed: 0,W,pval,normal
500,0.986385,0.965571,True


Unnamed: 0,W,pval,normal
full,0.95702,0.787097,True


    split prompt      f1
0       1   1000  0.8578
1       2   1000  0.8452
2       3   1000  0.8031
3       4   1000  0.7668
4       5   1000  0.8000
5       1    500  0.7658
6       2    500  0.8287
7       3    500  0.8143
8       4    500  0.8492
9       5    500  0.7995
10      1   full  0.8299
11      2   full  0.8694
12      3   full  0.8243
13      4   full  0.8510
14      5   full  0.8460
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  1.55875  0.268121  0.233036  0.543478
Results for LR-Comparison of :  basic


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.81458,0.032963,0.8115,0.028133,0.110267,0.917509,0.917509,False
1,t-test,1000 vs full,0.81458,0.032963,0.84412,0.016031,-1.61926,0.180704,0.361409,False
2,t-test,500 vs full,0.8115,0.028133,0.84412,0.016031,-2.802056,0.048709,0.146128,False


Unnamed: 0,1000,500,full
1,0.6897,0.7965,0.8497
2,0.8212,0.784,0.8407
3,0.8113,0.7809,0.8494
4,0.8498,0.761,0.8606
5,0.8152,0.8134,0.8603


Unnamed: 0,W,pval,normal
1000,0.767566,0.042937,False


Unnamed: 0,W,pval,normal
500,0.986798,0.967326,True


Unnamed: 0,W,pval,normal
full,0.89011,0.357658,True


    split prompt      f1
0       1   1000  0.6897
1       2   1000  0.8212
2       3   1000  0.8113
3       4   1000  0.8498
4       5   1000  0.8152
5       1    500  0.7965
6       2    500  0.7840
7       3    500  0.7809
8       4    500  0.7610
9       5    500  0.8134
10      1   full  0.8497
11      2   full  0.8407
12      3   full  0.8494
13      4   full  0.8606
14      5   full  0.8603
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  context


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.79744,0.055537,0.78716,0.017374,5.0,0.625,0.625,False
1,wilcoxon,1000 vs full,0.79744,0.055537,0.85214,0.007517,0.0,0.0625,0.1875,False
2,wilcoxon,500 vs full,0.78716,0.017374,0.85214,0.007517,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.8578,0.7658,0.8497
2,0.8452,0.8287,0.8407
3,0.8031,0.8143,0.8494
4,0.7668,0.8492,0.8606
5,0.8,0.7995,0.8603


Unnamed: 0,W,pval,normal
1000,0.938139,0.652813,True


Unnamed: 0,W,pval,normal
500,0.986385,0.965571,True


Unnamed: 0,W,pval,normal
full,0.89011,0.357658,True


    split prompt      f1
0       1   1000  0.8578
1       2   1000  0.8452
2       3   1000  0.8031
3       4   1000  0.7668
4       5   1000  0.8000
5       1    500  0.7658
6       2    500  0.8287
7       3    500  0.8143
8       4    500  0.8492
9       5    500  0.7995
10      1   full  0.8497
11      2   full  0.8407
12      3   full  0.8494
13      4   full  0.8606
14      5   full  0.8603
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  2.25885  0.166825  0.346103  0.635561
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.81458,0.032963,0.8115,0.028133,0.110267,0.917509,0.917509,False
1,t-test,1000 vs full,0.81458,0.032963,0.85214,0.007517,-1.925482,0.126471,0.252942,False
2,t-test,500 vs full,0.8115,0.028133,0.85214,0.007517,-2.879529,0.045034,0.135103,False


## ACSA

In [153]:
# Every sub-folder of RESULTS_PATH is one run; its name is split on '_' and
# the parts are used as the first ten columns of `col_names`.
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file read for each task (first part of the folder name).
metric_files = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH)) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped_folders = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    filename = metric_files.get(cond_parameters[0])
    if filename is None:
        # Folder does not belong to one of the four known tasks.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # 'path' keeps the full configuration string ...
        model_config_full = cond_parameters.copy()

        # ... while 'model_config' drops the split (index 7) and the epoch
        # (last part), so runs of one configuration share the same key.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        metrics = [df.loc['Micro-AVG', 'f1'], df.loc['Macro-AVG', 'f1'], df.loc['Micro-AVG', 'accuracy']]
    except (OSError, KeyError, IndexError, ValueError):
        # Missing/unreadable metrics file, unexpected folder name or missing
        # metric row: skip this run but remember it for inspection.
        skipped_folders.append(folder_name)
        continue

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(model_config_full)] + metrics)

if skipped_folders:
    print(f'Skipped {len(skipped_folders)} run folder(s) without usable metrics (see skipped_folders).')

results_all = pd.DataFrame(runs, columns = col_names)

###
# Baselines
##
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']
RESULTS_PATH = '../results/'

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    parts = folder_name.split('_')
    if parts[0] != 'acsa':
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # At least six '_'-separated fields are required; the third and the
        # sixth are not part of the baseline table and are dropped.
        task, dataset, _, split, lr_setting, _, *extra = parts

        # learning_rate, batch_size and epochs are filled with '-' placeholders
        # for the prompting baseline.
        row = [
            task, METHOD, dataset, lr_setting, split,
            '-', '-', '-',
            *extra,
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
    except Exception:
        # Best effort: skip folders with missing/malformed metrics or too few
        # name fields, without trapping KeyboardInterrupt/SystemExit.
        continue

    runs.append(row)

# Multi-label Classification
METHOD = 'bert_clf'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Folder-name fields first, then the three metrics, then the method
        # name right after the task so the row follows col_names.
        row = folder_name.split('_')
        row.extend([
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ])
        row.insert(1, METHOD)

        # Index 3 is the lr_setting column; '0' is normalised to 'full'.
        if row[3] == '0':
            row[3] = 'full'
    except Exception:
        # Best effort: skip folders without a readable metrics_asp_pol.tsv
        # (presumably runs of other tasks - TODO confirm), without trapping
        # KeyboardInterrupt/SystemExit.
        continue

    runs.append(row)

# Hier-GCN
METHOD = 'hier_gcn'

folder_names = [
    folder
    for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD))
    if folder != '.ipynb_checkpoints' and os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder))
]

for folder_name in folder_names:
    cond_params = folder_name.split('_')
    if cond_params[0] != 'acsa':
        # Only ACSA runs are collected in this cell.
        continue

    eval_file = os.path.join(RESULTS_PATH, METHOD, folder_name, 'eval_results.txt')
    with open(eval_file, 'r') as f:
        # The score is read from the fourth line, formatted as "<name> = <value>".
        f1 = f.readlines()[3].split(' = ')[1]

    # Put the method name right after the task so the row follows col_names.
    cond_params.insert(1, METHOD)
    if cond_params[3] == '0':
        cond_params[3] = 'full'

    # Only one score is available; it fills 'f1-micro', the other two metrics stay empty.
    cond_params += [round(float(f1), 4), None, None]
    runs.append(cond_params)


results_baseline = pd.DataFrame(runs, columns = col_names)

# Hand both tables to the statistics helpers via the shared args namespace.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [132]:
# lr_setting 0 is mapped to the 'full' setting inside computePromptStatistics.
args.lr_setting = 0
args.task = 'acsa'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
174,acsa,hier_gcn,rest-16,full,4,5e-05,8,20.0,0.7577,,
35,acsa,prompting,rest-16,full,4,-,-,-,0.7461,0.615,0.595
4,acsa,prompting,rest-16,full,2,-,-,-,0.7424,0.6021,0.5903
136,acsa,hier_gcn,rest-16,full,5,5e-05,8,20.0,0.7318,,
11,acsa,prompting,rest-16,full,3,-,-,-,0.7273,0.5926,0.5714
150,acsa,hier_gcn,rest-16,full,1,5e-05,8,20.0,0.7182,,
6,acsa,prompting,rest-16,full,1,-,-,-,0.7153,0.6059,0.5568
163,acsa,hier_gcn,rest-16,full,2,5e-05,8,20.0,0.7088,,
164,acsa,hier_gcn,rest-16,full,3,5e-05,8,20.0,0.704,,
5,acsa,prompting,rest-16,full,5,-,-,-,0.6998,0.5778,0.5382


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.8038,0.8407,0.7704,0.7153,0.7182,0.5276
2,0.8267,0.8242,0.8108,0.7424,0.7088,0.4063
3,0.7688,0.7827,0.7809,0.7273,0.704,0.5627
4,0.8352,0.8237,0.8075,0.7461,0.7577,0.5331
5,0.8015,0.7638,0.7873,0.6998,0.7318,0.5324


Unnamed: 0,W,pval,normal
basic,0.939847,0.664856,True


Unnamed: 0,W,pval,normal
prompting,0.943664,0.691948,True


Unnamed: 0,W,pval,normal
hier_gcn,0.913912,0.491452,True


Unnamed: 0,W,pval,normal
bert_clf,0.746507,0.027618,False


    split     prompt      f1
0       1      basic  0.8038
1       2      basic  0.8267
2       3      basic  0.7688
3       4      basic  0.8352
4       5      basic  0.8015
5       1  prompting  0.7153
6       2  prompting  0.7424
7       3  prompting  0.7273
8       4  prompting  0.7461
9       5  prompting  0.6998
10      1   hier_gcn  0.7182
11      2   hier_gcn  0.7088
12      3   hier_gcn  0.7040
13      4   hier_gcn  0.7577
14      5   hier_gcn  0.7318
15      1   bert_clf  0.5276
16      2   bert_clf  0.4063
17      3   bert_clf  0.5627
18      4   bert_clf  0.5331
19      5   bert_clf  0.5324
Friedman Test Result:
          Source      W  ddof1      Q    p-unc
Friedman  Within  0.904      3  13.56  0.00357


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.8072,0.023164,0.72618,0.017178,0.0,0.0625,0.375,False
1,wilcoxon,basic vs hier_gcn,0.8072,0.023164,0.7241,0.019293,0.0,0.0625,0.375,False
2,wilcoxon,basic vs bert_clf,0.8072,0.023164,0.51242,0.054492,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs hier_gcn,0.72618,0.017178,0.7241,0.019293,7.0,1.0,1.0,False
4,wilcoxon,prompting vs bert_clf,0.72618,0.017178,0.51242,0.054492,0.0,0.0625,0.375,False
5,wilcoxon,hier_gcn vs bert_clf,0.7241,0.019293,0.51242,0.054492,0.0,0.0625,0.375,False


### 1000

In [133]:
# Non-zero settings are converted to their string form ('1000') inside
# computePromptStatistics before the result tables are filtered.
args.lr_setting = 1000
args.task = 'acsa'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
45,acsa,prompting,rest-16,1000,4,-,-,-,0.7442,0.652,0.5926
28,acsa,prompting,rest-16,1000,2,-,-,-,0.74,0.6167,0.5872
143,acsa,hier_gcn,rest-16,1000,4,5e-05,8,20.0,0.7307,,
167,acsa,hier_gcn,rest-16,1000,1,5e-05,8,20.0,0.728,,
12,acsa,prompting,rest-16,1000,1,-,-,-,0.7273,0.6169,0.5714
41,acsa,prompting,rest-16,1000,5,-,-,-,0.7117,0.5621,0.5524
172,acsa,hier_gcn,rest-16,1000,5,5e-05,8,20.0,0.7044,,
158,acsa,hier_gcn,rest-16,1000,2,5e-05,8,20.0,0.7007,,
52,acsa,prompting,rest-16,1000,3,-,-,-,0.6965,0.5243,0.5343
173,acsa,hier_gcn,rest-16,1000,3,5e-05,8,20.0,0.6903,,


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.8005,0.7934,0.7956,0.7273,0.728,0.4313
2,0.8162,0.8025,0.8162,0.74,0.7007,0.4487
3,0.8015,0.799,0.7729,0.6965,0.6903,0.3361
4,0.8009,0.8029,0.8111,0.7442,0.7307,0.5071
5,0.747,0.8111,0.7864,0.7117,0.7044,0.3825


Unnamed: 0,W,pval,normal
context,0.969023,0.868949,True


Unnamed: 0,W,pval,normal
prompting,0.938324,0.654113,True


Unnamed: 0,W,pval,normal
hier_gcn,0.891004,0.362177,True


Unnamed: 0,W,pval,normal
bert_clf,0.989841,0.979135,True


    split     prompt      f1
0       1    context  0.7934
1       2    context  0.8025
2       3    context  0.7990
3       4    context  0.8029
4       5    context  0.8111
5       1  prompting  0.7273
6       2  prompting  0.7400
7       3  prompting  0.6965
8       4  prompting  0.7442
9       5  prompting  0.7117
10      1   hier_gcn  0.7280
11      2   hier_gcn  0.7007
12      3   hier_gcn  0.6903
13      4   hier_gcn  0.7307
14      5   hier_gcn  0.7044
15      1   bert_clf  0.4313
16      2   bert_clf  0.4487
17      3   bert_clf  0.3361
18      4   bert_clf  0.5071
19      5   bert_clf  0.3825
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      3     12  162.102932  5.660072e-10  0.954457  0.378387


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,context vs prompting,0.80178,0.005772,0.72394,0.01779,8.176868,0.001218,0.002436,True
1,t-test,context vs hier_gcn,0.80178,0.005772,0.71082,0.015843,9.909191,0.000582,0.001747,True
2,t-test,context vs bert_clf,0.80178,0.005772,0.42114,0.05827,12.931385,0.000206,0.001031,True
3,t-test,prompting vs hier_gcn,0.72394,0.01779,0.71082,0.015843,1.8954,0.130932,0.130932,False
4,t-test,prompting vs bert_clf,0.72394,0.01779,0.42114,0.05827,14.679584,0.000125,0.000752,True
5,t-test,hier_gcn vs bert_clf,0.71082,0.015843,0.42114,0.05827,12.331505,0.000248,0.001031,True


### 500

In [134]:
# Non-zero settings are converted to their string form ('500') inside
# computePromptStatistics before the result tables are filtered.
args.lr_setting = 500
args.task = 'acsa'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
3,acsa,prompting,rest-16,500,4,-,-,-,0.7601,0.6719,0.6131
17,acsa,prompting,rest-16,500,1,-,-,-,0.7406,0.6545,0.588
20,acsa,prompting,rest-16,500,3,-,-,-,0.7202,0.6127,0.5627
16,acsa,prompting,rest-16,500,2,-,-,-,0.7158,0.6049,0.5573
23,acsa,prompting,rest-16,500,5,-,-,-,0.6996,0.5533,0.538
168,acsa,hier_gcn,rest-16,500,1,5e-05,8,20.0,0.6506,,
161,acsa,hier_gcn,rest-16,500,2,5e-05,8,20.0,0.6446,,
166,acsa,hier_gcn,rest-16,500,5,5e-05,8,20.0,0.6375,,
145,acsa,hier_gcn,rest-16,500,4,5e-05,8,20.0,0.6241,,
153,acsa,hier_gcn,rest-16,500,3,5e-05,8,20.0,0.6227,,


Unnamed: 0,basic,context,cot,prompting,hier_gcn,bert_clf
1,0.7713,0.7619,0.7285,0.7406,0.6506,0.5028
2,0.7537,0.7785,0.747,0.7158,0.6446,0.4739
3,0.7573,0.7551,0.7792,0.7202,0.6227,0.4597
4,0.8155,0.7589,0.7569,0.7601,0.6241,0.3142
5,0.7837,0.7807,0.7474,0.6996,0.6375,0.4093


Unnamed: 0,W,pval,normal
basic,0.904757,0.436735,True


Unnamed: 0,W,pval,normal
prompting,0.968872,0.867967,True


Unnamed: 0,W,pval,normal
hier_gcn,0.907903,0.455095,True


Unnamed: 0,W,pval,normal
bert_clf,0.905928,0.443509,True


    split     prompt      f1
0       1      basic  0.7713
1       2      basic  0.7537
2       3      basic  0.7573
3       4      basic  0.8155
4       5      basic  0.7837
5       1  prompting  0.7406
6       2  prompting  0.7158
7       3  prompting  0.7202
8       4  prompting  0.7601
9       5  prompting  0.6996
10      1   hier_gcn  0.6506
11      2   hier_gcn  0.6446
12      3   hier_gcn  0.6227
13      4   hier_gcn  0.6241
14      5   hier_gcn  0.6375
15      1   bert_clf  0.5028
16      2   bert_clf  0.4739
17      3   bert_clf  0.4597
18      4   bert_clf  0.3142
19      5   bert_clf  0.4093
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc       ng2       eps
0  prompt      3     12  58.687767  1.924426e-07  0.927293  0.366337


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.7763,0.022308,0.72726,0.020989,5.067977,0.007141,0.007141,True
1,t-test,basic vs hier_gcn,0.7763,0.022308,0.6359,0.011026,9.88333,0.000588,0.003529,True
2,t-test,basic vs bert_clf,0.7763,0.022308,0.43198,0.066223,7.936739,0.001364,0.006822,True
3,t-test,prompting vs hier_gcn,0.72726,0.020989,0.6359,0.011026,7.11792,0.002059,0.006822,True
4,t-test,prompting vs bert_clf,0.72726,0.020989,0.43198,0.066223,7.614685,0.001597,0.006822,True
5,t-test,hier_gcn vs bert_clf,0.6359,0.011026,0.43198,0.066223,6.845276,0.002383,0.006822,True


In [154]:
args.task = 'acsa'

# computeLowResourceStatistics is defined elsewhere in this notebook.
# NOTE(review): judging by the printed output it compares the 1000 / 500 / full
# settings per prompt - confirm against its definition.
computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8005,0.7713,0.8038
2,0.8162,0.7537,0.8267
3,0.8015,0.7573,0.7688
4,0.8009,0.8155,0.8352
5,0.747,0.7837,0.8015


Unnamed: 0,W,pval,normal
1000,0.751384,0.030653,False


Unnamed: 0,W,pval,normal
500,0.904757,0.436735,True


Unnamed: 0,W,pval,normal
full,0.939847,0.664856,True


    split prompt      f1
0       1   1000  0.8005
1       2   1000  0.8162
2       3   1000  0.8015
3       4   1000  0.8009
4       5   1000  0.7470
5       1    500  0.7713
6       2    500  0.7537
7       3    500  0.7573
8       4    500  0.8155
9       5    500  0.7837
10      1   full  0.8038
11      2   full  0.8267
12      3   full  0.7688
13      4   full  0.8352
14      5   full  0.8015
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of :  basic


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.79322,0.023853,0.7763,0.022308,4.0,0.4375,0.625,False
1,wilcoxon,1000 vs full,0.79322,0.023853,0.8072,0.023164,3.0,0.3125,0.625,False
2,wilcoxon,500 vs full,0.7763,0.022308,0.8072,0.023164,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7934,0.7619,0.8407
2,0.8025,0.7785,0.8242
3,0.799,0.7551,0.7827
4,0.8029,0.7589,0.8237
5,0.8111,0.7807,0.7638


Unnamed: 0,W,pval,normal
1000,0.969023,0.868949,True


Unnamed: 0,W,pval,normal
500,0.854828,0.210266,True


Unnamed: 0,W,pval,normal
full,0.896665,0.391727,True


    split prompt      f1
0       1   1000  0.7934
1       2   1000  0.8025
2       3   1000  0.7990
3       4   1000  0.8029
4       5   1000  0.8111
5       1    500  0.7619
6       2    500  0.7785
7       3    500  0.7551
8       4    500  0.7589
9       5    500  0.7807
10      1   full  0.8407
11      2   full  0.8242
12      3   full  0.7827
13      4   full  0.8237
14      5   full  0.7638
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  4.990179  0.039189  0.491641  0.543191
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80178,0.005772,0.76702,0.010518,8.767999,0.000933,0.002798,True
1,t-test,1000 vs full,0.80178,0.005772,0.80702,0.028869,-0.315797,0.767948,0.767948,False
2,t-test,500 vs full,0.76702,0.010518,0.80702,0.028869,-2.402426,0.074161,0.148322,False


Unnamed: 0,1000,500,full
1,0.7956,0.7285,0.7704
2,0.8162,0.747,0.8108
3,0.7729,0.7792,0.7809
4,0.8111,0.7569,0.8075
5,0.7864,0.7474,0.7873


Unnamed: 0,W,pval,normal
1000,0.957089,0.78758,True


Unnamed: 0,W,pval,normal
500,0.956087,0.780508,True


Unnamed: 0,W,pval,normal
full,0.919719,0.528128,True


    split prompt      f1
0       1   1000  0.7956
1       2   1000  0.8162
2       3   1000  0.7729
3       4   1000  0.8111
4       5   1000  0.7864
5       1    500  0.7285
6       2    500  0.7470
7       3    500  0.7792
8       4    500  0.7569
9       5    500  0.7474
10      1   full  0.7704
11      2   full  0.8108
12      3   full  0.7809
13      4   full  0.8075
14      5   full  0.7873
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2      eps
0  prompt      2      8  10.882965  0.005218  0.609582  0.58361
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.79644,0.015881,0.7518,0.016505,3.227616,0.032048,0.064096,False
1,t-test,1000 vs full,0.79644,0.015881,0.79138,0.015515,0.913268,0.412769,0.412769,False
2,t-test,500 vs full,0.7518,0.016505,0.79138,0.015515,-3.818831,0.018797,0.056391,False


Unnamed: 0,1000,500,full
1,0.7934,0.7713,0.8038
2,0.8025,0.7537,0.8267
3,0.799,0.7573,0.7688
4,0.8029,0.8155,0.8352
5,0.8111,0.7837,0.8015


Unnamed: 0,W,pval,normal
1000,0.969023,0.868949,True


Unnamed: 0,W,pval,normal
500,0.904757,0.436735,True


Unnamed: 0,W,pval,normal
full,0.939847,0.664856,True


    split prompt      f1
0       1   1000  0.7934
1       2   1000  0.8025
2       3   1000  0.7990
3       4   1000  0.8029
4       5   1000  0.8111
5       1    500  0.7713
6       2    500  0.7537
7       3    500  0.7573
8       4    500  0.8155
9       5    500  0.7837
10      1   full  0.8038
11      2   full  0.8267
12      3   full  0.7688
13      4   full  0.8352
14      5   full  0.8015
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  4.466001  0.049834  0.337763  0.994193
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.80178,0.005772,0.7763,0.022308,2.390562,0.075122,0.150244,False
1,t-test,1000 vs full,0.80178,0.005772,0.8072,0.023164,-0.475877,0.658987,0.658987,False
2,t-test,500 vs full,0.7763,0.022308,0.8072,0.023164,-2.792763,0.049173,0.14752,False


## E2E

In [155]:
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metrics file to read for a run, keyed by the task prefix of its folder name.
metrics_file_by_task = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

# Collect one row per fine-tuned run. The '_'-separated fields of the folder
# name are expected to line up with the leading entries of col_names; the
# remaining columns (model_config, path and the three metrics) are appended here.
for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    filename = metrics_file_by_task.get(cond_parameters[0])
    if filename is None:
        # Unknown task prefix: there is no metrics file to read for this folder.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # Configuration id shared by all splits of the same setup:
        # drop the split field (index 7) and the trailing epoch field.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        row = cond_parameters + [
            '_'.join(model_config),
            folder_name,  # 'path': re-joining the split fields just yields the folder name
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
    except Exception:
        # Best effort: skip runs whose metrics are missing or malformed.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        continue

    runs.append(row)

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    parts = folder_name.split('_')
    if parts[0] != 'e2e':
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # At least six '_'-separated fields are required; the third and the
        # sixth are not part of the baseline table and are dropped.
        task, dataset, _, split, lr_setting, _, *extra = parts

        # learning_rate, batch_size and epochs are filled with '-' placeholders
        # for the prompting baseline.
        row = [
            task, METHOD, dataset, lr_setting, split,
            '-', '-', '-',
            *extra,
            df.loc['Micro-AVG', 'f1'],
            df.loc['Macro-AVG', 'f1'],
            df.loc['Micro-AVG', 'accuracy'],
        ]
    except Exception:
        # Best effort: skip folders with missing/malformed metrics or too few
        # name fields, without trapping KeyboardInterrupt/SystemExit.
        continue

    runs.append(row)

# InstructABSA
METHOD = 'instructABSA'

filenames = [file for file in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if file != '.ipynb_checkpoints']

for file in filenames:
    try:
        with open(os.path.join(RESULTS_PATH, METHOD, file), 'r') as f:
            # The score is the second tab-separated field of the last line.
            f1 = f.readlines()[-1].split('\t')[1]

        # The run configuration is encoded in the file name (without '.tsv').
        # Only one score is available, so the last two metric columns stay empty.
        row = ['e2e', METHOD, *file.split('.tsv')[0].split('_'), round(float(f1), 4), None, None]
        row.insert(6, 8)  # Batch size (fixed value, not encoded in the file name)
    except Exception:
        # Best effort: skip unreadable or malformed result files without
        # trapping KeyboardInterrupt/SystemExit.
        continue

    runs.append(row)


# TAS-BERT

METHOD = 'tas_bert'
RESULTS_PATH = '../results/'

# Only consider run directories, as the other baseline loaders do; a stray file
# in the method folder would otherwise make the open() below fail.
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, METHOD, folder_name, 'results.txt'), 'r') as results_file:
        last_line = results_file.readlines()[-1]

    # The last line must hold exactly four tab-separated fields; only the final
    # one (F1) is used. The first three were named epoch, p and r originally
    # (presumably epoch, precision, recall - TODO confirm).
    _epoch, _p, _r, f1 = last_line.strip().split('\t')

    # Task and method first, then the folder-name fields, then the metrics.
    # Only one score is available, so the last two metric columns stay empty.
    row = ['e2e', METHOD, *folder_name.split('_'), round(float(f1), 4), None, None]
    runs.append(row)


results_baseline = pd.DataFrame(runs, columns = col_names)

# Hand both tables to the statistics helpers via the shared args namespace.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [137]:
# lr_setting 0 is mapped to the 'full' setting inside computePromptStatistics.
args.lr_setting = 0
args.task = 'e2e'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
60,e2e,instructABSA,rest-16,full,2,5e-05,8,4.0,0.7679,,
95,e2e,instructABSA,rest-16,full,4,5e-05,8,4.0,0.7648,,
63,e2e,instructABSA,rest-16,full,5,5e-05,8,4.0,0.76,,
70,e2e,instructABSA,rest-16,full,3,5e-05,8,4.0,0.7475,,
57,e2e,instructABSA,rest-16,full,1,5e-05,8,4.0,0.7413,,
115,e2e,tas_bert,rest-16,full,4,2e-05,24,27.0,0.7223,,
100,e2e,tas_bert,rest-16,full,3,2e-05,24,27.0,0.7219,,
118,e2e,tas_bert,rest-16,full,1,2e-05,24,27.0,0.7132,,
108,e2e,tas_bert,rest-16,full,5,2e-05,24,27.0,0.7025,,
104,e2e,tas_bert,rest-16,full,2,2e-05,24,27.0,0.6883,,


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7434,0.7594,0.7612,0.5887,0.7413,0.7132
2,0.7986,0.8074,0.7761,0.5884,0.7679,0.6883
3,0.8067,0.785,0.7807,0.5867,0.7475,0.7219
4,0.837,0.8235,0.8113,0.6383,0.7648,0.7223
5,0.8177,0.8222,0.7714,0.6609,0.76,0.7025


Unnamed: 0,W,pval,normal
basic,0.905596,0.441584,True


Unnamed: 0,W,pval,normal
prompting,0.780965,0.05618,True


Unnamed: 0,W,pval,normal
instructABSA,0.913371,0.488109,True


Unnamed: 0,W,pval,normal
tas_bert,0.89703,0.393689,True


    split        prompt      f1
0       1         basic  0.7434
1       2         basic  0.7986
2       3         basic  0.8067
3       4         basic  0.8370
4       5         basic  0.8177
5       1     prompting  0.5887
6       2     prompting  0.5884
7       3     prompting  0.5867
8       4     prompting  0.6383
9       5     prompting  0.6609
10      1  instructABSA  0.7413
11      2  instructABSA  0.7679
12      3  instructABSA  0.7475
13      4  instructABSA  0.7648
14      5  instructABSA  0.7600
15      1      tas_bert  0.7132
16      2      tas_bert  0.6883
17      3      tas_bert  0.7219
18      4      tas_bert  0.7223
19      5      tas_bert  0.7025
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F         p-unc      ng2       eps
0  prompt      3     12  66.213867  9.776963e-08  0.89761  0.794682


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.80068,0.031402,0.6126,0.031052,13.802521,0.00016,0.000958,True
1,t-test,basic vs instructABSA,0.80068,0.031402,0.7563,0.010227,3.538672,0.024042,0.024042,True
2,t-test,basic vs tas_bert,0.80068,0.031402,0.70964,0.012881,5.616495,0.004939,0.016805,True
3,t-test,prompting vs instructABSA,0.6126,0.031052,0.7563,0.010227,-10.240463,0.000513,0.002563,True
4,t-test,prompting vs tas_bert,0.6126,0.031052,0.70964,0.012881,-5.872025,0.004201,0.016805,True
5,t-test,instructABSA vs tas_bert,0.7563,0.010227,0.70964,0.012881,4.656868,0.009612,0.019224,True


### 1000

In [138]:
# Non-zero settings are converted to their string form ('1000') inside
# computePromptStatistics before the result tables are filtered.
args.lr_setting = 1000
args.task = 'e2e'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
89,e2e,instructABSA,rest-16,1000,3,5e-05,8,7,0.7637,,
88,e2e,instructABSA,rest-16,1000,1,5e-05,8,7,0.7607,,
58,e2e,instructABSA,rest-16,1000,2,5e-05,8,7,0.7452,,
80,e2e,instructABSA,rest-16,1000,4,5e-05,8,7,0.7431,,
78,e2e,instructABSA,rest-16,1000,5,5e-05,8,7,0.7349,,
125,e2e,tas_bert,rest-16,1000,4,2e-05,24,22.0,0.705,,
31,e2e,prompting,rest-16,1000,5,-,-,-,0.694,0.6539,0.5314
109,e2e,tas_bert,rest-16,1000,1,2e-05,24,22.0,0.6834,,
117,e2e,tas_bert,rest-16,1000,5,2e-05,24,22.0,0.6822,,
113,e2e,tas_bert,rest-16,1000,2,2e-05,24,22.0,0.674,,


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7372,0.7485,0.704,0.6084,0.7607,0.6834
2,0.7985,0.7527,0.7305,0.6348,0.7452,0.674
3,0.7769,0.7442,0.7175,0.6387,0.7637,0.6349
4,0.7952,0.7875,0.7598,0.6515,0.7431,0.705
5,0.7914,0.797,0.0048,0.694,0.7349,0.6822


Unnamed: 0,W,pval,normal
basic,0.801455,0.083273,True


Unnamed: 0,W,pval,normal
prompting,0.945566,0.705507,True


Unnamed: 0,W,pval,normal
instructABSA,0.912259,0.481285,True


Unnamed: 0,W,pval,normal
tas_bert,0.90863,0.459404,True


    split        prompt      f1
0       1         basic  0.7372
1       2         basic  0.7985
2       3         basic  0.7769
3       4         basic  0.7952
4       5         basic  0.7914
5       1     prompting  0.6084
6       2     prompting  0.6348
7       3     prompting  0.6387
8       4     prompting  0.6515
9       5     prompting  0.6940
10      1  instructABSA  0.7607
11      2  instructABSA  0.7452
12      3  instructABSA  0.7637
13      4  instructABSA  0.7431
14      5  instructABSA  0.7349
15      1      tas_bert  0.6834
16      2      tas_bert  0.6740
17      3      tas_bert  0.6349
18      4      tas_bert  0.7050
19      5      tas_bert  0.6822
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      3     12  32.260751  0.000005  0.858186  0.810784


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.77984,0.022559,0.64548,0.028019,12.369438,0.000246,0.001473,True
1,t-test,basic vs instructABSA,0.77984,0.022559,0.74952,0.010952,1.941531,0.124159,0.248318,False
2,t-test,basic vs tas_bert,0.77984,0.022559,0.6759,0.022923,6.852296,0.002374,0.011871,True
3,t-test,prompting vs instructABSA,0.64548,0.028019,0.74952,0.010952,-5.581055,0.005054,0.020215,True
4,t-test,prompting vs tas_bert,0.64548,0.028019,0.6759,0.022923,-1.826,0.141885,0.248318,False
5,t-test,instructABSA vs tas_bert,0.74952,0.010952,0.6759,0.022923,4.770298,0.008838,0.026514,True


### 500

In [139]:
# Non-zero settings are converted to their string form ('500') inside
# computePromptStatistics before the result tables are filtered.
args.lr_setting = 500
args.task = 'e2e'

# Store the result under the lr-setting key; the bare expression on the last
# line makes the notebook display it.
stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
72,e2e,instructABSA,rest-16,500,1,5e-05,8,14,0.7726,,
92,e2e,instructABSA,rest-16,500,4,5e-05,8,14,0.753,,
75,e2e,instructABSA,rest-16,500,2,5e-05,8,14,0.7521,,
61,e2e,instructABSA,rest-16,500,3,5e-05,8,14,0.7395,,
76,e2e,instructABSA,rest-16,500,5,5e-05,8,14,0.7262,,
17,e2e,prompting,rest-16,500,4,-,-,-,0.6176,0.5294,0.4468
55,e2e,prompting,rest-16,500,2,-,-,-,0.6124,0.4855,0.4413
112,e2e,tas_bert,rest-16,500,4,2e-05,24,8.0,0.6102,,
102,e2e,tas_bert,rest-16,500,3,2e-05,24,8.0,0.6074,,
131,e2e,tas_bert,rest-16,500,5,2e-05,24,8.0,0.6069,,


Unnamed: 0,basic,context,cot,prompting,instructABSA,tas_bert
1,0.7037,0.7099,0.511,0.5834,0.7726,0.5622
2,0.7426,0.7546,0.5803,0.6124,0.7521,0.6037
3,0.7415,0.7419,0.5737,0.5457,0.7395,0.6074
4,0.7463,0.728,0.6649,0.6176,0.753,0.6102
5,0.7625,0.7352,0.6812,0.5875,0.7262,0.6069


Unnamed: 0,W,pval,normal
basic,0.865732,0.249568,True


Unnamed: 0,W,pval,normal
prompting,0.916942,0.510405,True


Unnamed: 0,W,pval,normal
instructABSA,0.975255,0.907779,True


Unnamed: 0,W,pval,normal
tas_bert,0.659647,0.00345,False


    split        prompt      f1
0       1         basic  0.7037
1       2         basic  0.7426
2       3         basic  0.7415
3       4         basic  0.7463
4       5         basic  0.7625
5       1     prompting  0.5834
6       2     prompting  0.6124
7       3     prompting  0.5457
8       4     prompting  0.6176
9       5     prompting  0.5875
10      1  instructABSA  0.7726
11      2  instructABSA  0.7521
12      3  instructABSA  0.7395
13      4  instructABSA  0.7530
14      5  instructABSA  0.7262
15      1      tas_bert  0.5622
16      2      tas_bert  0.6037
17      3      tas_bert  0.6074
18      4      tas_bert  0.6102
19      5      tas_bert  0.6069
Friedman Test Result:
          Source      W  ddof1      Q     p-unc
Friedman  Within  0.808      3  12.12  0.006983


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.73932,0.019341,0.58932,0.025587,0.0,0.0625,0.375,False
1,wilcoxon,basic vs instructABSA,0.73932,0.019341,0.74868,0.015441,5.0,0.625,1.0,False
2,wilcoxon,basic vs tas_bert,0.73932,0.019341,0.59808,0.018058,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs instructABSA,0.58932,0.025587,0.74868,0.015441,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs tas_bert,0.58932,0.025587,0.59808,0.018058,7.0,1.0,1.0,False
5,wilcoxon,instructABSA vs tas_bert,0.74868,0.015441,0.59808,0.018058,0.0,0.0625,0.375,False


In [156]:
args.task = 'e2e'

# computeLowResourceStatistics is defined elsewhere in this notebook.
# NOTE(review): judging by the printed output it compares the 1000 / 500 / full
# settings per prompt - confirm against its definition.
computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7372,0.7037,0.7434
2,0.7985,0.7426,0.7986
3,0.7769,0.7415,0.8067
4,0.7952,0.7463,0.837
5,0.7914,0.7625,0.8177


Unnamed: 0,W,pval,normal
1000,0.801455,0.083273,True


Unnamed: 0,W,pval,normal
500,0.865732,0.249568,True


Unnamed: 0,W,pval,normal
full,0.905596,0.441584,True


    split prompt      f1
0       1   1000  0.7372
1       2   1000  0.7985
2       3   1000  0.7769
3       4   1000  0.7952
4       5   1000  0.7914
5       1    500  0.7037
6       2    500  0.7426
7       3    500  0.7415
8       4    500  0.7463
9       5    500  0.7625
10      1   full  0.7434
11      2   full  0.7986
12      3   full  0.8067
13      4   full  0.8370
14      5   full  0.8177
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2     eps
0  prompt      2      8  37.412673  0.000087  0.510222  0.7889
Results for LR-Comparison of :  basic


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.77984,0.022559,0.73932,0.019341,7.96743,0.001345,0.004034,True
1,t-test,1000 vs full,0.77984,0.022559,0.80068,0.031402,-2.697306,0.054246,0.054246,False
2,t-test,500 vs full,0.73932,0.019341,0.80068,0.031402,-7.304213,0.001868,0.004034,True


Unnamed: 0,1000,500,full
1,0.7485,0.7099,0.7594
2,0.7527,0.7546,0.8074
3,0.7442,0.7419,0.785
4,0.7875,0.728,0.8235
5,0.797,0.7352,0.8222


Unnamed: 0,W,pval,normal
1000,0.831693,0.143237,True


Unnamed: 0,W,pval,normal
500,0.989374,0.977456,True


Unnamed: 0,W,pval,normal
full,0.895268,0.384285,True


    split prompt      f1
0       1   1000  0.7485
1       2   1000  0.7527
2       3   1000  0.7442
3       4   1000  0.7875
4       5   1000  0.7970
5       1    500  0.7099
6       2    500  0.7546
7       3    500  0.7419
8       4    500  0.7280
9       5    500  0.7352
10      1   full  0.7594
11      2   full  0.8074
12      3   full  0.7850
13      4   full  0.8235
14      5   full  0.8222
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  18.203258  0.001053  0.624767  0.707079
Results for LR-Comparison of :  context


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.76598,0.021825,0.73392,0.014866,2.351028,0.078427,0.078427,False
1,t-test,1000 vs full,0.76598,0.021825,0.7995,0.02438,-4.543382,0.01047,0.02094,True
2,t-test,500 vs full,0.73392,0.014866,0.7995,0.02438,-6.140423,0.003566,0.010699,True


Unnamed: 0,1000,500,full
1,0.704,0.511,0.7612
2,0.7305,0.5803,0.7761
3,0.7175,0.5737,0.7807
4,0.7598,0.6649,0.8113
5,0.0048,0.6812,0.7714


Unnamed: 0,W,pval,normal
1000,0.61483,0.000981,False


Unnamed: 0,W,pval,normal
500,0.923136,0.55037,True


Unnamed: 0,W,pval,normal
full,0.887972,0.347007,True


    split prompt      f1
0       1   1000  0.7040
1       2   1000  0.7305
2       3   1000  0.7175
3       4   1000  0.7598
4       5   1000  0.0048
5       1    500  0.5110
6       2    500  0.5803
7       3    500  0.5737
8       4    500  0.6649
9       5    500  0.6812
10      1   full  0.7612
11      2   full  0.7761
12      3   full  0.7807
13      4   full  0.8113
14      5   full  0.7714
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.58332,0.289848,0.60222,0.062899,5.0,0.625,0.625,False
1,wilcoxon,1000 vs full,0.58332,0.289848,0.78014,0.016868,0.0,0.0625,0.1875,False
2,wilcoxon,500 vs full,0.60222,0.062899,0.78014,0.016868,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7372,0.7037,0.7434
2,0.7985,0.7426,0.7986
3,0.7769,0.7415,0.8067
4,0.7952,0.7463,0.837
5,0.7914,0.7625,0.8177


Unnamed: 0,W,pval,normal
1000,0.801455,0.083273,True


Unnamed: 0,W,pval,normal
500,0.865732,0.249568,True


Unnamed: 0,W,pval,normal
full,0.905596,0.441584,True


    split prompt      f1
0       1   1000  0.7372
1       2   1000  0.7985
2       3   1000  0.7769
3       4   1000  0.7952
4       5   1000  0.7914
5       1    500  0.7037
6       2    500  0.7426
7       3    500  0.7415
8       4    500  0.7463
9       5    500  0.7625
10      1   full  0.7434
11      2   full  0.7986
12      3   full  0.8067
13      4   full  0.8370
14      5   full  0.8177
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2     eps
0  prompt      2      8  37.412673  0.000087  0.510222  0.7889
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.77984,0.022559,0.73932,0.019341,7.96743,0.001345,0.004034,True
1,t-test,1000 vs full,0.77984,0.022559,0.80068,0.031402,-2.697306,0.054246,0.054246,False
2,t-test,500 vs full,0.73932,0.019341,0.80068,0.031402,-7.304213,0.001868,0.004034,True


## E2E - without Implicit

In [38]:
# Collect one row per run folder below RESULTS_PATH: the hyper-parameters are parsed
# from the underscore-separated folder name, the scores are read from metrics_pol.tsv.
# NOTE(review): the computePromptStatistics shown at the top of this notebook selects a
# 'learning_rate' column, while this frame names it 'lr' - confirm this cell still
# matches the helper that is in scope when it is run.
runs = []
RESULTS_PATH = '../results_final/filtered/'
col_names = ['model_lang', 'dataset', 'model_shots', 'model_prompt', 'model_task', 'lr', 'lora_r', 'lora_alpha', 'lora_dropout', 'model_quant', 'split', 'lr_setting', 'model_name', 'lang', 'shots', 'prompt', 'task', 'quant', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_pol.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        # 'path' column: every name field joined back together.
        model_config_full = cond_parameters.copy()

        # 'model_config' column: the same fields without the split (index 10) and
        # without the trailing epoch, so all splits of one configuration share an id.
        model_config = cond_parameters.copy()
        model_config.pop(10)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception as err:
        # Loading stays best-effort (unfinished or malformed runs are skipped), but the
        # reason is reported instead of being swallowed, and KeyboardInterrupt is no
        # longer caught.
        print(f"Skipping '{folder_name}': {err!r}")

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

# InstructABSA
# One row per '*.tsv' metrics file; the run settings are parsed from the file name.
# NOTE(review): the computePromptStatistics shown at the top of this notebook filters the
# baselines on an 'lr_setting' column, while this frame names it 'lr-setting' - confirm
# this cell still matches the helper that is in scope when it is run.
METHOD = 'instructAbsa'
RESULTS_PATH = '../../../ABSA-Baselines/InstructABSA-Custom/Output_filtered'
runs = []

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

file_names = [file for file in os.listdir(RESULTS_PATH) if len(file.split('.tsv')) > 1 and file != '.ipynb_checkpoints']

for file_name in file_names:
    metrics_dict = {}

    with open(os.path.join(RESULTS_PATH, file_name), 'r') as file:
        for line in file:
            line = line.strip()
            # A blank (e.g. trailing) line carries no metric; skip it instead of
            # failing on the two-value unpack below.
            if not line:
                continue
            # Every remaining line is "<metric name><TAB><value>".
            key, value = line.split('\t')
            metrics_dict[key.strip()] = float(value.strip())

    cond_name = file_name.split('.tsv')[0]
    cond_parameters = cond_name.split('_')

    # Only the F1 score is available for this baseline; f1-macro and accuracy stay empty.
    cond_parameters.append(metrics_dict['F1-Score'])
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method
    cond_parameters.insert(6, 8)       # Batch Size

    # The full-data setting is encoded as '0' in the 'lr-setting' column.
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# TAS-BERT
# One row per run folder; the rows are appended to the `runs` list started for the
# baselines above.

METHOD = 'tas-bert'
RESULTS_PATH = '../../../ABSA-Baselines/TAS-BERT-Custom/results_filtered/rest-16/three_joint/BIO'

col_names = ['task', 'method', 'dataset', 'lr-setting', 'split', 'learning-rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

folder_names = [file for file in os.listdir(RESULTS_PATH) if file != '.ipynb_checkpoints']

for folder_name in folder_names:
    with open(os.path.join(RESULTS_PATH, folder_name, 'results.txt'), 'r') as file:
        # Drop blank lines so that an empty line at the end of the file does not
        # take the place of the last result row.
        result_lines = [line.strip() for line in file if line.strip()]

    # The last row holds four tab-separated fields; only the F1 value is used below.
    epoch, p, r, f1 = result_lines[-1].split('\t')

    cond_parameters = folder_name.split('_')

    # Only the F1 score is available for this baseline; f1-macro and accuracy stay empty.
    cond_parameters.append(float(f1))
    cond_parameters.extend([None,None])
    cond_parameters.insert(0, 'e2e')   # Task
    cond_parameters.insert(1, METHOD)  # Method

    # The full-data setting is encoded as '0' in the 'lr-setting' column.
    if cond_parameters[3] == 'full':
        cond_parameters[3] = '0'

    runs.append(cond_parameters)


# All baseline rows collected in `runs` as one frame.
results_baseline = pd.DataFrame(runs, columns = col_names)

# The statistics helpers read both tables from the shared args namespace.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [39]:
# lr_setting 0 is translated to the 'full' setting inside computePromptStatistics.
args.lr_setting = 0
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
52  e2e      tas-bert  rest-16          0     3         2e-05         24   
49  e2e      tas-bert  rest-16          0     1         2e-05         24   
45  e2e      tas-bert  rest-16          0     4         2e-05         24   
54  e2e      tas-bert  rest-16          0     2         2e-05         24   
50  e2e      tas-bert  rest-16          0     5         2e-05         24   
1   e2e  instructAbsa  rest-16          0     1         5e-05          8   
4   e2e  instructAbsa  rest-16          0     2         5e-05          8   
14  e2e  instructAbsa  rest-16          0     3         5e-05          8   
7   e2e  instructAbsa  rest-16          0     5         5e-05          8   
39  e2e  instructAbsa  rest-16          0     4         5e-05          8   

   epochs  f1-micro f1-macro accuracy  
52   22.0  0.742300     None     None  
49   22.0  0.741900     None     None  
45   22.0  0.707100     None     None  
54 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8093,0.7757,0.8147,0.677804,0.7419
2,0.7654,0.7583,0.7953,0.647413,0.6933
3,0.7813,0.7819,0.7974,0.636132,0.7423
4,0.8328,0.7728,0.8099,0.602203,0.7071
5,0.8046,0.755,0.7763,0.60261,0.6876


Unnamed: 0,W,pval,normal
cot,0.941126,0.673908,True


Unnamed: 0,W,pval,normal
instructAbsa,0.910112,0.468263,True


Unnamed: 0,W,pval,normal
tas-bert,0.840393,0.166011,True


    split        prompt        f1
0       1           cot  0.814700
1       2           cot  0.795300
2       3           cot  0.797400
3       4           cot  0.809900
4       5           cot  0.776300
5       1  instructAbsa  0.677804
6       2  instructAbsa  0.647413
7       3  instructAbsa  0.636132
8       4  instructAbsa  0.602203
9       5  instructAbsa  0.602610
10      1      tas-bert  0.741900
11      2      tas-bert  0.693300
12      3      tas-bert  0.742300
13      4      tas-bert  0.707100
14      5      tas-bert  0.687600
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F     p-unc       ng2      eps
0  prompt      2      8  111.148677  0.000001  0.898389  0.90079


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,cot vs instructAbsa,0.79872,0.01339,0.633232,0.028624,13.525582,0.000173,0.000519,True
1,t-test,cot vs tas-bert,0.79872,0.01339,0.71444,0.023458,9.251523,0.000759,0.001518,True
2,t-test,instructAbsa vs tas-bert,0.633232,0.028624,0.71444,0.023458,-6.9395,0.002265,0.002265,True


### 1000

In [40]:
# Restrict the comparison to the runs whose lr_setting is '1000'.
args.lr_setting = 1000
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
48  e2e      tas-bert  rest-16       1000     4         2e-05         24   
56  e2e      tas-bert  rest-16       1000     3         2e-05         24   
57  e2e      tas-bert  rest-16       1000     1         2e-05         24   
40  e2e      tas-bert  rest-16       1000     2         2e-05         24   
32  e2e  instructAbsa  rest-16       1000     1         5e-05          8   
41  e2e      tas-bert  rest-16       1000     5         2e-05         24   
2   e2e  instructAbsa  rest-16       1000     2         5e-05          8   
22  e2e  instructAbsa  rest-16       1000     5         5e-05          8   
24  e2e  instructAbsa  rest-16       1000     4         5e-05          8   
33  e2e  instructAbsa  rest-16       1000     3         5e-05          8   

   epochs  f1-micro f1-macro accuracy  
48   12.0  0.724400     None     None  
56   12.0  0.723900     None     None  
57   12.0  0.702100     None     None  
40 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.8099,0.8463,0.7792,0.686916,0.7021
2,0.8013,0.8105,0.6756,0.677262,0.7002
3,0.8094,0.8052,0.7559,0.62406,0.7239
4,0.8129,0.7842,0.7581,0.624849,0.7244
5,0.8111,0.8283,0.7387,0.643373,0.6809


Unnamed: 0,W,pval,normal
long,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
instructAbsa,0.859378,0.226025,True


Unnamed: 0,W,pval,normal
tas-bert,0.896812,0.392516,True


    split        prompt        f1
0       1          long  0.846300
1       2          long  0.810500
2       3          long  0.805200
3       4          long  0.784200
4       5          long  0.828300
5       1  instructAbsa  0.686916
6       2  instructAbsa  0.677262
7       3  instructAbsa  0.624060
8       4  instructAbsa  0.624849
9       5  instructAbsa  0.643373
10      1      tas-bert  0.702100
11      2      tas-bert  0.700200
12      3      tas-bert  0.723900
13      4      tas-bert  0.724400
14      5      tas-bert  0.680900
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  57.217872  0.000018  0.908202  0.699027


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs instructAbsa,0.8149,0.021078,0.651292,0.026256,17.641666,6.1e-05,0.000182,True
1,t-test,long vs tas-bert,0.8149,0.021078,0.7063,0.016355,6.32164,0.003204,0.006407,True
2,t-test,instructAbsa vs tas-bert,0.651292,0.026256,0.7063,0.016355,-2.958536,0.041615,0.041615,True


### 500

In [41]:
# Restrict the comparison to the runs whose lr_setting is '500'.
args.lr_setting = 500
args.task = 'e2e'

computePromptStatistics(args)

   task        method  dataset lr-setting split learning-rate batch_size  \
47  e2e      tas-bert  rest-16        500     3         2e-05         24   
43  e2e      tas-bert  rest-16        500     1         2e-05         24   
16  e2e  instructAbsa  rest-16        500     1         5e-05          8   
53  e2e      tas-bert  rest-16        500     4         2e-05         24   
19  e2e  instructAbsa  rest-16        500     2         5e-05          8   
36  e2e  instructAbsa  rest-16        500     4         5e-05          8   
20  e2e  instructAbsa  rest-16        500     5         5e-05          8   
51  e2e      tas-bert  rest-16        500     2         2e-05         24   
5   e2e  instructAbsa  rest-16        500     3         5e-05          8   
42  e2e      tas-bert  rest-16        500     5         2e-05         24   

   epochs  f1-micro f1-macro accuracy  
47   28.0  0.726000     None     None  
43   28.0  0.677700     None     None  
16     14  0.670561     None     None  
53 

Unnamed: 0,short,long,cot,instructAbsa,tas-bert
1,0.786,0.7871,0.7781,0.670561,0.6777
2,0.7839,0.787,0.7203,0.661836,0.6255
3,0.7014,0.8106,0.7926,0.623116,0.726
4,0.7631,0.7596,0.7727,0.638554,0.6655
5,0.7742,0.7873,0.7529,0.625616,0.6168


Unnamed: 0,W,pval,normal
long,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
instructAbsa,0.887702,0.345682,True


Unnamed: 0,W,pval,normal
tas-bert,0.938631,0.656276,True


    split        prompt        f1
0       1          long  0.787100
1       2          long  0.787000
2       3          long  0.810600
3       4          long  0.759600
4       5          long  0.787300
5       1  instructAbsa  0.670561
6       2  instructAbsa  0.661836
7       3  instructAbsa  0.623116
8       4  instructAbsa  0.638554
9       5  instructAbsa  0.625616
10      1      tas-bert  0.677700
11      2      tas-bert  0.625500
12      3      tas-bert  0.726000
13      4      tas-bert  0.665500
14      5      tas-bert  0.616800
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  34.068911  0.000122  0.846679  0.731849


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,long vs instructAbsa,0.78632,0.016159,0.643936,0.019117,10.29398,0.000502,0.001507,True
1,t-test,long vs tas-bert,0.78632,0.016159,0.6623,0.039318,7.027995,0.00216,0.004319,True
2,t-test,instructAbsa vs tas-bert,0.643936,0.019117,0.6623,0.039318,-0.7804,0.478756,0.478756,False


In [42]:
# Run the low-resource comparison for the E2E task on the tables loaded above.
args.task = 'e2e'

# NOTE(review): computeLowResourceStatistics is defined outside this section; judging by the
# output below it compares the 1000 / 500 / full settings per prompt - confirm against its definition.
computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.8099,0.786,0.8093
2,0.8013,0.7839,0.7654
3,0.8094,0.7014,0.7813
4,0.8129,0.7631,0.8328
5,0.8111,0.7742,0.8046


Unnamed: 0,W,pval,normal
1000,0.825775,0.129276,True


Unnamed: 0,W,pval,normal
500,0.769106,0.044305,False


Unnamed: 0,W,pval,normal
full,0.976582,0.915563,True


    split prompt      f1
0       1   1000  0.8099
1       2   1000  0.8013
2       3   1000  0.8094
3       4   1000  0.8129
4       5   1000  0.8111
5       1    500  0.7860
6       2    500  0.7839
7       3    500  0.7014
8       4    500  0.7631
9       5    500  0.7742
10      1   full  0.8093
11      2   full  0.7654
12      3   full  0.7813
13      4   full  0.8328
14      5   full  0.8046
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.64      2  6.4  0.040762
Results for LR-Comparison of :  short


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.80892,0.003996,0.76172,0.031233,0.0,0.0625,0.1875,False
1,wilcoxon,1000 vs full,0.80892,0.003996,0.79868,0.023331,3.0,0.3125,0.3125,False
2,wilcoxon,500 vs full,0.76172,0.031233,0.79868,0.023331,1.0,0.125,0.25,False


Unnamed: 0,1000,500,full
1,0.8463,0.7871,0.7757
2,0.8105,0.787,0.7583
3,0.8052,0.8106,0.7819
4,0.7842,0.7596,0.7728
5,0.8283,0.7873,0.755


Unnamed: 0,W,pval,normal
1000,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
500,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
full,0.909621,0.465314,True


    split prompt      f1
0       1   1000  0.8463
1       2   1000  0.8105
2       3   1000  0.8052
3       4   1000  0.7842
4       5   1000  0.8283
5       1    500  0.7871
6       2    500  0.7870
7       3    500  0.8106
8       4    500  0.7596
9       5    500  0.7873
10      1   full  0.7757
11      2   full  0.7583
12      3   full  0.7819
13      4   full  0.7728
14      5   full  0.7550
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  9.523769  0.007653  0.571905  0.851074
Results for LR-Comparison of :  long


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8149,0.021078,0.78632,0.016159,2.672004,0.05569,0.111379,False
1,t-test,1000 vs full,0.8149,0.021078,0.76874,0.010353,3.706936,0.020709,0.062128,False
2,t-test,500 vs full,0.78632,0.016159,0.76874,0.010353,2.06496,0.107844,0.111379,False


Unnamed: 0,1000,500,full
1,0.7792,0.7781,0.8147
2,0.6756,0.7203,0.7953
3,0.7559,0.7926,0.7974
4,0.7581,0.7727,0.8099
5,0.7387,0.7529,0.7763


Unnamed: 0,W,pval,normal
1000,0.864605,0.24526,True


Unnamed: 0,W,pval,normal
500,0.938402,0.65466,True


Unnamed: 0,W,pval,normal
full,0.941126,0.673908,True


    split prompt      f1
0       1   1000  0.7792
1       2   1000  0.6756
2       3   1000  0.7559
3       4   1000  0.7581
4       5   1000  0.7387
5       1    500  0.7781
6       2    500  0.7203
7       3    500  0.7926
8       4    500  0.7727
9       5    500  0.7529
10      1   full  0.8147
11      2   full  0.7953
12      3   full  0.7974
13      4   full  0.8099
14      5   full  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2      eps
0  prompt      2      8  11.032157  0.005014  0.448012  0.66555
Results for LR-Comparison of :  cot


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.7415,0.035369,0.76332,0.024992,-2.626412,0.058405,0.074332,False
1,t-test,1000 vs full,0.7415,0.035369,0.79872,0.01339,-3.605613,0.022645,0.067934,False
2,t-test,500 vs full,0.76332,0.024992,0.79872,0.01339,-3.0735,0.037166,0.074332,False


Unnamed: 0,1000,500,full
1,0.8463,0.7871,0.8147
2,0.8105,0.787,0.7953
3,0.8052,0.8106,0.7974
4,0.7842,0.7596,0.8099
5,0.8283,0.7873,0.7763


Unnamed: 0,W,pval,normal
1000,0.988034,0.972361,True


Unnamed: 0,W,pval,normal
500,0.883866,0.327197,True


Unnamed: 0,W,pval,normal
full,0.941126,0.673908,True


    split prompt      f1
0       1   1000  0.8463
1       2   1000  0.8105
2       3   1000  0.8052
3       4   1000  0.7842
4       5   1000  0.8283
5       1    500  0.7871
6       2    500  0.7870
7       3    500  0.8106
8       4    500  0.7596
9       5    500  0.7873
10      1   full  0.8147
11      2   full  0.7953
12      3   full  0.7974
13      4   full  0.8099
14      5   full  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  2.892451  0.113434  0.317096  0.955563
Results for LR-Comparison of best Prompt per LR-Setting


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.8149,0.021078,0.78632,0.016159,2.672004,0.05569,0.167069,False
1,t-test,1000 vs full,0.8149,0.021078,0.79872,0.01339,1.250929,0.279134,0.558269,False
2,t-test,500 vs full,0.78632,0.016159,0.79872,0.01339,-1.032048,0.360362,0.558269,False


## TASD

In [157]:
# Collect one row per fine-tuned LLM run folder: the hyper-parameters are parsed from
# the underscore-separated folder name, the scores are read from the task's metrics file.

# Metrics file to read for each task; the task is the first field of the folder name.
METRICS_FILE_BY_TASK = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    filename = METRICS_FILE_BY_TASK.get(cond_parameters[0])
    if filename is None:
        # Folder does not belong to one of the known tasks: nothing to load.
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])

        # 'path' column: every name field joined back together.
        model_config_full = cond_parameters.copy()

        # 'model_config' column: the same fields without the split (index 7) and
        # without the trailing epoch, so all splits of one configuration share an id.
        model_config = cond_parameters.copy()
        model_config.pop(7)
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception as err:
        # Loading stays best-effort (unfinished or malformed runs are skipped), but the
        # reason is reported instead of being swallowed, and KeyboardInterrupt is no
        # longer caught.
        print(f"Skipping '{folder_name}': {err!r}")

results_all = pd.DataFrame(runs, columns = col_names)


###
# Baselines
##

RESULTS_PATH = '../results/'
col_names = ['task', 'method', 'dataset', 'lr_setting', 'split', 'learning_rate', 'batch_size', 'epochs', 'f1-micro', 'f1-macro', 'accuracy']

# Prompting LlaMA-3-8B
METHOD = 'prompting'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    # Only the TASD prompting runs are used as a baseline here.
    if cond_parameters[0] != 'tasd':
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])

        # Rearrange the name fields into the col_names layout: swap fields 2 and 4,
        # drop the fields that have no baseline column, then add the method name.
        # NOTE(review): the folder-name layout is not visible here - confirm the indices.
        cond_parameters[2], cond_parameters[4] = cond_parameters[4], cond_parameters[2]
        cond_parameters.pop(5)
        cond_parameters.pop(4)
        cond_parameters[1:1] = [METHOD]

        # Placeholders for learning_rate, batch_size and epochs (columns 5-7).
        cond_parameters[5:5] = ['-']
        cond_parameters[6:6] = ['-']
        cond_parameters[7:7] = ['-']

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception as err:
        # Loading stays best-effort, but the reason for a skipped run is reported
        # instead of being swallowed, and KeyboardInterrupt is no longer caught.
        print(f"Skipping '{folder_name}': {err!r}")

# Paraphrase Generation
# One row per run folder; the rows are appended to the `runs` list of the baselines.
METHOD = 'para'
folder_names = [folder for folder in os.listdir(os.path.join(RESULTS_PATH, METHOD)) if os.path.isdir(os.path.join(RESULTS_PATH, METHOD, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, METHOD, folder_name, 'metrics_phrases.tsv'), sep = '\t')
        df = df.set_index(df.columns[0])
        cond_parameters = folder_name.split('_')

        cond_parameters.append(df.loc['Micro-AVG', 'f1'])
        cond_parameters.append(df.loc['Macro-AVG', 'f1'])
        cond_parameters.append(df.loc['Micro-AVG', 'accuracy'])
        cond_parameters[1:1] = [METHOD]

        # Use the same labels as the other tables: 'full' for the full-data setting
        # and 'tasd' for the task these folders call 'acsd'.
        if cond_parameters[3] == '0':
            cond_parameters[3] = 'full'
        if cond_parameters[0] == 'acsd':
            cond_parameters[0] = 'tasd'
        runs.append(cond_parameters)
    except Exception as err:
        # Loading stays best-effort, but the reason for a skipped run is reported
        # instead of being swallowed, and KeyboardInterrupt is no longer caught.
        print(f"Skipping '{folder_name}': {err!r}")

# MVP
# One row per run folder; the rows are appended to the `runs` list of the baselines.
METHOD = 'mvp'

method_dir = os.path.join(RESULTS_PATH, METHOD)
folder_names = [
    entry
    for entry in os.listdir(method_dir)
    if entry != '.ipynb_checkpoints' and os.path.isdir(os.path.join(method_dir, entry))
]

for folder_name in folder_names:
    # The score is the second space-separated token on the last line of result.txt.
    with open(os.path.join(method_dir, folder_name, 'result.txt'), 'r') as file:
        last_line = file.readlines()[-1]
    f1 = last_line.split(' ')[1]

    # Name fields followed by f1-micro (score divided by 100, presumably a percentage)
    # and empty f1-macro / accuracy.
    row = folder_name.split('/')[-1].split('_')
    row += [round(float(f1)/100, 4), None, None]

    # Bring the fields into the col_names order: add the method, then swap
    # fields 0/2 and 3/4.
    row.insert(1, METHOD)
    row[0], row[2] = row[2], row[0]
    row[3], row[4] = row[4], row[3]

    # Fixed learning rate; batch size 16 for the 'full' setting, 8 otherwise.
    row.insert(5, 1e-4)
    row.insert(6, 16 if row[3] == 'full' else 8)

    runs.append(row)


# All baseline rows collected in `runs` as one frame.
results_baseline = pd.DataFrame(runs, columns = col_names)

# The statistics helpers read both tables from the shared args namespace.
args.results = results_all
args.results_baseline = results_baseline

### Full Dataset

In [142]:
# lr_setting 0 is translated to the 'full' setting inside computePromptStatistics.
args.lr_setting = 0
args.task = 'tasd'

# Keep the returned statistics under key '0' and show them as the cell result.
stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
74,tasd,para,rest-16,full,4,0.0003,16,20,0.7451,0.712,0.5938
100,tasd,mvp,rest-16,full,4,0.0001,16,20,0.745,,
66,tasd,para,rest-16,full,2,0.0003,16,20,0.7271,0.6684,0.5712
96,tasd,para,rest-16,full,5,0.0003,16,20,0.7173,0.5841,0.5592
108,tasd,mvp,rest-16,full,5,0.0001,16,20,0.7104,,
85,tasd,para,rest-16,full,1,0.0003,16,20,0.7066,0.6449,0.5463
82,tasd,para,rest-16,full,3,0.0003,16,20,0.6961,0.6437,0.5338
115,tasd,mvp,rest-16,full,1,0.0001,16,20,0.6936,,
114,tasd,mvp,rest-16,full,2,0.0001,16,20,0.6854,,
122,tasd,mvp,rest-16,full,3,0.0001,16,20,0.6717,,


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.7324,0.7648,0.6968,0.4792,0.7066,0.6936
2,0.7614,0.7115,0.6632,0.5218,0.7271,0.6854
3,0.751,0.7459,0.7009,0.529,0.6961,0.6717
4,0.7863,0.7687,0.7438,0.5213,0.7451,0.745
5,0.7516,0.7564,0.704,0.5395,0.7173,0.7104


Unnamed: 0,W,pval,normal
basic,0.949841,0.736055,True


Unnamed: 0,W,pval,normal
prompting,0.831947,0.143863,True


Unnamed: 0,W,pval,normal
para,0.986734,0.967055,True


Unnamed: 0,W,pval,normal
mvp,0.942485,0.683555,True


    split     prompt      f1
0       1      basic  0.7324
1       2      basic  0.7614
2       3      basic  0.7510
3       4      basic  0.7863
4       5      basic  0.7516
5       1  prompting  0.4792
6       2  prompting  0.5218
7       3  prompting  0.5290
8       4  prompting  0.5213
9       5  prompting  0.5395
10      1       para  0.7066
11      2       para  0.7271
12      3       para  0.6961
13      4       para  0.7451
14      5       para  0.7173
15      1        mvp  0.6936
16      2        mvp  0.6854
17      3        mvp  0.6717
18      4        mvp  0.7450
19      5        mvp  0.7104
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2           F         p-unc       ng2       eps
0  prompt      3     12  211.174566  1.200554e-10  0.953377  0.507961


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs prompting,0.75654,0.01759,0.51816,0.020562,24.530404,1.6e-05,9.8e-05,True
1,t-test,basic vs para,0.75654,0.01759,0.71844,0.01689,7.842595,0.001428,0.004283,True
2,t-test,basic vs mvp,0.75654,0.01759,0.70122,0.025222,6.051148,0.003764,0.007527,True
3,t-test,prompting vs para,0.51816,0.020562,0.71844,0.01689,-16.574871,7.8e-05,0.000388,True
4,t-test,prompting vs mvp,0.51816,0.020562,0.70122,0.025222,-11.829675,0.000292,0.001169,True
5,t-test,para vs mvp,0.71844,0.01689,0.70122,0.025222,2.356222,0.077984,0.077984,False


### 1000

In [143]:
# Restrict the comparison to the runs whose lr_setting is '1000'.
args.lr_setting = 1000
args.task = 'tasd'

# Keep the returned statistics under key '1000' and show them as the cell result.
stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
103,tasd,mvp,rest-16,1000,4,0.0001,8,30,0.7157,,
62,tasd,para,rest-16,1000,4,0.0003,16,20,0.6995,0.69,0.5379
63,tasd,para,rest-16,1000,2,0.0003,16,20,0.6945,0.6088,0.532
73,tasd,para,rest-16,1000,5,0.0003,16,20,0.6906,0.553,0.5274
71,tasd,para,rest-16,1000,1,0.0003,16,20,0.6811,0.611,0.5164
106,tasd,mvp,rest-16,1000,2,0.0001,8,30,0.6757,,
128,tasd,mvp,rest-16,1000,3,0.0001,8,30,0.6743,,
111,tasd,mvp,rest-16,1000,1,0.0001,8,30,0.6723,,
109,tasd,mvp,rest-16,1000,5,0.0001,8,30,0.6691,,
83,tasd,para,rest-16,1000,3,0.0003,16,20,0.6533,0.5515,0.4851


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.7019,0.7148,0.6739,0.4625,0.6811,0.6723
2,0.7211,0.7302,0.7225,0.4449,0.6945,0.6757
3,0.7682,0.7288,0.6769,0.4629,0.6533,0.6743
4,0.7432,0.7826,0.697,0.5179,0.6995,0.7157
5,0.7207,0.6933,0.7028,0.4801,0.6906,0.6691


Unnamed: 0,W,pval,normal
basic,0.948244,0.72464,True


Unnamed: 0,W,pval,normal
prompting,0.908093,0.456215,True


Unnamed: 0,W,pval,normal
para,0.85663,0.216399,True


Unnamed: 0,W,pval,normal
mvp,0.676794,0.00539,False


    split     prompt      f1
0       1      basic  0.7019
1       2      basic  0.7211
2       3      basic  0.7682
3       4      basic  0.7432
4       5      basic  0.7207
5       1  prompting  0.4625
6       2  prompting  0.4449
7       3  prompting  0.4629
8       4  prompting  0.5179
9       5  prompting  0.4801
10      1       para  0.6811
11      2       para  0.6945
12      3       para  0.6533
13      4       para  0.6995
14      5       para  0.6906
15      1        mvp  0.6723
16      2        mvp  0.6757
17      3        mvp  0.6743
18      4        mvp  0.7157
19      5        mvp  0.6691
Friedman Test Result:
          Source      W  ddof1      Q    p-unc
Friedman  Within  0.904      3  13.56  0.00357


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.73102,0.022731,0.47366,0.024763,0.0,0.0625,0.375,False
1,wilcoxon,basic vs para,0.73102,0.022731,0.6838,0.0164,0.0,0.0625,0.375,False
2,wilcoxon,basic vs mvp,0.73102,0.022731,0.68142,0.017283,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs para,0.47366,0.024763,0.6838,0.0164,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs mvp,0.47366,0.024763,0.68142,0.017283,0.0,0.0625,0.375,False
5,wilcoxon,para vs mvp,0.6838,0.0164,0.68142,0.017283,6.0,0.8125,0.8125,False


### 500

In [144]:
# Restrict the comparison to the runs whose lr_setting is '500'.
args.lr_setting = 500
args.task = 'tasd'

# Keep the returned statistics under key '500' and show them as the cell result.
stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,task,method,dataset,lr_setting,split,learning_rate,batch_size,epochs,f1-micro,f1-macro,accuracy
113,tasd,mvp,rest-16,500,4,0.0001,8,50,0.6945,,
64,tasd,para,rest-16,500,2,0.0003,16,20,0.6708,0.5426,0.5046
118,tasd,mvp,rest-16,500,2,0.0001,8,50,0.6535,,
119,tasd,mvp,rest-16,500,5,0.0001,8,50,0.65,,
81,tasd,para,rest-16,500,5,0.0003,16,20,0.6399,0.4956,0.4705
70,tasd,para,rest-16,500,4,0.0003,16,20,0.637,0.5918,0.4674
127,tasd,mvp,rest-16,500,3,0.0001,8,50,0.6226,,
89,tasd,para,rest-16,500,1,0.0003,16,20,0.6212,0.4878,0.4505
99,tasd,para,rest-16,500,3,0.0003,16,20,0.6189,0.4985,0.4481
117,tasd,mvp,rest-16,500,1,0.0001,8,50,0.6111,,


Unnamed: 0,basic,context,cot,prompting,para,mvp
1,0.6947,0.6674,0.5965,0.4675,0.6212,0.6111
2,0.7284,0.7384,0.6134,0.498,0.6708,0.6535
3,0.6914,0.697,0.6164,0.4863,0.6189,0.6226
4,0.7298,0.7557,0.6821,0.4648,0.637,0.6945
5,0.7286,0.6615,0.6168,0.4519,0.6399,0.65


Unnamed: 0,W,pval,normal
basic,0.732072,0.020135,False


Unnamed: 0,W,pval,normal
prompting,0.956476,0.783259,True


Unnamed: 0,W,pval,normal
para,0.881328,0.315371,True


Unnamed: 0,W,pval,normal
mvp,0.944033,0.694574,True


    split     prompt      f1
0       1      basic  0.6947
1       2      basic  0.7284
2       3      basic  0.6914
3       4      basic  0.7298
4       5      basic  0.7286
5       1  prompting  0.4675
6       2  prompting  0.4980
7       3  prompting  0.4863
8       4  prompting  0.4648
9       5  prompting  0.4519
10      1       para  0.6212
11      2       para  0.6708
12      3       para  0.6189
13      4       para  0.6370
14      5       para  0.6399
15      1        mvp  0.6111
16      2        mvp  0.6535
17      3        mvp  0.6226
18      4        mvp  0.6945
19      5        mvp  0.6500
Friedman Test Result:
          Source      W  ddof1      Q    p-unc
Friedman  Within  0.904      3  13.56  0.00357


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs prompting,0.71458,0.017617,0.4737,0.016384,0.0,0.0625,0.375,False
1,wilcoxon,basic vs para,0.71458,0.017617,0.63756,0.018582,0.0,0.0625,0.375,False
2,wilcoxon,basic vs mvp,0.71458,0.017617,0.64634,0.028946,0.0,0.0625,0.375,False
3,wilcoxon,prompting vs para,0.4737,0.016384,0.63756,0.018582,0.0,0.0625,0.375,False
4,wilcoxon,prompting vs mvp,0.4737,0.016384,0.64634,0.028946,0.0,0.0625,0.375,False
5,wilcoxon,para vs mvp,0.63756,0.018582,0.64634,0.028946,6.5,0.8125,0.8125,False


In [158]:
# Compare the lr_settings (1000 / 500 / full) against each other for the TASD
# task. Only args.task is set here; every other field of args keeps the value
# left by earlier cells. computeLowResourceStatistics is defined elsewhere in
# the notebook.
args.task = 'tasd'

computeLowResourceStatistics(args)

Unnamed: 0,1000,500,full
1,0.7019,0.6947,0.7324
2,0.7211,0.7284,0.7614
3,0.7682,0.6914,0.751
4,0.7432,0.7298,0.7863
5,0.7207,0.7286,0.7516


Unnamed: 0,W,pval,normal
1000,0.948244,0.72464,True


Unnamed: 0,W,pval,normal
500,0.732072,0.020135,False


Unnamed: 0,W,pval,normal
full,0.949841,0.736055,True


    split prompt      f1
0       1   1000  0.7019
1       2   1000  0.7211
2       3   1000  0.7682
3       4   1000  0.7432
4       5   1000  0.7207
5       1    500  0.6947
6       2    500  0.7284
7       3    500  0.6914
8       4    500  0.7298
9       5    500  0.7286
10      1   full  0.7324
11      2   full  0.7614
12      3   full  0.7510
13      4   full  0.7863
14      5   full  0.7516
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of :  basic


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.73102,0.022731,0.71458,0.017617,5.0,0.625,0.625,False
1,wilcoxon,1000 vs full,0.73102,0.022731,0.75654,0.01759,1.0,0.125,0.25,False
2,wilcoxon,500 vs full,0.71458,0.017617,0.75654,0.01759,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7148,0.6674,0.7648
2,0.7302,0.7384,0.7115
3,0.7288,0.697,0.7459
4,0.7826,0.7557,0.7687
5,0.6933,0.6615,0.7564


Unnamed: 0,W,pval,normal
1000,0.91523,0.499646,True


Unnamed: 0,W,pval,normal
500,0.901901,0.420481,True


Unnamed: 0,W,pval,normal
full,0.860335,0.229454,True


    split prompt      f1
0       1   1000  0.7148
1       2   1000  0.7302
2       3   1000  0.7288
3       4   1000  0.7826
4       5   1000  0.6933
5       1    500  0.6674
6       2    500  0.7384
7       3    500  0.6970
8       4    500  0.7557
9       5    500  0.6615
10      1   full  0.7648
11      2   full  0.7115
12      3   full  0.7459
13      4   full  0.7687
14      5   full  0.7564
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  3.364272  0.08704  0.278042  0.542324
Results for LR-Comparison of :  context


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,1000 vs 500,0.72994,0.029483,0.704,0.037553,2.817176,0.047966,0.143897,False
1,t-test,1000 vs full,0.72994,0.029483,0.74946,0.020529,-1.186657,0.301025,0.301025,False
2,t-test,500 vs full,0.704,0.037553,0.74946,0.020529,-1.900485,0.130166,0.260332,False


Unnamed: 0,1000,500,full
1,0.6739,0.5965,0.6968
2,0.7225,0.6134,0.6632
3,0.6769,0.6164,0.7009
4,0.697,0.6821,0.7438
5,0.7028,0.6168,0.704


Unnamed: 0,W,pval,normal
1000,0.932916,0.616384,True


Unnamed: 0,W,pval,normal
500,0.765998,0.041583,False


Unnamed: 0,W,pval,normal
full,0.931301,0.605268,True


    split prompt      f1
0       1   1000  0.6739
1       2   1000  0.7225
2       3   1000  0.6769
3       4   1000  0.6970
4       5   1000  0.7028
5       1    500  0.5965
6       2    500  0.6134
7       3    500  0.6164
8       4    500  0.6821
9       5    500  0.6168
10      1   full  0.6968
11      2   full  0.6632
12      3   full  0.7009
13      4   full  0.7438
14      5   full  0.7040
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.84      2  8.4  0.014996
Results for LR-Comparison of :  cot


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.69462,0.01785,0.62504,0.02949,0.0,0.0625,0.1875,False
1,wilcoxon,1000 vs full,0.69462,0.01785,0.70174,0.02563,5.0,0.625,0.625,False
2,wilcoxon,500 vs full,0.62504,0.02949,0.70174,0.02563,0.0,0.0625,0.1875,False


Unnamed: 0,1000,500,full
1,0.7019,0.6947,0.7324
2,0.7211,0.7284,0.7614
3,0.7682,0.6914,0.751
4,0.7432,0.7298,0.7863
5,0.7207,0.7286,0.7516


Unnamed: 0,W,pval,normal
1000,0.948244,0.72464,True


Unnamed: 0,W,pval,normal
500,0.732072,0.020135,False


Unnamed: 0,W,pval,normal
full,0.949841,0.736055,True


    split prompt      f1
0       1   1000  0.7019
1       2   1000  0.7211
2       3   1000  0.7682
3       4   1000  0.7432
4       5   1000  0.7207
5       1    500  0.6947
6       2    500  0.7284
7       3    500  0.6914
8       4    500  0.7298
9       5    500  0.7286
10      1   full  0.7324
11      2   full  0.7614
12      3   full  0.7510
13      4   full  0.7863
14      5   full  0.7516
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274
Results for LR-Comparison of best Prompt per LR-Setting


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,1000 vs 500,0.73102,0.022731,0.71458,0.017617,5.0,0.625,0.625,False
1,wilcoxon,1000 vs full,0.73102,0.022731,0.75654,0.01759,1.0,0.125,0.25,False
2,wilcoxon,500 vs full,0.71458,0.017617,0.75654,0.01759,0.0,0.0625,0.1875,False


## Performance Comparison of Extraction of ABSA-Tuple Elements over different ABSA Subtasks

In [146]:
# Additional Eval
#
# Builds a task x (lr_setting, prompt) table of the mean aspect-level micro-F1
# (read from metrics_asp.tsv of every run folder) for the rest-16 dataset and
# prints the average gap between the ABSA subtasks.

runs = []
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

# Every run is scored with the same aspect-level metrics file.
filename = 'metrics_asp.tsv'

for folder_name in folder_names:
    try:
        # The folder name encodes the run configuration as '_'-separated
        # fields; together with the five values appended below they must line
        # up with col_names.
        cond_parameters = folder_name.split('_')

        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        metrics = metrics.set_index(metrics.columns[0])

        model_config = cond_parameters.copy()

        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(metrics.loc['Micro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Macro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception:
        # Best effort: folders without a readable metrics file or with an
        # unexpected name layout are skipped. Catching Exception (instead of a
        # bare except) keeps KeyboardInterrupt/SystemExit working.
        continue

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'basic'), ('full', 'context'), ('full', 'cot'),
    ('1000', 'basic'), ('1000', 'context'), ('1000', 'cot'),
    ('500', 'basic'), ('500', 'context'), ('500', 'cot')
])

# Define the row indices
index = ['acd', 'acsa', 'tasd']

# Table pre-filled with the 'N/A' placeholder; cells are overwritten below
# wherever results exist.
df = pd.DataFrame('N/A', index=index, columns=columns, dtype=object)

for task in index:
    for lr_setting in ['full', '1000', '500']:
        # split '0' is excluded here (same filter as in computePromptStatistics).
        selection = results_all[(results_all['dataset'] == 'rest-16')
                                & (results_all['task'] == task)
                                & (results_all['split'] != str(0))
                                & (results_all['lr_setting'] == lr_setting)]

        # Grouping by a one-element list yields 1-tuples as keys, hence [0].
        for config_key, config_runs in selection.groupby(['model_config']):
            # The prompt is the third '_'-separated field of the config string.
            prompt_name = config_key[0].split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _row_mean(task):
    """Return the mean of the numeric cells in df's row `task`, skipping 'N/A'."""
    return np.mean([float(value) for value in df.loc[task] if value != 'N/A'])


print('Aspect Extraction')
display(df)

# NOTE: each row mean covers only the cells that hold a value, so rows with
# missing prompts (e.g. no 'cot' runs) are averaged over fewer columns.
print(f"Average difference ACSA to ACD: {(_row_mean('acsa') - _row_mean('acd')):.2f}")

print(f"Average difference TASD to ACSA: {(_row_mean('tasd') - _row_mean('acsa')):.2f}")

print(f"Average difference TASD to ACD: {(_row_mean('tasd') - _row_mean('acd')):.2f}")

Aspect Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,basic,context,cot,basic,context,cot,basic,context,cot
acd,84.41,85.21,,81.46,79.74,,81.15,78.72,
acsa,85.33,85.1,84.02,84.37,84.94,84.04,81.72,80.32,78.74
tasd,86.37,85.94,82.54,84.9,84.89,84.47,82.38,81.24,79.4


Average difference ACSA to ACD: 1.39
Average difference TASD to ACSA: 0.39
Average difference TASD to ACD: 1.79


In [147]:
# Additional Eval
#
# Builds a task x (lr_setting, prompt) table of the mean aspect+polarity
# micro-F1 (read from metrics_asp_pol.tsv of every run folder) for the rest-16
# dataset and reports the average gap between TASD and ACSA.

runs = []
RESULTS_PATH = '../results/ft_llm'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']

for folder_name in folder_names:
    try:
        # The folder name encodes the run configuration as '_'-separated
        # fields; together with the five values appended below they must line
        # up with col_names.
        cond_parameters = folder_name.split('_')

        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, 'metrics_asp_pol.tsv'), sep = '\t')
        metrics = metrics.set_index(metrics.columns[0])

        model_config = cond_parameters.copy()

        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(metrics.loc['Micro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Macro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception:
        # Best effort: folders without a readable metrics_asp_pol.tsv or with
        # an unexpected name layout are skipped. Catching Exception (instead of
        # a bare except) keeps KeyboardInterrupt/SystemExit working.
        continue

results_all = pd.DataFrame(runs, columns = col_names)

# Define the columns as multi-index
columns = pd.MultiIndex.from_tuples([
    ('full', 'basic'), ('full', 'context'), ('full', 'cot'),
    ('1000', 'basic'), ('1000', 'context'), ('1000', 'cot'),
    ('500', 'basic'), ('500', 'context'), ('500', 'cot')
])

# Define the row indices
index = ['acsa', 'tasd']

# Table pre-filled with the 'N/A' placeholder; cells are overwritten below
# wherever results exist.
df = pd.DataFrame('N/A', index=index, columns=columns, dtype=object)

# Iterate over the table's own rows only: looping over 'acd' as well would let
# .at silently append an 'acd' row that this table is not meant to have.
for task in index:
    for lr_setting in ['full', '1000', '500']:
        # split '0' is excluded here (same filter as in computePromptStatistics).
        selection = results_all[(results_all['dataset'] == 'rest-16')
                                & (results_all['task'] == task)
                                & (results_all['split'] != str(0))
                                & (results_all['lr_setting'] == lr_setting)]

        # Grouping by a one-element list yields 1-tuples as keys, hence [0].
        for config_key, config_runs in selection.groupby(['model_config']):
            # The prompt is the third '_'-separated field of the config string.
            prompt_name = config_key[0].split('_')[2]
            df.at[task, (lr_setting, prompt_name)] = f"{config_runs['f1-micro'].mean()*100:.2f}"


def _row_mean(task):
    """Return the mean of the numeric cells in df's row `task`, skipping 'N/A'."""
    return np.mean([float(value) for value in df.loc[task] if value != 'N/A'])


print('Aspect + Polarity Extraction')
display(df)

# Bare expression on the last line: the notebook renders it as the cell output.
f"Average difference: {(_row_mean('tasd') - _row_mean('acsa')):.2f}"


Aspect + Polarity Extraction


Unnamed: 0_level_0,full,full,full,1000,1000,1000,500,500,500
Unnamed: 0_level_1,basic,context,cot,basic,context,cot,basic,context,cot
acsa,80.72,80.7,79.4,79.32,80.18,79.92,77.63,76.7,75.25
tasd,82.29,82.1,78.82,80.17,80.04,80.34,78.82,77.33,75.65


'Average difference: 0.64'

In [101]:
# Eval for best parameter combination over all tasks and dataset sizes
#
# Collects the task-specific metrics of every run folder, keeps the split-'0'
# runs of DATASET, picks the best-scoring row per configuration and averages
# the micro-F1 per (learning_rate, lora_r, lora_alpha) combination.

RESULTS_PATH = '../results/ft_llm/'
DATASET = 'rest-16'

# Metrics file evaluated for each task (the task is the first field of the
# folder name).
METRICS_FILE_BY_TASK = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']
folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []

for folder_name in folder_names:
    # The folder name encodes the run configuration as '_'-separated fields;
    # together with the five values appended below they must line up with
    # col_names.
    cond_parameters = folder_name.split('_')

    filename = METRICS_FILE_BY_TASK.get(cond_parameters[0])
    if filename is None:
        # Folder does not belong to a known task: skip it explicitly instead of
        # relying on read_csv failing on a directory path.
        continue

    try:
        metrics = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        metrics = metrics.set_index(metrics.columns[0])

        model_config = cond_parameters.copy()

        # Remove split column from config string
        model_config_full = model_config.copy()
        model_config.pop(7)

        # Remove epoch column from config string
        model_config.pop(-1)

        cond_parameters.append('_'.join(model_config))
        cond_parameters.append('_'.join(model_config_full))
        cond_parameters.append(metrics.loc['Micro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Macro-AVG', 'f1'])
        cond_parameters.append(metrics.loc['Micro-AVG', 'accuracy'])
        runs.append(cond_parameters)
    except Exception:
        # Best effort: folders without a readable metrics file or with an
        # unexpected name layout are skipped. Catching Exception (instead of a
        # bare except) keeps KeyboardInterrupt/SystemExit working.
        continue

results_all = pd.DataFrame(runs, columns = col_names)

# Only split '0' runs are used here (presumably the hyper-parameter tuning
# split; the statistics cells exclude it) -- TODO confirm.
results_sub = results_all[(results_all['dataset'] == DATASET) & (results_all['split'] == '0')].sort_values(by = ['f1-micro'], ascending = False)
results_sub = results_sub[results_sub['lr_setting'] != 'orig']
results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lr_setting', 'lora_r', 'lora_alpha', 'epoch', 'f1-micro', 'f1-macro']]
# Fresh positional index so the labels returned by idxmax can be fed to .loc.
results_sub = results_sub.reset_index()

# Row with the highest micro-F1 for every configuration; 'epoch' is the only
# run field kept above that is not part of the grouping key.
idx_max = results_sub.groupby(['lr_setting', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].idxmax()
results_per_epoch = results_sub.loc[idx_max]

# Bare expression on the last line: the notebook renders it as the cell output.
results_per_epoch.groupby(['learning_rate', 'lora_r', 'lora_alpha'])['f1-micro'].mean()

learning_rate  lora_r  lora_alpha
0.0003         32      32            0.775458
                       64            0.734594
               8       16            0.777797
                       8             0.788464
3e-05          32      32            0.767685
                       64            0.777455
               8       16            0.761964
                       8             0.745339
Name: f1-micro, dtype: float64