## Language

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

# Put the project's helper directory on the import path before the local
# imports that follow (presumably where `preprocessing` and `evaluation` live).
utils = os.path.abspath(os.path.join(os.pardir, 'src', 'utils'))
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Notebook-wide configuration.
pd.set_option('display.max_columns', None)  # never truncate wide result tables
random.seed(42)

# Shared settings object; later cells attach `results`, `task` and
# `lr_setting` to it before calling computePromptStatistics.
args = SimpleNamespace(dataset='rest-16')

# One container per task for the pairwise-comparison tables,
# keyed by the lr_setting as a string ('0', '1000', '500').
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

# NOTE(review): not referenced in the visible cells — confirm it is still needed.
N_SAMPLES = 1000

def computePromptStatistics(args):
    """Compare prompt variants on micro-F1 over splits 1-5 with paired tests.

    The runs in ``args.results`` are filtered to ``args.dataset``,
    ``args.task`` and ``args.lr_setting`` (``0`` is looked up as ``'full'``;
    split ``'0'`` is excluded). For every (model_config, split) group the
    best-scoring row is kept, and a split x prompt table of micro-F1 is built
    from it. The table, the per-prompt normality checks and the omnibus test
    are displayed/printed as a side effect.

    If every prompt column passes ``pg.normality`` a repeated-measures ANOVA
    and paired t-tests are used, otherwise a Friedman test and Wilcoxon
    signed-rank tests. Pairwise p-values are Holm-corrected at alpha = 0.05.

    Parameters
    ----------
    args : namespace-like
        Must provide ``results`` (DataFrame containing the columns selected
        below), ``dataset``, ``task`` and ``lr_setting``.

    Returns
    -------
    pandas.DataFrame
        One row per prompt pair with the columns ``test``, ``comparison``,
        ``mean 1``, ``std 1``, ``mean 2``, ``std 2`` (percent, two decimals),
        ``statistic``, ``p_value``, ``corrected_p_value`` and ``significant``.

    Raises
    ------
    ValueError
        If fewer than two prompts have a score for every split, because no
        pairwise comparison is possible then.
    """
    import warnings  # local import: only needed for the missing-split notice

    score_col = 'f1-micro'
    splits = range(1, 6)
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    results = args.results
    selection = (
        (results['dataset'] == args.dataset)
        & (results['task'] == args.task)
        & (results['split'] != str(0))
        & (results['lr_setting'] == lr_setting)
    )
    results_sub = results[selection].sort_values(by=[score_col], ascending=False)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha',
                               'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path',
                               'f1-micro', 'f1-macro', 'accuracy']]

    # Keep only the best-scoring row of every (model_config, split) group.
    idx_max = results_sub.groupby(['model_config', 'split'])[score_col].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    # ACD is evaluated without the chain-of-thought prompt.
    prompts = ['basic', 'context'] if args.task == 'acd' else ['basic', 'context', 'cot']

    f1_prompts = {}
    for prompt in prompts:
        prompt_rows = results_per_epoch[results_per_epoch['prompt'] == prompt]
        scores = {}
        missing = []
        for split in splits:
            split_rows = prompt_rows[prompt_rows['split'] == str(split)]
            if split_rows.empty:
                missing.append(split)
            else:
                # NOTE(review): if several model configurations exist for the same
                # prompt and split, the first row in group-key order is taken, not
                # necessarily the best-scoring one — confirm this is intended.
                scores[split] = split_rows[score_col].iloc[0]
        if missing:
            # The paired tests need a complete column, so the prompt is left out,
            # but visibly instead of silently.
            warnings.warn(
                f"Prompt '{prompt}' has no result for split(s) {missing} "
                f"(task={args.task}, lr_setting={lr_setting}); it is left out of the comparison.",
                stacklevel=2,
            )
            continue
        f1_prompts[prompt] = scores

    if len(f1_prompts) < 2:
        raise ValueError(
            f"Need complete results for at least two prompts to compare them, "
            f"found {sorted(f1_prompts)} (task={args.task}, lr_setting={lr_setting})."
        )

    df_prompts = pd.DataFrame(f1_prompts)
    display(df_prompts)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}
    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (split, prompt, f1), built once for printing and for the ANOVA.
    df_long = (
        df_prompts
        .melt(var_name='prompt', value_name='f1', ignore_index=False)
        .reset_index()
        .rename(columns={'index': 'split'})
    )
    print(df_long)

    if all_normal:
        # All columns look normally distributed: repeated-measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=df_long)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one column is not normally distributed: Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons
    results = []
    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when all columns are normally distributed.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T'].iloc[0]
        else:
            # Otherwise the Wilcoxon signed-rank test.
            # NOTE: with only five paired values the exact two-sided p-value
            # cannot fall below 0.0625, so this branch cannot reach alpha = 0.05.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val'].iloc[0]

        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            # np.std uses ddof=0 (population standard deviation).
            'mean 1': round(np.mean(df_prompts[col1]) * 100, 2),
            'std 1': round(np.std(df_prompts[col1]) * 100, 2),
            'mean 2': round(np.mean(df_prompts[col2]) * 100, 2),
            'std 2': round(np.std(df_prompts[col2]) * 100, 2),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    # Collect the test results in a DataFrame.
    results_df = pd.DataFrame(results)

    # Holm-Bonferroni correction: multicomp returns (reject flags, corrected p-values).
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=0.05)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df



In [2]:
RESULTS_PATH = '../results/ft_llm/'
col_names = ['task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']

# Metric file that is read for each task prefix of a run folder.
METRIC_FILES = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'e2e-e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}
# Position of the split in the underscore-separated folder name.
SPLIT_FIELD = 7
# Fields encoded in the folder name; the remaining five columns
# (model_config, path and the three scores) are appended below.
N_NAME_FIELDS = len(col_names) - 5

folder_names = [folder for folder in os.listdir(RESULTS_PATH) if os.path.isdir(os.path.join(RESULTS_PATH, folder)) and folder != '.ipynb_checkpoints']
runs = []
skipped = []  # (folder, reason) pairs, reported after loading instead of being dropped silently

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    filename = METRIC_FILES.get(cond_parameters[0])
    if filename is None:
        skipped.append((folder_name, f"unknown task prefix '{cond_parameters[0]}'"))
        continue
    if len(cond_parameters) != N_NAME_FIELDS:
        # A row of the wrong length would either break the DataFrame construction
        # or be padded and end up with shifted columns.
        skipped.append((folder_name, f'expected {N_NAME_FIELDS} name fields, found {len(cond_parameters)}'))
        continue

    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep = '\t')
        df = df.set_index(df.columns[0])
        f1_micro = df.loc['Micro-AVG', 'f1']
        f1_macro = df.loc['Macro-AVG', 'f1']
        accuracy = df.loc['Micro-AVG', 'accuracy']
    except (OSError, KeyError, IndexError, ValueError) as err:
        # Missing, empty or malformed metric file: keep loading the other runs.
        skipped.append((folder_name, f'{type(err).__name__}: {err}'))
        continue

    # Configuration id shared by all splits and epochs of a run:
    # the folder name without its split and (last) epoch field.
    model_config = [field for pos, field in enumerate(cond_parameters[:-1]) if pos != SPLIT_FIELD]

    runs.append(cond_parameters + ['_'.join(model_config), '_'.join(cond_parameters), f1_micro, f1_macro, accuracy])

results_all = pd.DataFrame(runs, columns = col_names)

if skipped:
    print(f'Skipped {len(skipped)} of {len(folder_names)} result folders:')
    for skipped_folder, reason in skipped:
        print(f'  {skipped_folder}: {reason}')

args.results = results_all

## ACD

### Full Dataset

In [3]:
# ACD with lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acd', 0

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,basic,context
1,0.8299,0.8497
2,0.8694,0.8407
3,0.8243,0.8494
4,0.851,0.8606
5,0.846,0.8603


Unnamed: 0,W,pval,normal
basic,0.95702,0.787097,True


Unnamed: 0,W,pval,normal
context,0.89011,0.357658,True


   split   prompt      f1
0      1    basic  0.8299
1      2    basic  0.8694
2      3    basic  0.8243
3      4    basic  0.8510
4      5    basic  0.8460
5      1  context  0.8497
6      2  context  0.8407
7      3  context  0.8494
8      4  context  0.8606
9      5  context  0.8603
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  0.706512  0.447913  0.093041  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,84.41,1.6,85.21,0.75,-0.840542,0.447913,0.447913,False


### 1000

In [4]:
# ACD with lr_setting 1000.
args.task, args.lr_setting = 'acd', 1000

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,basic,context
1,0.8578,0.6897
2,0.8452,0.8212
3,0.8031,0.8113
4,0.7668,0.8498
5,0.8,0.8152


Unnamed: 0,W,pval,normal
basic,0.938139,0.652813,True


Unnamed: 0,W,pval,normal
context,0.767566,0.042937,False


   split   prompt      f1
0      1    basic  0.8578
1      2    basic  0.8452
2      3    basic  0.8031
3      4    basic  0.7668
4      5    basic  0.8000
5      1  context  0.6897
6      2  context  0.8212
7      3  context  0.8113
8      4  context  0.8498
9      5  context  0.8152
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.04      1  0.2  0.654721


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,81.46,3.3,79.74,5.55,7.0,1.0,1.0,False


### 500

In [5]:
# ACD with lr_setting 500.
args.task, args.lr_setting = 'acd', 500

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,basic,context
1,0.7658,0.7965
2,0.8287,0.784
3,0.8143,0.7809
4,0.8492,0.761
5,0.7995,0.8134


Unnamed: 0,W,pval,normal
basic,0.986385,0.965571,True


Unnamed: 0,W,pval,normal
context,0.986798,0.967326,True


   split   prompt      f1
0      1    basic  0.7658
1      2    basic  0.8287
2      3    basic  0.8143
3      4    basic  0.8492
4      5    basic  0.7995
5      1  context  0.7965
6      2  context  0.7840
7      3  context  0.7809
8      4  context  0.7610
9      5  context  0.8134
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2  eps
0  prompt      1      4  1.30688  0.316732  0.213173  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,81.15,2.81,78.72,1.74,1.143188,0.316732,0.316732,False


## ACSA

### Full Dataset

In [6]:
# ACSA with lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acsa', 0

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,basic,context,cot
1,0.8038,0.8407,0.7704
2,0.8267,0.8242,0.8108
3,0.7688,0.7827,0.7809
4,0.8352,0.8237,0.8075
5,0.8015,0.7638,0.7873


Unnamed: 0,W,pval,normal
basic,0.939847,0.664856,True


Unnamed: 0,W,pval,normal
context,0.896665,0.391727,True


Unnamed: 0,W,pval,normal
cot,0.919719,0.528128,True


    split   prompt      f1
0       1    basic  0.8038
1       2    basic  0.8267
2       3    basic  0.7688
3       4    basic  0.8352
4       5    basic  0.8015
5       1  context  0.8407
6       2  context  0.8242
7       3  context  0.7827
8       4  context  0.8237
9       5  context  0.7638
10      1      cot  0.7704
11      2      cot  0.8108
12      3      cot  0.7809
13      4      cot  0.8075
14      5      cot  0.7873
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  1.092125  0.380753  0.092905  0.692374


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,80.72,2.32,80.7,2.89,0.014424,0.989182,0.989182,False
1,t-test,basic vs cot,80.72,2.32,79.14,1.55,2.01537,0.114098,0.342295,False
2,t-test,context vs cot,80.7,2.89,79.14,1.55,1.018452,0.366052,0.732104,False


### 1000

In [7]:
# ACSA with lr_setting 1000.
args.task, args.lr_setting = 'acsa', 1000

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,basic,context,cot
1,0.8005,0.7934,0.7956
2,0.8162,0.8025,0.8162
3,0.8015,0.799,0.7729
4,0.8009,0.8029,0.8111
5,0.747,0.8111,0.7864


Unnamed: 0,W,pval,normal
basic,0.751384,0.030653,False


Unnamed: 0,W,pval,normal
context,0.969023,0.868949,True


Unnamed: 0,W,pval,normal
cot,0.957089,0.78758,True


    split   prompt      f1
0       1    basic  0.8005
1       2    basic  0.8162
2       3    basic  0.8015
3       4    basic  0.8009
4       5    basic  0.7470
5       1  context  0.7934
6       2  context  0.8025
7       3  context  0.7990
8       4  context  0.8029
9       5  context  0.8111
10      1      cot  0.7956
11      2      cot  0.8162
12      3      cot  0.7729
13      4      cot  0.8111
14      5      cot  0.7864
Friedman Test Result:
          Source         W  ddof1         Q    p-unc
Friedman  Within  0.031579      2  0.315789  0.85394


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,79.32,2.39,80.18,0.58,6.0,0.8125,1.0,False
1,wilcoxon,basic vs cot,79.32,2.39,79.64,1.59,4.0,0.855132,1.0,False
2,wilcoxon,context vs cot,80.18,0.58,79.64,1.59,6.0,0.8125,1.0,False


### 500

In [8]:
# ACSA with lr_setting 500.
args.task, args.lr_setting = 'acsa', 500

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,basic,context,cot
1,0.7713,0.7619,0.7285
2,0.7537,0.7785,0.747
3,0.7573,0.7551,0.7792
4,0.8155,0.7589,0.7569
5,0.7837,0.7807,0.7474


Unnamed: 0,W,pval,normal
basic,0.904757,0.436735,True


Unnamed: 0,W,pval,normal
context,0.854828,0.210266,True


Unnamed: 0,W,pval,normal
cot,0.956087,0.780508,True


    split   prompt      f1
0       1    basic  0.7713
1       2    basic  0.7537
2       3    basic  0.7573
3       4    basic  0.8155
4       5    basic  0.7837
5       1  context  0.7619
6       2  context  0.7785
7       3  context  0.7551
8       4  context  0.7589
9       5  context  0.7807
10      1      cot  0.7285
11      2      cot  0.7470
12      3      cot  0.7792
13      4      cot  0.7569
14      5      cot  0.7474
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F    p-unc       ng2       eps
0  prompt      2      8  1.792964  0.22732  0.257866  0.941815


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,77.63,2.23,76.7,1.05,0.702485,0.521105,0.521105,False
1,t-test,basic vs cot,77.63,2.23,75.18,1.65,1.709749,0.162488,0.487465,False
2,t-test,context vs cot,76.7,1.05,75.18,1.65,1.323912,0.256121,0.512243,False


## E2E

In [9]:
# E2E with lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'e2e', 0

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,basic,context,cot
1,0.7434,0.7594,0.7612
2,0.7986,0.8074,0.7761
3,0.8067,0.785,0.7807
4,0.837,0.8235,0.8113
5,0.8177,0.8222,0.7714


Unnamed: 0,W,pval,normal
basic,0.905596,0.441584,True


Unnamed: 0,W,pval,normal
context,0.895268,0.384285,True


Unnamed: 0,W,pval,normal
cot,0.887972,0.347007,True


    split   prompt      f1
0       1    basic  0.7434
1       2    basic  0.7986
2       3    basic  0.8067
3       4    basic  0.8370
4       5    basic  0.8177
5       1  context  0.7594
6       2  context  0.8074
7       3  context  0.7850
8       4  context  0.8235
9       5  context  0.8222
10      1      cot  0.7612
11      2      cot  0.7761
12      3      cot  0.7807
13      4      cot  0.8113
14      5      cot  0.7714
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.160434  0.097383  0.124836  0.848712


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,80.07,3.14,79.95,2.44,0.166888,0.875555,0.875555,False
1,t-test,basic vs cot,80.07,3.14,78.01,1.69,1.961172,0.121391,0.344696,False
2,t-test,context vs cot,79.95,2.44,78.01,1.69,2.009239,0.114899,0.344696,False


In [10]:
# E2E with lr_setting 1000.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7372,0.7485,0.704
2,0.7985,0.7527,0.7305
3,0.7769,0.7442,0.7175
4,0.7952,0.7875,0.7598
5,0.7914,0.797,0.0048


Unnamed: 0,W,pval,normal
basic,0.801455,0.083273,True


Unnamed: 0,W,pval,normal
context,0.831693,0.143237,True


Unnamed: 0,W,pval,normal
cot,0.61483,0.000981,False


    split   prompt      f1
0       1    basic  0.7372
1       2    basic  0.7985
2       3    basic  0.7769
3       4    basic  0.7952
4       5    basic  0.7914
5       1  context  0.7485
6       2  context  0.7527
7       3  context  0.7442
8       4  context  0.7875
9       5  context  0.7970
10      1      cot  0.7040
11      2      cot  0.7305
12      3      cot  0.7175
13      4      cot  0.7598
14      5      cot  0.0048
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.76      2  7.6  0.022371


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,77.98,2.26,76.6,2.18,4.0,0.4375,0.4375,False
1,wilcoxon,basic vs cot,77.98,2.26,58.33,28.98,0.0,0.0625,0.1875,False
2,wilcoxon,context vs cot,76.6,2.18,58.33,28.98,0.0,0.0625,0.1875,False


In [11]:
# E2E with lr_setting 500.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.7037,0.7099,0.511
2,0.7426,0.7546,0.5803
3,0.7415,0.7419,0.5737
4,0.7463,0.728,0.6649
5,0.7625,0.7352,0.6812


Unnamed: 0,W,pval,normal
basic,0.865732,0.249568,True


Unnamed: 0,W,pval,normal
context,0.989374,0.977456,True


Unnamed: 0,W,pval,normal
cot,0.923136,0.55037,True


    split   prompt      f1
0       1    basic  0.7037
1       2    basic  0.7426
2       3    basic  0.7415
3       4    basic  0.7463
4       5    basic  0.7625
5       1  context  0.7099
6       2  context  0.7546
7       3  context  0.7419
8       4  context  0.7280
9       5  context  0.7352
10      1      cot  0.5110
11      2      cot  0.5803
12      3      cot  0.5737
13      4      cot  0.6649
14      5      cot  0.6812
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  23.790016  0.000429  0.725956  0.506259


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,73.93,1.93,73.39,1.49,0.72258,0.509926,0.509926,False
1,t-test,basic vs cot,73.93,1.93,60.22,6.29,5.876758,0.004189,0.012567,True
2,t-test,context vs cot,73.39,1.49,60.22,6.29,4.341301,0.01224,0.024481,True


## E2E - Explicit only

In [37]:
# E2E (explicit only) with lr_setting 0; the table is displayed but not stored.
# NOTE(review): the saved output below shows unrounded means, so it appears to
# come from an older version of computePromptStatistics — re-run to refresh.
args.task, args.lr_setting = 'e2e-e', 0

computePromptStatistics(args)

Unnamed: 0,basic,context,cot
1,0.8093,0.7757,0.8147
2,0.7654,0.7583,0.7953
3,0.7813,0.7819,0.7974
4,0.8328,0.7728,0.8099
5,0.8046,0.755,0.7763


Unnamed: 0,W,pval,normal
basic,0.976582,0.915563,True


Unnamed: 0,W,pval,normal
context,0.909621,0.465314,True


Unnamed: 0,W,pval,normal
cot,0.941126,0.673908,True


    split   prompt      f1
0       1    basic  0.8093
1       2    basic  0.7654
2       3    basic  0.7813
3       4    basic  0.8328
4       5    basic  0.8046
5       1  context  0.7757
6       2  context  0.7583
7       3  context  0.7819
8       4  context  0.7728
9       5  context  0.7550
10      1      cot  0.8147
11      2      cot  0.7953
12      3      cot  0.7974
13      4      cot  0.8099
14      5      cot  0.7763
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  6.26392  0.023067  0.418698  0.634063


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.79868,0.023331,0.76874,0.010353,2.549645,0.063329,0.126658,False
1,t-test,basic vs cot,0.79868,0.023331,0.79872,0.01339,-0.003572,0.997321,0.997321,False
2,t-test,context vs cot,0.76874,0.010353,0.79872,0.01339,-6.20857,0.003424,0.010273,True


In [38]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but uses
# task 'e2e' and overwrites stats_e2e['1000'], which the earlier E2E cell already
# fills — confirm whether 'e2e-e' and a separate container were intended, or
# whether this is a leftover duplicate. Its saved output also shows unrounded
# means, i.e. it appears to predate the current computePromptStatistics.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7372,0.7485,0.7055
2,0.7985,0.7527,0.7305
3,0.7769,0.7442,0.7175
4,0.7531,0.7875,0.7612
5,0.7919,0.7951,0.7126


Unnamed: 0,W,pval,normal
basic,0.933575,0.620945,True


Unnamed: 0,W,pval,normal
context,0.824998,0.127529,True


Unnamed: 0,W,pval,normal
cot,0.885572,0.335326,True


    split   prompt      f1
0       1    basic  0.7372
1       2    basic  0.7985
2       3    basic  0.7769
3       4    basic  0.7531
4       5    basic  0.7919
5       1  context  0.7485
6       2  context  0.7527
7       3  context  0.7442
8       4  context  0.7875
9       5  context  0.7951
10      1      cot  0.7055
11      2      cot  0.7305
12      3      cot  0.7175
13      4      cot  0.7612
14      5      cot  0.7126
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  6.433357  0.021604  0.476976  0.879006


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.77152,0.023192,0.7656,0.021292,0.403132,0.707469,0.707469,False
1,t-test,basic vs cot,0.77152,0.023192,0.72546,0.019646,2.941545,0.042324,0.084648,False
2,t-test,context vs cot,0.7656,0.021292,0.72546,0.019646,3.592695,0.022907,0.06872,False


In [39]:
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but uses
# task 'e2e' and overwrites stats_e2e['500'], which the earlier E2E cell already
# fills — confirm whether 'e2e-e' and a separate container were intended, or
# whether this is a leftover duplicate. Its saved output also shows unrounded
# means, i.e. it appears to predate the current computePromptStatistics.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.6486,0.6512,0.6322
2,0.7085,0.7546,0.7139
3,0.7415,0.7419,0.5722
4,0.6145,0.5988,0.6933
5,0.7631,0.7359,0.691


Unnamed: 0,W,pval,normal
basic,0.940534,0.669716,True


Unnamed: 0,W,pval,normal
context,0.846621,0.184073,True


Unnamed: 0,W,pval,normal
cot,0.881799,0.317541,True


    split   prompt      f1
0       1    basic  0.6486
1       2    basic  0.7085
2       3    basic  0.7415
3       4    basic  0.6145
4       5    basic  0.7631
5       1  context  0.6512
6       2  context  0.7546
7       3  context  0.7419
8       4  context  0.5988
9       5  context  0.7359
10      1      cot  0.6322
11      2      cot  0.7139
12      3      cot  0.5722
13      4      cot  0.6933
14      5      cot  0.6910
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2      eps
0  prompt      2      8  0.687341  0.530315  0.080405  0.56805


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.69524,0.055884,0.69648,0.06097,-0.099427,0.925583,1.0,False
1,t-test,basic vs cot,0.69524,0.055884,0.66052,0.051876,0.837892,0.449238,1.0,False
2,t-test,context vs cot,0.69648,0.06097,0.66052,0.051876,0.855994,0.44025,1.0,False


## TASD

### Full Dataset

In [12]:
# TASD with lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'tasd', 0

stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,basic,context,cot
1,0.7324,0.7648,0.6968
2,0.7614,0.7115,0.6632
3,0.751,0.7459,0.7009
4,0.7863,0.7687,0.7438
5,0.7516,0.7564,0.704


Unnamed: 0,W,pval,normal
basic,0.949841,0.736055,True


Unnamed: 0,W,pval,normal
context,0.860335,0.229454,True


Unnamed: 0,W,pval,normal
cot,0.931301,0.605268,True


    split   prompt      f1
0       1    basic  0.7324
1       2    basic  0.7614
2       3    basic  0.7510
3       4    basic  0.7863
4       5    basic  0.7516
5       1  context  0.7648
6       2  context  0.7115
7       3  context  0.7459
8       4  context  0.7687
9       5  context  0.7564
10      1      cot  0.6968
11      2      cot  0.6632
12      3      cot  0.7009
13      4      cot  0.7438
14      5      cot  0.7040
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc      ng2       eps
0  prompt      2      8  15.042579  0.001947  0.56146  0.697647


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,75.65,1.76,74.95,2.05,0.524232,0.627824,0.627824,False
1,t-test,basic vs cot,75.65,1.76,70.17,2.56,4.923747,0.007908,0.015816,True
2,t-test,context vs cot,74.95,2.05,70.17,2.56,6.881813,0.002336,0.007009,True


### 1000

In [13]:
# TASD with lr_setting 1000.
args.task, args.lr_setting = 'tasd', 1000

stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,basic,context,cot
1,0.7019,0.7148,0.6739
2,0.7211,0.7302,0.7225
3,0.7682,0.7288,0.6769
4,0.7432,0.7826,0.697
5,0.7207,0.6933,0.7028


Unnamed: 0,W,pval,normal
basic,0.948244,0.72464,True


Unnamed: 0,W,pval,normal
context,0.91523,0.499646,True


Unnamed: 0,W,pval,normal
cot,0.932916,0.616384,True


    split   prompt      f1
0       1    basic  0.7019
1       2    basic  0.7211
2       3    basic  0.7682
3       4    basic  0.7432
4       5    basic  0.7207
5       1  context  0.7148
6       2  context  0.7302
7       3  context  0.7288
8       4  context  0.7826
9       5  context  0.6933
10      1      cot  0.6739
11      2      cot  0.7225
12      3      cot  0.6769
13      4      cot  0.6970
14      5      cot  0.7028
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  3.511958  0.080395  0.334786  0.969088


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,73.1,2.27,72.99,2.95,0.075441,0.943486,0.943486,False
1,t-test,basic vs cot,73.1,2.27,69.46,1.79,2.313339,0.081732,0.245196,False
2,t-test,context vs cot,72.99,2.95,69.46,1.79,2.10968,0.102527,0.245196,False


### 500

In [15]:
# TASD with lr_setting 500.
args.task, args.lr_setting = 'tasd', 500

stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,basic,context,cot
1,0.6947,0.6674,0.5965
2,0.7284,0.7384,0.6134
3,0.6914,0.697,0.6164
4,0.7298,0.7557,0.6821
5,0.7286,0.6615,0.6168


Unnamed: 0,W,pval,normal
basic,0.732072,0.020135,False


Unnamed: 0,W,pval,normal
context,0.901901,0.420481,True


Unnamed: 0,W,pval,normal
cot,0.765998,0.041583,False


    split   prompt      f1
0       1    basic  0.6947
1       2    basic  0.7284
2       3    basic  0.6914
3       4    basic  0.7298
4       5    basic  0.7286
5       1  context  0.6674
6       2  context  0.7384
7       3  context  0.6970
8       4  context  0.7557
9       5  context  0.6615
10      1      cot  0.5965
11      2      cot  0.6134
12      3      cot  0.6164
13      4      cot  0.6821
14      5      cot  0.6168
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.76      2  7.6  0.022371


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,71.46,1.76,70.4,3.76,6.0,0.8125,0.8125,False
1,wilcoxon,basic vs cot,71.46,1.76,62.5,2.95,0.0,0.0625,0.1875,False
2,wilcoxon,context vs cot,70.4,3.76,62.5,2.95,0.0,0.0625,0.1875,False
