## Language

In [10]:
import pandas as pd
import os
import sys
import numpy as np
import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
from sklearn.metrics import f1_score
from sklearn.utils import resample
from itertools import combinations

import random
import scikit_posthocs as sp
import scipy.stats as stats
import numpy as np

utils = os.path.abspath('../src/utils/')
sys.path.append(utils)

from preprocessing import loadDataset
from evaluation import extractAspects, convertLabels, createResults
from types import SimpleNamespace
from pingouin import kruskal
import pingouin as pg
import chardet
import codecs

# Render every column of wide result tables instead of truncating them.
pd.set_option('display.max_columns', None)
random.seed(42)

# Shared run configuration. Later cells attach `results`, `task` and
# `lr_setting` to this namespace before calling computePromptStatistics.
args = SimpleNamespace(dataset='GERestaurant')

# One container per task; each maps an lr_setting label to its result table.
stats_acd, stats_acsa, stats_e2e, stats_tasd = {}, {}, {}, {}

RESULTS_PATH = '../results/ft_llm/'
N_SAMPLES = 1000

# Column layout of the combined results table built by the loader cell.
col_names = [
    'task', 'dataset', 'prompt', 'learning_rate', 'lora_r', 'lora_alpha',
    'lora_dropout', 'split', 'lr_setting', 'epoch', 'model_config', 'path',
    'f1-micro', 'f1-macro', 'accuracy',
]

# Every run directory below RESULTS_PATH, ignoring Jupyter's checkpoint folder.
folder_names = [
    entry
    for entry in os.listdir(RESULTS_PATH)
    if entry != '.ipynb_checkpoints'
    and os.path.isdir(os.path.join(RESULTS_PATH, entry))
]

def computePromptStatistics(args, n_splits=5, alpha=0.05):
    """Compare prompt variants for one task / lr_setting across the data splits.

    For every prompt the micro-F1 of the best epoch per split is collected,
    giving one paired sample per prompt. The samples are checked for
    normality, an omnibus test is printed (repeated-measures ANOVA if all
    samples pass the normality check, Friedman test otherwise) and all prompt
    pairs are compared with a paired t-test or a Wilcoxon signed-rank test.
    Pairwise p-values are corrected with the Holm procedure.

    Parameters
    ----------
    args : namespace-like object with the attributes
        results    -- DataFrame holding at least the columns 'dataset',
                      'task', 'prompt', 'split', 'lr_setting',
                      'model_config' and 'f1-micro' (plus the other run
                      columns selected below).
        dataset    -- value matched against the 'dataset' column.
        task       -- value matched against the 'task' column; 'acd' is
                      evaluated with the prompts basic/context, every other
                      task additionally with cot.
        lr_setting -- 0 is looked up as 'full', any other value as its
                      string form.
    n_splits : int, default 5
        Number of splits (labelled '1'..str(n_splits)) every prompt must
        cover to be included in the comparison.
    alpha : float, default 0.05
        Significance level used for the Holm correction.

    Returns
    -------
    pandas.DataFrame
        One row per prompt pair with the test used, mean/std of both prompts
        in percent, the test statistic, raw and corrected p-value and the
        significance flag.

    Raises
    ------
    ValueError
        If fewer than two prompts have a result for every split, because no
        pairwise comparison is possible then.
    """
    # lr_setting 0 is stored under the label 'full' in the results table.
    lr_setting = 'full' if args.lr_setting == 0 else str(args.lr_setting)

    results = args.results
    selected = (
        (results['dataset'] == args.dataset)
        & (results['task'] == args.task)
        & (results['split'] != str(0))
        & (results['lr_setting'] == lr_setting)
    )
    results_sub = results[selected].sort_values(by=['f1-micro'], ascending=False)
    results_sub = results_sub[['dataset', 'task', 'prompt', 'learning_rate', 'lora_r',
                               'lora_alpha', 'lora_dropout', 'split', 'lr_setting', 'epoch',
                               'model_config', 'path', 'f1-micro', 'f1-macro', 'accuracy']]

    # Keep only the best-scoring row (by micro-F1) of every model config and split.
    idx_max = results_sub.groupby(['model_config', 'split'])['f1-micro'].idxmax()
    results_per_epoch = results_sub.loc[idx_max]

    if args.task == 'acd':
        prompts = ['basic', 'context']
    else:
        prompts = ['basic', 'context', 'cot']

    # Collect one micro-F1 per split for each prompt. A prompt that lacks any
    # split is left out entirely so that all samples stay paired.
    # NOTE(review): if several model configs exist for the same prompt and
    # split, the first row in group order is used, not the best one -- confirm
    # that a single config per prompt is intended.
    f1_prompts = {}
    for prompt in prompts:
        prompt_rows = results_per_epoch[results_per_epoch['prompt'] == prompt]
        f1_by_split = {}
        for split in range(1, n_splits + 1):
            scores = prompt_rows.loc[prompt_rows['split'] == str(split), 'f1-micro']
            if scores.empty:
                break
            f1_by_split[split] = scores.iloc[0]
        else:
            f1_prompts[prompt] = f1_by_split

    if len(f1_prompts) < 2:
        raise ValueError(
            f"Need complete results for at least two prompts to compare, "
            f"found {list(f1_prompts)} (dataset={args.dataset!r}, "
            f"task={args.task!r}, lr_setting={lr_setting!r})."
        )

    df_prompts = pd.DataFrame(f1_prompts)

    display(df_prompts)

    normality_results = {col: pg.normality(df_prompts[col]) for col in df_prompts.columns}

    for item in normality_results.values():
        display(item)

    all_normal = all(result['normal'].iloc[0] for result in normality_results.values())

    # Long format (one row per split and prompt) as required by rm_anova.
    long_df = (
        df_prompts
        .melt(var_name='prompt', value_name='f1', ignore_index=False)
        .reset_index()
        .rename(columns={'index': 'split'})
    )
    print(long_df)

    if all_normal:
        # All samples passed the normality check: repeated-measures ANOVA.
        rm_anova = pg.rm_anova(dv='f1', within='prompt', subject='split', data=long_df)
        print("Repeated Measures ANOVA Result:")
        print(rm_anova)
    else:
        # At least one sample failed the normality check: Friedman test.
        friedman = pg.friedman(df_prompts)
        print("Friedman Test Result:")
        print(friedman)

    # Pairwise comparisons of all prompt pairs.
    results = []
    for col1, col2 in combinations(df_prompts.columns, 2):
        if all_normal:
            # Paired t-test when all samples passed the normality check.
            test = 't-test'
            test_result = pg.ttest(df_prompts[col1], df_prompts[col2], paired=True, alternative='two-sided')
            statistic = test_result['T']['T-test']
        else:
            # Otherwise fall back to the Wilcoxon signed-rank test.
            test = 'wilcoxon'
            test_result = pg.wilcoxon(df_prompts[col1], df_prompts[col2], alternative='two-sided')
            statistic = test_result['W-val']['Wilcoxon']

        # Means and standard deviations are reported in percent; the standard
        # deviation is the population form (ddof=0), as before.
        results.append({
            'test': test,
            'comparison': f'{col1} vs {col2}',
            'mean 1': round(np.mean(df_prompts[col1]) * 100, 2),
            'std 1': round(np.std(df_prompts[col1], ddof=0) * 100, 2),
            'mean 2': round(np.mean(df_prompts[col2]) * 100, 2),
            'std 2': round(np.std(df_prompts[col2], ddof=0) * 100, 2),
            'statistic': statistic,
            'p_value': test_result['p-val'].iloc[0]
        })

    results_df = pd.DataFrame(results)

    # Holm (Bonferroni-Holm) correction of the pairwise p-values.
    corrected_p = pg.multicomp(results_df['p_value'], method='holm', alpha=alpha)
    results_df['corrected_p_value'] = corrected_p[1]
    results_df['significant'] = corrected_p[0]

    return results_df

In [11]:
# Metrics file to read for each task prefix of a run folder.
METRIC_FILES = {
    'acd': 'metrics_asp.tsv',
    'acsa': 'metrics_asp_pol.tsv',
    'e2e': 'metrics_pol.tsv',
    'e2e-e': 'metrics_pol.tsv',
    'tasd': 'metrics_phrases.tsv',
}

# A folder name supplies the leading columns of `col_names`; the last five
# (model_config, path and the three metrics) are derived in the loop below.
N_DERIVED_COLS = 5
N_NAME_FIELDS = len(col_names) - N_DERIVED_COLS
SPLIT_POS = col_names.index('split')

runs = []
skipped_runs = []  # (folder name, reason) for every folder that could not be loaded

for folder_name in folder_names:
    cond_parameters = folder_name.split('_')

    # A different field count would shift values into the wrong columns.
    if len(cond_parameters) != N_NAME_FIELDS:
        skipped_runs.append((folder_name, f'expected {N_NAME_FIELDS} fields, got {len(cond_parameters)}'))
        continue

    filename = METRIC_FILES.get(cond_parameters[0])
    if filename is None:
        skipped_runs.append((folder_name, f'unknown task {cond_parameters[0]!r}'))
        continue

    # Loading stays best-effort: unreadable or incomplete runs are skipped,
    # but the reason is recorded instead of being swallowed silently.
    try:
        df = pd.read_csv(os.path.join(RESULTS_PATH, folder_name, filename), sep='\t')
        df = df.set_index(df.columns[0])
        f1_micro = df.loc['Micro-AVG', 'f1']
        f1_macro = df.loc['Macro-AVG', 'f1']
        accuracy = df.loc['Micro-AVG', 'accuracy']
    except (OSError, ValueError, KeyError, IndexError) as err:
        skipped_runs.append((folder_name, f'{type(err).__name__}: {err}'))
        continue

    # Config string without the split and the trailing epoch field, so that
    # all splits and epochs of one configuration share the same key.
    model_config = [
        field for pos, field in enumerate(cond_parameters[:-1]) if pos != SPLIT_POS
    ]

    runs.append(cond_parameters + [
        '_'.join(model_config),
        '_'.join(cond_parameters),
        f1_micro,
        f1_macro,
        accuracy,
    ])

if skipped_runs:
    print(f'Skipped {len(skipped_runs)} of {len(folder_names)} result folders:')
    for skipped_folder, reason in skipped_runs:
        print(f'  {skipped_folder}: {reason}')

results_all = pd.DataFrame(runs, columns = col_names)

args.results = results_all

## ACD

### Full Dataset

In [12]:
# ACD, lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acd', 0

stats_acd['0'] = computePromptStatistics(args)
stats_acd['0']

Unnamed: 0,basic,context
1,0.8747,0.8801
2,0.8663,0.8698
3,0.8757,0.8674
4,0.8943,0.8896
5,0.8827,0.8846


Unnamed: 0,W,pval,normal
basic,0.96305,0.829044,True


Unnamed: 0,W,pval,normal
context,0.928675,0.587366,True


   split   prompt      f1
0      1    basic  0.8747
1      2    basic  0.8663
2      3    basic  0.8757
3      4    basic  0.8943
4      5    basic  0.8827
5      1  context  0.8801
6      2  context  0.8698
7      3  context  0.8674
8      4  context  0.8896
9      5  context  0.8846
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  0.028675  0.873751  0.000605  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,87.87,0.94,87.83,0.85,0.169336,0.873751,0.873751,False


### 1000

In [13]:
# ACD, restricted to runs with lr_setting '1000'.
args.task, args.lr_setting = 'acd', 1000

stats_acd['1000'] = computePromptStatistics(args)
stats_acd['1000']

Unnamed: 0,basic,context
1,0.8798,0.8698
2,0.8423,0.8509
3,0.8625,0.8555
4,0.8952,0.8993
5,0.8527,0.8469


Unnamed: 0,W,pval,normal
basic,0.970018,0.87537,True


Unnamed: 0,W,pval,normal
context,0.85488,0.210443,True


   split   prompt      f1
0      1    basic  0.8798
1      2    basic  0.8423
2      3    basic  0.8625
3      4    basic  0.8952
4      5    basic  0.8527
5      1  context  0.8698
6      2  context  0.8509
7      3  context  0.8555
8      4  context  0.8993
9      5  context  0.8469
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2  eps
0  prompt      1      4  0.322551  0.600441  0.00282  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,86.65,1.89,86.45,1.91,0.567936,0.600441,0.600441,False


### 500

In [14]:
# ACD, restricted to runs with lr_setting '500'.
args.task, args.lr_setting = 'acd', 500

stats_acd['500'] = computePromptStatistics(args)
stats_acd['500']

Unnamed: 0,basic,context
1,0.887,0.8369
2,0.8265,0.8215
3,0.8557,0.8258
4,0.8674,0.8406
5,0.8694,0.837


Unnamed: 0,W,pval,normal
basic,0.944206,0.695806,True


Unnamed: 0,W,pval,normal
context,0.874845,0.286601,True


   split   prompt      f1
0      1    basic  0.8870
1      2    basic  0.8265
2      3    basic  0.8557
3      4    basic  0.8674
4      5    basic  0.8694
5      1  context  0.8369
6      2  context  0.8215
7      3  context  0.8258
8      4  context  0.8406
9      5  context  0.8370
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2  eps
0  prompt      1      4  16.02142  0.016094  0.477352  1.0


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,86.12,2.0,83.24,0.74,4.002677,0.016094,0.016094,True


## ACSA

### Full Dataset

In [15]:
# ACSA, lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'acsa', 0

stats_acsa['0'] = computePromptStatistics(args)
stats_acsa['0']

Unnamed: 0,basic,context,cot
1,0.8583,0.8438,0.851
2,0.8255,0.8093,0.8086
3,0.8226,0.8356,0.8163
4,0.8659,0.865,0.8191
5,0.8332,0.8665,0.8158


Unnamed: 0,W,pval,normal
basic,0.863788,0.24217,True


Unnamed: 0,W,pval,normal
context,0.915966,0.504258,True


Unnamed: 0,W,pval,normal
cot,0.763502,0.039503,False


    split   prompt      f1
0       1    basic  0.8583
1       2    basic  0.8255
2       3    basic  0.8226
3       4    basic  0.8659
4       5    basic  0.8332
5       1  context  0.8438
6       2  context  0.8093
7       3  context  0.8356
8       4  context  0.8650
9       5  context  0.8665
10      1      cot  0.8510
11      2      cot  0.8086
12      3      cot  0.8163
13      4      cot  0.8191
14      5      cot  0.8158
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.52      2  5.2  0.074274


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,84.11,1.77,84.4,2.11,7.0,1.0,1.0,False
1,wilcoxon,basic vs cot,84.11,1.77,82.22,1.48,0.0,0.0625,0.1875,False
2,wilcoxon,context vs cot,84.4,2.11,82.22,1.48,2.0,0.1875,0.375,False


### 1000

In [16]:
# ACSA, restricted to runs with lr_setting '1000'.
args.task, args.lr_setting = 'acsa', 1000

stats_acsa['1000'] = computePromptStatistics(args)
stats_acsa['1000']

Unnamed: 0,basic,context,cot
1,0.8314,0.8204,0.7715
2,0.7796,0.8031,0.7996
3,0.8365,0.81,0.7984
4,0.8479,0.7972,0.8268
5,0.7664,0.7579,0.8138


Unnamed: 0,W,pval,normal
basic,0.854784,0.210119,True


Unnamed: 0,W,pval,normal
context,0.873577,0.281211,True


Unnamed: 0,W,pval,normal
cot,0.961658,0.819479,True


    split   prompt      f1
0       1    basic  0.8314
1       2    basic  0.7796
2       3    basic  0.8365
3       4    basic  0.8479
4       5    basic  0.7664
5       1  context  0.8204
6       2  context  0.8031
7       3  context  0.8100
8       4  context  0.7972
9       5  context  0.7579
10      1      cot  0.7715
11      2      cot  0.7996
12      3      cot  0.7984
13      4      cot  0.8268
14      5      cot  0.8138
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2        F     p-unc       ng2       eps
0  prompt      2      8  0.39992  0.683063  0.056945  0.803937


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,81.24,3.28,79.77,2.14,1.206135,0.294225,0.882676,False
1,t-test,basic vs cot,81.24,3.28,80.2,1.85,0.530535,0.623828,1.0,False
2,t-test,context vs cot,79.77,2.14,80.2,1.85,-0.239508,0.822484,1.0,False


### 500

In [17]:
# ACSA, restricted to runs with lr_setting '500'.
args.task, args.lr_setting = 'acsa', 500

stats_acsa['500'] = computePromptStatistics(args)
stats_acsa['500']

Unnamed: 0,basic,context,cot
1,0.7956,0.825,0.815
2,0.7872,0.7988,0.7771
3,0.7952,0.8188,0.8266
4,0.8316,0.8387,0.8373
5,0.8094,0.7761,0.8129


Unnamed: 0,W,pval,normal
basic,0.888535,0.349789,True


Unnamed: 0,W,pval,normal
context,0.961278,0.816856,True


Unnamed: 0,W,pval,normal
cot,0.908588,0.459153,True


    split   prompt      f1
0       1    basic  0.7956
1       2    basic  0.7872
2       3    basic  0.7952
3       4    basic  0.8316
4       5    basic  0.8094
5       1  context  0.8250
6       2  context  0.7988
7       3  context  0.8188
8       4  context  0.8387
9       5  context  0.7761
10      1      cot  0.8150
11      2      cot  0.7771
12      3      cot  0.8266
13      4      cot  0.8373
14      5      cot  0.8129
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  0.607745  0.567919  0.045909  0.824293


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,80.38,1.56,81.15,2.19,-0.698143,0.523544,1.0,False
1,t-test,basic vs cot,80.38,1.56,81.38,2.03,-1.403639,0.233096,0.699287,False
2,t-test,context vs cot,81.15,2.19,81.38,2.03,-0.232247,0.827744,1.0,False


## E2E

In [18]:
# E2E, lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'e2e', 0

stats_e2e['0'] = computePromptStatistics(args)
stats_e2e['0']

Unnamed: 0,basic,context,cot
1,0.7923,0.7876,0.7666
2,0.7865,0.75,0.7546
3,0.8,0.7861,0.744
4,0.8335,0.8187,0.7875
5,0.8166,0.8004,0.7634


Unnamed: 0,W,pval,normal
basic,0.935021,0.630989,True


Unnamed: 0,W,pval,normal
context,0.948692,0.727844,True


Unnamed: 0,W,pval,normal
cot,0.968478,0.865396,True


    split   prompt      f1
0       1    basic  0.7923
1       2    basic  0.7865
2       3    basic  0.8000
3       4    basic  0.8335
4       5    basic  0.8166
5       1  context  0.7876
6       2  context  0.7500
7       3  context  0.7861
8       4  context  0.8187
9       5  context  0.8004
10      1      cot  0.7666
11      2      cot  0.7546
12      3      cot  0.7440
13      4      cot  0.7875
14      5      cot  0.7634
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  21.021297  0.000653  0.475312  0.750716


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,80.58,1.72,78.86,2.25,3.294561,0.030086,0.060173,False
1,t-test,basic vs cot,80.58,1.72,76.32,1.45,7.175668,0.001997,0.005992,True
2,t-test,context vs cot,78.86,2.25,76.32,1.45,3.065685,0.03745,0.060173,False


In [19]:
# E2E, restricted to runs with lr_setting '1000'.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7992,0.7785,0.7249
2,0.7455,0.7703,0.7212
3,0.7953,0.7758,0.7616
4,0.8136,0.8069,0.758
5,0.7681,0.7893,0.7609


Unnamed: 0,W,pval,normal
basic,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
context,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
cot,0.75758,0.034931,False


    split   prompt      f1
0       1    basic  0.7992
1       2    basic  0.7455
2       3    basic  0.7953
3       4    basic  0.8136
4       5    basic  0.7681
5       1  context  0.7785
6       2  context  0.7703
7       3  context  0.7758
8       4  context  0.8069
9       5  context  0.7893
10      1      cot  0.7249
11      2      cot  0.7212
12      3      cot  0.7616
13      4      cot  0.7580
14      5      cot  0.7609
Friedman Test Result:
          Source     W  ddof1    Q     p-unc
Friedman  Within  0.76      2  7.6  0.022371


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,wilcoxon,basic vs context,78.43,2.44,78.42,1.29,6.0,0.8125,0.8125,False
1,wilcoxon,basic vs cot,78.43,2.44,74.53,1.83,0.0,0.0625,0.1875,False
2,wilcoxon,context vs cot,78.42,1.29,74.53,1.83,0.0,0.0625,0.1875,False


In [20]:
# E2E, restricted to runs with lr_setting '500'.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.7458,0.7271,0.67
2,0.7606,0.7338,0.718
3,0.6998,0.6775,0.6722
4,0.7563,0.7069,0.7063
5,0.7301,0.7094,0.7091


Unnamed: 0,W,pval,normal
basic,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
context,0.93137,0.605745,True


Unnamed: 0,W,pval,normal
cot,0.83257,0.145409,True


    split   prompt      f1
0       1    basic  0.7458
1       2    basic  0.7606
2       3    basic  0.6998
3       4    basic  0.7563
4       5    basic  0.7301
5       1  context  0.7271
6       2  context  0.7338
7       3  context  0.6775
8       4  context  0.7069
9       5  context  0.7094
10      1      cot  0.6700
11      2      cot  0.7180
12      3      cot  0.6722
13      4      cot  0.7063
14      5      cot  0.7091
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F    p-unc       ng2       eps
0  prompt      2      8  12.153098  0.00376  0.431813  0.719894


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,73.85,2.2,71.09,1.96,4.910973,0.007981,0.023943,True
1,t-test,basic vs cot,73.85,2.2,69.51,2.0,4.518187,0.010673,0.023943,True
2,t-test,context vs cot,71.09,1.96,69.51,2.0,1.479336,0.213142,0.213142,False


### E2E - Explicit only

In [37]:
# Task 'e2e-e', lr_setting 0 (looked up as 'full'). The result table is only
# displayed here; it is not stored in any of the stats_* dicts.
args.task, args.lr_setting = 'e2e-e', 0

computePromptStatistics(args)

Unnamed: 0,basic,context,cot
1,0.8237,0.8284,0.8173
2,0.7497,0.6943,0.7918
3,0.8431,0.8365,0.852
4,0.7857,0.8154,0.7652
5,0.8039,0.7256,0.8055


Unnamed: 0,W,pval,normal
basic,0.983488,0.952373,True


Unnamed: 0,W,pval,normal
context,0.833036,0.146574,True


Unnamed: 0,W,pval,normal
cot,0.991887,0.985867,True


    split   prompt      f1
0       1    basic  0.8237
1       2    basic  0.7497
2       3    basic  0.8431
3       4    basic  0.7857
4       5    basic  0.8039
5       1  context  0.8284
6       2  context  0.6943
7       3  context  0.8365
8       4  context  0.8154
9       5  context  0.7256
10      1      cot  0.8173
11      2      cot  0.7918
12      3      cot  0.8520
13      4      cot  0.7652
14      5      cot  0.8055
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc      ng2       eps
0  prompt      2      8  0.919558  0.437055  0.06875  0.553783


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.80122,0.03213,0.78004,0.058467,1.065356,0.34675,1.0,False
1,t-test,basic vs cot,0.80122,0.03213,0.80636,0.028669,-0.491951,0.64853,1.0,False
2,t-test,context vs cot,0.78004,0.058467,0.80636,0.028669,-0.951289,0.395326,1.0,False


In [38]:
# E2E, restricted to runs with lr_setting '1000'.
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but
# uses task 'e2e' and overwrites the stats_e2e['1000'] entry assigned by an
# earlier cell -- presumably 'e2e-e' and a separate container were intended;
# confirm, or remove the cell as a duplicate.
args.task, args.lr_setting = 'e2e', 1000

stats_e2e['1000'] = computePromptStatistics(args)
stats_e2e['1000']

Unnamed: 0,basic,context,cot
1,0.7992,0.7785,0.7288
2,0.7455,0.7703,0.7223
3,0.7953,0.7758,0.7618
4,0.8136,0.8069,0.7614
5,0.7681,0.7893,0.7713


Unnamed: 0,W,pval,normal
basic,0.937736,0.649977,True


Unnamed: 0,W,pval,normal
context,0.908464,0.458414,True


Unnamed: 0,W,pval,normal
cot,0.848695,0.190429,True


    split   prompt      f1
0       1    basic  0.7992
1       2    basic  0.7455
2       3    basic  0.7953
3       4    basic  0.8136
4       5    basic  0.7681
5       1  context  0.7785
6       2  context  0.7703
7       3  context  0.7758
8       4  context  0.8069
9       5  context  0.7893
10      1      cot  0.7288
11      2      cot  0.7223
12      3      cot  0.7618
13      4      cot  0.7614
14      5      cot  0.7713
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  7.844823  0.013005  0.417371  0.776503


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.78434,0.02437,0.78416,0.012943,0.018381,0.986215,0.986215,False
1,t-test,basic vs cot,0.78434,0.02437,0.74912,0.019676,2.806625,0.048483,0.096966,False
2,t-test,context vs cot,0.78416,0.012943,0.74912,0.019676,4.47664,0.011018,0.033055,True


In [39]:
# E2E, restricted to runs with lr_setting '500'.
# NOTE(review): this cell sits under the "E2E - Explicit only" heading but
# uses task 'e2e' and overwrites the stats_e2e['500'] entry assigned by an
# earlier cell -- presumably 'e2e-e' and a separate container were intended;
# confirm, or remove the cell as a duplicate.
args.task, args.lr_setting = 'e2e', 500

stats_e2e['500'] = computePromptStatistics(args)
stats_e2e['500']

Unnamed: 0,basic,context,cot
1,0.7458,0.7271,0.6693
2,0.7606,0.7129,0.7179
3,0.6998,0.6775,0.6729
4,0.7563,0.7069,0.7082
5,0.7301,0.7094,0.7154


Unnamed: 0,W,pval,normal
basic,0.899971,0.409725,True


Unnamed: 0,W,pval,normal
context,0.900947,0.415142,True


Unnamed: 0,W,pval,normal
cot,0.80553,0.089826,True


    split   prompt      f1
0       1    basic  0.7458
1       2    basic  0.7606
2       3    basic  0.6998
3       4    basic  0.7563
4       5    basic  0.7301
5       1  context  0.7271
6       2  context  0.7129
7       3  context  0.6775
8       4  context  0.7069
9       5  context  0.7094
10      1      cot  0.6693
11      2      cot  0.7179
12      3      cot  0.6729
13      4      cot  0.7082
14      5      cot  0.7154
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  9.403763  0.007931  0.442679  0.754633


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,0.73852,0.022031,0.70676,0.016215,4.613979,0.009926,0.029778,True
1,t-test,basic vs cot,0.73852,0.022031,0.69674,0.021206,3.982654,0.016365,0.03273,True
2,t-test,context vs cot,0.70676,0.016215,0.69674,0.021206,0.82888,0.453768,0.453768,False


## TASD

### Full Dataset

In [21]:
# TASD, lr_setting 0 (looked up as 'full' by computePromptStatistics).
args.task, args.lr_setting = 'tasd', 0

stats_tasd['0'] = computePromptStatistics(args)
stats_tasd['0']

Unnamed: 0,basic,context,cot
1,0.7123,0.7433,0.7457
2,0.7362,0.7347,0.7218
3,0.7657,0.7655,0.7309
4,0.7583,0.7617,0.7293
5,0.7837,0.7755,0.6622


Unnamed: 0,W,pval,normal
basic,0.974914,0.905749,True


Unnamed: 0,W,pval,normal
context,0.94345,0.690422,True


Unnamed: 0,W,pval,normal
cot,0.794509,0.073045,True


    split   prompt      f1
0       1    basic  0.7123
1       2    basic  0.7362
2       3    basic  0.7657
3       4    basic  0.7583
4       5    basic  0.7837
5       1  context  0.7433
6       2  context  0.7347
7       3  context  0.7655
8       4  context  0.7617
9       5  context  0.7755
10      1      cot  0.7457
11      2      cot  0.7218
12      3      cot  0.7309
13      4      cot  0.7293
14      5      cot  0.6622
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  2.405621  0.152053  0.340145  0.526754


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,75.12,2.47,75.61,1.5,-0.721666,0.510431,0.511712,False
1,t-test,basic vs cot,75.12,2.47,71.8,2.89,1.324791,0.255856,0.511712,False
2,t-test,context vs cot,75.61,1.5,71.8,2.89,1.910982,0.1286,0.385801,False


### 1000

In [22]:
# TASD, restricted to runs with lr_setting '1000'.
args.task, args.lr_setting = 'tasd', 1000

stats_tasd['1000'] = computePromptStatistics(args)
stats_tasd['1000']

Unnamed: 0,basic,context,cot
1,0.7067,0.7317,0.6434
2,0.7114,0.7187,0.6771
3,0.7398,0.7419,0.6919
4,0.7859,0.7785,0.7066
5,0.7577,0.7166,0.6925


Unnamed: 0,W,pval,normal
basic,0.936901,0.644119,True


Unnamed: 0,W,pval,normal
context,0.865484,0.248613,True


Unnamed: 0,W,pval,normal
cot,0.898396,0.401086,True


    split   prompt      f1
0       1    basic  0.7067
1       2    basic  0.7114
2       3    basic  0.7398
3       4    basic  0.7859
4       5    basic  0.7577
5       1  context  0.7317
6       2  context  0.7187
7       3  context  0.7419
8       4  context  0.7785
9       5  context  0.7166
10      1      cot  0.6434
11      2      cot  0.6771
12      3      cot  0.6919
13      4      cot  0.7066
14      5      cot  0.6925
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2          F     p-unc       ng2       eps
0  prompt      2      8  20.908608  0.000665  0.537625  0.851594


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,74.03,2.95,73.75,2.25,0.258135,0.80904,0.80904,False
1,t-test,basic vs cot,74.03,2.95,68.23,2.16,7.496143,0.001694,0.005082,True
2,t-test,context vs cot,73.75,2.25,68.23,2.16,4.883274,0.008141,0.016283,True


### 500

In [23]:
# TASD, restricted to runs with lr_setting '500'.
args.task, args.lr_setting = 'tasd', 500

stats_tasd['500'] = computePromptStatistics(args)
stats_tasd['500']

Unnamed: 0,basic,context,cot
1,0.733,0.7346,0.6472
2,0.7087,0.7284,0.6947
3,0.6743,0.7221,0.6842
4,0.7213,0.7487,0.6667
5,0.6925,0.7092,0.6973


Unnamed: 0,W,pval,normal
basic,0.979288,0.930776,True


Unnamed: 0,W,pval,normal
context,0.997773,0.998542,True


Unnamed: 0,W,pval,normal
cot,0.908488,0.458556,True


    split   prompt      f1
0       1    basic  0.7330
1       2    basic  0.7087
2       3    basic  0.6743
3       4    basic  0.7213
4       5    basic  0.6925
5       1  context  0.7346
6       2  context  0.7284
7       3  context  0.7221
8       4  context  0.7487
9       5  context  0.7092
10      1      cot  0.6472
11      2      cot  0.6947
12      3      cot  0.6842
13      4      cot  0.6667
14      5      cot  0.6973
Repeated Measures ANOVA Result:
   Source  ddof1  ddof2         F     p-unc       ng2       eps
0  prompt      2      8  6.322457  0.022548  0.573067  0.609734


  W = np.prod(eig) / (eig.sum() / d) ** d


Unnamed: 0,test,comparison,mean 1,std 1,mean 2,std 2,statistic,p_value,corrected_p_value,significant
0,t-test,basic vs context,70.6,2.08,72.86,1.31,-2.995731,0.040111,0.080221,False
1,t-test,basic vs cot,70.6,2.08,67.8,1.88,1.520114,0.203116,0.203116,False
2,t-test,context vs cot,72.86,1.31,67.8,1.88,3.455645,0.025921,0.077762,False
