In [1]:
from pathlib import Path
import pandas as pd

# Recursively collect every evaluation CSV under the NLI results folder,
# keeping the paths as plain strings for the helpers defined below.
result = [str(csv_path) for csv_path in Path("../llms-eval/NLI/").rglob("*.csv")]
result

['../llms-eval/NLI/crowd/revised_premise/eval.csv',
 '../llms-eval/NLI/crowd/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/llama2_70b/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/llama2_70b/revised_premise/eval.csv',
 '../llms-eval/NLI/mistral_56b/revised_premise/eval.csv',
 '../llms-eval/NLI/mistral_56b/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/gpt4/revised_premise/eval.csv',
 '../llms-eval/NLI/gpt4/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/llama2/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/llama2/revised_premise/eval.csv',
 '../llms-eval/NLI/mistral/revised_premise/eval.csv',
 '../llms-eval/NLI/mistral/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/gpt3.5/revised_hypothesis/eval.csv',
 '../llms-eval/NLI/gpt3.5/revised_premise/eval.csv']

In [2]:
def get_result(files): 
    metrics = ['flip_label', 'minimal_change_1', 'minimal_change_2', 'distribution']#, 'grammar', 'cohesive', 'likability']

    modes = ['mean']#, 'max', 'min']
    dict = {
        'LLM' : [], 
        'part': []
    }

    for m in metrics:
        for mode in modes:
            dict[m + ' - ' + mode] = []


    for f in files:
        llm_name = f.split('/')[-3]
        part =  f.split('/')[-2]
        df = pd.read_csv(f)
        dict['LLM'].append(llm_name)
        dict['part'].append(part)

        for m in metrics:
            for mode in modes:
                if mode == 'mean':
                    dict[m + ' - ' + mode].append(df[m].mean().round(2))
                if mode == 'max':
                    dict[m + ' - ' + mode].append(df[m].max().round(2))
                if mode == 'min':
                    dict[m + ' - ' + mode].append(df[m].min().round(2))

    df_results = pd.DataFrame.from_dict(dict)
    return df_results.sort_values(by=['part', 'LLM'])

In [3]:
# Mean of every score column, one row per discovered evaluation CSV.
get_result(files=result)

Unnamed: 0,LLM,part,flip_label - mean,minimal_change_1 - mean,minimal_change_2 - mean,distribution - mean
1,crowd,revised_hypothesis,3.14,3.06,3.69,3.39
12,gpt3.5,revised_hypothesis,3.13,3.34,3.75,3.71
7,gpt4,revised_hypothesis,3.15,3.26,3.76,3.74
8,llama2,revised_hypothesis,3.11,3.09,3.69,3.42
2,llama2_70b,revised_hypothesis,3.0,3.25,3.78,3.39
11,mistral,revised_hypothesis,3.1,3.13,3.69,3.44
5,mistral_56b,revised_hypothesis,3.07,3.13,3.67,3.35
0,crowd,revised_premise,3.22,2.99,3.71,3.21
13,gpt3.5,revised_premise,3.06,3.29,3.71,3.55
6,gpt4,revised_premise,3.19,3.18,3.71,3.46


In [4]:
def get_distribution(files, attribute='flip_label'):
    """Tabulate how often each value 1.0-4.0 occurs in one column of every CSV.

    Parameters
    ----------
    files : iterable of str or path-like
        CSV paths laid out as ``.../<LLM>/<part>/<file>.csv``. The two
        directory names directly above each file are reported in the ``LLM``
        and ``part`` columns.
    attribute : str, default 'flip_label'
        Name of the CSV column whose value distribution is computed.

    Returns
    -------
    pandas.DataFrame
        One row per input file with the columns ``LLM``, ``part``, the
        percentage of non-missing rows equal to each of ``1.0``, ``2.0``,
        ``3.0`` and ``4.0`` (rounded to two decimals, 0 when a value never
        occurs), and the aggregates ``1/2`` and ``3/4``. Rows are sorted by
        ``part`` and then ``LLM``; the row index keeps each file's position
        in ``files``.

    Raises
    ------
    KeyError
        If ``attribute`` is not a column of one of the CSV files.
    IndexError
        If a path has fewer than two directory components above the file.
    """
    levels = [1.0, 2.0, 3.0, 4.0]
    level_cols = [str(level) for level in levels]

    records = []
    for f in files:
        # Path.parts is separator-agnostic, unlike splitting on '/'.
        path_parts = Path(f).parts
        df = pd.read_csv(f)

        # value_counts(normalize=True) yields fractions over non-missing rows.
        counts = df[attribute].value_counts(normalize=True)

        record = {'LLM': path_parts[-3], 'part': path_parts[-2]}
        for level, col in zip(levels, level_cols):
            # A value that never occurs is absent from `counts`; report 0 %.
            record[col] = counts.get(level, 0.0) * 100
        records.append(record)

    # Passing the column list explicitly keeps the schema stable even when
    # `files` is empty.
    df_results = pd.DataFrame(records, columns=['LLM', 'part'] + level_cols)
    df_results[level_cols] = df_results[level_cols].astype(float).round(2)

    # The aggregates are sums of the already-rounded columns, so every row is
    # internally consistent (1.0 + 2.0 == 1/2); as a consequence the two
    # aggregates can total 100 +/- 0.01. The final round only strips binary
    # floating-point noise from the addition.
    df_results['1/2'] = (df_results['1.0'] + df_results['2.0']).round(2)
    df_results['3/4'] = (df_results['3.0'] + df_results['4.0']).round(2)

    return df_results.sort_values(by=['part', 'LLM'])

In [5]:
# Percentage of rows taking each value 1.0-4.0 in the `flip_label` column.
get_distribution(files=result, attribute='flip_label')

Unnamed: 0,LLM,part,1.0,2.0,3.0,4.0,1/2,3/4
1,crowd,revised_hypothesis,10.89,8.89,35.54,44.68,19.78,80.22
12,gpt3.5,revised_hypothesis,13.64,6.82,32.83,46.72,20.46,79.55
7,gpt4,revised_hypothesis,13.05,6.97,32.32,47.66,20.02,79.98
8,llama2,revised_hypothesis,12.03,8.35,35.82,43.8,20.38,79.62
2,llama2_70b,revised_hypothesis,17.61,7.42,32.33,42.64,25.03,74.97
11,mistral,revised_hypothesis,13.03,8.15,35.09,43.73,21.18,78.82
5,mistral_56b,revised_hypothesis,14.27,7.51,35.67,42.55,21.78,78.22
0,crowd,revised_premise,8.64,7.26,37.92,46.18,15.9,84.1
13,gpt3.5,revised_premise,12.41,10.4,35.96,41.23,22.81,77.19
6,gpt4,revised_premise,11.62,6.44,33.08,48.86,18.06,81.94


In [6]:
# Percentage of rows taking each value 1.0-4.0 in the `minimal_change_1` column.
get_distribution(files=result, attribute='minimal_change_1')

Unnamed: 0,LLM,part,1.0,2.0,3.0,4.0,1/2,3/4
1,crowd,revised_hypothesis,0.0,9.39,74.84,15.77,9.39,90.61
12,gpt3.5,revised_hypothesis,0.0,4.92,55.81,39.27,4.92,95.08
7,gpt4,revised_hypothesis,0.13,6.84,59.82,33.21,6.97,93.03
8,llama2,revised_hypothesis,0.0,9.75,71.27,18.99,9.75,90.26
2,llama2_70b,revised_hypothesis,0.38,8.55,56.73,34.34,8.93,91.07
11,mistral,revised_hypothesis,0.0,10.03,66.54,23.43,10.03,89.97
5,mistral_56b,revised_hypothesis,0.25,11.51,62.95,25.28,11.76,88.23
0,crowd,revised_premise,0.38,10.64,78.6,10.39,11.02,88.99
13,gpt3.5,revised_premise,0.63,8.02,53.26,38.1,8.65,91.36
6,gpt4,revised_premise,0.13,6.57,68.69,24.62,6.7,93.31


In [7]:
# Percentage of rows taking each value 1.0-4.0 in the `minimal_change_2` column.
get_distribution(files=result, attribute='minimal_change_2')

Unnamed: 0,LLM,part,1.0,2.0,3.0,4.0,1/2,3/4
1,crowd,revised_hypothesis,0.13,0.5,29.41,69.96,0.63,99.37
12,gpt3.5,revised_hypothesis,0.13,0.13,24.24,75.51,0.26,99.75
7,gpt4,revised_hypothesis,0.13,0.0,23.95,75.92,0.13,99.87
8,llama2,revised_hypothesis,0.0,0.89,28.73,70.38,0.89,99.11
2,llama2_70b,revised_hypothesis,0.0,0.75,20.25,78.99,0.75,99.24
11,mistral,revised_hypothesis,0.25,0.38,29.07,70.3,0.63,99.37
5,mistral_56b,revised_hypothesis,0.0,0.25,32.17,67.58,0.25,99.75
0,crowd,revised_premise,0.0,0.5,28.04,71.46,0.5,99.5
13,gpt3.5,revised_premise,0.0,0.5,28.2,71.3,0.5,99.5
6,gpt4,revised_premise,0.0,0.51,27.53,71.97,0.51,99.5


In [8]:
# Percentage of rows taking each value 1.0-4.0 in the `distribution` column.
get_distribution(files=result, attribute='distribution')

Unnamed: 0,LLM,part,1.0,2.0,3.0,4.0,1/2,3/4
1,crowd,revised_hypothesis,0.0,1.0,58.82,40.18,1.0,99.0
12,gpt3.5,revised_hypothesis,0.25,0.0,28.66,71.09,0.25,99.75
7,gpt4,revised_hypothesis,0.13,0.0,25.6,74.27,0.13,99.87
8,llama2,revised_hypothesis,0.0,1.65,54.56,43.8,1.65,98.36
2,llama2_70b,revised_hypothesis,2.26,1.89,50.19,45.66,4.15,95.85
11,mistral,revised_hypothesis,0.25,1.88,51.75,46.12,2.13,97.87
5,mistral_56b,revised_hypothesis,2.13,2.88,52.94,42.05,5.01,94.99
0,crowd,revised_premise,0.25,3.0,71.96,24.78,3.25,96.74
13,gpt3.5,revised_premise,0.0,1.13,42.36,56.52,1.13,98.88
6,gpt4,revised_premise,0.13,0.13,53.03,46.72,0.26,99.75
