In [81]:
from pathlib import Path
import pandas as pd

result = list(Path("../llms-eval/sentiment").rglob("*.csv"))
result = [str(x) for x in result]
result

['../llms-eval/sentiment/llama2_70b/eval.csv',
 '../llms-eval/sentiment/mistral_56b/eval.csv',
 '../llms-eval/sentiment/crowd/eval.csv',
 '../llms-eval/sentiment/gpt3.5/eval.csv',
 '../llms-eval/sentiment/gpt4/eval.csv',
 '../llms-eval/sentiment/mistral/eval.csv',
 '../llms-eval/sentiment/llama2/eval.csv',
 '../llms-eval/sentiment/expert/eval.csv']

In [82]:
def get_result(files): 
    metrics = ['flip_label', 'minimal_change_1', 'minimal_change_2', 'distribution', 'grammar', 'cohesive', 'likability']

    modes = ['mean']#, 'max', 'min']
    dict = {
        'LLM' : []
    }

    for m in metrics:
        for mode in modes:
            dict[m + ' - ' + mode] = []


    for f in files:
        llm_name = f.split('/')[-2]
        df = pd.read_csv(f)
        dict['LLM'].append(llm_name)
        for m in metrics:
            for mode in modes:
                if mode == 'mean':
                    dict[m + ' - ' + mode].append(df[m].mean().round(2))
                if mode == 'max':
                    dict[m + ' - ' + mode].append(df[m].max().round(2))
                if mode == 'min':
                    dict[m + ' - ' + mode].append(df[m].min().round(2))

    df_results = pd.DataFrame.from_dict(dict)
    return df_results.sort_values(by=['LLM'])

In [83]:
get_result(result)

Unnamed: 0,LLM,flip_label - mean,minimal_change_1 - mean,minimal_change_2 - mean,distribution - mean,grammar - mean,cohesive - mean,likability - mean
2,crowd,3.43,2.78,3.35,2.94,3.23,3.18,2.49
7,expert,3.35,2.74,3.38,2.93,3.23,3.17,2.5
3,gpt3.5,3.28,2.82,3.4,2.96,3.24,3.25,2.58
4,gpt4,3.55,2.9,3.57,3.06,3.44,3.38,2.62
6,llama2,3.3,2.82,3.47,2.97,3.27,3.18,2.49
0,llama2_70b,3.41,2.85,3.45,2.98,3.35,3.28,2.6
5,mistral,3.48,2.83,3.38,2.97,3.33,3.26,2.62
1,mistral_56b,3.0,2.62,3.26,2.74,2.95,2.92,2.31


In [84]:
def get_distribution(files, attribute = 'flip_label'): 
    metrics = [1.0, 2.0, 3.0, 4.0]

    dict = {
        'LLM' : []
    }

    for m in metrics:
        dict[str(m)] = []


    for f in files:
        llm_name = f.split('/')[-2]
        dict['LLM'].append(llm_name)

        df = pd.read_csv(f)
        counts = df[attribute].value_counts(normalize=True)

        for m in metrics:
            try:
                dict[str(m)].append(counts[m]*100)
            except KeyError:
                dict[str(m)].append(0)
    

    df_results = pd.DataFrame.from_dict(dict)
    for m in metrics:
        df_results[str(m)] = df_results[str(m)].round(2)

    return df_results.sort_values(by=['LLM'])

In [85]:
get_distribution(result, attribute='flip_label')

Unnamed: 0,LLM,1.0,2.0,3.0,4.0
2,crowd,0.0,3.61,49.4,46.99
7,expert,1.22,4.88,51.22,42.68
3,gpt3.5,1.2,8.43,51.81,38.55
4,gpt4,3.12,1.56,32.81,62.5
6,llama2,0.0,4.49,60.67,34.83
0,llama2_70b,2.44,3.66,43.9,50.0
5,mistral,0.0,2.27,47.73,50.0
1,mistral_56b,2.35,15.29,62.35,20.0


In [86]:
get_distribution(result, attribute='minimal_change_1')

Unnamed: 0,LLM,1.0,2.0,3.0,4.0
2,crowd,0.0,21.69,78.31,0.0
7,expert,0.0,26.83,71.95,1.22
3,gpt3.5,1.2,18.07,78.31,2.41
4,gpt4,0.0,14.29,80.95,4.76
6,llama2,0.0,19.1,79.78,1.12
0,llama2_70b,1.22,14.63,81.71,2.44
5,mistral,0.0,18.18,80.68,1.14
1,mistral_56b,1.18,37.65,58.82,2.35


In [87]:
get_distribution(result, attribute='minimal_change_2')

Unnamed: 0,LLM,1.0,2.0,3.0,4.0
2,crowd,0.0,1.2,62.65,36.14
7,expert,0.0,1.22,59.76,39.02
3,gpt3.5,0.0,1.2,57.83,40.96
4,gpt4,1.59,0.0,38.1,60.32
6,llama2,0.0,0.0,52.81,47.19
0,llama2_70b,0.0,1.22,52.44,46.34
5,mistral,0.0,0.0,62.5,37.5
1,mistral_56b,1.18,2.35,65.88,30.59


In [88]:
get_distribution(result, attribute='distribution')

Unnamed: 0,LLM,1.0,2.0,3.0,4.0
2,crowd,0.0,7.23,91.57,1.2
7,expert,1.22,7.32,89.02,2.44
3,gpt3.5,0.0,9.64,84.34,6.02
4,gpt4,1.59,1.59,85.71,11.11
6,llama2,1.12,3.37,93.26,2.25
0,llama2_70b,1.22,8.54,81.71,8.54
5,mistral,0.0,6.82,89.77,3.41
1,mistral_56b,3.53,20.0,75.29,1.18
