In [36]:
import pandas as pd
import numpy as np
import os 

def compute_fliprate(df):
    # Flip rate is the accuracy with respect to the target labels
    # If the predicted sentiment is equal to the target we used to create the CF, then the label was flipped
    scores = []
    for i, row in df.iterrows():
        if row['gold_label'] == row['predicted_label']:
            scores.append(1)
        else:
            scores.append(0)
    return np.mean(scores)


import glob

def get_test_sets(directory_path):
    print(directory_path)
    file_extension = ".tsv"  # Replace with your desired file extension
    pattern = os.path.join(directory_path, f"**/*{file_extension}")
    # Use the glob function to find all files with the specified extension
    matching_files = glob.glob(pattern, recursive=True)
    res = []
    for f in matching_files:
        name = str(f).split('/')[-2]
        res.append((f, name))

    return res


In [37]:
#LLM = 'mistral-20240118'

llms = ['llama2-20231209', 'counterfactually-augmented-data']
SPLIT = 'test'

results_dict = {}

MODELS_COL = 'model'
FLIPRATE_COL = 'cfs fliprate'
CFS_PPL_COL = 'cfs ppl'
ORIG_PPL_COL = 'orig ppl'
COUNT = '#instances'

for llm in llms: 

    DIR = '../llms-ppl-preds/{}/NLI/'.format(llm)

    test_sets = get_test_sets(DIR)
    for t, name in test_sets:

        if 'all_combined' in name:
            continue
        print(name)
        df = pd.read_csv(t, sep= '\t')
        # LLM
        if llm == 'counterfactually-augmented-data':
            llm = 'cad'
        models = results_dict.get(MODELS_COL, [])
        models.append(llm.split('-')[0] + '_' +name)
        results_dict[MODELS_COL] = models

        # CFs Fliprate
        cf_fliprate = compute_fliprate(df)
        cf_frates = results_dict.get(FLIPRATE_COL, [])
        cf_frates.append(cf_fliprate)
        results_dict[FLIPRATE_COL] = cf_frates


        # COUNT
        counts = results_dict.get(COUNT, [])
        counts.append(df['gold_label'].count())
        results_dict[COUNT] = counts

../llms-ppl-preds/llama2-20231209/NLI/
revised_hypothesis
revised_combined
revised_premise
../llms-ppl-preds/counterfactually-augmented-data/NLI/
revised_combined
revised_premise
original
revised_hypothesis


In [38]:
results_dict

{'model': ['llama2_revised_hypothesis',
  'llama2_revised_combined',
  'llama2_revised_premise',
  'cad_revised_combined',
  'cad_revised_premise',
  'cad_original',
  'cad_revised_hypothesis'],
 'cfs fliprate': [0.3864491844416562,
  0.39808306709265173,
  0.41015625,
  0.694375,
  0.59125,
  0.88,
  0.7975],
 '#instances': [797, 1565, 768, 1600, 800, 400, 800]}

In [39]:
df_results = pd.DataFrame.from_dict(results_dict)
df_results['cfs fliprate'] = df_results['cfs fliprate']*100
df_results = df_results.round(2)
df_results

Unnamed: 0,model,cfs fliprate,#instances
0,llama2_revised_hypothesis,38.64,797
1,llama2_revised_combined,39.81,1565
2,llama2_revised_premise,41.02,768
3,cad_revised_combined,69.44,1600
4,cad_revised_premise,59.13,800
5,cad_original,88.0,400
6,cad_revised_hypothesis,79.75,800
