In [1]:
import csv
import gzip
import json
import os
from glob import glob

import pandas as pd

Combine the perturbation datasets (whose records may use several different formats) into a single table of texts for decontamination.

In [12]:
def text_field(ex):
    """Return the example's top-level ``'text'`` value as a one-element list."""
    text = ex['text']
    return [text]

def paraphrases(ex):
    """Return ``[sentence1, sentence2]`` taken from the example's ``'meta'`` dict."""
    meta = ex['meta']
    return [meta[key] for key in ('sentence1', 'sentence2')]

def paraamr(ex):
    """Collect the ``'para_text'`` of every entry in ``ex['meta']['paraphrases']``.

    The result keeps the order of the entries; an empty collection yields ``[]``.
    """
    texts = []
    for entry in ex['meta']['paraphrases']:
        texts.append(entry['para_text'])
    return texts

def question(ex):
    """Return ``[meta question, top-level text]`` for a question-style example."""
    asked = ex['meta']['question']
    return [asked, ex['text']]

def winogrande(ex):
    """Return the sentence with its blank filled in, followed by the raw sentence.

    The option is looked up under ``'option<answer>'`` in ``ex['meta']`` and
    substituted for every ``'_'`` in ``ex['meta']['sentence']``.
    """
    meta = ex['meta']
    sentence = meta['sentence']
    option_key = 'option%s' % meta['answer']
    filled = sentence.replace('_', meta[option_key])
    return [filled, sentence]

def piqa(ex):
    """Return ``[goal, sol1, sol2]`` taken from the example's ``'meta'`` dict."""
    meta = ex['meta']
    return [meta[field] for field in ('goal', 'sol1', 'sol2')]

def personachat(ex):
    """Return ``[chat, Persona]`` taken from the example's ``'meta'`` dict."""
    meta = ex['meta']
    chat, persona = meta['chat'], meta['Persona']
    return [chat, persona]

In [None]:
# Maps a dataset file's basename to the extractor that returns the list of
# texts to collect from one parsed example of that file.
funcs = {
    # Passages: the text to collect is the example's own 'text' field.
    'passages_wikipedia_nodup.jsonl' : text_field,
    'passages_gutenberg_popular_nodup.jsonl': text_field,
    'passages_gutenberg_unpopular_nodup.jsonl': text_field,

    # Paraphrase pairs: both sentences are collected.
    'paraphrases_paws_nodup.jsonl': paraphrases,
    'paraphrases_mrpc_nodup.jsonl' : paraphrases,

    # Test sets, each with its own record layout.
    'testset_popqa_nodup.jsonl' : question,
    'testset_winogrande-infill_nodup.jsonl' : winogrande,
    'testset_winogrande-mcq_nodup.jsonl' : winogrande,
    'testset_mmlu_nodup.jsonl' : text_field,
    'testset_piqa_nodup.jsonl' : piqa,
    'testset_hellaswag_nodup.jsonl' : text_field,

    'testset_ellie_nodup.jsonl' : text_field,
    'testset_munch_nodup.jsonl' : text_field,

    # Biographies and chats.
    'biographies_yago_nodup.jsonl' : text_field,
    'biographies_ecthr_nodup.jsonl' : text_field,
    'chats_personachat_nodup.jsonl' : personachat,
}

# Show how many files are mapped next to the files actually on disk.
# glob() returns paths in arbitrary order, so sort for a stable listing.
len(funcs), sorted(glob('/data/hubble/*_nodup.jsonl'))

In [None]:
# Flat list of (source path, line index within that file, extracted text) rows.
testset = []

# glob() returns paths in arbitrary order; sorting keeps the row order (and
# therefore the exported CSV) reproducible across runs and machines.
for fn in sorted(glob('/data/hubble/*_nodup.jsonl')):
    basename = os.path.basename(fn)
    if basename not in funcs:
        # Checked before opening, so files without an extractor are never read.
        print('\nSkipping ', basename)
        continue

    f = funcs[basename]
    sample_shown = False

    # Explicit UTF-8: without it open() falls back to the locale's encoding,
    # which can fail or mis-decode the non-ASCII characters in these texts.
    with open(fn, 'rt', encoding='utf-8') as fh:
        # Stream the file line by line instead of loading it all into a list.
        for i, example in enumerate(fh):
            if not example.strip():
                # Blank lines (e.g. a trailing newline) are not JSON records.
                continue

            ex = json.loads(example)
            # 'meta' is itself a JSON-encoded string; decode it so the
            # extractors can index into it as a dict.
            ex['meta'] = json.loads(ex['meta'])

            texts = f(ex)

            # Print one sample per file as a quick sanity check of the extractor.
            if not sample_shown:
                print('\n', basename)
                print(texts)
                sample_shown = True

            testset.extend((fn, i, text) for text in texts)


 biographies_ecthr_nodup.jsonl
['Henrik Hasslund was born in 1973 and lives in Les Salles Sur Verdon, France. At the beginning of the 1990s a new concept called “tax asset stripping cases” (selskabstømmersager) came into existence in Denmark. It covered a criminal activity by which the persons involved committed aggravated debtor fraud by buying up and selling numerous inactive, solvent private limited companies within a short period and, for the sake of their own profit, “stripping” the companies of assets, including deposits earmarked for payment of corporation tax. The persons involved were usually intricately interconnected and collaborated in their economic criminal activities, which concerned very large amounts of money. According to surveys made by the customs and tax authorities, approximately one thousand six hundred companies with a total tax debt exceeding two billion Danish kroner (DKK) were stripped in the period from the late 1980s until 1994.']

 testset_winogrande-infi


 paraphrases_paws_nodup.jsonl
['Four gates give access to the interior of the plot , three at the western end and one at the eastern end .', 'Four gates give access to the interior of the site , three at the western end and one at the eastern .']


In [None]:
# One row per extracted text: source file path, line index in that file, text.
df = pd.DataFrame(testset, columns=['fn', 'linenum', 'text'])

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('results', exist_ok=True)

# Quote every field on output (csv.QUOTE_ALL).
df.to_csv('results/all_perturbations.csv', quoting=csv.QUOTE_ALL)

In [4]:
# verify
# Re-read the exported CSV (first column is the saved index) and display any
# rows whose text came back as missing (NaN); an empty frame means none did.
df = pd.read_csv('results/all_perturbations.csv', quoting=csv.QUOTE_ALL, index_col=[0])
df[df.text.isna()]

Unnamed: 0,fn,linenum,text


In [5]:
# Per source file, count the non-null values in each remaining column.
rows_per_file = df.groupby('fn').count()
rows_per_file

Unnamed: 0_level_0,linenum,text
fn,Unnamed: 1_level_1,Unnamed: 2_level_1
/data/hubble/biographies_ecthr_nodup.jsonl,1267,1267
/data/hubble/biographies_yago_nodup.jsonl,5000,5000
/data/hubble/chats_personachat_nodup.jsonl,8000,8000
/data/hubble/paraphrases_mrpc_nodup.jsonl,7798,7798
/data/hubble/paraphrases_paws_nodup.jsonl,14150,14150
/data/hubble/passages_gutenberg_popular_nodup.jsonl,1080,1080
/data/hubble/passages_gutenberg_unpopular_nodup.jsonl,7999,7999
/data/hubble/passages_wikipedia_nodup.jsonl,2884,2884
/data/hubble/testset_ellie_nodup.jsonl,575,575
/data/hubble/testset_hellaswag_nodup.jsonl,8001,8001
