In [1]:
from pathlib import Path

while Path.cwd().name != 'ambient':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambient/notebooks
/mmfs1/gscratch/xlab/alisaliu/ambient


In [2]:
import pandas as pd
import os
from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
def back_translate(sentence, mt_model, s_tokenizer, t_tokenizer):
    """Paraphrase a sentence by translating it to a pivot language and back.

    Args:
        sentence: The text to paraphrase, either as a string or as a
            one-element list/tuple holding that string.
        mt_model: Seq2seq translation model exposing ``generate`` and
            ``config.max_length``.
        s_tokenizer: Tokenizer for the source-language text. Its ``src_lang``
            (default ``"eng_Latn"``) is the language translated back into.
        t_tokenizer: Tokenizer for the pivot-language text. Its ``src_lang``
            (default ``"yor_Latn"``) is the pivot language.

    Returns:
        str: The back-translated paraphrase. If the first back-translation is
        identical to the input, the first sampled candidate that differs from
        it is returned instead; if every candidate is identical, the identical
        back-translation is returned.
    """
    # Callers pass either a bare string or a one-element list. Compare against
    # the string itself, otherwise the "identical paraphrase" check below can
    # never succeed for list input.
    original = sentence[0] if isinstance(sentence, (list, tuple)) else sentence

    # Take the language codes from the tokenizers so the pivot language chosen
    # by the caller is honoured; fall back to the previous hard-coded pair.
    source_lang = getattr(s_tokenizer, "src_lang", None) or "eng_Latn"
    pivot_lang = getattr(t_tokenizer, "src_lang", None) or "yor_Latn"

    # Source -> pivot.
    inputs = s_tokenizer(sentence, return_tensors="pt", padding=True)
    translated_tokens = mt_model.generate(
        **inputs, forced_bos_token_id=s_tokenizer.lang_code_to_id[pivot_lang]
    )
    outputs = s_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    # Pivot -> source.
    inputs = t_tokenizer(outputs, return_tensors="pt", padding=True)
    back_bos_id = t_tokenizer.lang_code_to_id[source_lang]
    translated_tokens = mt_model.generate(**inputs, forced_bos_token_id=back_bos_id)

    # if generation is the max length (always due to repetition), then we need to truncate
    if len(translated_tokens[0]) == mt_model.config.max_length:
        translated_tokens = [translated_tokens[0][:25]]

    paraphrase = t_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    # if the back-translation is equivalent to the original sentence, then decode again with different hyperparameters
    if paraphrase == original:
        sampled_tokens = mt_model.generate(
            **inputs,
            forced_bos_token_id=back_bos_id,
            num_beams=5,
            do_sample=True,
            temperature=2.0,
            top_p=1.0,
            num_return_sequences=5,
        )
        candidates = t_tokenizer.batch_decode(sampled_tokens, skip_special_tokens=True)
        # Keep the candidates in a list: a set cannot be indexed and has no
        # stable order.
        alternatives = [candidate for candidate in candidates if candidate != original]
        if alternatives:
            paraphrase = alternatives[0]

    return paraphrase

In [20]:
# Load the combined AmbiEnt annotations (one JSON record per line).
ambignli = pd.read_json(
    'annotation/AmbiEnt/ambient_combined.jsonl',
    lines=True,
)
# Keep the rows where exactly one of premise / hypothesis is flagged ambiguous (XOR).
ambiguous_df = ambignli.loc[ambignli['premise_ambiguous'] ^ ambignli['hypothesis_ambiguous']]
print(len(ambiguous_df.index))

539


In [21]:
# Directory holding one sub-directory per crowdworker batch.
batches_dir = Path('annotation/crowdworker_exp/batches')
dirs = []
for entry in os.listdir(batches_dir):
    if os.path.isdir(batches_dir / entry) and entry.startswith('batch_'):
        dirs.append(entry)

# Gather the ids listed in every batch's results file.
validated_ids = []
for batch_dir in dirs:
    batch_results = pd.read_json(batches_dir / batch_dir / 'batch_results.jsonl', lines=True)
    ids = batch_results.id.tolist()
    # Purely numeric string ids are converted to int; everything else is kept as-is.
    ids = [
        int(raw_id) if (isinstance(raw_id, str) and raw_id.isdigit()) else raw_id
        for raw_id in ids
    ]
    validated_ids.extend(ids)

# Drop the examples whose id already appears in a batch result.
ambiguous_df = ambiguous_df[~ambiguous_df['id'].isin(validated_ids)]
print(f'Remaining examples for annotation: {len(ambiguous_df)}')

Remaining examples for annotation: 2


In [17]:
def create_validation_file(ambiguous_df, target_lang, seed=None):
    """Build the table of crowdworker validation examples.

    Every emitted example carries exactly three candidate interpretations of
    its ambiguous sentence. Rows with three disambiguations are used as-is.
    Rows with two get a back-translated distractor inserted at a random
    position. Rows with any other count are printed and skipped.

    Args:
        ambiguous_df: DataFrame with columns ``id``, ``premise``,
            ``hypothesis``, ``premise_ambiguous`` and ``disambiguations``;
            each disambiguation is a mapping with ``premise``, ``hypothesis``
            and ``label`` entries.
        target_lang: Language code passed as ``src_lang`` to the tokenizer
            that reads the pivot-language text.
        seed: Optional seed for the distractor position. When ``None``
            (default) the module-level ``random`` state is used, as before.

    Returns:
        pandas.DataFrame: One row per example with the columns ``id``,
        ``premise``, ``hypothesis``, ``ambiguous_sent_html``,
        ``ambiguous_sent``, ``distractor_idx``, ``labels`` and
        ``premise{1..3}``, ``hypothesis{1..3}``, ``interpretation{1..3}``.
        The frame is empty, with the same columns, if no row qualifies.
    """
    # A private RNG only when a seed is requested, so unseeded calls keep
    # drawing from the global `random` state exactly as they used to.
    rng = random.Random(seed) if seed is not None else random

    s_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang='eng_Latn')
    t_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang=target_lang)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

    columns = (
        ['id', 'premise', 'hypothesis', 'ambiguous_sent_html', 'ambiguous_sent', 'distractor_idx', 'labels']
        + [f'{e}{k+1}' for e in ['premise', 'hypothesis', 'interpretation'] for k in range(3)]
    )

    mturk_examples = []
    for _, row in tqdm(ambiguous_df.iterrows(), total=len(ambiguous_df.index)):
        ambiguous_sentence_key = 'premise' if row['premise_ambiguous'] else 'hypothesis'
        other_sentence_key = 'hypothesis' if row['premise_ambiguous'] else 'premise'
        ambiguous_sentence = row[ambiguous_sentence_key]

        disambiguations = [d[ambiguous_sentence_key] for d in row['disambiguations']]
        labels = [d['label'] for d in row['disambiguations']]

        if len(disambiguations) == 3:
            distractor_idx = None
        elif len(disambiguations) == 2:
            # Pad to three options with a back-translated distractor at a
            # random position; its label slot is None.
            distractor_idx = rng.choice(range(3))
            distractor_sentence = back_translate([ambiguous_sentence], mt_model, s_tokenizer, t_tokenizer)
            disambiguations.insert(distractor_idx, distractor_sentence)
            labels.insert(distractor_idx, None)
        else:
            # Cannot end up with exactly three options: report the row and
            # skip it before paying for a back-translation.
            print(row)
            continue

        ex = {
            'id': row['id'],
            'premise': row['premise'],
            'hypothesis': row['hypothesis'],
            'ambiguous_sent_html': f'<span class="{ambiguous_sentence_key}">{ambiguous_sentence_key}</span>',
            'ambiguous_sent': ambiguous_sentence,
            'distractor_idx': distractor_idx,
            'labels': labels,
        }

        # Each option replaces the ambiguous sentence; the other sentence is
        # repeated unchanged alongside it.
        for k, interpretation in enumerate(disambiguations, start=1):
            ex[f'{ambiguous_sentence_key}{k}'] = interpretation
            ex[f'{other_sentence_key}{k}'] = row[other_sentence_key]
            ex[f'interpretation{k}'] = interpretation

        mturk_examples.append(ex)

    # Passing `columns` fixes the column order and also yields a well-formed
    # empty frame (instead of a KeyError) when no example was produced.
    example_df = pd.DataFrame(mturk_examples, columns=columns)
    return example_df

In [22]:
# Build the validation examples for the remaining un-annotated rows, using
# 'yor_Latn' as the language code handed to the pivot-side tokenizer.
example_df = create_validation_file(ambiguous_df, target_lang='yor_Latn')
# Bare expression so the notebook renders the resulting frame.
example_df

 50%|█████     | 1/2 [00:01<00:01,  1.11s/it]

id                                                                    2_c
premise                 My wife will not be coming to the event with m...
hypothesis                                    My wife will not be coming.
premise_ambiguous                                                    True
hypothesis_ambiguous                                                False
labels                                 entailment, neutral, contradiction
meta                                         {'source': 'quote from Tal'}
disambiguations         [{'premise': 'It is not the case that both my ...
Name: 2, dtype: object


100%|██████████| 2/2 [00:02<00:00,  1.01s/it]


Unnamed: 0,id,premise,hypothesis,ambiguous_sent_html,ambiguous_sent,distractor_idx,labels,premise1,premise2,premise3,hypothesis1,hypothesis2,hypothesis3,interpretation1,interpretation2,interpretation3
0,132_c,A woman in this neighborhood gives birth every...,There is a particular woman with a birth rate ...,"<span class=""premise"">premise</span>",A woman in this neighborhood gives birth every...,2,"[entailment, neutral, None]",There is a particular woman in this neighborho...,"Every year, some woman in this neighborhood gi...",A woman gives birth to a child every year.,There is a particular woman with a birth rate ...,There is a particular woman with a birth rate ...,There is a particular woman with a birth rate ...,There is a particular woman in this neighborho...,"Every year, some woman in this neighborhood gi...",A woman gives birth to a child every year.


In [23]:
# Write the examples to CSV; index=False leaves the DataFrame index out of the file.
example_df.to_csv('annotation/crowdworker_exp/examples.csv', index=False)