In [1]:
from pathlib import Path

while Path.cwd().name != 'ambient':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambient/notebooks
/mmfs1/gscratch/xlab/alisaliu/ambient


In [2]:
import pandas as pd
import os
from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [3]:
def back_translate(sentence, mt_model, s_tokenizer, t_tokenizer):
    """Paraphrase a sentence by translating it to a pivot language and back to English.

    Args:
        sentence: The sentence to paraphrase, given either as a ``str`` or as a
            list/tuple whose first element is the sentence. Only the first
            sentence is paraphrased.
        mt_model: Translation model; ``generate`` and ``config.max_length`` are used.
        s_tokenizer: Tokenizer used to encode the original sentence and to decode
            the pivot-language translation.
        t_tokenizer: Tokenizer used to encode the pivot-language text and to decode
            the English back-translation. Its ``src_lang`` attribute, when set,
            selects the pivot language; otherwise "yor_Latn" is used.

    Returns:
        str: The back-translation. If it is identical to the original sentence,
        the first differing candidate from a sampled re-decode is returned
        instead; if no candidate differs, the identical back-translation is kept.
    """
    def lang_token_id(tokenizer, lang_code):
        # `lang_code_to_id` is not available on every tokenizer version; fall back
        # to looking the language code up as a regular token.
        code_to_id = getattr(tokenizer, "lang_code_to_id", None)
        if code_to_id is not None:
            return code_to_id[lang_code]
        return tokenizer.convert_tokens_to_ids(lang_code)

    # Callers may pass a one-element list; compare against the sentence text itself,
    # otherwise the "identical paraphrase" check below can never be true.
    original = sentence if isinstance(sentence, str) else sentence[0]
    pivot_lang = getattr(t_tokenizer, "src_lang", None) or "yor_Latn"
    english_id = lang_token_id(t_tokenizer, "eng_Latn")

    # English -> pivot language
    inputs = s_tokenizer(sentence, return_tensors="pt", padding=True)
    translated_tokens = mt_model.generate(
        **inputs, forced_bos_token_id=lang_token_id(s_tokenizer, pivot_lang)
    )
    outputs = s_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    # pivot language -> English
    inputs = t_tokenizer(outputs, return_tensors="pt", padding=True)
    translated_tokens = mt_model.generate(**inputs, forced_bos_token_id=english_id)

    # if generation is the max length (always due to repetition), then we need to truncate
    if len(translated_tokens[0]) == mt_model.config.max_length:
        translated_tokens = [translated_tokens[0][:25]]

    paraphrase = t_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    # if the back-translation is equivalent to the original sentence, then decode
    # again with different hyperparameters
    if paraphrase == original:
        translated_tokens = mt_model.generate(
            **inputs,
            forced_bos_token_id=english_id,
            num_beams=5,
            do_sample=True,
            temperature=2.0,
            top_p=1.0,
            num_return_sequences=5,
        )
        candidates = t_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        # De-duplicate while keeping generation order (a set is neither ordered
        # nor indexable), and drop candidates identical to the original.
        alternatives = [c for c in dict.fromkeys(candidates) if c != original]
        if alternatives:
            paraphrase = alternatives[0]

    return paraphrase

In [4]:
# Load the combined AmbiEnt annotations and keep the rows where the
# premise_ambiguous and hypothesis_ambiguous flags differ (XOR of the two columns).
ambignli = pd.read_json('annotation/AmbiEnt/ambient_combined.jsonl', lines=True)
exactly_one_ambiguous = ambignli['premise_ambiguous'] ^ ambignli['hypothesis_ambiguous']
ambiguous_df = ambignli[exactly_one_ambiguous]
print(len(ambiguous_df))

539


In [13]:
# Gather the ids found in every batch's results file, then remove those
# examples from the pool that still needs annotation.
batches_dir = Path('annotation/crowdworker_exp/batches')
dirs = [
    entry for entry in os.listdir(batches_dir)
    if entry.startswith('batch_') and os.path.isdir(batches_dir / entry)
]

validated_ids = []
for batch_dir in dirs:
    print(batch_dir)
    batch_results = pd.read_json(batches_dir / batch_dir / 'batch_results.jsonl', lines=True)
    for raw_id in batch_results.id.tolist():
        # Purely numeric string ids become ints; every other id is kept unchanged.
        # Presumably this makes them comparable with ambiguous_df['id'] — confirm.
        if isinstance(raw_id, str) and raw_id.isdigit():
            validated_ids.append(int(raw_id))
        else:
            validated_ids.append(raw_id)

already_validated = ambiguous_df['id'].isin(validated_ids)
ambiguous_df = ambiguous_df.loc[~already_validated]
print(f'Remaining examples for annotation: {len(ambiguous_df.index)}')

batch_5039373
batch_5025919
batch_5040909
batch_5024336
batch_5040305
batch_5040013
batch_5026450
batch_5027593


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [10]:
# Display the ids collected from the batch result files.
validated_ids

[1838,
 3654,
 7162,
 8954,
 10829,
 13321,
 16733,
 16747,
 17416,
 21569,
 25187,
 25971,
 28935,
 31619,
 32909,
 34383,
 34579,
 35612,
 36658,
 37201,
 40028,
 40614,
 40798,
 42873,
 45119,
 46294,
 52718,
 59939,
 60798,
 65728,
 68445,
 69139,
 69556,
 70966,
 71518,
 73923,
 77092,
 78255,
 78950,
 81333,
 81509,
 82498,
 85343,
 85444,
 90370,
 93412,
 95988,
 96018,
 99204,
 100935,
 101940,
 103542,
 553,
 1502,
 1747,
 4184,
 6566,
 6813,
 11049,
 11624,
 12324,
 13489,
 13987,
 15104,
 15380,
 15613,
 18036,
 19099,
 19306,
 22299,
 25609,
 26742,
 27802,
 27825,
 29252,
 29853,
 30056,
 31167,
 31825,
 31860,
 32270,
 32945,
 33756,
 34144,
 35782,
 36529,
 36569,
 36637,
 37060,
 40091,
 43525,
 43810,
 45595,
 46238,
 46837,
 47150,
 47781,
 49953,
 51107,
 53256,
 54419,
 59763,
 60212,
 61597,
 65375,
 66695,
 67165,
 67189,
 70388,
 71238,
 73089,
 74761,
 76612,
 77314,
 77368,
 78003,
 79580,
 79603,
 80789,
 82833,
 83923,
 84754,
 84765,
 85462,
 92061,
 93258,


In [23]:
def create_validation_file(ambiguous_df, target_lang):
    """Build the table of examples for crowdworker validation.

    Each kept example ends up with exactly three candidate interpretations of its
    ambiguous sentence. Rows with three disambiguations are used as they are; rows
    with two get a back-translated distractor inserted at a random position; rows
    with any other number of disambiguations are printed and skipped.

    Args:
        ambiguous_df: DataFrame with the columns 'id', 'premise', 'hypothesis',
            'premise_ambiguous' and 'disambiguations' (a list of dicts holding
            'premise', 'hypothesis' and 'label').
        target_lang: Language code passed as ``src_lang`` to the second tokenizer
            handed to ``back_translate``.

    Returns:
        pandas.DataFrame with the columns 'id', 'premise', 'hypothesis',
        'ambiguous_sent_html', 'ambiguous_sent', 'distractor_idx', 'labels' and
        'premise{k}', 'hypothesis{k}', 'interpretation{k}' for k in 1..3. The
        columns are present even when no example is produced.
    """
    model_name = "facebook/nllb-200-distilled-600M"
    s_tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang='eng_Latn')
    t_tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=target_lang)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    columns = (
        ['id', 'premise', 'hypothesis', 'ambiguous_sent_html', 'ambiguous_sent', 'distractor_idx', 'labels']
        + [f'{e}{k+1}' for e in ['premise', 'hypothesis', 'interpretation'] for k in range(3)]
    )

    mturk_examples = []
    for _, row in tqdm(ambiguous_df.iterrows(), total=len(ambiguous_df.index)):
        ambiguous_sentence_key = 'premise' if row['premise_ambiguous'] else 'hypothesis'
        other_sentence_key = 'hypothesis' if row['premise_ambiguous'] else 'premise'
        ambiguous_sentence = row[ambiguous_sentence_key]

        disambiguations = [d[ambiguous_sentence_key] for d in row['disambiguations']]
        labels = [d['label'] for d in row['disambiguations']]

        # Only rows with 2 or 3 disambiguations can yield exactly three options;
        # skip the others before paying for a back-translation.
        if len(disambiguations) not in (2, 3):
            print(row)
            continue

        if len(disambiguations) == 3:
            distractor_idx = None
        else:
            # Insert a back-translated distractor at a random slot; its label is None.
            distractor_idx = random.choice(range(3))
            distractor_sentence = back_translate([ambiguous_sentence], mt_model, s_tokenizer, t_tokenizer)
            disambiguations = disambiguations[:distractor_idx] + [distractor_sentence] + disambiguations[distractor_idx:]
            labels = labels[:distractor_idx] + [None] + labels[distractor_idx:]

        ex = {
            'id': row['id'],
            'premise': row['premise'],
            'hypothesis': row['hypothesis'],
            'ambiguous_sent_html': f'<span class="{ambiguous_sentence_key}">{ambiguous_sentence_key}</span>',
            'ambiguous_sent': ambiguous_sentence,
            'distractor_idx': distractor_idx,
            'labels': labels,
        }

        # For each option: the ambiguous side is replaced by the interpretation,
        # the other side is copied unchanged from the row.
        for slot in range(3):
            ex[f'{ambiguous_sentence_key}{slot+1}'] = disambiguations[slot]
            ex[f'{other_sentence_key}{slot+1}'] = row[other_sentence_key]
            ex[f'interpretation{slot+1}'] = disambiguations[slot]

        mturk_examples.append(ex)

    # Passing `columns` fixes the column order and keeps the schema when the
    # list is empty (selecting the columns afterwards would raise KeyError).
    return pd.DataFrame(mturk_examples, columns=columns)

In [41]:
# Build the validation examples from the remaining ambiguous rows with
# 'yor_Latn' as the target language, then display the resulting table.
example_df = create_validation_file(ambiguous_df, target_lang='yor_Latn')
example_df

 12%|█▎        | 1/8 [00:01<00:07,  1.09s/it]

id                                                                    2_c
premise                 My wife will not be coming to the event with m...
hypothesis                                    My wife will not be coming.
premise_ambiguous                                                    True
hypothesis_ambiguous                                                False
labels                                 entailment, neutral, contradiction
meta                                         {'source': 'quote from Tal'}
disambiguations         [{'premise': 'It is not the case that both my ...
Name: 2, dtype: object


100%|██████████| 8/8 [00:07<00:00,  1.01it/s]


Unnamed: 0,id,premise,hypothesis,ambiguous_sent_html,ambiguous_sent,distractor_idx,labels,premise1,premise2,premise3,hypothesis1,hypothesis2,hypothesis3,interpretation1,interpretation2,interpretation3
0,132_c,There is a woman in this neighborhood known fo...,A woman in this neighborhood gives birth every...,"<span class=""hypothesis"">hypothesis</span>",A woman in this neighborhood gives birth every...,1,"[entailment, None, neutral]",There is a woman in this neighborhood known fo...,There is a woman in this neighborhood known fo...,There is a woman in this neighborhood known fo...,There is a woman in this neighborhood who give...,A woman gives birth to a child every year.,"Every year, some woman in this neighborhood gi...",There is a woman in this neighborhood who give...,A woman gives birth to a child every year.,"Every year, some woman in this neighborhood gi..."
1,24928,He would be a great president if he was more l...,He's not level-headed.,"<span class=""premise"">premise</span>",He would be a great president if he was more l...,1,"[entailment, None, contradiction]",He would be a great president if he was level-...,He would be a good president if he was thought...,He would be a great president if he was more l...,He's not level-headed.,He's not level-headed.,He's not level-headed.,He would be a great president if he was level-...,He would be a good president if he was thought...,He would be a great president if he was more l...
2,57249,It is difficult to be a good role model for yo...,You have to be a good role model for yourself.,"<span class=""premise"">premise</span>",It is difficult to be a good role model for yo...,1,"[neutral, None, entailment]",It is challenging to be a good role model for ...,It is difficult to set a good example for your...,You cannot be a good role model for your child...,You have to be a good role model for yourself.,You have to be a good role model for yourself.,You have to be a good role model for yourself.,It is challenging to be a good role model for ...,It is difficult to set a good example for your...,You cannot be a good role model for your child...
3,26524,I think that it is a good idea.,I think that it is a great idea.,"<span class=""premise"">premise</span>",I think that it is a good idea.,1,"[neutral, None, contradiction]","I think that it is a good idea, and may or may...",I thought it was good.,"I think that it is a good, but not great, idea.",I think that it is a great idea.,I think that it is a great idea.,I think that it is a great idea.,"I think that it is a good idea, and may or may...",I thought it was good.,"I think that it is a good, but not great, idea."
4,101372,I am getting more and more tired.,I am getting sleepy.,"<span class=""premise"">premise</span>",I am getting more and more tired.,1,"[entailment, None, neutral]",I am becoming more and more in need of rest.,I was getting tired.,I am becoming more and more uninterested.,I am getting sleepy.,I am getting sleepy.,I am getting sleepy.,I am becoming more and more in need of rest.,I was getting tired.,I am becoming more and more uninterested.
5,38708,I wonder how he got the job.,He must have gotten the job because he's quali...,"<span class=""premise"">premise</span>",I wonder how he got the job.,0,"[None, neutral, contradiction]",I wonder how he found the job.,I don't know the process he went through to ge...,I am skeptical that he was qualified to get th...,He must have gotten the job because he's quali...,He must have gotten the job because he's quali...,He must have gotten the job because he's quali...,I wonder how he found the job.,I don't know the process he went through to ge...,I am skeptical that he was qualified to get th...
6,12406,It was as if the room were spinning around me.,The room was spinning around me.,"<span class=""hypothesis"">hypothesis</span>",The room was spinning around me.,2,"[neutral, entailment, None]",It was as if the room were spinning around me.,It was as if the room were spinning around me.,It was as if the room were spinning around me.,The room was physically spinning around me.,The room seemed like it was spinning around me.,I was surrounded by everything.,The room was physically spinning around me.,The room seemed like it was spinning around me.,I was surrounded by everything.


In [38]:
# Write the examples to CSV without the DataFrame index.
# NOTE(review): this cell's execution count (38) is lower than that of the cell
# that last built example_df (41), so the file on disk may be stale — re-run
# this cell after regenerating example_df.
example_df.to_csv('annotation/crowdworker_exp/examples.csv', index=False)

In [42]:
# Display the examples table.
example_df

Unnamed: 0,id,premise,hypothesis,ambiguous_sent_html,ambiguous_sent,distractor_idx,labels,premise1,premise2,premise3,hypothesis1,hypothesis2,hypothesis3,interpretation1,interpretation2,interpretation3
0,132_c,There is a woman in this neighborhood known fo...,A woman in this neighborhood gives birth every...,"<span class=""hypothesis"">hypothesis</span>",A woman in this neighborhood gives birth every...,1,"[entailment, None, neutral]",There is a woman in this neighborhood known fo...,There is a woman in this neighborhood known fo...,There is a woman in this neighborhood known fo...,There is a woman in this neighborhood who give...,A woman gives birth to a child every year.,"Every year, some woman in this neighborhood gi...",There is a woman in this neighborhood who give...,A woman gives birth to a child every year.,"Every year, some woman in this neighborhood gi..."
1,24928,He would be a great president if he was more l...,He's not level-headed.,"<span class=""premise"">premise</span>",He would be a great president if he was more l...,1,"[entailment, None, contradiction]",He would be a great president if he was level-...,He would be a good president if he was thought...,He would be a great president if he was more l...,He's not level-headed.,He's not level-headed.,He's not level-headed.,He would be a great president if he was level-...,He would be a good president if he was thought...,He would be a great president if he was more l...
2,57249,It is difficult to be a good role model for yo...,You have to be a good role model for yourself.,"<span class=""premise"">premise</span>",It is difficult to be a good role model for yo...,1,"[neutral, None, entailment]",It is challenging to be a good role model for ...,It is difficult to set a good example for your...,You cannot be a good role model for your child...,You have to be a good role model for yourself.,You have to be a good role model for yourself.,You have to be a good role model for yourself.,It is challenging to be a good role model for ...,It is difficult to set a good example for your...,You cannot be a good role model for your child...
3,26524,I think that it is a good idea.,I think that it is a great idea.,"<span class=""premise"">premise</span>",I think that it is a good idea.,1,"[neutral, None, contradiction]","I think that it is a good idea, and may or may...",I thought it was good.,"I think that it is a good, but not great, idea.",I think that it is a great idea.,I think that it is a great idea.,I think that it is a great idea.,"I think that it is a good idea, and may or may...",I thought it was good.,"I think that it is a good, but not great, idea."
4,101372,I am getting more and more tired.,I am getting sleepy.,"<span class=""premise"">premise</span>",I am getting more and more tired.,1,"[entailment, None, neutral]",I am becoming more and more in need of rest.,I was getting tired.,I am becoming more and more uninterested.,I am getting sleepy.,I am getting sleepy.,I am getting sleepy.,I am becoming more and more in need of rest.,I was getting tired.,I am becoming more and more uninterested.
5,38708,I wonder how he got the job.,He must have gotten the job because he's quali...,"<span class=""premise"">premise</span>",I wonder how he got the job.,0,"[None, neutral, contradiction]",I wonder how he found the job.,I don't know the process he went through to ge...,I am skeptical that he was qualified to get th...,He must have gotten the job because he's quali...,He must have gotten the job because he's quali...,He must have gotten the job because he's quali...,I wonder how he found the job.,I don't know the process he went through to ge...,I am skeptical that he was qualified to get th...
6,12406,It was as if the room were spinning around me.,The room was spinning around me.,"<span class=""hypothesis"">hypothesis</span>",The room was spinning around me.,2,"[neutral, entailment, None]",It was as if the room were spinning around me.,It was as if the room were spinning around me.,It was as if the room were spinning around me.,The room was physically spinning around me.,The room seemed like it was spinning around me.,I was surrounded by everything.,The room was physically spinning around me.,The room seemed like it was spinning around me.,I was surrounded by everything.
