In [1]:
"""
create AmbiEnt from the validation batches
"""

from pathlib import Path

import os

# Move the kernel's working directory up to the nearest enclosing directory named
# 'ambient', so the relative paths used in later cells (e.g. 'annotation/...') resolve.
# The ancestors are searched explicitly instead of looping on `cd ..`: at the filesystem
# root `cd ..` is a no-op, so the loop form never terminates when the notebook is
# started outside an 'ambient' directory.
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if _candidate.name == 'ambient':
        os.chdir(_candidate)
        break
else:
    raise RuntimeError(f"no directory named 'ambient' found at or above {_start_dir}")

# Echo the resulting working directory, as the `%cd` magic used to.
print(Path.cwd())

/mmfs1/gscratch/xlab/alisaliu/ambient/notebooks
/mmfs1/gscratch/xlab/alisaliu/ambient


In [2]:
import os
from mturk.annotation_utils import read_batch, clean_validation_batch, statistics_for_worker
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter

In [3]:
def get_num_rewrites(row, key: str):
    """
    Count the distinct rewrites of one sentence of an example.

    row: mapping-like example; row['disambiguations'] must offer .values() yielding
         iterables of dicts, each of which is indexed with `key`
    key: one of premise, hypothesis
    return: number of distinct d[key] values, over all disambiguations d, that differ
            from the example's own row[key]
    """
    distinct_rewrites = set()
    for candidates in row['disambiguations'].values():
        for candidate in candidates:
            # a disambiguation that keeps the sentence unchanged is not a rewrite
            if candidate[key] != row[key]:
                distinct_rewrites.add(candidate[key])
    return len(distinct_rewrites)

def flatten_list_of_lists(list_of_lists):
    """
    Concatenate the items of every inner iterable into one new flat list,
    preserving order. Only one level of nesting is removed.
    """
    flattened = []
    for inner in list_of_lists:
        flattened.extend(inner)
    return flattened

In [18]:
# Collect every `batch_<id>` sub-directory of the validation batches folder.
# os.listdir returns entries in arbitrary order, so the directories are sorted by their
# numeric batch id to make the concatenation order below reproducible across machines.
batches_dir = Path('annotation/validation/batches')
dirs = sorted(
    (d for d in os.listdir(batches_dir) if (os.path.isdir(batches_dir / d) and d.startswith('batch_'))),
    key=lambda d: int(d.split('_')[1]),
)
hits_per_annotator = Counter()

# Read each batch and tally how many rows each worker contributed.
batch_dfs = []
for batch_dir in dirs:
    batch_id = int(batch_dir.split('_')[1])
    batch_df = read_batch(batch_id, batch_dir=batches_dir)
    batch_dfs.append(batch_df)
    # value_counts() maps worker_id -> number of rows in this batch
    hits_per_annotator += batch_df.worker_id.value_counts()

validated_df = pd.concat(batch_dfs)
print(f'Number of examples annotated: {len(validated_df.index)}')
# NOTE(review): clean_validation_batch lives in mturk.annotation_utils and is not shown here;
# judging by the printed counts it returns fewer rows than it receives - confirm its exact rules there.
validated_df = clean_validation_batch(validated_df)
print(f'Number of examples in validated dataset: {len(validated_df)}')
# Rebind rather than mutate in place, giving the cleaned frame a fresh 0..n-1 index.
validated_df = validated_df.reset_index(drop=True)
hits_per_annotator

Number of examples annotated: 2167
Number of examples in validated dataset: 1504


Counter({'A3AA2VKV87R6PG': 345,
         'A1KBELVHWNE4D5': 1473,
         'A2AX828Q4WXK3Z': 248,
         'A14KPHOYAQCFWH': 101})

In [24]:
# Render the frame as the cell's output (pandas abbreviates the middle rows).
# NOTE(review): the saved output carries execution count 24 and already shows the
# renamed `labels` column, so it was apparently produced after the cells below had run.
validated_df

Unnamed: 0,id,premise,hypothesis,gold,premise_ambiguous,hypothesis_ambiguous,disambiguations,labels
0,104020,The vote was close because many people were un...,The vote was close because many people abstained.,entailment|neutral,True,False,[{'premise': 'The vote was close because many ...,"entailment, neutral"
1,103930,The majority of people think that the governme...,The government is not doing enough to help the...,entailment,False,False,[],entailment
2,103892,The poet's words are like the hands of the clo...,The clockmaker's hands are like the poet's words.,entailment,False,False,[],entailment
3,103802,"It is not that I don't want to see the movie, ...",I don't want to see the movie.,neutral|contradiction,True,False,"[{'premise': 'I want to see the movie, but I d...","neutral, contradiction"
4,103559,The most important characteristic of a good te...,The most important characteristic of a good te...,contradiction,False,False,[],contradiction
...,...,...,...,...,...,...,...,...
1499,391,"Even if you study hard, you can't expect to ge...",You can study hard and get an A on the test.,neutral|contradiction,False,True,"[{'premise': 'Even if you study hard, you can'...","neutral, contradiction"
1500,345,The novel was turned into a movie that was a h...,The novel was a huge success.,neutral,False,False,[],neutral
1501,119,The teacher is speaking to the class about anc...,The teacher is speaking to the class about anc...,contradiction,False,False,[],contradiction
1502,110,It is raining.,The ground is wet.,neutral,False,False,[],neutral


In [19]:
# Every example starts out as unambiguous; a side is flagged in the loop below only
# when it has at least two distinct rewrites.
validated_df['premise_ambiguous'] = False
validated_df['hypothesis_ambiguous'] = False
validated_df['reformatted_disambiguations'] = None

for idx, example in validated_df.iterrows():
    # A sentence may have zero rewrites or at least two; exactly one is rejected.
    num_premise_rewrites = get_num_rewrites(example, 'premise')
    assert num_premise_rewrites != 1
    num_hypothesis_rewrites = get_num_rewrites(example, 'hypothesis')
    assert num_hypothesis_rewrites != 1

    # e.g. 'entailment|neutral' -> 'entailment, neutral'
    validated_df.at[idx, 'gold'] = example['gold'].replace('|', ', ')

    # Flatten {label: [disambiguation, ...]} into one list of dicts, each carrying its label.
    labelled_disambiguations = []
    for label, rewrites in example['disambiguations'].items():
        for rewrite in rewrites:
            labelled_disambiguations.append({**rewrite, 'label': label})
    validated_df.at[idx, 'reformatted_disambiguations'] = labelled_disambiguations

    rewrite_counts = {'premise': num_premise_rewrites, 'hypothesis': num_hypothesis_rewrites}
    for key, count in rewrite_counts.items():
        if count >= 2:
            validated_df.at[idx, f'{key}_ambiguous'] = True

In [20]:
# Drop the validator id and the original dict-style `disambiguations` column, then give
# the reformatted list and the gold labels their final column names.
# This is a single non-mutating chain: `validated_df` is only rebound once every step has
# succeeded, so a failure cannot leave the frame half-modified. The remaining columns
# keep their existing order.
validated_df = (
    validated_df
    .drop(columns=['validator_id', 'disambiguations'])
    .rename(columns={'reformatted_disambiguations': 'disambiguations', 'gold': 'labels'})
)

In [21]:
# Report the final number of examples, then end the cell on the preview: a bare
# expression is only rendered when it is the last statement of the cell, so with
# `head(3)` placed first its result was silently discarded.
print(len(validated_df.index))
validated_df.head(3)

1504


In [22]:
# Fraction of examples whose premise or hypothesis (or both) is flagged as ambiguous.
is_ambiguous = validated_df['premise_ambiguous'] | validated_df['hypothesis_ambiguous']
int(is_ambiguous.sum()) / len(validated_df)

0.2925531914893617

In [23]:
# Shuffle the rows and write one JSON record per line.
# A fixed seed makes the shuffled order (and therefore the written file) identical on
# every run; the value itself is arbitrary.
SHUFFLE_SEED = 0
validated_df.sample(frac=1, random_state=SHUFFLE_SEED).to_json('annotation/AmbiEnt/validated_examples.jsonl', orient='records', lines=True)