In [2]:
from pathlib import Path

while Path.cwd().name != 'ambignli':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambignli/notebooks
/mmfs1/gscratch/xlab/alisaliu/ambignli


In [3]:
import pandas as pd
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from modeling.multitask_model import RobertaForMultitaskSequenceClassification
from utils.utils import predict_nli
from utils.mturk_utils import read_batch
from torch import sigmoid
from collections import Counter
from utils.utils import ensure_dir
import numpy as np
import os

## find possibly ambiguous examples

In [3]:
def predict_nli(premise, hypothesis, model, tokenizer):
    x = tokenizer(premise, hypothesis, return_tensors='pt', max_length=128, truncation=True).to('cuda')
    logits = model(**x).logits
    # multi-task model
    if hasattr(model, 'output_heads'):
        probs = logits.softmax(dim=-1).squeeze(0)
        return {model.config.id2label[i]: probs[i,1].item() for i in range(len(probs))}
    # multi-label model
    elif model.config.problem_type == 'multi_label_classification':
        logits = logits.squeeze(0)
        probs = sigmoid(logits)
        return {model.config.id2label[i]: probs[i].item() for i in range(len(probs))}
    # classification model
    else:
        probs = logits.softmax(dim=1).squeeze(0)
        return {model.config.id2label[i]: probs[i].item() for i in range(len(probs))}

In [4]:
multilabel_model = RobertaForSequenceClassification.from_pretrained('models/roberta-large-wanli-multilabel').to('cuda')
tokenizer = RobertaTokenizer.from_pretrained('models/roberta-large-wanli')

In [5]:
def compute_example_ambiguity(df):
    df['ambiguity_score'] = None
    df['predicted_labels'] = None
    for i, row in tqdm(df.iterrows(), total=len(df.index)):
        premise, hypothesis = row['premise'], row['hypothesis']
        probs = predict_nli(premise, hypothesis, multilabel_model, tokenizer)
        preds = set([l for l, p in probs.items() if p > 0.04])
        # ambiguity score is the probability assigned to the second-highest label
        sorted_probs = sorted([p for p in probs.values()], reverse=True)
        s = sorted_probs[1]
        df.at[i, 'ambiguity_score'] = s
        df.at[i, 'predicted_labels'] = ', '.join(sorted(preds))
    return df

In [7]:
gen_dir = Path('generated_data/wanli_disagreement_p0.9_davinci-002')
df_wanli_disagreement_instruct = pd.read_json(gen_dir / 'filtered_examples.jsonl', lines=True)

In [8]:
df = compute_example_ambiguity(df_wanli_disagreement_instruct)

100%|██████████| 77564/77564 [18:52<00:00, 68.46it/s]


In [9]:
thres = 0.05
sub_df = df.loc[df['ambiguity_score'] > thres]
sub_df

Unnamed: 0,id,premise,hypothesis,nearest_neighbors,ambiguity_score,predicted_labels
0,0,The proposal was met with some skepticism from...,The proposal was met with some optimism from t...,"[82936, 245722, 331487, 19994]",0.925016,"contradiction, neutral"
1,2,The company's decision to downsize was met wit...,The company's decision to downsize was met wit...,"[82936, 245722, 331487, 19994]",0.886042,"contradiction, neutral"
2,3,The amount of money that was spent on the proj...,The amount of money that was saved on the proj...,"[82936, 245722, 331487, 19994]",0.630185,"contradiction, neutral"
3,4,We cannot be sure that the meeting will be pro...,We cannot be sure that the meeting will not be...,"[82936, 245722, 331487, 19994]",0.126387,"contradiction, neutral"
4,5,The company will only offer the position to so...,The company will only offer the position to so...,"[214335, 8249, 65040, 102411]",0.791046,"entailment, neutral"
...,...,...,...,...,...,...
77557,104063,Most people in the United States speak English.,English is the official language of the United...,"[22805, 173022, 66665, 188215]",0.747514,"entailment, neutral"
77558,104064,The novel is a fiction.,The movie is based on a true story.,"[22805, 173022, 66665, 188215]",0.662432,"contradiction, neutral"
77559,104065,"The poet T.S. Eliot wrote, ""We shall not cease...",We never really know a place until we leave it.,"[22805, 173022, 66665, 188215]",0.066416,"contradiction, neutral"
77561,104067,The researchers say that this is the first stu...,This is the first study to look at the long-te...,"[133594, 371112, 155042, 348420]",0.744364,"entailment, neutral"


In [10]:
old_balanced_df = pd.read_json(gen_dir / 'balanced_examples_old.jsonl', lines=True)
old_ids = old_balanced_df.id.tolist()

In [11]:
# include all examples with contradiction label
con_mask = sub_df['predicted_labels'].str.contains('contradiction')
balanced_df = sub_df[con_mask]
# get label distribution
counter = [ls.split(', ') for ls in balanced_df.predicted_labels.tolist()]
counter = Counter([l for ls in counter for l in ls])
# patch up with entailment examples
num_entailment_needed = counter['contradiction'] - counter['entailment']
ent_mask = sub_df['predicted_labels'].str.contains('entailment')
# balanced_df = pd.concat([balanced_df, sub_df[~con_mask][ent_mask].sample(num_entailment_needed)])
# use examples from previous data
past_df = sub_df.loc[sub_df['id'].isin(old_ids)][~con_mask][ent_mask]
balanced_df = pd.concat([balanced_df, past_df])
balanced_df = pd.concat([balanced_df, sub_df.loc[~sub_df['id'].isin(old_ids)][~con_mask][ent_mask].sample(num_entailment_needed-len(past_df))])

  past_df = sub_df.loc[sub_df['id'].isin(old_ids)][~con_mask][ent_mask]
  balanced_df = pd.concat([balanced_df, sub_df.loc[~sub_df['id'].isin(old_ids)][~con_mask][ent_mask].sample(num_entailment_needed-len(past_df))])


In [12]:
balanced_df.predicted_labels.value_counts()

contradiction, neutral                6850
entailment, neutral                   6850
contradiction, entailment, neutral    2531
contradiction, entailment              595
Name: predicted_labels, dtype: int64

In [14]:
print(len(set(balanced_df.id.tolist()).intersection(set(old_ids))))
print(len(set(old_ids)))
print(len(balanced_df.index))

16812
16951
16826


In [15]:
balanced_df.sample(frac=1).to_csv('annotation/ambignli/balanced_examples.csv', index=False)
balanced_df.sample(frac=1).to_json(gen_dir / 'balanced_examples.jsonl', lines=True, orient='records')

## pre-create a bunch of new-batches

In [46]:
balanced_df = pd.read_csv('annotation/ambignli/balanced_examples.csv')
annotated_ids = pd.read_json('annotation/ambignli/annotated_examples.jsonl', lines=True)['id'].tolist()

In [47]:
batch_size = 100
remaining_pool_df = balanced_df[~balanced_df.id.isin(annotated_ids)]
print(len(remaining_pool_df))
ensure_dir('annotation/batches/nextbatches')

15002


In [48]:
for i, batch_df in enumerate(np.array_split(remaining_pool_df, 15030//100)):
    batch_df.to_csv(f'annotation/batches/nextbatches/examples_{i}.csv', index=False)

In [49]:
batches_dir = Path('annotation/batches')
dirs = [d for d in os.listdir(batches_dir) if (os.path.isdir(batches_dir / d) and d.startswith('batch_'))]
batch_dfs = []

for batch_dir in dirs:
    batch_id = int(batch_dir.split('_')[-1])
    batch_df = pd.read_csv(batches_dir / batch_dir / f'Batch_{batch_id}_batch_results.csv')
    batch_dfs.append(batch_df)

annotations_df = pd.concat(batch_dfs)

## create batches for singly labeled examples

In [21]:
annotated_df = pd.read_json('annotation/ambignli/cleaned_examples.jsonl', lines=True)
annotated_df.head(3)

Unnamed: 0,id,worker_ids,premise,hypothesis,annotations,disambiguations
0,4,"[A3FVGZKEEKXBON, A15WACUALQNT90]",We cannot be sure that the meeting will be pro...,We cannot be sure that the meeting will not be...,"[entailment|neutral, entailment]",{'neutral': [{'premise': 'We cannot be sure th...
1,39,"[A362GR9VFKI1V4, A15WACUALQNT90]",The person who told me the story is an unrelia...,I can't believe the story because the person w...,"[entailment, neutral]",{}
2,75,[AZ1568AZA22GD],The theme of the conference is 'Empowering You...,The conference is about empowering young girls.,[entailment],{}


In [22]:
def single_annotation(row):
    return True if len(row['worker_ids']) == 1 else False

single_df = annotated_df[annotated_df.apply(single_annotation, axis=1)]
print(f'There are {len(single_df.index)} singly-labeled examples, out of a total of {len(annotated_df.index)} annotated examples')
single_df['worker_id'] = [l[0] for l in single_df['worker_ids']]
single_df.drop('worker_ids', inplace=True, axis=1)
single_df.head(3)

There are 576 singly-labeled examples, out of a total of 2017 annotated examples


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_df['worker_id'] = [l[0] for l in single_df['worker_ids']]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,id,premise,hypothesis,annotations,disambiguations,worker_id
2,75,The theme of the conference is 'Empowering You...,The conference is about empowering young girls.,[entailment],{},AZ1568AZA22GD
20,1070,The company has not yet released a statement.,A company representative declined to comment.,[neutral],{},A3FVGZKEEKXBON
23,1388,It's getting harder and harder to find a good ...,I'm not sure if I'll be able to find a good job.,[entailment|neutral],{'entailment': [{'premise': 'It's getting hard...,A1QW104XQNIICA


In [26]:
balanced_df = pd.read_csv('annotation/ambignli/balanced_examples.csv')

In [27]:
# examples that should be annotated by Emma
emma_ids = single_df.loc[single_df['worker_id'] != 'A15WACUALQNT90'].id.tolist()
not_emma_ids = single_df.loc[single_df['worker_id'] == 'A15WACUALQNT90'].id.tolist()

In [29]:
balanced_df.loc[balanced_df['id'].isin(emma_ids)].to_csv('annotation/batches/batch_emma/examples.csv', index=False)
balanced_df.loc[balanced_df['id'].isin(enot_emma_ids)].to_csv('annotation/batches/batch_not_emma/examples.csv', index=False)