In [1]:
from pathlib import Path

while Path.cwd().name != 'ambignli':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambignli


In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import cohen_kappa_score
from itertools import combinations
from utils.constants import id2label
from utils.mturk_utils import read_batch, statistics_for_worker
import seaborn as sns
import matplotlib.pyplot as plt
import locale
from datetime import datetime
# Switch only the LC_TIME category (date/time formatting and parsing) to US English.
# NOTE(review): presumably needed for handling timestamps in the batch data -- confirm.
# setlocale raises locale.Error if the host has no locale named 'en_US'.
locale.setlocale(locale.LC_TIME, 'en_US')

'en_US'

In [10]:
# Identifier of the batch to analyse. read_batch comes from utils.mturk_utils;
# presumably it returns a pandas DataFrame with one row per submitted
# annotation -- TODO confirm against the helper.
batch_id = 365428
batch_df = read_batch(batch_id)

In [11]:
# Number of rows contributed by each worker, most active first.
batch_df['worker_id'].value_counts()

A15WACUALQNT90    174
AZ1568AZA22GD      65
A362GR9VFKI1V4     59
A19B8RMTSQ93UN     34
A2D2ULNNAS97Z6     13
A1HBYLYXKD7VWX      7
A150G63O9PWMRT      1
A3FVGZKEEKXBON      1
Name: worker_id, dtype: int64

In [12]:
annotated_ids = set(batch_df['id'])

# One entry per example id: the list of top-level (q0_gold) answers it received.
scores_list = []
for _, example_df in batch_df.groupby('id'):
    annotations = example_df['q0_gold'].tolist()
    # Leave out examples where any answer is not a string.
    if not all(isinstance(answer, str) for answer in annotations):
        continue
    # Leave out examples where any answer contains 'discard'.
    if any('discard' in answer for answer in annotations):
        continue
    scores_list.append(annotations)

In [13]:
# agreement overall: share of kept examples whose answers are all the same string.
# An example with a single answer counts as agreeing, since its set of answers has size 1.
is_unanimous = [len(set(answers)) == 1 for answers in scores_list]
np.sum(is_unanimous) / len(scores_list)

0.5769230769230769

In [14]:
worker_ids = set(batch_df['worker_id'].tolist())

# For every worker, print the id followed by whatever statistics_for_worker returns for it.
for worker_id in worker_ids:
    print(worker_id)
    worker_stats = statistics_for_worker(batch_df, worker_id)
    print(worker_stats)

A1HBYLYXKD7VWX
{'num_examples': 7, 'median_time': 82.072, 'prop_ambiguous': 0.14285714285714285, 'prop_discard': 0.42857142857142855}
A15WACUALQNT90
{'num_examples': 174, 'median_time': 20.751, 'prop_ambiguous': 0.20114942528735633, 'prop_discard': 0.19540229885057472}
A362GR9VFKI1V4
{'num_examples': 59, 'median_time': 34.223, 'prop_ambiguous': 0.23728813559322035, 'prop_discard': 0.3898305084745763}
AZ1568AZA22GD
{'num_examples': 65, 'median_time': 26.48, 'prop_ambiguous': 0.2153846153846154, 'prop_discard': 0.2153846153846154}
A3FVGZKEEKXBON
{'num_examples': 1, 'median_time': 139.166, 'prop_ambiguous': 1.0, 'prop_discard': 0.0}
A150G63O9PWMRT
{'num_examples': 1, 'median_time': 17.714, 'prop_ambiguous': 0.0, 'prop_discard': 0.0}
A2D2ULNNAS97Z6
{'num_examples': 13, 'median_time': 18.745, 'prop_ambiguous': 0.0, 'prop_discard': 0.15384615384615385}
A19B8RMTSQ93UN
{'num_examples': 34, 'median_time': 46.510999999999996, 'prop_ambiguous': 0.38235294117647056, 'prop_discard': 0.0882352941176

In [15]:
# Print every annotation submitted by one worker so it can be read through by hand.
for worker_id in worker_ids:
    # Only this worker is shown; change the id to inspect someone else.
    if worker_id != 'A19B8RMTSQ93UN':
        continue
    worker_sub_df = batch_df.loc[batch_df['worker_id'] == worker_id]
    print(f'\n-------------------- {worker_id} ({len(worker_sub_df)}) --------------------\n')
    for _, row in worker_sub_df.iterrows():
        premise, hypothesis = row['premise'], row['hypothesis']
        print(f'P: {premise}\nH: {hypothesis}')
        labels = row["q0_gold"]
        print(f'Labels: {labels}')
        # A '|' in the answer means several labels were given. The isinstance
        # check keeps a non-string answer (e.g. NaN for a missing value) from
        # raising TypeError on the `in` test.
        if isinstance(labels, str) and '|' in labels:
            # Slots 1-4 hold the optional per-label rewrites; a distinct loop
            # name avoids shadowing the row index of the enclosing loop.
            for slot in range(1, 5):
                if f'q{slot}_gold' in row and not pd.isnull(row[f'q{slot}_gold']):
                    revised_premise = row[f'premise{slot}']
                    revised_hypothesis = row[f'hypothesis{slot}']
                    print(row[f'q{slot}_gold'].upper())
                    # A prime marks a sentence that differs from the original.
                    premise_tag = "P" if revised_premise == premise else "P'"
                    hypothesis_tag = "H" if revised_hypothesis == hypothesis else "H'"
                    print(f"{premise_tag}: {revised_premise}")
                    print(f"{hypothesis_tag}: {revised_hypothesis}")
        # '{}' is treated as "no feedback given".
        if row['feedback'] != '{}':
            print(f'Comments: {row["feedback"]}')
        print('~')


-------------------- A1HBYLYXKD7VWX (7) --------------------

P: An increase in productivity can lead to a higher standard of living.
H: A higher standard of living can lead to an increase in productivity.
Labels: neutral
~
P: The senator's campaign promises include free college tuition and better job opportunities.
H: The governor's campaign promises include free college tuition and better job opportunities.
Labels: discard
~
P: A non-profit organization's purpose is to serve the public.
H: The organization's purpose is to serve the public.
Labels: neutral
~
P: The company's financial situation is very bad.
H: The company is in danger of going bankrupt.
Labels: discard
~
P: The first step is to contact your state's department of insurance.
H: The next step is to contact your state's department of insurance.
Labels: entailment|contradiction
ENTAILMENT
P: The first step is to contact your state's department of insurance.
H': If you haven't started yet, the next step is to contact your 

In [None]:
# agreement on singly labeled examples
# Keep examples in which no answer contains '|' (i.e. nobody gave several labels).
singly_labeled = [answers for answers in scores_list if all('|' not in answer for answer in answers)]
# Only examples with more than one answer can show agreement or disagreement.
comparable = [answers for answers in singly_labeled if len(answers) > 1]
np.sum([len(set(answers)) == 1 for answers in comparable]) / len(comparable)

In [None]:
# NOTE(review): `results_df` is not defined in any cell visible here -- confirm
# where it is loaded before running this cell on a fresh kernel.
fig, ax = plt.subplots()
# person_df = results_df.loc[results_df['WorkerId'] == 'A2I77AI2YH9WZG']
person_df = results_df
# Judging by the axis label below, 'Answer.ee' holds the seconds spent per example.
seconds_per_example = person_df['Answer.ee']
sns.histplot(seconds_per_example, kde=True, ax=ax)
ax.set_xlabel('Time in seconds per example')
# Upper quartile of the plotted values.
print(seconds_per_example.quantile(q=0.75))

In [None]:
# Cohen's kappa over all pairs of answers given to the same example.
# Each example with more than one answer contributes every 2-combination of
# its answers; the first elements form one rater sequence, the second elements
# the other.
# (A disabled variant used to count a pair as agreeing when one answer string
# was contained in the other; the pairs are now used unmodified.)
pairs = [
    pair
    for answers in scores_list
    if len(answers) > 1
    for pair in combinations(answers, 2)
]
labels1 = [first for first, _ in pairs]
labels2 = [second for _, second in pairs]
cohen_kappa_score(labels1, labels2)

In [None]:
# Collapse all annotations of each example into one record with the original
# pair, the top-level answers, the per-label rewrites and the free-text comments.
# NOTE(review): `results_df` and `textify` are not defined in any cell visible
# here -- confirm where they come from before running on a fresh kernel.
processed_examples = []
# The loop variable is deliberately not called `id`: at notebook scope that
# would rebind the builtin `id` for the rest of the kernel session.
for example_id in annotated_ids:
    sub_df = results_df.loc[results_df['Input.id'] == example_id]
    # The input fields are read from the first row of the example's rows.
    dummy_row = sub_df.iloc[0]
    annotations = sub_df['Answer.q0_gold'].tolist()
    annotations = [textify(a) for a in annotations]
    # label -> list of {'premise', 'hypothesis'} rewrites proposed for that label
    rewrites = defaultdict(list)
    for _, row in sub_df.iterrows():
        # Slots 1-4 hold the optional rewrites; empty slots are skipped.
        for i in range(1, 5):
            if f'Answer.q{i}_gold' in row and not pd.isnull(row[f'Answer.q{i}_gold']):
                label = id2label[row[f'Answer.q{i}_gold']]
                rewrites[label].append({
                    'premise': row[f'Answer.premise{i}'],
                    'hypothesis': row[f'Answer.hypothesis{i}']
                })
    processed_examples.append({
        'premise': dummy_row['Input.premise'],
        'hypothesis': dummy_row['Input.hypothesis'],
        'annotations': annotations,
        'rewrites': rewrites,
        'comments': sub_df['Answer.feedback'].tolist()
    })

In [None]:
# Write a plain-text report of the processed examples to results.txt in batch_dir.
# NOTE(review): `batch_dir` is not defined in any cell visible here -- confirm.
with open(batch_dir / 'results.txt', 'w') as fo:
    for ex in processed_examples:
        # Examples with exactly one annotation are left out of the report.
        if len(ex['annotations']) == 1:
            continue
        original_premise, original_hypothesis = ex['premise'], ex['hypothesis']
        fo.write(f'Premise:\t{original_premise}\n')
        fo.write(f'Hypothesis:\t{original_hypothesis}\n')
        fo.write(f'Annotations:\t{ex["annotations"]}\n')
        for plausible_label, revisions in ex['rewrites'].items():
            fo.write(f'{plausible_label.upper()}\t\n')
            for revision in revisions:
                # A prime marks a sentence that differs from the original.
                premise_tag = "P" if revision['premise'] == original_premise else "P'"
                hypothesis_tag = "H" if revision['hypothesis'] == original_hypothesis else "H'"
                fo.write(f'\t{premise_tag}:\t{revision["premise"]}\n')
                fo.write(f'\t{hypothesis_tag}:\t{revision["hypothesis"]}\n')
        fo.write('Comments:\n')
        for comment in ex['comments']:
            fo.write(f'\t{comment}\n')
        fo.write('------------------------------\n')

In [None]:
# Display the comments of `ex`, i.e. whichever example the most recent
# `for ex in processed_examples` loop left bound (the last one iterated).
ex['comments']