In [1]:
from pathlib import Path

while Path.cwd().name != 'ambignli':
    %cd ..

/mmfs1/gscratch/xlab/alisaliu/ambignli


In [32]:
import numpy as np
import pandas as pd
from utils.mturk_utils import read_batch, time_format
import os
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from datetime import datetime
import math
import locale
# Switch the LC_TIME locale category to 'en_US'. setlocale returns the locale string,
# which is what the cell echoes as its output.
# NOTE(review): locale names are platform-dependent (some systems only provide
# 'en_US.UTF-8') and setlocale raises locale.Error when the name is unavailable — confirm
# this runs on every machine the notebook is used on.
locale.setlocale(locale.LC_TIME, 'en_US')

'en_US'

In [3]:
# Collect the names of the per-batch sub-directories ('batch_<id>') under the batches folder.
batches_dir = Path('annotation/batches')
# os.listdir yields entries in arbitrary order. Sort by the numeric id suffix so the row
# order of everything built from `dirs` is reproducible across machines and runs.
# (The int() parse mirrors how the loading loop extracts the batch id from the name.)
dirs = sorted(
    (d for d in os.listdir(batches_dir) if (os.path.isdir(batches_dir / d) and d.startswith('batch_'))),
    key=lambda d: int(d.split('_')[-1]),
)

In [4]:
# Read every batch and keep a running tally of rows (HITs) per worker id.
batch_dfs = []
hits_per_annotator = Counter()

for batch_dir in dirs:
    # The batch id is whatever follows the last underscore in the directory name.
    *_, id_suffix = batch_dir.split('_')
    batch_id = int(id_suffix)
    batch_df = read_batch(batch_id)
    # value_counts() maps worker_id -> number of rows in this batch; fold it into the tally.
    hits_per_annotator += batch_df.worker_id.value_counts()
    batch_dfs.append(batch_df)

In [5]:
# Stack all per-batch frames into one frame holding every annotation.
# NOTE(review): concat is called without ignore_index, so each batch keeps its own index
# labels and labels may repeat across batches — confirm nothing relies on a unique index.
annotations_df = pd.concat(batch_dfs)
print(f'Total annotations: {len(annotations_df)}')
# Peek at three randomly chosen rows (no random_state is passed, so the rows differ per run).
annotations_df.sample(3)

Total annotations: 3221


Unnamed: 0,worker_id,submit_time,time_on_page,id,premise,hypothesis,feedback,premise1,premise2,premise3,premise4,hypothesis1,hypothesis2,hypothesis3,hypothesis4,q0_gold,q1_gold,q2_gold,q3_gold
107,A15WACUALQNT90,2022-09-30 10:14:51,11.866,9928,There is no single answer to the question.,The answer to the question is 42.,{},There is no single answer to the question.,There is no single answer to the question.,There is no single answer to the question.,There is no single answer to the question.,The answer to the question is 42.,The answer to the question is 42.,The answer to the question is 42.,The answer to the question is 42.,contradiction,,,
13,A15WACUALQNT90,2022-10-06 10:11:15,10.883,45737,I'll be there.,I'll be there for you.,{},I'll be there.,I'll be there.,I'll be there.,I'll be there.,I'll be there for you.,I'll be there for you.,I'll be there for you.,I'll be there for you.,neutral,,,
213,A3SPV7CWJNSGKX,2022-09-29 17:35:57,120.856,16320,This is not to say that I do not like tennis.,I do not like tennis.,{},"This is not to say that I do not like tennis, ...","This is not to say that I do not like tennis, ...",This is not to say that I do not like tennis.,This is not to say that I do not like tennis.,I do not like tennis.,I do not like tennis.,I do not like tennis.,I do not like tennis.,neutral|contradiction,contradiction,neutral,


In [29]:
# Dump the raw per-annotation frame as JSON Lines (one record per row).
# NOTE(review): a later cell writes the aggregated `example_rows` to this same path, so
# this output is overwritten when the notebook runs top to bottom.
annotations_df.to_json('annotation/ambignli/annotated_examples.jsonl', lines=True, orient='records')

26

In [None]:
# NOTE(review): `clean_batch` is neither defined nor imported in the cells visible here
# (the import cell only pulls `read_batch` and `time_format` from utils.mturk_utils) —
# confirm where it is supposed to come from before running this cell.
ambignli = clean_batch(annotations_df)
ambignli

In [None]:
# Write the per-example records as JSON Lines.
# NOTE(review): `example_rows` is only built by the cell that follows this one, so on a
# top-to-bottom run this line raises NameError; it also writes the same records that a
# later cell saves as 'annotated_examples.jsonl'.
pd.DataFrame(example_rows).to_json('annotation/ambignli/cleaned_examples.jsonl', lines=True, orient='records')

In [26]:
# Collapse the per-annotation rows into one record per example id.
example_rows = []
for _, example_df in annotations_df.groupby('id'):
    annotations = example_df['q0_gold'].tolist()
    # Drop the whole example as soon as any annotator's label mentions 'discard'.
    if 'discard' in ' '.join(annotations):
        continue
    worker_ids = example_df['worker_id'].tolist()
    # The original premise/hypothesis are read from the group's first row.
    first_row = example_df.iloc[0]
    example_id, premise, hypothesis = first_row.id, first_row.premise, first_row.hypothesis

    # label -> list of {'premise', 'hypothesis'} rewrites, pooled over all annotators
    rewrites = defaultdict(list)
    for _, row in example_df.iterrows():
        # Only rows whose q0_gold holds several '|'-separated labels contribute rewrites.
        if '|' in row['q0_gold']:
            for i in range(1, 5):
                # pd.isna stands in for `check_nan`, which is not defined or imported
                # anywhere in the notebook and raised NameError on a fresh kernel.
                if f'q{i}_gold' in row and not pd.isna(row[f'q{i}_gold']):
                    rewrites[row[f'q{i}_gold']].append({
                        'premise': row[f'premise{i}'],
                        'hypothesis': row[f'hypothesis{i}']
                    })

    example_rows.append({
        'id': example_id,
        'worker_ids': worker_ids,
        'premise': premise,
        'hypothesis': hypothesis,
        'annotations': annotations,
        # Store a plain dict so later key lookups cannot silently create empty entries.
        'disambiguations': dict(rewrites)
    })

In [27]:
# Write one JSON record per example (JSON Lines) from the aggregated `example_rows`.
# NOTE(review): an earlier cell dumps the raw `annotations_df` to this same path, so that
# file is replaced here — confirm the two outputs are not meant to coexist.
pd.DataFrame(example_rows).to_json('annotation/ambignli/annotated_examples.jsonl', lines=True, orient='records')

In [None]:
# Distribution of time spent per annotation, plus a back-of-the-envelope pay check.
fig, ax = plt.subplots()
times = annotations_df.time_on_page.tolist()
# Calendar day of each submission, stored as a new column on annotations_df (the per-day
# boxplot groups on it). .date() is called on every value, so submit_time must already hold
# datetime-like objects rather than strings.
annotations_df['submit_day'] = [t.date() for t in annotations_df['submit_time']]
sns.histplot(times, kde=True, ax=ax)
ax.set(xlim=(0, 200), xlabel='Time per example (sec)')

# Use the p-th percentile of time on page as the time one example takes.
p = 55
seconds_in_hour = 60 * 60
time_per_example = np.percentile(times, p)
print(f'p{p} time per example: {time_per_example}')

# Pay per example that would meet a $20/hour target at that pace.
goal_pay_per_second = 20 / seconds_in_hour
print(time_per_example * goal_pay_per_second)

# Hourly rate implied by paying `pay_per_example` dollars per example at that pace.
pay_per_example = 0.25
print(pay_per_example / time_per_example * seconds_in_hour)

In [None]:
# Box plot of time on page, one box per submission day.
fig, ax = plt.subplots()
sns.boxplot(x='submit_day', y='time_on_page', data=annotations_df, ax=ax)
# Clip the y-axis at 200 s and label both axes.
ax.set(ylim=(0, 200), xlabel='Date', ylabel='Time on page (sec)')
# Tilt the date tick labels so they do not overlap.
plt.xticks(rotation=45)

In [45]:
def validate(row):
    """Check that an annotation's rewrites are consistent with its ambiguity label.

    A row counts as ambiguous when its ``q0_gold`` contains ``'|'``. For such a row,
    every slot ``i`` in 1..4 that has a non-missing ``q{i}_gold`` label is inspected,
    and the function counts how many of those slots changed the premise and how many
    changed the hypothesis relative to the original sentences.

    Args:
        row: Mapping-like annotation record (e.g. a DataFrame row or a dict) with
            ``premise``, ``hypothesis``, ``q0_gold`` and, for each used slot,
            ``q{i}_gold``, ``premise{i}`` and ``hypothesis{i}``.

    Returns:
        ``False`` when the row is ambiguous and exactly one labeled slot rewrites the
        premise, or exactly one rewrites the hypothesis; ``True`` otherwise
        (including every non-ambiguous row).
    """
    premise, hypothesis = row['premise'], row['hypothesis']
    # Non-ambiguous rows carry no rewrites to check.
    if '|' not in row['q0_gold']:
        return True

    num_premise_revisions = 0
    num_hypothesis_revisions = 0
    for i in range(1, 5):
        label_key = f'q{i}_gold'
        # Skip slots the annotator left unlabeled. pd.isna stands in for `check_nan`,
        # which is not defined or imported anywhere in the notebook.
        if label_key not in row or pd.isna(row[label_key]):
            continue
        if row[f'premise{i}'] != premise:
            num_premise_revisions += 1
        if row[f'hypothesis{i}'] != hypothesis:
            num_hypothesis_revisions += 1

    # A sentence rewritten in exactly one slot is flagged — presumably because a genuine
    # disambiguation should produce at least two distinct rewrites; confirm with the task spec.
    return num_premise_revisions != 1 and num_hypothesis_revisions != 1

In [46]:
# Count the annotation rows that fail validate(). The tally is kept in `bad_rows`;
# this cell itself only shows the tqdm progress line.
bad_rows = sum(1 for _, row in tqdm(annotations_df.iterrows()) if not validate(row))

3221it [00:00, 21346.93it/s]
