In [23]:
import sys
sys.path.append('../analysis')
from utils.all import *

In [24]:
# Export SimpEval_22 and Simpeval_Ext
data = load_data('annotated', batch_num=[5, 6, 7, 8, 9, 10, 11], preprocess=False)

# Substring rewrites applied, in this order, to every 'system' label.
system_renames = [
    ('new-wiki-2', 'simpeval-22'),
    ('new-wiki-1', 'simpeval-22'),
    (' Writing', '-written'),
    ('Human ', 'Human-'),
    ('new-wiki-3', 'simpeval-ext'),
]

for sent in data:
    renamed = sent['system']
    for old, new in system_renames:
        renamed = renamed.replace(old, new)
    sent['system'] = renamed

# Split by dataset tag; 'simpeval-22' takes precedence if a label carries both tags.
simpeval_22 = [sent for sent in data if 'simpeval-22' in sent['system']]
simpeval_ext = [
    sent for sent in data
    if 'simpeval-22' not in sent['system'] and 'simpeval-ext' in sent['system']
]

In [25]:
# Sanity check: Count the # annotations per system
def number_annotations_per_system(data):
    """Print one ``"<count>: <system>"`` line per distinct system in ``data``.

    Args:
        data: Iterable of dicts, each having a ``'system'`` key.

    Returns:
        None. Lines are printed in order of each system's first appearance
        in ``data``, so the output is the same on every run.
    """
    # Count in a single pass instead of rescanning ``data`` once per system.
    counts = {}
    for s in data:
        counts[s['system']] = counts.get(s['system'], 0) + 1

    for system, count in counts.items():
        print(f"{count}: {system}")
# Print the per-system counts of both splits, separated by a blank line.
for position, split in enumerate((simpeval_22, simpeval_ext)):
    if position > 0:
        print('')
    number_annotations_per_system(split)

180: simpeval-22/T5-3B
180: simpeval-22/Human-1-written
180: simpeval-22/Human-2-written
180: simpeval-22/Muss
180: simpeval-22/GPT-3-few-shot
180: simpeval-22/T5-11B
180: simpeval-22/GPT-3-zero-shot

120: simpeval-ext/GPT-3-few-shot
120: simpeval-ext/T5-3B
120: simpeval-ext/GPT-3-zero-shot
120: simpeval-ext/T5-11B
120: simpeval-ext/Human-1-written
120: simpeval-ext/Muss
120: simpeval-ext/Human-2-written


In [26]:
# Write each split to its own JSON file, indented by 4 spaces.
for out_path, records in (
    ("salsa/simpeval_22.json", simpeval_22),
    ("salsa/simpeval_ext.json", simpeval_ext),
):
    with open(out_path, "w") as f:
        json.dump(records, f, indent=4)

In [None]:
# Export the preliminary annotations
preliminary = load_data('annotated', batch_num=[1, 2, 3, 4], preprocess=False)

# Rewrite the 'new_systems' / 'systems' part of each system label to 'turkcorpus'.
# 'new_systems' has to be handled before the bare 'systems', otherwise it would
# end up as 'new_turkcorpus'. Each substring only needs replacing once.
for sent in preliminary:
    sent['system'] = sent['system'] \
        .replace('new_systems', 'turkcorpus') \
        .replace('systems', 'turkcorpus')

# Sanity check: count the annotations per system
number_annotations_per_system(preliminary)

Found users: {'ayush', 'kelly', 'vinayak', 'vishnesh', 'rachel', 'anton'}

74: turkcorpus/con_simplification.txt
74: turkcorpus/turk_corpus_random.txt
74: turkcorpus/asset.test.simp
74: turkcorpus/T5.txt
74: turkcorpus/asset.test.simp.second


In [None]:
# Write the preliminary annotations as JSON, indented by 4 spaces.
preliminary_path = "salsa/turkcorpus_preliminary.json"
with open(preliminary_path, "w") as f:
    json.dump(preliminary, f, indent=4)

### Prepare Adjudication

In [39]:
# Given a user, for each sentence in interwoven,
# reorder the annotations so that the user's annotation is in
# the first position
def interweave(data, user=None):
    """Split the entries of each sentence into position-aligned lists.

    Entries are grouped by ``'sentence_id'``. The i-th entry of every group is
    appended to ``interwoven[i]``, so the same index in ``interwoven[0]``,
    ``interwoven[1]`` and ``interwoven[2]`` refers to the same sentence.

    Args:
        data: Iterable of dicts with at least ``'sentence_id'`` and ``'user'``
            keys. The dicts are not modified.
        user: Optional user name. When given, sentences without an entry from
            this user are skipped and this user's entries are placed first
            within each group.

    Returns:
        Dict mapping position index to a list of entries. Each entry is a
        shallow copy of the input dict without the ``'annotation'`` and
        ``'annotations'`` keys. Groups with fewer than three entries are
        padded with empty lists up to three positions. Sentences appear in
        the order their id is first seen in ``data``.
    """
    # Group in one pass; dict insertion order keeps the output reproducible
    # (iterating a set of ids does not guarantee a stable order).
    grouped = {}
    for s in data:
        grouped.setdefault(s['sentence_id'], []).append(s)

    interwoven = {}
    for sents in grouped.values():
        # Order user's annotations to be first
        if user is not None:
            if all(s['user'] != user for s in sents):
                continue
            sents = [s for s in sents if s['user'] == user] + [s for s in sents if s['user'] != user]

        # Strip the annotation payloads on copies so the caller's data stays intact.
        sents = [
            {k: v for k, v in s.items() if k not in ('annotation', 'annotations')}
            for s in sents
        ]

        # Pad to three positions; range() is empty when there are already three or more.
        sents += [[] for _ in range(3 - len(sents))]

        for i, s in enumerate(sents):
            interwoven.setdefault(i, []).append(s)
    return interwoven

def prepare_adjudiction(data, user=None, batch_size=100, out_dir="inspection"):
    """Write the interwoven annotation sets of ``data`` to JSON files.

    ``interweave(data, user=user)`` yields one list per position index ``i``.
    Without a user (or with ``batch_size=None``) each list is written whole to
    ``<out_dir>[/<user>]/set_<i>.json``. With both a user and a batch size,
    each list is cut into consecutive chunks of ``batch_size`` entries and
    chunk ``j`` is written to ``<out_dir>/<user>/batch_<j>/set_<i>.json``.

    Args:
        data: Entries passed on to ``interweave``.
        user: Optional user name; selects the per-user layout.
        batch_size: Entries per batch for the per-user layout, or ``None`` to
            disable batching. Ignored when ``user`` is ``None``.
        out_dir: Root output directory.

    Raises:
        ValueError: If batching applies and ``batch_size`` is not positive.
    """
    interwoven = interweave(data, user=user)

    batched = batch_size is not None and user is not None
    if batched and batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    base_path = out_dir if user is None else f"{out_dir}/{user}"

    for i, sents in interwoven.items():
        if batched:
            for j in range(math.ceil(len(sents) / batch_size)):
                batch_path = f"{base_path}/batch_{j}"
                # exist_ok avoids the check-then-create race of exists() + makedirs().
                os.makedirs(batch_path, exist_ok=True)
                with open(f'{batch_path}/set_{i}.json', "w") as f:
                    # Slicing clamps at the end of the list, so the last batch may be shorter.
                    json.dump(sents[j*batch_size:(j+1)*batch_size], f, indent=4)
        else:
            os.makedirs(base_path, exist_ok=True)
            with open(f'{base_path}/set_{i}.json', "w") as f:
                json.dump(sents, f, indent=4)

In [40]:
# All annotators annotated ~350 sentences.
# Assign one of those sentences to each annotator - around 117 sentences per annotator. Split into 2 batches
data = simpeval_22 + simpeval_ext

# One interwoven view per annotator, with that annotator's entries ordered first.
annotators = set(s['user'] for s in data)
interwoven = {}
for user in annotators:
    interwoven[user] = interweave(data, user=user)

# TODO : Split algorithm

In [41]:
# Separate each of the three annotations of each sentence:
# first across all annotators, then once more per annotator.
data = simpeval_22 + simpeval_ext
prepare_adjudiction(data)

annotators = set(s['user'] for s in data)
for user in annotators:
    prepare_adjudiction(data, user)