### Data Ingestion
Split the adjudicated annotations by dataset and write them to `salsa/simpeval_22_ajudicated.json` and `salsa/simpeval_ext_ajudicated.json`.

In [10]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../analysis')
from utils.all import *
from data_util import *
# Surface INFO-level log records (the loader reports files and users at INFO).
log.basicConfig(level=log.INFO)

# Load the adjudicated annotations without preprocessing.
data = load_data(
    '../data/inspection_rating_annotated',
    preprocess=False,
    adjudicated=True,
)

# Bucket each sentence by the dataset named inside its 'system' field.
# Sentences matching neither dataset name are left out of both lists.
simpeval_22 = []
simpeval_ext = []
for sent in data:
    system_name = sent['system']
    if 'simpeval-22' in system_name:
        simpeval_22.append(sent)
    elif 'simpeval-ext' in system_name:
        simpeval_ext.append(sent)

# Sanity check: Count the # annotations per system
def number_annotations_per_system(data):
    """Print one ``"<count>: <system>"`` line per distinct system in ``data``.

    Args:
        data: iterable of dict-like records, each with a hashable ``'system'``
            entry.

    Systems are printed in the order they are first seen in ``data``, so the
    output is stable from run to run. Nothing is printed for empty input.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # Single pass over the records instead of re-scanning once per system.
    counts = Counter(s['system'] for s in data)
    for system, count in counts.items():
        print(f"{count}: {system}")
# Print the per-system counts for both subsets, separated by a blank line.
number_annotations_per_system(simpeval_22)
print('')
number_annotations_per_system(simpeval_ext)

# Serialize each subset to its own JSON file (4-space indented).
for json_path, subset in (
    ("salsa/simpeval_22_ajudicated.json", simpeval_22),
    ("salsa/simpeval_ext_ajudicated.json", simpeval_ext),
):
    with open(json_path, "w") as handle:
        json.dump(subset, handle, indent=4)

INFO:Loading files: ['../data/inspection_rating_annotated/batch_1_ayush.json', '../data/inspection_rating_annotated/batch_1_rachel.json', '../data/inspection_rating_annotated/batch_1_vinayak.json', '../data/inspection_rating_annotated/batch_1_vishnesh.json', '../data/inspection_rating_annotated/batch_2_ayush.json', '../data/inspection_rating_annotated/batch_2_rachel.json', '../data/inspection_rating_annotated/batch_2_vinayak.json', '../data/inspection_rating_annotated/batch_2_vishnesh.json', '../data/inspection_rating_annotated/batch_3_ayush.json', '../data/inspection_rating_annotated/batch_3_rachel.json', '../data/inspection_rating_annotated/batch_3_vinayak.json', '../data/inspection_rating_annotated/batch_3_vishnesh.json', '../data/inspection_rating_annotated/batch_4_ayush.json', '../data/inspection_rating_annotated/batch_4_rachel.json', '../data/inspection_rating_annotated/batch_4_vinayak.json', '../data/inspection_rating_annotated/batch_4_vishnesh.json']



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:Found users: {'vinayak', 'ayush', 'rachel', 'vishnesh'}



180: simpeval-22/Muss
180: simpeval-22/T5-3B
180: simpeval-22/GPT-3-zero-shot
180: simpeval-22/GPT-3-few-shot
180: simpeval-22/Human-2-written
180: simpeval-22/Human-1-written
180: simpeval-22/T5-11B

120: simpeval-ext/Muss
120: simpeval-ext/T5-11B
120: simpeval-ext/GPT-3-few-shot
120: simpeval-ext/T5-3B
120: simpeval-ext/Human-2-written
120: simpeval-ext/Human-1-written
120: simpeval-ext/GPT-3-zero-shot


### LENS-SALSA Training Data

In [11]:
# Reload the same annotation directory, this time with preprocessing enabled.
# NOTE(review): the cells below read sent['score'], sent['subscores'] and
# sent['processed_annotations'] -- presumably added by preprocessing; confirm.
data = load_data(
    '../data/inspection_rating_annotated',
    preprocess=True,
    adjudicated=True,
)

INFO:Loading files: ['../data/inspection_rating_annotated/batch_1_ayush.json', '../data/inspection_rating_annotated/batch_1_rachel.json', '../data/inspection_rating_annotated/batch_1_vinayak.json', '../data/inspection_rating_annotated/batch_1_vishnesh.json', '../data/inspection_rating_annotated/batch_2_ayush.json', '../data/inspection_rating_annotated/batch_2_rachel.json', '../data/inspection_rating_annotated/batch_2_vinayak.json', '../data/inspection_rating_annotated/batch_2_vishnesh.json', '../data/inspection_rating_annotated/batch_3_ayush.json', '../data/inspection_rating_annotated/batch_3_rachel.json', '../data/inspection_rating_annotated/batch_3_vinayak.json', '../data/inspection_rating_annotated/batch_3_vishnesh.json', '../data/inspection_rating_annotated/batch_4_ayush.json', '../data/inspection_rating_annotated/batch_4_rachel.json', '../data/inspection_rating_annotated/batch_4_vinayak.json', '../data/inspection_rating_annotated/batch_4_vishnesh.json']

INFO:Found users: {'vinaya

In [12]:
# Add LENS scores (plus BLEU, BERTScore, SARI and COMET) to each sentence
with open('../lens/4-scores.json', 'r') as f:
    scores = json.load(f)

# Index the score records by their (original, simplified) pair so each sentence
# is matched with one dict lookup instead of a scan over every record.
# If a pair occurs more than once, the last record wins -- the same result the
# previous nested loop produced.
scores_by_pair = {
    (score['original'], score['simplified']): score
    for score in scores
}

for sent in data:
    match = scores_by_pair.get((sent['original'], sent['simplified']))
    if match is None:
        # No score record for this pair; the sentence is filtered out below.
        continue
    sent['lens_score'] = match['lens']
    sent['bleu'] = match['bleu']
    sent['bertscore'] = match['bertscore']
    sent['sari'] = match['sari']
    sent['comet'] = match['comet']

# Exclude corrupted sentences with no scores
data = [s for s in data if 'bleu' in s]

In [34]:
# Build one flat record per sentence in `data` and collect them in `out_file`.
# Original layout note: lp,src,mt,ref,score,raw_score

out_file = []
for sent in data:
    subscores = sent['subscores']

    entry = {
        # 'sentence_id': sent['sentence_id'],
        'lp': 'en-en',
        'src': sent['original'],
        'mt': sent['simplified'],
        'system': sent['system'],
        'salsa_score': sent['score'],
        'lens_score': sent['lens_score'],

        'bleu': sent['bleu'],
        'bertscore': sent['bertscore'],
        'sari': sent['sari'],
        'comet': sent['comet'],
        # Filled in below when the sentence carries SimpEval ratings.
        'simpeval_score_1': None,
        'simpeval_score_2': None,
        'simpeval_score_3': None,

        # Subscores
        'salsa_lexical_quality_score': subscores['quality_lexical'],
        'salsa_syntax_quality_score': subscores['quality_syntax'],
        'salsa_conceptual_quality_score': subscores['quality_content'],

        'salsa_lexical_error_score': subscores['error_lexical'],
        'salsa_syntax_error_score': subscores['error_syntax'],
        'salsa_conceptual_error_score': subscores['error_content'],

        'salsa_lexical_score': subscores['lexical'],
        'salsa_syntax_score': subscores['syntax'],
        'salsa_conceptual_score': subscores['content'],

        'salsa_quality_score': subscores['quality'],
        'salsa_error_score': subscores['error'],
    }

    if sent['simpeval_scores'] is not None:
        entry.update({
            'simpeval_score_1': sent['simpeval_scores'][0],
            'simpeval_score_2': sent['simpeval_scores'][1],
            'simpeval_score_3': sent['simpeval_scores'][2],
        })

    # Generate word-level QE: once without a family constraint (None), then
    # once per member of Family.
    for family_constraint in [None] + list(Family):
        # Key suffix naming the family these tags belong to. The suffix was
        # previously hard-coded to Family.CONTENT, so every family wrote to the
        # same '_content' key and overwrote the one before it.
        fam_name = (
            f'_{family_constraint.value.lower()}'
            if family_constraint is not None
            else ''
        )

        for sentence_type in ['original', 'simplified']:
            tags = get_annotations_per_token([sent], sentence_type, remove_none=False, tagging=True)
            tags_by_type = get_tag_values(tags, family_constraint)

            for tag_type, tag_value in tags_by_type.items():
                tag_value = write_tagged_sentence(sent[sentence_type], tags, tag_value)

                # Strip the <ok> and <noedit> markers from the tagged text.
                tag_value = (
                    tag_value
                    .replace('<ok>', '')
                    .replace('</ok>', '')
                    .replace('<noedit>', '')
                    .replace('</noedit>', '')
                )

                entry[f'{tag_type}_{sentence_type}{fam_name}'] = tag_value

    # Generate edit type tagging
    for sentence_type in ['original', 'simplified']:
        tags = get_annotations_per_token([sent], sentence_type, collapse_composite=True, remove_reorder=True, remove_none=False)
        tags_by_type = get_edit_values(tags)

        for tag_type, tag_value in tags_by_type.items():
            tag_value = write_tagged_sentence(sent[sentence_type], tags, tag_value)

            # Only the <ok> markers are stripped here.
            tag_value = tag_value.replace('<ok>', '').replace('</ok>', '')

            entry[f'{tag_type}_{sentence_type}'] = tag_value

    # Generate traditional alignment setup
    entry.update({
        'alignment': get_word_alignment_string(sent),
        'alignment-no-phrases': get_word_alignment_string(sent, collapse_phrase_alignment=True)
    })

    out_file.append(entry)

In [35]:
# Get number of edit types
# Flatten the per-sentence 'processed_annotations' lists into one list of edits.
edits = [edit for sent in data for edit in sent['processed_annotations']]

# An edit counts as an error when its 'error_type' is set; the rest are quality edits.
num_error = sum(1 for edit in edits if edit['error_type'] is not None)
num_complex = sum(1 for edit in edits if edit['error_type'] == Error.COMPLEX_WORDING)
num_bad_del = sum(1 for edit in edits if edit['error_type'] == Error.BAD_DELETION)
num_quality = len(edits) - num_error

# Emit the counts as one LaTeX table row: quality, bad deletion, complex wording, all errors.
print(" & ".join(
    "\\textit{" + str(count) + "}"
    for count in (num_quality, num_bad_del, num_complex, num_error)
))

\textit{13057} & \textit{628} & \textit{449} & \textit{1901}


In [36]:
import random

# Split by source sentence, so every output for a given source lands in the
# same split and no source sentence is shared between train and validation.
#
# The unique sources are sorted and then shuffled with a fixed seed. Iteration
# order of a set of strings can change between interpreter runs (string hash
# randomization), so slicing list(set(...)) directly gave a different split on
# every fresh kernel.
SPLIT_SEED = 0
src_sents = sorted(set(s['src'] for s in out_file))
random.Random(SPLIT_SEED).shuffle(src_sents)

# 70% of the unique sources go to train, the remainder to validation.
split = int(len(src_sents) * 0.7)
src_sents_train, src_sents_val = src_sents[:split], src_sents[split:]

In [37]:
# Write the train and validation rows, selected by source-sentence membership.
for csv_path, src_pool in (
    ('salsa/lens-salsa-training/train.csv', src_sents_train),
    ('salsa/lens-salsa-training/valid.csv', src_sents_val),
):
    split_rows = [row for row in out_file if row['src'] in src_pool]
    write_csv(csv_path, split_rows)

In [40]:
# Write with & without collapsed alignment
# Each entry: (output path, source sentences selecting the rows, extra keyword
# arguments). The non-collapsed files pass no extra arguments.
for tsv_path, src_pool, extra_kwargs in (
    ('salsa/lens-salsa-training/train-align.tsv', src_sents_train, {}),
    ('salsa/lens-salsa-training/valid-align.tsv', src_sents_val, {}),
    ('salsa/lens-salsa-training/train-align-collapsed.tsv', src_sents_train, {'collapse_phrase_alignment': True}),
    ('salsa/lens-salsa-training/valid-align-collapsed.tsv', src_sents_val, {'collapse_phrase_alignment': True}),
):
    split_rows = [row for row in out_file if row['src'] in src_pool]
    write_tsv_align(tsv_path, split_rows, **extra_kwargs)