### Data Ingestion
Create `simpeval_22_ajudicated.json` and `simpeval_ext_ajudicated.json`

In [141]:
%load_ext autoreload
%autoreload 2

import sys
# Extend the import path with ../analysis before importing `utils`
# (presumably where the `utils` package lives — confirm).
sys.path.append('../analysis')
# NOTE(review): no other import is visible in this cell, so this star import is
# presumably what brings `log`, `load_data` and `json` into scope — confirm
# against utils/all.py.
from utils.all import *
# Show INFO-level messages in the cell output.
log.basicConfig(level=log.INFO)
# Load the annotation data from the given directory.
# NOTE(review): the exact effect of preprocess=False / adjudicated=True is defined
# inside load_data, which is not visible here — confirm there.
data = load_data('../data/inspection_rating_annotated', preprocess=False, adjudicated=True)

# Split the loaded sentences by benchmark. The 'simpeval-22' substring check wins
# if both substrings appear in a system name; sentences matching neither are dropped.
simpeval_22 = []
simpeval_ext = []
for sent in data:
    system_name = sent['system']
    if 'simpeval-22' in system_name:
        simpeval_22.append(sent)
    elif 'simpeval-ext' in system_name:
        simpeval_ext.append(sent)

# Sanity check: Count the # annotations per system
def number_annotations_per_system(data):
    """Print how many entries of ``data`` belong to each system.

    Args:
        data: Iterable of dicts, each providing a ``'system'`` key.

    Prints one ``"<count>: <system>"`` line per distinct system, in order of
    first appearance in ``data``. Returns nothing.
    """
    # Count in a single pass instead of rescanning `data` once per system;
    # a plain dict keeps first-seen order, so the printout is stable across runs.
    counts = {}
    for sent in data:
        system = sent['system']
        counts[system] = counts.get(system, 0) + 1
    for system, count in counts.items():
        print(f"{count}: {system}")
# Print per-system counts for each split, separated by a blank line.
number_annotations_per_system(simpeval_22)
print()
number_annotations_per_system(simpeval_ext)

# Write each split to its own JSON file, indented by 4 spaces.
for out_path, records in (
    ("salsa/simpeval_22_ajudicated.json", simpeval_22),
    ("salsa/simpeval_ext_ajudicated.json", simpeval_ext),
):
    with open(out_path, "w") as f:
        json.dump(records, f, indent=4)

INFO:Loading files: ['../data/inspection_rating_annotated/batch_1_ayush.json', '../data/inspection_rating_annotated/batch_1_rachel.json', '../data/inspection_rating_annotated/batch_1_vinayak.json', '../data/inspection_rating_annotated/batch_1_vishnesh.json', '../data/inspection_rating_annotated/batch_2_ayush.json', '../data/inspection_rating_annotated/batch_2_rachel.json', '../data/inspection_rating_annotated/batch_2_vinayak.json', '../data/inspection_rating_annotated/batch_2_vishnesh.json', '../data/inspection_rating_annotated/batch_3_ayush.json', '../data/inspection_rating_annotated/batch_3_rachel.json', '../data/inspection_rating_annotated/batch_3_vishnesh.json', '../data/inspection_rating_annotated/batch_4_ayush.json']



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


INFO:Found users: {'vishnesh', 'vinayak', 'ayush', 'rachel'}



180: simpeval-22/Muss
169: simpeval-22/GPT-3-few-shot
180: simpeval-22/Human-2-written
170: simpeval-22/T5-11B
180: simpeval-22/Human-1-written
170: simpeval-22/GPT-3-zero-shot
169: simpeval-22/T5-3B

53: simpeval-ext/Human-1-written
63: simpeval-ext/GPT-3-zero-shot
60: simpeval-ext/Human-2-written
63: simpeval-ext/Muss
60: simpeval-ext/T5-3B
57: simpeval-ext/GPT-3-few-shot
59: simpeval-ext/T5-11B


### LENS-SALSA Training Data

In [None]:
# Load the same directory again, this time with preprocess=True. This rebinds
# `data`, so every later cell sees this version, not the preprocess=False load.
# NOTE(review): later cells read sent['score'] and sent['subscores']; presumably
# preprocessing is what adds them — confirm in load_data.
data = load_data('../data/inspection_rating_annotated', preprocess=True, adjudicated=True)

In [None]:
# LENS uses a CSV input format:

# original_id
# original
# generation
# system
# sentence_type
# rating_1
# rating_2
# rating_3
# rating_1_zscore
# rating_2_zscore
# rating_3_zscore

# For our first output, we need these columns:

# sentence_id
# original
# generation
# system
# simpeval_score
# salsa_score
# salsa_lexical_score
# salsa_syntax_score
# salsa_conceptual_score
# word_qe - Edits are surrounded by <bad></bad>, <good></good>
# word_ratings - Edits are surrounded by <3></3>, <2></2>, etc.
# edit_qe - Edits are surrounded by <edit></edit>

# The last three columns (word_qe, word_ratings, edit_qe) are generated multiple
# times: once over all edits, then once per edit type.

In [129]:
def get_tag_values(tags, family_constraint=None):
    """Reduce the annotations attached to each tag to three per-tag labels.

    Args:
        tags: Mapping from tag to a mapping whose values are lists of annotation
            dicts. Each annotation is read for 'word_qe', 'word_rating',
            'edit_qe' and, when filtering, 'family'.
        family_constraint: If truthy, only annotations whose 'family' equals
            this value are considered.

    Returns:
        A tuple ``(word_qe, word_ratings, edit_qe)`` of lists, one item per tag
        in iteration order of ``tags``:
          * word_qe: 'bad' if any annotation is bad, else 'good' if any is
            good, else 'ok'.
          * word_ratings: the non-None rating with the largest absolute value,
            or 0 when there is none (the earliest rating wins a tie).
          * edit_qe: 'edit' if any annotation has a truthy 'edit_qe', else
            'noedit'.
    """
    qe_labels, rating_values, edit_labels = [], [], []

    for groups in tags.values():
        # Flatten every annotation list attached to this tag.
        annotations = []
        for group in groups.values():
            annotations.extend(group)
        if family_constraint:
            annotations = [a for a in annotations if a['family'] == family_constraint]

        # Quality precedence: bad > good > ok.
        qualities = [a['word_qe'] for a in annotations]
        if 'bad' in qualities:
            qe_labels.append('bad')
        elif 'good' in qualities:
            qe_labels.append('good')
        else:
            qe_labels.append('ok')

        # Largest-magnitude rating wins; the trailing 0 is the fallback.
        candidates = [a['word_rating'] for a in annotations if a['word_rating'] is not None]
        candidates.append(0)
        rating_values.append(max(candidates, key=abs))

        # A single annotation flagged as an edit marks the whole tag.
        edit_flags = [a['edit_qe'] for a in annotations]
        edit_labels.append('edit' if any(edit_flags) else 'noedit')

    return qe_labels, rating_values, edit_labels

In [96]:
# Recover the sentence iteratively, as in only switch tags when the
# array value has changed
def write_tagged_sentence(sent, tags, tag_values):
    """Render ``sent`` with runs of equal tag values wrapped in ``<value>...</value>``.

    Consecutive spans that share a value are merged into one tagged run, joined
    by a single space; a new tag is opened only when the value changes.

    Args:
        sent: The sentence text that the spans index into.
        tags: Mapping whose keys are spans; ``span[0]`` and ``span[1]`` are used
            as the start and end offsets into ``sent``. Iteration order
            determines output order.
        tag_values: Sequence with one value per span, in the same order as
            ``tags``.

    Returns:
        The tagged string, or ``""`` when ``tags`` has no spans.

    Raises:
        IndexError: if ``tag_values`` has fewer items than ``tags``.
    """
    spans = list(tags.keys())
    if not spans:
        # Nothing to tag; without this guard the result would be "</None>".
        return ""

    pieces = []
    prev_value = None
    for i, span in enumerate(spans):
        curr_value = tag_values[i]
        if i == 0:
            # Decide "first span" by position rather than by comparing the
            # previous value to None, so a real None tag value is not mistaken
            # for the start of the sentence.
            pieces.append(f"<{curr_value}>")
        elif prev_value != curr_value:
            pieces.append(f"</{prev_value}> <{curr_value}>")
        else:
            pieces.append(" ")

        pieces.append(f'{sent[span[0]:span[1]]}')
        prev_value = curr_value

    pieces.append(f'</{prev_value}>')
    return "".join(pieces)

"<1>The architecture of Winchester College is</1> <0>a</0> <1>diverse set</1> <0>of architectural styles,</0> <1>reflecting</1> <0>the multiple periods of building from</0> <1>the college's foundation</1> <0>in 1382,</0> <1>through additions in the medieval and Early Modern periods,</1> <0>to a major expansion of accommodation in the Victorian era</0> <1>and then</1> <2>further extensions at the turn of the 20th century</2> <1>and more recently.</1>"

In [136]:
# Build one flat row per annotated sentence: sentence-level scores plus tagged
# renderings of the original and simplified text.
out_file = []
for sent in data:
    entry = {
        'sentence_id': sent['sentence_id'],
        'original': sent['original'],
        'generation': sent['simplified'],
        'system': sent['system'],
        'salsa_score': sent['score'],
        # Filled in below only when the sentence carries SimpEval scores.
        'simpeval_score_1': None,
        'simpeval_score_2': None,
        'simpeval_score_3': None,

        # Subscores
        'salsa_lexical_quality_score': sent['subscores']['quality_lexical'],
        'salsa_syntax_quality_score': sent['subscores']['quality_syntax'],
        'salsa_conceptual_quality_score': sent['subscores']['quality_content'],

        'salsa_lexical_error_score': sent['subscores']['error_lexical'],
        'salsa_syntax_error_score': sent['subscores']['error_syntax'],
        'salsa_conceptual_error_score': sent['subscores']['error_content'],

        'salsa_lexical_score': sent['subscores']['lexical'],
        'salsa_syntax_score': sent['subscores']['syntax'],
        'salsa_conceptual_score': sent['subscores']['content'],

        'salsa_quality_score': sent['subscores']['quality'],
        'salsa_error_score': sent['subscores']['error'],
    }

    if sent['simpeval_scores'] is not None:
        entry.update({
            'simpeval_score_1': sent['simpeval_scores'][0],
            'simpeval_score_2': sent['simpeval_scores'][1],
            'simpeval_score_3': sent['simpeval_scores'][2],
        })

    # Generate word-level QE: one unconstrained pass (None), then one per family.
    for family_constraint in [None] + list(Family):
        # Column suffix for this pass. It must come from the family being
        # processed: the previous code always used Family.CONTENT, so every
        # family overwrote the same '*_content' columns and only the last
        # family's values survived, mislabelled as content.
        fam_name = f'_{family_constraint.value.lower()}' if family_constraint is not None else ''

        for sentence_type in ['original', 'simplified']:
            tags = get_annotations_per_token([sent], sentence_type, remove_none=False, tagging=True)
            word_qe, word_ratings, edit_qe = get_tag_values(tags, family_constraint)

            word_qe = write_tagged_sentence(sent[sentence_type], tags, word_qe)
            word_ratings = write_tagged_sentence(sent[sentence_type], tags, word_ratings)
            edit_qe = write_tagged_sentence(sent[sentence_type], tags, edit_qe)

            entry.update({
                f'word_qe_{sentence_type}{fam_name}': word_qe,
                f'word_ratings_{sentence_type}{fam_name}': word_ratings,
                f'edit_qe_{sentence_type}{fam_name}': edit_qe,
            })

    out_file += [entry]

In [139]:
import csv

# Column order follows the key order of the first row.
headers = list(out_file[0])
with open('salsa/lens_salsa_training.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    writer.writerows(out_file)