### Data Ingestion
Create `simpeval_22_ajudicated.json` and `simpeval_ext_ajudicated.json`

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../analysis')
# sys.path.append('C:\Python39\Lib\site-packages\matplotlib')
# import matplotlib.artist
from utils.all import *
from data_util import *
log.basicConfig(level=log.INFO)

# Load the annotations without preprocessing (adjudicated=True)
data = load_data(
    '../data/inspection_rating_annotated',
    preprocess=False,
    adjudicated=True,
)

# Bucket each sentence by the benchmark named inside its system identifier;
# sentences matching neither name are left out of both lists
simpeval_22 = []
simpeval_ext = []
for sent in data:
    system_name = sent['system']
    if 'simpeval-22' in system_name:
        simpeval_22.append(sent)
    elif 'simpeval-ext' in system_name:
        simpeval_ext.append(sent)

# Sanity check: Count the # annotations per system
def number_annotations_per_system(data):
    """Print one ``"<count>: <system>"`` line for every distinct system in ``data``.

    Lines are printed in sorted system order so repeated runs give the same
    output (iterating a ``set`` of strings does not guarantee a stable order),
    and the counts are gathered in a single pass over ``data``.

    Args:
        data: iterable of dicts, each with a ``'system'`` key.

    Returns:
        None. The counts are written to stdout; nothing is printed for empty input.
    """
    # Local import keeps the cell self-contained regardless of what the star imports expose
    from collections import Counter

    counts = Counter(s['system'] for s in data)
    # key=str keeps the ordering well-defined even if a system identifier is not a string
    for system in sorted(counts, key=str):
        print(f"{counts[system]}: {system}")
# Print the per-system counts of both subsets, separated by a blank line
number_annotations_per_system(simpeval_22)
print()
number_annotations_per_system(simpeval_ext)

# Write each subset to its own JSON file, indented by 4 spaces
for out_path, subset in (
    ("salsa/simpeval_22_ajudicated.json", simpeval_22),
    ("salsa/simpeval_ext_ajudicated.json", simpeval_ext),
):
    with open(out_path, "w") as f:
        json.dump(subset, f, indent=4)

180: simpeval-22/GPT-3-few-shot
180: simpeval-22/Human-1-written
180: simpeval-22/Muss
180: simpeval-22/GPT-3-zero-shot
180: simpeval-22/T5-11B
180: simpeval-22/T5-3B
180: simpeval-22/Human-2-written

120: simpeval-ext/Human-1-written
120: simpeval-ext/GPT-3-zero-shot
120: simpeval-ext/Human-2-written
120: simpeval-ext/GPT-3-few-shot
120: simpeval-ext/T5-3B
120: simpeval-ext/Muss
120: simpeval-ext/T5-11B


### LENS-SALSA Training Data

In [2]:
# Load the annotations again, this time with preprocessing enabled
data = load_data(
    '../data/inspection_rating_annotated',
    preprocess=True,
    adjudicated=True,
)

In [3]:
# Add LENS scores
with open('../lens/4-scores.json', 'r') as f:
    scores = json.load(f)

# Index the metric records by their (original, simplified) pair so each sentence
# needs a single dictionary lookup instead of a scan over every record.
# When several records share a pair the later one overwrites the earlier one,
# which matches the "last match wins" result of the previous nested loop.
scores_by_pair = {
    (record['original'], record['simplified']): record
    for record in scores
}

for sent in data:
    record = scores_by_pair.get((sent['original'], sent['simplified']))
    if record is None:
        # No metric record for this pair; the sentence is dropped by the filter below
        continue
    sent['lens_score'] = record['lens']
    sent['bleu'] = record['bleu']
    sent['bertscore'] = record['bertscore']
    sent['sari'] = record['sari']
    sent['comet'] = record['comet']

# Exclude corrupted sentences with no scores
data = [s for s in data if 'bleu' in s]

In [10]:
def package_data(data):
    """Convert annotated sentences into flat row dicts for the LENS-SALSA export.

    Each row holds the sentence pair, its system and SALSA score, any automatic
    metric values already attached to the sentence (``None`` when absent), the
    three SimpEval scores when available, the SALSA subscores, word-level tagged
    sentences, edit-type tagged sentences and word-alignment strings.

    Fix over the previous version: the family-constrained word-level tags were
    all stored under the ``_content`` suffix because ``Family.CONTENT`` was
    hard-coded, so every family overwrote the one before it. The suffix is now
    derived from the family being processed, giving one set of columns per family.

    Args:
        data: iterable of sentence dicts. The keys ``original``, ``simplified``,
            ``system``, ``score``, ``subscores`` and ``simpeval_scores`` are read
            directly; ``bleu``, ``bertscore``, ``sari``, ``comet`` and
            ``lens_score`` are optional.

    Returns:
        list of dict: one packaged entry per input sentence, in input order.
    """
    out_file = []
    for sent in data:
        subscores = sent['subscores']
        entry = {
            # 'sentence_id': sent['sentence_id'],
            'lp': 'en-en',
            'src': sent['original'],
            'mt': sent['simplified'],
            'system': sent['system'],
            'salsa_score': sent['score'],

            # Automatic metrics are optional: missing ones are exported as None
            'bleu': sent.get('bleu'),
            'bertscore': sent.get('bertscore'),
            'sari': sent.get('sari'),
            'comet': sent.get('comet'),
            'lens_score': sent.get('lens_score'),

            'simpeval_score_1': None,
            'simpeval_score_2': None,
            'simpeval_score_3': None,

            # Subscores
            'salsa_lexical_quality_score': subscores['quality_lexical'],
            'salsa_syntax_quality_score': subscores['quality_syntax'],
            'salsa_conceptual_quality_score': subscores['quality_content'],

            'salsa_lexical_error_score': subscores['error_lexical'],
            'salsa_syntax_error_score': subscores['error_syntax'],
            'salsa_conceptual_error_score': subscores['error_content'],

            'salsa_lexical_score': subscores['lexical'],
            'salsa_syntax_score': subscores['syntax'],
            'salsa_conceptual_score': subscores['content'],

            'salsa_quality_score': subscores['quality'],
            'salsa_error_score': subscores['error'],
        }

        # Fill in the three SimpEval scores only when the sentence has them
        simpeval_scores = sent['simpeval_scores']
        if simpeval_scores is not None:
            entry['simpeval_score_1'] = simpeval_scores[0]
            entry['simpeval_score_2'] = simpeval_scores[1]
            entry['simpeval_score_3'] = simpeval_scores[2]

        # Generate word-level QE
        for family_constraint in [None] + list(Family):
            # Column suffix names the family these tags are restricted to
            # ('' for the unconstrained pass)
            if family_constraint is None:
                fam_name = ''
            else:
                fam_name = f'_{family_constraint.value.lower()}'

            for sentence_type in ['original', 'simplified']:
                tags = get_annotations_per_token([sent], sentence_type, remove_none=False, tagging=True)
                tags_by_type = get_tag_values(tags, family_constraint)

                for tag_type, tag_value in tags_by_type.items():
                    tag_value = write_tagged_sentence(sent[sentence_type], tags, tag_value)

                    # Strip the <ok> and <noedit> markers, keeping only the remaining tags
                    for marker in ('<ok>', '</ok>', '<noedit>', '</noedit>'):
                        tag_value = tag_value.replace(marker, '')

                    entry[f'{tag_type}_{sentence_type}{fam_name}'] = tag_value

        # Generate edit type tagging
        for sentence_type in ['original', 'simplified']:
            tags = get_annotations_per_token([sent], sentence_type, collapse_composite=True, remove_reorder=True, remove_none=False)
            tags_by_type = get_edit_values(tags)

            for tag_type, tag_value in tags_by_type.items():
                tag_value = write_tagged_sentence(sent[sentence_type], tags, tag_value)
                tag_value = tag_value.replace('<ok>', '').replace('</ok>', '')
                entry[f'{tag_type}_{sentence_type}'] = tag_value

        # Generate traditional alignment setup
        input_tags = get_annotations_per_token([sent], 'original', remove_none=False, tagging=True)
        output_tags = get_annotations_per_token([sent], 'simplified', remove_none=False, tagging=True)
        entry.update({
            'alignment': get_word_alignment_string(sent),
            'alignment-no-phrases': get_word_alignment_string(sent, collapse_phrase_alignment=True),
            'alignment-error-labels-input': ' '.join(get_tag_values(input_tags)['word_qe_error_types']),
            'alignment-error-labels-output': ' '.join(get_tag_values(output_tags)['word_qe_error_types']),
        })

        out_file.append(entry)
    return out_file

In [5]:
# Get number of edit types
# Flatten every sentence's processed annotations into a single list of edits
edits = [edit for s in data for edit in s['processed_annotations']]

# An edit with any error type counts as an error; all remaining edits count as quality
num_error = sum(1 for e in edits if e['error_type'] is not None)
num_complex = sum(1 for e in edits if e['error_type'] == Error.COMPLEX_WORDING)
num_bad_del = sum(1 for e in edits if e['error_type'] == Error.BAD_DELETION)
num_quality = len(edits) - num_error

# One LaTeX table row: quality, bad deletion, complex wording, all errors
print(" & ".join(
    "\\textit{" + str(count) + "}"
    for count in (num_quality, num_bad_del, num_complex, num_error)
))

\textit{13057} & \textit{628} & \textit{449} & \textit{1901}


In [6]:
# Get number of tokens
edits = [edit for s in data for edit in s['processed_annotations']]

# Despite the num_ prefix these hold one token_size per edit; counts and totals are derived below
num_error = [e['token_size'] for e in edits if e['error_type'] is not None]
num_complex = [e['token_size'] for e in edits if e['error_type'] == Error.COMPLEX_WORDING]
num_bad_del = [e['token_size'] for e in edits if e['error_type'] == Error.BAD_DELETION]
num_quality = [e['token_size'] for e in edits if e['error_type'] is None] # e['type'] == Quality.QUALITY

# Tokens not covered by any edit: space-split tokens of both sentences minus all edit token sizes
total_tokens = sum(len(s['original'].split(' ')) + len(s['simplified'].split(' ')) for s in data)
edited_tokens = sum(e['token_size'] for e in edits)
num_same_tok = total_tokens - edited_tokens

# Row 1: number of edits per category
count_cells = [len(num_quality), len(num_bad_del), len(num_complex), len(num_error)]
print(" & ".join("\\textit{" + str(cell) + "}" for cell in count_cells) + " & -- & \\\\")

# Row 2: token totals in thousands (quality is truncated, the others are rounded)
kilo_cells = [
    str(int(sum(num_quality) / 1000)),
    str(round(sum(num_bad_del) / 1000, 1)),
    str(round(sum(num_complex) / 1000, 1)),
    str(round(sum(num_error) / 1000)),
    str(round(num_same_tok / 1000)),
]
print(" & ".join("\\textit{" + cell + "K}" for cell in kilo_cells) + " & \\\\")

\textit{13057} & \textit{628} & \textit{449} & \textit{1901} & -- & \\
\textit{55K} & \textit{4.3K} & \textit{1.8K} & \textit{12K} & \textit{82K} & \\


In [7]:
# Turn the sentences currently held in `data` into flat row dicts via package_data
train_data = package_data(data)

In [8]:
# NOTE(review): this uses the same path and flags as the training-data load above,
# so the "test" sentences come from the same directory - confirm the intended test-set location.
test_data = load_data(
    '../data/inspection_rating_annotated',
    preprocess=True,
    adjudicated=True,
)
# TODO: Add test data automatic metrics

In [11]:
# Replace the loaded test sentences with their packaged row dicts.
# NOTE: this rebinds `test_data`, so re-running this cell without re-running the
# load cell would feed already-packaged rows back into package_data.
test_data = package_data(test_data)

In [None]:
# Create a split on the main data. We don't need to do this as we collected a separate test set
# src_sents = list(set(s['src'] for s in out_file))
# split = int(len(src_sents) * 0.7)
# src_sents_train, src_sents_val = src_sents[:split], src_sents[split:]

# write_csv('salsa/lens-salsa-training/train.csv', [s for s in out_file if s['src'] in src_sents_train])
# write_csv('salsa/lens-salsa-training/valid.csv', [s for s in out_file if s['src'] in src_sents_val])

# # Write with & without collapsed alignment
# write_tsv_align('salsa/lens-salsa-training/train-align.tsv', [s for s in out_file if s['src'] in src_sents_train])
# write_tsv_align('salsa/lens-salsa-training/valid-align.tsv', [s for s in out_file if s['src'] in src_sents_val])

# write_tsv_align('salsa/lens-salsa-training/train-align-collapsed.tsv', [s for s in out_file if s['src'] in src_sents_train], collapse_phrase_alignment=True)
# write_tsv_align('salsa/lens-salsa-training/valid-align-collapsed.tsv', [s for s in out_file if s['src'] in src_sents_val], collapse_phrase_alignment=True)

In [12]:
# Write the packaged training rows and test rows to train.csv / valid.csv
for csv_path, packaged_rows in (
    ('salsa/lens-salsa-training/train.csv', train_data),
    ('salsa/lens-salsa-training/valid.csv', test_data),
):
    write_csv(csv_path, packaged_rows)

In [13]:
# Write with & without collapsed alignment
# First pass: alignment files written without passing collapse_phrase_alignment
for tsv_path, packaged_rows in (
    ('salsa/lens-salsa-training/train-align.tsv', train_data),
    ('salsa/lens-salsa-training/valid-align.tsv', test_data),
):
    write_tsv_align(tsv_path, packaged_rows)

# Second pass: the same rows with collapse_phrase_alignment=True
for tsv_path, packaged_rows in (
    ('salsa/lens-salsa-training/train-align-collapsed.tsv', train_data),
    ('salsa/lens-salsa-training/valid-align-collapsed.tsv', test_data),
):
    write_tsv_align(tsv_path, packaged_rows, collapse_phrase_alignment=True)

In [None]:
# For testing
# NOTE: `sent` is not defined in this cell; it is presumably whatever an earlier
# `for sent in ...` loop left behind, so run one of those cells first.
collapse_phrase_alignment = False

# Tags with alignment information, used to compute the word alignment
orig_tags = get_annotations_per_token([sent], 'original', collapse_composite=True, remove_reorder=True, \
    remove_none=False, get_alignment=True)
simp_tags = get_annotations_per_token([sent], 'simplified', collapse_composite=True, remove_reorder=True, \
    remove_none=False, get_alignment=True)

word_alignment = get_word_alignment(orig_tags, simp_tags, sent)
alignment = align_edits(word_alignment, orig_tags, simp_tags, sent, collapse_phrase_alignment)

# Same call without get_alignment; only the keys are used below, for position lookup
orig_tags = get_annotations_per_token([sent], 'original', collapse_composite=True, remove_reorder=True, \
    remove_none=False)
simp_tags = get_annotations_per_token([sent], 'simplified', collapse_composite=True, remove_reorder=True, \
    remove_none=False)

# Map every token key to its position. The previous `... is not set()` guard compared
# identity against a freshly created set, which is always True, so it never filtered
# anything; it is dropped to make that explicit (the resulting mapping is unchanged).
orig_ids = {k: i for i, k in enumerate(orig_tags.keys())}
simp_ids = {k: i for i, k in enumerate(simp_tags.keys())}

# Render the alignment as space-separated "<original index>-<simplified index>" pairs
' '.join([f"{orig_ids[x[0]]}-{simp_ids[x[1]]}" for x in alignment])

'0-0 0-1 0-2 1-0 1-1 1-2 2-0 2-1 2-2 3-0 3-1 3-2 4-0 4-1 4-2 5-3 6-4 7-5 8-5 9-6 10-7 11-8 12-11 14-12 15-13 16-14 17-15 18-16 19-17 19-18 19-19 19-20 19-21 20-17 20-18 20-19 20-20 20-21 21-17 21-18 21-19 21-20 21-21 22-22 23-23 24-24 24-25 25-33 26-34 26-35 27-36 28-37 29-38 30-39 31-40 32-41 33-42 34-43 35-44 36-45 37-46 38-47 39-48 40-49 41-50 42-51 45-26 45-27 45-28 45-29 46-26 46-27 46-28 46-29 47-26 47-27 47-28 47-29 48-26 48-27 48-28 48-29 49-26 49-27 49-28 49-29 50-26 50-27 50-28 50-29 51-26 51-27 51-28 51-29 52-26 52-27 52-28 52-29 53-26 53-27 53-28 53-29'