### Prepare data for RoBERTa and T5

It is in the following format:

*Input*: [CLS] original sentence [SEP] simplified <special> ... </special> sentence [SEP]

*Label*: quality, trivial, error

Only the sentence1, sentence2 and label columns are needed; Hugging Face will automatically add [CLS] and [SEP].

In [1]:
import json

split_name = "test"

# Pin the encoding: JSON interchange text is UTF-8, and without this the
# decode would fall back to the platform's locale default.
with open(f"../data/edit_classification/inspection_data/{split_name}_1234.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

# Every JSON key is one "%%"-delimited string; convert each to a tuple so it
# can be unpacked downstream. Split only on the FIRST "%%" so that a sentence
# which itself contains "%%" still produces exactly two fields.
keys = [tuple(key.split('%%', 1)) for key in data]
# Re-key the annotations with the tuples (dict order is preserved, so
# keys and values stay aligned).
data = dict(zip(keys, data.values()))

In [2]:
# Opening / closing marker wrapped around an edited span, keyed by edit type.
edit_2_special_token_start = {"deletion": "<DEL>", "insertion": "<INS>", "substitution": "<SUB>", "reorder": "<REO>",
                                "structure": "<STR>", "split": "<SPLIT>"}
edit_2_special_token_end = {"deletion": "</DEL>", "insertion": "</INS>", "substitution": "</SUB>", "reorder": "</REO>",
                                "structure": "</STR>", "split": "</SPLIT>"}


def _mark_spans(text, spans, edit_type, split_sign=False):
    """Return ``text`` with each span wrapped in the markers for ``edit_type``.

    Args:
        text: The sentence to annotate.
        spans: Sequence of spans indexable as ``span[0]`` (start) and
            ``span[1]`` (end) into ``text``, or ``None``. ``None`` and an
            empty sequence both mean "nothing to mark" and return ``text``
            unchanged.
        edit_type: Key into ``edit_2_special_token_start`` /
            ``edit_2_special_token_end``.
        split_sign: When true, a span whose text is exactly ``"||"`` is
            replaced by ``"<SPLIT_SIGN>"`` instead of being wrapped.

    Raises:
        KeyError: If a span has to be wrapped and ``edit_type`` is not in
            the marker tables.
    """
    if not spans:
        return text

    # Sort by start, then end. ``sorted`` builds a new list, so the loaded
    # annotation data is left untouched.
    ordered = sorted(spans, key=lambda x: (x[0], x[1]))
    last = len(ordered) - 1

    # Text before the first span.
    pieces = [text[:ordered[0][0]]]
    for j, span in enumerate(ordered):
        fragment = text[span[0]:span[1]]
        if split_sign and fragment == "||":
            pieces.append("<SPLIT_SIGN>")
        else:
            pieces.append(edit_2_special_token_start[edit_type])
            pieces.append(fragment)
            pieces.append(edit_2_special_token_end[edit_type])
        if j < last:
            # Unmarked text between this span and the next one.
            pieces.append(text[span[1]:ordered[j + 1][0]])
    # Text after the last span.
    pieces.append(text[ordered[-1][1]:])
    return "".join(pieces)


original_list = []
sentence1_list = []
sentence2_list = []
label_list = []

# One output row per edit: the raw original sentence, the original and
# simplified sentences with that edit's spans marked, and the edit's label.
for key, annotation in data.items():
    # Keys are 2-tuples; only the second field (the original sentence) is used.
    _system, original = key
    simplified = annotation['simplified']
    for edit in annotation["edits"]:
        edit_type = edit["type"]
        label = edit["label"]

        new_original = _mark_spans(original, edit["original_span"], edit_type)
        new_simplified = _mark_spans(simplified, edit["simplified_span"], edit_type, split_sign=True)

        # Drop any remaining "|| " separators, and remove the space that
        # follows an inserted <SPLIT_SIGN>.
        new_simplified = new_simplified.replace("|| ", "")
        new_simplified = new_simplified.replace("<SPLIT_SIGN> ", "<SPLIT_SIGN>")

        original_list.append(original)
        sentence1_list.append(new_original)
        sentence2_list.append(new_simplified)
        label_list.append(label)

In [3]:
# Tally how many collected edits carry each label
from collections import Counter

label_counts = Counter(label_list)
label_counts

Counter({'error': 66, 'quality': 394, 'trivial': 73})

In [4]:
import pandas as pd

# Assemble the table: one row per edit, one column per collected list.
columns = {
    "original": original_list,
    "sentence1": sentence1_list,
    "sentence2": sentence2_list,
    "label": label_list,
}
df = pd.DataFrame(columns)

# Write the table to CSV for this split, leaving out the index column.
output_path = f"../data/edit_classification/inspection_data/classification_data/{split_name}_1234.csv"
df.to_csv(output_path, index=False)