In [1]:
import os
from sys import path
from os.path import dirname as dir

path.append("/Users/yaod_1/Desktop/webpage/text-simplification/ts-annoation-tool/data/analysis")

from utils.all import *
import numpy as np
import pandas as pd

### 1. Load data

In [2]:
# Batches 5 and 6 are new-wiki-1 batches that used GPT-3 davinci-2; their
# GPT-3 zero-shot and few-shot outputs are dropped here.
# In those two batches one source sentence is dropped for every system. The
# sentence is matched verbatim against the "original" field (see
# EXCLUDED_ORIGINAL below).
EXCLUDED_SYSTEMS = ("new-wiki-1/GPT-3-few-shot", "new-wiki-1/GPT-3-zero-shot")
EXCLUDED_ORIGINAL = "In a difficult situation, it encouraged him to study Graphic Design in 2007, and he's been since as a Cinematographer in the film industry, also his have a lot of experience with photography and experience in the field of graphic design."

data = []
batch_num = list(range(5, 12))
for batch_id in batch_num:
    # Load one batch at a time so the batch-specific filter can be applied.
    batch_data = load_data('../annotated', batch_num=[batch_id], preprocess=True)
    if batch_id in (5, 6):
        batch_data = [
            record
            for record in batch_data
            if record["original"] != EXCLUDED_ORIGINAL
            and record['system'] not in EXCLUDED_SYSTEMS
        ]
    data.extend(batch_data)
        

Loading files: ['../annotated/batch_5_anton.json', '../annotated/batch_5_ayush.json', '../annotated/batch_5_kelly.json', '../annotated/batch_5_rachel.json', '../annotated/batch_5_vinayak.json', '../annotated/batch_5_vishnesh.json']

Found users: {'vishnesh', 'vinayak', 'anton', 'rachel', 'ayush', 'kelly'}

anton - Batch 5, HIT 38 (ID 23) has 2 deletion edits but 1 annotations. Likely a missing annotation. Skipping edit type...
rachel - Batch 5, HIT 21 (ID 39) has 4 deletion edits but 3 annotations. Likely a missing annotation. Skipping edit type...
rachel - Batch 5, HIT 21 (ID 39) has 4 insertion edits but 3 annotations. Likely a missing annotation. Skipping edit type...
rachel - Batch 5, HIT 23 (ID 41) has 1 reorder edits but -1 annotations. Likely a missing annotation. Skipping edit type...
vinayak - Batch 5, HIT 25 (ID 56) has 5 deletion edits but 3 annotations. Likely a missing annotation. Skipping edit type...
vinayak - Batch 5, HIT 25 (ID 56) has 2 insertion edits but 1 annotatio

In [3]:
# Peek at one raw edit record: the edit at index 8 of the first loaded annotation.
first_annotation = data[0]
first_annotation["edits"][8]

{'type': 'split',
 'id': 0,
 'original_span': [(81, 91)],
 'simplified_span': [(71, 73), (74, 78), (79, 86)],
 'annotation': ['positive', 'a lot', '', 'no'],
 'composite_edits': [{'type': 'substitution',
   'id': 0,
   'original_span': [(81, 91)],
   'simplified_span': [(79, 86)]},
  {'type': 'insertion',
   'id': 0,
   'original_span': None,
   'simplified_span': [(74, 78)]}],
 'token_length': 4}

### 2. Prepare data for Roberta and T5

It is in the following format:

*Input*: [CLS] original sentence [SEP] simplified <special> ... </special> sentence [SEP]

*Label*: quality, trivial, error

Only the sentence1, sentence2 and label columns are needed; Hugging Face will automatically add [CLS] and [SEP].

In [4]:
# Opening / closing marker tokens wrapped around each annotated span, keyed by edit type.
edit_2_special_token_start = {"deletion": "<DEL>", "insertion": "<INS>", "substitution": "<SUB>", "reorder": "<REO>", 
                                "structure": "<STR>", "split": "<SPLIT>"}
edit_2_special_token_end = {"deletion": "</DEL>", "insertion": "</INS>", "substitution": "</SUB>", "reorder": "</REO>",
                                "structure": "</STR>", "split": "</SPLIT>"}


def tag_edit_spans(text, spans, edit_type, start_tokens, end_tokens, split_sign=None):
    """Return `text` with every span wrapped in the marker tokens of `edit_type`.

    Args:
        text: The sentence the spans index into.
        spans: Iterable of (start, end) character offsets, or None / empty
            when the edit does not touch this sentence.
        edit_type: Key into `start_tokens` / `end_tokens`.
        start_tokens: Mapping from edit type to opening marker.
        end_tokens: Mapping from edit type to closing marker.
        split_sign: When given, a span whose text is exactly "||" is replaced
            by this token instead of being wrapped. When None, no span gets
            that special treatment.

    Returns:
        The tagged sentence; `text` unchanged when there are no spans.
    """
    if not spans:
        # None or an empty list: nothing to tag (and no spans[0] to index).
        return text
    # sorted() leaves the caller's span list untouched, unlike list.sort().
    ordered = sorted(spans, key=lambda span: (span[0], span[1]))
    pieces = [text[:ordered[0][0]]]
    for idx, span in enumerate(ordered):
        fragment = text[span[0]:span[1]]
        if split_sign is not None and fragment == "||":
            pieces.append(split_sign)
        else:
            pieces.append(start_tokens[edit_type])
            pieces.append(fragment)
            pieces.append(end_tokens[edit_type])
        if idx < len(ordered) - 1:
            # Untagged text between this span and the next one.
            pieces.append(text[span[1]:ordered[idx + 1][0]])
    pieces.append(text[ordered[-1][1]:])
    return "".join(pieces)


original_list = []
sentence1_list = []
sentence2_list = []
label_list = []

# One output row per processed annotation (edit) of every sentence pair.
for annotation in data:
    original = annotation['original']
    simplified = annotation['simplified']
    for edit in annotation["processed_annotations"]:
        edit_type = edit["edit_type"]
        # edit["type"] exposes .name; its lower-cased form is used as the label.
        label = edit["type"].name.lower()

        new_original = tag_edit_spans(
            original, edit["original_span"], edit_type,
            edit_2_special_token_start, edit_2_special_token_end,
        )
        new_simplified = tag_edit_spans(
            simplified, edit["simplified_span"], edit_type,
            edit_2_special_token_start, edit_2_special_token_end,
            split_sign="<SPLIT_SIGN>",
        )

        # Drop any remaining "|| " separators and the space that follows a
        # tagged split sign.
        new_simplified = new_simplified.replace("|| ", "")
        new_simplified = new_simplified.replace("<SPLIT_SIGN> ", "<SPLIT_SIGN>")

        original_list.append(original)
        sentence1_list.append(new_original)
        sentence2_list.append(new_simplified)
        label_list.append(label)

In [5]:
# One row per annotated edit.
df = pd.DataFrame({"original": original_list, "sentence1": sentence1_list, "sentence2": sentence2_list, "label": label_list})

# Split on unique *original* sentences rather than on rows, so all edits of
# one source sentence land in the same split. Despite its name,
# unique_sentence1 holds the untagged originals.
unique_sentence1 = df["original"].unique().tolist()
# train, val, test split, 80%, 10%, 10% (in order of first appearance; no shuffling)
train, val, test = np.split(unique_sentence1, [int(.8*len(unique_sentence1)), int(.9*len(unique_sentence1))])

train_df = df[df["original"].isin(train)]
val_df = df[df["original"].isin(val)]
test_df = df[df["original"].isin(test)]
# Every row must fall into exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "train/val/test do not cover all rows"

# save to csv; to_csv does not create a missing output directory itself
roberta_dir = "roberta_data"
os.makedirs(roberta_dir, exist_ok=True)
train_df.to_csv(os.path.join(roberta_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(roberta_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(roberta_dir, "test.csv"), index=False)

In [6]:
# Row counts of the train / val / test frames, in that order.
tuple(len(split_df) for split_df in (train_df, val_df, test_df))

(10125, 1407, 1579)

In [7]:
# Number of training rows per label.
train_labels = train_df["label"]
train_labels.value_counts()

quality    7828
error      1384
trivial     913
Name: label, dtype: int64

In [8]:
# Tally how often each label occurs in label_list.
from collections import Counter
label_counts = Counter()
label_counts.update(label_list)
label_counts

Counter({'quality': 10250, 'trivial': 1131, 'error': 1730})

### 3. Prepare GPT-3 data

The difference is the use of understandable markers, such as "<deletion>", "<insertion>", "<substitution>" ...

In [9]:
# Human-readable opening / closing markers wrapped around each annotated span, keyed by edit type.
edit_2_special_token_start = {"deletion": "<deletion>", "insertion": "<insertion>", "substitution": "<substitution>", "reorder": "<reorder>", 
                                "structure": "<structure>", "split": "<split>"}
edit_2_special_token_end = {"deletion": "</deletion>", "insertion": "</insertion>", "substitution": "</substitution>", "reorder": "</reorder>",
                                "structure": "</structure>", "split": "</split>"}


def tag_edit_spans(text, spans, edit_type, start_tokens, end_tokens, split_sign=None):
    """Return `text` with every span wrapped in the marker tokens of `edit_type`.

    Args:
        text: The sentence the spans index into.
        spans: Iterable of (start, end) character offsets, or None / empty
            when the edit does not touch this sentence.
        edit_type: Key into `start_tokens` / `end_tokens`.
        start_tokens: Mapping from edit type to opening marker.
        end_tokens: Mapping from edit type to closing marker.
        split_sign: When given, a span whose text is exactly "||" is replaced
            by this token instead of being wrapped. When None, no span gets
            that special treatment.

    Returns:
        The tagged sentence; `text` unchanged when there are no spans.
    """
    if not spans:
        # None or an empty list: nothing to tag (and no spans[0] to index).
        return text
    # sorted() leaves the caller's span list untouched, unlike list.sort().
    ordered = sorted(spans, key=lambda span: (span[0], span[1]))
    pieces = [text[:ordered[0][0]]]
    for idx, span in enumerate(ordered):
        fragment = text[span[0]:span[1]]
        if split_sign is not None and fragment == "||":
            pieces.append(split_sign)
        else:
            pieces.append(start_tokens[edit_type])
            pieces.append(fragment)
            pieces.append(end_tokens[edit_type])
        if idx < len(ordered) - 1:
            # Untagged text between this span and the next one.
            pieces.append(text[span[1]:ordered[idx + 1][0]])
    pieces.append(text[ordered[-1][1]:])
    return "".join(pieces)


original_list = []
sentence1_list = []
sentence2_list = []
label_list = []

# One output row per processed annotation. Spans, edit type and label are all
# read from the same processed_annotations entry, as in the RoBERTa/T5
# preparation. Pairing annotation["edits"][i] with processed_annotations[i]
# is only right if the two lists are index-aligned.
# NOTE(review): the loader log reports skipped edit types, so that alignment
# is presumably not guaranteed; confirm against load_data.
for annotation in data:
    original = annotation['original']
    simplified = annotation['simplified']
    for edit in annotation["processed_annotations"]:
        edit_type = edit["edit_type"]
        # edit["type"] exposes .name; its lower-cased form is used as the label.
        label = edit["type"].name.lower()

        new_original = tag_edit_spans(
            original, edit["original_span"], edit_type,
            edit_2_special_token_start, edit_2_special_token_end,
        )
        new_simplified = tag_edit_spans(
            simplified, edit["simplified_span"], edit_type,
            edit_2_special_token_start, edit_2_special_token_end,
            split_sign="<split_sign>",
        )

        # Drop any remaining "|| " separators and the space that follows a
        # tagged split sign.
        new_simplified = new_simplified.replace("|| ", "")
        new_simplified = new_simplified.replace("<split_sign> ", "<split_sign>")

        original_list.append(original)
        sentence1_list.append(new_original)
        sentence2_list.append(new_simplified)
        label_list.append(label)

In [10]:
# One row per annotated edit.
df = pd.DataFrame({"original": original_list, "sentence1": sentence1_list, "sentence2": sentence2_list, "label": label_list})

# Split on unique *original* sentences rather than on rows, so all edits of
# one source sentence land in the same split. Despite its name,
# unique_sentence1 holds the untagged originals.
unique_sentence1 = df["original"].unique().tolist()
# train, val, test split, 80%, 10%, 10% (in order of first appearance; no shuffling)
train, val, test = np.split(unique_sentence1, [int(.8*len(unique_sentence1)), int(.9*len(unique_sentence1))])

train_df = df[df["original"].isin(train)]
val_df = df[df["original"].isin(val)]
test_df = df[df["original"].isin(test)]
# Every row must fall into exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "train/val/test do not cover all rows"

# save to csv; to_csv does not create a missing output directory itself
gpt3_dir = "gpt3_data"
os.makedirs(gpt3_dir, exist_ok=True)
train_df.to_csv(os.path.join(gpt3_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(gpt3_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(gpt3_dir, "test.csv"), index=False)