In [1]:
import sys
sys.path.append('../analysis/')

import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from utils.all_yao import *
import numpy as np
import pandas as pd

In [2]:
# Load the annotated inspection-rating data found under ../data/inspection_rating_annotated.
# load_data is not defined in this notebook (presumably it comes from the star import
# of utils.all_yao — confirm); what preprocess=True does is not visible here.
# Later cells iterate `data` as a sequence of dicts and read the keys
# 'system', 'original', 'simplified', 'user' and 'edits' from each item.
data = load_data('../data/inspection_rating_annotated', preprocess=True)

INFO:Loading files: ['../data/inspection_rating_annotated/batch_1_ayush.json', '../data/inspection_rating_annotated/batch_1_rachel.json', '../data/inspection_rating_annotated/batch_1_vinayak.json', '../data/inspection_rating_annotated/batch_1_vishnesh.json', '../data/inspection_rating_annotated/batch_2_ayush.json', '../data/inspection_rating_annotated/batch_2_rachel.json', '../data/inspection_rating_annotated/batch_2_vinayak.json', '../data/inspection_rating_annotated/batch_2_vishnesh.json', '../data/inspection_rating_annotated/batch_3_ayush.json', '../data/inspection_rating_annotated/batch_3_rachel.json', '../data/inspection_rating_annotated/batch_3_vinayak.json', '../data/inspection_rating_annotated/batch_3_vishnesh.json', '../data/inspection_rating_annotated/batch_4_ayush.json', '../data/inspection_rating_annotated/batch_4_rachel.json', '../data/inspection_rating_annotated/batch_4_vinayak.json', '../data/inspection_rating_annotated/batch_4_vishnesh.json']

INFO:Found users: {'ayush'

In [3]:
# The six edit type names handled by the annotation-conversion cell.
edits_types = [
    "deletion",
    "insertion",
    "substitution",
    "split",
    "reorder",
    "structure",
]

# Numeric magnitude assigned to each severity / efficacy answer word.
# Note that "a lot" and "very" both map to 3.
value_dict = dict(
    [
        ("minor", 1),
        ("somewhat", 2),
        ("a lot", 3),
        ("very", 3),
    ]
)


In [4]:
# system_orig_tuples = []
# annotation_words = []
# for point in data:
#     system_orig_tuples.append((point['system'], point['original']))
#     for edit in point['edits']:
#         annotation_words += edit['annotation']

In [5]:
# Convert every edit annotation (a list of answer words) into the fixed layout
#   [label, signed score, grammar-error answer, category]
# where label is "quality", "error" or "trivial", the score is +value_dict[...] for
# quality, -value_dict[...] for error and 0 for trivial.
# Update April 18: the error category is stored as the 4th element of the annotation.
#
# NOTE: edit['annotation'] is overwritten in place, so this cell is not idempotent —
# running it a second time would re-interpret the already converted lists.
# Edits whose type / first answer matches none of the branches keep their raw list.
for point in data:
    for edit in point['edits']:
        annotation = edit['annotation']
        if len(annotation) == 0:
            # Nothing was annotated for this edit; leave the empty list untouched.
            continue
        edit_type = edit['type']
        try:
            if edit_type == "deletion":
                if annotation[0] == "good":
                    edit['annotation'] = ["quality", value_dict[annotation[1]], annotation[2], "good deletion"]
                elif annotation[0] == "bad":
                    edit['annotation'] = ["error", -1 * value_dict[annotation[1]], annotation[2], "bad deletion"]
                else:
                    edit['annotation'] = ["trivial", 0, annotation[1], "unnecessary deletion"]
            elif edit_type == "insertion":
                if annotation[0] == "elaboration":
                    edit['annotation'] = ["quality", value_dict[annotation[1]], annotation[2], annotation[0]]
                elif annotation[0] == "trivial":
                    if annotation[1] == "yes":
                        edit['annotation'] = ["quality", value_dict[annotation[2]], annotation[3], annotation[0]]
                    else:
                        edit['annotation'] = ["trivial", 0, annotation[3], "unnecessary insertion"]
                else:
                    # Any other first answer is kept verbatim as the error category.
                    edit['annotation'] = ["error", -1 * value_dict[annotation[1]], annotation[2], annotation[0]]
            elif edit_type == "substitution":
                if annotation[0] == "same":
                    # NOTE(review): the negative branch reads its magnitude from annotation[3]
                    # and the positive branch from annotation[2], while split/reorder/structure
                    # read the negative magnitude from the earlier slot — confirm this ordering
                    # against the annotation schema.
                    if annotation[1] == "negative":
                        edit['annotation'] = ["error", -1 * value_dict[annotation[3]], annotation[4], "bad paraphrase"]
                    elif annotation[1] == "positive":
                        edit['annotation'] = ["quality", value_dict[annotation[2]], annotation[4], "good paraphrase"]
                    else:
                        edit['annotation'] = ["trivial", 0, annotation[4], "unnecessary paraphrase"]
                elif annotation[0] == "different":
                    edit['annotation'] = ["error", -1 * value_dict[annotation[1]], annotation[2], "different information"]
                elif annotation[0] == "less":
                    if annotation[1] == "bad":
                        edit['annotation'] = ["error", -1 * value_dict[annotation[2]], annotation[3], "remove important information"]
                    else:
                        edit['annotation'] = ["quality", value_dict[annotation[2]], annotation[3], "remove unimportant information"]
                elif annotation[0] == "more":
                    if annotation[1] == "elaboration":
                        edit['annotation'] = ["quality", value_dict[annotation[2]], annotation[3], annotation[1]]
                    elif annotation[1] == "trivial":
                        if annotation[2] == "yes":
                            edit['annotation'] = ["quality", value_dict[annotation[3]], annotation[4], annotation[1]]
                        else:
                            edit['annotation'] = ["trivial", 0, annotation[4], "unnecessary add information"]
                    else:
                        # Any other second answer is kept verbatim as the error category.
                        edit['annotation'] = ["error", -1 * value_dict[annotation[2]], annotation[3], annotation[1]]
            elif edit_type == "split" or edit_type == "reorder":
                if annotation[0] == "negative":
                    edit['annotation'] = ["error", -1 * value_dict[annotation[1]], annotation[3], f"bad {edit_type}"]
                elif annotation[0] == "positive":
                    edit['annotation'] = ["quality", value_dict[annotation[2]], annotation[3], f"good {edit_type}"]
                else:
                    edit['annotation'] = ["trivial", 0, annotation[3], f"unnecessary {edit_type}"]
            elif edit_type == "structure":
                # annotation[0] names the structural change and is reused as the category.
                if annotation[1] == "negative":
                    edit['annotation'] = ["error", -1 * value_dict[annotation[2]], annotation[4], annotation[0]]
                elif annotation[1] == "positive":
                    edit['annotation'] = ["quality", value_dict[annotation[3]], annotation[4], annotation[0]]
                else:
                    edit['annotation'] = ["trivial", 0, annotation[4], "unnecessary " + annotation[0]]
        except (LookupError, TypeError):
            # An incomplete annotation (answer word missing from value_dict, or too few
            # answers) ends up here. Only lookup/type failures are caught so that
            # KeyboardInterrupt and unrelated errors are no longer swallowed.
            print(edit_type, annotation)
            recovered = []
            try:
                if edit_type == "substitution" and annotation[0] == "different" and annotation[-1] == "":
                    recovered = ["error", -1 * value_dict[annotation[1]], "no", "different information"]
            except (LookupError, TypeError):
                # The recovery repeats a value_dict lookup that may be the very one that
                # failed above; discard the annotation instead of aborting the whole cell.
                recovered = []
            edit['annotation'] = recovered

deletion ['bad', '', 'no', '']
structure ['changes tense', 'positive', '', '', 'no']


In [6]:
# Restructure each edit's annotation into a dictionary keyed by the annotating user,
# i.e. annotation -> {user: annotation}, so several users can be merged per edit later.
for record in data:
    for item in record["edits"]:
        item.update(annotation={record["user"]: item["annotation"]})

In [7]:
# Group the records by their (system, original) pair. The first record seen for a pair
# supplies the "simplified" text and the list of edits; every later record for the same
# pair only contributes its user's annotation to the edits that match on type and id.
data_dict = {}
for record in data:
    pair = (record['system'], record['original'])
    annotator = record['user']
    incoming = record['edits']

    merged = data_dict.get(pair)
    if merged is None:
        # First time this pair shows up: adopt the record's edits as the base list.
        data_dict[pair] = {
            "simplified": record['simplified'],
            "edits": incoming,
        }
        continue

    for target in merged['edits']:
        wanted_type, wanted_id = target['type'], target['id']
        # First incoming edit with the same type and id, if there is one.
        match = next(
            (
                candidate
                for candidate in incoming
                if candidate['type'] == wanted_type and candidate['id'] == wanted_id
            ),
            None,
        )
        if match is not None:
            target['annotation'][annotator] = match['annotation'][annotator]


In [8]:
# Aggregate the per-user annotations of every edit into:
#   score              mean of the signed scores (annotation[1]) over usable annotations
#   round_score        round(score); Python rounds halves to the nearest even integer
#   label              "quality" if score > 0, "error" if score < 0, otherwise "trivial"
#   edit_category_list categories (annotation[3]) of the usable annotations
#   edit_category      the most frequent category
for key_tuple, value in data_dict.items():
    for edit in value['edits']:
        # An empty list stands for a missing or discarded annotation, so it is excluded.
        if len(edit['annotation']) == 0:
            valid_annotations = []
        else:
            valid_annotations = [
                annotation
                for annotation in edit['annotation'].values()
                if annotation != []
            ]

        if not valid_annotations:
            # No usable annotation from any user: fall back to the trivial defaults
            # rather than dividing by zero or calling max() on an empty sequence.
            edit['score'] = 0
            edit['round_score'] = 0
            edit['label'] = "trivial"
            edit["edit_category_list"] = []
            edit["edit_category"] = ""
            continue

        edit_category_list = [annotation[3] for annotation in valid_annotations]
        edit['score'] = sum(annotation[1] for annotation in valid_annotations) / len(valid_annotations)
        edit['round_score'] = round(edit['score'])
        if edit['score'] > 0:
            edit['label'] = "quality"
        elif edit['score'] < 0:
            edit['label'] = "error"
        else:
            edit['label'] = "trivial"
        edit["edit_category_list"] = edit_category_list
        # Most frequent category; taking max over the list (not a set) makes a tie
        # resolve to the category that appears first, so the result is reproducible.
        edit["edit_category"] = max(edit_category_list, key=edit_category_list.count)

In [9]:
# JSON object keys cannot be tuples, so each key tuple is flattened into a single
# string whose parts are separated by '%%'.
converted_data_dict = {}
for pair, payload in data_dict.items():
    converted_data_dict["%%".join(str(part) for part in pair)] = payload

# Write the merged, scored data to disk as indented JSON.
with open("../data/edit_classification/inspection_data/batch_1234.json", "w") as out_file:
    json.dump(converted_data_dict, out_file, indent=4)