In [26]:
"""
In this script, we want to convert annotations to conll style. There are several steps:
1.clustering input data
2.Turn cluster data into conll format
3.Try to run the preprocess script from HOI code base
"""

'\nIn this script, we want to convert annotations to conll style. There are several steps:\n1.clustering input data\n2.Turn cluster data into conll format\n3.Try to run the preprocess script from HOI code base\n'

In [2]:
import pickle as pkl
from copy import deepcopy
import jsonlines
from utils.my_util import cluster_mentions, remove_speaker_prefix
import json

## Prepare Dialogue Data

In [3]:
speaker_dict = {}
with open('data/raw_source/dialogue_en/all_coref_data_en_finalized.json', 'r') as f:
    temp = json.load(f)
    for line in temp:
        scene_id = line['scene_id']
        speakers = []
        for sent in line['sentences']:
            speakers.append(" ".join(sent[:sent.index(":")]))
        speaker_dict[scene_id] = speakers

split_dict = {"train":[], "dev":[], "test":[]}
with open('data/raw_source/dialogue_zh/dev_temp.pkl', 'rb') as f:
    temp = pkl.load(f)
    for line in temp:
        split_dict['dev'].append(line['scene_id'])
with open('data/raw_source/dialogue_zh/test_temp.pkl', 'rb') as f:
    temp = pkl.load(f)
    for line in temp:
        split_dict['test'].append(line['scene_id'])
with open('data/raw_source/dialogue_zh/train_temp.pkl', 'rb') as f:
    temp = pkl.load(f)
    for line in temp:
        split_dict['train'].append(line['scene_id'][:-1])

In [4]:
print(split_dict)

{'train': ['s01e01c00t', 's01e01c01t', 's01e01c03t', 's01e01c04t', 's01e01c05t', 's01e01c06t', 's01e01c07t', 's01e01c08t', 's01e01c09t', 's01e01c10t', 's01e03c00t', 's01e10c04t', 's01e03c01t', 's01e03c02t', 's01e03c03t', 's01e03c04t', 's01e03c05t', 's01e03c06t', 's01e03c07t', 's01e03c08t', 's01e03c09t', 's01e04c00t', 's01e04c01t', 's01e04c02t', 's01e04c03t', 's01e04c04t', 's01e04c05t', 's01e04c06t', 's01e04c07t', 's01e04c08t', 's01e04c09t', 's01e04c10t', 's01e04c11t', 's01e04c12t', 's01e05c00t', 's01e05c01t', 's01e05c02t', 's01e05c03t', 's01e05c04t', 's01e05c05t', 's01e05c06t', 's01e05c07t', 's01e07c01t', 's01e07c02t', 's01e07c03t', 's01e07c04t', 's01e07c05t', 's01e09c00t', 's01e09c01t', 's01e09c02t', 's01e09c03t', 's01e09c04t', 's01e09c05t', 's01e09c06t', 's01e07c00t', 's01e09c07t', 's01e10c00t', 's01e10c01t', 's01e10c02t', 's01e10c03t', 's01e10c05t', 's01e10c07t', 's01e11c00t', 's01e11c01t', 's01e11c02t', 's01e11c03t', 's01e11c04t', 's01e11c05t', 's01e11c06t', 's01e11c07t', 's01e11c0

In [37]:
data = []
all_ids = []
with open('data/raw_source/dialogue_en/all_coref_data_en_finalized.json', 'r') as f:
# with open('data/raw_source/dialogue_zh/dev-test-batch1_zh.json', 'r') as f:
    reader = jsonlines.Reader(f)
    for bulk in reader:
        for idx, instance in enumerate(bulk):
            if idx>=5:
                break

            scene_id = instance['scene_id']
            if scene_id == "":
                continue
            sentences = instance['sentences']

            for sent in sentences:
                print(sent.index(":"), sent)
            # print(sentences)
            # sentences = [[token for token in "".join(sent)] for sent in instance['sentences']]
            annotations = instance['annotations']
            all_ids.append(scene_id)
            speakers = speaker_dict[scene_id]
            answers = []
            for item in annotations:
                query = (item['query']['sentenceIndex'], item['query']['startToken']-1-sentences[item['query']['sentenceIndex']].index(":"), item['query']['endToken']-1-sentences[item['query']['sentenceIndex']].index(":"))
                antecedents = item['antecedents']
                if antecedents in [['n', 'o', 't', 'P', 'r', 'e', 's', 'e', 'n', 't'], ['null_projection'], ['empty_subtitle']]:
                    answers.append([query, "notPresent"])
                else:
                    temp_answer = []
                    for antecedent in antecedents:
                        if isinstance(antecedent, dict):
                            temp_answer.append((antecedent['sentenceIndex'], antecedent['startToken']-1-sentences[antecedent['sentenceIndex']].index(":"), antecedent['endToken']-1-sentences[antecedent['sentenceIndex']].index(":")))
                        else:
                            temp_answer = " ".join(antecedents)
                    answers.append([query, temp_answer])
            new_sentences = []
            for sent in sentences:
                new_sentences.append(sent[sent.index(":")+1:])

            data.append({
                "sentences": new_sentences,
                "answers": answers,
                "speakers": speakers,
                "scene_id": scene_id
            })

1 ['Sheldon', ':', 'If', 'a', 'photon', 'is', 'directed', 'through', 'a', 'plane', 'with', 'two', 'slits', 'in', 'it', 'and', 'either', 'is', 'observed', 'it', 'will', 'not', 'go', 'through', 'both', '.', 'If', 'unobserved', ',', 'it', 'will', '.', 'If', 'it', "'s", 'observed', 'after', 'it', 'left', 'the', 'plane', ',', 'before', 'it', 'hits', 'its', 'target', '...', 'it', 'will', 'not', 'have', 'gone', 'through', 'both', 'slits', '.']
1 ['Leonard', ':', 'Agreed', '.', 'What', "'s", 'your', 'point', '?']
1 ['Sheldon', ':', 'There', "'s", 'no', 'point', ',', 'I', 'just', 'think', 'it', "'s", 'a', 'good', 'idea', 'for', 'a', 'T', '-', 'shirt', '.']
1 ['Leonard', ':', 'Excuse', 'me', '.']
1 ['Receptionist', ':', 'Hang', 'on', '.']
1 ['Leonard', ':', 'One', 'across', 'is', 'Aegean', ',', 'eight', 'down', 'is', 'Nabokov', '.', 'Twenty', '-', 'six', 'across', 'is', 'MCM', '.', 'Fourteen', 'down', 'is', '...', 'Move', 'your', 'finger', '.', 'phylum', ',', 'which', 'makes', '14', 'across', 'P

In [36]:
split = "train"

document = []
for i in range(len(data)):
    # if i >= 2:
    #     continue
    sample = data[i]
    if sample['scene_id'] not in split_dict[split]:
        continue
    print(scene_id)
    # print(sample)
    # print()
    # original_sentences = sample['sentences']
    # original_clusters = cluster_mentions(sample['answers'], original_sentences)
    # sentences, clusters, speakers = remove_speaker_prefix(original_sentences, original_clusters)
    sentences = sample['sentences']
    clusters = cluster_mentions(sample['answers'], sentences)
    speakers = sample['speakers']
    scene_id = sample['scene_id']
    part = int(scene_id[7:9])
    begin_line = "#begin document " + "(" + scene_id + "); part " + "%03d" % part
    end_line = "#end document"

    # Prepare for clustering
    cluster_field = []
    for sent in sentences:
        cluster_field.append([""]*len(sent))
    # Add start
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                # print(cluster_field[sent_id])
                # print(sent_id, start, end, len(sentences[sent_id]))
                # print(sentences[sent_id])
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx)
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx)
    # Add start==end
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start == end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx) + ")"
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx) + ")"
    # Add End
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                try:
                    if cluster_field[sent_id][end] == "":
                        cluster_field[sent_id][end] += str(idx) + ")"
                    else:
                        cluster_field[sent_id][end] += "|" + str(idx) + ")"
                except:
                    pass
                # if cluster_field[sent_id][end] == "":
                #     cluster_field[sent_id][end] += str(idx) + ")"
                # else:
                #     cluster_field[sent_id][end] += "|" + str(idx) + ")"

    # Build document
    document.append(begin_line + "\n")
    for sent, speaker, cluster_value in zip(sentences, speakers, cluster_field):
        for j, word in enumerate(sent):
            cluster_id = cluster_value[j]
            if cluster_id == "":
                cluster_id = "-"
            temp = [scene_id, str(part), str(j), word, "na", "na", "na", "na", "na", speaker, "na", "na", "na", cluster_id]
            document.append(" ".join(temp)+ "\n")
        document.append("" + "\n")
    document.append(end_line + "\n")

with open("data/conll_style/dialogue_finalized_english/"+ split+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)

print(len(document))

s07e01c05t
s01e01c00t
s01e01c01t
s01e01c03t
s01e01c04t
s01e01c05t
s01e01c06t
s01e01c07t
s01e01c08t
s01e01c09t
s01e01c10t
s01e03c00t
s01e10c04t
s01e03c01t
s01e03c02t
s01e03c03t
s01e03c04t
s01e03c05t
s01e03c06t
s01e03c07t
s01e03c08t
s01e03c09t
s01e04c00t
s01e04c01t
s01e04c02t
s01e04c03t
s01e04c04t
s01e04c05t
s01e04c06t
s01e04c07t
s01e04c08t
s01e04c09t
s01e04c10t
s01e04c11t
s01e04c12t
s01e05c00t
s01e05c01t
s01e05c02t
s01e05c03t
s01e05c04t
s01e05c05t
s01e05c06t
s01e05c07t
s01e07c01t
s01e07c02t
s01e07c03t
s01e07c04t
s01e07c05t
s01e09c00t
s01e09c01t
s01e09c02t
s01e09c03t
s01e09c04t
s01e09c05t
s01e09c06t
s01e07c00t
s01e09c07t
s01e10c00t
s01e10c01t
s01e10c02t
s01e10c03t
s01e10c05t
s01e10c07t
s01e11c00t
s01e11c01t
s01e11c02t
s01e11c03t
s01e11c04t
s01e11c05t
s01e11c06t
s01e11c07t
s01e11c08t
s04e19c03t
s04e19c04t
s01e13c00t
s01e13c01t
s01e13c02t
s01e13c03t
s01e13c05t
s01e14c00t
s01e14c01t
s01e14c02t
s01e14c03t
s01e14c04t
s01e14c07t
s01e15c00t
s01e15c01t
s01e15c02t
s01e15c03t
s01e15c04t
s01e15c05t

In [6]:
file_name = "train"
data = []
with open('data/raw_source/dialogue/'+ file_name+'_temp.pkl', 'rb') as f:
    data.extend(pkl.load(f))

document = []
for i in range(len(data)):
    if file_name=="train" and i==38:
        continue
    if file_name=="test" and i==28:
        continue

    if i>=1:
        continue
    sample = data[i]
    original_sentences = sample['sentences']
    original_clusters = cluster_mentions(sample['answers'], original_sentences)

    # Get Data ready for conversion
    sentences, clusters, speakers = remove_speaker_prefix(original_sentences, original_clusters)
    scene_id = sample['scene_id']
    part = int(scene_id[7:9])
    begin_line = "#begin document " + "(" + scene_id + "); part " + "%03d" % part
    end_line = "#end document"

    # Prepare for clustering
    cluster_field = []
    for sent in sentences:
        cluster_field.append([""]*len(sent))
    # Add start
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx)
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx)
    # Add start==end
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start == end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx) + ")"
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx) + ")"
    # Add End
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                try:
                    if cluster_field[sent_id][end] == "":
                        cluster_field[sent_id][end] += str(idx) + ")"
                    else:
                        cluster_field[sent_id][end] += "|" + str(idx) + ")"
                except:
                    pass
                # if cluster_field[sent_id][end] == "":
                #     cluster_field[sent_id][end] += str(idx) + ")"
                # else:
                #     cluster_field[sent_id][end] += "|" + str(idx) + ")"

    # Build document
    document.append(begin_line + "\n")
    for sent, speaker, cluster_value in zip(sentences, speakers, cluster_field):
        for j, word in enumerate(sent):
            cluster_id = cluster_value[j]
            if cluster_id == "":
                cluster_id = "-"
            temp = [scene_id, str(part), str(j), word, "na", "na", "na", "na", "na", speaker, "na", "na", "na", cluster_id]
            document.append(" ".join(temp)+ "\n")
        document.append("" + "\n")
    document.append(end_line + "\n")

with open("data/conll_style/overfit_english/"+ "train"+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)
with open("data/conll_style/overfit_english/"+ "dev"+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)
with open("data/conll_style/overfit_english/"+ "test"+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)