In [1]:
"""
In this script, we want to convert annotations to conll style. There are several steps:
1.clustering input data
2.Turn cluster data into conll format
3.Try to run the preprocess script from HOI code base
"""

'\nIn this script, we want to convert annotations to conll style. There are several steps:\n1.clustering input data\n2.Turn cluster data into conll format\n3.Try to run the preprocess script from HOI code base\n'

In [2]:
import pickle as pkl
from copy import deepcopy
import json
import jsonlines
import os
import csv
from utils.my_util import cluster_mentions, remove_speaker_prefix

## Load Source Friends Dataset

In [3]:
all_data = []

root_path = 'data/raw_source/friends/'
for i, file_name in enumerate(os.listdir(root_path)):
    if file_name[-4:]!="json":
        continue
    # if i!=0:
    #     continue
    with open(root_path+file_name, 'r') as f:
        data = json.load(f)
        season_id = data['season_id']
        episodes = data['episodes']
        for episode in episodes:
            episode_id = episode['episode_id']
            scenes = episode['scenes']
            for scene in scenes:
                temp_sentences = []
                temp_answers = []
                temp_speakers = []

                scene_id = scene['scene_id']
                utterances = scene['utterances']
                # if scene_id != "s01_e24_c01":
                #     continue
                for utterance in utterances:
                    utt_speaker = " ".join(utterance['speakers'])
                    tokens = utterance['tokens']
                    if 'character_entities' not in utterance:
                        continue
                    character_entities = utterance['character_entities']

                    # Collect mentions in multiple utterance
                    utt_sentences = []
                    utt_answers = []
                    mention_dict = {}
                    # print(utt_speaker)
                    for sent, mentions in zip(tokens, character_entities):
                        for mention in mentions:
                            start = mention[0]+len(utt_sentences)
                            end = mention[1]+len(utt_sentences)
                            mention_name = " ".join(mention[2:])
                            utt_answers.append([len(temp_sentences), start, end, mention_name])
                        utt_sentences.extend(sent)

                    if not utt_sentences:
                        continue

                    # Collect utterances, speakers, mentions
                    temp_sentences.append(utt_sentences)
                    temp_answers.extend(utt_answers)
                    temp_speakers.append(utt_speaker)

                # Build Cluster
                cluster_dict = {}
                for sent_id, start, end, name in temp_answers:
                    if name not in cluster_dict:
                        cluster_dict[name] = [(sent_id, start, end)]
                    else:
                        cluster_dict[name].append((sent_id, start, end))

                clusters = []
                for item in cluster_dict:
                    clusters.append(cluster_dict[item])
                # print(scene_id)
                # print("".join(scene_id.strip().split("_")))
                all_data.append({
                    "sentences": temp_sentences,
                    "speakers": temp_speakers,
                    "scene_id": "".join(scene_id.strip().split("_")),
                    "clusters": clusters
                })

In [4]:
cleaned_data = []
for item in all_data:
    if item['speakers']==[]:
        continue
    cleaned_data.append(item)
print(len(all_data), len(cleaned_data))

3107 1301


In [5]:
sample = cleaned_data[0]
for item in sample:
    print(item)
    print(sample[item])
    print()

sentences
[['There', "'s", 'nothing', 'to', 'tell', '!', 'He', "'s", 'just', 'some', 'guy', 'I', 'work', 'with', '!'], ["C'mon", ',', 'you', "'re", 'going', 'out', 'with', 'the', 'guy', '!', 'There', "'s", 'got', 'ta', 'be', 'something', 'wrong', 'with', 'him', '!'], ['All', 'right', 'Joey', ',', 'be', 'nice', '.', 'So', 'does', 'he', 'have', 'a', 'hump', '?', 'A', 'hump', 'and', 'a', 'hairpiece', '?'], ['Wait', ',', 'does', 'he', 'eat', 'chalk', '?'], ['Just', ',', "'", 'cause', ',', 'I', 'do', "n't", 'want', 'her', 'to', 'go', 'through', 'what', 'I', 'went', 'through', 'with', 'Carl', '-', 'oh', '!'], ['Okay', ',', 'everybody', 'relax', '.', 'This', 'is', 'not', 'even', 'a', 'date', '.', 'It', "'s", 'just', 'two', 'people', 'going', 'out', 'to', 'dinner', 'and', '-', 'not', 'having', 'sex', '.'], ['Sounds', 'like', 'a', 'date', 'to', 'me', '.'], ['Alright', ',', 'so', 'I', "'m", 'back', 'in', 'high', 'school', ',', 'I', "'m", 'standing', 'in', 'the', 'middle', 'of', 'the', 'cafeteria

In [6]:
def flatten_sentences(sentences):
    flatten = []
    for sent in sentences:
        flatten.extend(sent)
    return flatten

In [7]:
# Check Docuemtn Length
for item in data:
    print(len(flatten_sentences(item['sentences'])))

TypeError: string indices must be integers

In [8]:
print(data[0])

KeyError: 0

In [22]:
# Save Data
train = cleaned_data[:-260]
dev = cleaned_data[-130:]
test = cleaned_data[-260:-130]
print(len(train), len(dev), len(test))

with open('data/raw_source/friends/friends_train.pkl', 'wb') as f:
    pkl.dump(train, f)
with open('data/raw_source/friends/friends_dev.pkl', 'wb') as f:
    pkl.dump(dev, f)
with open('data/raw_source/friends/friends_test.pkl', 'wb') as f:
    pkl.dump(test, f)

1041 130 130


## Prepare Friends Data

In [11]:
file_name = "test"

with open('data/raw_source/friends/friends_'+file_name+".pkl", 'rb') as f:
    data = pkl.load(f)

document = []
for i in range(len(data)):
    # if i>=10:
    #     continue
    sample = data[i]
    # Get Data ready for conversion
    sentences = sample['sentences']
    clusters = sample['clusters']
    speakers = sample['speakers']

    scene_id = sample['scene_id']
    part = int(scene_id[7:9])
    begin_line = "#begin document " + "(" + scene_id + "); part " + "%03d" % part
    end_line = "#end document"

    # Prepare for clustering
    cluster_field = []
    for sent in sentences:
        cluster_field.append([""]*len(sent))
    # Add start
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx)
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx)
    # Add start==end
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start == end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx) + ")"
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx) + ")"
    # Add End
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                try:
                    if cluster_field[sent_id][end] == "":
                        cluster_field[sent_id][end] += str(idx) + ")"
                    else:
                        cluster_field[sent_id][end] += "|" + str(idx) + ")"
                except:
                    pass
                # if cluster_field[sent_id][end] == "":
                #     cluster_field[sent_id][end] += str(idx) + ")"
                # else:
                #     cluster_field[sent_id][end] += "|" + str(idx) + ")"

    # Build document
    document.append(begin_line + "\n")
    for sent, speaker, cluster_value in zip(sentences, speakers, cluster_field):
        for j, word in enumerate(sent):
            cluster_id = cluster_value[j]
            if cluster_id == "":
                cluster_id = "-"
            temp = [scene_id, str(part), str(j), word, "na", "na", "na", "na", "na", speaker, "na", "na", "na", cluster_id]
            document.append(" ".join(temp)+ "\n")
        document.append("" + "\n")
    document.append(end_line + "\n")

with open("data/conll_style/ci_english/"+ file_name+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)

print(len(document))

35082


In [145]:
print(len(document))

191431


In [136]:
for line in document[:200]:
    print(line)

#begin document (s01e01c01); part 001

s01e01c01 1 0 There na na na na na Monica Geller na na na -

s01e01c01 1 1 's na na na na na Monica Geller na na na -

s01e01c01 1 2 nothing na na na na na Monica Geller na na na -

s01e01c01 1 3 to na na na na na Monica Geller na na na -

s01e01c01 1 4 tell na na na na na Monica Geller na na na -

s01e01c01 1 5 ! na na na na na Monica Geller na na na -

s01e01c01 1 6 He na na na na na Monica Geller na na na (0)

s01e01c01 1 7 's na na na na na Monica Geller na na na -

s01e01c01 1 8 just na na na na na Monica Geller na na na -

s01e01c01 1 9 some na na na na na Monica Geller na na na -

s01e01c01 1 10 guy na na na na na Monica Geller na na na (0)

s01e01c01 1 11 I na na na na na Monica Geller na na na (1)

s01e01c01 1 12 work na na na na na Monica Geller na na na -

s01e01c01 1 13 with na na na na na Monica Geller na na na -

s01e01c01 1 14 ! na na na na na Monica Geller na na na -



s01e01c01 1 0 C'mon na na na na na Joey Tribbiani na na na -



## Prepare Friends Dataset

In [None]:
file_name = "test"
data = []
with open('data/'+ file_name+'_temp.pkl', 'rb') as f:
    data.extend(pkl.load(f))

document = []
for i in range(len(data)):
    if file_name=="train" and i==38:
        continue
    if file_name=="test" and i==28:
        continue

    # if i>=100:
    #     continue
    sample = data[i]
    original_sentences = sample['sentences']
    original_clusters = cluster_mentions(sample['answers'], original_sentences)

    # Get Data ready for conversion
    sentences, clusters, speakers = remove_speaker_prefix(original_sentences, original_clusters)
    scene_id = sample['scene_id']
    part = int(scene_id[7:9])
    begin_line = "#begin document " + "(" + scene_id + "); part " + "%03d" % part
    end_line = "#end document"

    # Prepare for clustering
    cluster_field = []
    for sent in sentences:
        cluster_field.append([""]*len(sent))
    # Add start
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx)
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx)
    # Add start==end
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start == end:
                if cluster_field[sent_id][start] == "":
                    cluster_field[sent_id][start] += "(" + str(idx) + ")"
                else:
                    cluster_field[sent_id][start] += "|" + "(" + str(idx) + ")"
    # Add End
    for idx, cluster in enumerate(clusters):
        for sent_id, start, end in cluster:
            end = end - 1
            if start != end:
                try:
                    if cluster_field[sent_id][end] == "":
                        cluster_field[sent_id][end] += str(idx) + ")"
                    else:
                        cluster_field[sent_id][end] += "|" + str(idx) + ")"
                except:
                    pass
                # if cluster_field[sent_id][end] == "":
                #     cluster_field[sent_id][end] += str(idx) + ")"
                # else:
                #     cluster_field[sent_id][end] += "|" + str(idx) + ")"

    # Build document
    document.append(begin_line + "\n")
    for sent, speaker, cluster_value in zip(sentences, speakers, cluster_field):
        for j, word in enumerate(sent):
            cluster_id = cluster_value[j]
            if cluster_id == "":
                cluster_id = "-"
            temp = [scene_id, str(part), str(j), word, "na", "na", "na", "na", "na", speaker, "na", "na", "na", cluster_id]
            document.append(" ".join(temp)+ "\n")
        document.append("" + "\n")
    document.append(end_line + "\n")

with open("data/input/"+ file_name+'.english.v4_gold_conll', 'w') as f:
    f.writelines(document)