In [8]:
import csv
import json
from copy import deepcopy
import pickle as pkl

In [9]:
from utils.data_util import read_mturk_annotation_multiple_scene
from utils.data_util import read_turkle_annotation_multiple_scene
from utils.data_util import read_annotation
from utils.data_util import gather_by_scene

In [10]:
def collect_query_for_projection(answer_spans):
    # Collect all annotations
    annotations = []
    for x in answer_spans:
        query = x['querySpan']
        if x['notPresent']:
            antecedents = "notPresent"
        elif x['notMention']:
            antecedents = "notMention"
        else:
            antecedents = []
            for token in x['span_list']:
                if token['sentenceIndex']!=-1 and token['startToken']!=-1 and token['endToken']!=-1:
                    antecedents.append(token)
        annotations.append({
            "query": query,
            "antecedents": antecedents
        })
    return annotations

In [9]:
with open('data/all_scenes.pkl', 'rb') as f:
    all_scenes = pkl.load(f)

data = read_mturk_annotation_multiple_scene('data/bulk_0/results.csv')
results = []
results.extend(data)

output = []
for item in results:
    temp = {
        "sentences": item['sentences'],
    }
    annotations = collect_query_for_projection(item['answer_spans'])
    temp["annotations"] = annotations
    temp['zh_subtitles'] = all_scenes[item['scene_id']]['zh_subtitles']
    temp['fa_subtitles'] = all_scenes[item['scene_id']]['fa_subtitles']
    temp["scene_id"] = item['scene_id']
    output.append(temp)

with open('annotation_results_bulk_0.json', 'w') as f:
    json.dump(output, f)

In [7]:
mturk_2 = read_mturk_annotation_multiple_scene('data/pilot_2/golden.csv')
# mturk_3 = read_mturk_annotation_multiple_scene('data/pilot_1/pilot_1.csv')
# mturk_4 = read_mturk_annotation_multiple_scene('data/pilot_1/pilot_2.csv')

results = []
results.extend(mturk_2)
# results.extend(mturk_3)
# results.extend(mturk_4)

scenes = gather_by_scene(results)

KeyError: 'scene_ids'

In [6]:
output = {}
for i, scene_id in enumerate(scenes):
    scene = scenes[scene_id]
    temp = {"sentences": scene[0]['sentences']}
    for j, item in enumerate(scene):
        annotations = collect_query_for_projection(item['answer_spans'])
        temp["annotations_" + str(j)] = annotations
    output["scene_"+str(i)] = temp

with open('annotation_results.json', 'w') as f:
    json.dump(output, f)

TypeError: list indices must be integers or slices, not str

In [6]:
print(output)

{'scene_0': {'sentences': [['Penny', ':', 'Where', "'d", 'you', 'go', '?', 'I', 'ca', "n't", 'tell', 'if', 'the', 'turkey', "'s", 'done', '!'], ['Leonard', ':', 'Be', 'right', 'there', '!', 'Hi', ',', 'lover', '.'], ['Penny', ':', 'What', 'are', 'you', 'doing', '?'], ['Leonard', ':', 'I', "'", 'm', 'sorry', 'about', 'the', 'journal', '.', 'I', 'want', 'to', 'make', 'it', 'up', 'to', 'you', '.', 'So', 'I', "'", 'm', 'gon', 'na', 'let', 'you', 'post', 'a', 'shame', 'photo', 'of', 'me', 'on', 'Facebook', '.'], ['Penny', ':', 'I', 'am', 'not', 'putting', 'that', 'on', 'the', 'Internet', '!', 'I', 'do', "n't", 'want', 'people', 'to', 'see', 'this', '.', 'I', 'do', "n't", 'want', 'to', 'see', 'it', '!'], ['Leonard', ':', 'Do', "n't", 'want', 'people', 'to', 'see', 'what', ',', 'huh', '?', 'A', 'little', 'bit', 'of', 'this', '?'], ['Penny', ':', 'Oh', '.'], ['Leonard', ':', 'Some', 'of', 'this', '?', 'And', ',', 'since', 'it', "'s", 'Thanksgiving', ',', 'an', 'extra', 'helping', 'of', 'this',

In [11]:
golden = read_turkle_annotation_multiple_scene('data/pilot_2/golden.csv')
for item in golden:
    item['WorkerId'] = "golden"

results = []
results.extend(golden)

scenes = gather_by_scene(results)

In [56]:
output = {}
for i, item in enumerate(golden):
    temp = {"sentences": item['sentences']}

    # Collect all annotations
    annotations = []
    answer_spans = item['answer_spans']
    for x in answer_spans:
        query = x['querySpan']
        antecedents = []
        for token in x['span_list']:
            if token['sentenceIndex']!=-1 and token['startToken']!=-1 and token['endToken']!=-1:
                antecedents.append(token)
        annotations.append({
            "query": query,
            "antecedents": antecedents
        })
    temp['annotations'] = annotations

    # Collect speaker queries and non-speaker queries
    all_speakers = set([sent[0] for sent in item['sentences']])
    all_speaker_mentions = []
    for cluster in item['clusters_all']:
        for speaker in all_speakers:
            if tuple([speaker]) in cluster:
                all_speaker_mentions.extend(cluster)
    # print(all_speaker_mentions)
    speaker_annotations = []
    non_speaker_annotations = []
    for x in answer_spans:
        query = x['querySpan']
        antecedents = []
        for token in x['span_list']:
            if token['sentenceIndex']!=-1 and token['startToken']!=-1 and token['endToken']!=-1:
                antecedents.append(token)
        query_key = tuple(["_".join([str(query['sentenceIndex']), str(query['startToken']), str(query['endToken'])])])
        if query_key in all_speaker_mentions:
            speaker_annotations.append({
            "query": query,
            "antecedents": antecedents
            })
        else:
            non_speaker_annotations.append({
            "query": query,
            "antecedents": antecedents
            })
    temp['speaker_annotations'] = speaker_annotations
    temp['non_speaker_annotations'] = non_speaker_annotations

    output["scene_"+str(i)] = temp

with open('annotation_results.json', 'w') as f:
    json.dump(output, f)

In [5]:
print(scenes.keys())

dict_keys([":|Where|'d|you|go|?|I|ca|n't", ':|Oh|,|yeah|,|no|,|this|thing', ':|We|just|wanted|to|see|how|your|class'])


In [4]:
print(scenes)

{":|Where|'d|you|go|?|I|ca|n't": [{'sentences': [['Penny', ':', 'Where', "'d", 'you', 'go', '?', 'I', 'ca', "n't", 'tell', 'if', 'the', 'turkey', "'s", 'done', '!'], ['Leonard', ':', 'Be', 'right', 'there', '!', 'Hi', ',', 'lover', '.'], ['Penny', ':', 'What', 'are', 'you', 'doing', '?'], ['Leonard', ':', 'I', "'", 'm', 'sorry', 'about', 'the', 'journal', '.', 'I', 'want', 'to', 'make', 'it', 'up', 'to', 'you', '.', 'So', 'I', "'", 'm', 'gon', 'na', 'let', 'you', 'post', 'a', 'shame', 'photo', 'of', 'me', 'on', 'Facebook', '.'], ['Penny', ':', 'I', 'am', 'not', 'putting', 'that', 'on', 'the', 'Internet', '!', 'I', 'do', "n't", 'want', 'people', 'to', 'see', 'this', '.', 'I', 'do', "n't", 'want', 'to', 'see', 'it', '!'], ['Leonard', ':', 'Do', "n't", 'want', 'people', 'to', 'see', 'what', ',', 'huh', '?', 'A', 'little', 'bit', 'of', 'this', '?'], ['Penny', ':', 'Oh', '.'], ['Leonard', ':', 'Some', 'of', 'this', '?', 'And', ',', 'since', 'it', "'s", 'Thanksgiving', ',', 'an', 'extra', 'h

In [10]:
output = {}
for i, scene_id in enumerate(scenes):
    print(scenes[scene_id])

    print("scene_" + str(i), scene_id)
    output["scene_" + str(i)] = []

[{'sentences': [['Penny', ':', 'Where', "'d", 'you', 'go', '?', 'I', 'ca', "n't", 'tell', 'if', 'the', 'turkey', "'s", 'done', '!'], ['Leonard', ':', 'Be', 'right', 'there', '!', 'Hi', ',', 'lover', '.'], ['Penny', ':', 'What', 'are', 'you', 'doing', '?'], ['Leonard', ':', 'I', "'", 'm', 'sorry', 'about', 'the', 'journal', '.', 'I', 'want', 'to', 'make', 'it', 'up', 'to', 'you', '.', 'So', 'I', "'", 'm', 'gon', 'na', 'let', 'you', 'post', 'a', 'shame', 'photo', 'of', 'me', 'on', 'Facebook', '.'], ['Penny', ':', 'I', 'am', 'not', 'putting', 'that', 'on', 'the', 'Internet', '!', 'I', 'do', "n't", 'want', 'people', 'to', 'see', 'this', '.', 'I', 'do', "n't", 'want', 'to', 'see', 'it', '!'], ['Leonard', ':', 'Do', "n't", 'want', 'people', 'to', 'see', 'what', ',', 'huh', '?', 'A', 'little', 'bit', 'of', 'this', '?'], ['Penny', ':', 'Oh', '.'], ['Leonard', ':', 'Some', 'of', 'this', '?', 'And', ',', 'since', 'it', "'s", 'Thanksgiving', ',', 'an', 'extra', 'helping', 'of', 'this', '?'], ['Be

In [9]:
print(output)

{'scene_0': [], 'scene_1': [], 'scene_2': []}
