In [1]:
import os
from os import path
import json
import sys
from collections import defaultdict
import random
import csv

In [2]:
from utils.data_util import read_mturk_annotation_multiple_scene
from utils.data_util import read_turkle_annotation_multiple_scene
from utils.data_util import read_annotation
from utils.data_util import gather_by_scene

In [3]:
def process_clusters(clusters, sentences):
    """Convert annotated mention clusters into document-level token spans.

    Args:
        clusters: iterable of clusters; each cluster is an iterable of
            mentions. Only ``mention[0]`` is read. It is either a speaker name
            (the tokens before the first ":" of a sentence, joined by single
            spaces) or a string ``"<sentence>_<start>_<end>"`` of integers,
            where ``end`` is exclusive.
        sentences: list of tokenized sentences (each a list of tokens).

    Returns:
        A list with one entry per cluster; each entry is a list of
        ``[start, end]`` pairs indexing into the concatenation of all
        sentences, with ``end`` inclusive.

    Raises:
        ValueError: if a mention is neither a known speaker name nor a
            ``"_"``-separated string of integers.
    """
    # Map each speaker name to the span covering it at the start of a sentence.
    # If a speaker opens several sentences, the last such sentence wins.
    speaker_dict = {}
    for i, sent in enumerate(sentences):
        # Sentences without a speaker prefix used to raise ValueError from
        # sent.index(":"); they simply contribute no speaker entry now.
        if ":" not in sent:
            continue
        colon_idx = sent.index(":")
        speaker_dict[" ".join(sent[:colon_idx])] = (i, 0, colon_idx)

    # sent_offsets[k] is the document-level index of the first token of sentence k.
    sent_offsets = [0]
    for sent in sentences:
        sent_offsets.append(sent_offsets[-1] + len(sent))

    processed_cluster = []
    for cluster in clusters:
        spans = []
        for mention in cluster:
            key = mention[0]
            if key in speaker_dict:
                located = speaker_dict[key]
            else:
                located = tuple(int(part) for part in key.strip().split("_"))
            offset = sent_offsets[located[0]]
            # The "-1" turns the exclusive end into an inclusive one.
            spans.append([located[1] + offset, located[2] + offset - 1])
        processed_cluster.append(spans)

    return processed_cluster


In [4]:
# Load the bulk MTurk batch and attach document-level cluster spans to each annotation.
mturk_1 = read_mturk_annotation_multiple_scene('data/bulk_0/results.csv')
results = list(mturk_1)

for item in results:
    item['clusters'] = process_clusters(item['clusters_no_plural'], item['sentences'])

In [4]:
# Pilot MTurk annotations (a second pilot batch is currently disabled).
mturk_1 = read_mturk_annotation_multiple_scene('data/pilot_2/data.csv')
# mturk_2 = read_mturk_annotation_multiple_scene('data/pilot_1/pilot_2.csv')

# Golden annotations are tagged with a fixed worker id so they can be told
# apart from the MTurk workers after everything is merged.
golden = read_turkle_annotation_multiple_scene('data/pilot_1/golden.csv')
for item in golden:
    item['WorkerId'] = "golden"

# Merge order: MTurk annotations first, golden annotations last.
# To re-enable the second pilot batch, splice *mturk_2 between the two.
results = [*mturk_1, *golden]

for item in results:
    item['clusters'] = process_clusters(item['clusters_no_plural'], item['sentences'])

scenes = gather_by_scene(results)

In [18]:
# Load the golden annotations, tag each with the username "golden" and attach
# document-level cluster spans computed by process_clusters.
golden = read_annotation('data/golden.csv')
for item in golden:
    item['Turkle.Username'] = "golden"
    item['clusters'] = process_clusters(item['clusters_no_plural'], item['sentences'])
# Keep the first annotation around as a quick example for the cells below.
sample = golden[0]

In [17]:
# Document preamble; generate_cluster_html appends the annotated text after it.
HTML_START = '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body>'


# Opening tag of one mention box. The placeholders are filled, in order, with
# the border style, the border color and the padding in px; the template then
# sets fixed 3px left/right padding declarations.
start_tag_template = '<div style="border:2px; display:inline; border-style: {}; border-color: {}; padding: {}px; padding-right: 3px; padding-left: 3px">'
# Closing tag shared by every mention box.
end_tag = '</div>'

# The first box opened at a token gets largest_padding px of padding; every
# further box opened at that same token gets padding_reduction px less.
largest_padding = 13
padding_reduction = 3


def get_tag_options(cluster):
    """Choose the border style and border color for a cluster's mention boxes.

    Args:
        cluster: non-empty sequence of mentions; each mention is a sequence.

    Returns:
        ``(border, color)``: ``border`` is ``'dotted'`` for a cluster with
        exactly one mention and ``'solid'`` otherwise; ``color`` is
        ``'violet'`` when the last element of the first mention equals 1 and
        ``'#0066CC'`` otherwise.
    """
    # NOTE(review): what a trailing 1 on the first mention stands for is not
    # visible here -- confirm against the annotation format.
    border = 'dotted' if len(cluster) == 1 else 'solid'
    color = 'violet' if cluster[0][-1] == 1 else '#0066CC'
    return border, color


In [14]:
def generate_cluster_html(instance):
    """Render one annotated document as an HTML page with boxed mentions.

    Every mention is wrapped in an inline ``<div>`` whose border style and
    color come from ``get_tag_options``; the index of its cluster (clusters
    are ordered by their earliest mention start) is appended as a ``<sub>``
    label just before the box is closed.

    Args:
        instance: mapping with
            ``"sentences"``: list of tokenized sentences (lists of str), and
            ``"clusters"``: list of non-empty clusters, each a list of
            ``[start, end]`` document-level token spans with ``end`` inclusive.

    Returns:
        The complete HTML document as a string.
    """

    def render_end_tags(end_entries):
        # Sort the tags so as to mimic the stack behavior.
        # NOTE(review): the order is ascending in span start, so when nested
        # mentions end on the same token the outer mention's label is written
        # first -- confirm this is the intended label placement.
        ordered = sorted(end_entries, key=lambda x: x[0] - x[1] * 1e-5)
        return "".join(
            "<sub>" + subscript + "</sub>" + html_tag
            for _, _, html_tag, subscript in ordered
        )

    # Flatten the sentences and remember the token index at which each one ends.
    bert_seg_idx = set()
    doc_list = []
    for sentence in instance["sentences"]:
        doc_list.extend(sentence)
        bert_seg_idx.add(len(doc_list))

    # token index -> {'start': [opening tags], 'end': [closing-tag records]}
    html_tag_list = {}

    # Get all the entity info
    clusters = sorted(instance["clusters"], key=lambda cluster: min([elem[0] for elem in cluster]))
    for cluster_idx, cluster in enumerate(clusters):
        tag_options = get_tag_options(cluster)
        subscript = str(cluster_idx)
        for span_start, span_end in cluster:
            span_end = span_end + 1  # Now span_end is not part of the span

            start_tags = html_tag_list.setdefault(span_start, defaultdict(list))['start']
            end_tags = html_tag_list.setdefault(span_end, defaultdict(list))['end']

            # Boxes opened at the same token get progressively less padding.
            padding = largest_padding - padding_reduction * len(start_tags)
            start_tags.append(start_tag_template.format(*tag_options, padding))
            # Subscript used in end
            end_tags.append((span_start, cluster_idx, end_tag, subscript))

    pieces = [HTML_START + '<div style="line-height: 3">']
    for token_idx, token in enumerate(doc_list):
        if token_idx in bert_seg_idx:
            pieces.append("\n<br/>")

        tags_here = html_tag_list.get(token_idx)
        if tags_here is not None:
            # Close the boxes ending before this token, then open the new ones.
            if 'end' in tags_here:
                pieces.append(render_end_tags(tags_here['end']))
            if 'start' in tags_here:
                pieces.extend(tags_here['start'])

        pieces.append(" " + token)

    # Mentions ending on the very last token register their closing tags at
    # index len(doc_list), which the loop above never visits; emit them here
    # so those boxes are labelled and closed.
    trailing = html_tag_list.get(len(doc_list))
    if trailing is not None and 'end' in trailing:
        pieces.append(render_end_tags(trailing['end']))

    pieces.append("</div></body></html>")
    html_string = "".join(pieces)
    # Every newline gains a <br/>, so each sentence boundary ends up as "\n<br/><br/>".
    html_string = html_string.replace("\n", "\n<br/>")
    # "~" and "^" are emitted as escaped angle brackets -- presumably the data
    # uses them as stand-ins for "<" and ">"; confirm against the data source.
    html_string = html_string.replace("~", "&lt;")
    html_string = html_string.replace("^", "&gt;")
    return html_string

In [15]:
# Render every golden annotation to its own file, "<WorkerId>||<index>.html".
os.makedirs('cluster_htmls', exist_ok=True)  # open() does not create missing directories
for i in range(len(golden)):
    sample = golden[i]
    html_string = generate_cluster_html(sample)
    # The page declares <meta charset="UTF-8">, so write it as UTF-8 instead
    # of the platform's default encoding.
    with open('cluster_htmls/' + sample['WorkerId'] + "||" + str(i) + ".html", 'w', encoding='utf-8') as f:
        f.write(html_string)

In [10]:
# Render the current `sample` annotation and show which keys it carries.
html_string = generate_cluster_html(sample)
os.makedirs('cluster_htmls', exist_ok=True)  # open() does not create missing directories
# The page declares <meta charset="UTF-8">, so write it as UTF-8 instead of
# the platform's default encoding.
with open('cluster_htmls/' + sample['WorkerId'] + "||" + ".html", 'w', encoding='utf-8') as f:
    f.write(html_string)
print(sample.keys(), sample['WorkerId'])

dict_keys(['HITId', 'HITTypeId', 'Title', 'CreationTime', 'MaxAssignments', 'AssignmentDurationInSeconds', 'AssignmentId', 'WorkerId', 'AcceptTime', 'SubmitTime', 'WorkTimeInSeconds', 'sentences', 'answer_spans', 'Turkle.Username', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) 170


In [9]:
# Render every annotation in `results` to "<WorkerId>||<scene_id>.html".
os.makedirs('cluster_htmls', exist_ok=True)  # open() does not create missing directories
for scene in results:
    html_string = generate_cluster_html(scene)
    # The page declares <meta charset="UTF-8">, so write it as UTF-8 instead
    # of the platform's default encoding.
    with open('cluster_htmls/' + scene['WorkerId'] + "||" + scene['scene_id'] + ".html", 'w', encoding='utf-8') as f:
        f.write(html_string)
    print(scene.keys(), scene['WorkerId'], scene['scene_id'])

dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) AKQAI78JTXXC9 s01e01c00t
dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) AKQAI78JTXXC9 s01e01c01t
dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) A3L2XKXABNO0N5 s01e01c03t
dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) A3L2XKXABNO0N5 s01e01c04t
dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) A3L2XKXABNO0N5 s01e01c05t
dict_keys(['sentences', 'query_spans', 'answer_spans', 'WorkerId', 'scene_id', 'clusters_all', 'clusters_no_plural', 'answers', 'clusters']) A3L2XKXABNO0N5 s01e01c06t


In [11]:
# Render every scene (which also checks that rendering succeeds) and dump the
# cluster spans of one specific scene for inspection.
for scene in results:
    html_string = generate_cluster_html(scene)
    if scene['scene_id'] != "s01e10c04t":
        continue
    print(scene['clusters'])
    # print(scene.keys(), scene['WorkerId'], scene['scene_id'])

[[[10, 10], [2, 2]], [[494, 494], [414, 414], [263, 263], [199, 199], [475, 475], [256, 256], [4, 4], [399, 399], [405, 405], [512, 512], [485, 485], [37, 37], [89, 89], [12, 12], [50, 50]], [[113, 113], [473, 473], [411, 411], [57, 57], [74, 74], [165, 165], [180, 180], [544, 544], [234, 234], [439, 439], [48, 48], [78, 78], [426, 426], [175, 175], [5, 5], [305, 305], [292, 292], [104, 104]], [[467, 467], [24, 24], [451, 451], [43, 43], [477, 477], [28, 28], [196, 196], [415, 415], [230, 230], [217, 217], [515, 515], [52, 52], [190, 190], [13, 13], [149, 149], [61, 61], [482, 482], [285, 285], [182, 182], [20, 20], [489, 489], [509, 509]], [[18, 20], [32, 33]], [[43, 44], [14, 14]], [[64, 66], [72, 72], [70, 70]], [[85, 85]], [[92, 92], [95, 96]], [[98, 98]], [[104, 105]], [[106, 109], [102, 102]], [[113, 114], [104, 106]], [[177, 177], [123, 123], [383, 383], [395, 395], [106, 106], [374, 374], [169, 169], [185, 185], [186, 186], [130, 130], [115, 115], [292, 293], [329, 329], [274, 

In [8]:
# Write one HTML file per annotator for every scene, grouped into a directory
# named after the scene's sentence count.
# NOTE(review): two scenes with the same sentence count share a directory, so
# their files would overwrite each other -- confirm the counts are unique.
for scene_key in scenes:
    n_sentences = len(scenes[scene_key][0]['sentences'])
    print(scene_key, n_sentences)
    out_dir = 'cluster_htmls/' + str(n_sentences) + "/"
    os.makedirs(out_dir, exist_ok=True)  # open() does not create missing directories
    for item in scenes[scene_key]:
        html_string = generate_cluster_html(item)
        print(item['WorkerId'])
        # The page declares <meta charset="UTF-8">, so write it as UTF-8
        # instead of the platform's default encoding.
        with open(out_dir + item['WorkerId'] + ".html", 'w', encoding='utf-8') as f:
            f.write(html_string)
        # print(html_string)



:|Where|'d|you|go|?|I|ca|n't 9
A2VIKCIM9TZL22
golden
:|Oh|,|yeah|,|no|,|this|thing 11
A2VIKCIM9TZL22
golden
:|We|just|wanted|to|see|how|your|class 31
A2VIKCIM9TZL22
golden


In [70]:
# Render the first golden annotation to an HTML string for inspection.
temp = generate_cluster_html(golden[0])

In [71]:
# Dump the raw HTML markup to check the generated tags by eye.
print(temp)

<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"></head><body><div style="line-height: 3"> Penny : Where 'd<div style="border:2px; display:inline; border-style: solid; border-color: #0066CC; padding: 13px; padding-right: 3px; padding-left: 3px"> you<sub>0</sub></div> go ?<div style="border:2px; display:inline; border-style: solid; border-color: #0066CC; padding: 13px; padding-right: 3px; padding-left: 3px"> I<sub>1</sub></div> ca n't tell if<div style="border:2px; display:inline; border-style: dotted; border-color: #0066CC; padding: 13px; padding-right: 3px; padding-left: 3px"> the turkey<sub>2</sub></div> 's done !
<br/><br/> Leonard : Be right there ! Hi , lover .
<br/><br/> Penny :   What are you doing ?
<br/><br/> Leonard :<div style="border:2px; display:inline; border-style: solid; border-color: #0066CC; padding: 13px; padding-right: 3px; padding-left: 3px"> I<sub>0</sub></div> 'm sorry about<div style="border:2px; display:inline; border-style: dotted; border-color: #006