In [2]:
import csv
import json
from copy import deepcopy
import pickle as pkl
# Raise the csv parser's per-field size limit to 1131072 so that very large
# cells can be read. The call returns the limit that was in effect before,
# which is the value echoed as this cell's output.
csv.field_size_limit(1131072)

131072

In [3]:
from annotation.analysis.utils.data_util import generate_all_clusters_combine_speakers
from annotation.analysis.utils.data_util import generate_clusters_no_plural_combine_speakers
from annotation.analysis.utils.data_util import collect_mentions


def read_turkle_annotation_multiple_scene(path):
    """
    Load annotation results together with the documents they annotate.

    Every CSV row packs several scenes into one task. The row's
    ``sentence_offsets`` / ``querySpans_offsets`` lists mark where each scene
    starts and ends inside the concatenated ``sentences`` / ``querySpans``
    lists (scene ``i`` spans ``offsets[i]:offsets[i + 1]``). The row is split
    back into scenes and every sentence index is rebased so that it is
    relative to the first sentence of its own scene.

    Args:
        path: Path of the CSV file. Each row must provide the columns
            ``Input.json_data`` (JSON object), ``Answer.answer_spans``
            (JSON list) and ``WorkerId``.

    Returns:
        A list with one dict per (row, scene) holding the keys
        ``sentences``, ``query_spans``, ``answer_spans``, ``WorkerId``,
        ``scene_id``, ``clusters_all``, ``clusters_no_plural`` and
        ``answers``.
    """
    output = []
    # The csv module requires files to be opened with newline='' so that
    # line breaks embedded in quoted fields reach the parser untranslated.
    with open(path, 'r', newline='') as f:
        for instance in csv.DictReader(f):
            inputs = json.loads(instance["Input.json_data"])
            all_sentences = inputs['sentences']
            all_query_spans = inputs['querySpans']
            sentence_offsets = inputs['sentence_offsets']
            query_spans_offsets = inputs['querySpans_offsets']
            all_scene_ids = inputs['scene_ids']
            all_answer_spans = json.loads(instance["Answer.answer_spans"])

            for i in range(len(sentence_offsets) - 1):
                sent_start, sent_end = sentence_offsets[i], sentence_offsets[i + 1]
                query_start, query_end = query_spans_offsets[i], query_spans_offsets[i + 1]

                sentences = []
                for item in all_sentences[sent_start:sent_end]:
                    if isinstance(item[0], list):
                        # The leading element is itself a list: splice its
                        # members in front of the remaining elements so the
                        # sentence becomes one flat list.
                        sentences.append(item[0] + item[1:])
                    else:
                        sentences.append(item)

                # Rebase the query spans onto scene-local sentence indices.
                query_spans = all_query_spans[query_start:query_end]
                for item in query_spans:
                    item['sentenceIndex'] -= sent_start

                # Answers are sliced with the query offsets (one answer entry
                # per query span) and rebased the same way.
                answer_spans = all_answer_spans[query_start:query_end]
                for item in answer_spans:
                    item['querySpan']['sentenceIndex'] -= sent_start
                    for x in item['span_list']:
                        # Spans with start/end token -1 are left untouched.
                        if x['startToken'] == -1 and x['endToken'] == -1:
                            continue
                        x['sentenceIndex'] -= sent_start

                temp = {
                    "sentences": sentences,
                    "query_spans": query_spans,
                    "answer_spans": answer_spans,
                    "WorkerId": instance['WorkerId'],
                    "scene_id": all_scene_ids[i],
                }
                # Collect mentions into clusters
                temp['clusters_all'] = generate_all_clusters_combine_speakers(temp)
                temp['clusters_no_plural'] = generate_clusters_no_plural_combine_speakers(temp)

                # Collect mentions (for Cohen's kappa)
                temp["answers"] = collect_mentions(temp)

                output.append(temp)

    return output

def gather_by_scene(annotations):
    """
    Bucket annotation records by the scene they belong to.

    annotations: iterable of dicts, each carrying a 'scene_id' entry
    return: {scene_id: [records with that scene_id, in input order]}
    """
    grouped = {}
    for record in annotations:
        # setdefault creates the bucket the first time a scene id is seen.
        grouped.setdefault(record['scene_id'], []).append(record)
    return grouped

In [61]:
# Load the annotations, one record per (worker, scene).
data = read_turkle_annotation_multiple_scene('data/coe_bulk.csv')

# Turn every answer into a string label and sort the labels of each query.
for scene in data:
    scene_sentences = scene['sentences']
    for entry in scene['answers']:
        _query, raw_answers = entry
        if isinstance(raw_answers, list):
            # An answer whose 2nd/3rd components are 0 and 1 and whose first
            # component is a valid sentence index is replaced by the first
            # element of that sentence; any other answer is rendered as its
            # components joined with "_".
            raw_answers[:] = [
                scene_sentences[answer[0]][0]
                if (answer[1] == 0 and answer[2] == 1
                    and answer[0] <= len(scene_sentences) - 1)
                else "_".join(str(token) for token in answer)
                for answer in raw_answers
            ]
        else:
            # A bare (non-list) answer is wrapped so every entry holds a list.
            entry[1] = [entry[1]]
        entry[1] = sorted(entry[1])

results = list(data)
scenes = gather_by_scene(results)

In [62]:
# Scenes with exactly one annotation go to `one_way`; every other scene
# (i.e. annotated more than once) goes to `two_way`.
one_way = {scene_id: group for scene_id, group in scenes.items() if len(group) == 1}
two_way = {scene_id: group for scene_id, group in scenes.items() if len(group) != 1}

In [63]:
# Number of multiply-annotated scenes first, then singly-annotated ones.
for scene_group in (two_way, one_way):
    print(len(scene_group))

23
107


In [66]:
# Walk through every multiply-annotated scene and compare its first two
# annotations query by query. Only annotations[0] and annotations[1] are
# looked at, so any further annotation of the same scene is ignored.
disagreed_num = 0  # query pairs whose two entries differ
all_num = 0        # query pairs compared in total
union_num = 0      # differing pairs whose merged labels are not all "notMention"
for scene_id, annotations in two_way.items():
    print(scene_id)
    first_answers = annotations[0]['answers']
    second_answers = annotations[1]['answers']
    for a, b in zip(first_answers, second_answers):
        all_num += 1
        if a == b:
            continue

        disagreed_num += 1
        print(a)
        print(b)
        print()

        # Merge both label lists without touching the originals.
        merged = deepcopy(a[1])
        merged += b[1]
        union_answers = [token for token in merged if token != "notMention"]
        if union_answers:
            union_num += 1
        else:
            # Both annotators only produced "notMention" labels: show the pair again.
            print(a)
            print(b)
            print(union_answers)
            print()
    print("=="*50)

s07e17c11t
[(0, 14, 15), ['notPresent']]
[(0, 14, 15), ['notMention']]

[(1, 20, 21), ['notPresent']]
[(1, 20, 21), ['notMention']]

[(1, 23, 24), ['1_20_21']]
[(1, 23, 24), ['notMention']]

[(1, 40, 42), ['notPresent']]
[(1, 40, 42), ['1_37_38']]

[(4, 2, 3), ['notPresent']]
[(4, 2, 3), ['1_40_42']]

[(4, 4, 5), ['notPresent']]
[(4, 4, 5), ['notMention']]

[(4, 4, 6), ['4_2_3']]
[(4, 4, 6), ['1_40_42']]

[(4, 8, 9), ['4_2_3']]
[(4, 8, 9), ['1_40_42']]

[(4, 29, 30), ['notPresent']]
[(4, 29, 30), ['notMention']]

[(5, 6, 7), ['notPresent']]
[(5, 6, 7), ['notMention']]

[(6, 5, 6), ['notPresent']]
[(6, 5, 6), ['notMention']]

[(8, 11, 12), ['notPresent']]
[(8, 11, 12), ['Leonard']]

[(8, 19, 21), ['notPresent']]
[(8, 19, 21), ['4_2_3']]

s07e17c12t
[(0, 16, 17), ['Emily']]
[(0, 16, 17), ['notPresent']]

[(0, 17, 18), ['0_11_12']]
[(0, 17, 18), ['Raj']]

[(0, 21, 22), ['Emily']]
[(0, 21, 22), ['notPresent']]

[(0, 25, 26), ['0_11_12']]
[(0, 25, 26), ['Raj']]

[(0, 27, 28), ['0_21_22']]
[

In [59]:
# Summary of the pairwise comparison: disagreements, agreement rate with the
# agreeing count, total pairs, and the share of pairs counted in union_num
# (union_num is only incremented for disagreeing pairs).
agreed_num = all_num - disagreed_num
print(disagreed_num)
print(agreed_num / all_num, agreed_num)
print(all_num)
print(union_num / all_num)

460
0.7377423033067275 1294
1754
0.9908779931584949
