In [119]:
import csv
import json
from copy import deepcopy
import pickle as pkl
# Set the csv parser's maximum allowed field size to 1131072.
# NOTE(review): presumably needed because the JSON columns read later exceed the default limit — confirm.
csv.field_size_limit(1131072)

1131072

In [120]:
from annotation.analysis.utils.data_util import read_turkle_annotation_multiple_scene
from annotation.analysis.utils.data_util import read_annotation
from annotation.analysis.utils.data_util import gather_by_scene

In [121]:
from annotation.analysis.utils.data_util import generate_all_clusters_combine_speakers
from annotation.analysis.utils.data_util import generate_clusters_no_plural_combine_speakers
from annotation.analysis.utils.data_util import collect_mentions


def _flatten_sentence(item):
    """
    Return a sentence as one flat token list.

    When the first element of ``item`` is itself a list, its contents are
    spliced in ahead of the remaining elements and a new list is returned.
    Anything else (including an empty sentence) is returned unchanged.
    """
    if item and isinstance(item[0], list):
        flat = list(item[0])
        flat.extend(item[1:])
        return flat
    return item


def read_turkle_annotation_multiple_scene(path):
    """
    Load the annotation_result along with the document
    Output: sentence along with all annotations

    In this version, the input contains annotations from multiple scenes
    Different scenes could be split with sentence_offsets and query_offsets

    path: CSV file whose rows provide the columns "Input.json_data"
        (JSON with the keys 'sentences', 'querySpans', 'sentence_offsets',
        'querySpans_offsets' and 'scene_ids'), "Answer.answer_spans" (JSON list)
        and "WorkerId".

    return: a list holding one dict per (CSV row, scene) with the keys
        "sentences", "query_spans", "answer_spans", "WorkerId", "scene_id",
        "clusters_all", "clusters_no_plural" and "answers". Every
        'sentenceIndex' inside the query and answer spans is rebased so that
        it counts from the first sentence of its own scene.
    """
    output = []
    # The csv module requires newline='' so that line breaks embedded in quoted
    # fields reach the parser untouched instead of being translated by the
    # text layer.
    with open(path, 'r', newline='') as f:
        for instance in csv.DictReader(f):
            inputs = json.loads(instance["Input.json_data"])
            all_sentences = inputs['sentences']
            all_query_spans = inputs['querySpans']
            sentence_offsets = inputs['sentence_offsets']
            query_spans_offsets = inputs['querySpans_offsets']
            all_scene_ids = inputs['scene_ids']
            all_answer_spans = json.loads(instance["Answer.answer_spans"])

            # Consecutive offsets delimit one scene: [offsets[i], offsets[i + 1]).
            for i in range(len(sentence_offsets) - 1):
                sent_start, sent_end = sentence_offsets[i], sentence_offsets[i + 1]
                query_start, query_end = query_spans_offsets[i], query_spans_offsets[i + 1]

                sentences = [
                    _flatten_sentence(item)
                    for item in all_sentences[sent_start: sent_end]
                ]

                # Rebase sentence indices from row-global to scene-local.
                query_spans = all_query_spans[query_start: query_end]
                for item in query_spans:
                    item['sentenceIndex'] -= sent_start

                answer_spans = all_answer_spans[query_start: query_end]
                for item in answer_spans:
                    item['querySpan']['sentenceIndex'] -= sent_start
                    for x in item['span_list']:
                        # Spans with both token bounds at -1 are left untouched.
                        if x['startToken'] == -1 and x['endToken'] == -1:
                            continue
                        x['sentenceIndex'] -= sent_start

                temp = {
                    "sentences": sentences,
                    "query_spans": query_spans,
                    "answer_spans": answer_spans,
                    "WorkerId": instance['WorkerId'],
                    "scene_id": all_scene_ids[i],
                }
                # Collect mentions into clusters
                temp['clusters_all'] = generate_all_clusters_combine_speakers(temp)
                temp['clusters_no_plural'] = generate_clusters_no_plural_combine_speakers(temp)

                # Collect mentions (For Kappa Cohen)
                temp["answers"] = collect_mentions(temp)

                # Add to output
                output.append(temp)

    return output

def gather_by_scene(annotations):
    """
    Bucket annotation records by the scene they belong to.

    annotations: iterable of dict-like records, each with a 'scene_id' entry
    return: {scene_id: [records sharing that scene_id, in input order]}
    """
    grouped = {}
    for record in annotations:
        # setdefault creates the bucket the first time a scene id shows up.
        grouped.setdefault(record['scene_id'], []).append(record)
    return grouped

In [67]:
# Parse the Turkle CSV export; the reader yields one record per scene of every CSV row.
data = read_turkle_annotation_multiple_scene('data/coe_bulk.csv')

In [96]:
# Dump every field of the first record: name, value, then a blank separator line.
sample = data[0]
for field_name, field_value in sample.items():
    print(field_name, field_value, '', sep='\n')

sentences
[['Raj', ':', 'Hey', ',', 'you', 'busy', '?'], ['Amy', ':', 'No', '.', 'What', "'s", 'up', '?'], ['Raj', ':', 'Have', 'you', 'heard', 'back', 'from', 'Emily', '?'], ['Amy', ':', 'I', 'have', '.'], ['Raj', ':', 'Great', '!', 'And', '?'], ['Amy', ':', 'And', 'I', "'m", 'afraid', 'she', 'does', "n't", 'think', 'you', "'re", 'right', 'for', 'her', '.'], ['Raj', ':', 'I', 'give', 'you', 'one', 'simple', 'thing', 'to', 'do--', 'contact', 'a', 'complete', 'stranger', 'and', 'make', 'her', 'fall', 'in', 'love', 'with', 'me--', 'and', 'you', 'blow', 'it', '!'], ['Amy', ':', 'I', 'told', 'her', 'what', 'a', 'good', 'guy', 'you', 'are', ',', 'but', 'she', 'thought', 'it', 'was', 'a', 'bad', 'sign', 'that', 'you', 'did', "n't", 'write', 'to', 'her', 'yourself', '.', 'She', 'thought', 'it', 'made', 'you', 'seem', 'too', 'shy', 'and', 'passive', '.'], ['Raj', ':', 'I', "'m", 'not', 'too', 'shy', 'and', 'passive', '.', 'You', 'write', 'her', 'back', 'and', 'tell', 'her', 'I', 'said', 'that'

In [118]:
data = read_turkle_annotation_multiple_scene('data/coe_bulk.csv')

# Replace every answer span that covers exactly the first token of a sentence
# (start == 0, end == 1) with that token itself, so such answers are stored as
# the token text rather than as a (sentence, start, end) position.
# NOTE(review): the first token looks like the speaker name in the sample
# output — confirm this holds for every scene.
for item in data:
    num_sentences = len(item['sentences'])
    saw_span_answers = False
    for entry in item['answers']:
        answers = entry[1]
        # Non-list answers (plain labels) are left as they are.
        if isinstance(answers, list):
            saw_span_answers = True
            for j, answer in enumerate(answers):
                if answer[1] == 0 and answer[2] == 1 and answer[0] < num_sentences:
                    print(answer, entry)
                    answers[j] = item['sentences'][answer[0]][0]
                    print(answers[j])
                    print()
        print()
    # Sort once, after the walk: reordering the list while it is being
    # iterated can skip or revisit entries.
    if saw_span_answers:
        item['answers'].sort()




(3, 0, 1) [(2, 3, 4), [(3, 0, 1)]]
Amy



(3, 0, 1) [(3, 2, 3), [(3, 0, 1)]]
Amy


(5, 0, 1) [(5, 3, 4), [(5, 0, 1)]]
Amy



(6, 0, 1) [(5, 10, 11), [(6, 0, 1)]]
Raj



(6, 0, 1) [(6, 2, 3), [(6, 0, 1)]]
Raj


(7, 0, 1) [(6, 4, 5), [(7, 0, 1)]]
Amy






(6, 0, 1) [(6, 21, 22), [(6, 0, 1)]]
Raj


(7, 0, 1) [(6, 23, 24), [(7, 0, 1)]]
Amy



(7, 0, 1) [(7, 2, 3), [(7, 0, 1)]]
Amy




(8, 0, 1) [(7, 9, 10), [(8, 0, 1)]]
Raj





(8, 0, 1) [(7, 21, 22), [(8, 0, 1)]]
Raj



(8, 0, 1) [(7, 27, 28), [(8, 0, 1)]]
Raj




(8, 0, 1) [(7, 33, 34), [(8, 0, 1)]]
Raj


(8, 0, 1) [(8, 2, 3), [(8, 0, 1)]]
Raj


(9, 0, 1) [(8, 10, 11), [(9, 0, 1)]]
Amy




(8, 0, 1) [(8, 17, 18), [(8, 0, 1)]]
Raj



(9, 0, 1) [(8, 21, 22), [(9, 0, 1)]]
Amy


(9, 0, 1) [(8, 25, 26), [(9, 0, 1)]]
Amy



(9, 0, 1) [(9, 4, 5), [(9, 0, 1)]]
Amy


(9, 0, 1) [(9, 8, 9), [(9, 0, 1)]]
Amy







(11, 0, 1) [(11, 2, 3), [(11, 0, 1)]]
Amy





(13, 0, 1) [(13, 5, 6), [(13, 0, 1), (2, 7, 8)]]
Amy


(13, 0, 1) [(13, 8, 9), [(13,

In [117]:
# Dump every field of the first record: name, value, then a blank separator line.
sample = data[0]
for field_name, field_value in sample.items():
    print(field_name, field_value, '', sep='\n')

sentences
[['Raj', ':', 'Hey', ',', 'you', 'busy', '?'], ['Amy', ':', 'No', '.', 'What', "'s", 'up', '?'], ['Raj', ':', 'Have', 'you', 'heard', 'back', 'from', 'Emily', '?'], ['Amy', ':', 'I', 'have', '.'], ['Raj', ':', 'Great', '!', 'And', '?'], ['Amy', ':', 'And', 'I', "'m", 'afraid', 'she', 'does', "n't", 'think', 'you', "'re", 'right', 'for', 'her', '.'], ['Raj', ':', 'I', 'give', 'you', 'one', 'simple', 'thing', 'to', 'do--', 'contact', 'a', 'complete', 'stranger', 'and', 'make', 'her', 'fall', 'in', 'love', 'with', 'me--', 'and', 'you', 'blow', 'it', '!'], ['Amy', ':', 'I', 'told', 'her', 'what', 'a', 'good', 'guy', 'you', 'are', ',', 'but', 'she', 'thought', 'it', 'was', 'a', 'bad', 'sign', 'that', 'you', 'did', "n't", 'write', 'to', 'her', 'yourself', '.', 'She', 'thought', 'it', 'made', 'you', 'seem', 'too', 'shy', 'and', 'passive', '.'], ['Raj', ':', 'I', "'m", 'not', 'too', 'shy', 'and', 'passive', '.', 'You', 'write', 'her', 'back', 'and', 'tell', 'her', 'I', 'said', 'that'

In [46]:
# Work on a shallow copy of the loaded records, then bucket them per scene.
results = list(data)
scenes = gather_by_scene(results)

## Perform Data Merging

In [47]:
# Split scenes by how many annotation records they received:
# exactly one record -> one_way, anything else -> two_way.
one_way = {}
two_way = {}
for item, scene_records in scenes.items():
    bucket = one_way if len(scene_records) == 1 else two_way
    bucket[item] = scene_records

In [49]:
# Show the scene ids that have more than one annotation record, and how many there are.
print(two_way.keys(), len(two_way))

dict_keys(['s07e17c11t', 's07e17c12t', 's07e18c00t', 's07e18c01t', 's07e18c08t', 's07e18c09t', 's07e18c10t', 's01e01c00f', 's01e01c01f', 's01e01c02f', 's01e01c03f', 's01e01c04f', 's01e01c05f', 's01e01c06f', 's01e01c07f', 's01e01c08f', 's01e01c09f', 's01e01c10f', 's01e01c11f', 's07e18c11t', 's01e01c12f', 's01e01c13f', 's01e02c00f']) 23


In [64]:
# For each multiply-annotated scene, print the first two records' answers
# side by side (paired positionally), one blank line between pairs.
for item, scene_records in two_way.items():
    print(item)
    first_record = scene_records[0]
    sentences = first_record['sentences']
    second_record = scene_records[1]
    for a, b in zip(first_record['answers'], second_record['answers']):
        print(a, b, '', sep='\n')

# count = 0
# for item in two_way:
#     print(item)
#     temp = two_way[item]
#     for a, b in zip(temp[0]['answers'], temp[1]['answers']):
#         if a!=b:
#             count += 1
#             print(a)
#             print(b)
#             print()
#     print("=="*50)

s07e17c11t
[(0, 14, 15), 'notPresent']
[(0, 14, 15), 'notMention']

[(1, 2, 3), [(0, 0, 1)]]
[(1, 2, 3), [(2, 0, 1)]]

[(1, 6, 7), [(0, 0, 1)]]
[(1, 6, 7), [(2, 0, 1)]]

[(1, 18, 19), [(0, 0, 1)]]
[(1, 18, 19), [(2, 0, 1)]]

[(1, 20, 21), 'notPresent']
[(1, 20, 21), 'notMention']

[(1, 23, 24), [(1, 20, 21)]]
[(1, 23, 24), 'notMention']

[(1, 35, 36), [(0, 0, 1)]]
[(1, 35, 36), [(2, 0, 1)]]

[(1, 40, 42), 'notPresent']
[(1, 40, 42), [(1, 37, 38)]]

[(1, 40, 41), [(0, 0, 1)]]
[(1, 40, 41), [(2, 0, 1)]]

[(2, 7, 8), [(2, 0, 1), (1, 0, 1)]]
[(2, 7, 8), [(2, 0, 1), (3, 0, 1)]]

[(2, 10, 11), [(1, 0, 1)]]
[(2, 10, 11), [(3, 0, 1)]]

[(4, 2, 3), 'notPresent']
[(4, 2, 3), [(1, 40, 42)]]

[(4, 4, 5), 'notPresent']
[(4, 4, 5), 'notMention']

[(4, 4, 6), [(4, 2, 3)]]
[(4, 4, 6), [(1, 40, 42)]]

[(4, 8, 9), [(4, 2, 3)]]
[(4, 8, 9), [(1, 40, 42)]]

[(4, 11, 12), [(3, 0, 1)]]
[(4, 11, 12), [(5, 0, 1)]]

[(4, 21, 22), [(3, 0, 1)]]
[(4, 21, 22), [(5, 0, 1)]]

[(4, 29, 30), 'notPresent']
[(4, 29, 30),

In [63]:
# NOTE(review): hard-coded counts with no visible derivation — presumably agreeing answer
# pairs over all compared pairs in `two_way`; TODO confirm and compute from the data instead.
1249/1754

0.7120866590649944

In [None]:
# Bucket the records by scene id; this repeats the call made in an earlier cell and needs `results` to exist.
scenes = gather_by_scene(results)