In [2]:
import os
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
from tqdm import tqdm
# import benepar
# import re
from string import punctuation

from utils import merge_maximum_span
from utils import clean_sentence_brackets
from utils import process_nps_punctuation
from utils import extract_scenes

In [3]:
# Show which entries are present in the batched_data/ input directory.
batched_entries = os.listdir('batched_data/')
print(batched_entries)

['test_all_large.pkl', 'train_extension_large.pkl', 'test_all_medium.pkl', 'train_all_no_extend_medium.pkl', 'dev_all_medium.pkl', 'train_all_no_extend_large.pkl', 'train_extension_medium.pkl', 'dev_all_large.pkl']


In [30]:
def combine_multiple_scenes(scenes):
    """Merge several scene records into a single combined record.

    Args:
        scenes: iterable of dicts, each providing 'sentences', 'querySpans',
            'candidateSpans', 'clickSpans' and 'scene_ids'. Every span is a
            dict with a 'sentenceIndex' entry.

    Returns:
        A dict with the concatenated 'sentences', the three span lists (each
        span deep-copied and its 'sentenceIndex' shifted by the number of
        sentences collected before its scene), the concatenated 'scene_ids',
        and two running-total lists that start at 0 and gain one entry per
        scene: 'sentence_offsets' (sentence count so far) and
        'querySpans_offsets' (query-span count so far).
    """
    span_keys = ("querySpans", "candidateSpans", "clickSpans")

    def _rebased(spans, shift):
        # Work on copies so the caller's scene data is never modified.
        moved = []
        for span in spans:
            clone = deepcopy(span)
            clone['sentenceIndex'] = span['sentenceIndex'] + shift
            moved.append(clone)
        return moved

    merged = {
        "sentences": [],
        "querySpans": [],
        "candidateSpans": [],
        "clickSpans": [],
        "sentence_offsets": [0],
        "querySpans_offsets": [0],
        "scene_ids": [],
    }

    for scene in scenes:
        # Sentences gathered so far determine where this scene's indices start.
        shift = len(merged["sentences"])
        merged["sentences"].extend(scene['sentences'])
        for key in span_keys:
            merged[key].extend(_rebased(scene[key], shift))
        merged["sentence_offsets"].append(len(merged["sentences"]))
        merged["querySpans_offsets"].append(len(merged["querySpans"]))
        merged["scene_ids"].extend(scene['scene_ids'])

    return merged

In [36]:
def extract_annotation_fields(scene):
    """Pick the annotation-relevant entries out of a scene record.

    The 'sentences', 'querySpans', 'candidateSpans' and 'clickSpans' values
    are passed through as the same objects (no copy), and the scene's single
    'scene_id' is wrapped in a one-element list stored under 'scene_ids'.
    """
    passthrough_keys = ('sentences', 'querySpans', 'candidateSpans', 'clickSpans')
    fields = {key: scene[key] for key in passthrough_keys}
    fields['scene_ids'] = [scene['scene_id']]
    return fields

In [41]:
# Generate Single Scene
# For every pickle file in input_root, write a CSV of the same base name with
# one row per scene; each row stores the scene's annotation fields as a JSON
# string in the `json_data` column.
input_root = "batched_data/"
output_root = "annotation_input/single_scene/"
# Make sure the destination exists so the first write does not fail.
os.makedirs(output_root, exist_ok=True)
for file_name in os.listdir(input_root):
    # splitext keeps dotted base names intact and lets us skip anything that
    # is not a pickle file instead of trying to open a made-up "<name>.pkl".
    field, ext = os.path.splitext(file_name)
    if ext != ".pkl":
        continue
    # NOTE: pickle.load can execute arbitrary code; only use trusted files here.
    with open(os.path.join(input_root, file_name), 'rb') as f:
        data = pkl.load(f)
    # The file is iterated as batches, each batch as a sequence of scenes.
    output = []
    for batch in data:
        for scene in batch:
            output.append(extract_annotation_fields(scene))
    # newline='' hands line-ending control to the csv writer, so rows end in
    # '\n' on every platform as requested by lineterminator.
    with open(os.path.join(output_root, field + ".csv"), "w", encoding="utf-8", newline="") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

In [42]:
# Generate Multiple Scene
# For every pickle file in input_root, write a CSV of the same base name with
# one row per batch; the scenes of a batch are merged into a single record
# that is stored as a JSON string in the `json_data` column.
input_root = "batched_data/"
output_root = "annotation_input/multiple_scene/"
# Make sure the destination exists so the first write does not fail.
os.makedirs(output_root, exist_ok=True)
for file_name in os.listdir(input_root):
    # splitext keeps dotted base names intact and lets us skip anything that
    # is not a pickle file instead of trying to open a made-up "<name>.pkl".
    field, ext = os.path.splitext(file_name)
    if ext != ".pkl":
        continue
    # NOTE: pickle.load can execute arbitrary code; only use trusted files here.
    with open(os.path.join(input_root, file_name), 'rb') as f:
        data = pkl.load(f)
    # The file is iterated as batches, each batch as a sequence of scenes.
    output = []
    for batch in data:
        temp_batch = []
        for scene in batch:
            temp_batch.append(extract_annotation_fields(scene))
        output.append(combine_multiple_scenes(temp_batch))
    # newline='' hands line-ending control to the csv writer, so rows end in
    # '\n' on every platform as requested by lineterminator.
    with open(os.path.join(output_root, field + ".csv"), "w", encoding="utf-8", newline="") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

In [49]:
# Generate Single Scene Demo
# Write demo.csv from at most the first three scenes of the first batch in
# dev_all_medium.pkl, one scene per row (JSON string in `json_data`).
input_root = "batched_data/"
output_root = "annotation_input/single_scene/"
file_name = "dev_all_medium.pkl"
# Address the one required file directly rather than scanning the directory,
# and say so when it is absent instead of silently producing nothing.
if not os.path.isfile(os.path.join(input_root, file_name)):
    print("Demo source not found, nothing written:", os.path.join(input_root, file_name))
else:
    # Make sure the destination exists so the write does not fail.
    os.makedirs(output_root, exist_ok=True)
    # NOTE: pickle.load can execute arbitrary code; only use trusted files here.
    with open(os.path.join(input_root, file_name), 'rb') as f:
        # Keep a single batch holding the leading scenes of the first batch.
        data = [pkl.load(f)[0][:3]]
    print(len(data), len(data[0]))
    output = []
    for batch in data:
        for scene in batch:
            output.append(extract_annotation_fields(scene))
    # newline='' hands line-ending control to the csv writer, so rows end in
    # '\n' on every platform as requested by lineterminator.
    with open(os.path.join(output_root, "demo" + ".csv"), "w", encoding="utf-8", newline="") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

# Generate Multiple Scene Demo
# Write demo.csv from at most the first three scenes of the first batch in
# dev_all_medium.pkl, merged into one combined record (JSON string in
# `json_data`).
input_root = "batched_data/"
output_root = "annotation_input/multiple_scene/"
file_name = "dev_all_medium.pkl"
# Address the one required file directly rather than scanning the directory,
# and say so when it is absent instead of silently producing nothing.
if not os.path.isfile(os.path.join(input_root, file_name)):
    print("Demo source not found, nothing written:", os.path.join(input_root, file_name))
else:
    # Make sure the destination exists so the write does not fail.
    os.makedirs(output_root, exist_ok=True)
    # NOTE: pickle.load can execute arbitrary code; only use trusted files here.
    with open(os.path.join(input_root, file_name), 'rb') as f:
        # Keep a single batch holding the leading scenes of the first batch.
        data = [pkl.load(f)[0][:3]]
    output = []
    for batch in data:
        temp_batch = []
        for scene in batch:
            temp_batch.append(extract_annotation_fields(scene))
        output.append(combine_multiple_scenes(temp_batch))
    # newline='' hands line-ending control to the csv writer, so rows end in
    # '\n' on every platform as requested by lineterminator.
    with open(os.path.join(output_root, "demo" + ".csv"), "w", encoding="utf-8", newline="") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

1 3
