In [67]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
from tqdm import tqdm
import benepar

In [68]:
# Three spaCy pipelines are used side by side; their noun chunks and pronouns
# are combined later in the notebook.

# Plain spaCy pipelines (small and transformer English models).
sm_parser = spacy.load("en_core_web_sm")
trf_parser = spacy.load("en_core_web_trf")

# Medium English model with the benepar component (benepar_en3) appended.
berkeley_parser = spacy.load("en_core_web_md")
berkeley_parser.add_pipe("benepar", config={"model": "benepar_en3"})

# Disabled alternative parser:
# stanza_parser = spacy_stanza.load_pipeline('en')

In [24]:
# Load Parsed Corpus
# `sm_parser` is already created in the parser-loading cell above, so the
# model is not loaded a second time here.
#
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only use it on files from a trusted source.

# Keys present in both translation pickles; later cells use these keys to
# index the parsed corpora loaded below.
with open('tbbt_en_zh.pkl', 'rb') as f_zh, open('tbbt_en_fa.pkl', 'rb') as f_fa:
    zh = pkl.load(f_zh)
    fa = pkl.load(f_fa)
inter_keys = set(zh.keys()) & set(fa.keys())

# `nps` and `tags` are indexed the same way downstream: [key][scene][utterance].
with open('parsed_corpus.pkl', 'rb') as f:
    nps = pkl.load(f)
with open('parsed_corpus_all.pkl', 'rb') as f:
    tags = pkl.load(f)

In [65]:
# Show the first utterance record of the first scene stored under key (1, 1).
print(nps[(1,1)][0][0])

{'utterance': ' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'speaker': 'Sheldon', 'en_subtitles': ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", 'it will not have gone through both slits.'], 'zh_subtitles': ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝'], 'sm_noun_chunk': [('a photon', 1, 3), ('a plane', 6, 8), ('two slits', 9, 11), ('it', 12, 13), ('it', 17, 18), ('both', 22, 23), ('it', 27, 28), ('it', 31, 32), ('it', 35, 36), ('the plane', 37, 39), ('it', 41, 42), ('its target', 43, 45), ('it', 46, 47), ('both slits', 52, 54)], 'sm_pron': [(

In [54]:
# Merge
def merge_maximum_span(spans):
    """Reduce ``spans`` to its maximal spans by dropping every nested span.

    A span is dropped when some other span covers it, i.e. the other span
    starts at or before it and ends at or after it. Spans that merely overlap
    are both kept.

    Spans covering exactly the same token range count as one span: the first
    of them in input order is kept. (The previous implementation treated each
    of them as nested in the other and removed all of them, so duplicated
    spans vanished from the result.)

    Args:
        spans: list of tuples whose items 1 and 2 are the start and end token
            indices (item 0 is the span text). The list is modified in place.

    Returns:
        The same list object, holding only the maximal spans, ordered by
        start index.
    """
    # Earliest start first and, for equal starts, longest span first: every
    # span that could cover the current one has then already been visited.
    # The sort is stable, so equal ranges keep their input order.
    ordered = sorted(spans, key=lambda span: (span[1], -span[2]))

    kept = []
    furthest_end = None
    for span in ordered:
        # All earlier spans start at or before this one, so it is nested
        # exactly when it does not reach past the furthest end seen so far.
        if furthest_end is None or span[2] > furthest_end:
            kept.append(span)
            furthest_end = span[2]

    # Keep the original contract: mutate the caller's list and return it.
    spans[:] = kept
    return spans

In [77]:
# Collect Data
# Builds one record per scene of the target episode and writes the records
# to a one-column CSV, one JSON document per row.
TARGET_EPISODE = (1, 8)
OUTPUT_CSV = 'sample_annotate_epi_1_8_new.csv'


def _mention_spans(doc):
    """Return ``(noun_chunks, pronouns)`` of a parsed doc.

    Both are lists of ``(text, start, end)`` tuples with token indices;
    pronouns are the single tokens whose coarse POS tag is ``PRON``.
    """
    noun_chunks = [(chunk.text, chunk.start, chunk.end) for chunk in doc.noun_chunks]
    pronouns = [(token.text, idx, idx + 1) for idx, token in enumerate(doc) if token.pos_ == "PRON"]
    return noun_chunks, pronouns


def _build_scene_record(scene, scene_tag):
    """Turn one scene (a list of utterance dicts) into an output record.

    Every utterance becomes the token list ``speaker tokens + [":"] +
    sentence tokens``; every mention span is shifted by the length of that
    speaker prefix so it indexes into the combined token list.

    Utterances without ``en_subtitles`` are parsed here and the parse results
    are stored on the utterance dict itself (the dict is mutated).
    """
    all_sentences = []
    all_query_spans = []

    # zip stops at the shorter of the two lists; the tag entry itself is not
    # used, it only bounds the iteration exactly as before.
    for sentence_idx, (utt, _tag) in enumerate(zip(scene, scene_tag)):
        speaker = utt['speaker'].strip().strip("(").strip(")").strip().strip(".").strip().strip(":")
        speaker_tokens = [token.text for token in sm_parser(speaker)]

        needs_parsing = "en_subtitles" not in utt
        if needs_parsing:
            sentence = utt['utterance']
        else:
            sentence = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])

        # Parse once per parser and reuse the doc for chunks, pronouns and tokens.
        sm_doc = sm_parser(sentence)
        if needs_parsing:
            utt['sm_noun_chunk'], utt['sm_pron'] = _mention_spans(sm_doc)
            utt['berkeley_noun_chunk'], utt['berkeley_pron'] = _mention_spans(berkeley_parser(sentence))
            utt['trf_noun_chunk'], utt['trf_pron'] = _mention_spans(trf_parser(sentence))
        sentence_token = [token.text for token in sm_doc]

        # Union the three parsers' spans, then keep only the maximal ones.
        noun_phrase = merge_maximum_span(list(set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk'])))
        pron = merge_maximum_span(list(set(utt['sm_pron']) | set(utt['berkeley_pron']) | set(utt['trf_pron'])))
        # Sort on (start, end): sorting on start alone left spans sharing a
        # start index in set-iteration order, which differs between runs.
        mention = sorted(set(noun_phrase) | set(pron), key=lambda span: (span[1], span[2]))

        all_sentences.append(speaker_tokens + [":"] + sentence_token)
        offset = len(speaker_tokens) + 1  # speaker tokens plus the ":" token
        for span in mention:
            all_query_spans.append({
                "sentenceIndex": sentence_idx,
                "startToken": span[1] + offset,
                "endToken": span[2] + offset,
            })

    # The same span list is emitted under all three keys.
    return {
        "sentences": all_sentences,
        "querySpans": all_query_spans,
        "candidateSpans": all_query_spans,
        "clickSpans": all_query_spans,
    }


output = []
for epi_key in inter_keys:
    if epi_key != TARGET_EPISODE:
        continue
    for scene_idx, scene in enumerate(nps[epi_key]):
        output.append(_build_scene_record(scene, tags[epi_key][scene_idx]))

# newline="" lets the csv writer control line endings itself, so the explicit
# lineterminator is written unchanged on every platform.
with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as csv_fh:
    writer = csv.DictWriter(csv_fh, fieldnames=['json_data'], lineterminator='\n')
    writer.writeheader()
    for record in output:
        writer.writerow({'json_data': json.dumps(record)})

In [25]:
# Dump every field of each utterance in the first scene under key (1, 1) of
# `tags`, followed by the field's length, with a ruler between utterances.
ruler = "==" * 50
for record in tags[(1, 1)][0]:
    for field, value in record.items():
        print(field, ":", value, len(value))
    print(ruler)

utterance :  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits. 280
speaker : Sheldon 7
en_subtitles : ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", 'it will not have gone through both slits.'] 4
zh_subtitles : ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝'] 4
sm_pron : [('If', 0, 1, 'SCONJ', 'IN'), ('a', 1, 2, 'DET', 'DT'), ('photon', 2, 3, 'NOUN', 'NN'), ('is', 3, 4, 'AUX', 'VBZ'), ('directed', 4, 5, 'VERB', 'VBN'), ('through', 5, 6, 'ADP', 'IN'), ('a', 6, 7, 'DET', 'DT'), ('plane', 7, 8, 'NOUN', 'NN'), ('with', 8, 9, 'ADP', 'IN'), ('two', 9, 10, 'NU

In [None]:


# Restrict the parsed corpus to the keys listed in `inter_keys`.
data = {}
with open('parsed_corpus.pkl', 'rb') as f:
    parsed = pkl.load(f)
    data.update((key, parsed[key]) for key in inter_keys)

In [6]:
# First scene stored under key (1, 1) of the filtered corpus.
temp = data[(1,1)][0]

In [11]:
# Dump every field of each utterance in `temp`, followed by the field's
# length, with a ruler between utterances.
ruler = "==" * 50
for record in temp:
    for field, value in record.items():
        print(field, ":", value, len(value))
    print(ruler)

utterance :  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits. 280
speaker : Sheldon 7
en_subtitles : ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", 'it will not have gone through both slits.'] 4
zh_subtitles : ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝'] 4
sm_noun_chunk : [('a photon', 1, 3), ('a plane', 6, 8), ('two slits', 9, 11), ('it', 12, 13), ('it', 17, 18), ('both', 22, 23), ('it', 27, 28), ('it', 31, 32), ('it', 35, 36), ('the plane', 37, 39), ('it', 41, 42), ('its target', 43, 45), ('it', 46, 47), ('both slits', 52, 54)] 14
sm_pron : [('it

In [20]:
# Load 'parsed_corpus_small.pkl'; this rebinds `parsed`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only use it on files from a trusted source.
with open('parsed_corpus_small.pkl', 'rb') as f:
    parsed = pkl.load(f)

In [21]:
# Dump every field of each utterance in the first scene under key (1, 1) of
# `parsed`, with a ruler between utterances.
temp = parsed[(1, 1)][0]
ruler = '==' * 50
for record in temp:
    for field, value in record.items():
        print(field, ":", value)
    print(ruler)

utterance :  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.
speaker : Sheldon
en_subtitles : ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", 'it will not have gone through both slits.']
zh_subtitles : ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝']
sm_pron : [('If', 0, 1, 'SCONJ', 'IN'), ('a', 1, 2, 'DET', 'DT'), ('photon', 2, 3, 'NOUN', 'NN'), ('is', 3, 4, 'AUX', 'VBZ'), ('directed', 4, 5, 'VERB', 'VBN'), ('through', 5, 6, 'ADP', 'IN'), ('a', 6, 7, 'DET', 'DT'), ('plane', 7, 8, 'NOUN', 'NN'), ('with', 8, 9, 'ADP', 'IN'), ('two', 9, 10, 'NUM', 'CD'),