In [107]:
import pickle as pkl
import spacy
import stanza
import csv
import json
from copy import deepcopy

In [2]:
with open('tbbt_en_zh.pkl', 'rb') as f:
    data = pkl.load(f)

In [3]:
scenes = data[(1,1)]
for scene in scenes:
    print(scene)

[{'utterance': ' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'speaker': 'Sheldon', 'en_subtitles': ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", '- ...it will not have gone through both slits.'], 'zh_subtitles': ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝']}, {'utterance': ' Agreed, what’s your point?', 'speaker': 'Leonard', 'en_subtitles': ["- Agreed. What's your point?"], 'zh_subtitles': ['没错 但你为什么要说这个?']}, {'utterance': ' There’s no point, I just think it’s a good idea for a tee-shirt. ', 'speaker': 'Sheldon', 'en_subtitles': ["There's no

## Perform Parsing

In [4]:
# Load Data
scene = data[(1,1)][0]

In [5]:
for x in scene:
    print(x)

{'utterance': ' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'speaker': 'Sheldon', 'en_subtitles': ['If a photon is directed through a plane with two slits in it and either is observed it will not go through both.', 'If unobserved, it will.', "If it's observed after it left the plane, before it hits its target...", '- ...it will not have gone through both slits.'], 'zh_subtitles': ['将光子正对平面上的双缝 观察任意一个隙缝 它不会穿过那两个隙缝', '如果没被观察 那就会', '总之 如果观察它在离开平面到击中目标之前', '它就不会穿过那两个隙缝']}
{'utterance': ' Agreed, what’s your point?', 'speaker': 'Leonard', 'en_subtitles': ["- Agreed. What's your point?"], 'zh_subtitles': ['没错 但你为什么要说这个?']}
{'utterance': ' There’s no point, I just think it’s a good idea for a tee-shirt. ', 'speaker': 'Sheldon', 'en_subtitles': ["There's no po

In [105]:
parser = spacy.load("en_core_web_trf")

In [115]:
parser = spacy.load('en_core_web_sm')

In [116]:
all_sentences = []
all_speakers = []
all_query_spans = []
all_candidate_spans = []

for i, utt in enumerate(scene):
    speaker = utt['speaker']
    transcript = speaker + " : " + utt['utterance']
    doc = parser(transcript)

    # Load tokens and build instance
    sent_tokens = []
    for j, item in enumerate(doc):
        sent_tokens.append(str(item))
    all_sentences.append(sent_tokens)
    all_speakers.append(speaker)

    # Fetch Noun
    noun = set()
    for j, item in enumerate(doc):
        pos = item.pos_
        if pos=="NOUN":
            noun.add((i, j, j+1))
            # print(j, str(item), pos, sent_tokens[j: j+1], noun_pron)
    noun.add((j, 0, 1))

    # Fetch Prons
    pron = set()
    for j, item in enumerate(doc):
        pos = item.pos_
        if pos=="PRON":
            pron.add((i, j, j+1))
            # print(j, str(item), pos, sent_tokens[j: j+1], noun_pron)

    # Check Noun Phrase
    noun_phrases = set()
    for item in doc.noun_chunks:
        noun_phrases.add((i, item.start, item.end))
        # print(item, item.start, item.end, sent_tokens[item.start: item.end], noun_phrases)

    # Add into Query Spans (Noun Phrases + Prons)
    query_source = noun_phrases | pron
    for (sentenceIndex, startToken, endToken) in query_source:
        span = {
            "sentenceIndex": sentenceIndex,
            "startToken": startToken,
            "endToken": endToken
        }
        all_query_spans.append(span)

    # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
    candidate_source = query_source | noun
    for (sentenceIndex, startToken, endToken) in query_source:
        span = {
            "sentenceIndex": sentenceIndex,
            "startToken": startToken,
            "endToken": endToken
        }
        all_candidate_spans.append(span)

output = []
for sent in all_sentences:
    sent.append("\n")
temp = {
    "sentences": all_sentences,
    "querySpans": all_query_spans,
    "candidateSpans": all_candidate_spans
}
output.append(temp)

In [108]:
trf = deepcopy(all_candidate_spans)

In [111]:
lg = deepcopy(all_candidate_spans)

In [114]:
md = deepcopy(all_candidate_spans)

In [117]:
sm = deepcopy(all_candidate_spans)

In [129]:
spans = [trf, lg, md, sm]
for x in spans:
    print(len(x))

120
129
131
134


In [128]:
for x in spans:
    temp = {}
    for item in x:
        sentenceIndex = item['sentenceIndex']
        startToken = item['startToken']
        endToken = item['endToken']
        if sentenceIndex not in temp:
            temp[sentenceIndex] = [" ".join(all_sentences[sentenceIndex][startToken: endToken])]
        else:
            temp[sentenceIndex].append(" ".join(all_sentences[sentenceIndex][startToken: endToken]))
    print(temp[0])
print(" ".join(all_sentences[0]))

['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'Sheldon', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'Sheldon', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
Sheldon :   So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits . If it ’s unobserved it will , however , if it ’s observed after it ’s left the plane but before it hits its target , it will not have gone through both slits . 



In [127]:
for x in spans:
    temp = {}
    for item in x:
        sentenceIndex = item['sentenceIndex']
        startToken = item['startToken']
        endToken = item['endToken']
        if sentenceIndex not in temp:
            temp[sentenceIndex] = [" ".join(all_sentences[sentenceIndex][startToken: endToken])]
        else:
            temp[sentenceIndex].append(" ".join(all_sentences[sentenceIndex][startToken: endToken]))
    print(temp[1])
print(" ".join(all_sentences[1]))

['what', 'your point', 'your']
['what', 'your point', 'your']
['what', 'your point', 'your']
['what', 'your', 'Leonard', 'your point', 'Agreed']
Leonard :   Agreed , what ’s your point ? 



In [104]:
with open('test_lg.csv', "w", encoding="utf-8") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

In [22]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, constituency')

2022-02-28 10:59:02 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2022-02-28 10:59:02 INFO: Use device: cpu
2022-02-28 10:59:02 INFO: Loading: tokenize
2022-02-28 10:59:02 INFO: Loading: pos
2022-02-28 10:59:02 INFO: Loading: constituency
2022-02-28 10:59:02 INFO: Done loading processors!
