In [1]:
import pickle as pkl
import spacy
import stanza
import csv
import json
from copy import deepcopy

In [2]:
with open('tbbt_en_zh.pkl', 'rb') as f:
    data = pkl.load(f)

In [9]:
scenes = data[(1,1)]
for i, scene in enumerate(scenes):
    print(i)
    for item in scene:
        print(item['utterance'])
    print("=="*50)

0
 So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.
 Agreed, what’s your point?
 There’s no point, I just think it’s a good idea for a tee-shirt. 
 Excuse me?
 Hang on. 
 One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. 
 Can I help you?
 Yes. Um, is this the High IQ sperm bank?
 If you have to ask, maybe you shouldn’t be here.
 I think this is the place.
 Fill these out.
 Thank-you. We’ll be right back.
 Oh, take your time. I’ll just finish my crossword puzzle. Oh wait.
 Leonard, I don’t think I can do this.
 What, are you kidding? You’re a semi-pro. 
 No. We are committing genetic fraud. Th

## Perform Parsing

In [15]:
# Load Data
scene = data[(1,1)][6]

In [16]:
for x in scene:
    print(x)

{'utterance': ' I’ll do the talking.', 'speaker': 'buzzer)', 'en_subtitles': ["- I'll do the talking."], 'zh_subtitles': ['我来跟他理论']}
{'utterance': ' Yeah.', 'speaker': 'buzzer'}
{'utterance': ' Hi, I’m Leonard, this is Sheldon.', 'speaker': 'Leonard', 'en_subtitles': ["- Hi. I'm Leonard, this is Sheldon."], 'zh_subtitles': ['嗨 我是Leonard 他是Sheldon']}
{'utterance': ' Hello.', 'speaker': 'Sheldon'}
{'utterance': ' What did I just…. Uh, we’re here to pick up Penny’s TV.', 'speaker': 'Leonard', 'en_subtitles': ['What did I just...?', "- We're here to pick up Penny's TV."], 'zh_subtitles': ['- 我...', '我们来拿Penny的电视']}
{'utterance': ' Get lost.', 'speaker': 'Voice', 'en_subtitles': ['- Get lost.'], 'zh_subtitles': ['滚']}
{'utterance': ' Okay, thanks for your time.', 'speaker': 'Sheldon', 'en_subtitles': ['Okay, thanks for your time.'], 'zh_subtitles': ['好的 打搅你了']}
{'utterance': ' We’re not going to give up just like that.', 'speaker': 'Leonard', 'en_subtitles': ["- We're not gonna give up just

In [17]:
parser = spacy.load("en_core_web_trf")

In [12]:
parser = spacy.load('en_core_web_sm')

In [18]:
all_sentences = []
all_speakers = []
all_query_spans = []
all_candidate_spans = []

for i, utt in enumerate(scene):
    speaker = utt['speaker']
    transcript = speaker + " : " + utt['utterance']
    doc = parser(transcript)

    # Load tokens and build instance
    sent_tokens = []
    for j, item in enumerate(doc):
        sent_tokens.append(str(item))
    all_sentences.append(sent_tokens)
    all_speakers.append(speaker)

    # Fetch Noun
    noun = set()
    for j, item in enumerate(doc):
        pos = item.pos_
        if pos=="NOUN":
            noun.add((i, j, j+1))
            # print(j, str(item), pos, sent_tokens[j: j+1], noun_pron)
    noun.add((j, 0, 1))

    # Fetch Prons
    pron = set()
    for j, item in enumerate(doc):
        pos = item.pos_
        if pos=="PRON":
            pron.add((i, j, j+1))
            # print(j, str(item), pos, sent_tokens[j: j+1], noun_pron)

    # Check Noun Phrase
    noun_phrases = set()
    for item in doc.noun_chunks:
        noun_phrases.add((i, item.start, item.end))
        # print(item, item.start, item.end, sent_tokens[item.start: item.end], noun_phrases)

    # Add into Query Spans (Noun Phrases + Prons)
    query_source = noun_phrases | pron
    for (sentenceIndex, startToken, endToken) in query_source:
        span = {
            "sentenceIndex": sentenceIndex,
            "startToken": startToken,
            "endToken": endToken
        }
        all_query_spans.append(span)

    # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
    candidate_source = query_source | noun
    for (sentenceIndex, startToken, endToken) in query_source:
        span = {
            "sentenceIndex": sentenceIndex,
            "startToken": startToken,
            "endToken": endToken
        }
        all_candidate_spans.append(span)

output = []
for sent in all_sentences:
    sent.append("\n")
temp = {
    "sentences": all_sentences,
    "querySpans": all_query_spans,
    "candidateSpans": all_candidate_spans
}
output.append(temp)



In [108]:
trf = deepcopy(all_candidate_spans)

In [111]:
lg = deepcopy(all_candidate_spans)

In [114]:
md = deepcopy(all_candidate_spans)

In [117]:
sm = deepcopy(all_candidate_spans)

In [129]:
spans = [trf, lg, md, sm]
for x in spans:
    print(len(x))

120
129
131
134


In [128]:
for x in spans:
    temp = {}
    for item in x:
        sentenceIndex = item['sentenceIndex']
        startToken = item['startToken']
        endToken = item['endToken']
        if sentenceIndex not in temp:
            temp[sentenceIndex] = [" ".join(all_sentences[sentenceIndex][startToken: endToken])]
        else:
            temp[sentenceIndex].append(" ".join(all_sentences[sentenceIndex][startToken: endToken]))
    print(temp[0])
print(" ".join(all_sentences[0]))

['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'Sheldon', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
['a photon', 'it', 'it', 'both slits', 'it', 'its', 'two slits', 'it', 'the plane', 'Sheldon', 'it', 'it', 'either slit', 'its target', 'both slits', 'it', 'it', 'a plane']
Sheldon :   So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits . If it ’s unobserved it will , however , if it ’s observed after it ’s left the plane but before it hits its target , it will not have gone through both slits . 



In [127]:
for x in spans:
    temp = {}
    for item in x:
        sentenceIndex = item['sentenceIndex']
        startToken = item['startToken']
        endToken = item['endToken']
        if sentenceIndex not in temp:
            temp[sentenceIndex] = [" ".join(all_sentences[sentenceIndex][startToken: endToken])]
        else:
            temp[sentenceIndex].append(" ".join(all_sentences[sentenceIndex][startToken: endToken]))
    print(temp[1])
print(" ".join(all_sentences[1]))

['what', 'your point', 'your']
['what', 'your point', 'your']
['what', 'your point', 'your']
['what', 'your', 'Leonard', 'your point', 'Agreed']
Leonard :   Agreed , what ’s your point ? 



In [19]:
with open('test_short.csv', "w", encoding="utf-8") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

In [22]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, constituency')

2022-02-28 10:59:02 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2022-02-28 10:59:02 INFO: Use device: cpu
2022-02-28 10:59:02 INFO: Loading: tokenize
2022-02-28 10:59:02 INFO: Loading: pos
2022-02-28 10:59:02 INFO: Loading: constituency
2022-02-28 10:59:02 INFO: Done loading processors!
