In [1]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
import spacy_stanza
import benepar
from tqdm import tqdm

In [None]:
def old_get_spans_multi_parsers(scene, parsers):
    all_sentences = []
    all_speakers = []
    all_query_spans = []
    all_candidate_spans = []

    for i, utt in enumerate(scene):
        speaker = utt['speaker']
        transcript = speaker + " : " + utt['utterance'].replace("-", " ")


        # Load doc sentences and speakers
        doc = parsers[0](transcript)
        sent_tokens = []
        for j, item in enumerate(doc):
            sent_tokens.append(str(item))
        all_sentences.append(sent_tokens)
        all_speakers.append(speaker)

        # Parse spans
        noun = set()
        pron = set()
        noun_phrases = set()

        for parser in parsers:
            doc = parser(transcript)
            # Fetch Noun
            for j, item in enumerate(doc):
                pos = item.pos_
                if pos=="NOUN":
                    noun.add((i, j, j+1))
            noun.add((i, 0, 1))

            # Fetch Prons
            for j, item in enumerate(doc):
                pos = item.pos_
                if pos=="PRON":
                    pron.add((i, j, j+1))

            # Check Noun Phrase
            for item in doc.noun_chunks:
                noun_phrases.add((i, item.start, item.end))

        # Organize query source
        query_source = list(noun_phrases | noun | pron)
        query_source.sort(key=lambda x:x[1])


        # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
        for (sentenceIndex, startToken, endToken) in query_source:
            span = {
                "sentenceIndex": sentenceIndex,
                "startToken": startToken,
                "endToken": endToken
            }
            all_candidate_spans.append(span)

        # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
        for (sentenceIndex, startToken, endToken) in query_source:
            if startToken==0 and endToken==1:
                continue
            span = {
                "sentenceIndex": sentenceIndex,
                "startToken": startToken,
                "endToken": endToken
            }
            all_query_spans.append(span)


    output = []
    for sent in all_sentences:
        sent.append("\n")
    temp = {
        "sentences": all_sentences,
        "querySpans": all_query_spans,
        "candidateSpans": all_candidate_spans
    }
    output.append(temp)
    return output

In [2]:
def get_spans_multi_parsers(scene, parsers):
    all_sentences = []
    all_speakers = []
    all_query_spans = []
    all_candidate_spans = []

    for i, utt in enumerate(scene):
        speaker = utt['speaker']
        transcript = speaker + " : " + utt['utterance'].replace("-", " ")


        # Load doc sentences and speakers
        doc = parsers[0](transcript)
        sent_tokens = []
        for j, item in enumerate(doc):
            sent_tokens.append(str(item))
        all_sentences.append(sent_tokens)
        all_speakers.append(speaker)

        # Parse spans
        noun = set()
        pron = set()
        noun_phrases = set()

        for parser in parsers:
            doc = parser(transcript)

            # Fetch Prons
            for j, item in enumerate(doc):
                pos = item.pos_
                if pos=="PRON":
                    pron.add((i, j, j+1))

            # Check Noun Phrase
            for item in doc.noun_chunks:
                noun_phrases.add((i, item.start, item.end))

        # Organize query source
        query_source = list(noun_phrases | noun | pron)
        query_source.sort(key=lambda x:x[1])


        # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
        for (sentenceIndex, startToken, endToken) in query_source:
            span = {
                "sentenceIndex": sentenceIndex,
                "startToken": startToken,
                "endToken": endToken
            }
            all_candidate_spans.append(span)

        # Add into Candidate Spans (Noun Phrases + Prons + Nouns)
        for (sentenceIndex, startToken, endToken) in query_source:
            if startToken==0 and endToken==1:
                continue
            span = {
                "sentenceIndex": sentenceIndex,
                "startToken": startToken,
                "endToken": endToken
            }
            all_query_spans.append(span)


    output = []
    for sent in all_sentences:
        sent.append("\n")
    temp = {
        "sentences": all_sentences,
        "querySpans": all_query_spans,
        "candidateSpans": all_candidate_spans
    }
    output.append(temp)
    return output

In [3]:
def write_annotation_file(output, file_path):
    with open(file_path, "w", encoding="utf-8") as csv_fh:
        fieldnames = ['json_data']
        writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
        writer.writeheader()
        for line in output:
            writer.writerow({'json_data': json.dumps(line)})

# Load Parsers

In [5]:
trf_parser = spacy.load("en_core_web_trf")

sm_parser = spacy.load('en_core_web_sm')

md_parser = spacy.load('en_core_web_md')

lg_parser = spacy.load('en_core_web_lg')

berkeley_parser = spacy.load('en_core_web_md')
berkeley_parser.add_pipe("benepar", config={"model": "benepar_en3"})

stanza_parser = spacy_stanza.load_pipeline('en')

In [4]:
# parsers = [stanza_parser, berkeley_parser, trf_parser, sm_parser, md_parser, lg_parser]
parsers = [stanza_parser]


2022-03-09 14:30:58 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-03-09 14:30:58 INFO: Use device: cpu
2022-03-09 14:30:58 INFO: Loading: tokenize
2022-03-09 14:30:58 INFO: Loading: pos
2022-03-09 14:30:58 INFO: Loading: lemma
2022-03-09 14:30:58 INFO: Loading: depparse
2022-03-09 14:30:58 INFO: Loading: sentiment
2022-03-09 14:30:59 INFO: Loading: constituency
2022-03-09 14:30:59 INFO: Loading: ner
2022-03-09 14:31:00 INFO: Done loading processors!


In [6]:
with open('tbbt_en_zh.pkl', 'rb') as f:
    data = pkl.load(f)

scenes = data[(3,1)]

In [7]:
print(data.keys())

dict_keys([(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (3, 15), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (3, 21), (3, 22), (3, 23), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (4, 12), (4, 13), (4, 14), (4, 15), (4, 17), (4, 18), (4, 19), (4, 20), (4, 21), (4, 22), (4, 23), (4, 24), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (5, 15), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (5, 21), (5, 22), (5, 23), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8)

In [8]:
all_output = []
for scene in tqdm(scenes):
    output = get_spans_multi_parsers(scene, parsers)[0]
    all_output.append(output)
write_annotation_file(all_output, "trial_3_1.csv")

  0%|          | 0/11 [00:00<?, ?it/s][W NNPACK.cpp:79] Could not initialize NNPACK! Reason: Unsupported hardware.
100%|██████████| 11/11 [01:56<00:00, 10.60s/it]


In [11]:
for instance in all_output:
    sentences = instance['sentences']
    querySpans = instance['querySpans']
    candidateSpans = instance['candidateSpans']
    temp = []
    for item in candidateSpans:
        temp.append(sentences['sentenceIndex'][item['startToken']: item['endToken']])
    print()
    print(temp)


sentences
[['Leonard', ':', ' ', 'Oh', ',', 'thank', 'God', 'we', '’re', 'home', '.', '\n'], ['Howard', ':', ' ', 'I', 'ca', 'n’t', 'believe', 'we', 'spent', 'three', 'months', 'in', 'that', 'frozen', 'hell', '.', '\n'], ['Raj', ':', ' ', 'It', 'was', 'like', 'a', 'snowy', 'nightmare', 'from', 'which', 'there', 'was', 'no', 'awakening', '.', '\n'], ['Sheldon', ':', ' ', 'I', 'do', 'n’t', 'know', 'what', 'Arctic', 'expedition', 'you', 'guys', 'were', 'on', ',', 'but', 'I', 'thought', 'it', 'was', 'a', 'hoot', 'and', 'a', 'half', '.', '\n']]

querySpans
[{'sentenceIndex': 0, 'startToken': 7, 'endToken': 8}, {'sentenceIndex': 1, 'startToken': 3, 'endToken': 4}, {'sentenceIndex': 1, 'startToken': 7, 'endToken': 8}, {'sentenceIndex': 2, 'startToken': 3, 'endToken': 4}, {'sentenceIndex': 2, 'startToken': 10, 'endToken': 11}, {'sentenceIndex': 2, 'startToken': 11, 'endToken': 12}, {'sentenceIndex': 2, 'startToken': 13, 'endToken': 15}, {'sentenceIndex': 3, 'startToken': 3, 'endToken': 4}, {'s

In [45]:
test_scenes = []
for key in [(3, 2), (3, 3)]:
    scenes = data[key]
    all_output = []
    for scene in tqdm(scenes):
        output = get_spans_multi_parsers(scene, parsers)[0]
        all_output.append(output)
    test_scenes.extend(all_output)

100%|██████████| 11/11 [02:56<00:00, 16.05s/it]
100%|██████████| 9/9 [02:38<00:00, 17.56s/it]


In [46]:
print(test_scenes)

[{'sentences': [['Howard', ':', ' ', 'Sheldon', ',', 'you', '’re', 'wrong', '.', 'Wolverine', 'was', 'not', 'born', 'with', 'bone', 'claws', '.', '\n'], ['Sheldon', ':', ' ', 'Howard', ',', 'you', 'know', 'me', 'to', 'be', 'a', 'very', 'smart', 'man', '.', 'Do', 'n’t', 'you', 'think', 'if', 'I', 'were', 'wrong', ',', 'I', '’d', 'know', 'it', '?', '\n'], ['Howard', ':', ' ', 'Okay', ',', 'first', 'of', 'all', '…', '\n'], ['Raj', ':', ' ', 'Give', 'it', 'up', ',', 'dude', ',', 'you', '’re', 'arguing', 'with', 'a', 'crazy', 'person', '.', '\n'], ['Sheldon', ':', ' ', 'I', '’m', 'not', 'crazy', '.', 'My', 'mother', 'had', 'me', 'tested', '.', '\n'], ['Leonard', ':', ' ', 'Hey', ',', 'guys', '.', '\n'], ['Howard', ':', ' ', 'What', 'are', 'you', 'doing', 'here', '?', '\n'], ['Leonard', ':', ' ', 'What', 'do', 'you', 'mean', '?', 'It', '’s', 'new', 'comic', 'book', 'night', '.', '\n'], ['Raj', ':', ' ', 'Yeah', ',', 'but', 'since', 'you', 'and', 'Penny', 'finally', 'hooked', 'up', ',', 'we',

In [48]:
print(all_output)

[{'sentences': [['Penny', ':', ' ', 'Morning', ',', 'Sheldon', '.', 'Come', 'dance', 'with', 'me', '.', '\n'], ['Sheldon', ':', ' ', 'No', '.', '\n'], ['Penny', ':', ' ', 'Why', 'not', '?', '\n'], ['Sheldon', ':', ' ', 'Penny', ',', 'while', 'I', 'subscribe', 'to', 'the', 'many', 'worlds', 'theory', 'which', 'posits', 'the', 'existence', 'of', 'an', 'infinite', 'number', 'of', 'Sheldons', 'in', 'an', 'infinite', 'number', 'of', 'universes', ',', 'I', 'assure', 'you', 'that', 'in', 'none', 'of', 'them', 'am', 'I', 'dancing', '.', '\n'], ['Penny', ':', ' ', 'Are', 'you', 'fun', 'in', 'any', 'of', 'them', '?', '\n'], ['Sheldon', ':', ' ', 'The', 'math', 'would', 'suggest', 'that', 'in', 'a', 'few', 'I', '’m', 'a', 'clown', 'made', 'of', 'candy', '.', 'But', 'I', 'do', 'n’t', 'dance', '.', '\n'], ['Penny', ':', ' ', 'All', 'right', ',', 'want', 'some', 'French', 'toast', '?', '\n'], ['Sheldon', ':', ' ', 'It', '’s', 'Oatmeal', 'Day', '.', '\n'], ['Penny', ':', ' ', 'Tell', 'you', 'what', '

In [51]:
final_output = deepcopy(all_output)
final_output.extend(test_scenes)

In [52]:
print(len(final_output))

29


In [53]:
print(len(test_scenes))

20


In [56]:
for scene in final_output:
    print(len(scene['sentences']), len(scene['querySpans']), len(scene['querySpans'])/len(scene['sentences']))

34 190 5.588235294117647
37 227 6.135135135135135
24 121 5.041666666666667
22 157 7.136363636363637
27 121 4.481481481481482
2 1 0.5
21 93 4.428571428571429
33 149 4.515151515151516
12 53 4.416666666666667
32 189 5.90625
65 456 7.015384615384615
2 5 2.5
14 91 6.5
7 22 3.142857142857143
4 14 3.5
2 24 12.0
18 112 6.222222222222222
29 233 8.03448275862069
18 111 6.166666666666667
13 69 5.3076923076923075
34 190 5.588235294117647
37 227 6.135135135135135
24 121 5.041666666666667
22 157 7.136363636363637
27 121 4.481481481481482
2 1 0.5
21 93 4.428571428571429
33 149 4.515151515151516
12 53 4.416666666666667


In [57]:
num_sentences = 0
num_spans = 0
for scene in final_output:
    num_spans += len(scene['querySpans'])
    num_sentences += len(scene['sentences'])

In [58]:
print(num_spans)
print(num_sentences)

3550
628


In [59]:
3550 / 628

5.6528662420382165

# Analyze Results

In [39]:
for scene in all_output:
    print(scene)
    print("=="*50)

{'sentences': [['Leonard', ':', ' ', 'Oh', ',', 'thank', 'God', 'we', '’re', 'home', '.', '\n'], ['Howard', ':', ' ', 'I', 'ca', 'n’t', 'believe', 'we', 'spent', 'three', 'months', 'in', 'that', 'frozen', 'hell', '.', '\n'], ['Raj', ':', ' ', 'It', 'was', 'like', 'a', 'snowy', 'nightmare', 'from', 'which', 'there', 'was', 'no', 'awakening', '.', '\n'], ['Sheldon', ':', ' ', 'I', 'do', 'n’t', 'know', 'what', 'Arctic', 'expedition', 'you', 'guys', 'were', 'on', ',', 'but', 'I', 'thought', 'it', 'was', 'a', 'hoot', 'and', 'a', 'half', '.', '\n']], 'querySpans': [{'sentenceIndex': 0, 'startToken': 6, 'endToken': 7}, {'sentenceIndex': 0, 'startToken': 7, 'endToken': 8}, {'sentenceIndex': 0, 'startToken': 9, 'endToken': 10}, {'sentenceIndex': 1, 'startToken': 3, 'endToken': 4}, {'sentenceIndex': 1, 'startToken': 7, 'endToken': 8}, {'sentenceIndex': 1, 'startToken': 9, 'endToken': 11}, {'sentenceIndex': 1, 'startToken': 10, 'endToken': 11}, {'sentenceIndex': 1, 'startToken': 12, 'endToken': 1

In [41]:
for scene in all_output:
    print(len(scene['querySpans']), len(scene['sentences']), len(scene['querySpans'])/len(scene['sentences']))

30 4
69 3
20 5
179 29
255 31
130 14
208 28
207 27
108 21
54 8
158 29


In [29]:
multi_output = get_spans_multi_parsers(scene, parsers)
write_annotation_file(multi_output, "trial_path.csv")

In [27]:
for item in scene:
    if 'en_subtitles' in item:
        print(item['utterance'])
        print()
        print(" ".join(item['en_subtitles']).strip().replace("-", " "))
        print("=="*50)

 So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.

If a photon is directed through a plane with two slits in it and either is observed it will not go through both. If unobserved, it will. If it's observed after it left the plane, before it hits its target...   ...it will not have gone through both slits.
 Agreed, what’s your point?

  Agreed. What's your point?
 There’s no point, I just think it’s a good idea for a tee-shirt. 

There's no point, I just think it's a good idea for a T shirt.
 One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. 

One across is Aegean, eight down is Nabokov. Twenty 