In [61]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
from tqdm import tqdm

In [17]:
# Load Parsed Corpus
# spaCy pipeline used below to tokenize speakers and utterances.
sm_parser = spacy.load('en_core_web_sm')

# Read the two pickled mappings and keep only the keys present in both.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('tbbt_en_zh.pkl', 'rb') as f_zh, open('tbbt_en_fa.pkl', 'rb') as f_fa:
    zh = pkl.load(f_zh)
    fa = pkl.load(f_fa)
inter_keys = set(zh.keys()) & set(fa.keys())

# Restrict the parsed corpus to the shared keys.
with open('parsed_corpus.pkl', 'rb') as f:
    parsed = pkl.load(f)
data = {}
data.update((shared_key, parsed[shared_key]) for shared_key in inter_keys)

In [64]:
# Regular Candidate Spans
#
# Build one annotation record per scene of episode (1, 1). Each record holds
# the tokenized "speaker : utterance" sentences together with query and
# candidate spans expressed as token offsets into those sentences.

output = []
for epi_key in data:
    # Only episode (1, 1) is exported; every other key is skipped.
    if epi_key != (1,1):
        continue
    episode = data[epi_key]
    # An episode is iterated scene by scene; each scene is a sequence of utterances.
    for scene in tqdm(episode):
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines, dropping leading "-" and "." markers
                # (and the whitespace around them) from each line.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']

            utterance_tokens = [item.text for item in sm_parser(utterance)]
            speaker = utt['speaker']
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            sentence_tokens = speaker_tokens + [":"] + utterance_tokens
            # Number of tokens (speaker tokens plus the ":") that precede the
            # utterance inside sentence_tokens.
            prefix_len = len(speaker_tokens) + 1

            all_sentences.append(sentence_tokens)

            if has_subtitles:
                # Union of the noun chunks from the three parsers. span[1] and
                # span[2] are used as start/end token indices of the chunk.
                # NOTE(review): assumes those indices refer to the same
                # tokenization as sm_parser(utterance) -- confirm.
                # Sorting on (start, end) instead of start alone keeps the
                # order reproducible: ties would otherwise follow set
                # iteration order, which changes with string hash randomization.
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: (x[1], x[2]),
                )

                for span in spans:
                    # Shift utterance-level indices past the "speaker :" prefix.
                    start_token = span[1] + prefix_len
                    end_token = span[2] + prefix_len
                    all_candidate_spans.append({
                        "sentenceIndex": i,
                        "startToken": start_token,
                        "endToken": end_token
                    })
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": start_token,
                        "endToken": end_token
                    })
            else:
                # Without subtitles the only candidate is the span at the start
                # of the sentence. The end is measured in tokens; the previous
                # len(speaker) counted the characters of the speaker string and
                # could point past the end of the sentence.
                # NOTE(review): whether the ":" should fall inside this span
                # depends on the consumer's endToken convention -- confirm.
                all_candidate_spans.append({
                        "sentenceIndex": i,
                        "startToken": 0,
                        "endToken": prefix_len
                })
        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans
        })

100%|██████████| 11/11 [00:01<00:00,  5.82it/s]


In [96]:
def get_all_possible_spans(sentIdx, sentLen, window_size):
    """Enumerate the sliding-window spans of one sentence.

    Args:
        sentIdx: Value stored under "sentenceIndex" in every span.
        sentLen: Number of tokens in the sentence.
        window_size: Difference between "endToken" and "startToken".

    Returns:
        A list of dicts with keys "sentenceIndex", "startToken" and
        "endToken", one per start position in range(sentLen - window_size),
        in increasing start order. The list is empty when that range is empty.
    """
    return [
        {
            "sentenceIndex": sentIdx,
            "startToken": start,
            "endToken": start + window_size
        }
        for start in range(sentLen - window_size)
    ]

In [75]:
# Scratch check: print each value of range(6) shifted up by one (1 through 6).
for window_size in range(6):
    shifted = window_size + 1
    print(shifted)

1
2
3
4
5
6


In [101]:
# All Spans
# Use Sliding Window to gather all potential spans
#
# Build one annotation record per scene of episode (1, 1). Query spans come
# from the noun chunks of subtitle-backed utterances; candidate spans are every
# sliding-window span of every sentence.

output = []
for epi_key in data:
    # Only episode (1, 1) is exported; every other key is skipped.
    if epi_key != (1,1):
        continue
    episode = data[epi_key]
    # An episode is iterated scene by scene; each scene is a sequence of utterances.
    for scene in episode:
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines, dropping leading "-" and "." markers
                # (and the whitespace around them) from each line.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']

            utterance_tokens = [item.text for item in sm_parser(utterance)]
            speaker = utt['speaker']
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            sentence_tokens = speaker_tokens + [":"] + utterance_tokens

            all_sentences.append(sentence_tokens)

            if has_subtitles:
                # Number of tokens (speaker tokens plus the ":") that precede
                # the utterance inside sentence_tokens.
                prefix_len = len(speaker_tokens) + 1
                # Union of the noun chunks from the three parsers. span[1] and
                # span[2] are used as start/end token indices of the chunk.
                # NOTE(review): assumes those indices refer to the same
                # tokenization as sm_parser(utterance) -- confirm.
                # Sorting on (start, end) instead of start alone keeps the
                # order reproducible: ties would otherwise follow set
                # iteration order, which changes with string hash randomization.
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: (x[1], x[2]),
                )

                for span in spans:
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": span[1] + prefix_len,
                        "endToken": span[2] + prefix_len
                    })

            # Gather all possible candidate spans: one sliding-window pass per
            # window_size in 0..5, i.e. endToken - startToken from 0 to 5.
            for window_size in range(6):
                all_candidate_spans.extend(
                    get_all_possible_spans(i, len(sentence_tokens), window_size)
                )

        # Per-scene totals, followed by a separator line.
        print(len(all_candidate_spans))
        print(len(all_query_spans))
        print("=="*50)
        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans
        })

2400
102
3294
130
8908
370
615
23
1695
80
291
15
987
45
336
16
147
5
858
35
1357
55


In [102]:
# Export the scene records to a single-column CSV: a "json_data" header row
# followed by one JSON-encoded record per row.
# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate newlines would turn the '\n' terminator into '\r\n'.
with open('test_all_spans.csv', "w", encoding="utf-8", newline='') as csv_fh:
    fieldnames = ['json_data']
    writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
    writer.writeheader()
    for line in output:
        writer.writerow({'json_data': json.dumps(line)})

In [87]:
# Print every key of the first record with its value, then a separator line.
separator = '=='*50
for item, value in output[0].items():
    print(item)
    print(value)
    print(separator)

sentences
[['Sheldon', ':', 'If', 'a', 'photon', 'is', 'directed', 'through', 'a', 'plane', 'with', 'two', 'slits', 'in', 'it', 'and', 'either', 'is', 'observed', 'it', 'will', 'not', 'go', 'through', 'both', '.', 'If', 'unobserved', ',', 'it', 'will', '.', 'If', 'it', "'s", 'observed', 'after', 'it', 'left', 'the', 'plane', ',', 'before', 'it', 'hits', 'its', 'target', '...', 'it', 'will', 'not', 'have', 'gone', 'through', 'both', 'slits', '.'], ['Leonard', ':', 'Agreed', '.', 'What', "'s", 'your', 'point', '?'], ['Sheldon', ':', 'There', "'s", 'no', 'point', ',', 'I', 'just', 'think', 'it', "'s", 'a', 'good', 'idea', 'for', 'a', 'T', '-', 'shirt', '.'], ['Leonard', ':', ' ', 'Excuse', 'me', '?'], ['Receptionist', ':', ' ', 'Hang', 'on', '.'], ['Leonard', ':', 'One', 'across', 'is', 'Aegean', ',', 'eight', 'down', 'is', 'Nabokov', '.', 'Twenty', '-', 'six', 'across', 'is', 'MCM', '.', 'Fourteen', 'down', 'is', '...', 'Move', 'your', 'finger', '.', 'phylum', ',', 'which', 'makes', '14'

In [84]:
# Dump every candidate span stored in the first record (output[0]).
print(output[0]['candidateSpans'])

[{'sentenceIndex': 0, 'startToken': 0, 'endToken': 0}, {'sentenceIndex': 0, 'startToken': 1, 'endToken': 1}, {'sentenceIndex': 0, 'startToken': 2, 'endToken': 2}, {'sentenceIndex': 0, 'startToken': 3, 'endToken': 3}, {'sentenceIndex': 0, 'startToken': 4, 'endToken': 4}, {'sentenceIndex': 0, 'startToken': 5, 'endToken': 5}, {'sentenceIndex': 0, 'startToken': 6, 'endToken': 6}, {'sentenceIndex': 0, 'startToken': 7, 'endToken': 7}, {'sentenceIndex': 0, 'startToken': 8, 'endToken': 8}, {'sentenceIndex': 0, 'startToken': 9, 'endToken': 9}, {'sentenceIndex': 0, 'startToken': 10, 'endToken': 10}, {'sentenceIndex': 0, 'startToken': 11, 'endToken': 11}, {'sentenceIndex': 0, 'startToken': 12, 'endToken': 12}, {'sentenceIndex': 0, 'startToken': 13, 'endToken': 13}, {'sentenceIndex': 0, 'startToken': 14, 'endToken': 14}, {'sentenceIndex': 0, 'startToken': 15, 'endToken': 15}, {'sentenceIndex': 0, 'startToken': 16, 'endToken': 16}, {'sentenceIndex': 0, 'startToken': 17, 'endToken': 17}, {'sentenceI

In [88]:
# Number of candidate spans stored in the first record (output[0]).
len(output[0]['candidateSpans'])

13934

In [89]:
# Dump the tokenized sentences stored in the first record (output[0]).
print(output[0]['sentences'])

[['Sheldon', ':', 'If', 'a', 'photon', 'is', 'directed', 'through', 'a', 'plane', 'with', 'two', 'slits', 'in', 'it', 'and', 'either', 'is', 'observed', 'it', 'will', 'not', 'go', 'through', 'both', '.', 'If', 'unobserved', ',', 'it', 'will', '.', 'If', 'it', "'s", 'observed', 'after', 'it', 'left', 'the', 'plane', ',', 'before', 'it', 'hits', 'its', 'target', '...', 'it', 'will', 'not', 'have', 'gone', 'through', 'both', 'slits', '.'], ['Leonard', ':', 'Agreed', '.', 'What', "'s", 'your', 'point', '?'], ['Sheldon', ':', 'There', "'s", 'no', 'point', ',', 'I', 'just', 'think', 'it', "'s", 'a', 'good', 'idea', 'for', 'a', 'T', '-', 'shirt', '.'], ['Leonard', ':', ' ', 'Excuse', 'me', '?'], ['Receptionist', ':', ' ', 'Hang', 'on', '.'], ['Leonard', ':', 'One', 'across', 'is', 'Aegean', ',', 'eight', 'down', 'is', 'Nabokov', '.', 'Twenty', '-', 'six', 'across', 'is', 'MCM', '.', 'Fourteen', 'down', 'is', '...', 'Move', 'your', 'finger', '.', 'phylum', ',', 'which', 'makes', '14', 'across'

In [91]:
# Print each tokenized sentence of the first record, then total their lengths.
first_record_sentences = output[0]['sentences']
for item in first_record_sentences:
    print(item)
count = sum(len(tokens) for tokens in first_record_sentences)

['Sheldon', ':', 'If', 'a', 'photon', 'is', 'directed', 'through', 'a', 'plane', 'with', 'two', 'slits', 'in', 'it', 'and', 'either', 'is', 'observed', 'it', 'will', 'not', 'go', 'through', 'both', '.', 'If', 'unobserved', ',', 'it', 'will', '.', 'If', 'it', "'s", 'observed', 'after', 'it', 'left', 'the', 'plane', ',', 'before', 'it', 'hits', 'its', 'target', '...', 'it', 'will', 'not', 'have', 'gone', 'through', 'both', 'slits', '.']
['Leonard', ':', 'Agreed', '.', 'What', "'s", 'your', 'point', '?']
['Sheldon', ':', 'There', "'s", 'no', 'point', ',', 'I', 'just', 'think', 'it', "'s", 'a', 'good', 'idea', 'for', 'a', 'T', '-', 'shirt', '.']
['Leonard', ':', ' ', 'Excuse', 'me', '?']
['Receptionist', ':', ' ', 'Hang', 'on', '.']
['Leonard', ':', 'One', 'across', 'is', 'Aegean', ',', 'eight', 'down', 'is', 'Nabokov', '.', 'Twenty', '-', 'six', 'across', 'is', 'MCM', '.', 'Fourteen', 'down', 'is', '...', 'Move', 'your', 'finger', '.', 'phylum', ',', 'which', 'makes', '14', 'across', 'Por

In [92]:
# Show `count`, the summed sentence lengths of output[0]['sentences'].
print(count)

470
