In [2]:
import pickle as pkl
import spacy
import csv
import json
from copy import deepcopy
from tqdm import tqdm

In [3]:
# Load Parsed Corpus
sm_parser = spacy.load('en_core_web_sm')

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point these paths at files produced by this project.
with open('tbbt_en_zh.pkl', 'rb') as f_zh, open('tbbt_en_fa.pkl', 'rb') as f_fa:
    zh = pkl.load(f_zh)
    fa = pkl.load(f_fa)

# Keys that occur in both the en-zh and the en-fa corpus.
inter_keys = set(zh.keys()) & set(fa.keys())

with open('parsed_corpus.pkl', 'rb') as f:
    parsed = pkl.load(f)

# Restrict the parsed corpus to the shared keys.
data = {item: parsed[item] for item in inter_keys}

In [4]:
# Print every utterance stored under key (1, 2): speaker and text first,
# then the 'en_subtitles' entry when the utterance has one.
for scene in zh[(1, 2)]:
    for utt in scene:
        who, said = utt['speaker'], utt['utterance']
        print(who, said)
        if "en_subtitles" in utt:
            print(utt['en_subtitles'])
        print()

Leonard  There you go, Pad Thai, no peanuts.

Howard  But does it have peanut oil?

Leonard  Uh, I’m not sure, everyone keep an eye on Howard in case he starts to swell up.
['Everyone keep an eye on Howard in case he starts to swell up.']

Sheldon  Since it’s not bee season, you can have my epinephrine.
["Since it's not bee season, you can have my epinephrine."]

Raj  Are there any chopsticks?
['- Any chopsticks?']

Sheldon  You don’t need chopsticks, this is Thai food.
["- Don't need chopsticks, this is Thai food."]

Leonard  Here we go.
['Here we go.']

Sheldon  Thailand has had the fork since the latter half of the nineteenth century. Interestingly they don’t actually put the fork in their mouth, they use it to put the food on a spoon which then goes into their mouth.
['Thailand has had the fork since the latter half of the 19th century.', "They don't put the fork in their mouth, they use it to put the food on a spoon which then goes into their mouth."]

Leonard  Ask him for a napki

In [6]:
# Number of scenes stored under key (1, 2).
scene_count = len(zh[(1, 2)])
print(scene_count)

10


In [4]:
# Regular Candidate Spans

output = []
for epi_key in data:
    # Only the (1, 1) entry is processed in this cell.
    if epi_key != (1,1):
        continue
    episode = data[epi_key]
    # Every scene of the episode becomes one record in `output`.
    for scene in tqdm(episode):
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines after trimming leading dashes and dots.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']
            utterance_tokens = [item.text for item in sm_parser(utterance)]
            speaker = utt['speaker']
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            # Each sentence is "<speaker tokens> : <utterance tokens>", so
            # utterance-relative indices are shifted by this many tokens.
            token_offset = len(speaker_tokens) + 1

            all_sentences.append(speaker_tokens + [":"] + utterance_tokens)

            if has_subtitles:
                # Union of the noun chunks from the three parsers, ordered by
                # start index (element 1 of each chunk tuple).
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: x[1],
                )
                for span in spans:
                    start_token = span[1] + token_offset
                    end_token = span[2] + token_offset
                    # Every noun chunk is both a candidate and a query span.
                    all_candidate_spans.append({
                        "sentenceIndex": i,
                        "startToken": start_token,
                        "endToken": end_token
                    })
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": start_token,
                        "endToken": end_token
                    })
            else:
                # Without subtitles only the speaker prefix is offered as a
                # candidate. The end index is counted in tokens; the previous
                # len(speaker) counted characters and pointed past the end of
                # the sentence.
                # NOTE(review): the "+ 1" of the original is kept, which puts
                # the ":" separator inside the span if endToken is exclusive -
                # confirm against the annotation tool.
                all_candidate_spans.append({
                        "sentenceIndex": i,
                        "startToken": 0,
                        "endToken": len(speaker_tokens) + 1
                })
        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans,
            "clickSpans": all_query_spans,
        })

100%|██████████| 11/11 [00:01<00:00,  5.88it/s]


In [5]:
def get_all_possible_spans(sentIdx, sentLen, window_size):
    """Enumerate the sliding-window spans of one sentence.

    Args:
        sentIdx: value stored under "sentenceIndex" in every span.
        sentLen: number of tokens in the sentence.
        window_size: difference between "endToken" and "startToken".

    Returns:
        A list with one dict per start position in
        ``range(sentLen - window_size)``; it is empty when ``window_size``
        is not smaller than ``sentLen``.
    """
    return [
        {
            "sentenceIndex": sentIdx,
            "startToken": start,
            "endToken": start + window_size,
        }
        for start in range(sentLen - window_size)
    ]

In [47]:
# Exploration: print every subtitle utterance that starts with a multi-letter
# upper-case token or whose second token is a colon (e.g. "PENNY: ...").
for epi_key in data:
    episode = data[epi_key]
    for scene in episode:
        for utt in scene:
            if "en_subtitles" not in utt:
                continue
            # Join the subtitle lines after trimming leading dashes and dots.
            utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            # Empty subtitles have no token to inspect; indexing them would
            # raise IndexError.
            if not utterance_tokens:
                continue

            first_token = utterance_tokens[0]
            starts_upper = first_token.isupper() and len(first_token) != 1
            # Guard the index so single-token utterances do not raise.
            colon_second = len(utterance_tokens) > 1 and utterance_tokens[1] == ":"
            if starts_upper or colon_second:
                print(utterance)
                print()

FYI, secret-keeping. Hand-holding, not a fan. Hammerhead shark, I love that thing. Yeah, it's another fish with a tool on its head.

PS4 or Xbox One?

PENNY: Come on, Sheldon. There are plenty of smart people who don't have mental problems.

PENNY: I don't know. Maybe it's the local cuisine. Okay, well, it's nice to meet you. Just gonna set you on down over here. And FYI, you'd be lucky to have me as a daughter-in-law.

NASA called. The telescope mount I installed on the space station got damaged, and they want me to go back up and fix it.

KEVIN: Oh, I'm hanging up now.

PG. Some scenes may be too intense for younger viewers.

MRS. WOLOWITZ: Howard, I found my girdle! It was in the dryer!

FYI-- his toothbrush is the red one in the Plexiglas case under the UV light.

FYI, I had a doughnut for breakfast, you jerk.

PENNY: You know, they have DVDs over there.

LEONARD: Anyway we can e-mail and I think the phone connections are good.

DR. KOOTHRAPPALI: Who's that? Oh, my God! There's som

In [48]:
# Exploration: print every subtitle utterance whose second token is a colon,
# i.e. a one-token label followed by ":" such as "PENNY: ...".
for epi_key in data:
    episode = data[epi_key]
    for scene in episode:
        for utt in scene:
            if "en_subtitles" not in utt:
                continue
            # Join the subtitle lines after trimming leading dashes and dots.
            utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            # The length check keeps empty and single-token utterances from
            # raising IndexError.
            if len(utterance_tokens) > 1 and utterance_tokens[1] == ":":
                print(utterance)
                print()

PENNY: Come on, Sheldon. There are plenty of smart people who don't have mental problems.

PENNY: I don't know. Maybe it's the local cuisine. Okay, well, it's nice to meet you. Just gonna set you on down over here. And FYI, you'd be lucky to have me as a daughter-in-law.

KEVIN: Oh, I'm hanging up now.

A: surprised you know that. B: I wanted to look like a sexy graduate for you.

PENNY: You know, they have DVDs over there.

LEONARD: Anyway we can e-mail and I think the phone connections are good.

WOLOWITZ: I'm not a baby! I'm a grown man, and I made the bed. Now where's my star?

KOOTHRAPPALI: They're in my shirt!

AMY: Wow, the store looks great.

LEONARD: Right there, right there,

LEONARD: I know, I'm familiar with you.

SHELDON: This is great. I'm in the real world of ordinary people just living their ordinary, colorless, work-a-day lives.

LEONARD: Oh, come on. Some battles you win, some battles you lose.

HOWARD: Yes, but you don't have to lose to Kyle Bernstein's bar mitzvah p

In [58]:
# Exploration: for subtitles that start with an upper-case label followed by
# ":", show the raw text, the label tokens and the text left after the label.
for epi_key in data:
    episode = data[epi_key]
    for scene in episode:
        # Collect data to annotate
        all_sentences, all_query_spans, all_candidate_spans = [], [], []

        for i, utt in enumerate(scene):
            if "en_subtitles" not in utt:
                continue

            # Join the subtitle lines after trimming leading dashes and dots.
            cleaned_lines = [x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']]
            utterance = " ".join(cleaned_lines)
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            if ":" in utterance_tokens:
                colon_idx = utterance_tokens.index(":")
                prefix = utterance_tokens[: colon_idx]
                # Only an all-upper-case prefix is treated as a speaker label.
                if " ".join(prefix).isupper():
                    remainder = utterance_tokens[colon_idx + 1:]
                    print(utterance)
                    print(prefix, len(prefix))
                    print(" ".join(remainder))
                    print()

            raw_speaker = utt['speaker']
            speaker = raw_speaker.strip().strip("(").strip(")").strip().strip(".").strip().strip(":")
            speaker_tokens = [item.text for item in sm_parser(speaker)]

PENNY: Come on, Sheldon. There are plenty of smart people who don't have mental problems.
['PENNY'] 1
Come on , Sheldon . There are plenty of smart people who do n't have mental problems .

PENNY: I don't know. Maybe it's the local cuisine. Okay, well, it's nice to meet you. Just gonna set you on down over here. And FYI, you'd be lucky to have me as a daughter-in-law.
['PENNY'] 1
I do n't know . Maybe it 's the local cuisine . Okay , well , it 's nice to meet you . Just gon na set you on down over here . And FYI , you 'd be lucky to have me as a daughter - in - law .

KEVIN: Oh, I'm hanging up now.
['KEVIN'] 1
Oh , I 'm hanging up now .

MRS. WOLOWITZ: Howard, I found my girdle! It was in the dryer!
['MRS', '.', 'WOLOWITZ'] 3
Howard , I found my girdle ! It was in the dryer !

A: surprised you know that. B: I wanted to look like a sexy graduate for you.
['A'] 1
surprised you know that . B : I wanted to look like a sexy graduate for you .

PENNY: You know, they have DVDs over there.
['P

In [38]:
# Sanity check on one example: the tokenizer yields the upper-case speaker
# label as token 0 and the colon as token 1.
utterance = "LEONARD: This is it. I'll do the talking."
utterance_tokens = [tok.text for tok in sm_parser(utterance)]

first_is_upper = utterance_tokens[0].isupper()
second_token = utterance_tokens[1]
second_is_colon = second_token == ":"

print(utterance)
print(utterance_tokens)
print(first_is_upper)
print(second_token, second_is_colon)

print(first_is_upper and second_is_colon)

LEONARD: This is it. I'll do the talking.
['LEONARD', ':', 'This', 'is', 'it', '.', 'I', "'ll", 'do', 'the', 'talking', '.']
True
: True
True


In [72]:
# All Spans
# Use Sliding Window to gather all potential spans

output = []
for epi_key in data:
    # Only the (1, 1) entry is processed in this cell.
    if epi_key != (1,1):
        continue
    episode = data[epi_key]
    # Every scene of the episode becomes one record in `output`.
    for scene in episode:
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines after trimming leading dashes and dots.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            # Subtitles may begin with an upper-case speaker label such as
            # "PENNY:"; drop the label and its colon. Utterances without
            # subtitles are left untouched, as before.
            if has_subtitles and ":" in utterance_tokens:
                colon_idx = utterance_tokens.index(":")
                prefix = utterance_tokens[: colon_idx]
                if " ".join(prefix).isupper():
                    utterance_tokens = utterance_tokens[colon_idx + 1:]
                    utterance = " ".join(utterance_tokens)

            speaker = utt['speaker'].strip().strip("(").strip(")").strip().strip(".").strip().strip(":")
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            sentence_tokens = speaker_tokens + [":"] + utterance_tokens
            all_sentences.append(sentence_tokens)

            if has_subtitles:
                # Utterance-relative indices are shifted past the speaker
                # tokens and the ":" separator.
                # NOTE(review): the chunk indices are not shifted when a
                # speaker label was stripped above - confirm the stored chunks
                # were computed on the label-free text.
                token_offset = len(speaker_tokens) + 1

                # Union of the (text, start, end) noun chunks from the three
                # parsers, ordered by start index.
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: x[1],
                )

                # Keep only maximal spans: drop every span that lies inside
                # another one. Building a new list avoids the ValueError that
                # list.remove raised when a span was nested in several others.
                # Spans covering the same index range (possible when parsers
                # report different text for one chunk) no longer eliminate
                # each other: the first one is kept.
                maximal_spans = []
                for k, (_, start_k, end_k) in enumerate(spans):
                    covered = False
                    for j, (_, start_j, end_j) in enumerate(spans):
                        if j == k:
                            continue
                        if start_j <= start_k and end_k <= end_j:
                            same_range = (start_j == start_k) and (end_j == end_k)
                            if not same_range or j < k:
                                covered = True
                                break
                    if not covered:
                        maximal_spans.append(spans[k])
                spans = maximal_spans

                # Possessive pronouns (tag PRP$) inside a kept span also
                # become single-token spans of their own.
                possessives = []
                for j, token in enumerate(sm_parser(utterance)):
                    if token.tag_ == "PRP$":
                        for _, start_idx, end_idx in spans:
                            if start_idx <= j < end_idx:
                                possessives.append((token.text, j, j + 1))
                spans.extend(possessives)
                spans.sort(key=lambda x: x[1])

                for span in spans:
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": span[1] + token_offset,
                        "endToken": span[2] + token_offset
                    })

            # Gather all possible candidate spans
            for window_size in range(10):
                all_candidate_spans.extend(get_all_possible_spans(i, len(sentence_tokens), window_size))

        print(len(all_candidate_spans))
        print(len(all_query_spans))
        print("=="*50)
        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans,
            "clickSpans": all_query_spans,
        })

3485
106
4484
123
12481
378
821
23
2506
86
378
14
1369
47
397
17
168
6
1250
40
1934
59


In [8]:
# Build one annotation record per scene of the (1, 8) entry.
output = []
for epi_key in tqdm(data):
    if epi_key != (1,8):
        continue
    episode = data[epi_key]
    # Every scene of the episode becomes one record in `output`.
    for scene in episode:
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines after trimming leading dashes and dots.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            # Subtitles may begin with an upper-case speaker label such as
            # "PENNY:"; drop the label and its colon. Utterances without
            # subtitles are left untouched, as before.
            if has_subtitles and ":" in utterance_tokens:
                colon_idx = utterance_tokens.index(":")
                prefix = utterance_tokens[: colon_idx]
                if " ".join(prefix).isupper():
                    utterance_tokens = utterance_tokens[colon_idx + 1:]
                    utterance = " ".join(utterance_tokens)

            speaker = utt['speaker'].strip().strip("(").strip(")").strip().strip(".").strip().strip(":")
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            sentence_tokens = speaker_tokens + [":"] + utterance_tokens
            all_sentences.append(sentence_tokens)

            if has_subtitles:
                # Utterance-relative indices are shifted past the speaker
                # tokens and the ":" separator.
                # NOTE(review): the chunk indices are not shifted when a
                # speaker label was stripped above - confirm the stored chunks
                # were computed on the label-free text.
                token_offset = len(speaker_tokens) + 1

                # Union of the (text, start, end) noun chunks from the three
                # parsers, ordered by start index.
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: x[1],
                )

                # Keep only maximal spans: drop every span that lies inside
                # another one. Spans covering the same index range (possible
                # when parsers report different text for one chunk) used to
                # eliminate each other; now the first one is kept.
                maximal_spans = []
                for k, (_, start_k, end_k) in enumerate(spans):
                    covered = False
                    for j, (_, start_j, end_j) in enumerate(spans):
                        if j == k:
                            continue
                        if start_j <= start_k and end_k <= end_j:
                            same_range = (start_j == start_k) and (end_j == end_k)
                            if not same_range or j < k:
                                covered = True
                                break
                    if not covered:
                        maximal_spans.append(spans[k])
                spans = maximal_spans

                # Possessive pronouns (tag PRP$) inside a kept span also
                # become single-token spans of their own.
                possessives = []
                for j, token in enumerate(sm_parser(utterance)):
                    if token.tag_ == "PRP$":
                        for _, start_idx, end_idx in spans:
                            if start_idx <= j < end_idx:
                                possessives.append((token.text, j, j + 1))
                spans.extend(possessives)
                spans.sort(key=lambda x: x[1])

                for span in spans:
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": span[1] + token_offset,
                        "endToken": span[2] + token_offset
                    })

            # Gather all possible candidate spans; range(1) yields only
            # window_size 0, i.e. spans whose start and end index coincide.
            for window_size in range(1):
                all_candidate_spans.extend(get_all_possible_spans(i, len(sentence_tokens), window_size))

        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans,
            "clickSpans": all_query_spans,
        })


# Write one CSV row per record, with the record serialised as JSON in the
# single 'json_data' column.
# newline='' is what the csv module expects for files it writes to; without it
# the platform text layer may rewrite the '\n' line terminator.
with open('sample_annotate_epi_1_8.csv', "w", encoding="utf-8", newline='') as csv_fh:
    fieldnames = ['json_data']
    writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
    writer.writeheader()
    for line in output:
        writer.writerow({'json_data': json.dumps(line)})

100%|██████████| 96/96 [00:02<00:00, 36.68it/s]

5





In [11]:
# How many click spans does the first record carry?
temp = output[0]
click_span_count = len(temp['clickSpans'])
print(click_span_count)

187


In [7]:
# Per record: number of sentences and number of query spans.
for item in output:
    sentence_count = len(item['sentences'])
    query_count = len(item['querySpans'])
    print(sentence_count, query_count)

74 187
33 159
41 149
68 233
44 138


In [80]:
# Persist the current `output` list so later cells can reload it.
with open('all_to_annotate.pkl', 'wb') as pkl_fh:
    pkl.dump(output, pkl_fh)

In [7]:
# Reload the pickled records and keep the ten at positions 11..20.
# NOTE(review): presumably these are the scenes of one episode - confirm the
# slice against the order in which the pickle was written.
with open('all_to_annotate.pkl', 'rb') as pkl_fh:
    all_output = pkl.load(pkl_fh)
output = all_output[11:21]

# Write one CSV row per record, with the record serialised as JSON in the
# single 'json_data' column.
# newline='' is what the csv module expects for files it writes to; without it
# the platform text layer may rewrite the '\n' line terminator.
with open('sample_annotate_epi_1_2.csv', "w", encoding="utf-8", newline='') as csv_fh:
    fieldnames = ['json_data']
    writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
    writer.writeheader()
    for line in output:
        writer.writerow({'json_data': json.dumps(line)})

## Generate samples to annotate by utterance number

In [24]:
import random

In [28]:
# Reload every pickled record and shuffle the list in place.
# NOTE(review): no seed is set in this cell, so the order presumably differs
# from run to run.
with open('all_to_annotate.pkl', 'rb') as pkl_fh:
    all_output = pkl.load(pkl_fh)
random.shuffle(all_output)

In [32]:
# Pick one scene per size bucket for the pilot study: after shuffling, take
# the first scene whose sentence count lies strictly between each pair of
# bounds. A bucket with no matching scene contributes nothing.
output = []
random.shuffle(all_output)
for lower, upper in ((5, 10), (10, 15), (30, 35)):
    picked = next(
        (candidate for candidate in all_output
         if lower < len(candidate['sentences']) < upper),
        None,
    )
    if picked is not None:
        output.append(picked)

In [35]:
# Write one CSV row per selected scene, with the scene serialised as JSON in
# the single 'json_data' column.
# newline='' is what the csv module expects for files it writes to; without it
# the platform text layer may rewrite the '\n' line terminator.
with open('sample_pilot_study.csv', "w", encoding="utf-8", newline='') as csv_fh:
    fieldnames = ['json_data']
    writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
    writer.writeheader()
    for line in output:
        writer.writerow({'json_data': json.dumps(line)})

In [33]:
# Show the token lists of every selected scene, with a divider after each.
divider = '==' * 50
for record in output:
    token_lists = record['sentences']
    for tokens in token_lists:
        print(tokens)
    print(divider)

['Penny', ':', 'Where', "'d", 'you', 'go', '?', 'I', 'ca', "n't", 'tell', 'if', 'the', 'turkey', "'s", 'done', '!']
['Leonard', ':', 'Be', 'right', 'there', '!', 'Hi', ',', 'lover', '.']
['Penny', ':', ' ', 'What', 'are', 'you', 'doing', '?']
['Leonard', ':', 'I', "'m", 'sorry', 'about', 'the', 'journal', '.', 'I', 'want', 'to', 'make', 'it', 'up', 'to', 'you', '.', 'So', 'I', "'m", 'gon', 'na', 'let', 'you', 'post', 'a', 'shame', 'photo', 'of', 'me', 'on', 'Facebook', '.']
['Penny', ':', 'I', 'am', 'not', 'putting', 'that', 'on', 'the', 'Internet', '!', 'I', 'do', "n't", 'want', 'people', 'to', 'see', 'this', '.', 'I', 'do', "n't", 'want', 'to', 'see', 'it', '!']
['Leonard', ':', 'Do', "n't", 'want', 'people', 'to', 'see', 'what', ',', 'huh', '?', 'A', 'little', 'bit', 'of', 'this', '?']
['Penny', ':', ' ', 'Oh', '.']
['Leonard', ':', 'Some', 'of', 'this', '?', 'And', ',', 'since', 'it', "'s", 'Thanksgiving', ',', 'an', 'extra', 'helping', 'of', 'this', '?']
['Bernadette', ':', ' ', '

In [86]:
# Reload every pickled record; `output` and `all_output` name the same list.
with open('all_to_annotate.pkl', 'rb') as pkl_fh:
    all_output = pkl.load(pkl_fh)
output = all_output

In [91]:
# Query-span count of the first record.
first_record = output[0]
print(len(first_record['querySpans']))

125


In [96]:
# Query-span statistics over all records: record count, total number of
# query spans, and the mean per record.
query_nums = [len(record['querySpans']) for record in output]
record_total = len(query_nums)
query_total = sum(query_nums)
print(record_total)
print(query_total)
print(query_total / record_total)

1070
73411
68.60841121495326


In [None]:
# Build one annotation record per scene of the (1, 8) entry.
# NOTE(review): this cell repeats the earlier (1, 8) export cell of this
# notebook; consider keeping only one of them.
output = []
for epi_key in tqdm(data):
    if epi_key != (1,8):
        continue
    episode = data[epi_key]
    # Every scene of the episode becomes one record in `output`.
    for scene in episode:
        # Collect data to annotate
        all_sentences = []
        all_query_spans = []
        all_candidate_spans = []

        for i, utt in enumerate(scene):
            has_subtitles = "en_subtitles" in utt
            if has_subtitles:
                # Join the subtitle lines after trimming leading dashes and dots.
                utterance = " ".join([x.strip().lstrip('-').lstrip().lstrip('.').lstrip() for x in utt['en_subtitles']])
            else:
                utterance = utt['utterance']
            utterance_tokens = [item.text for item in sm_parser(utterance)]

            # Subtitles may begin with an upper-case speaker label such as
            # "PENNY:"; drop the label and its colon. Utterances without
            # subtitles are left untouched, as before.
            if has_subtitles and ":" in utterance_tokens:
                colon_idx = utterance_tokens.index(":")
                prefix = utterance_tokens[: colon_idx]
                if " ".join(prefix).isupper():
                    utterance_tokens = utterance_tokens[colon_idx + 1:]
                    utterance = " ".join(utterance_tokens)

            speaker = utt['speaker'].strip().strip("(").strip(")").strip().strip(".").strip().strip(":")
            speaker_tokens = [item.text for item in sm_parser(speaker)]
            sentence_tokens = speaker_tokens + [":"] + utterance_tokens
            all_sentences.append(sentence_tokens)

            if has_subtitles:
                # Utterance-relative indices are shifted past the speaker
                # tokens and the ":" separator.
                # NOTE(review): the chunk indices are not shifted when a
                # speaker label was stripped above - confirm the stored chunks
                # were computed on the label-free text.
                token_offset = len(speaker_tokens) + 1

                # Union of the (text, start, end) noun chunks from the three
                # parsers, ordered by start index.
                spans = sorted(
                    set(utt['sm_noun_chunk']) | set(utt['berkeley_noun_chunk']) | set(utt['trf_noun_chunk']),
                    key=lambda x: x[1],
                )

                # Keep only maximal spans: drop every span that lies inside
                # another one. Spans covering the same index range (possible
                # when parsers report different text for one chunk) used to
                # eliminate each other; now the first one is kept.
                maximal_spans = []
                for k, (_, start_k, end_k) in enumerate(spans):
                    covered = False
                    for j, (_, start_j, end_j) in enumerate(spans):
                        if j == k:
                            continue
                        if start_j <= start_k and end_k <= end_j:
                            same_range = (start_j == start_k) and (end_j == end_k)
                            if not same_range or j < k:
                                covered = True
                                break
                    if not covered:
                        maximal_spans.append(spans[k])
                spans = maximal_spans

                # Possessive pronouns (tag PRP$) inside a kept span also
                # become single-token spans of their own.
                possessives = []
                for j, token in enumerate(sm_parser(utterance)):
                    if token.tag_ == "PRP$":
                        for _, start_idx, end_idx in spans:
                            if start_idx <= j < end_idx:
                                possessives.append((token.text, j, j + 1))
                spans.extend(possessives)
                spans.sort(key=lambda x: x[1])

                for span in spans:
                    all_query_spans.append({
                        "sentenceIndex": i,
                        "startToken": span[1] + token_offset,
                        "endToken": span[2] + token_offset
                    })

            # Gather all possible candidate spans; range(1) yields only
            # window_size 0, i.e. spans whose start and end index coincide.
            for window_size in range(1):
                all_candidate_spans.extend(get_all_possible_spans(i, len(sentence_tokens), window_size))

        output.append({
            "sentences": all_sentences,
            "querySpans": all_query_spans,
            "candidateSpans": all_candidate_spans,
            "clickSpans": all_query_spans,
        })


# Write one CSV row per record, with the record serialised as JSON in the
# single 'json_data' column.
# newline='' is what the csv module expects for files it writes to; without it
# the platform text layer may rewrite the '\n' line terminator.
with open('sample_annotate_epi_1_8.csv', "w", encoding="utf-8", newline='') as csv_fh:
    fieldnames = ['json_data']
    writer = csv.DictWriter(csv_fh, fieldnames, lineterminator='\n')
    writer.writeheader()
    for line in output:
        writer.writerow({'json_data': json.dumps(line)})