In [27]:
import pickle as pkl
import json

In [2]:
from data_construction.parallel_corpus.utils import merge_maximum_span
from data_construction.parallel_corpus.utils import clean_sentence_brackets
from data_construction.parallel_corpus.utils import process_nps_punctuation

In [3]:
def extract_three_way_subtitles(scenes):
    """Flatten scenes into one list of three-way subtitle records.

    Args:
        scenes: iterable of scenes, each an iterable of utterance dicts
            providing the keys 'fa_subtitles', 'zh_subtitles',
            'en_subtitles' and 'utterance'.

    Returns:
        A list with one dict per utterance, in scene/utterance order, with
        keys 'fa_utterance', 'zh_utterance' and 'en_utterance'. The
        'en_utterance' value is 'en_subtitles' unless that is the empty
        string, in which case 'utterance' is used instead.
    """
    return [
        {
            "fa_utterance": utt['fa_subtitles'],
            "zh_utterance": utt['zh_subtitles'],
            # Fall back to the raw utterance when no English subtitle exists.
            "en_utterance": (
                utt['en_subtitles'] if utt['en_subtitles']!="" else utt['utterance']
            ),
        }
        for scene in scenes
        for utt in scene
    ]

In [19]:
def extract_mentions(scenes):
    """Collect the mention spans (noun phrases and pronouns) of every utterance.

    For each utterance the noun chunks stored under 'sm_noun_chunk',
    'berkeley_noun_chunk' and 'trf_noun_chunk' are each passed through
    `process_nps_punctuation` twice, unioned, and reduced with
    `merge_maximum_span`. Pronouns are the entries of 'sm_pron',
    'berkeley_pron' and 'trf_pron' whose fourth field equals 'PRON', cut down
    to their first three fields, de-duplicated and reduced the same way.

    Args:
        scenes: iterable of scenes, each an iterable of parsed utterance dicts.

    Returns:
        A flat list holding, for every utterance, the de-duplicated union of
        its noun-phrase and pronoun spans.
    """
    collected = []
    for scene in scenes:
        for utt in scene:
            # The sentence tokens are the first field of each 'sm_pron' entry.
            tokens = [entry[0] for entry in utt['sm_pron']]

            def clean(chunks):
                # The punctuation pass is applied twice, as in the original pipeline.
                return process_nps_punctuation(tokens, process_nps_punctuation(tokens, chunks))

            sm_spans = clean(utt['sm_noun_chunk'])
            berkeley_spans = clean(utt['berkeley_noun_chunk'])
            trf_spans = clean(utt['trf_noun_chunk'])
            noun_spans = merge_maximum_span(
                list(set(sm_spans) | set(berkeley_spans) | set(trf_spans))
            )

            # Keep only PRON-tagged entries, reduced to their first three fields.
            pron_candidates = [
                (entry[0], entry[1], entry[2])
                for source in ('sm_pron', 'berkeley_pron', 'trf_pron')
                for entry in utt[source]
                if entry[3]=='PRON'
            ]
            pron_spans = merge_maximum_span(list(set(pron_candidates)))

            collected.extend(list(set(noun_spans)|set(pron_spans)))
    return collected

In [29]:
# Load the three-way TBBT corpus together with its parsed counterpart.
# NOTE: pickle.load can run arbitrary code from the file; only load pickles you trust.
with open('tbbt_three_way.pkl', 'rb') as f:
    corpus = pkl.load(f)
with open('../../parsing/parsed_corpus_tbbt.pkl', 'rb') as f:
    parsed_corpus = pkl.load(f)

# One top-level entry per episode; each entry's length is reported as its scene count.
print("Episode Number:", len(corpus))
count = sum(len(corpus[epi_key]) for epi_key in corpus)
print("Scene Number:", count)

# Gather subtitle records and mention spans episode by episode.
all_subtitles, all_mentions = [], []
for epi_key in corpus:
    all_subtitles += extract_three_way_subtitles(corpus[epi_key])
    all_mentions += extract_mentions(parsed_corpus[epi_key])
print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

# Only the subtitle records are written out; the mentions are just counted.
with open('tbbt_3_way.json', 'w') as f:
    json.dump(all_subtitles, f)

Episode Number: 88
Scene Number: 998
Subtitle Number: 18746
Mention Number: 74272


In [28]:
# Load the three-way Friends corpus together with its parsed counterpart.
# NOTE: pickle.load can run arbitrary code from the file; only load pickles you trust.
with open('friends_three_way.pkl', 'rb') as f:
    corpus = pkl.load(f)
with open('../../parsing/parsed_corpus_friends.pkl', 'rb') as f:
    parsed_corpus = pkl.load(f)

# One top-level entry per episode; each entry's length is reported as its scene count.
print("Episode Number:", len(corpus))
count = sum(len(corpus[epi_key]) for epi_key in corpus)
print("Scene Number:", count)

# Gather subtitle records and mention spans episode by episode.
all_subtitles, all_mentions = [], []
for epi_key in corpus:
    all_subtitles += extract_three_way_subtitles(corpus[epi_key])
    all_mentions += extract_mentions(parsed_corpus[epi_key])
print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

# Only the subtitle records are written out; the mentions are just counted.
with open('friends_3_way.json', 'w') as f:
    json.dump(all_subtitles, f)

Episode Number: 20
Scene Number: 242
Subtitle Number: 4756
Mention Number: 15099


In [46]:
# Combined totals of the two runs above (Friends + TBBT): subtitles first,
# then mentions. The operands are copied by hand from the printed outputs.
print(sum((4756, 18746)), sum((74272, 15099)), sep="\n")

23502
89371


In [30]:
# Load the newer three-way Friends corpus together with its parsed counterpart.
# NOTE: pickle.load can run arbitrary code from the file; only load pickles you trust.
with open('friends_three_way_new.pkl', 'rb') as f:
    corpus = pkl.load(f)
with open('../../parsing/parsed_corpus_friends_new.pkl', 'rb') as f:
    parsed_corpus = pkl.load(f)

# One top-level entry per episode; each entry's length is reported as its scene count.
print("Episode Number:", len(corpus))
count = sum(len(corpus[epi_key]) for epi_key in corpus)
print("Scene Number:", count)

# Gather subtitle records and mention spans episode by episode.
all_subtitles, all_mentions = [], []
for epi_key in corpus:
    all_subtitles += extract_three_way_subtitles(corpus[epi_key])
    all_mentions += extract_mentions(parsed_corpus[epi_key])
print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

# Only the subtitle records are written out; the mentions are just counted.
with open('friends_3_way_new.json', 'w') as f:
    json.dump(all_subtitles, f)

Episode Number: 91
Scene Number: 1161
Subtitle Number: 22248
Mention Number: 76104


# Divide into Train/Dev/Test

In [33]:
# Load both shows under distinct names so the split cells below can use them
# side by side.
# NOTE: pickle.load can run arbitrary code from the file; only load pickles you trust.
with open('friends_three_way.pkl', 'rb') as f:
    friends_corpus = pkl.load(f)
with open('../../parsing/parsed_corpus_friends.pkl', 'rb') as f:
    friends_parsed_corpus = pkl.load(f)

# Same pair of files for TBBT: the three-way corpus and its parsed counterpart.
with open('tbbt_three_way.pkl', 'rb') as f:
    tbbt_corpus = pkl.load(f)
with open('../../parsing/parsed_corpus_tbbt.pkl', 'rb') as f:
    tbbt_parsed_corpus = pkl.load(f)

In [35]:
# Dev
# Episode keys held out for the dev split.
dev_friends = [(1,1)]
dev_tbbt = [(7, 1), (7, 2), (7, 6), (7, 10), (7, 11), (7, 15), (7, 16), (7, 17), (7, 18), (7, 19)]

# Scene total across the dev episodes of both shows.
count = sum(len(friends_corpus[epi_key]) for epi_key in dev_friends)
count += sum(len(tbbt_corpus[epi_key]) for epi_key in dev_tbbt)
print("Scene Number:", count)

# Walk Friends first, then TBBT, collecting subtitles and mentions per episode.
all_subtitles, all_mentions = [], []
for split_keys, subtitle_source, parse_source in (
    (dev_friends, friends_corpus, friends_parsed_corpus),
    (dev_tbbt, tbbt_corpus, tbbt_parsed_corpus),
):
    for epi_key in split_keys:
        all_subtitles += extract_three_way_subtitles(subtitle_source[epi_key])
        all_mentions += extract_mentions(parse_source[epi_key])

print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

Scene Number: 126
Subtitle Number: 2406
Mention Number: 9598


In [36]:
# Test
# Episode keys held out for the test split.
test_friends = [(1,4)]
test_tbbt = [(9, 1), (9, 3), (9, 4), (9, 7), (9, 9), (9, 10), (9, 11), (9, 12), (9, 16)]

# Scene total across the test episodes of both shows.
count = sum(len(friends_corpus[epi_key]) for epi_key in test_friends)
count += sum(len(tbbt_corpus[epi_key]) for epi_key in test_tbbt)
print("Scene Number:", count)

# Walk Friends first, then TBBT, collecting subtitles and mentions per episode.
all_subtitles, all_mentions = [], []
for split_keys, subtitle_source, parse_source in (
    (test_friends, friends_corpus, friends_parsed_corpus),
    (test_tbbt, tbbt_corpus, tbbt_parsed_corpus),
):
    for epi_key in split_keys:
        all_subtitles += extract_three_way_subtitles(subtitle_source[epi_key])
        all_mentions += extract_mentions(parse_source[epi_key])

print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

Scene Number: 146
Subtitle Number: 2129
Mention Number: 8437


In [42]:
# Friends: every episode key, then the keys left once test and dev are removed.
friends_all_keys = set(friends_corpus.keys())
print(friends_all_keys)
friends_remaining_keys = friends_corpus.keys()-set(test_friends)-set(dev_friends)
print(friends_remaining_keys)

print()
# TBBT: the same two views.
tbbt_all_keys = set(tbbt_corpus.keys())
print(tbbt_all_keys)
tbbt_remaining_keys = tbbt_corpus.keys()-set(test_tbbt)-set(dev_tbbt)
print(tbbt_remaining_keys)

{(1, 6), (1, 3), (1, 9), (1, 12), (1, 18), (1, 15), (1, 24), (1, 2), (1, 8), (1, 14), (1, 20), (1, 17), (1, 1), (1, 4), (1, 7), (1, 13), (1, 10), (1, 16), (1, 19), (1, 22)}
{(1, 2), (1, 8), (1, 14), (1, 20), (1, 17), (1, 6), (1, 3), (1, 9), (1, 12), (1, 18), (1, 15), (1, 24), (1, 7), (1, 13), (1, 10), (1, 16), (1, 19), (1, 22)}

{(7, 17), (8, 9), (8, 18), (6, 2), (7, 1), (1, 15), (7, 10), (6, 20), (7, 19), (8, 2), (9, 1), (5, 12), (8, 11), (9, 10), (2, 4), (4, 23), (6, 4), (9, 3), (5, 14), (9, 12), (5, 23), (1, 10), (6, 6), (7, 23), (1, 3), (7, 16), (9, 7), (3, 21), (9, 16), (1, 5), (8, 20), (1, 14), (5, 2), (4, 4), (9, 9), (3, 23), (4, 22), (1, 7), (2, 6), (8, 22), (7, 2), (1, 16), (7, 11), (6, 15), (4, 6), (4, 24), (8, 6), (8, 15), (1, 9), (8, 24), (6, 8), (6, 17), (5, 6), (4, 8), (8, 17), (1, 11), (6, 19), (4, 1), (7, 18), (4, 19), (8, 1), (1, 4), (5, 20), (8, 19), (1, 13), (6, 3), (6, 21), (4, 3), (7, 20), (4, 21), (8, 3), (8, 12), (9, 11), (8, 21), (6, 5), (6, 14), (4, 5), (7, 22)

In [43]:
# Train
# The test keys are restated here; dev_friends / dev_tbbt come from the dev cell.
test_friends = [(1,4)]
test_tbbt = [(9, 1), (9, 3), (9, 4), (9, 7), (9, 9), (9, 10), (9, 11), (9, 12), (9, 16)]

# Training episodes are whatever remains after removing the test and dev keys.
train_friends = friends_corpus.keys()-set(test_friends)-set(dev_friends)
train_tbbt = tbbt_corpus.keys()-set(test_tbbt)-set(dev_tbbt)

# Scene total across the training episodes of both shows.
count = sum(len(friends_corpus[epi_key]) for epi_key in train_friends)
count += sum(len(tbbt_corpus[epi_key]) for epi_key in train_tbbt)
print("Scene Number:", count)

# Walk Friends first, then TBBT, collecting subtitles and mentions per episode.
all_subtitles, all_mentions = [], []
for split_keys, subtitle_source, parse_source in (
    (train_friends, friends_corpus, friends_parsed_corpus),
    (train_tbbt, tbbt_corpus, tbbt_parsed_corpus),
):
    for epi_key in split_keys:
        all_subtitles += extract_three_way_subtitles(subtitle_source[epi_key])
        all_mentions += extract_mentions(parse_source[epi_key])

print("Subtitle Number:", len(all_subtitles))
print("Mention Number:", len(all_mentions))

Scene Number: 968
Subtitle Number: 18967
Mention Number: 71336


## Estimate Cost