In [1]:
import json
import pickle as pkl
import re
from nltk.translate import bleu_score
from tqdm import tqdm
import jiwer
from matplotlib import pyplot as plt

#### Load Source Data of both Episode and Subtitles

In [2]:
# Load Open Subtitle
with open('en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [3]:
# Load Memor Dataset
with open('memor/data.json') as f:
    data = json.load(f)

### Pre-Process Data for one episode

In [6]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [38]:
for item in data:
    if item.strip().split('_')[0]=='S01E01':
        sentences = data[item]['sentences']
        speakers = data[item]['speakers']
        start_times = data[item]['seg_start']
        end_times = data[item]['seg_end']
        for sentence, speaker, start, end in zip(sentences, speakers, start_times, end_times):
            # print(sentence, speaker)
            print(start, end)
            # print()

2 14.08
14.08 16.75
16.75 20
31 51
93 103.61
103.61 111
110 123.1
123.1 126.379999
126.379999 128
124 126.34
126.34 129.53
129.53 131.12
131.12 132.16
132.16 134.01
134.01 135.67
135.67 140
168 172.27
172.27 177.42
177.42 182
191 194.49
194.49 196.53
196.53 200.179999
200.179999 204
206 207.63
207.63 208.28
208.28 210.78
210.78 212.67
212.67 215.59
215.59 217
239 241.55
241.55 242.33
242.33 242.79
242.79 243.11
243.11 248.34
248.34 248.7
248.7 249.48
249.48 250
339 343.84
343.84 348.49
348.49 349.75
349.75 351.78
351.78 352.92
352.92 356
412 415.72
415.72 419.69
419.69 421.139999
421.139999 425
439 443.38
443.38 445.18
445.18 446.17
446.17 447.77
447.77 450.75
450.75 456
466 486.52
486.52 491.23
491.23 491.66
491.66 495.51
495.51 497
511 514.74
514.74 515.55
515.55 515.56
515.56 520
531 531.6
531.6 533.41
533.41 534.89
534.89 535.1700000000001
535.1700000000001 538.22
538.22 541
556 566.28
566.28 571.469999
571.469999 576
576 585.91
585.91 592
588 591.71
591.71 598.42
598.42 602
626 62

In [23]:
# Process episode
utterances = []
for item in data:
    if item.strip().split('_')[0]=='S01E01':
        sentences = data[item]['sentences']
        speakers = data[item]['speakers']
        for sentence, speaker in zip(sentences, speakers):
            utterances.append([sentence, speaker])

segments = []
pattern = r'\.|\,|\?|\!|\;'
for x in utterances:
    utt = x[0]
    temp = re.split(pattern, utt.strip(' ,.-!'))
    for item in temp:
        item = transformation(item)
        if item!="":
            segments.append([item, x[1]])

for x in segments:
    print(x[0], x[1])

if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits 1
if it is unobserved it will 1
however 1
if it is observed after it is left the plane but before it hits its target 1
it will not have gone through both slits 1
agreed 0
what is your point 0
there is no point 1
i just think it is a good idea for a tshirt 1
one across is aegean 0
eight down is nabakov 0
twentysix across is mcm 0
fourteen down is 0
move your finger 0
phylum 0
which makes fourteen across portauprince 0
see 0
papa doc is capital idea 0
that is portauprince 0
haiti 0
no 1
we are committing genetic fraud 1
there is no guarantee that our sperm is going to generate high iq offspring 1
think about that 1
i have a sister with the same basic dna mix who hostesses at fuddruckers 1
sheldon 0
this was your idea 0
a little extra money to get fractional t1 bandwidth in the apartment 0
i know 1
and i do yearn for faster downloads 1
but there is some poor woman is 

In [8]:
# Process Subtitle Subset
base = 4424331
bias = 400

subtitles = []
for item in en_subtitle[base-bias: base+bias]:
    item = transformation(item)
    if item!="":
        subtitles.append(item)

In [41]:
for x in subtitles:
    print(x)

i think that we can manage that
i am not going to take part in that nonsense
you do not live here iisakki it is not of your concern
quit your yapping
we will all take care of nikolas
beginning with the hannuses
the boy will move to the next family on christmas
is this agreed
yes
merry christmas sister aada
i could not forget my mother and my father  and my sister aada
i knew they would not be coming back  but i missed them dearly
but time heals the wounds  and by autumn the sadness began to fade away
my boat is broken fix it
ask your father
my father has gone fishing
ask your mother then
mother can not do it
give it to me
turn boat
it will not turn
why does not it come here turn right
there it goes
i did not know it back then  but eemeli would become a good friend
good throw it bounced off the water
i found a great stone
stretch your arm back and throw
you can do it one two three
let us get another one
then came the moment i had been dreading
i had to move to another family for the ver

### Perform Alignment with Word Error Rate (WER)

In [188]:
def get_optimal_wer_from_episode(ground_truth, hypothesis_pool):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [189]:
# Align based on episode
valid_utterances = []

for x in tqdm(segments):
    utt = x[0]
    speaker = x[1]
    result = get_optimal_wer_from_episode(utt, subtitles)
    score = result[0]
    index = result[-1]
    subtitle = result[1]
    if 0 < score <0.2:
        if valid_utterances==[]:
            valid_utterances.append([
                utt, speaker, index
            ])
        else:
            last_index = valid_utterances[-1][-1]
            if last_index < index < last_index+40:
                valid_utterances.append([
                utt, speaker, index
            ])

100%|██████████| 325/325 [00:04<00:00, 68.55it/s]


In [190]:
for x in valid_utterances:
    print(x)

['if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits', 1, 397]
['if it is observed after it is left the plane but before it hits its target', 1, 399]
['what if she winds up with a toddler who does not know if he should use an integral or a differential to solve the area under a curve', 1, 426]
['i did a series of experiments when i was twelve', 1, 443]
['significant improvement over the old neighbour', 0, 449]
['you are inviting me over to eat', 4, 485]
['what do you guys do for fun around here', 4, 487]
['at least i did not have to invent twentysix dimensions just to make the math come out', 0, 501]
['it tells us that you participate in the mass cultural delusion that the sun is apparent position relative to arbitrarily defined constellations and the time of your birth somehow effects your personality', 1, 531]
['so do you have some sort of a job', 0, 539]
['you will only make it worse', 1, 563]
['but not of the s

In [187]:
def get_optimal_wer_from_subtitle(ground_truth, hypothesis_pool):
    scores = []
    for i, x in enumerate(hypothesis_pool):
        hypothesis = x[0]
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [184]:
# Alignment from Open Subtitle
valid_subtitles = []

for sub in subtitles:
    results = get_optimal_wer_from_subtitle(sub, utterances)
    if results[0] < 0.4:
        print("!!!!", results)
    else:
        print(results)

(0.8571428571428571, ["i wouldn't.", 1], 'i think that we can manage that', 8)
(0.8, ['i want to leave.', 1], 'i am not going to take part in that nonsense', 12)
(0.8333333333333334, ['so do you have some sort of a job?', 0], 'you do not live here iisakki it is not of your concern', 74)
(1.0, ["agreed, what's your point?", 0], 'quit your yapping', 1)
(1.0, ["agreed, what's your point?", 0], 'we will all take care of nikolas', 1)
(0.75, ['participate in the what?', 4], 'beginning with the hannuses', 69)
(0.9, ['i want to leave.', 1], 'the boy will move to the next family on christmas', 12)
(1.0, ["i wouldn't.", 1], 'is this agreed', 8)
(1.0, ['okay.', 0], 'yes', 13)
(1.0, ["agreed, what's your point?", 0], 'merry christmas sister aada', 1)
(0.8461538461538461, ['i have a board. if you like boards, this is my board.', 0], 'i could not forget my mother and my father  and my sister aada', 46)
(0.8461538461538461, ["i just, i can't believe i trusted him.", 4], 'i knew they would not be comi

### Pattern Matching From both Side

In [81]:
"""
utterances: original utt in episode side
segments: segmented in episode side
subtitles: original sentence in open subtitles
"""
subtitle_transformed = []
for x in subtitles:
    subtitle_transformed.append(transformation(x))

utterance_transformed = []
for x in utterances:
    utterance_transformed.append([transformation(x[0]), x[1]])

In [173]:
# String Matching from episode to subtitles
utt_segments = []
for x in utterance_transformed:
    utt = x[0]
    utt_tokens = utt.strip().split(' ')
    if len(utt_tokens)>=6:
        num_iter = len(utt_tokens) // 6
        for i in range(num_iter):
            utt_segments.append([" ".join(utt_tokens[i*6: i*6+6]), x[1]])

subtitle_indexs_dict_e2s = {}
count = 0
for i, x in enumerate(utt_segments):
    utt_seg = x[0]
    for j, sub in enumerate(subtitle_transformed):
        if utt_seg in sub:
            if j in subtitle_indexs_dict_e2s:
                subtitle_indexs_dict_e2s[j].append([utt_seg, i])
            else:
                subtitle_indexs_dict_e2s[j] = [[utt_seg, i]]
            count += 1
            print(utt_seg, i)
            print(sub, j)
            print("=="*50)
print(count)

if a photon is directed through 0
if a photon is directed through a plane with two slits in it and either is observed it will not go through both 397
a plane with two slits in 1
if a photon is directed through a plane with two slits in it and either is observed it will not go through both 397
it will not go through both 3
if a photon is directed through a plane with two slits in it and either is observed it will not go through both 397
it will not have gone through 8
it will not have gone through both slits 400
there is no point i just 9
there is no point i just think it is a good idea for a tshirt 402
think it is a good idea 10
there is no point i just think it is a good idea for a tshirt 402
think it is a good idea 10
i do not eat it i think it is a good idea 543
one across is aegean eight down 11
one across is aegean eight down is nabokov 403
fourteen down is move your finger 13
fourteen down is move your finger 405
see papa doc is capital idea 15
see papa doc is capital idea that i

In [174]:
for i, sub in enumerate(subtitles):
    # Show matching results
    if i in subtitle_indexs_dict_e2s:
        print(sub)
        print(subtitle_indexs_dict_e2s[i])
        print("=="*50)
    # Show the whole subtitle, matched subtitle will be highlighted
    if i in subtitle_indexs_dict_e2s:
        print("!!!!", sub)
    else:
        print(sub)

i think that we can manage that
i am not going to take part in that nonsense
you do not live here iisakki it is not of your concern
quit your yapping
we will all take care of nikolas
beginning with the hannuses
the boy will move to the next family on christmas
is this agreed
yes
merry christmas sister aada
i could not forget my mother and my father  and my sister aada
i knew they would not be coming back  but i missed them dearly
but time heals the wounds  and by autumn the sadness began to fade away
my boat is broken fix it
ask your father
my father has gone fishing
ask your mother then
mother can not do it
give it to me
turn boat
it will not turn
why does not it come here turn right
there it goes
i did not know it back then  but eemeli would become a good friend
good throw it bounced off the water
i found a great stone
stretch your arm back and throw
you can do it one two three
let us get another one
then came the moment i had been dreading
i had to move to another family for the ver

In [175]:
# String Matching from subtitles to utterance
sub_segments = []
for k, sub in enumerate(subtitle_transformed):
    sub_tokens = sub.strip().split(' ')
    if len(sub_tokens)>=6:
        num_iter = len(sub_tokens) // 6
        for i in range(num_iter):
            sub_segments.append([" ".join(sub_tokens[i*6: i*6+6]), k])

subtitle_indexs_dict_s2e = {}
for j, x in enumerate(sub_segments):
    sub_seg = x[0]
    for i, x in enumerate(utterance_transformed):
        utt = x[0]
        speaker = x[1]
        if sub_seg in utt:
            if j in subtitle_indexs_dict_s2e:
                subtitle_indexs_dict_s2e[j].append([utt, i])
            else:
                subtitle_indexs_dict_s2e[j] = [[utt, i]]

In [176]:
for x in subtitle_indexs_dict_s2e:
    print(x, "--", subtitle_indexs_dict_s2e[x][0][1])
    print(subtitle_indexs_dict_s2e[x][0][0])
    print(sub_segments[x])
    print()

294 -- 0
if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it is unobserved it will however if it is observed after it is left the plane but before it hits its target it will not have gone through both slits
['if a photon is directed through', 397]

295 -- 0
if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it is unobserved it will however if it is observed after it is left the plane but before it hits its target it will not have gone through both slits
['a plane with two slits in', 397]

297 -- 0
if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it is unobserved it will however if it is observed after it is left the plane but before it hits its target it will not have gone through both slits
['if it is observed after it', 399]

299 -- 0
if a photon is directed 

In [193]:
# Combine two set and show in the open subtitle
temp_set = set()
for x in subtitle_indexs_dict_s2e:
    temp_set.add(sub_segments[x][1])
print(len(temp_set))
for x in subtitle_indexs_dict_e2s:
    temp_set.add(x)
print(len(temp_set))
for x in valid_utterances:
    # print(x[-1])
    temp_set.add(x[-1])
print(len(temp_set))
# for i, sub in enumerate(subtitles):
#     # Show the whole subtitle, matched subtitle will be highlighted
#     if i in temp_set:
#         print("!!!!", sub)
#     else:
#         print(sub)

93
99
102
