In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [5]:
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

In [6]:
# Load alignment results after stage-2
with open('alignment_results/zh/indexs_tbbt_zh.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

## Perform fine-grained alignment

### Part 0: Load Data

In [47]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

#### Part 1: String Match with sliding window

In [7]:
# Part 1: String Match with sliding window
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [78]:
result_0_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=zh_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=9)
            result_0_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 50 Subtitle Number: 45
Season: 1 Episode: 2 Episode Number: 57 Subtitle Number: 46
Season: 1 Episode: 3 Episode Number: 42 Subtitle Number: 40
Season: 1 Episode: 4 Episode Number: 51 Subtitle Number: 41
Season: 1 Episode: 5 Episode Number: 49 Subtitle Number: 41
Season: 1 Episode: 6 Episode Number: 49 Subtitle Number: 47
Season: 1 Episode: 7 Episode Number: 52 Subtitle Number: 49
Season: 1 Episode: 8 Episode Number: 58 Subtitle Number: 50
Season: 1 Episode: 9 Episode Number: 38 Subtitle Number: 33
Season: 1 Episode: 10 Episode Number: 60 Subtitle Number: 52
Season: 1 Episode: 11 Episode Number: 38 Subtitle Number: 36
Season: 1 Episode: 12 Episode Number: 53 Subtitle Number: 46
Season: 1 Episode: 13 Episode Number: 57 Subtitle Number: 54
Season: 1 Episode: 14 Episode Number: 51 Subtitle Number: 42
Season: 1 Episode: 15 Episode Number: 47 Subtitle Number: 43
Season: 1 Episode: 16 Episode Number: 47 Subtitle Number: 40
Season: 2 Episode: 1 Episode Numb

In [79]:
with open('alignment_part_1_string_match.pkl', 'wb') as f:
    pkl.dump(result_0_all, f)

In [11]:
with open('alignment_part_1_string_match.pkl', 'rb') as f:
    result_0_all = pkl.load(f)

### Filter the indexs obtained using sliding window string match

In [90]:
temp = result_0_all[(1,4)]
for x in temp:
    print(x, temp[x])

194 {2}
195 {2}
205 {10}
211 {16}
223 {25}
236 {35}
242 {42}
248 {46}
249 {46}
250 {46}
257 {52}
272 {64}
275 {67}
287 {77}
291 {79}
292 {79}
295 {80}
300 {84}
306 {92}
313 {96}
314 {97}
317 {99}
322 {103}
336 {111}
345 {119}
359 {129}
371 {137}
373 {139}
385 {151}
386 {152}
389 {153}
390 {153}
392 {153}
406 {159}
407 {159}
408 {159}
411 {161}
425 {167}
428 {168}
434 {173}
436 {174}
441 {179}
444 {182}
454 {190}
459 {193}
461 {196}
462 {196}
468 {203}
469 {203}
474 {206}
483 {215}


### Part 2: Strict Match

In [327]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

In [328]:
for x in tbbt_episode:
    print(x)

[' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'a', ' ', 'p', 'l', 'a', 'n', 'e', ' ', 'w', 'i', 't', 'h', ' ', 't', 'w', 'o', ' ', 's', 'l', 'i', 't', 's', ' ', 'i', 'n', ' ', 'i', 't', ' ', 'a', 'n', 'd', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'i', 's', ' ', 'o', 'b', 's', 'e', 'r', 'v', 'e', 'd', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'n', 'o', 't', ' ', 'g', 'o', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'b', 'o', 't', 'h', '.', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u',

In [329]:
for x in tbbt_transcripts[(1,1)]:
    print(x)

[' A corridor at a sperm bank.', 'Scene']
[' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'a', ' ', 'p', 'l', 'a', 'n', 'e', ' ', 'w', 'i', 't', 'h', ' ', 't', 'w', 'o', ' ', 's', 'l', 'i', 't', 's', ' ', 'i', 'n', ' ', 'i', 't', ' ', 'a', 'n', 'd', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'i', 's', ' ', 'o', 'b', 's', 'e', 'r', 'v', 'e', 'd', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'n', 'o', 't', ' ', 'g', 'o', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'b', 'o', 't', 'h', '.', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't

In [8]:
def exact_match(en_subset, episode):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        if len(subtitle.strip().split(" ")) <=5:
            continue
        # Exact Match for short sentences
        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            if subtitle == utt:
                if i not in res:
                    res[i] = set()
                    res[i].add(j)
                else:
                    res[i].add(j)
    output = {}
    for x in res:
        output[x] = sorted(list(res[x]))

    return output

In [106]:
result_1_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=zh_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = exact_match(en_subset, tbbt_episode)
            result_1_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(turn_sub2epi_into_epi2sub(temp)), "Subtitle Number:",len(temp))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 31 Subtitle Number: 31
Season: 1 Episode: 2 Episode Number: 23 Subtitle Number: 23
Season: 1 Episode: 3 Episode Number: 17 Subtitle Number: 17
Season: 1 Episode: 4 Episode Number: 13 Subtitle Number: 13
Season: 1 Episode: 5 Episode Number: 12 Subtitle Number: 12
Season: 1 Episode: 6 Episode Number: 29 Subtitle Number: 29
Season: 1 Episode: 7 Episode Number: 22 Subtitle Number: 22
Season: 1 Episode: 8 Episode Number: 31 Subtitle Number: 31
Season: 1 Episode: 9 Episode Number: 21 Subtitle Number: 21
Season: 1 Episode: 10 Episode Number: 25 Subtitle Number: 25
Season: 1 Episode: 11 Episode Number: 25 Subtitle Number: 25
Season: 1 Episode: 12 Episode Number: 18 Subtitle Number: 18
Season: 1 Episode: 13 Episode Number: 15 Subtitle Number: 15
Season: 1 Episode: 14 Episode Number: 17 Subtitle Number: 17
Season: 1 Episode: 15 Episode Number: 24 Subtitle Number: 24
Season: 1 Episode: 16 Episode Number: 19 Subtitle Number: 19
Season: 2 Episode: 1 Episode Numb

In [107]:
with open('alignment_part_2_strict_match.pkl', 'wb') as f:
    pkl.dump(result_1_all, f)

In [13]:
with open('alignment_part_2_strict_match.pkl', 'rb') as f:
    result_1_all = pkl.load(f)

In [148]:
temp = result_0_all[(1,3)]
for x in temp:
    print(x, temp[x])
    print(en_subset[x])
    for item in temp[x]:
        print(tbbt_episode[item])
    print("=="*50)

192 {3}
This is what the last 97 hours have been about.
[' Don’t panic, this is what the last 97 hours have been about.', 'Leonard']
243 {42}
Civil servants have a documented propensity to, you know, snap, so...
[' Oh no, that’s probably not such a good idea. Civil servants have a documented propensity to, you know, snap. ', 'Leonard']
256 {51}
Don't tell me that your hopeless infatuation is devolving into pointless jealousy.
[' Please don’t tell me that your hopeless infatuation is devolving into pointless jealousy.', 'Sheldon']
261 {55}
At least now you can retrieve the black box from the smoldering wreckage that was once your fantasy of dating her and analyze the data so that you don't crash into geek mountain again.
[' Well, at least now you can retrieve the black box from the twisted smouldering wreckage that was once your fantasy of dating her, and analyse the data so that you don’t crash into geek mountain again.', 'Sheldon']
263 {56}
A relentless pursuit that only ends when she

In [147]:
temp = result_1_all[(1,3)]
for x in temp:
    print(x, temp[x])
    print(en_subset[x])
    for item in temp[x]:
        print(tbbt_episode[item])
    print("=="*50)

195 [5]
Warriors, unsheathe your weapons. Magic wielders, raise your wands.
[' Warriors, unsheathe your weapons, magic wielders raise your wands.', 'Leonard']
212 [18]
- Forget the sword, Sheldon, help Raj.
[' Forget the sword, Sheldon, help Raj.', 'Leonard']
231 [34]
Stealing snail mail, very old school. I like it.
[' Stealing snail mail, very old school, I like it. ', 'Howard']
288 [75]
Wait, are you asking me out?
[' Wait, are you asking me out?', 'Lesley']
291 [77]
What sort of experiment would you propose?
[' What sort of experiment would you propose?', 'Lesley']
302 [87]
- No, I think it needs to be spontaneous.
[' No, I think it needs to be spontaneous.', 'Lesley']
304 [89]
- you proposed the experiment. I think you should present your findings first.
[' You proposed the experiment, I think you should present your findings first.', 'Leonard']
316 [100]
You might be bound by them right now.
[' You might be bound by them right now.', 'Raj']
324 [108]
Also, Sheldon may be a robot.


### Converge part 1 and part 2 result as the seeds

In [9]:
def merge_two_dict(dict_1, dict_2):
    res = deepcopy(dict_1)
    for sea_epi in dict_2:
        if sea_epi not in res:
            res[sea_epi] = dict_2[sea_epi]
        else:
            temp = {}
            for x in res[sea_epi]:
                temp[x] = list(res[sea_epi][x])
            # add dict 2
            if sea_epi in dict_2:
                for x in dict_2[sea_epi]:
                    if x not in temp:
                        temp[x] = sorted(list(set(dict_2[sea_epi][x])))
                    else:
                        temp[x].extend(dict_2[sea_epi][x])
                    temp[x] = sorted(list(set(temp[x])))
            res[sea_epi] = temp

    output = {}
    for x in res:
        if res[x]!={}:
            output[x] = res[x]
    return output

In [10]:
# Perform index filtering on the alignment seeds
"""
Filter indexs based on the index before and after
"""
def filter_by_idx(sub2epi):
    paris = []
    for x in sorted(list(sub2epi.keys())):
        for y in sorted(sub2epi[x]):
            paris.append([x,y])

    res = [paris[0]]
    for i in range(1, len(paris)-1):
        former = res[-1]
        current = paris[i]
        after = paris[i+1]
        if former[0] <= current[0] <= after[0]:
            if former[1]<=current[1]<=after[1]:
                res.append(current)
    if paris[-1][0] >= res[-1][0]:
        if paris[-1][1] >= res[-1][1]:
            res.append(paris[-1])

    output = {}
    for x in res:
        sub = x[0]
        epi = x[1]
        if sub not in output:
            output[sub] = [epi]
        else:
            output[sub].append(epi)

    return output

In [16]:
alignment_seeds = {}
temp = merge_two_dict(result_0_all, result_1_all)
for x in temp:
    alignment_seeds[x] = filter_by_idx(temp[x])

In [240]:
with open('alignment_seeds.pkl', 'wb') as f:
    pkl.dump(alignment_seeds, f)

In [11]:
with open('alignment_seeds.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [12]:
temp = []
for x in alignment_seeds:
    temp.append(x)

In [13]:
for x in temp:
    print(x, len(alignment_seeds[x]), len(turn_sub2epi_into_epi2sub(alignment_seeds[x])))
    # print(alignment_seeds[x])
    # print('=='*50)

(1, 1) 73 68
(1, 2) 77 66
(1, 3) 58 56
(1, 4) 64 54
(1, 5) 58 50
(1, 6) 69 67
(1, 7) 68 65
(1, 8) 77 69
(1, 9) 51 46
(1, 10) 80 72
(1, 11) 58 56
(1, 12) 41 36
(1, 13) 69 66
(1, 14) 63 54
(1, 15) 61 57
(1, 16) 63 56
(2, 1) 71 69
(2, 2) 75 74
(2, 3) 64 60
(2, 4) 67 62
(2, 5) 70 60
(2, 6) 69 65
(2, 7) 83 73
(2, 8) 57 49
(2, 9) 83 72
(2, 10) 62 52
(2, 11) 74 68
(2, 12) 69 59
(2, 13) 64 56
(2, 14) 82 80
(2, 15) 74 71
(2, 16) 73 66
(2, 17) 93 78
(2, 18) 50 49
(2, 19) 61 56
(2, 20) 69 67
(2, 21) 70 63
(2, 22) 70 62
(2, 23) 83 71
(3, 1) 72 65
(3, 2) 77 67
(3, 3) 60 54
(3, 4) 64 59
(3, 5) 78 75
(3, 6) 62 51
(3, 7) 67 63
(3, 8) 62 56
(3, 9) 81 68
(3, 10) 96 87
(3, 11) 88 76
(3, 12) 74 68
(3, 13) 60 50
(3, 14) 47 42
(3, 15) 82 68
(3, 16) 89 79
(3, 17) 65 59
(3, 18) 71 58
(3, 19) 71 67
(3, 20) 87 82
(3, 21) 68 60
(3, 22) 67 63
(3, 23) 66 60
(4, 1) 88 83
(4, 2) 71 67
(4, 3) 103 96
(4, 4) 83 78
(4, 5) 87 78
(4, 6) 90 83
(4, 7) 89 79
(4, 8) 69 63
(4, 9) 106 96
(4, 10) 88 76
(4, 11) 62 60
(4, 12) 75 6

## Part 2: Extend from Alignment Seeds

In [14]:
def extend_neighbors(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        epi = transformation(episode[epi_id][0])

        if sub_former in epi:
            sub_ids.append(sub_id_former)
            # print(epi_id, sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
            # print(epi_id, sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [15]:
def add_strict_match_within_gaps(gaps, epi2sub):
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for sub_id in sub_ids:
            sub = transformation(en_subset[sub_id].replace("’", "'"))
            for epi_id in epi_ids:
                epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
                if len(epi.strip().split(" "))<=2:
                    continue
                if sub == epi:

                    if epi_id not in epi2sub:
                        epi2sub[epi_id] = [sub_id]
                    else:
                        epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [16]:
def get_optimal_wer_from_episode(ground_truth, hypothesis_pool):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [17]:
def add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for epi_id in epi_ids:
            best_score = 100
            best_pair = [None, None]
            epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
            if len(epi.strip().split(" "))<=2:
                continue
            for sub_id in sub_ids:
                sub = transformation(en_subset[sub_id].replace("’", "'"))
                score = jiwer.compute_measures(epi, sub)['wer']
                if score < best_score:
                    best_score = score
                    best_pair = [epi_id, sub_id]
            if best_score < 0.15:
                count += 1
                if best_pair[0] not in epi2sub:
                    epi2sub[best_pair[0]] = [best_pair[1]]
                else:
                    epi2sub[best_pair[0]].append([best_pair[1]])
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [18]:
def add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    pattern = r'\.|\?|\!|\;|- '
    temp_sub2epi = {}
    for gap in gaps:
        # Build substrings
        sub_ids = gap[1]
        epi_ids = gap[0]
        sub_lists = []
        epi_lists = []

        for epi_id in epi_ids:
            epi = tbbt_episode[epi_id][0].replace("’", "'")
            epi_substring = re.split(pattern, epi)
            for item in epi_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                epi_lists.append([temp_item, epi_id])

        for sub_id in sub_ids:
            sub = en_subset[sub_id].replace("’", "'")
            sub_substring = re.split(pattern, sub)
            for item in sub_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                sub_lists.append([temp_item, sub_id])


        # Calculate WER Similarity
        for (sub, sub_id) in sub_lists:
            for (epi, epi_id) in epi_lists:
                cer = jiwer.cer(epi, sub)
                if cer <= 0.2:
                    count += 1
                    if sub_id not in temp_sub2epi:
                        temp_sub2epi[sub_id] = set()
                        temp_sub2epi[sub_id].add(epi_id)
                    else:
                        temp_sub2epi[sub_id].add(epi_id)

    for sub_id in temp_sub2epi:
        epi_ids = list(temp_sub2epi[sub_id])
        if len(epi_ids)!=1:
            continue
        epi_id = epi_ids[0]
        if epi_id not in epi2sub:
            epi2sub[epi_id] = [sub_id]
        else:
            epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [19]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [20]:
def extend_neighbors_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [21]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_neighbors_subtitle_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [23]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

In [30]:
print(alignment_seeds.keys())

dict_keys([(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (3, 15), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (3, 21), (3, 22), (3, 23), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (4, 12), (4, 13), (4, 14), (4, 15), (4, 17), (4, 18), (4, 19), (4, 20), (4, 21), (4, 22), (4, 23), (4, 24), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (5, 15), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (5, 21), (5, 22), (5, 23), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8)

In [24]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(1,1)]), tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

# Extend within gap using strict string match
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

# Extend within gap using wer
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

# Extend within gap using substring cer
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

# Add within the gap
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_one_size_gap(gaps, epi2sub)

In [29]:
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))
print(len(alignment_seeds[(1,1)]), len(turn_sub2epi_into_epi2sub(alignment_seeds[(1,1)])))

226 260
73 68


In [54]:
def get_alignment_extend_neighbors(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(season_id,episode_id)]), tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Add within the gap
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_one_size_gap(gaps, epi2sub)

    # # Further extend neighbors
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [55]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment_extend_neighbors(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 68 226 Subtitle Number: 73 260
Season: 1 Episode: 2 Episode Number: 66 161 Subtitle Number: 77 203
Season: 1 Episode: 3 Episode Number: 56 163 Subtitle Number: 58 191
Season: 1 Episode: 4 Episode Number: 54 143 Subtitle Number: 64 174
Season: 1 Episode: 5 Episode Number: 50 144 Subtitle Number: 58 178
Season: 1 Episode: 6 Episode Number: 67 165 Subtitle Number: 69 187
Season: 1 Episode: 7 Episode Number: 65 166 Subtitle Number: 68 186
Season: 1 Episode: 8 Episode Number: 69 195 Subtitle Number: 77 243
Season: 1 Episode: 9 Episode Number: 46 146 Subtitle Number: 51 177
Season: 1 Episode: 10 Episode Number: 72 144 Subtitle Number: 80 176
Season: 1 Episode: 11 Episode Number: 56 164 Subtitle Number: 58 187
Season: 1 Episode: 12 Episode Number: 36 95 Subtitle Number: 41 118
Season: 1 Episode: 13 Episode Number: 66 145 Subtitle Number: 69 175
Season: 1 Episode: 14 Episode Number: 54 151 Subtitle Number: 63 188
Season: 1 Episode: 15 Episode Number: 57 157

In [56]:
with open('alignment_results/zh/final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [284]:
with open('alignment_results/zh/final_stage_alignment.pkl', 'rb') as f:
    final_stage_alignment = pkl.load(f)

In [285]:
for x in final_stage_alignment:
    print(x)
    print(final_stage_alignment[x])
    print('=='*50)

(1, 1)
{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 13

In [319]:
alignment_seeds = final_stage_alignment

In [324]:
print(further_alignment[(1,8)])

{2: [158, 159, 160], 3: [163], 4: [164], 6: [165], 7: [167], 9: [169, 170], 11: [171, 172], 13: [174], 15: [176], 17: [178], 18: [179, 180], 22: [185], 24: [186], 25: [187], 26: [188], 27: [189], 30: [193, 194], 31: [195], 33: [197], 34: [198], 35: [199], 36: [200], 37: [201], 38: [202, 203], 39: [204], 40: [205], 41: [206, 207], 44: [210], 45: [212], 46: [213], 47: [214], 49: [218, 219], 52: [221], 53: [222], 54: [223], 56: [225], 57: [226], 58: [227, 228], 59: [229], 61: [230, 231], 62: [232], 63: [233], 64: [234], 65: [235], 66: [236], 70: [239], 71: [240], 72: [241], 73: [242], 74: [243, 244, 245, 246], 75: [249], 76: [250], 77: [251, 252], 78: [254], 79: [256], 80: [257], 82: [259], 84: [262], 85: [263], 87: [264], 89: [265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275], 90: [276], 91: [277], 92: [278], 93: [279], 94: [280], 95: [281], 99: [285, 286, 287], 100: [288], 101: [289], 102: [290], 103: [292], 104: [293, 294], 105: [295], 106: [296, 297, 298], 108: [302], 109: [303],

In [327]:
with open('alignment_results/zh/cleaned_final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [357]:
with open('alignment_results/zh/cleaned_final_stage_alignment.pkl', 'rb') as f:
    final_stage_alignment = pkl.load(f)

In [358]:
alignment_seeds = final_stage_alignment

In [341]:
for x in alignment_seeds:
    print(x)
    print(alignment_seeds[x])

(1, 1)
{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 315, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334

In [355]:
def get_alignment_extend_neighbors(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    epi2sub = alignment_seeds[(season_id,episode_id)]
    # Extend the neighbors
    while True:
        temp = extend_neighbors(en_subset, epi2sub, tbbt_episode)
        if len(temp)==len(epi2sub) and len(turn_sub2epi_into_epi2sub(temp))==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break
        else:
            epi2sub = temp

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # # Add within the gap
    # gaps, abandons = get_subset_in_gaps(epi2sub)
    # epi2sub = add_one_size_gap(gaps, epi2sub)

    # # Further extend neighbors
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub


In [356]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment_extend_neighbors(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 262 227 Subtitle Number: 226 271
Season: 1 Episode: 2 Episode Number: 205 161 Subtitle Number: 161 217
Season: 1 Episode: 3 Episode Number: 190 163 Subtitle Number: 163 197
Season: 1 Episode: 4 Episode Number: 176 143 Subtitle Number: 143 185
Season: 1 Episode: 5 Episode Number: 181 144 Subtitle Number: 144 191
Season: 1 Episode: 6 Episode Number: 187 165 Subtitle Number: 165 191
Season: 1 Episode: 7 Episode Number: 184 166 Subtitle Number: 164 190
Season: 1 Episode: 8 Episode Number: 251 195 Subtitle Number: 195 259
Season: 1 Episode: 9 Episode Number: 178 146 Subtitle Number: 146 186
Season: 1 Episode: 10 Episode Number: 177 144 Subtitle Number: 144 184
Season: 1 Episode: 11 Episode Number: 193 165 Subtitle Number: 164 205
Season: 1 Episode: 12 Episode Number: 120 95 Subtitle Number: 95 124
Season: 1 Episode: 13 Episode Number: 181 145 Subtitle Number: 145 191
Season: 1 Episode: 14 Episode Number: 191 151 Subtitle Number: 151 196
Season: 1 Episode

## Part 3: Final Stage Alignment

In [359]:
def get_final_stage_gap_pairs(epi2sub):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Perform string match and CER Scoring
    return subtitle_gaps

In [360]:
"""
Explore the neighbor subtitles of a episode.

Given a episode utterance (epi_id), then we fetch the unaligned subtitle (sub_id)
[epi_id_0, epi_id_1, etc., epi_id_n] - [sub_id_0, sub_id_1, etc. sub_id_m]

Then, we search within the subset-pair

For each subtitle, we use sliding window to fetch a set of substrings in each episode utterance and calculate the CER
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    count = 0
    # Gather the gap of subtitle corresponding to episode utterance
    subtitle_gaps = get_final_stage_gap_pairs(epi2sub)

    # Iterate the whole subtitle gaps to perform substring match
    temp_epi2sub = deepcopy(epi2sub)
    for item in subtitle_gaps:
        epi_ids = [i for i in range(item[0], item[1]+1)]
        sub_ids = [i for i in subtitle_gaps[item]]

        # Fetch all episodes and subtitles
        epis = [transformation(tbbt_episode[i][0].replace("’", " ").replace('…', " ")) for i in epi_ids]
        subs = [transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids]

        # print(epi_ids)
        # print(epis)
        # print(sub_ids)
        # print(subs)
        for sub_id, sub in zip(sub_ids, subs):
            sub_len = len(sub.strip().split(" "))
            if sub_len <=3:
                continue
            min_score = float('inf')
            min_substring = ""
            source_episode = ""
            source_sub_id = float('inf')
            source_epi_id = float('inf')
            for epi, epi_id in zip(epis, epi_ids):
                # min_score = float('inf')
                # min_substring = ""
                epi_substrings = get_sliding_window_substrings(epi, window_size=sub_len)
                for substring in epi_substrings:
                    wer = jiwer.wer(sub, substring)
                    if wer <= min_score:
                        min_score = wer
                        min_substring = substring
                        source_episode = epi
                        source_epi_id = epi_id
            if min_score <= 0.5:
                if source_epi_id not in temp_epi2sub:
                    temp_epi2sub[source_epi_id] = [sub_id]
                else:
                    temp_epi2sub[source_epi_id].append(sub_id)
                # print(sub_id+2, source_epi_id+2)
                # print("Subtitle:", sub_id+2, sub, len(sub.strip().split(" ")))
                # print("All Episode to compare:")
                # print(min_score, '|', min_substring, '|', source_episode)
                # print('=='*50)
                # count += 1
    # print(temp_epi2sub)
    # print(len(temp_epi2sub), len(turn_sub2epi_into_epi2sub(temp_epi2sub)))
    # print(count)
    output = {}
    for epi_id in sorted(list(temp_epi2sub.keys())):
        output[epi_id] = sorted(list(set(temp_epi2sub[epi_id])))
    return output

# epi2sub_0 = epi2sub
# print(len(epi2sub_0), len(turn_sub2epi_into_epi2sub(epi2sub_0)))
# epi2sub_1 = extend_neighbors_episode_sliding(en_subset, epi2sub_0, tbbt_episode)
# print(len(epi2sub_1), len(turn_sub2epi_into_epi2sub(epi2sub_1)))
# epi2sub_2 = extend_neighbors_episode_sliding(en_subset, epi2sub_1, tbbt_episode)
# print(len(epi2sub_2), len(turn_sub2epi_into_epi2sub(epi2sub_2)))

In [361]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    epi2sub = alignment_seeds[(season_id,episode_id)]
    # Extend the neighbors
    while True:
        temp = extend_neighbors(en_subset, epi2sub, tbbt_episode)
        if len(temp)==len(epi2sub) and len(turn_sub2epi_into_epi2sub(temp))==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break
        else:
            epi2sub = temp

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [362]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:", len(alignment_seeds[(i,j)]),len(temp), "|", "Subtitle Number:", len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 226 235 | Subtitle Number: 262 317
Season: 1 Episode: 2 Episode Number: 161 177 | Subtitle Number: 205 264
Season: 1 Episode: 3 Episode Number: 163 188 | Subtitle Number: 190 268
Season: 1 Episode: 4 Episode Number: 143 156 | Subtitle Number: 176 240
Season: 1 Episode: 5 Episode Number: 144 160 | Subtitle Number: 181 236
Season: 1 Episode: 6 Episode Number: 165 178 | Subtitle Number: 187 238
Season: 1 Episode: 7 Episode Number: 164 187 | Subtitle Number: 184 253
Season: 1 Episode: 8 Episode Number: 195 211 | Subtitle Number: 251 310
Season: 1 Episode: 9 Episode Number: 146 177 | Subtitle Number: 178 251
Season: 1 Episode: 10 Episode Number: 144 171 | Subtitle Number: 177 253
Season: 1 Episode: 11 Episode Number: 164 182 | Subtitle Number: 193 243
Season: 1 Episode: 12 Episode Number: 95 107 | Subtitle Number: 120 165
Season: 1 Episode: 13 Episode Number: 145 160 | Subtitle Number: 181 230
Season: 1 Episode: 14 Episode Number: 151 170 | Subtitle Numb

In [363]:
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [364]:
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [None]:
# Check the result
def check_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    print(season_id, episode_id)
    alignment = alignment_seeds[(season_id,episode_id)]
    for epi_id in alignment:
        if len(alignment[epi_id])!=max(alignment[epi_id])-min(alignment[epi_id])+1:
        # if max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id])==2:
            epi = transformation(tbbt_episode[epi_id][0].replace("’", " ").replace('…', " "))
            sub_ids = [i for i in range(min(alignment[epi_id]), max(alignment[epi_id])+1)]
            sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids])
            print(epi)
            print(sub)
            print(epi_id, alignment[epi_id], max(alignment[epi_id])-min(alignment[epi_id])+1-len(alignment[epi_id]), len(epi.strip().split(" ")), len(sub.strip().split(" ")), jiwer.wer(sub, epi))
            print('--')


    print("=="*50)

    # print(season_id, episode_id)
    # alignment = alignment_seeds[(season_id,episode_id)]
    # for item in alignment:
    #     epi = transformation(tbbt_episode[item][0].replace("’", " ").replace('…', " "))
    #     sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in alignment[item]])
    #     print(item, alignment[item], len(epi.strip().split(" ")), len(sub.strip().split(" ")))
    # print("=="*50)

    return epi2sub


further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = check_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)

In [262]:
alignment_seeds = further_alignment

In [264]:
# Add gaps of one within
def complete_gaps(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Check the alignment
    alignment = alignment_seeds[(season_id,episode_id)]
    for item in alignment:
        epi = transformation(tbbt_episode[item][0].replace("’", " ").replace('…', " "))
        sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in alignment[item]])
        print(item, alignment[item], len(epi.strip().split(" ")), len(sub.strip().split(" ")))
    print("=="*50)



    # return epi2sub
for (i, j) in alignment_seeds.keys():
    temp = complete_gaps(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)


0 [200, 201, 202, 203] 57 49
1 [204] 5 5
2 [205] 15 15
5 [206, 207, 208, 209, 210] 34 33
6 [212] 4 6
7 [213] 9 7
8 [214] 11 11
9 [215] 6 6
10 [216] 3 6
11 [217] 6 5
12 [218] 13 11
13 [220] 9 9
14 [221] 8 4
15 [222, 223, 224] 37 34
16 [225, 226] 17 17
17 [227, 228, 229] 52 49
18 [230] 8 8
19 [231] 3 3
20 [232] 7 6
21 [233] 4 4
22 [234] 6 11
23 [235] 14 10
24 [236] 6 6
28 [239] 8 8
30 [240] 9 8
31 [241] 2 2
32 [242] 19 19
33 [243, 244] 11 11
34 [246, 247] 19 15
35 [248] 9 10
36 [249] 10 9
39 [252] 6 6
40 [253] 11 10
45 [256] 11 11
46 [257] 4 4
47 [258, 259] 17 16
54 [262] 8 6
63 [266] 7 8
64 [267] 11 9
65 [268] 7 7
66 [269] 3 3
67 [270] 14 14
68 [271] 5 5
69 [272, 273] 13 13
70 [274, 275] 12 12
71 [276] 9 9
72 [277] 6 6
73 [278, 279] 15 14
74 [280] 9 8
75 [281, 282] 20 20
76 [283] 3 3
82 [285, 286] 62 61
83 [287] 26 21
84 [288] 8 8
86 [289] 9 9
88 [290] 10 10
89 [291] 7 7
90 [292] 6 6
93 [293] 11 11
94 [294] 5 5
96 [295, 296] 33 25
97 [297] 11 11
98 [298] 1 1
99 [299] 4 4
100 [300] 12 12

In [279]:
for (i, j) in further_alignment:
    print(i, j)
    alignment = further_alignment[(i, j)]
    sub2epi = turn_sub2epi_into_epi2sub(alignment)
    for sub_id in sub2epi:
        if len(sub2epi[sub_id])!=1:
            print(sub_id, sub2epi[sub_id])
    print('=='*50)

1 1
1 2
1 3
504 [239, 240]
1 4
1 5
1 6
1 7
318 [119, 121]
319 [119, 121]
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
2 1
343 [118, 119]
2 2
296 [71, 74]
2 3
2 4
2 5
355 [118, 122]
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
358 [170, 171]
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
3 1
3 2
282 [74, 76]
3 3
3 4
3 5
3 6
335 [97, 99]
452 [182, 184]
3 7
3 8
445 [199, 200]
446 [201, 202]
447 [201, 202]
3 9
3 10
486 [212, 214]
3 11
3 12
230 [19, 21]
3 13
426 [156, 157]
3 14
3 15
382 [121, 122]
3 16
3 17
318 [85, 86]
3 18
3 19
3 20
307 [84, 85]
3 21
3 22
3 23
4 1
4 2
4 3
4 4
205 [2, 3]
4 5
4 6
4 7
4 8
221 [28, 29]
4 9
4 10
321 [104, 105]
4 11
4 12
4 13
236 [30, 31]
4 14
450 [189, 195]
4 15
4 17
4 18
4 19
4 20
4 21
4 22
4 23
4 24
5 1
263 [47, 48]
5 2
5 3
5 4
5 5
5 6
5 7
5 8
248 [31, 32]
280 [49, 50]
5 9
5 10
5 11
5 12
5 13
422 [194, 195]
5 14
301 [66, 67]
5 15
5 16
339 [93, 94]
5 17
344 [89, 91]
413 [131, 134]
5 18
5 19
314 [74, 75]
316 [75, 76]
466 [181, 182]
5 20
5 21
259 [38, 40]
5 22
5 23
6 1

In [218]:
alignment_seeds = further_alignment

In [273]:
# Check the result
def check_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    print(season_id, episode_id)
    alignment = alignment_seeds[(season_id,episode_id)]
    for epi_id in alignment:
        if len(alignment[epi_id])!=max(alignment[epi_id])-min(alignment[epi_id])+1:
        # if max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id])==2:
            epi = transformation(tbbt_episode[epi_id][0].replace("’", " ").replace('…', " "))
            sub_ids = [i for i in range(min(alignment[epi_id]), max(alignment[epi_id])+1)]
            sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids])
            print(epi)
            print(sub)
            print(epi_id, alignment[epi_id], max(alignment[epi_id])-min(alignment[epi_id])+1-len(alignment[epi_id]), len(epi.strip().split(" ")), len(sub.strip().split(" ")), jiwer.wer(sub, epi))
            print('--')


    print("=="*50)

    # print(season_id, episode_id)
    # alignment = alignment_seeds[(season_id,episode_id)]
    # for item in alignment:
    #     epi = transformation(tbbt_episode[item][0].replace("’", " ").replace('…', " "))
    #     sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in alignment[item]])
    #     print(item, alignment[item], len(epi.strip().split(" ")), len(sub.strip().split(" ")))
    # print("=="*50)

    return epi2sub


further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = check_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)

1 1
in the winter that seat is close enough to the radiator to remain warm and yet not so close as to cause perspiration in the summer it s directly in the path of a cross breeze created by open windows there and there it faces the television at an angle that is neither direct thus discouraging conversation nor so far wide to create a parallax distortion i could go on but i think i ve made my point
in winter that seat is close enough to the radiator to remain warm yet not so close as to cause perspiration in the summer it is in a cross breeze created by opening windows there and there it faces the television at an angle that is neither direct discouraging conversation nor so far wide as to create a parallax distortion i could go on but i think i have made my point
116 [314, 316, 317] 1 78 72 0.1527777777777778
--
i know right okay let s see what else um that s about it that s the story of penny
i know right okay let us see what else i guess that is about it that is the story of penny
1

In [261]:
for x in alignment_seeds:
    if x!=(1,1):
        continue
    print(x)
    alignment = alignment_seeds[x]

    # for epi_id in temp:
    #     if len(temp[epi_id])!=max(temp[epi_id])-min(temp[epi_id])+1:
    #         print(epi_id, temp[epi_id], max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id]))
    #         print([i for i in range(min(temp[epi_id]), max(temp[epi_id])+1)])

    for item in alignment:
        epi = transformation(tbbt_episode[item][0].replace("’", " ").replace('…', " "))
        sub_ids = [i for i in range(min(alignment[item]), max(alignment[item])+1)]
        sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids])
        print(item, len(epi.strip().split(" ")), len(sub.strip().split(" ")), abs(len(epi.strip().split(" "))- len(sub.strip().split(" "))))
        print()
        print(epi)
        print()
        print(sub)
        print('--')

    print('=='*50)

(1, 1)
0 57 49 8

so if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it s unobserved it will however if it s observed after it s left the plane but before it hits its target it will not have gone through both slits

if a photon is directed through a plane with two slits in it and either is observed it will not go through both if unobserved it will if it is observed after it left the plane before it hits its target it will not have gone through both slits
--
1 5 5 0

agreed what s your point

agreed what is your point
--
2 15 15 0

there s no point i just think it s a good idea for a teeshirt

there is no point i just think it is a good idea for a tshirt
--
5 34 33 1

one across is aegean eight down is nabakov twentysix across is mcm fourteen down is move your finger phylum which makes fourteen across portauprince see papa doc s capital idea that s portauprince haiti

one across is aegean eight down is nabokov

In [248]:
count = 0
for x in alignment_seeds:
    print(x)
    temp = alignment_seeds[x]
    for epi_id in temp:
        if len(temp[epi_id])!=max(temp[epi_id])-min(temp[epi_id])+1:
            count += len(temp[epi_id])
        # if max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id])==2:
            print(epi_id, temp[epi_id], max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id]))
    print('=='*50)

(1, 1)
116 [314, 316, 317] 1
149 [352, 353, 355] 1
161 [370, 372, 373, 374, 375] 1
(1, 2)
57 [238, 241] 2
81 [279, 280, 282, 283] 1
143 [342, 343, 345] 1
176 [388, 390, 391] 1
222 [453, 455] 1
(1, 3)
26 [222, 224] 1
169 [409, 410, 411, 412, 413, 414, 416, 418, 420] 3
(1, 4)
133 [364, 366, 367] 1
153 [387, 388, 389, 390, 392, 393] 1
(1, 5)
127 [350, 351, 352, 354, 355] 1
134 [365, 368] 2
167 [408, 409, 410, 412] 1
210 [458, 460] 1
(1, 6)
59 [262, 263, 265, 266, 267] 1
92 [314, 317] 2
223 [471, 472, 473, 474, 476] 1
(1, 7)
97 [297, 299, 300, 301] 1
121 [318, 319, 321] 1
132 [330, 335] 4
143 [345, 348] 2
195 [411, 418, 419] 6
224 [447, 452, 453] 4
252 [483, 485] 1
261 [493, 497] 3
(1, 8)
74 [243, 244, 246, 247, 248] 1
89 [265, 268, 269, 272, 275] 6
106 [296, 298] 1
111 [305, 306, 308, 311] 3
(1, 9)
13 [217, 220] 2
67 [295, 296, 297, 298, 300, 301] 1
99 [340, 342] 1
113 [360, 362] 1
187 [453, 455] 1
215 [492, 493, 501] 7
(1, 10)
76 [269, 270, 272] 1
182 [417, 419] 1
(1, 11)
117 [318, 327, 

In [249]:
print(count)

2713


In [224]:
for x in alignment_seeds:
    print(x)
    temp = alignment_seeds[x]
    for item in temp:
        print(item, temp[item])
    print("=="*50)

(1, 1)
0 [200, 201, 202, 203]
1 [204]
2 [205]
5 [206, 207, 208, 209, 210]
6 [212]
7 [213]
8 [214]
9 [215]
10 [216]
11 [217]
12 [218]
13 [220]
14 [221]
15 [222, 223, 224]
16 [225, 226]
17 [227, 228, 229]
18 [230]
19 [231]
20 [232]
21 [233]
22 [234]
23 [235]
24 [236]
28 [239]
30 [240]
31 [241]
32 [242]
33 [243, 244]
34 [246, 247]
35 [248]
36 [249]
39 [252]
40 [253]
45 [256]
46 [257]
47 [258, 259]
54 [262]
63 [266]
64 [267]
65 [268]
66 [269]
67 [270]
68 [271]
69 [272, 273]
70 [274, 275]
71 [276]
72 [277]
73 [278, 279]
74 [280]
75 [281, 282]
76 [283]
82 [285, 286]
83 [287]
84 [288]
86 [289]
88 [290]
89 [291]
90 [292]
93 [293]
94 [294]
96 [295, 296]
97 [297]
98 [298]
99 [299]
100 [300]
102 [301, 302]
104 [303]
105 [304]
106 [305]
107 [306]
108 [307]
109 [308]
111 [311]
112 [312]
115 [313]
116 [314, 316, 317]
117 [318]
119 [319]
123 [321, 322]
124 [323]
126 [325]
127 [326]
128 [327]
131 [329]
132 [330]
133 [331, 332]
134 [333]
135 [334]
136 [335]
137 [336]
138 [337, 338, 339, 340]
139 [341]


In [206]:
for x in alignment_seeds:
    print(x)
    print(get_final_stage_gap_pairs(alignment_seeds[x]))

(1, 1)
{(5, 6): [211], (12, 13): [219], (24, 28): [237, 238], (33, 34): [245], (36, 39): [250, 251], (40, 45): [254, 255], (47, 54): [260, 261], (54, 63): [263, 264, 265], (76, 82): [284], (109, 111): [309, 310], (119, 123): [320], (124, 126): [324], (128, 131): [328], (150, 151): [357], (151, 154): [359], (161, 162): [376], (164, 169): [381], (171, 173): [385], (193, 195): [410], (198, 200): [414, 415], (208, 209): [426], (211, 214): [432], (215, 219): [436, 437, 438], (219, 221): [440, 441], (221, 222): [443], (222, 223): [445], (249, 250): [480], (250, 252): [482], (272, 275): [505], (284, 286): [518], (286, 288): [521], (299, 301): [539, 540]}
(1, 2)
{(20, 22): [189], (22, 24): [191], (28, 30): [197, 198], (36, 39): [206, 207], (39, 40): [209], (54, 55): [235], (64, 65): [253], (65, 67): [256, 257], (74, 76): [269], (76, 77): [272], (77, 79): [275, 276], (100, 103): [301, 302], (103, 105): [304], (108, 109): [308, 309], (123, 128): [326], (128, 129): [328], (148, 151): [349, 350], 

In [None]:
for item in alignment_seeds:
    gap = get_final_stage_gap_pairs(alignment_seeds[x])
    print("Episodes:")
    for epi_min, epi_max in gap:
        for i in range(epi_min, epi_max):
            print()

In [82]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]

In [137]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(" ")
    substrings = []
    for i in range(len(input_tokens)-window_size):
        substrings.append(" ".join(input_tokens[i: i+window_size]))
    return substrings

In [73]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [74]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [75]:
def sliding_within_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        for epi_id in epi_ids:
            epi = tbbt_episode

        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]


    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [83]:
"""
Add neighbor subtitle from the utterance
"""
def extend_subtitles_to_episode_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

extend_subtitles_to_episode_sliding_window(en_subset, epi2sub, tbbt_transcripts)

KeyError: 0

In [77]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [175]:
alignment_seeds = final_stage_alignment

In [176]:
# for x in alignment_seeds:
#     print(x)
#     print(alignment_seeds[x])
#     print("=="*50)

In [177]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]

In [103]:
"""
Extend the former subtitle and latter subtitle near the episode
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]
    print(subtitle_gaps)

    # Check whether the subtitle could be merged into utterances using sliding window
    for start_epi_id, end_epi_id in subtitle_gaps:
        start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
        end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
        # print(start_epi_id, start_epi)
        # print(end_epi_id, end_epi)
        # print('--')

        for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
            sub = transformation(en_subset[sub_id])
            sub_substrings = get_sliding_window_substrings(sub, 6)
            sub_substrings.append(sub)
            # print(sub_id, sub)
            # print(sub_substrings)
            temp_start = [substring in start_epi for substring in sub_substrings]
            signal_start = True in temp_start
            temp_end = [substring in end_epi for substring in sub_substrings]
            signal_end = True in temp_end
            # print(temp_start, signal_start)
            # print(temp_end, signal_end)
            if signal_start==True and signal_end==True:
                print(start_epi_id, start_epi)
                print(end_epi_id, end_epi)
                print(sub, "|", signal_start, "|", signal_end)
                print('--')

        # print('=='*50)

extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

{(0, 1): [201, 202, 203], (2, 5): [206, 207, 208, 209], (5, 6): [211], (12, 13): [219], (14, 15): [222, 223], (16, 17): [227, 228], (22, 24): [235], (24, 28): [237, 238], (33, 34): [245], (36, 39): [250, 251], (40, 45): [254, 255], (46, 63): [258, 259, 260, 261, 262, 263, 264, 265], (74, 75): [281], (76, 82): [284], (82, 84): [287], (96, 97): [296], (109, 111): [309, 310], (116, 117): [317], (119, 123): [320], (124, 126): [324], (128, 131): [328], (133, 134): [332], (138, 139): [338, 339, 340], (144, 145): [347], (150, 151): [357], (151, 154): [359], (154, 155): [361], (159, 160): [367], (160, 161): [370, 371, 372], (161, 162): [376, 377], (164, 169): [381], (171, 173): [385], (181, 183): [394, 395], (183, 185): [397, 398], (193, 195): [410], (198, 200): [414, 415], (208, 209): [426], (209, 210): [428, 429], (211, 214): [432], (215, 221): [436, 437, 438, 439, 440, 441], (221, 222): [443], (222, 224): [445, 446], (235, 236): [462], (241, 242): [471], (249, 250): [480], (250, 252): [482]

In [202]:
def get_final_stage_gap_pairs(epi2sub):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Perform string match and CER Scoring
    return subtitle_gaps

final_gap = get_final_stage_gap_pairs(epi2sub)

for item in final_gap:
    epi_ids = [i+2 for i in range(item[0], item[1]+1)]
    sub_ids = [x+2 for x in final_gap[item]]
    print(epi_ids)
    print(sub_ids)
    print('=='*50)

[2, 3]
[203, 204, 205]
[4, 5, 6, 7]
[208, 209, 210, 211]
[7, 8]
[213]
[14, 15]
[221]
[16, 17]
[224, 225]
[18, 19]
[229, 230]
[24, 25, 26]
[237]
[26, 27, 28, 29, 30]
[239, 240]
[35, 36]
[247]
[38, 39, 40, 41]
[252, 253]
[42, 43, 44, 45, 46, 47]
[256, 257]
[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]
[260, 261, 262, 263, 264, 265, 266, 267]
[76, 77]
[283]
[78, 79, 80, 81, 82, 83, 84]
[286]
[84, 85, 86]
[289]
[98, 99]
[298]
[111, 112, 113]
[311, 312]
[118, 119]
[319]
[121, 122, 123, 124, 125]
[322]
[126, 127, 128]
[326]
[130, 131, 132, 133]
[330]
[135, 136]
[334]
[140, 141]
[340, 341, 342]
[146, 147]
[349]
[152, 153]
[359]
[153, 154, 155, 156]
[361]
[156, 157]
[363]
[161, 162]
[369]
[162, 163]
[372, 373, 374]
[163, 164]
[378, 379]
[166, 167, 168, 169, 170, 171]
[383]
[173, 174, 175]
[387]
[183, 184, 185]
[396, 397]
[185, 186, 187]
[399, 400]
[195, 196, 197]
[412]
[200, 201, 202]
[416, 417]
[210, 211]
[428]
[211, 212]
[430, 431]
[213, 214, 215, 216]
[434]
[217, 

In [181]:
with open('alignment_results/zh/final_stage_alignment.pkl', 'rb') as f:
    final_stage_alignment = pkl.load(f)

# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

alignment_seeds = final_stage_alignment
epi2sub = alignment_seeds[(1,1)]

In [188]:
"""
Explore the neighbor subtitles of a episode.

Given a episode utterance (epi_id), then we fetch the unaligned subtitle (sub_id)
[epi_id_0, epi_id_1, etc., epi_id_n] - [sub_id_0, sub_id_1, etc. sub_id_m]

Then, we search within the subset-pair

For each subtitle, we use sliding window to fetch a set of substrings in each episode utterance and calculate the CER
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    count = 0
    # Gather the gap of subtitle corresponding to episode utterance
    subtitle_gaps = get_final_stage_gap_pairs(epi2sub)

    # Iterate the whole subtitle gaps to perform substring match
    temp_epi2sub = deepcopy(epi2sub)
    for item in subtitle_gaps:
        epi_ids = [i for i in range(item[0], item[1]+1)]
        sub_ids = [i for i in subtitle_gaps[item]]

        # Fetch all episodes and subtitles
        epis = [transformation(tbbt_episode[i][0].replace("’", " ").replace('…', " ")) for i in epi_ids]
        subs = [transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids]

        # print(epi_ids)
        # print(epis)
        # print(sub_ids)
        # print(subs)
        for sub_id, sub in zip(sub_ids, subs):
            sub_len = len(sub.strip().split(" "))
            if sub_len <=3:
                continue
            min_score = float('inf')
            min_substring = ""
            source_episode = ""
            source_sub_id = float('inf')
            source_epi_id = float('inf')
            for epi, epi_id in zip(epis, epi_ids):
                # min_score = float('inf')
                # min_substring = ""
                epi_substrings = get_sliding_window_substrings(epi, window_size=sub_len)
                for substring in epi_substrings:
                    wer = jiwer.wer(sub, substring)
                    if wer <= min_score:
                        min_score = wer
                        min_substring = substring
                        source_episode = epi
                        source_epi_id = epi_id
            if min_score <= 0.5:
                if source_epi_id not in temp_epi2sub:
                    temp_epi2sub[source_epi_id] = [sub_id]
                else:
                    temp_epi2sub[source_epi_id].append(sub_id)
                # print(sub_id+2, source_epi_id+2)
                # print("Subtitle:", sub_id+2, sub, len(sub.strip().split(" ")))
                # print("All Episode to compare:")
                # print(min_score, '|', min_substring, '|', source_episode)
                # print('=='*50)
                # count += 1
    # print(temp_epi2sub)
    # print(len(temp_epi2sub), len(turn_sub2epi_into_epi2sub(temp_epi2sub)))
    # print(count)
    output = {}
    for epi_id in sorted(list(temp_epi2sub.keys())):
        output[epi_id] = sorted(list(set(temp_epi2sub[epi_id])))
    return output

# epi2sub_0 = epi2sub
# print(len(epi2sub_0), len(turn_sub2epi_into_epi2sub(epi2sub_0)))
# epi2sub_1 = extend_neighbors_episode_sliding(en_subset, epi2sub_0, tbbt_episode)
# print(len(epi2sub_1), len(turn_sub2epi_into_epi2sub(epi2sub_1)))
# epi2sub_2 = extend_neighbors_episode_sliding(en_subset, epi2sub_1, tbbt_episode)
# print(len(epi2sub_2), len(turn_sub2epi_into_epi2sub(epi2sub_2)))

In [189]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, alignment_seeds[(i,j)], tbbt_episode)

    return epi2sub

In [192]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:", len(alignment_seeds[(i,j)]),len(temp), "|", "Subtitle Number:", len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 226 235 | Subtitle Number: 260 309
Season: 1 Episode: 2 Episode Number: 161 177 | Subtitle Number: 203 253
Season: 1 Episode: 3 Episode Number: 163 188 | Subtitle Number: 191 265
Season: 1 Episode: 4 Episode Number: 143 156 | Subtitle Number: 174 232
Season: 1 Episode: 5 Episode Number: 144 160 | Subtitle Number: 178 226
Season: 1 Episode: 6 Episode Number: 165 178 | Subtitle Number: 187 235
Season: 1 Episode: 7 Episode Number: 166 186 | Subtitle Number: 186 251
Season: 1 Episode: 8 Episode Number: 195 211 | Subtitle Number: 243 298
Season: 1 Episode: 9 Episode Number: 146 177 | Subtitle Number: 177 243
Season: 1 Episode: 10 Episode Number: 144 171 | Subtitle Number: 176 245
Season: 1 Episode: 11 Episode Number: 164 181 | Subtitle Number: 187 229
Season: 1 Episode: 12 Episode Number: 95 107 | Subtitle Number: 118 162
Season: 1 Episode: 13 Episode Number: 145 160 | Subtitle Number: 175 216
Season: 1 Episode: 14 Episode Number: 151 170 | Subtitle Numb

In [193]:
with open('alignment_results/zh/final_stage_alignment_0.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [194]:
with open('alignment_results/zh/final_stage_alignment_0.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [191]:
for x in further_alignment:
    aligned_epi_num = len(alignment_seeds[x])
    aligned_sub_num = len(turn_sub2epi_into_epi2sub(alignment_seeds[x]))
    all_epi_num = max(alignment_seeds[x])-min(alignment_seeds[x])
    all_sub_num = max(turn_sub2epi_into_epi2sub(alignment_seeds[x]))-min(turn_sub2epi_into_epi2sub(alignment_seeds[x]))
    print(x, aligned_epi_num, all_epi_num, int(100*(aligned_epi_num/all_epi_num)), "|", aligned_sub_num, all_sub_num, int(100*(aligned_sub_num/all_sub_num)))

(1, 1) 226 314 71 | 260 355 73
(1, 2) 161 230 70 | 203 299 67
(1, 3) 163 237 68 | 191 313 61
(1, 4) 143 214 66 | 174 291 59
(1, 5) 144 220 65 | 178 283 62
(1, 6) 165 236 69 | 187 292 64
(1, 7) 166 265 62 | 186 319 58
(1, 8) 195 261 74 | 243 356 68
(1, 9) 146 217 67 | 177 308 57
(1, 10) 144 223 64 | 176 289 60
(1, 11) 164 241 68 | 187 293 63
(1, 12) 95 126 75 | 118 481 24
(1, 13) 145 244 59 | 175 307 57
(1, 14) 151 228 66 | 188 311 60
(1, 15) 157 230 68 | 193 319 60
(1, 16) 144 221 65 | 164 285 57
(2, 1) 161 213 75 | 206 287 71
(2, 2) 166 240 69 | 191 310 61
(2, 3) 152 230 66 | 175 297 58
(2, 4) 162 250 64 | 193 309 62
(2, 5) 138 201 68 | 166 258 64
(2, 6) 171 228 75 | 200 304 65
(2, 7) 173 233 74 | 212 284 74
(2, 8) 135 224 60 | 161 274 58
(2, 9) 150 203 73 | 194 278 69
(2, 10) 144 207 69 | 183 276 66
(2, 11) 162 213 76 | 203 298 68
(2, 12) 149 200 74 | 192 299 64
(2, 13) 156 199 78 | 205 316 64
(2, 14) 190 240 79 | 234 302 77
(2, 15) 183 212 86 | 216 278 77
(2, 16) 183 237 77 | 233 28

In [99]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, alignment_seeds[(i,j)], tbbt_episode)

    return epi2sub

In [100]:
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    # print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

241 to mend her broken heart this situation is much less complicated there s some kind of dispute between penny and her exboyfriend as to who gets custody of the tv she just wanted to avoid having a scene with him
242 so we get to have a scene with him
she just wanted to avoid a scene with him | True | True
--
297 no it was a valid hypothesis
298 that was a valid hypothesis what is happening to you
that was a valid hypo | True | True
--
299 really thank you so much for going and trying you re uh you re so terrific why don t you put some clothes on i ll get my purse and dinner is on me okay
301 thank you you re not done with her are you
thank you | True | True
--
51 oh dear god shouting leonard leonard i m sick
53 leonard leonard leonard leonard my comforter fell down and my sinuses hurt when i bend over leonard bends to get phone ow
leonard | True | True
--
69 can i get a hallelujah
71 hallelujah
hallelujah | True | True
--
78 do you like motorcycles ‘cos i ride a hog
79 a hog you have

In [516]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(season_id,episode_id)]), tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Add within the gap
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_one_size_gap(gaps, epi2sub)

    # Further extend neighbors
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [527]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
        further_alignment[(i,j)] = temp
        print("Season:", i,"Episode:", j, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
    except:
        pass

Season: 1 Episode: 2 Episode Number: 158 Subtitle Number: 225
Season: 1 Episode: 4 Episode Number: 144 Subtitle Number: 212
Season: 1 Episode: 5 Episode Number: 150 Subtitle Number: 209
Season: 1 Episode: 9 Episode Number: 147 Subtitle Number: 197
Season: 1 Episode: 10 Episode Number: 142 Subtitle Number: 202
Season: 1 Episode: 12 Episode Number: 95 Subtitle Number: 133
Season: 1 Episode: 14 Episode Number: 151 Subtitle Number: 211
Season: 1 Episode: 15 Episode Number: 156 Subtitle Number: 213
Season: 1 Episode: 16 Episode Number: 144 Subtitle Number: 198
Season: 2 Episode: 1 Episode Number: 160 Subtitle Number: 229
Season: 2 Episode: 3 Episode Number: 154 Subtitle Number: 213
Season: 2 Episode: 5 Episode Number: 138 Subtitle Number: 184
Season: 2 Episode: 6 Episode Number: 171 Subtitle Number: 223
Season: 2 Episode: 8 Episode Number: 141 Subtitle Number: 193
Season: 2 Episode: 9 Episode Number: 151 Subtitle Number: 230
Season: 2 Episode: 10 Episode Number: 143 Subtitle Number: 202
Sea

In [517]:
further_alignment = {}
for i in tqdm(range(3)):
    for j in tqdm(range(4)):
        try:
            temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, alignment_seeds, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/3 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:00<00:00, 125.33it/s]


In [404]:
with open('final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [10]:
with open('final_stage_alignment.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [103]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(' ')
    substrings = []
    for i in range(len(input_tokens)-3):
        substrings.append(" ".join(input_tokens[i: i+4]))
    return substrings

In [None]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [23]:
def sliding_within_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        for epi_id in epi_ids:
            epi = tbbt_episode

        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]


    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [117]:
"""
Add neighbor subtitle from the utterance
"""
def extend_subtitles_to_episode_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [118]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [81]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 136: [335

In [135]:
"""
Extend the former subtitle and latter subtitle near the episode
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Check whether the subtitle could be merged into utterances using sliding window
    for start_epi_id, end_epi_id in subtitle_gaps:
        start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
        end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
        # print(start_epi_id, start_epi)
        # print(end_epi_id, end_epi)
        # print('--')

        for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
            sub = transformation(en_subset[sub_id])
            sub_substrings = get_sliding_window_substrings(sub, 6)
            sub_substrings.append(sub)
            # print(sub_id, sub)
            # print(sub_substrings)
            temp_start = [substring in start_epi for substring in sub_substrings]
            signal_start = True in temp_start
            temp_end = [substring in end_epi for substring in sub_substrings]
            signal_end = True in temp_end
            # print(temp_start, signal_start)
            # print(temp_end, signal_end)
            if signal_start==True and signal_end==True:
                print(start_epi_id, start_epi)
                print(end_epi_id, end_epi)
                print(sub, "|", signal_start, "|", signal_end)
            # print('--')

        print('=='*50)

extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

241 to mend her broken heart this situation is much less complicated there s some kind of dispute between penny and her exboyfriend as to who gets custody of the tv she just wanted to avoid having a scene with him
242 so we get to have a scene with him
she just wanted to avoid a scene with him | True | True
297 no it was a valid hypothesis
298 that was a valid hypothesis what is happening to you
that was a valid hypo | True | True
299 really thank you so much for going and trying you re uh you re so terrific why don t you put some clothes on i ll get my purse and dinner is on me okay
301 thank you you re not done with her are you
thank you | True | True


In [116]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]


# Check whether the subtitle could be merged into utterances using sliding window
for start_epi_id, end_epi_id in subtitle_gaps:
    start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
    end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
    print(start_epi_id, start_epi)
    print(end_epi_id, end_epi)
    print('--')

    for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
        sub = transformation(en_subset[sub_id])
        sub_substrings = get_sliding_window_substrings(sub, 4)
        sub_substrings.append(sub)
        print(sub_id, sub)
        print(sub_substrings)
        temp_start = [substring in start_epi for substring in sub_substrings]
        temp_end = [substring in end_epi for substring in sub_substrings]
        print(temp_start)
        print(temp_end)
        print('--')

    print('=='*50)

0 so if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it s unobserved it will however if it s observed after it s left the plane but before it hits its target it will not have gone through both slits
1 agreed what s your point
--
201 if unobserved it will
['if unobserved it will', 'if unobserved it will']
[False, False]
[False, False]
--
202 if it is observed after it left the plane before it hits its target
['if it is observed', 'it is observed after', 'is observed after it', 'observed after it left', 'after it left the', 'it left the plane', 'left the plane before', 'the plane before it', 'plane before it hits', 'before it hits its', 'it hits its target', 'if it is observed after it left the plane before it hits its target']
[False, False, False, False, False, False, False, False, False, True, True, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
--
203 it will not

In [104]:
input_string = transformation("One across is Aegean")
substrings = get_sliding_window_substrings(input_string, 7)
print(substrings)

['one across is aegean']


{(0, 1): [201, 202, 203], (2, 5): [206, 207, 208, 209], (5, 6): [211], (12, 13): [219], (14, 15): [222, 223], (16, 17): [227, 228], (22, 24): [235], (24, 28): [237, 238], (33, 34): [245], (36, 39): [250, 251], (40, 45): [254, 255], (46, 63): [258, 259, 260, 261, 262, 263, 264, 265], (74, 75): [281], (76, 82): [284], (82, 84): [287], (96, 97): [296], (109, 111): [309, 310], (116, 117): [317], (119, 123): [320], (124, 126): [324], (128, 131): [328], (133, 134): [332], (138, 139): [338, 339, 340], (144, 145): [347], (150, 151): [357], (151, 154): [359], (154, 155): [361], (159, 160): [367], (160, 161): [370, 371, 372], (161, 162): [376, 377], (164, 169): [381], (171, 173): [385], (181, 183): [394, 395], (183, 185): [397, 398], (193, 195): [410], (198, 200): [414, 415], (208, 209): [426], (209, 210): [428, 429], (211, 214): [432], (215, 221): [436, 437, 438, 439, 440, 441], (221, 222): [443], (222, 224): [445, 446], (235, 236): [462], (241, 242): [471], (249, 250): [480], (250, 252): [482]

In [79]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    print(epi_start, epi2sub[epi_start])
    print(epi_end, epi2sub[epi_end])
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        # value = (max(epi2sub[epi_start])+1, min(epi2sub[epi_end])-1)
        value = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]
        print(key, value)
    else:
        print("False")
    print()

0 [200]
1 [204]
(0, 1) [201, 202, 203]

1 [204]
2 [205]
False

2 [205]
5 [210]
(2, 5) [206, 207, 208, 209]

5 [210]
6 [212]
(5, 6) [211]

6 [212]
7 [213]
False

7 [213]
8 [214]
False

8 [214]
9 [215]
False

9 [215]
10 [216]
False

10 [216]
11 [217]
False

11 [217]
12 [218]
False

12 [218]
13 [220]
(12, 13) [219]

13 [220]
14 [221]
False

14 [221]
15 [224]
(14, 15) [222, 223]

15 [224]
16 [225, 226]
False

16 [225, 226]
17 [229]
(16, 17) [227, 228]

17 [229]
18 [230]
False

18 [230]
19 [231]
False

19 [231]
20 [232]
False

20 [232]
21 [233]
False

21 [233]
22 [234]
False

22 [234]
24 [236]
(22, 24) [235]

24 [236]
28 [239]
(24, 28) [237, 238]

28 [239]
30 [240]
False

30 [240]
31 [241]
False

31 [241]
32 [242]
False

32 [242]
33 [243, 244]
False

33 [243, 244]
34 [246, 247]
(33, 34) [245]

34 [246, 247]
35 [248]
False

35 [248]
36 [249]
False

36 [249]
39 [252]
(36, 39) [250, 251]

39 [252]
40 [253]
False

40 [253]
45 [256]
(40, 45) [254, 255]

45 [256]
46 [257]
False

46 [257]
63 [266]

In [65]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))
start = min(epi2sub[epi_keys[0]])
end = max(epi2sub[epi_keys[-1]])
for j in range(start, end+1):
    print(j)

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449


In [61]:
print(start, end)
print(epi_keys)
print(sub_keys)

200 555
[0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 30, 31, 32, 33, 34, 35, 36, 39, 40, 45, 46, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 82, 84, 86, 88, 89, 90, 93, 94, 96, 97, 98, 99, 100, 102, 104, 105, 106, 107, 108, 109, 111, 112, 115, 116, 117, 119, 123, 124, 126, 127, 128, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 154, 155, 157, 158, 159, 160, 161, 162, 163, 164, 169, 170, 171, 173, 174, 175, 176, 177, 179, 180, 181, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 195, 197, 198, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 214, 215, 221, 222, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 252, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 268, 269, 270, 272, 275, 277, 278, 279, 280, 281, 282, 283, 284, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297,

In [None]:
# Extend the neighbor subtitles to episode utterance


In [25]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

NameError: name 'extend_neighbors_sliding' is not defined

In [540]:
# Add within the gap
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_one_size_gap(gaps, epi2sub)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{2: [168], 3: [169], 5: [171], 6: [172], 7: [173, 174], 8: [175], 9: [176, 177], 11: [178], 12: [179], 14: [180], 15: [181], 16: [182], 17: [183, 184], 18: [185], 19: [186], 20: [187, 188], 22: [190, 191], 24: [192], 25: [193], 26: [194], 27: [195], 28: [196], 31: [200], 32: [201], 33: [202], 35: [204], 36: [205, 206], 39: [208], 40: [209, 210], 41: [211], 42: [212, 213, 214], 44: [215], 45: [216], 46: [217, 218, 219, 220], 47: [221], 48: [222, 223, 224, 225], 49: [226], 52: [231], 53: [232], 54: [233, 234], 55: [235, 236], 56: [237], 57: [240, 241], 58: [242], 60: [243], 61: [245, 246], 62: [250], 63: [251], 64: [252], 65: [253, 254, 255], 68: [260], 70: [263], 71: [264], 72: [265], 73: [266], 74: [267, 268], 77: [273, 274], 80: [278, 279], 81: [278, 279, 280, 282, 283], 82: [284, 285], 84: [286, 287], 85: [288], 86: [289], 90: [291], 91: [292], 92: [293], 93: [294, 295], 94: [296], 95: [297], 98: [300, 301], 103: [303], 105: [305, 306], 107: [308], 108: [309], 109: [310], 114: [314],

In [541]:
for x in gaps:
    print(x)

[[4], [170]]
[[21], [189]]
[[29, 30], [197, 198, 199]]
[[34], [203]]
[[37, 38], [207]]
[[50, 51], [227, 228, 229, 230]]
[[66, 67], [256, 257, 258, 259]]
[[69], [261, 262]]
[[75, 76], [269, 270, 271, 272]]
[[78, 79], [275, 276, 277]]
[[87, 88, 89], [290]]
[[96, 97], [298, 299]]
[[99, 100, 101, 102], [302]]
[[104], [304]]
[[106], [307]]
[[110, 111, 112, 113], [311, 312, 313]]
[[124, 125], [326]]
[[135, 136], [335]]
[[138], [337]]
[[149, 150], [349, 350]]
[[152], [352]]
[[160], [359, 360, 361, 362]]
[[170], [375]]
[[199, 200, 201], [425]]
[[212], [438, 439]]
[[216, 217], [443, 444]]
[[220], [449, 450, 451]]
[[224], [457]]
[[227, 228, 229, 230, 231], [461, 462, 463, 464, 465, 466]]


In [542]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

[170] [4]
Episode:
4  Are there any chopsticks?

Subtitle:
2 - Any chopsticks?
[189] [21]
Episode:
1  Yes.

Subtitle:
3 - Yes. - Oh.
[203] [34]
Episode:
25  Yeah, yeah, if it gets here and I’m not here tomorrow could you just sign for it and have them put it in my apartment.

Subtitle:
18 If it gets here and I'm not here, could you sign for it and put it in?
[304] [104]
Episode:
15  Oh, great, thank you again (she throws her jacket over the back of the sofa).

Subtitle:
3 Thank you again.
[307] [106]
Episode:
4  What’s he talking about?

Subtitle:
5 I'm here for you.
[337] [138]
Episode:
16  You know what, you’ve convinced me, maybe tonight we should sneak in and shampoo her carpet.

Subtitle:
13 You've convinced me, maybe we should sneak in and shampoo her carpet.
[352] [152]
Episode:
11  You came into my apartment last night when I was sleeping?

Subtitle:
9 You came into my apartment while I was sleeping?
[375] [170]
Episode:
1  No.

Subtitle:
3 - No! - Whoo.
[457] [224]
Episode:
17

In [497]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{2: [168], 3: [169], 4: [170], 5: [171], 6: [172], 7: [173, 174], 8: [175], 9: [176, 177], 11: [178], 12: [179], 14: [180], 15: [181], 16: [182], 17: [183, 184], 18: [185], 19: [186], 20: [187, 188], 22: [190, 191], 24: [192], 25: [193], 26: [194], 27: [195], 28: [196], 31: [200], 32: [201], 33: [202], 35: [204], 36: [205, 206], 39: [208], 40: [209, 210], 41: [211], 42: [212, 213, 214], 44: [215], 45: [216], 46: [217, 218, 219, 220], 47: [221], 48: [222, 223, 224, 225], 49: [226], 52: [231], 53: [232], 54: [233, 234], 55: [235, 236], 56: [237], 57: [240, 241], 58: [242], 60: [243], 61: [245, 246], 62: [250], 63: [251], 64: [252], 65: [254, 255], 68: [260], 70: [263], 71: [264], 72: [265], 73: [266], 74: [267, 268], 77: [273, 274], 80: [278, 279], 81: [278, 279, 280, 282, 283], 82: [284, 285], 84: [287, 288], 86: [289], 88: [290], 90: [291], 91: [292], 92: [293], 93: [294, 295], 94: [296], 97: [297, 298], 98: [298], 99: [299], 100: [300], 103: [303], 105: [305, 306], 108: [307], 109: [3

In [425]:
# Merge neighbors into the current sentence
# From Episode side
for epi_id in seeds_final:
    epi = transformation(tbbt_episode[epi_id][0])
    sub = transformation("".join([en_subset[item] for item in seeds_final[epi_id]]))
    print(epi_id, seeds_final[epi_id])
    print(epi)
    print(sub)
    print(len(epi.split(' ')), len(sub.split(' ')), abs(len(epi.split(' '))-len(sub.split(' '))))

    print('=='*50)

2 [168]
uh i’m not sure everyone keep an eye on howard in case he starts to swell up
everyone keep an eye on howard in case he starts to swell up
17 13 4
3 [169]
since it’s not bee season you can have my epinephrine
since it is not bee season you can have my epinephrine
10 11 1
4 [170]
are there any chopsticks
any chopsticks
4 2 2
5 [171]
you don’t need chopsticks this is thai food
do not need chopsticks this is thai food
8 8 0
6 [172]
here we go
here we go
3 3 0
7 [173, 174]
thailand has had the fork since the latter half of the nineteenth century interestingly they don’t actually put the fork in their mouth they use it to put the food on a spoon which then goes into their mouth
thailand has had the fork since the latter half of the 19th centurythey do not put the fork in their mouth they use it to put the food on a spoon which then goes into their mouth
39 37 2
8 [175]
ask him for a napkin i dare you there is a knock on the door i’ll get it
ask him for a napkin i dare you
18 8 10
9 [

In [428]:
for epi_id in seeds_final:
    epi = transformation(tbbt_episode[epi_id][0])
    sub = transformation("".join([en_subset[item] for item in seeds_final[epi_id]]))
    former_sub = transformation(en_subset[min(seeds_final[epi_id])-1])
    after_sub = transformation(en_subset[max(seeds_final[epi_id])+1])
    print(epi_id, seeds_final[epi_id])
    print(epi)
    print("Former:", former_sub)
    print("Current:", sub)
    print("After:", after_sub)
    print(len(epi.split(' ')), len(sub.split(' ')), abs(len(epi.split(' '))-len(sub.split(' '))))

    print('=='*50)

2 [168]
uh i’m not sure everyone keep an eye on howard in case he starts to swell up
Former: does it have peanut oil
Current: everyone keep an eye on howard in case he starts to swell up
After: since it is not bee season you can have my epinephrine
17 13 4
3 [169]
since it’s not bee season you can have my epinephrine
Former: everyone keep an eye on howard in case he starts to swell up
Current: since it is not bee season you can have my epinephrine
After: any chopsticks
10 11 1
4 [170]
are there any chopsticks
Former: since it is not bee season you can have my epinephrine
Current: any chopsticks
After: do not need chopsticks this is thai food
4 2 2
5 [171]
you don’t need chopsticks this is thai food
Former: any chopsticks
Current: do not need chopsticks this is thai food
After: here we go
8 8 0
6 [172]
here we go
Former: do not need chopsticks this is thai food
Current: here we go
After: thailand has had the fork since the latter half of the 19th century
3 3 0
7 [173, 174]
thailand has 

In [367]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

[235] [23]
Episode:
12  I don’t know, I’ve never reneged on a proffer of sperm before.

Subtitle:
10 I've never reneged on a proffer of sperm before.
[241] [31]
Episode:
2  Not really.

Subtitle:
2 Not really.
[287] [83]
Episode:
25  Leonard, I’m not expert here but I believe in the context of a luncheon invitation, you might want to skip the reference to bowel movements.

Subtitle:
21 I'm no expert, but in the context of a lunch invitation you might wanna skip the reference to bowel movements.
[298] [98]
Episode:
1  Yeah. 

Subtitle:
1 Yeah.
[324] [125]
Episode:
4  Yes I now, but…

Subtitle:
3 Yes, I know.
[385] [172]
Episode:
2  How so?

Subtitle:
3 LEONARD: How so?
[407] [191]
Episode:
2  See what?

Subtitle:
2 See what?
[410] [194]
Episode:
8  It’s before he became a creepy computer voice:.

Subtitle:
12 [IMITATING COMPUTERIZED VOICE] It's before he became a creepy computer voice.
[418] [202]
Episode:
1  Uh-huh.

Subtitle:
1 - Uh-huh.
[451] [229]
Episode:
2  Must we?

Subtitle:
2 -

In [503]:
print(alignment)

{(1, 2): {168: [2], 169: [3], 170: [4], 171: [5], 172: [6], 173: [7], 174: [7], 175: [8], 176: [9], 177: [9], 178: [11], 179: [12], 180: [14], 181: [15], 182: [16], 183: [17], 184: [17], 185: [18], 186: [19], 187: [20], 188: [20], 190: [22], 191: [22], 192: [24], 193: [25], 194: [26], 195: [27], 196: [28], 200: [31], 201: [32], 202: [33], 204: [35], 205: [36], 206: [36], 208: [39], 209: [40], 210: [40], 211: [41], 212: [42], 213: [42], 214: [42], 215: [44], 216: [45], 217: [46], 218: [46], 219: [46], 220: [46], 221: [47], 222: [48], 223: [48], 224: [48], 225: [48], 226: [49], 231: [52], 232: [53], 233: [54], 234: [54], 235: [55], 236: [55], 237: [56], 240: [57], 241: [57], 242: [58], 243: [60], 245: [61], 246: [61], 250: [62], 251: [63], 252: [64], 253: [65], 254: [65], 255: [65], 260: [68], 263: [70], 264: [71], 265: [72], 266: [73], 267: [74], 268: [74], 273: [77], 274: [77], 278: [80, 81], 279: [80, 81], 280: [81], 282: [81], 283: [81], 284: [82], 285: [82], 287: [84], 288: [84], 28

In [85]:
alignment = {(1,1):turn_sub2epi_into_epi2sub(epi2sub)}
# alignment = further_alignment
# Write into xlsx file
for x in alignment:
    print(x)
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/test_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/test_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

(1, 1)
{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 13

In [24]:
epi2sub = turn_sub2epi_into_epi2sub(alignment[(1,1)])
for x in epi2sub:
    utt = transformation(tbbt_episode[x][0])
    sub = transformation("".join([en_subset[item] for item in epi2sub[x]]))

    print("Utt:", len(utt.strip().split(" ")), utt)
    print("Sub:", len(sub.strip().split(" ")), sub)
    print("=="*50)
    # print(tbbt_episode[x])

Utt: 54 so if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it’s unobserved it will however if it’s observed after it’s left the plane but before it hits its target it will not have gone through both slits
Sub: 23 if a photon is directed through a plane with two slits in it and either is observed it will not go through both
Utt: 6 i think this is the place
Sub: 6 i think this is the place
Utt: 36 no we are committing genetic fraud there’s no guarantee that our sperm is going to generate high iq offspring think about that i have a sister with the same basic dna mix who hostesses at fuddruckers
Sub: 14 i have a sister with the same basic dna mix who hostesses at fuddruckers
Utt: 17 sheldon this was your idea a little extra money to get fractional t1 bandwidth in the apartment
Sub: 16 sheldon this was your ideaa little extra money to get fractional t1 bandwidth in the apartment
Utt: 50 i know and i do yearn for f

In [17]:
def temp_string_match_sliding_window(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)

    temp = filter_alignment_by_gap(res)

    final = {}
    for x in temp:
        if len(temp[x])==1:
            final[x] = temp[x]

    return final

In [24]:
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [15]:
# Check one episode and adapt to the new tbbt transcript corpus

(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=3,
        bias=200
    )

In [60]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

155 120
112 91
81 68


In [58]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

95 72
6 5
4 4


In [42]:
temp = sub2epi
sub2epi = {}
for x in sorted(list(temp.keys())):
    sub2epi[x+2] = [item+2 for item in temp[x]]
for x in sub2epi:
    print(x, sub2epi[x])

191 [2]
192 [2]
194 [5]
196 [6]
197 [7]
208 [15]
221 [26]
227 [30]
229 [31]
231 [34]
233 [36]
236 [38]
242 [42]
245 [44]
258 [53]
260 [54]
263 [57]
264 [58]
265 [58]
267 [59]
271 [62]
274 [64]
275 [65]
279 [68]
280 [68]
284 [74, 72]
286 [74, 72]
287 [75]
288 [76]
289 [76]
291 [78]
293 [79]
294 [80]
295 [80]
296 [80]
297 [81]
299 [83]
304 [89]
306 [91]
307 [92]
313 [98]
314 [99]
315 [100]
317 [101]
318 [102]
320 [103]
321 [105]
327 [111]
328 [112]
329 [112]
331 [113]
334 [115]
335 [115]
344 [120]
345 [120]
347 [121]
350 [123]
351 [124]
356 [127]
358 [128]
359 [128]
365 [131]
370 [134]
372 [136]
373 [136]
374 [136]
377 [138]
379 [141]
381 [143]
383 [144]
384 [144]
386 [144]
398 [158]
400 [160]
403 [163]
405 [165]
406 [166]
408 [168]
412 [171]
413 [171]
416 [171]
418 [171]
427 [175]
431 [180]
435 [182]
437 [183]
438 [183]
442 [189]
444 [190]
450 [196]
452 [197]
453 [198]
455 [201]
458 [205]
459 [205]
463 [208]
467 [211]
471 [214]
473 [215]
474 [215]
477 [217]
479 [218]
481 [219]
486 [222]

In [43]:
temp = epi2sub
epi2sub = {}
for x in sorted(list(temp.keys())):
    epi2sub[x+2] = [item+2 for item in temp[x]]

for x in epi2sub:
    print(x, epi2sub[x])

2 [191, 192]
5 [194]
6 [196]
7 [197]
15 [208]
26 [221]
30 [227]
31 [229]
34 [231]
36 [233]
38 [236]
42 [242]
44 [245]
53 [258]
54 [260]
57 [263]
58 [264, 265]
59 [267]
62 [271]
64 [274]
65 [275]
68 [279, 280]
72 [284, 286]
74 [284, 286]
75 [287]
76 [288, 289]
78 [291]
79 [293]
80 [294, 295, 296]
81 [297]
83 [299]
89 [304]
91 [306]
92 [307]
98 [313]
99 [314]
100 [315]
101 [317]
102 [318]
103 [320]
105 [321]
111 [327]
112 [328, 329]
113 [331]
115 [334, 335]
120 [344, 345]
121 [347]
123 [350]
124 [351]
127 [356]
128 [358, 359]
131 [365]
134 [370]
136 [372, 373, 374]
138 [377]
141 [379]
143 [381]
144 [383, 384, 386]
158 [398]
160 [400]
163 [403]
165 [405]
166 [406]
168 [408]
171 [412, 413, 416, 418]
175 [427]
180 [431]
182 [435]
183 [437, 438]
189 [442]
190 [444]
196 [450]
197 [452]
198 [453]
201 [455]
205 [458, 459]
208 [463]
211 [467]
214 [471]
215 [473, 474]
217 [477]
218 [479]
219 [481]
222 [486]
226 [490]
236 [499]
237 [500]
241 [505]
242 [507]
244 [509, 512]
247 [515]


In [44]:
print(len(sub2epi))
print(len(epi2sub))

112
91


In [8]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)

    return temp

In [9]:
further_alignment = {}
for i in tqdm(range(2)):
    for j in tqdm(range(4)):
        try:
            temp = look_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:29<01:27, 29.08s/it][A
 50%|█████     | 2/4 [00:43<00:41, 20.61s/it][A
 75%|███████▌  | 3/4 [00:58<00:17, 17.78s/it][A
100%|██████████| 4/4 [01:09<00:00, 17.45s/it][A
 50%|█████     | 1/2 [01:09<01:09, 69.81s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:13<00:39, 13.30s/it][A
 50%|█████     | 2/4 [00:30<00:31, 15.66s/it][A
 75%|███████▌  | 3/4 [00:43<00:14, 14.20s/it][A
100%|██████████| 4/4 [00:57<00:00, 14.41s/it][A
100%|██████████| 2/2 [02:07<00:00, 63.73s/it]


In [10]:
for x in further_alignment:
    print(x, len(further_alignment[x][0]), len(further_alignment[x][1]),len(further_alignment[x][2]),len(further_alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [14]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [11]:
alignment = further_alignment

In [12]:
for x in alignment:
    print(x, len(alignment[x][0]), len(alignment[x][1]),len(alignment[x][2]),len(alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [47]:
print(epi2sub)

{2: [191, 192], 5: [194], 6: [196], 7: [197], 15: [208], 26: [221], 30: [227], 31: [229], 34: [231], 36: [233], 38: [236], 42: [242], 44: [245], 53: [258], 54: [260], 57: [263], 58: [264, 265], 59: [267], 62: [271], 64: [274], 65: [275], 68: [279, 280], 72: [284, 286], 74: [284, 286], 75: [287], 76: [288, 289], 78: [291], 79: [293], 80: [294, 295, 296], 81: [297], 83: [299], 89: [304], 91: [306], 92: [307], 98: [313], 99: [314], 100: [315], 101: [317], 102: [318], 103: [320], 105: [321], 111: [327], 112: [328, 329], 113: [331], 115: [334, 335], 120: [344, 345], 121: [347], 123: [350], 124: [351], 127: [356], 128: [358, 359], 131: [365], 134: [370], 136: [372, 373, 374], 138: [377], 141: [379], 143: [381], 144: [383, 384, 386], 158: [398], 160: [400], 163: [403], 165: [405], 166: [406], 168: [408], 171: [412, 413, 416, 418], 175: [427], 180: [431], 182: [435], 183: [437, 438], 189: [442], 190: [444], 196: [450], 197: [452], 198: [453], 201: [455], 205: [458, 459], 208: [463], 211: [467]

In [244]:
alignment = alignment_seeds

In [248]:
for x in alignment:
    print(x)

(1, 1)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 10)
(1, 11)
(1, 12)
(1, 13)
(1, 14)
(1, 15)
(1, 16)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(2, 5)
(2, 6)
(2, 7)
(2, 8)
(2, 9)
(2, 10)
(2, 11)
(2, 12)
(2, 13)
(2, 14)
(2, 15)
(2, 16)
(2, 17)
(2, 18)
(2, 19)
(2, 20)
(2, 21)
(2, 22)
(2, 23)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(3, 5)
(3, 6)
(3, 7)
(3, 8)
(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)
(3, 14)
(3, 15)
(3, 16)
(3, 17)
(3, 18)
(3, 19)
(3, 20)
(3, 21)
(3, 22)
(3, 23)
(4, 1)
(4, 2)
(4, 3)
(4, 4)
(4, 5)
(4, 6)
(4, 7)
(4, 8)
(4, 9)
(4, 10)
(4, 11)
(4, 12)
(4, 13)
(4, 14)
(4, 15)
(4, 17)
(4, 18)
(4, 19)
(4, 20)
(4, 21)
(4, 22)
(4, 23)
(4, 24)
(5, 1)
(5, 2)
(5, 3)
(5, 4)
(5, 5)
(5, 6)
(5, 7)
(5, 8)
(5, 9)
(5, 10)
(5, 11)
(5, 12)
(5, 13)
(5, 14)
(5, 15)
(5, 16)
(5, 17)
(5, 18)
(5, 19)
(5, 20)
(5, 21)
(5, 22)
(5, 23)
(6, 1)
(6, 2)
(6, 3)
(6, 4)
(6, 5)
(6, 6)
(6, 7)
(6, 8)
(6, 9)
(6, 10)
(6, 11)
(6, 12)
(6, 13)
(6, 14)
(6, 15)
(6, 16)
(6, 17)
(6, 18)
(6, 19)
(6, 20)
(6, 21)
(6, 22)
(6, 23)
(6, 24

In [290]:
alignment = {(1,1): turn_sub2epi_into_epi2sub(temp)}

TypeError: list indices must be integers or slices, not str

In [294]:
# Write into xlsx file
for x in alignment:
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/test_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/test_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    sub2epi = alignment[x]
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

{0: [200], 9: [215], 15: [224], 16: [225, 226], 17: [229], 28: [239], 34: [246, 247], 35: [248], 65: [268], 75: [282], 82: [285, 286], 88: [290], 89: [291], 90: [292], 93: [293], 96: [295], 100: [300], 102: [301, 302], 108: [307], 116: [314, 316], 117: [318], 134: [333], 135: [334], 145: [348], 148: [351], 154: [360], 155: [362], 160: [368, 369], 161: [373, 374, 375], 162: [378], 171: [384], 173: [386], 174: [387], 175: [388], 176: [389], 177: [390], 183: [396], 186: [400, 401], 193: [409], 201: [417], 207: [423], 208: [424, 425], 210: [430], 215: [435], 226: [449], 228: [450], 230: [452, 453, 454, 455, 456], 232: [458], 237: [464], 240: [467], 241: [468, 469, 470], 242: [472], 245: [476], 258: [489, 490], 259: [491], 262: [494, 495], 263: [496], 270: [503], 275: [506], 280: [510, 511], 288: [523, 524], 295: [531], 296: [532], 297: [533], 302: [542], 309: [548], 311: [550, 551], 314: [555]}


In [None]:
# Check current alignments
for x in alignment:
    # Load Data
    epi2sub = alignment[x][-1]

    pass


In [36]:
print(further_alignment[(0,0)][-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [35]:
print(len(further_alignment[(0,0)]))

4


In [25]:
for x in further_alignment:
    print(x, len(further_alignment[x][-1]), len(further_alignment[x][-2]), len(further_alignment[x][-3]))
    # print(further_alignment[x])

(0, 0) 171 171 133
(0, 1) 130 130 109
(0, 2) 90 90 72
(0, 3) 134 134 112
(0, 4) 132 132 106
(0, 5) 24 24 16
(0, 6) 75 75 66
(0, 7) 7 7 7
(0, 8) 130 130 100
(0, 9) 57 57 50
(0, 10) 46 46 32
(0, 11) 93 93 75
(0, 12) 137 137 114
(0, 13) 10 10 9
(0, 14) 124 124 103
(0, 15) 130 130 102
(1, 0) 1 1 1
(1, 1) 104 104 84
(1, 2) 135 135 113
(1, 3) 159 159 124
(1, 4) 74 74 60
(1, 5) 60 60 43
(1, 6) 29 29 24
(1, 7) 110 110 98
(1, 8) 70 70 61
(1, 9) 80 80 68
(1, 11) 133 133 112
(1, 12) 136 136 108
(1, 13) 161 161 131
(1, 14) 31 31 26
(1, 15) 106 106 83
(1, 16) 140 140 119
(1, 17) 127 127 102
(1, 18) 134 134 111
(1, 19) 46 46 40
(1, 20) 95 95 74
(1, 21) 137 137 104
(1, 22) 56 56 47
(2, 0) 125 125 102
(2, 1) 134 134 115
(2, 2) 103 103 81
(2, 3) 105 105 84
(2, 4) 131 131 105
(2, 5) 17 17 13
(2, 6) 159 159 131
(2, 7) 109 109 93
(2, 8) 133 133 106
(2, 9) 162 162 125
(2, 10) 129 129 112
(2, 11) 125 125 105
(2, 12) 109 109 86
(2, 13) 106 106 83
(2, 14) 124 124 103
(2, 15) 142 142 119
(2, 16) 100 100 81
(2,

In [27]:
old_alignment = further_alignment

In [34]:
print(old_alignment[(1,1)][-1])

{2: [200, 201, 202, 203], 4: [206], 5: [208], 6: [209, 210], 7: [211], 10: [217, 218], 14: [220], 18: [223], 19: [224, 225], 20: [229], 21: [230], 22: [232], 23: [233, 234], 26: [238], 28: [240, 241], 30: [242], 31: [243], 32: [244], 33: [245], 34: [246], 36: [249, 250], 37: [251], 40: [253], 41: [254, 255], 44: [258], 45: [259, 260, 261], 46: [262, 263], 47: [264], 48: [266], 49: [267], 52: [269], 56: [274], 57: [276, 277], 58: [278], 59: [279], 60: [280, 281], 61: [282], 63: [283], 64: [284], 65: [285], 67: [286], 68: [287, 288, 289], 70: [291, 292], 73: [295], 74: [296], 75: [297, 298, 299], 76: [300], 77: [302], 79: [304], 80: [305], 81: [306], 82: [308], 83: [310], 84: [311], 85: [312, 313], 87: [315], 88: [317], 89: [319], 90: [320, 321], 91: [322], 92: [325, 326], 94: [328], 95: [329], 96: [330], 97: [331], 99: [334], 100: [335, 336], 101: [337], 102: [338], 104: [339], 107: [342, 343], 109: [344], 110: [345], 111: [346], 112: [347], 113: [348, 349, 350], 115: [351], 116: [352, 

In [33]:
print(len(old_alignment[(1,1)][-1]))
print(len(turn_sub2epi_into_epi2sub(old_alignment[(1,1)][-1])))

104
140


In [38]:
alignment = further_alignment
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][-1]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(season_id, episode_id)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [22]:
print(further_alignment.keys())

dict_keys([(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)])


In [14]:
temp = []
# Perform string match with sliding window
sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 4)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [18]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [20]:
# Perform exact match and add it to the whole alignment
exact_match_result = exact_match(en_subset, tbbt_episode)
sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: [126], 322: [126

In [24]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 255: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104, 105], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: 

In [22]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 |||| [200, 202, 203] Sheldon  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.
1 Leonard  Agreed, what’s your point?
2 |||| [205] Sheldon  There’s no point, I just think it’s a good idea for a tee-shirt. 
3 Leonard  Excuse me?
4 Receptionist  Hang on. 
5 |||| [206, 207] Leonard  One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. 
6 |||| [212] Receptionist  Can I help you?
7 |||| [213] Leonard  Yes. Um, is this the High IQ sperm bank?
8 |||| [214] Receptionist  If you have to ask, maybe you shouldn’t be here.
9 |||| [215] Sheldon  I think this is the place.
10 Receptionist  Fill these out.
11 

In [23]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子


In [90]:
print(temp)

[{0: [200, 203], 5: [206], 8: [214], 15: [224], 16: [226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 15: [224], 16: [225, 226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246, 247], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 9: [215], 15: [224], 16: [225, 226], 17: [227, 229], 21: [233], 24: [235], 29: [239], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 41: [253], 42: [254], 48: [256], 50: [259]}]


In [91]:
for x in temp:
    print(x)

3


In [None]:
    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

In [56]:
def extend_with_wer(en_subset, epi2sub_alignment_2, episode):
    temp = {}

    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        # sub = transformation(en_subset[sub_id])
        epi = transformation(episode[epi_id][0])

        # Fetch all relevant sentences
        epi_sentences = [episode[idx][0] for idx in [epi_id_former, epi_id, epi_id_latter] if idx>=0]
        print(epi_id_former, epi_id, epi_id_latter)
        print(epi_sentences)

        sub_sentences = [en_subset[sub_id_former, sub_id_latter] for idx in [sub_former, sub_id_latter] if idx>=0]
        print(sub_id_former, sub_id_latter)
        print(sub_sentences)

        print("=="*50)


        if sub_former in epi:
            sub_ids.append(sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [26]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(1,1)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # # Extend with WER
    # # for x in sub2epi:
    # #     print(x, sub2epi[x])
    # extend_with_wer(en_subset, epi2sub, tbbt_episode)




    return temp

In [None]:
further_alignment = {}
for i in tqdm(range(12)):
    for j in tqdm(range(30)):
        try:
            temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

In [28]:
temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, 1, 1, 200)

In [35]:
print(temp[-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [46]:
for x in further_alignment:
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=x[0]+1,
        episode_id=x[1]+1,
        bias=200
    )
    total = len(tbbt_episode)
    print(x, total, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))
    # print(further_alignment[x])
    # print("=="*50)

(0, 0) 133 80 80 100 100


In [13]:
further_alignment[(i,j)] = temp

In [14]:
print(further_alignment)

{(1, 2): [{2: [200], 3: [201], 4: [204], 8: [209, 210, 211], 9: [212], 11: [215, 217, 218], 15: [225], 16: [226], 18: [228], 19: [229], 22: [248], 23: [264, 265], 25: [268], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [441], 57: [443], 61: [462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 15: [225], 16: [226], 18: [227, 228], 19: [229], 22: [248], 23: [264, 265, 266], 25: [268, 269], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [340, 341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [440, 441], 57: [443], 61: [461, 462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 14: [221], 15: [225]

In [13]:
with open('further_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

## Check the further alignment

In [114]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [17]:
alignment = further_alignment

In [18]:
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][2]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [None]:
"""
Check alignment based on Season-Episode
Write into xlsx file
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [109]:
"""
Check alignment based on Season-Episode
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [110]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 4 no, i haven't.
1 2 get used to it.
2 |||| [196] 4 yeah, i probably won't, but... hey sheldon.
3 |||| [197] 1 hi.
4 |||| [198, 199] 4 hey raj!  still not talking to me, huh?
5 1 don't take it personally, it's his pathology, he can't talk to women.
6 |||| [201] 2 he can't talk to attractive women, or in your case a cheesecake-scented goddess!
7 |||| [202] 0 so, there's gonna be some furniture delivered?
8 |||| [217, 218, 219, 220] 1 oh no, let's assume that they can. lois lane is falling, accelerating at an initial rate of 32 feet per second per second. superman swoops down to save her by reaching out two arms of steel. miss lane, who is now travelling at approximately 120 miles per hour, hits them, and is immediately sliced into three equal pieces.
9 |||| [226, 228, 229] 1 are you listening to yourself, it is well established that superman's flight is a feat of strength, it is an extension of his ability to leap tall buildings, an ability he derives from earth's yellow sun.
10 |||| 

In [108]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子
