In [3]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [4]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets

In [5]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Source file

In [6]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/en_zh/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_zh/zh_subtitles.pkl"
transcript_path = "../../source_data/transcripts/tbbt/tbbt_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/tbbt_en_zh.pkl"
root_path = "results/tbbt_en_zh/"


# Load Data
with open(en_subtitle_path, 'rb') as f:
    all_en_subtitles = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    all_other_subtitles = pkl.load(f)
with open(transcript_path, 'rb') as f:
    all_transcripts = pkl.load(f)

# Section 1: Coarse-Grain Alignment
Use string match to fetch relevant index of open subtitle for each episode
The code for coarse alignment locates at "../coarse_alignment/align_transcript_subtitle.py"
The alignment results locates at "../coarse_alignment/results/"

In [7]:
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
    coarse_alignments = organize_coarse_alignment_by_seasons(temp)

# Section 2: Fine-Grained Alignment

In this section, we fetch alignment within one episode using the course alignment indexs fetch in the last section.

This include the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

In [12]:
# An example of fetch_subset function
(en_subtitle, other_subtitle, tbbt_episode) = fetch_subsets(
        episode=all_transcripts,
        en_subtitle=all_en_subtitles,
        zh_subtitle=all_other_subtitles,
        results=coarse_alignments,
        season_id=1,
        episode_id=1,
        bias=200,
        zh_split=False
    )

for a, b in zip(en_subtitle, other_subtitle):
    print(a, b)

Thank you very much. Good day to you. 多谢了 日安
Good day to you. 日安
Come and buy a dresser! 来买梳妆台了
The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
The village had grown. 村子也大了 有很多新的小孩
There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
But we had a secret helper. 但是我们有个秘密
Nikolas. 尼古拉斯
Eemeli. 艾美利
Long time no see. You should come more often. 很久没见了 你应该常来
I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
Do you have the list? 你有名单吗
Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
Is that... 是吗
A girl, three months. 一个女孩 三个月大
Let's add her to the list. 那我们加上她的名字吧
What is the name of this little princess? 这个小公主叫什么名字
Aada. 亚达
Aada? 亚达
Hello, Aada. 你好 亚达
Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
My sons. 我的儿子
I sought them out and asked them here. 我在外面找到他们带到这里来
We were wrong w

## Part2: Generate Alignment Seeds

In [8]:
from utils.alignment_seeds import *

In [34]:
# results = {}
# for i in range(12):
#     for j in tqdm(range(30)):
#         try:
#             (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
#                 episode=all_transcripts,
#                 en_subtitle=all_en_subtitles,
#                 zh_subtitle=all_other_subtitles,
#                 results=coarse_alignments,
#                 season_id=i+1,
#                 episode_id=j+1,
#                 bias=200
#             )
#             temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=6)
#             results[(i+1, j+1)] = temp
#         except:
#             pass

100%|██████████| 30/30 [00:38<00:00,  1.28s/it]
100%|██████████| 30/30 [00:06<00:00,  4.80it/s]
100%|██████████| 30/30 [00:05<00:00,  5.48it/s]
100%|██████████| 30/30 [00:29<00:00,  1.01it/s]
100%|██████████| 30/30 [00:15<00:00,  1.93it/s]
100%|██████████| 30/30 [00:35<00:00,  1.18s/it]
100%|██████████| 30/30 [00:37<00:00,  1.23s/it]
100%|██████████| 30/30 [00:50<00:00,  1.67s/it]
100%|██████████| 30/30 [00:30<00:00,  1.03s/it]
100%|██████████| 30/30 [00:00<00:00, 566797.84it/s]
100%|██████████| 30/30 [00:00<00:00, 197224.33it/s]
100%|██████████| 30/30 [00:00<00:00, 587986.54it/s]


In [15]:
# Check the source alignment
results = {}
for i in sorted(list(coarse_alignments.keys())):
    for j in sorted(list(coarse_alignments[i].keys())):
        if (i, j) not in all_transcripts:
            continue
        print("Season:", i, "  Episode:", j)
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
            temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=12)
            if temp != {}:
                results[(i, j)] = temp
        except:
            pass

Season: 1   Episode: 1
Season: 1   Episode: 2
Season: 1   Episode: 3
Season: 1   Episode: 4
Season: 1   Episode: 5
Season: 1   Episode: 6
Season: 1   Episode: 7
Season: 1   Episode: 8
Season: 1   Episode: 9
Season: 1   Episode: 10
Season: 1   Episode: 11
Season: 1   Episode: 12
Season: 1   Episode: 13
Season: 1   Episode: 14
Season: 1   Episode: 15
Season: 1   Episode: 16
Season: 2   Episode: 1
Season: 2   Episode: 2
Season: 2   Episode: 3
Season: 2   Episode: 4
Season: 2   Episode: 5
Season: 2   Episode: 6
Season: 2   Episode: 7
Season: 2   Episode: 9
Season: 2   Episode: 11
Season: 2   Episode: 12
Season: 2   Episode: 13
Season: 2   Episode: 14
Season: 2   Episode: 15
Season: 2   Episode: 16
Season: 2   Episode: 17
Season: 2   Episode: 18
Season: 2   Episode: 19
Season: 2   Episode: 20
Season: 2   Episode: 21
Season: 2   Episode: 22
Season: 2   Episode: 23
Season: 3   Episode: 1
Season: 3   Episode: 2
Season: 3   Episode: 3
Season: 3   Episode: 4
Season: 3   Episode: 5
Season: 3   Ep

In [16]:
with open(root_path + "0_alignment_seeds_12.pkl", "wb") as f:
    pkl.dump(results, f)

In [13]:
with open(root_path + "0_alignment_seeds_temp.pkl", "wb") as f:
    pkl.dump(results, f)

## Part 3: Incrementally Extend from Alignment Seeds

In [17]:
with open(root_path + "0_alignment_seeds_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [None]:
# The input to each extension function should be epi2sub

In [9]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [19]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
    epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

In [15]:
root_path = "results/tbbt_en_zh/"
with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    further_alignment = pkl.load(f)

In [16]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 350 0.9859154929577465 || Episode: 258 77.24550898203593 0
(1, 3) Subtitle: 304 0.987012987012987 || Episode: 205 79.15057915057915 5
(1, 4) Subtitle: 284 0.9759450171821306 || Episode: 181 76.05042016806722 1
(1, 5) Subtitle: 278 0.9823321554770318 || Episode: 186 80.17241379310344 0
(1, 6) Subtitle: 283 0.9895104895104895 || Episode: 197 76.35658914728683 2
(1, 7) Subtitle: 318 0.9968652037617555 || Episode: 230 79.03780068728523 10
(1, 9) Subtitle: 302 0.9805194805194806 || Episode: 200 85.47008547008546 1
(1, 10) Subtitle: 288 0.9965397923875432 || Episode: 187 79.57446808510639 0
(1, 11) Subtitle: 287 0.9795221843003413 || Episode: 208 80.93385214007782 4
(1, 12) Subtitle: 252 0.5239085239085239 || Episode: 113 46.50205761316872 102
(1, 13) Subtitle: 279 0.9489795918367347 || Episode: 192 72.45283018867924 5
(1, 14) Subtitle: 296 0.961038961038961 || Episode: 191 78.60082304526749 2
(1, 15) Subtitle: 308 0.9871794871794872 || Episode: 202 81.45161290322581 0
(1, 1

In [14]:
print(further_alignment[(1,12)])
print("=="*50)
print(turn_sub2epi_into_epi2sub(alignment_seeds[(1,12)]))

{102: [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 148, 239, 242, 243, 264], 103: [128, 162, 336], 104: [337], 105: [338, 339], 106: [340, 341], 107: [342], 108: [343], 109: [344], 110: [345, 346, 347], 111: [348, 349, 350], 112: [351], 113: [352, 353], 114: [354], 117: [355], 118: [356], 119: [357], 121: [358], 123: [359], 124: [360, 361], 125: [362], 126: [363], 127: [364], 128: [365, 366], 129: [367], 130: [368], 131: [369, 370], 132: [371, 372], 133: [373, 374], 134: [375], 135: [376, 377], 136: [378], 138: [379, 380], 139: [381], 140: [382, 383], 141: [384], 142: [385], 143: [386, 387, 388, 389], 144: [390], 145: [391], 147: [392, 393], 148: [394], 149: [395], 150: [396, 397], 151: [398], 152: [399, 400, 401, 402], 153: [403], 154: [404, 405], 1

In [20]:
with open(root_path+'1_alignment_extension_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Part 4: Add Head Tail

In [10]:
from utils.head_tail_alignment import *

root_path = "results/tbbt_en_zh/"
with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])

        head_tail_sub2epi = {}

        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)

        # print(temp)
        # print(filter_by_idx(epi2sub))
        # print("=="*50)
    except:
        print("Pass i j")

with open(root_path+'1_alignment_head_tail_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

with open(root_path+'1_alignment_ultimate_12.pkl', 'wb') as f:
    pkl.dump(ultimate_data, f)

1 1
1 3
1 4
1 5
1 6
1 7
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 9
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 14
3 15
3 16
3 17
3 18
3 19
3 20
3 21
3 22
3 23
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
4 10
4 11
4 12
4 13
4 14
4 15
4 17
4 18
4 19
4 20
4 21
4 22
4 23
4 24
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 18
5 19
5 20
5 21
5 23
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 10
6 11
6 12
6 13
6 14
6 15
6 16
6 17
6 18
6 19
6 20
6 21
6 22
7 1
7 2
7 3
7 4
7 6
7 7
7 8
7 9
7 10
7 11
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 20
7 21
7 22
7 23
7 24
8 1
8 2
8 3
8 4
8 6
8 8
8 9
8 10
8 11
8 12
8 13
8 15
8 16
8 17
8 18
8 19
8 20
8 21
8 22
8 23
8 24
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 9
9 10
9 11
9 12
9 13
9 14
9 15
9 16
9 24


In [11]:
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(all_transcripts[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 350 0.9859154929577465 || Episode: 258 334 77.24550898203593 0
(1, 3) Subtitle: 315 0.9752321981424149 || Episode: 213 259 82.23938223938224 0
(1, 4) Subtitle: 286 0.9727891156462585 || Episode: 182 238 76.47058823529412 1
(1, 5) Subtitle: 281 0.9825174825174825 || Episode: 188 232 81.03448275862068 0
(1, 6) Subtitle: 301 0.9709677419354839 || Episode: 210 258 81.3953488372093 0
(1, 7) Subtitle: 330 0.9649122807017544 || Episode: 239 291 82.13058419243985 0
(1, 9) Subtitle: 302 0.9805194805194806 || Episode: 200 234 85.47008547008546 1
(1, 10) Subtitle: 291 0.9965753424657534 || Episode: 188 235 80.0 0
(1, 11) Subtitle: 289 0.9763513513513513 || Episode: 209 257 81.32295719844358 2
(1, 12) Subtitle: 252 0.5239085239085239 || Episode: 113 243 46.50205761316872 102
(1, 13) Subtitle: 294 0.9245283018867925 || Episode: 203 265 76.60377358490567 0
(1, 14) Subtitle: 303 0.9558359621451105 || Episode: 195 243 80.24691358024691 0
(1, 15) Subtitle: 317 0.9664634146341463 || Epi