In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets
from utils.alignment_seeds import *
from utils.preprocessing import *

# Define sentence transformation
# Text-normalisation pipeline assembled from jiwer transforms. `fetch_subsets`
# calls it on transcript utterances to spot lines that are empty once normalised.
# The order of the steps is significant (e.g. contractions are expanded before
# punctuation, and with it the apostrophes, is removed).
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [2]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/en_zh/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_zh/zh_subtitles.pkl"
transcript_path = "../../source_data/transcripts/tbbt/tbbt_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/tbbt_en_zh.pkl"
root_path = "results/tbbt_en_zh/"


def _load_pickle(path):
    """Open `path` in binary mode and return the object pickled inside it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Load Data
all_en_subtitles = _load_pickle(en_subtitle_path)
all_other_subtitles = _load_pickle(other_subtitle_path)
all_transcripts = _load_pickle(transcript_path)

# The coarse alignment is regrouped by season before it is used further down.
temp = _load_pickle(coarse_alignment_path)
coarse_alignments = organize_coarse_alignment_by_seasons(temp)

In [3]:
# Perform index filtering on the alignment seeds
def filter_by_idx(sub2epi):
    """
    Filter (subtitle, episode) index pairs based on the pair before and after.

    The mapping is flattened into [subtitle_idx, episode_idx] pairs, ordered by
    subtitle index and then by episode index. The first pair is always kept.
    An interior pair is kept only when both of its indices lie between the
    last kept pair and the candidate that immediately follows it. The final
    pair is kept when neither of its indices is smaller than the last kept pair.

    sub2epi: dict mapping a subtitle index to an iterable of episode indices
    Returns: dict {subtitle_idx: [episode_idx, ...]} containing the kept pairs;
        an empty dict when `sub2epi` contains no pairs.
    """
    pairs = [[sub, epi] for sub in sorted(sub2epi) for epi in sorted(sub2epi[sub])]
    if not pairs:
        # Nothing to filter (previously raised IndexError on pairs[0]).
        return {}

    kept = [pairs[0]]
    for i in range(1, len(pairs) - 1):
        former = kept[-1]
        current = pairs[i]
        after = pairs[i + 1]
        if former[0] <= current[0] <= after[0] and former[1] <= current[1] <= after[1]:
            kept.append(current)

    # With a single pair, the first and last pair are the same element; it is
    # already in `kept` and must not be appended a second time.
    if len(pairs) > 1:
        last = pairs[-1]
        if last[0] >= kept[-1][0] and last[1] >= kept[-1][1]:
            kept.append(last)

    output = {}
    for sub, epi in kept:
        output.setdefault(sub, []).append(epi)

    return output

def generate_alignment_seeds(en_subset, tbbt_episode, window_size):
    """
    Build the seed mapping {subtitle_id: [episode_id, ...]} used by the next
    alignment step.

    Two matchers are run and each result is index-filtered for accuracy:
        1. exact match
        2. sliding-window string match (using `window_size`)
    The two filtered results are merged and the merged mapping is filtered
    once more. The unfiltered output of each matcher is printed.
    """
    # Matcher 1: exact matches, printed before filtering.
    exact_seeds = exact_match(en_subset, tbbt_episode)
    print(exact_seeds)
    exact_seeds = filter_by_idx(exact_seeds)

    # Matcher 2: sliding-window matches, printed before filtering.
    window_seeds = string_match_sliding_window(en_subset, tbbt_episode, window_size)
    print(window_seeds)
    window_seeds = filter_by_idx(window_seeds)

    # Combine both seed sets and enforce index consistency on the union.
    return filter_by_idx(merge_episode_alignment(exact_seeds, window_seeds))

In [31]:
def fetch_subsets(episode, en_subtitle, zh_subtitle, results, season_id, episode_id, bias, zh_split=False):
    """
    Fetch the subtitle window and the cleaned transcript for one episode.

    episode: whole transcript, indexed by (season_id, episode_id); each item x
        is used as x[0] (text, passed through `transformation`) and x[1]
        (label; items whose label is 'Scene' are dropped)
    en_subtitle: all en subtitles
    zh_subtitle: all subtitles in the other language, assumed to be
        index-aligned with en_subtitle (both are indexed with the same i)
    results: the coarse alignment result, indexed as results[season_id][episode_id]
    season_id: id of the season to align
    episode_id: id of the episode to align
    bias: the subtitles in the slice [first_index - bias, last_index + bias)
        are the ones to align; the lower bound is clamped at 0
    zh_split: when True, split subtitles that hold two utterances joined by "-"

    Returns:
        (en_subset, zh_subset, tbbt_episode): a subset of en_subtitle, the
        parallel subset of subtitles in the other language, and the
        corresponding transcript utterances.

    Special Notice: Since in Open Subtitle some subtitles contain two utterances
    split with "-", we split them.

    NOTE(review): this notebook-level definition replaces the `fetch_subsets`
    imported from utils.preprocessing at the top of the notebook.
    """
    # `string.punctuation` is needed below; the notebook has no explicit
    # `import string`, so import it locally instead of relying on a star import.
    import string

    idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
    # The last element returned by the helper is used; the meaning of the
    # arguments 6 and 100 is defined by find_all_continuous_subsets -- TODO confirm.
    subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)[-1]

    # Prepare Subtitle Subset
    # Clamp at 0: a negative start would make the slice count from the END of
    # the subtitle list and return the wrong (usually empty) window.
    start = max(subsets[0] - bias, 0)
    end = subsets[-1] + bias
    old_en_subset = [clean_sentence_brackets(item) for item in en_subtitle[start: end]]
    old_zh_subset = [clean_sentence_brackets(item) for item in zh_subtitle[start: end]]
    en_subset = []
    zh_subset = []

    # Split multiple utterances in one subtitle
    for i in range(len(old_en_subset)):
        pair_emitted = False

        if zh_split:
            utt_en = clean_sentence_brackets(old_en_subset[i]).replace("... - ...", "...")
            utt_en_token = utt_en.strip().split()
            utt_zh = clean_sentence_brackets(old_zh_subset[i])

            dash_positions = [k for k, item in enumerate(utt_en_token) if item == "-"]
            # Split at the last stand-alone "-" only when the token before it
            # ends with punctuation. idx must be > 0: a leading "-" has no
            # preceding token (idx - 1 would wrap around to the last token).
            if (dash_positions and dash_positions[-1] > 0
                    and utt_en_token[dash_positions[-1] - 1][-1] in string.punctuation):
                idx = dash_positions[-1]
                former = " ".join(utt_en_token[:idx])
                latter = " ".join(utt_en_token[idx + 1:])

                # Try the full-width dash first, then the ASCII hyphen.
                zh_subs = []
                for dash in ("－", "-"):
                    if dash in utt_zh:
                        zh_subs = utt_zh.strip().lstrip(dash).split(dash)
                        break

                if len(zh_subs) >= 2:
                    # Both sides hold two utterances: emit two aligned pairs.
                    # NOTE(review): only the first two zh parts are used, as
                    # before; any further parts are discarded.
                    en_subset.extend([former, latter])
                    zh_subset.extend(zh_subs[:2])
                    pair_emitted = True
                elif i + 1 < len(old_en_subset):
                    # The other language holds a single utterance: keep the
                    # first half here and push the second half onto the next
                    # en subtitle (processed in the following iteration).
                    old_en_subset[i] = former
                    old_en_subset[i + 1] = latter + " " + old_en_subset[i + 1]
                # else: last subtitle of the window, nothing to push onto;
                # it is kept unsplit below.

        if not pair_emitted:
            # Covers zh_split=False, no splittable "-", and the fallbacks above,
            # so every subtitle yields at least one (en, zh) pair.
            en_subset.append(old_en_subset[i])
            zh_subset.append(old_zh_subset[i])

    # Remove Characters: drop upper-case speaker tokens such as "SHELDON:".
    # A new list is built instead of popping while enumerating, which skipped
    # the token that follows each removed one.
    for i in range(len(en_subset)):
        en_tokens = [
            token for token in en_subset[i].strip().split()
            if not (token.isupper() and len(token) >= 3 and token[-1] == ":")
        ]
        en_subset[i] = " ".join(en_tokens)

    tbbt_episode = [x for x in episode[(season_id, episode_id)] if x[1] != 'Scene']

    # Clean the episode
    # 1. Remove utterances that are empty after normalisation
    # 2. Remove duplicate strings
    abandon_idx = set()
    for i, x in enumerate(tbbt_episode):
        if transformation(x[0]) in [" ", ""]:
            abandon_idx.add(i)
    # An item is dropped when an identical (text, label) item follows within
    # the next 6 positions; the later occurrence is the one that is kept.
    for length in range(1, 7):
        for i in range(len(tbbt_episode) - length):
            if tbbt_episode[i][0] == tbbt_episode[i + length][0] and tbbt_episode[i][1] == tbbt_episode[i + length][1]:
                abandon_idx.add(i)

    temp_tbbt_episode = [item for i, item in enumerate(tbbt_episode) if i not in abandon_idx]

    return en_subset, zh_subset, temp_tbbt_episode


In [32]:
# Check the source alignment
# Runs fetch_subsets for every (season, episode) that has both a coarse
# alignment and a transcript. Seed generation is currently disabled, so
# `results` stays empty.
results = {}
for i in sorted(coarse_alignments.keys()):
    for j in sorted(coarse_alignments[i].keys()):
        if (i, j) not in all_transcripts:
            continue
        print("Season:", i, "  Episode:", j)
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
            # temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=6)
            # if temp != {}:
            #     results[(i, j)] = temp
        except Exception as err:
            # Still best-effort (one bad episode must not stop the sweep), but
            # report the failure instead of hiding it. Catching Exception rather
            # than using a bare `except` also lets KeyboardInterrupt stop the loop.
            print("  -> skipped:", repr(err))

Season: 1   Episode: 1
Season: 1   Episode: 2
Season: 1   Episode: 3
Season: 1   Episode: 4
Season: 1   Episode: 5
Season: 1   Episode: 6
Season: 1   Episode: 7
Season: 1   Episode: 8
Season: 1   Episode: 9
Season: 1   Episode: 10
Season: 1   Episode: 11
Season: 1   Episode: 12
Season: 1   Episode: 13
Season: 1   Episode: 14
Season: 1   Episode: 15
Season: 1   Episode: 16
Season: 2   Episode: 1
Season: 2   Episode: 2
Season: 2   Episode: 3
Season: 2   Episode: 4
Season: 2   Episode: 5
Season: 2   Episode: 6
Season: 2   Episode: 7
Season: 2   Episode: 9
Season: 2   Episode: 11
Season: 2   Episode: 12
Season: 2   Episode: 13
Season: 2   Episode: 14
Season: 2   Episode: 15
Season: 2   Episode: 16
Season: 2   Episode: 17
Season: 2   Episode: 18
Season: 2   Episode: 19
Season: 2   Episode: 20
Season: 2   Episode: 21
Season: 2   Episode: 22
Season: 2   Episode: 23
Season: 3   Episode: 1
Season: 3   Episode: 2
Season: 3   Episode: 3
Season: 3   Episode: 4
Season: 3   Episode: 5
Season: 3   Ep

In [11]:
# An example of fetch_subset function
# Season 1, episode 2, with 200 extra subtitles on each side of the coarse
# window and dash-splitting switched on.
en_subtitle, other_subtitle, tbbt_episode = fetch_subsets(
    episode=all_transcripts, en_subtitle=all_en_subtitles,
    zh_subtitle=all_other_subtitles, results=coarse_alignments,
    season_id=1, episode_id=2,
    bias=200, zh_split=True,
)

# Uncomment to inspect the paired subtitles side by side:
# for a, b in zip(en_subtitle, other_subtitle):
#     print(a, b)

Oh, yes! - To attract money, eh?
是的 －来聚敛钱财
Oh, yes! 是的 
To attract money, eh? 来聚敛钱财
Jaffad didn't tell me anything like that. - But maybe there was some message?
没有，贾法特什么也没跟我说 －也许，他给了你什么？
Jaffad didn't tell me anything like that. 没有，贾法特什么也没跟我说 
But maybe there was some message? 也许，他给了你什么？
He said: "Remember this day". - What day?
他说让我记住那一天 －哪一天？
He said: "Remember this day". 他说让我记住那一天 
What day? 哪一天？
On the roof! Stay here. - I'm coming with you.
留在这里 －不，我跟你一起
On the roof! Stay here. 留在这里 
I'm coming with you. 不，我跟你一起
Yes. Also... We'll handle this. - Keep in touch with the Ministry of Defense.
和美国人联系 －随时和国防部保持联系
Yes. Also... We'll handle this. 和美国人联系 
Keep in touch with the Ministry of Defense. 随时和国防部保持联系


In [27]:
# Dump the other-language subset returned by the fetch_subsets example call.
# NOTE(review): prints the whole list; consider slicing (e.g. other_subtitle[:20]).
print(other_subtitle)

['贾法特怎么会相信这样的女人？', '法国银行家怎么会有一个 做恐怖分子的儿子？', '这个... ...', '我有一个光彩的家谱， 战争的牺牲者，孤儿...', '我是个非常有吸引力的孩子 我妈妈爱我爱的不行', '因为她不知道你会变成这样', '当然，我长大了并且成了一个定时炸弹', '当时机来临， 我变得孤伶伶的，能想起我的', '有我的阿拉伯朋友， 他们解释很多说他们需要这样的人 有钱，强壮，坚守信仰', '知道吗？ 像我一样的人才能制造革命', '通过我们的卫星正在进行密集的排查 在所有的近东和东南亚地区', '如果谁试图激活密码， 我们当然会标出信号 但接下来的问题是... ...', '不知道核弹的位置， 不知道发送机启动的时间 不知道信号会传送到哪里', '别给自己设定使命', '对于你和贾法特这样的败类来说， 你们的想法 就是生意！', '怎么， 想成为第一号恐怖分子吗？', '－是的 －来聚敛钱财', '钱...', '', '想法，有什么区别？', '为什么这些炸弹爆炸， 对你来说有关系吗？', '四个大城市，将下地狱', '这是最重要的', '这种级别的核弹激活 通常需要11位的数字 前七个让他们从"苏醒"过来， 随后大概20秒钟进行自我检测 和发回信号准备实施爆炸', '但是，防止爆炸只能在炸弹所在地 停止输入最后一组数字', '－这一切她都知道 －是的，她知道', '好了，我们回到事情上来', '为什么贾法特下令 杀死萨西克. 吉拉弥？', '在近东吉拉弥拍下了贾法特 实施谋杀和进行攻击的一些阴谋', '这里，在马来西亚，他需要钱 于是他用这些录像带来威胁贾法特', '你知道，谁取代了萨西克. 吉拉弥吗？', '贾法特让谁保存这最后一部分密码？', '－贾法特来过这里？', '－是 －哦', '我和贾法特4月14号在这里见过', '－他一个人住这里？', '－是', '意大利人是他忠实的律师 挪威人，朋友，对他进行治疗', '萨西克. 吉拉弥曾是 贾法特最亲近的帮手', '他会选择谁来代替吉拉弥那？', '他不会随便相信谁', '你们来这里多久了？', '我们那天早上来的', '－他没给任何人电话？', '－跟我在一起的时候没有', '没和任何人见面？', '和他的兄弟见过', '萨伊特在这里？', '－我们要和你谈谈，这非常重要',

In [28]:
# Dump the English subset returned by the fetch_subsets example call.
# NOTE(review): prints the whole list; consider slicing (e.g. en_subtitle[:20]).
print(en_subtitle)

['How could ever Jaffad trust this kind of woman?', "How could a French banker adopt a terrorist's son?", 'I was provided with a clean legend.', 'An orphan, a warvictim...', 'Also, I was such a darling child, my new mom adored me.', "Because she didn't know what you will become.", 'Oh yes. I was growing, and I was like a time bomb.', 'Then I got orphaned once more, and my Arab friends found me.', 'I was just the person they needed - someone rich and strong, and dedicated to the idea.', 'People like me work revolutions!', "We're scanning the Middle East and South-Eastern Asia trough our satellites.", 'Should someone activate the code, we can intercept the signal, but will ten face a problem.', "We don't know the whereabouts of the bombs, we don't know the trigger period, we can't trace the source of the signal.", "Don't play a Messiah.", "For morons like you and Jaffad, it's all not about idea, but about business.", 'You want to become a terrorist number one?', 'Oh, yes! - To attract mo