In [21]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [22]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets

In [23]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Source file

In [24]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/en_fa/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_fa/fa_subtitles.pkl"
transcript_path = "../../source_data/transcripts/tbbt/tbbt_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/tbbt_en_fa.pkl"
root_path = "results/tbbt_en_fa/"

# Load Data
with open(en_subtitle_path, 'rb') as f:
    all_en_subtitles = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    all_other_subtitles = pkl.load(f)
with open(transcript_path, 'rb') as f:
    all_transcripts = pkl.load(f)

# Section 1: Coarse-Grain Alignment
Use string match to fetch relevant index of open subtitle for each episode
The code for coarse alignment locates at "../coarse_alignment/align_transcript_subtitle.py"
The alignment results locates at "../coarse_alignment/results/"

In [31]:
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
    coarse_alignments = organize_coarse_alignment_by_seasons(temp)

# Section 2: Fine-Grained Alignment

In this section, we fetch alignment within one episode using the course alignment indexs fetch in the last section.

This include the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

In [32]:
# An example of fetch_subset function
(en_subtitle, other_subtitle, tbbt_episode) = fetch_subsets(
        episode=all_transcripts,
        en_subtitle=all_en_subtitles,
        zh_subtitle=all_other_subtitles,
        results=coarse_alignments,
        season_id=1,
        episode_id=3,
        bias=200
    )

for a, b in zip(en_subtitle, other_subtitle):
    print(a, b)

Sheldon, this is not your home. اينجا خونه تو نيست
This is not anyone's home. This is a swirling vortex of entropy. اين خونه هيچ کسي نيست اينجا گردابي قلتان از بي نظميه
When the transvestite lived here, you didn't care how he kept the place. ببينم همسايه قبلي اينجا زندگي ميکرد وضعيت اينجا برات مهم نبود
Because it was immaculate. به خاطر اين که ايشون خيلي معصوم بودند
I mean, you open that man's closet it was left to right, evening gowns, cocktail dresses and his police uniforms. يعني در کمد لباسشو که باز ميکردي از چپ به راست لباس راحتي عصرونه لباس مهموني و بعد هم لباس يونيفورم پليسي شو گذاشته بود
What were you doing in his closet? تو با کمد لباس اون چي کار داشتي؟
I helped him run some cable for a webcam. داشتم کمکش ميکردم که براي وب کم سيم بکشيم
This just arrived. Brought this up. Just now. سلام پني ، همين الان آورديمش بالا همين حالا
Was it hard getting up the stairs? بالا آوردنش از پله ها خيلي سخت بود
We'll get out of your hair. خب ما زحمتو کم ميکنيم
Okay, great. Thank you again. باشه 

## Part2: Generate Alignment Seeds

In [33]:
from utils.alignment_seeds import *

In [34]:
results = {}
for i in range(12):
    for j in tqdm(range(30)):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=6)
            results[(i+1, j+1)] = temp
        except:
            pass

100%|██████████| 30/30 [00:38<00:00,  1.28s/it]
100%|██████████| 30/30 [00:06<00:00,  4.80it/s]
100%|██████████| 30/30 [00:05<00:00,  5.48it/s]
100%|██████████| 30/30 [00:29<00:00,  1.01it/s]
100%|██████████| 30/30 [00:15<00:00,  1.93it/s]
100%|██████████| 30/30 [00:35<00:00,  1.18s/it]
100%|██████████| 30/30 [00:37<00:00,  1.23s/it]
100%|██████████| 30/30 [00:50<00:00,  1.67s/it]
100%|██████████| 30/30 [00:30<00:00,  1.03s/it]
100%|██████████| 30/30 [00:00<00:00, 566797.84it/s]
100%|██████████| 30/30 [00:00<00:00, 197224.33it/s]
100%|██████████| 30/30 [00:00<00:00, 587986.54it/s]


In [36]:
with open(root_path + "0_alignment_seeds.pkl", "wb") as f:
    pkl.dump(results, f)

## Part 3: Incrementally Extend from Alignment Seeds

In [39]:
with open(root_path + "0_alignment_seeds.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [None]:
# The input to each extension function should be epi2sub

In [40]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [41]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200
            )
    epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

In [43]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 355 0.9833795013850416 || Episode: 255 76.34730538922156 0
(1, 2) Subtitle: 299 0.9933554817275747 || Episode: 197 79.43548387096774 0
(1, 3) Subtitle: 301 0.9709677419354839 || Episode: 201 77.60617760617761 3
(1, 4) Subtitle: 288 0.993103448275862 || Episode: 180 75.63025210084034 1
(1, 5) Subtitle: 277 0.9892857142857143 || Episode: 191 82.32758620689656 0
(1, 7) Subtitle: 306 0.9745222929936306 || Episode: 232 79.72508591065292 8
(1, 9) Subtitle: 280 0.99644128113879 || Episode: 188 80.34188034188034 1
(1, 10) Subtitle: 273 0.9927272727272727 || Episode: 180 76.59574468085107 0
(1, 11) Subtitle: 291 0.9931740614334471 || Episode: 209 81.32295719844358 4
(1, 12) Subtitle: 465 1.0021551724137931 || Episode: 113 46.50205761316872 102
(1, 13) Subtitle: 310 0.9810126582278481 || Episode: 202 76.22641509433963 0
(1, 14) Subtitle: 306 0.9652996845425867 || Episode: 196 80.65843621399176 0
(1, 15) Subtitle: 303 0.9837662337662337 || Episode: 200 80.64516129032258 0
(1, 16)

In [44]:
with open(root_path+'1_alignment_extension.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Part 4: Add Head Tail

In [13]:
root_path = "results/tbbt_en_zh/"
with open(root_path + "1_alignment_extension.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [14]:
def get_optimal_cer_from_episode(ground_truth, hypothesis_pool, utt_ids):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.cer(ground_truth, hypothesis))
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, utt_ids[scores.index(min(scores))]

In [15]:
def fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub):
    gap_pairs = []
    # Fetch the Episode-Subtitle Pair before the episode start
    epi_ids = list(range(0, min(epi2sub.keys())))
    num_epi_token = 0
    for i in epi_ids:
        num_epi_token += len(tbbt_episode[i][0].strip().split(" "))

    sub_ids = [min(epi2sub[min(epi2sub.keys())])]
    num_sub_token = len(en_subset[sub_ids[-1]].strip().split(" "))
    while num_sub_token <= num_epi_token*3:
        sub_ids.append(sub_ids[-1]-1)
        num_sub_token += len(en_subset[sub_ids[-1]].strip().split(" "))
    gap_pairs.append([epi_ids, sorted(sub_ids)])

    # Fetch the Episode-Subtitle Pair after the episode end
    epi_ids = list(range(max(epi2sub.keys())+1, len(tbbt_episode)))
    num_epi_token = 0
    for i in epi_ids:
        num_epi_token += len(tbbt_episode[i][0].strip().split(" "))

    sub_ids = [max(epi2sub[max(epi2sub.keys())])+1]
    num_sub_token = len(en_subset[sub_ids[-1]].strip().split(" "))
    while num_sub_token <= num_epi_token*3:
        sub_ids.append(sub_ids[-1]+1)
        num_sub_token += len(en_subset[sub_ids[-1]].strip().split(" "))
    gap_pairs.append([epi_ids, sub_ids])

    return gap_pairs

In [16]:
def before_after_wer_match(en_subset, episode, epi_ids, sub_ids):
    # Load Sub2Epi
    temp_sub2epi = {}
    temp_epi2sub = {}
    for sub_id in sub_ids:
        subtitle = transformation(en_subset[sub_id].replace("’", " ").replace('…', " "))
        subtitle_tokens = subtitle.strip().split(" ")
        sub_len = len(subtitle_tokens)
        if sub_len <= 4:
            continue

        utt_segments = []
        utt_ids = []
        for epi_id in epi_ids:
            utt = transformation(episode[epi_id][0].replace("’", " ").replace('…', " "))
            utt_tokens = utt.strip().split(" ")
            for j in range(len(utt_tokens) - sub_len+1):
                utt_segments.append(" ".join(utt_tokens[j: j + sub_len]))
                utt_ids.append(epi_id)
        if utt_segments != [] and subtitle not in [" ", ""]:
            score, hypo, truth, index = get_optimal_cer_from_episode(subtitle, utt_segments, utt_ids)
            if score >= 0.5:
                continue
            # print("**********")
            # print("Score:", score)
            # print("Subtitle:", truth)
            # print("Episode:", hypo)
            # print("Epi ID:", index, "Sub ID:", sub_id)
            temp_sub2epi[sub_id] = [index]
            if index not in temp_epi2sub:
                temp_epi2sub[index] = [sub_id]
            else:
                temp_epi2sub[index].append(sub_id)
    # print("Sub2Epi:", temp_sub2epi)
    # print("Epi2Sub:", temp_epi2sub)
    return temp_sub2epi

In [20]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    print(i, j)
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200
            )
    epi2sub = filter_by_idx(alignment_seeds[(i,j)])

    head_tail_sub2epi = {}

    gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
    for item in gap_pairs:
        temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
        for x in temp:
            head_tail_sub2epi[x] = temp[x]
    print(head_tail_sub2epi)
    print("=="*50)

    further_alignment[(i,j)] = head_tail_sub2epi

1 1
{}
1 3
{}
1 4
{}
1 5
{}
1 6
{}
1 7
{180: [0], 182: [1], 183: [3], 520: [280], 521: [280], 522: [281]}
1 9
{}
1 10
{}
1 11
{190: [2], 191: [2]}
1 12
{-417: [24], -414: [25], -413: [26], -412: [27], -410: [29], -409: [30], -408: [31], -405: [33], -404: [34], -402: [36], -401: [37], -400: [37], -399: [37], -398: [38], -396: [40], -393: [44], -392: [44], -391: [45], -389: [47], -388: [48], -387: [48], -384: [51], -383: [52], -382: [53], -381: [28], -380: [56], -379: [57], -378: [58], -377: [58], -376: [58], -375: [59], -374: [59], -372: [62], -371: [62], -369: [63], -368: [63], -367: [63], -365: [65], -364: [66], -363: [67], -361: [68], -360: [69], -359: [70], -358: [70], -357: [71], -356: [71], -355: [72], -354: [72], -353: [73], -352: [73], -351: [74], -349: [76], -348: [77], -347: [77], -345: [78], -344: [81], -343: [81], -341: [82], -340: [83], -339: [85], -338: [85], -337: [86], -335: [87], -333: [89], -330: [93], -329: [93], -328: [93], -327: [94], -326: [94], -325: [94], -324: [

IndexError: list index out of range