In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets

In [3]:
# Sentence normalisation pipeline: the jiwer transforms below are applied in
# the order in which they are listed.
# NOTE(review): whitespace is collapsed before punctuation is removed, so
# deleting a free-standing punctuation mark may leave a double space behind;
# confirm this ordering is intended.
_normalisation_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(_normalisation_steps)

## Load Source file

In [4]:
# Locations of the pickled inputs and of the directory receiving the results
en_subtitle_path = "../../source_data/subtitles/en_fa/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_fa/fa_subtitles.pkl"
transcript_path = "../../source_data/transcripts/tbbt/tbbt_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/tbbt_en_fa.pkl"
root_path = "results/tbbt_en_fa/"


def _read_pickle(path):
    """Return the object deserialized from the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code; only use with trusted files.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Read the English subtitles, the other-language subtitles and the transcripts
all_en_subtitles = _read_pickle(en_subtitle_path)
all_other_subtitles = _read_pickle(other_subtitle_path)
all_transcripts = _read_pickle(transcript_path)

# Section 1: Coarse-Grain Alignment
Use string matching to fetch the relevant OpenSubtitles index range for each episode.
The code for the coarse alignment is located at "../coarse_alignment/align_transcript_subtitle.py".
The alignment results are located at "../coarse_alignment/results/".

In [5]:
# Read the pickled coarse alignment and regroup it with
# organize_coarse_alignment_by_seasons; the result is indexed below as
# coarse_alignments[season][episode].
with open(coarse_alignment_path, 'rb') as alignment_file:
    raw_coarse_alignment = pkl.load(alignment_file)
coarse_alignments = organize_coarse_alignment_by_seasons(raw_coarse_alignment)

# Section 2: Fine-Grained Alignment

In this section, we compute the alignment within each episode, using the coarse alignment indices fetched in the previous section.

This includes the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

In [14]:
# Demonstration of fetch_subsets on a single episode (season 1, episode 3)
example_request = dict(
    episode=all_transcripts,
    en_subtitle=all_en_subtitles,
    zh_subtitle=all_other_subtitles,
    results=coarse_alignments,
    season_id=1,
    episode_id=3,
    bias=200,
)
(en_subtitle, other_subtitle, tbbt_episode) = fetch_subsets(**example_request)

# Print the two returned subtitle sequences pairwise, one pair per line
for en_line, other_line in zip(en_subtitle, other_subtitle):
    print(en_line, other_line)

Sheldon, this is not your home. اينجا خونه تو نيست
This is not anyone's home. This is a swirling vortex of entropy. اين خونه هيچ کسي نيست اينجا گردابي قلتان از بي نظميه
When the transvestite lived here, you didn't care how he kept the place. ببينم همسايه قبلي اينجا زندگي ميکرد وضعيت اينجا برات مهم نبود
Because it was immaculate. به خاطر اين که ايشون خيلي معصوم بودند
I mean, you open that man's closet it was left to right, evening gowns, cocktail dresses and his police uniforms. يعني در کمد لباسشو که باز ميکردي از چپ به راست لباس راحتي عصرونه لباس مهموني و بعد هم لباس يونيفورم پليسي شو گذاشته بود
What were you doing in his closet? تو با کمد لباس اون چي کار داشتي؟
I helped him run some cable for a webcam. داشتم کمکش ميکردم که براي وب کم سيم بکشيم
This just arrived. Brought this up. Just now. سلام پني ، همين الان آورديمش بالا همين حالا
Was it hard getting up the stairs? بالا آوردنش از پله ها خيلي سخت بود
We'll get out of your hair. - Okay, great. خب ما زحمتو کم ميکنيم
Thank you again. باش

## Part 2: Generate Alignment Seeds

In [6]:
from utils.alignment_seeds import *

In [13]:
# Generate alignment seeds for every (season, episode) that has both a coarse
# alignment entry and a transcript. Episodes whose seed dict comes back empty
# are not stored in `results`.
results = {}
for i in sorted(coarse_alignments):
    for j in sorted(coarse_alignments[i]):
        if (i, j) not in all_transcripts:
            continue
        print("Season:", i, "  Episode:", j)
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
            seeds = generate_alignment_seeds(en_subset, tbbt_episode, window_size=12)
        except Exception as err:
            # Processing stays best-effort (one bad episode must not stop the
            # run), but the failure is reported instead of silently dropped.
            # Catching Exception rather than using a bare `except` lets
            # KeyboardInterrupt / SystemExit still abort the loop.
            print("  Skipped season", i, "episode", j, "-", type(err).__name__ + ":", err)
            continue
        if seeds != {}:
            results[(i, j)] = seeds

Season: 1   Episode: 1
Season: 1   Episode: 2
Season: 1   Episode: 3
Season: 1   Episode: 4
Season: 1   Episode: 5
Season: 1   Episode: 6
Season: 1   Episode: 7
Season: 1   Episode: 8
Season: 1   Episode: 9
Season: 1   Episode: 10
Season: 1   Episode: 11
Season: 1   Episode: 12
Season: 1   Episode: 13
Season: 1   Episode: 14
Season: 1   Episode: 15
Season: 1   Episode: 16
Season: 2   Episode: 1
Season: 2   Episode: 2
Season: 2   Episode: 3
Season: 2   Episode: 4
Season: 2   Episode: 5
Season: 2   Episode: 6
Season: 2   Episode: 7
Season: 2   Episode: 9
Season: 2   Episode: 11
Season: 2   Episode: 12
Season: 2   Episode: 13
Season: 2   Episode: 14
Season: 2   Episode: 15
Season: 2   Episode: 16
Season: 2   Episode: 17
Season: 2   Episode: 18
Season: 2   Episode: 19
Season: 2   Episode: 20
Season: 2   Episode: 21
Season: 2   Episode: 22
Season: 2   Episode: 23
Season: 3   Episode: 1
Season: 3   Episode: 2
Season: 3   Episode: 3
Season: 3   Episode: 4
Season: 3   Episode: 5
Season: 3   Ep

In [16]:
# NOTE(review): this whole cell is disabled (commented-out) code. It appears to
# be an earlier variant of the seed-generation loop that brute-forces season
# ids 1-12 and episode ids 1-30 with window_size=6 and zh_split=False, instead
# of iterating over the keys of `coarse_alignments`. The tqdm output recorded
# under this cell presumably comes from a run made before it was commented
# out - confirm, and delete the cell if it is no longer needed.
# results = {}
# for i in range(12):
#     for j in tqdm(range(30)):
#         try:
#             (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
#                 episode=all_transcripts,
#                 en_subtitle=all_en_subtitles,
#                 zh_subtitle=all_other_subtitles,
#                 results=coarse_alignments,
#                 season_id=i+1,
#                 episode_id=j+1,
#                 bias=200,
#                 zh_split=False
#             )
#             temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=6)
#             results[(i+1, j+1)] = temp
#         except:
#             pass

100%|██████████| 30/30 [00:44<00:00,  1.49s/it]
100%|██████████| 30/30 [00:08<00:00,  3.48it/s]
100%|██████████| 30/30 [00:05<00:00,  5.28it/s]
100%|██████████| 30/30 [00:31<00:00,  1.04s/it]
100%|██████████| 30/30 [00:16<00:00,  1.84it/s]
100%|██████████| 30/30 [00:35<00:00,  1.20s/it]
100%|██████████| 30/30 [00:36<00:00,  1.21s/it]
100%|██████████| 30/30 [00:50<00:00,  1.70s/it]
100%|██████████| 30/30 [00:31<00:00,  1.05s/it]
100%|██████████| 30/30 [00:00<00:00, 613800.59it/s]
100%|██████████| 30/30 [00:00<00:00, 519955.04it/s]
100%|██████████| 30/30 [00:00<00:00, 622916.44it/s]


In [14]:
# Persist the generated alignment seeds (`results`) under the results directory
seeds_output_path = root_path + "0_alignment_seeds_12.pkl"
with open(seeds_output_path, "wb") as seeds_file:
    pkl.dump(results, seeds_file)

In [None]:
# Reload the alignment seeds from the file the seed-generation step writes
# ("0_alignment_seeds_12.pkl"). The suffix-less "0_alignment_seeds.pkl" is not
# produced anywhere in this notebook, so reading it fails on a fresh run or
# silently picks up a stale file.
with open(root_path + "0_alignment_seeds_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

## Part 3: Incrementally Extend from Alignment Seeds

In [15]:
# Reload the saved alignment seeds; the extension stage below iterates over them
seeds_input_path = root_path + "0_alignment_seeds_12.pkl"
with open(seeds_input_path, "rb") as seeds_file:
    alignment_seeds = pkl.load(seeds_file)

In [None]:
# The input to each extension function should be epi2sub

In [7]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [17]:
def _aligned_count(epi2sub):
    """Return the size of `turn_sub2epi_into_epi2sub(epi2sub)`.

    Used as the convergence measure of the extension loops.
    (Presumably this is the number of distinct aligned subtitle indices;
    verify against the helper's implementation.)
    """
    return len(turn_sub2epi_into_epi2sub(epi2sub))


def _match_within_gaps(epi2sub, en_subset, tbbt_episode):
    """Run the strict, WER and WER-substring gap matchers once, in that order."""
    for matcher in (
        add_strict_match_within_gaps,
        add_wer_match_within_gaps,
        add_wer_substring_match_within_gaps,
    ):
        # Gaps are recomputed before each matcher so that it works on the
        # mapping produced by the previous one.
        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(matcher(gaps, epi2sub, en_subset, tbbt_episode))
    return epi2sub


def _neighbor_step(epi2sub, en_subset, tbbt_episode):
    """One pass of neighbor extension."""
    return filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))


def _gaps_then_neighbors_step(epi2sub, en_subset, tbbt_episode):
    """One pass of gap matching followed by neighbor extension."""
    epi2sub = _match_within_gaps(epi2sub, en_subset, tbbt_episode)
    return _neighbor_step(epi2sub, en_subset, tbbt_episode)


def _gaps_then_sliding_step(epi2sub, en_subset, tbbt_episode):
    """One pass of gap matching followed by sliding neighbor extension."""
    epi2sub = _match_within_gaps(epi2sub, en_subset, tbbt_episode)
    return filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))


def _repeat_until_stable(step, epi2sub, en_subset, tbbt_episode):
    """Apply `step` at least once, and again until `_aligned_count` stops changing."""
    while True:
        count_before = _aligned_count(epi2sub)
        epi2sub = step(epi2sub, en_subset, tbbt_episode)
        if _aligned_count(epi2sub) == count_before:
            return epi2sub


# Extend the alignment seeds of every episode; the result is keyed by
# (season, episode) exactly like `alignment_seeds`.
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=all_transcripts,
        en_subtitle=all_en_subtitles,
        zh_subtitle=all_other_subtitles,
        results=coarse_alignments,
        season_id=i,
        episode_id=j,
        bias=200,
        zh_split=True
    )
    epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i, j)]))

    # Extend the neighbor
    epi2sub = _repeat_until_stable(_neighbor_step, epi2sub, en_subset, tbbt_episode)

    # Perform a set of extension
    epi2sub = _repeat_until_stable(_gaps_then_neighbors_step, epi2sub, en_subset, tbbt_episode)

    # Perform alignment within gaps
    epi2sub = _repeat_until_stable(_gaps_then_sliding_step, epi2sub, en_subset, tbbt_episode)

    # Extend Subtitle ids with its min and max index, i.e. replace each id list
    # by the full contiguous range between its smallest and largest id.
    # list(range(...)) avoids a comprehension variable that shadows the season
    # index `i` of the enclosing loop.
    for x in epi2sub:
        epi2sub[x] = list(range(min(epi2sub[x]), max(epi2sub[x]) + 1))

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i, j)] = epi2sub

In [18]:
# Per-episode report for the extension stage. Printed columns:
#   key, "Subtitle:", size of the inverted mapping, size / (max - min) of its keys,
#   "||", "Episode:", number of keys in the alignment,
#   that number as a percentage of len(all_transcripts[key]), smallest key.
# NOTE(review): the ratio divides by (max - min), not (max - min + 1), so a
# fully contiguous index range yields a value slightly above 1 - confirm that
# this is the intended metric.
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    if not epi2sub or not sub2epi:
        # max()/min() below would raise ValueError on an empty alignment.
        print(item, "Subtitle:", 0, "||", "Episode:", len(epi2sub), "(empty alignment)")
        continue
    index_span = max(sub2epi) - min(sub2epi)
    transcript_len = len(all_transcripts[item])
    # Guard both divisions: a single aligned index gives a zero span, and an
    # empty transcript gives a zero denominator.
    density = len(sub2epi) / index_span if index_span else float("nan")
    coverage = len(epi2sub) / transcript_len * 100 if transcript_len else float("nan")
    print(item, "Subtitle:", len(sub2epi), density, "||", "Episode:", len(epi2sub), coverage, min(epi2sub.keys()))
    # print(epi2sub)

(1, 1) Subtitle: 353 0.9778393351800554 || Episode: 255 76.34730538922156 0
(1, 2) Subtitle: 297 0.9933110367892977 || Episode: 195 78.62903225806451 2
(1, 3) Subtitle: 286 0.9533333333333334 || Episode: 189 72.97297297297297 4
(1, 4) Subtitle: 273 0.9855595667870036 || Episode: 171 71.84873949579831 7
(1, 5) Subtitle: 271 0.9818840579710145 || Episode: 187 80.60344827586206 0
(1, 7) Subtitle: 295 0.9735973597359736 || Episode: 225 77.31958762886599 10
(1, 9) Subtitle: 277 0.9857651245551602 || Episode: 185 79.05982905982906 1
(1, 10) Subtitle: 269 0.9853479853479854 || Episode: 181 77.02127659574468 0
(1, 11) Subtitle: 282 0.9757785467128027 || Episode: 205 79.76653696498055 4
(1, 12) Subtitle: 464 1.0 || Episode: 112 46.09053497942387 102
(1, 13) Subtitle: 295 0.9609120521172638 || Episode: 195 73.58490566037736 4
(1, 14) Subtitle: 299 0.9614147909967846 || Episode: 193 79.42386831275721 0
(1, 15) Subtitle: 296 0.9704918032786886 || Episode: 195 78.62903225806451 0
(1, 16) Subtitle: 

In [19]:
# Persist the extended alignment (`further_alignment`) for the next stage
extension_output_path = root_path + '1_alignment_extension_12.pkl'
with open(extension_output_path, 'wb') as extension_file:
    pkl.dump(further_alignment, extension_file)

# Part 4: Add Head Tail

In [8]:
from utils.head_tail_alignment import *

root_path = "results/tbbt_en_fa/"
# NOTE: despite its name, `alignment_seeds` holds the output of the extension
# stage here (loaded from "1_alignment_extension_12.pkl").
with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

# Stage 1: for each episode, compute the additional head/tail alignment.
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
            episode=all_transcripts,
            en_subtitle=all_en_subtitles,
            zh_subtitle=all_other_subtitles,
            results=coarse_alignments,
            season_id=i,
            episode_id=j,
            bias=200,
            zh_split=True
        )
        epi2sub = filter_by_idx(alignment_seeds[(i, j)])

        head_tail_sub2epi = {}

        # Collect the WER matches of every pair returned by fetch_before_after
        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # Discard the whole head/tail result when its inverted mapping
        # contains a negative key.
        if temp != {} and min(turn_sub2epi_into_epi2sub(temp).keys()) < 0:
            temp = {}
        further_alignment[(i, j)] = filter_by_idx(temp)
    except Exception as err:
        # Best-effort: an episode that fails simply gets no head/tail
        # alignment. Report which episode failed and why (the previous message
        # printed the literal text "i j"), and catch Exception only, so that
        # KeyboardInterrupt can still stop the run.
        print("Pass", i, j, "-", type(err).__name__ + ":", err)

with open(root_path + '1_alignment_head_tail_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Stage 2: merge the head/tail alignment into the extension-stage alignment.
# Episodes without a head/tail result keep their extension-stage alignment.
ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

with open(root_path + '1_alignment_ultimate_12.pkl', 'wb') as f:
    pkl.dump(ultimate_data, f)

1 1
1 2
1 3
1 4
1 5
1 7
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
2 4
2 6
3 21
3 23
4 1
4 3
4 4
4 5
4 6
4 8
4 16
4 19
4 21
4 22
4 23
4 24
5 2
5 6
5 12
5 14
5 15
5 20
5 23
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 14
6 15
6 16
6 17
6 19
6 20
6 21
7 1
7 2
7 6
7 10
7 11
7 15
7 16
7 17
7 18
7 19
7 20
7 22
7 23
8 1
8 2
8 3
8 6
8 7
8 9
8 11
8 12
8 14
8 15
8 16
8 17
8 18
8 19
8 20
8 21
8 22
8 23
8 24
9 1
9 3
9 4
9 7
9 8
9 9
9 10
9 11
9 12
9 16
9 17
9 18


In [9]:
# Per-episode report for the merged ("ultimate") alignment. Printed columns:
#   key, "Subtitle:", size of the inverted mapping, size / (max - min) of its keys,
#   "||", "Episode:", number of keys in the alignment, len(all_transcripts[key]),
#   the former as a percentage of the latter, smallest key.
# NOTE(review): the ratio divides by (max - min), not (max - min + 1), so a
# fully contiguous index range yields a value slightly above 1 - confirm that
# this is the intended metric.
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    transcript_len = len(all_transcripts[item])
    if not epi2sub or not sub2epi:
        # max()/min() below would raise ValueError on an empty alignment.
        print(item, "Subtitle:", 0, "||", "Episode:", len(epi2sub), transcript_len, "(empty alignment)")
        continue
    index_span = max(sub2epi) - min(sub2epi)
    # Guard both divisions: a single aligned index gives a zero span, and an
    # empty transcript gives a zero denominator.
    density = len(sub2epi) / index_span if index_span else float("nan")
    coverage = len(epi2sub) / transcript_len * 100 if transcript_len else float("nan")
    print(item, "Subtitle:", len(sub2epi), density, "||", "Episode:", len(epi2sub), transcript_len, coverage, min(epi2sub.keys()))
    # print(epi2sub)

(1, 1) Subtitle: 353 0.9778393351800554 || Episode: 255 334 76.34730538922156 0
(1, 2) Subtitle: 299 0.9933554817275747 || Episode: 197 248 79.43548387096774 0
(1, 3) Subtitle: 293 0.9361022364217252 || Episode: 193 259 74.5173745173745 0
(1, 4) Subtitle: 285 0.976027397260274 || Episode: 178 238 74.78991596638656 1
(1, 5) Subtitle: 273 0.978494623655914 || Episode: 189 232 81.46551724137932 0
(1, 7) Subtitle: 307 0.9359756097560976 || Episode: 234 291 80.41237113402062 0
(1, 9) Subtitle: 277 0.9857651245551602 || Episode: 185 234 79.05982905982906 1
(1, 10) Subtitle: 271 0.9854545454545455 || Episode: 182 235 77.4468085106383 0
(1, 11) Subtitle: 287 0.9663299663299664 || Episode: 209 257 81.32295719844358 2
(1, 12) Subtitle: 464 1.0 || Episode: 112 243 46.09053497942387 102
(1, 13) Subtitle: 310 0.950920245398773 || Episode: 204 265 76.9811320754717 0
(1, 14) Subtitle: 303 0.9588607594936709 || Episode: 196 243 80.65843621399176 0
(1, 15) Subtitle: 301 0.9555555555555556 || Episode: 2