In [10]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [11]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets

In [12]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Source file

In [13]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/en_fa/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_fa/fa_subtitles.pkl"
transcript_path = "../../source_data/transcripts/friends/friends_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/friends_en_fa.pkl"
root_path = "results/friends_en_fa/"


# Load Data
with open(en_subtitle_path, 'rb') as f:
    all_en_subtitles = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    all_other_subtitles = pkl.load(f)
with open(transcript_path, 'rb') as f:
    all_transcripts = pkl.load(f)

# Section 1: Coarse-Grain Alignment
Use string match to fetch relevant index of open subtitle for each episode
The code for coarse alignment locates at "../coarse_alignment/align_transcript_subtitle.py"
The alignment results locates at "../coarse_alignment/results/"

In [14]:
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
    coarse_alignments = organize_coarse_alignment_by_seasons(temp)

# Section 2: Fine-Grained Alignment

In this section, we fetch alignment within one episode using the course alignment indexs fetch in the last section.

This include the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

In [6]:
# An example of fetch_subset function
(en_subtitle, other_subtitle, tbbt_episode) = fetch_subsets(
        episode=all_transcripts,
        en_subtitle=all_en_subtitles,
        zh_subtitle=all_other_subtitles,
        results=coarse_alignments,
        season_id=1,
        episode_id=1,
        bias=200,
        zh_split=False
    )

for a, b in zip(en_subtitle, other_subtitle):
    print(a, b)

Shoot! تير بزن
My condolences to the widow. تسليتم رو بابت بيوه شدنت اعلام ميكنم
We may need the hostage. ممكنه به گروگان احتياج پيدا كنيم
Okay, let's go, Suzy Homemaker. باشه.. بزن بريم خانوم كدبانو
Just get her in the car. Let's get going. سوار ماشينش كن بزنيد بريم
I thought this looked like yourwork. فكر ميكنم شبيه كاراي تو باشه حالا بريم.
Let's go. 
I'll briefyou in the air. اون بالا برات توضيح ميدم
You're welcome. Let's go! قابلتو نداره بريم ...
Let's get out ofhere! از اينجا بريم بيرون
Let's go! بريم ..
Well, you tell that son of a bitch this is a Bright Boy Alert. خب . به اون حروم زاده بگو كه اين هشدار روشن بچه ست
I repeat: A Bright Boy Alert. تكرار ميكنم هشدار روشن بچه
This is not a drill. Do you understand that? اين يك تمرين نيست متوجه شدي؟
Hold on. خيلي خب ..
Put me through to the White House. يك لحظه من رو به كاخ سفيد وصل كن
The minimum safe distance is 1 2 miles. Get the marine patrol in right away. حداقل فاصله ي امن 12 مايله همين الان گشت دريايي رو بگير
I want the Coast Gu

## Part2: Generate Alignment Seeds

In [15]:
from utils.alignment_seeds import *

In [8]:
# results = {}
# for i in range(12):
#     for j in tqdm(range(30)):
#         try:
#             (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
#                 episode=all_transcripts,
#                 en_subtitle=all_en_subtitles,
#                 zh_subtitle=all_other_subtitles,
#                 results=coarse_alignments,
#                 season_id=i+1,
#                 episode_id=j+1,
#                 bias=200
#             )
#             temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=6)
#             results[(i+1, j+1)] = temp
#         except:
#             pass

In [9]:
# Check the source alignment
results = {}
for i in sorted(list(coarse_alignments.keys())):
    for j in sorted(list(coarse_alignments[i].keys())):
        if (i, j) not in all_transcripts:
            continue
        try:
            print("Season:", i, "  Episode:", j)
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
            temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=12)
            if temp != {}:
                results[(i, j)] = temp
        except:
            print("Pass Season", i, "Episode:", j)

Season: 1   Episode: 1
Season: 1   Episode: 2
Season: 1   Episode: 3
Season: 1   Episode: 4
Season: 1   Episode: 5
Pass Season 1 Episode: 5
Season: 1   Episode: 6
Season: 1   Episode: 7
Season: 1   Episode: 8
Season: 1   Episode: 9
Season: 1   Episode: 10
Season: 1   Episode: 11
Season: 1   Episode: 12
Season: 1   Episode: 13
Season: 1   Episode: 14
Season: 1   Episode: 15
Season: 1   Episode: 16
Season: 1   Episode: 17
Season: 1   Episode: 18
Season: 1   Episode: 19
Season: 1   Episode: 20
Season: 1   Episode: 21
Season: 1   Episode: 22
Season: 1   Episode: 23
Season: 1   Episode: 24
Season: 2   Episode: 1
Pass Season 2 Episode: 1
Season: 2   Episode: 2
Pass Season 2 Episode: 2
Season: 2   Episode: 3
Pass Season 2 Episode: 3
Season: 2   Episode: 4
Pass Season 2 Episode: 4
Season: 2   Episode: 5
Season: 2   Episode: 6
Season: 2   Episode: 7
Season: 2   Episode: 8
Season: 2   Episode: 9
Season: 2   Episode: 10
Season: 2   Episode: 11
Season: 2   Episode: 12
Season: 2   Episode: 13
Seaso

In [19]:
with open(root_path + "0_alignment_seeds_12.pkl", "wb") as f:
    pkl.dump(results, f)

In [13]:
with open(root_path + "0_alignment_seeds_temp.pkl", "wb") as f:
    pkl.dump(results, f)

## Part 3: Incrementally Extend from Alignment Seeds

In [11]:
with open(root_path + "0_alignment_seeds_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [11]:
# The input to each extension function should be epi2sub

In [16]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [13]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
    epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

In [16]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 403 0.9757869249394673 || Episode: 235 74.60317460317461 5
(1, 2) Subtitle: 279 0.9653979238754326 || Episode: 190 74.80314960629921 7
(1, 3) Subtitle: 253 0.9730769230769231 || Episode: 168 61.76470588235294 38
(1, 4) Subtitle: 300 0.9523809523809523 || Episode: 176 65.67164179104478 3
(1, 6) Subtitle: 300 0.9523809523809523 || Episode: 159 69.43231441048034 19
(1, 7) Subtitle: 247 0.9686274509803922 || Episode: 158 62.698412698412696 12
(1, 8) Subtitle: 251 0.9365671641791045 || Episode: 175 71.13821138211382 12
(1, 9) Subtitle: 281 0.972318339100346 || Episode: 186 76.85950413223141 4
(1, 10) Subtitle: 309 0.9507692307692308 || Episode: 179 74.27385892116183 6
(1, 11) Subtitle: 8 1.1428571428571428 || Episode: 1 0.33670033670033667 13
(1, 12) Subtitle: 227 0.9227642276422764 || Episode: 144 53.333333333333336 5
(1, 13) Subtitle: 288 0.9230769230769231 || Episode: 176 68.75 5
(1, 14) Subtitle: 298 0.9834983498349835 || Episode: 163 79.12621359223301 3
(1, 15) Subtitl

In [20]:
with open(root_path+'1_alignment_extension_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Part 4: Add Head Tail

In [17]:
from utils.head_tail_alignment import *

root_path = "results/friends_en_fa/"
with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])

        head_tail_sub2epi = {}

        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)

        # print(temp)
        # print(filter_by_idx(epi2sub))
        # print("=="*50)
    except:
        print("Pass i j")

with open(root_path+'1_alignment_head_tail_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

with open(root_path+'1_alignment_ultimate_12.pkl', 'wb') as f:
    pkl.dump(ultimate_data, f)

1 1
1 2
1 3
1 4
1 6
1 7
1 8
1 9
1 10
1 11
Pass i j
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 24
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
3 1
3 2
3 3
3 5
3 6
3 7
3 8
3 9
3 11
3 12
3 13
3 14
3 16
3 17
3 18
3 19
3 20
3 21
3 23
3 24
3 25
4 1
4 2
4 3
4 4
4 6
4 7
4 9
4 10
4 11
4 12
4 13
Pass i j
4 14
4 16
4 17
Pass i j
4 18
4 19
4 20
4 22
5 2
5 5
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 15
5 16
5 17
5 18
5 19
5 20
5 21
5 23
5 24
6 1
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 11
6 12
6 14
6 15
6 16
6 17
6 19
6 20
6 21
6 22
6 23
6 24
6 25
7 4
7 6
7 7
7 9
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 21
Pass i j
7 23
8 3
8 4
8 12
8 13
8 15
8 17
8 19
8 21
8 22
8 23
8 24
Pass i j
9 2
9 3
9 4
9 5
9 7
9 8
9 10
Pass i j
9 11
9 13
9 14
9 16
9 17
9 18
9 21
10 2
10 5
10 7
10 9
10 10
Pass i j
10 11
10 12
10 15
10 16
10 17
10 18


In [18]:
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(all_transcripts[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 425 0.8432539682539683 || Episode: 251 315 79.68253968253968 0
(1, 2) Subtitle: 285 0.9595959595959596 || Episode: 194 254 76.37795275590551 0
(1, 3) Subtitle: 276 0.8817891373801917 || Episode: 183 272 67.27941176470588 3
(1, 4) Subtitle: 318 0.8712328767123287 || Episode: 191 268 71.26865671641791 1
(1, 6) Subtitle: 317 0.9161849710982659 || Episode: 170 229 74.235807860262 0
(1, 7) Subtitle: 267 0.9501779359430605 || Episode: 169 252 67.06349206349206 0
(1, 8) Subtitle: 260 0.896551724137931 || Episode: 181 246 73.57723577235772 0
(1, 9) Subtitle: 288 0.9320388349514563 || Episode: 191 242 78.92561983471074 0
(1, 10) Subtitle: 311 0.9452887537993921 || Episode: 181 241 75.10373443983403 3
(1, 11) Subtitle: 8 1.1428571428571428 || Episode: 1 297 0.33670033670033667 13
(1, 12) Subtitle: 253 0.8971631205673759 || Episode: 161 270 59.62962962962963 1
(1, 13) Subtitle: 290 0.9206349206349206 || Episode: 178 256 69.53125 0
(1, 14) Subtitle: 301 0.9585987261146497 || Episo

In [19]:
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
    epi2sub = filter_by_idx(alignment_seeds[(i,j)])
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

with open(root_path+'1_alignment_extension_12_again.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [20]:
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(all_transcripts[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 425 0.8432539682539683 || Episode: 251 315 79.68253968253968 0
(1, 2) Subtitle: 285 0.9595959595959596 || Episode: 194 254 76.37795275590551 0
(1, 3) Subtitle: 276 0.8817891373801917 || Episode: 183 272 67.27941176470588 3
(1, 4) Subtitle: 318 0.8712328767123287 || Episode: 191 268 71.26865671641791 1
(1, 6) Subtitle: 317 0.9161849710982659 || Episode: 170 229 74.235807860262 0
(1, 7) Subtitle: 267 0.9501779359430605 || Episode: 169 252 67.06349206349206 0
(1, 8) Subtitle: 260 0.896551724137931 || Episode: 181 246 73.57723577235772 0
(1, 9) Subtitle: 288 0.9320388349514563 || Episode: 191 242 78.92561983471074 0
(1, 10) Subtitle: 311 0.9452887537993921 || Episode: 181 241 75.10373443983403 3
(1, 11) Subtitle: 8 1.1428571428571428 || Episode: 1 297 0.33670033670033667 13
(1, 12) Subtitle: 253 0.8971631205673759 || Episode: 161 270 59.62962962962963 1
(1, 13) Subtitle: 290 0.9206349206349206 || Episode: 178 256 69.53125 0
(1, 14) Subtitle: 301 0.9585987261146497 || Episo