In [36]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [37]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets
from utils.preprocessing import fetch_subsets_a4k

In [38]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Source file

In [39]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/friends/friends_en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/friends/friends_zh_subtitles.pkl"
transcript_path = "../../source_data/transcripts/friends/friends_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/friends_en_zh.pkl"
root_path = "results/friends_en_zh_a4k/"


# Load Data
with open(en_subtitle_path, 'rb') as f:
    all_en_subtitles = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    all_other_subtitles = pkl.load(f)
with open(transcript_path, 'rb') as f:
    all_transcripts = pkl.load(f)

# Section 1: Coarse-Grain Alignment
Use string match to fetch relevant index of open subtitle for each episode
The code for coarse alignment locates at "../coarse_alignment/align_transcript_subtitle.py"
The alignment results locates at "../coarse_alignment/results/"

In [40]:
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
    coarse_alignments = organize_coarse_alignment_by_seasons(temp)

# Section 2: Fine-Grained Alignment

In this section, we fetch alignment within one episode using the course alignment indexs fetch in the last section.

This include the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

## Part2: Generate Alignment Seeds

In [41]:
print(all_en_subtitles[(1, 1)])

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Friends S01E01: The Pilot', '没有人告诉你活着有多累', '上班受罪  口袋空空  爱情变累赘', '寂寞又潦倒  受气又受累', '日复一日  年复一年', '时运不济  活着受罪', '你我永不离弃', '纵有狂风暴雨', '你我永不离弃', ' 支持一如往昔', '你我永不离弃', '你我相偎相依', "There's nothing to tell. It's just some guy I work with.", "Come on. You're going out with a guy.", "There's gotta be something wrong with him.", 'So does he have a hump and a hair piece?', 'Wait. Does he eat chalk?', "I don't want her to go through what I went through with Carl. Oh.", 'Okay, everybody relax. This is not even a date.', "It's just two people going out to dinner and not having sex.", 'Sounds like a date to me.', "I'm back in high school, in the middle of the cafeteria...", "...and I realize I'm totally naked.", "- Oh, yeah. - I've had that dream.", 'Then I look down and I realize there is a phone...', '...there.', "- Instead of... - That's right!", "MONICA: That one, I've never had. P

In [42]:
from utils.alignment_seeds import *

In [43]:
# Check the source alignment
results = {}
for i, j in sorted(all_other_subtitles.keys()):
    print("Season:", i, "  Episode:", j)
    # en_subset = all_en_subtitles[(i, j)]
    # zh_subset = all_other_subtitles[(i, j)]
    # tbbt_episode = all_transcripts[(i, j)]
    en_subset, zh_subset, tbbt_episode = fetch_subsets_a4k(all_transcripts, all_en_subtitles, all_other_subtitles, i, j, zh_split=True)

    print(tbbt_episode[1])
    print(en_subset[30])
    print(en_subset[31])
    print()
    temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=12)
    if temp != {}:
        results[(i, j)] = temp

Season: 1   Episode: 1
["C'mon, you're going out with the guy! There's gotta be something wrong with him!", 'Joey Tribbiani']
Friends The Pilot
没有人告诉你活着有多累

Season: 1   Episode: 2
["Yeah, right!.......Y'serious?", 'Joey Tribbiani']
What you guys don't understand is...
for us, kissing is as important as any part of it.

Season: 1   Episode: 3
['Hey, Pheebs! Hi!', '#ALL#']
Hi, guys. Hey, Phoebe.
Hey, oh, how did it go?

Season: 1   Episode: 4
['Okay, okay. If I were omnipotent for a day, I would want, um, world peace, no more hunger, good things for the rain-forest...And bigger boobs!', 'Phoebe Buffay']
Oh, that's good.
Okay. All right, Phoebe?

Season: 1   Episode: 5
["Not that big a deal? It's amazing. Ok, you just reach in there, there's one little maneuver, and bam, a bra right out the sleeve. All right, as far as I'm concerned, there is nothing a guy can do that even comes close. Am I right?", 'Ross Geller']
Would you let it go? It's not a big deal.
Not a big deal? It's amazing.

Se

In [44]:
print(results.keys())

dict_keys([(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (1, 21), (1, 22), (1, 23), (1, 24), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (2, 24), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (3, 15), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (3, 21), (3, 22), (3, 23), (3, 24), (3, 25), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (5, 15), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (5, 21), (5, 22), (5, 23), (5, 24), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (6, 10), (6, 11), (6, 12), (6, 13), (6, 14), (6, 15), (6, 16), (6, 17), (6, 18),

In [45]:
print(results[(1,1)])

{47: [4], 49: [5], 50: [6], 64: [18], 69: [22], 71: [24], 80: [32], 93: [41], 98: [44], 107: [48], 115: [49], 118: [49], 120: [49], 122: [50], 131: [58], 141: [61], 159: [71], 160: [72], 161: [73], 170: [81], 176: [87], 184: [95], 194: [103], 199: [107], 236: [142], 243: [143], 249: [148], 275: [161], 276: [165], 283: [171], 292: [178], 300: [186], 301: [187], 305: [190], 309: [195], 314: [199], 318: [202], 319: [203], 325: [209], 328: [212], 334: [217], 337: [219], 346: [227], 355: [233], 358: [235], 363: [241], 364: [242], 372: [244], 382: [258], 383: [259], 401: [271], 421: [282, 283], 425: [287], 428: [293]}


In [46]:
with open(root_path + "0_alignment_seeds_12.pkl", "wb") as f:
    pkl.dump(results, f)

## Part 3: Incrementally Extend from Alignment Seeds

In [47]:
with open(root_path + "0_alignment_seeds_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [48]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [75]:
# further_alignment = {}
# for (i, j) in alignment_seeds.keys():
#     print(i, j)
#     if (i, j)==(1,2):
#         continue
#     en_subset = all_en_subtitles[(i, j)]
#     zh_subset = all_other_subtitles[(i, j)]
#     tbbt_episode = all_transcripts[(i, j)]
#     epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
#     # Extend the neighbor
#     while True:
#         temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
#         epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
#         if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
#             break
#
#     # Perform a set of extension
#     while True:
#         temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
#
#         if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
#             break
#
#     # Perform alignment within gaps
#     while True:
#         temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         gaps, _ = get_subset_in_gaps(epi2sub)
#         epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))
#
#         epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))
#
#         if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
#             break
#
#     # Extend Subtitle ids with its min and max index
#     for x in epi2sub:
#         epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]
#
#     # Perform ultimate alignment
#     gaps = get_final_stage_gap_pairs(epi2sub)
#     epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)
#
#     further_alignment[(i,j)] = epi2sub

1 1
1 2
1 3
1 4
1 5


In [78]:
# for item in further_alignment:
#     print(item)
#     print(further_alignment[item])
#     print()

(1, 1)
{5: [47], 6: [48, 49], 7: [50], 8: [51, 52], 10: [54, 55], 15: [58], 17: [59, 60], 19: [61, 62], 20: [63], 21: [64, 65], 23: [66], 25: [68], 28: [70, 71], 30: [72], 31: [73], 32: [74], 33: [75], 34: [76, 77, 78], 35: [79, 80], 39: [81, 82, 83], 40: [84, 85], 41: [86, 87], 42: [88], 43: [89], 44: [90, 91, 92], 45: [93], 46: [94, 95, 96], 49: [99], 50: [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113], 51: [114], 52: [115], 54: [116, 117], 57: [118, 119, 120, 121], 59: [122], 60: [123], 61: [124], 62: [125, 126, 127, 128, 129, 130, 131, 132], 63: [133], 64: [134, 135], 65: [136], 66: [137, 138, 139], 67: [140, 141], 68: [142, 143, 144], 69: [145], 70: [146], 71: [147, 148], 72: [149, 150], 73: [151], 74: [152], 75: [153], 77: [154], 79: [155], 80: [156, 157], 81: [158], 83: [159], 84: [160], 85: [161], 86: [162], 87: [163], 89: [164], 91: [165, 166], 92: [167], 93: [168], 97: [170], 102: [171], 103: [172, 173], 104: [174, 175, 176, 177, 178, 179], 105: [180], 

In [49]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    # print(i, j)
    # if (i, j)==(1,2):
    #     continue
    try:
        en_subset, zh_subset, tbbt_episode = fetch_subsets_a4k(all_transcripts, all_en_subtitles, all_other_subtitles, i, j, zh_split=True)
        epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
        # Extend the neighbor
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
            epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Perform a set of extension
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Perform alignment within gaps
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Extend Subtitle ids with its min and max index
        for x in epi2sub:
            epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

        # Perform ultimate alignment
        gaps = get_final_stage_gap_pairs(epi2sub)
        epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

        further_alignment[(i,j)] = epi2sub
        print(i, j)
    except:
        pass

1 1
1 3
1 4
1 5
1 7
1 9
1 11
1 12
1 13
1 14
1 15
1 17
1 18
1 19
1 21
1 22
2 1
2 2
2 3
2 4
2 6
2 7
2 8
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
3 1
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 15
3 18
3 19
3 20
3 21
3 22
3 23
3 24
3 25
5 1
5 2
5 3
5 4
5 5
5 6
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 19
5 21
5 22
5 23
5 24
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 12
6 13
6 14
6 15
6 16
6 17
6 18
6 19
6 20
6 21
6 22
6 23
6 24
6 25
7 1
7 3
7 4
7 5
7 6
7 8
7 9
7 10
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 20
7 21
7 22
7 23
7 24
8 24


In [50]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 361 0.94750656167979 || Episode: 214 67.93650793650794 4
(1, 3) Subtitle: 350 0.9259259259259259 || Episode: 220 80.88235294117648 3
(1, 4) Subtitle: 343 0.9002624671916011 || Episode: 204 76.11940298507463 1
(1, 5) Subtitle: 383 0.979539641943734 || Episode: 220 86.61417322834646 2
(1, 7) Subtitle: 299 0.9256965944272446 || Episode: 170 67.46031746031747 5
(1, 9) Subtitle: 331 0.9043715846994536 || Episode: 187 77.27272727272727 2
(1, 11) Subtitle: 415 0.9496567505720824 || Episode: 228 76.76767676767676 0
(1, 12) Subtitle: 362 0.9402597402597402 || Episode: 214 79.25925925925927 2
(1, 13) Subtitle: 379 0.9451371571072319 || Episode: 197 76.953125 2
(1, 14) Subtitle: 335 0.9710144927536232 || Episode: 173 83.98058252427184 1
(1, 15) Subtitle: 364 0.9430051813471503 || Episode: 195 75.5813953488372 4
(1, 17) Subtitle: 359 0.9497354497354498 || Episode: 239 80.74324324324324 1
(1, 18) Subtitle: 286 0.9196141479099679 || Episode: 191 69.45454545454545 34
(1, 19) Subtitle

In [53]:
with open(root_path+'1_alignment_extension_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Part 4: Add Head Tail

In [90]:
def fetch_before_after_a4k(en_subset, tbbt_episode, epi2sub):
    gap_pairs = []
    gap_pairs.append([
        [i for i in range(0, min(epi2sub.keys()))],
        [i for i in range(0, min(epi2sub[min(epi2sub.keys())]))]
    ])

    gap_pairs.append([
        [i for i in range(max(epi2sub.keys()), len(tbbt_episode))],
        [i for i in range(max(epi2sub[max(epi2sub.keys())]), len(en_subset))]
    ])
    return gap_pairs

In [97]:
from utils.head_tail_alignment import *

with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

print(alignment_seeds.keys())

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    print(i, j)
    # if (i, j)!=(1,5):
    #     continue
    en_subset, zh_subset, tbbt_episode = fetch_subsets_a4k(all_transcripts, all_en_subtitles, all_other_subtitles, i, j, zh_split=True)
    epi2sub = filter_by_idx(alignment_seeds[(i,j)])
    # print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))
    # print(epi2sub)
    # print()
    head_tail_sub2epi = {}
    gap_pairs = fetch_before_after_a4k(en_subset, tbbt_episode, epi2sub)
    for item in gap_pairs:
        temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
        for x in temp:
            head_tail_sub2epi[x] = temp[x]
    temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
    print(temp)
    if temp!={}:
        if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
            temp = {}
    further_alignment[(i,j)] = filter_by_idx(temp)

    print("=="*50)

dict_keys([(1, 1), (1, 3), (1, 4), (1, 5), (1, 7), (1, 9), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 17), (1, 18), (1, 19), (1, 21), (1, 22), (2, 1), (2, 2), (2, 3), (2, 4), (2, 6), (2, 7), (2, 8), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (2, 24), (3, 1), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 15), (3, 18), (3, 19), (3, 20), (3, 21), (3, 22), (3, 23), (3, 24), (3, 25), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (5, 15), (5, 16), (5, 17), (5, 19), (5, 21), (5, 22), (5, 23), (5, 24), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (6, 12), (6, 13), (6, 14), (6, 15), (6, 16), (6, 17), (6, 18), (6, 19), (6, 20), (6, 21), (6, 22), (6, 23), (6, 24), (6, 25), (7, 1), (7, 3), (7, 4), (7, 5), (7, 6), (7, 8), (7, 9), (7, 10), (7, 11), (7, 12), (7, 13), (7, 14),

In [103]:
from utils.head_tail_alignment import *

with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        # if (i, j)!=(1,1):
        #     continue
        en_subset, zh_subset, tbbt_episode = fetch_subsets_a4k(all_transcripts, all_en_subtitles, all_other_subtitles, i, j, zh_split=True)
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])


        head_tail_sub2epi = {}
        gap_pairs = fetch_before_after_a4k(en_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]
        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)
        # print("=="*50)
    except:
        print("Pass i j")

with open(root_path+'1_alignment_head_tail_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

with open(root_path+'1_alignment_ultimate_12.pkl', 'wb') as f:
    pkl.dump(ultimate_data, f)

1 1
1 3
1 4
1 5
1 7
1 9
1 11
1 12
1 13
1 14
1 15
1 17
1 18
1 19
1 21
1 22
2 1
2 2
2 3
2 4
2 6
2 7
2 8
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
3 1
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 15
3 18
3 19
3 20
3 21
3 22
3 23
3 24
3 25
5 1
5 2
5 3
5 4
5 5
5 6
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 19
5 21
5 22
5 23
5 24
6 1
6 2
6 3
6 4
6 5
6 6
6 7
6 8
6 9
6 12
6 13
6 14
6 15
6 16
6 17
6 18
6 19
6 20
6 21
6 22
6 23
6 24
6 25
7 1
7 3
7 4
7 5
7 6
7 8
7 9
7 10
7 11
7 12
7 13
7 14
7 15
7 16
7 17
7 18
7 19
7 20
7 21
7 22
7 23
7 24
8 24


In [104]:
for item in further_alignment:
    print(further_alignment[item])

{0: [42], 1: [43, 44], 2: [45], 3: [46], 293: [428], 299: [431]}
{2: [31], 255: [410], 256: [411]}
{231: [413], 232: [414], 233: [415, 417], 237: [423], 241: [428], 243: [430], 247: [437, 438], 250: [439, 444]}
{0: [30], 1: [31, 32, 33, 34, 35], 235: [427]}
{0: [30, 31], 1: [33, 34, 36], 2: [38], 3: [40, 41], 4: [42], 206: [367], 208: [368], 209: [369], 213: [374], 218: [379], 220: [381, 382], 222: [385, 386], 223: [388], 224: [390, 391]}
{0: [30, 31, 32], 220: [400], 221: [401]}
{257: [407], 258: [408, 409], 260: [411, 413, 414], 270: [425], 271: [427], 274: [432, 433], 275: [434, 435], 279: [440, 441], 280: [442]}
{1: [33, 34, 35], 247: [422], 249: [424], 250: [425], 252: [427, 428, 430]}
{0: [32], 239: [434]}
{0: [30], 181: [376], 186: [385], 188: [387]}
{232: [419], 233: [420], 234: [421, 422, 423, 424], 239: [428]}
{0: [52, 53], 269: [433], 270: [434]}
{2: [45, 46], 4: [48, 49], 5: [50], 10: [56], 12: [58, 59], 14: [61, 62], 15: [63, 64, 65], 18: [69], 21: [73], 26: [79], 28: [81]

In [105]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 7 0.017994858611825194 || Episode: 6 1.9047619047619049 0
(1, 3) Subtitle: 3 0.007894736842105263 || Episode: 3 1.1029411764705883 2
(1, 4) Subtitle: 11 0.3548387096774194 || Episode: 8 2.9850746268656714 231
(1, 5) Subtitle: 7 0.017632241813602016 || Episode: 3 1.1811023622047243 0
(1, 7) Subtitle: 21 0.05817174515235457 || Episode: 14 5.555555555555555 0
(1, 9) Subtitle: 5 0.013477088948787063 || Episode: 3 1.2396694214876034 0
(1, 11) Subtitle: 15 0.42857142857142855 || Episode: 9 3.0303030303030303 257
(1, 12) Subtitle: 9 0.022670025188916875 || Episode: 5 1.8518518518518516 1
(1, 13) Subtitle: 2 0.004975124378109453 || Episode: 2 0.78125 0
(1, 14) Subtitle: 4 0.011204481792717087 || Episode: 4 1.9417475728155338 0
(1, 15) Subtitle: 7 0.7777777777777778 || Episode: 4 1.550387596899225 232
(1, 17) Subtitle: 4 0.010471204188481676 || Episode: 3 1.0135135135135136 0
(1, 18) Subtitle: 24 0.06575342465753424 || Episode: 17 6.181818181818182 2
(1, 19) Subtitle: 6 0.01449

In [106]:
for item in ultimate_data:
    epi2sub = ultimate_data[item]
    sub2epi = turn_sub2epi_into_epi2sub(ultimate_data[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(ultimate_data[item]),  len(ultimate_data[item])/len(all_transcripts[item])*100, min(ultimate_data[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 367 0.9434447300771208 || Episode: 219 69.52380952380952 0
(1, 3) Subtitle: 352 0.9263157894736842 || Episode: 222 81.61764705882352 2
(1, 4) Subtitle: 353 0.8567961165048543 || Episode: 211 78.73134328358209 1
(1, 5) Subtitle: 389 0.9798488664987406 || Episode: 222 87.4015748031496 0
(1, 7) Subtitle: 319 0.8836565096952909 || Episode: 183 72.61904761904762 0
(1, 9) Subtitle: 335 0.9029649595687331 || Episode: 189 78.099173553719 0
(1, 11) Subtitle: 429 0.9088983050847458 || Episode: 236 79.46127946127946 0
(1, 12) Subtitle: 370 0.9319899244332494 || Episode: 218 80.74074074074075 1
(1, 13) Subtitle: 380 0.945273631840796 || Episode: 198 77.34375 0
(1, 14) Subtitle: 338 0.9467787114845938 || Episode: 176 85.43689320388349 0
(1, 15) Subtitle: 370 0.9367088607594937 || Episode: 198 76.74418604651163 4
(1, 17) Subtitle: 362 0.9476439790575916 || Episode: 241 81.41891891891892 0
(1, 18) Subtitle: 309 0.8465753424657534 || Episode: 207 75.27272727272727 2
(1, 19) Subtitle: 

In [102]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()
    print(ultimate_data[item])
    print("=="*50)

(1, 3)
{3: [31], 270: [374]}

{3: [31], 4: [32, 33], 6: [34, 35], 7: [36, 37], 8: [38], 9: [39, 40], 10: [41, 42, 43], 11: [44, 45], 12: [46], 13: [47], 14: [48, 49], 15: [50, 51], 16: [52], 17: [53, 54, 55, 56, 57], 19: [59], 20: [60, 61], 21: [62], 22: [63], 24: [65], 25: [66], 26: [68], 28: [69], 29: [71], 31: [72], 33: [74], 36: [76], 37: [77, 78, 79, 80], 39: [81, 82, 83], 41: [84, 85, 86], 42: [87], 43: [88], 44: [89, 90, 91, 92, 93, 94], 46: [95], 48: [97], 49: [98], 50: [99, 100], 51: [101], 52: [102], 53: [103], 54: [104], 55: [105], 56: [106], 57: [107], 58: [108, 109, 110, 111, 112, 113], 59: [114], 60: [115], 61: [116, 117], 63: [118], 64: [119], 66: [120], 67: [121], 68: [122], 69: [123], 70: [124], 72: [125, 126], 74: [128], 75: [129], 76: [130], 77: [131], 78: [132], 79: [133], 80: [134, 135], 81: [136], 82: [137], 85: [140, 141, 142], 86: [143, 144, 145], 87: [146], 88: [147, 148], 90: [149], 91: [150], 92: [151], 93: [152], 94: [153], 95: [154], 96: [155], 97: [156, 15

In [113]:
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)
print(len(alignment_seeds))

126


In [110]:
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    # print(i, j)
    # if (i, j)==(1,2):
    #     continue
    try:
        en_subset, zh_subset, tbbt_episode = fetch_subsets_a4k(all_transcripts, all_en_subtitles, all_other_subtitles, i, j, zh_split=True)
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])
        # Extend the neighbor
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
            epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Perform a set of extension
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Perform alignment within gaps
        while True:
            temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            gaps, _ = get_subset_in_gaps(epi2sub)
            epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

            epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

            if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
                break

        # Extend Subtitle ids with its min and max index
        for x in epi2sub:
            epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

        # Perform ultimate alignment
        gaps = get_final_stage_gap_pairs(epi2sub)
        epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

        further_alignment[(i,j)] = epi2sub
        print(i, j)
    except:
        pass

with open(root_path+'1_alignment_extension_12_again.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

1 3
1 4
1 5
1 9
1 12
1 13
1 15
1 17
1 18
1 21
1 22
2 1
2 2
2 3
2 4
2 8
2 11
2 12
2 14
2 15
2 16
2 17
2 18
2 19
2 21
2 23
3 3
3 4
3 6
3 8
3 9
3 10
3 12
3 13
3 19
5 2
5 3
5 4
5 10
5 15
5 16
5 21
5 23
5 24
6 3
6 4
6 7
6 8
6 19
6 20
6 22
6 25
7 4
7 5
7 6
7 8
7 9
7 15
7 17
7 20
7 21
7 22
7 23
8 24


In [112]:
print(len(further_alignment))

64


In [111]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 3) Subtitle: 352 0.9263157894736842 || Episode: 222 81.61764705882352 2
(1, 4) Subtitle: 371 0.9004854368932039 || Episode: 218 81.34328358208955 1
(1, 5) Subtitle: 450 0.9846827133479212 || Episode: 222 87.4015748031496 0
(1, 9) Subtitle: 399 0.925754060324826 || Episode: 190 78.51239669421489 0
(1, 12) Subtitle: 373 0.9395465994962217 || Episode: 222 82.22222222222221 1
(1, 13) Subtitle: 380 0.945273631840796 || Episode: 196 76.5625 0
(1, 15) Subtitle: 378 0.9545454545454546 || Episode: 198 76.74418604651163 4
(1, 17) Subtitle: 364 0.9528795811518325 || Episode: 242 81.75675675675676 0
(1, 18) Subtitle: 338 0.9234972677595629 || Episode: 230 83.63636363636363 2
(1, 21) Subtitle: 407 0.9644549763033176 || Episode: 191 84.51327433628319 0
(1, 22) Subtitle: 386 0.9346246973365617 || Episode: 230 79.3103448275862 6
(2, 1) Subtitle: 295 0.8939393939393939 || Episode: 159 69.1304347826087 1
(2, 2) Subtitle: 349 0.9509536784741145 || Episode: 158 72.81105990783409 0
(2, 3) Subtitle: 329

In [17]:
with open(root_path+'1_alignment_extension_12_again.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [12]:
from utils.head_tail_alignment import *

root_path = "results/friends_en_zh/"
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])

        head_tail_sub2epi = {}

        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)

        # print(temp)
        # print(filter_by_idx(epi2sub))
        # print("=="*50)
    except:
        print("Pass i j")

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 22
1 24
3 16
Pass i j
4 19
Pass i j
6 4
6 19
Pass i j
7 24
Pass i j
9 12


In [13]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()
    print(ultimate_data[item])
    print("=="*50)

(1, 1)
{}

{0: [199, 200], 1: [201, 202], 2: [203, 204], 3: [205], 4: [206], 5: [207, 208], 6: [209], 7: [210, 211], 8: [212], 9: [213, 214], 10: [215], 11: [216], 12: [217], 14: [218, 219], 15: [220], 16: [221, 222, 223], 18: [224], 20: [226], 21: [227], 22: [228], 24: [229], 27: [230, 231, 232], 28: [233], 29: [234, 235], 30: [236], 31: [237, 238], 32: [239], 33: [240, 241, 242, 243], 34: [244, 245], 35: [246], 36: [247, 248, 249, 250], 38: [251, 252, 253, 254], 39: [255, 256, 257], 40: [258, 259], 41: [260], 42: [261], 43: [262, 263, 264], 44: [265], 45: [266, 267, 268, 269, 270], 46: [271], 48: [272], 49: [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 53: [289, 290], 54: [291], 55: [292, 293, 294], 56: [295], 57: [296], 59: [297, 298, 299], 60: [300, 301, 302, 303, 304, 305, 306, 307], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313, 314], 65: [315, 316], 66: [317, 318, 319], 67: [320], 68: [321], 69: [322, 323, 324], 70: [32

In [11]:
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(all_transcripts[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 429 0.988479262672811 || Episode: 231 315 73.33333333333333 5
(1, 2) Subtitle: 294 0.8936170212765957 || Episode: 182 254 71.65354330708661 9
(1, 3) Subtitle: 216 0.9113924050632911 || Episode: 142 272 52.20588235294118 56
(1, 4) Subtitle: 295 0.855072463768116 || Episode: 199 268 74.25373134328358 3
(1, 5) Subtitle: 320 0.9384164222873901 || Episode: 189 254 74.40944881889764 2
(1, 6) Subtitle: 224 0.9068825910931174 || Episode: 140 229 61.135371179039296 49
(1, 7) Subtitle: 256 0.8648648648648649 || Episode: 159 252 63.095238095238095 12
(1, 8) Subtitle: 256 0.8737201365187713 || Episode: 187 246 76.01626016260163 12
(1, 9) Subtitle: 304 0.9047619047619048 || Episode: 181 242 74.79338842975206 4
(1, 10) Subtitle: 262 0.8704318936877077 || Episode: 150 241 62.24066390041494 40
(1, 11) Subtitle: 328 0.9291784702549575 || Episode: 220 297 74.07407407407408 0
(1, 12) Subtitle: 268 0.9337979094076655 || Episode: 173 270 64.07407407407408 2
(1, 13) Subtitle: 305 0.92145015

ZeroDivisionError: division by zero

In [59]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()

(1, 1)
{5: [207, 208], 6: [209], 7: [210, 211], 8: [212], 9: [213, 214], 10: [215], 11: [216], 12: [217], 14: [218, 219], 15: [220], 16: [221, 222, 223], 18: [224], 20: [226], 21: [227], 22: [228], 24: [229], 27: [230, 231, 232], 28: [233], 29: [234, 235], 30: [236], 31: [237, 238], 32: [239], 33: [240, 241, 242, 243], 34: [244, 245], 35: [246], 36: [247, 248, 249, 250], 38: [251, 252, 253, 254], 39: [255, 256, 257], 40: [258, 259], 41: [260], 42: [261], 43: [262, 263, 264], 44: [265], 45: [266, 267, 268, 269, 270], 46: [271], 48: [272], 49: [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 53: [289, 290], 54: [291], 55: [292, 293, 294], 56: [295], 57: [296], 59: [297, 298, 299], 60: [300, 301, 302, 303, 304, 305, 306, 307], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313, 314], 65: [315, 316], 66: [317, 318, 319], 67: [320], 68: [321], 69: [322, 323, 324], 70: [325, 326, 327], 71: [328, 329], 72: [330], 73: [331], 74: [332], 75: [3