In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [28]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_fa/fa_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [29]:
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

In [30]:
# Load alignment results after stage-2
with open('alignment_results/fa/indexs_tbbt_fa.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

## Perform fine-grained alignment

### Part 0: Load Data

In [31]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

#### Part 1: String Match with sliding window

In [7]:
# Part 1: String Match with sliding window
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [78]:
result_0_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=zh_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=9)
            result_0_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 50 Subtitle Number: 45
Season: 1 Episode: 2 Episode Number: 57 Subtitle Number: 46
Season: 1 Episode: 3 Episode Number: 42 Subtitle Number: 40
Season: 1 Episode: 4 Episode Number: 51 Subtitle Number: 41
Season: 1 Episode: 5 Episode Number: 49 Subtitle Number: 41
Season: 1 Episode: 6 Episode Number: 49 Subtitle Number: 47
Season: 1 Episode: 7 Episode Number: 52 Subtitle Number: 49
Season: 1 Episode: 8 Episode Number: 58 Subtitle Number: 50
Season: 1 Episode: 9 Episode Number: 38 Subtitle Number: 33
Season: 1 Episode: 10 Episode Number: 60 Subtitle Number: 52
Season: 1 Episode: 11 Episode Number: 38 Subtitle Number: 36
Season: 1 Episode: 12 Episode Number: 53 Subtitle Number: 46
Season: 1 Episode: 13 Episode Number: 57 Subtitle Number: 54
Season: 1 Episode: 14 Episode Number: 51 Subtitle Number: 42
Season: 1 Episode: 15 Episode Number: 47 Subtitle Number: 43
Season: 1 Episode: 16 Episode Number: 47 Subtitle Number: 40
Season: 2 Episode: 1 Episode Numb

In [79]:
with open('alignment_part_1_string_match.pkl', 'wb') as f:
    pkl.dump(result_0_all, f)

In [11]:
with open('alignment_part_1_string_match.pkl', 'rb') as f:
    result_0_all = pkl.load(f)

### Filter the indexs obtained using sliding window string match

### Part 2: Strict Match

In [327]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

In [32]:
def exact_match(en_subset, episode):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        if len(subtitle.strip().split(" ")) <=5:
            continue
        # Exact Match for short sentences
        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            if subtitle == utt:
                if i not in res:
                    res[i] = set()
                    res[i].add(j)
                else:
                    res[i].add(j)
    output = {}
    for x in res:
        output[x] = sorted(list(res[x]))

    return output

In [106]:
result_1_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=zh_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = exact_match(en_subset, tbbt_episode)
            result_1_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(turn_sub2epi_into_epi2sub(temp)), "Subtitle Number:",len(temp))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 31 Subtitle Number: 31
Season: 1 Episode: 2 Episode Number: 23 Subtitle Number: 23
Season: 1 Episode: 3 Episode Number: 17 Subtitle Number: 17
Season: 1 Episode: 4 Episode Number: 13 Subtitle Number: 13
Season: 1 Episode: 5 Episode Number: 12 Subtitle Number: 12
Season: 1 Episode: 6 Episode Number: 29 Subtitle Number: 29
Season: 1 Episode: 7 Episode Number: 22 Subtitle Number: 22
Season: 1 Episode: 8 Episode Number: 31 Subtitle Number: 31
Season: 1 Episode: 9 Episode Number: 21 Subtitle Number: 21
Season: 1 Episode: 10 Episode Number: 25 Subtitle Number: 25
Season: 1 Episode: 11 Episode Number: 25 Subtitle Number: 25
Season: 1 Episode: 12 Episode Number: 18 Subtitle Number: 18
Season: 1 Episode: 13 Episode Number: 15 Subtitle Number: 15
Season: 1 Episode: 14 Episode Number: 17 Subtitle Number: 17
Season: 1 Episode: 15 Episode Number: 24 Subtitle Number: 24
Season: 1 Episode: 16 Episode Number: 19 Subtitle Number: 19
Season: 2 Episode: 1 Episode Numb

In [107]:
with open('alignment_part_2_strict_match.pkl', 'wb') as f:
    pkl.dump(result_1_all, f)

In [13]:
with open('alignment_part_2_strict_match.pkl', 'rb') as f:
    result_1_all = pkl.load(f)

### Converge part 1 and part 2 result as the seeds

In [9]:
def merge_two_dict(dict_1, dict_2):
    res = deepcopy(dict_1)
    for sea_epi in dict_2:
        if sea_epi not in res:
            res[sea_epi] = dict_2[sea_epi]
        else:
            temp = {}
            for x in res[sea_epi]:
                temp[x] = list(res[sea_epi][x])
            # add dict 2
            if sea_epi in dict_2:
                for x in dict_2[sea_epi]:
                    if x not in temp:
                        temp[x] = sorted(list(set(dict_2[sea_epi][x])))
                    else:
                        temp[x].extend(dict_2[sea_epi][x])
                    temp[x] = sorted(list(set(temp[x])))
            res[sea_epi] = temp

    output = {}
    for x in res:
        if res[x]!={}:
            output[x] = res[x]
    return output

In [10]:
# Perform index filtering on the alignment seeds
"""
Filter indexs based on the index before and after
"""
def filter_by_idx(sub2epi):
    paris = []
    for x in sorted(list(sub2epi.keys())):
        for y in sorted(sub2epi[x]):
            paris.append([x,y])

    res = [paris[0]]
    for i in range(1, len(paris)-1):
        former = res[-1]
        current = paris[i]
        after = paris[i+1]
        if former[0] <= current[0] <= after[0]:
            if former[1]<=current[1]<=after[1]:
                res.append(current)
    if paris[-1][0] >= res[-1][0]:
        if paris[-1][1] >= res[-1][1]:
            res.append(paris[-1])

    output = {}
    for x in res:
        sub = x[0]
        epi = x[1]
        if sub not in output:
            output[sub] = [epi]
        else:
            output[sub].append(epi)

    return output

In [16]:
alignment_seeds = {}
temp = merge_two_dict(result_0_all, result_1_all)
for x in temp:
    alignment_seeds[x] = filter_by_idx(temp[x])

In [240]:
with open('alignment_seeds.pkl', 'wb') as f:
    pkl.dump(alignment_seeds, f)

In [11]:
with open('alignment_seeds.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [12]:
temp = []
for x in alignment_seeds:
    temp.append(x)

## Part 2: Extend from Alignment Seeds

In [58]:
def extend_neighbors(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        epi = transformation(episode[epi_id][0])

        if sub_former in epi:
            sub_ids.append(sub_id_former)
            # print(epi_id, sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
            # print(epi_id, sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [59]:
def add_strict_match_within_gaps(gaps, epi2sub):
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for sub_id in sub_ids:
            sub = transformation(en_subset[sub_id].replace("’", "'"))
            for epi_id in epi_ids:
                epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
                if len(epi.strip().split(" "))<=2:
                    continue
                if sub == epi:

                    if epi_id not in epi2sub:
                        epi2sub[epi_id] = [sub_id]
                    else:
                        epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [60]:
def get_optimal_wer_from_episode(ground_truth, hypothesis_pool):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [61]:
def add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for epi_id in epi_ids:
            best_score = 100
            best_pair = [None, None]
            epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
            if len(epi.strip().split(" "))<=2:
                continue
            for sub_id in sub_ids:
                sub = transformation(en_subset[sub_id].replace("’", "'"))
                score = jiwer.compute_measures(epi, sub)['wer']
                if score < best_score:
                    best_score = score
                    best_pair = [epi_id, sub_id]
            if best_score < 0.15:
                count += 1
                if best_pair[0] not in epi2sub:
                    epi2sub[best_pair[0]] = [best_pair[1]]
                else:
                    epi2sub[best_pair[0]].append([best_pair[1]])
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [62]:
def add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    pattern = r'\.|\?|\!|\;|- '
    temp_sub2epi = {}
    for gap in gaps:
        # Build substrings
        sub_ids = gap[1]
        epi_ids = gap[0]
        sub_lists = []
        epi_lists = []

        for epi_id in epi_ids:
            epi = tbbt_episode[epi_id][0].replace("’", "'")
            epi_substring = re.split(pattern, epi)
            for item in epi_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                epi_lists.append([temp_item, epi_id])

        for sub_id in sub_ids:
            sub = en_subset[sub_id].replace("’", "'")
            sub_substring = re.split(pattern, sub)
            for item in sub_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                sub_lists.append([temp_item, sub_id])


        # Calculate WER Similarity
        for (sub, sub_id) in sub_lists:
            for (epi, epi_id) in epi_lists:
                cer = jiwer.cer(epi, sub)
                if cer <= 0.2:
                    count += 1
                    if sub_id not in temp_sub2epi:
                        temp_sub2epi[sub_id] = set()
                        temp_sub2epi[sub_id].add(epi_id)
                    else:
                        temp_sub2epi[sub_id].add(epi_id)

    for sub_id in temp_sub2epi:
        epi_ids = list(temp_sub2epi[sub_id])
        if len(epi_ids)!=1:
            continue
        epi_id = epi_ids[0]
        if epi_id not in epi2sub:
            epi2sub[epi_id] = [sub_id]
        else:
            epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [63]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [64]:
def extend_neighbors_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [65]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_neighbors_subtitle_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [66]:
def get_alignment_extend_neighbors(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(season_id,episode_id)]), tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Add within the gap
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_one_size_gap(gaps, epi2sub)

    # # Further extend neighbors
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [74]:
with open('alignment_results/fa/alignment_seeds.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [79]:
# Clean Alignment
cleaned_alignment_seeds = {}
for x in alignment_seeds:
    temp = {}
    sub2epi = alignment_seeds[x]
    for sub_id in sub2epi:
        if len(sub2epi[sub_id])==1:
            temp[sub_id] = sub2epi[sub_id]
    epi2sub = turn_sub2epi_into_epi2sub(temp)
    cleaned_alignment_seeds[x] = epi2sub

In [80]:
alignment_seeds = deepcopy(cleaned_alignment_seeds)

In [81]:
for x in alignment_seeds:
    print(x)
    print(alignment_seeds[x])

(1, 1)
{200: [0], 217: [9], 229: [15], 231: [16], 234: [17], 242: [28], 248: [34], 250: [35], 267: [63], 269: [65], 284: [75], 294: [88], 295: [89], 296: [90], 303: [96], 309: [102], 314: [108], 326: [117], 342: [134], 343: [135], 358: [145], 361: [148], 370: [154], 373: [155], 379: [160], 388: [162], 390: [164], 395: [171], 398: [173], 401: [175], 402: [176], 403: [177], 409: [183], 412: [186], 420: [193], 427: [201], 432: [207], 433: [208], 439: [210], 444: [215], 456: [226], 457: [228], 464: [232], 470: [237], 473: [240], 476: [241], 478: [242], 482: [245], 496: [258], 497: [259], 501: [262], 509: [270], 514: [275], 519: [280], 532: [288], 539: [295], 541: [296], 542: [297], 548: [302], 550: [304], 556: [309], 561: [314]}
(1, 2)
{167: [2], 186: [18], 188: [20], 193: [25], 209: [39], 213: [42], 228: [49], 232: [52], 234: [54], 241: [60], 244: [61], 249: [64], 252: [65], 261: [71], 262: [72], 281: [82], 288: [90], 292: [93], 293: [94], 299: [105], 309: [115], 310: [116], 311: [117], 3

In [82]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment_extend_neighbors(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 62 63 Subtitle Number: 62 63
Season: 1 Episode: 2 Episode Number: 54 54 Subtitle Number: 54 54
Season: 1 Episode: 3 Episode Number: 40 40 Subtitle Number: 40 40
Season: 1 Episode: 4 Episode Number: 45 45 Subtitle Number: 45 45
Season: 1 Episode: 5 Episode Number: 50 50 Subtitle Number: 50 50
Season: 1 Episode: 6 Episode Number: 63 64 Subtitle Number: 63 64
Season: 1 Episode: 7 Episode Number: 60 60 Subtitle Number: 60 60
Season: 1 Episode: 8 Episode Number: 61 61 Subtitle Number: 61 62
Season: 1 Episode: 9 Episode Number: 40 40 Subtitle Number: 40 40
Season: 1 Episode: 10 Episode Number: 63 64 Subtitle Number: 63 64
Season: 1 Episode: 11 Episode Number: 48 49 Subtitle Number: 48 49
Season: 1 Episode: 12 Episode Number: 39 39 Subtitle Number: 39 39
Season: 1 Episode: 13 Episode Number: 69 69 Subtitle Number: 69 69
Season: 1 Episode: 14 Episode Number: 58 58 Subtitle Number: 58 59
Season: 1 Episode: 15 Episode Number: 56 56 Subtitle Number: 56 58
Seas

In [83]:
for x in further_alignment:
    print(x)
    print(further_alignment[x])
    print("=="*50)

(1, 1)
{0: [200], 9: [217], 15: [229], 16: [231], 17: [234], 28: [242], 34: [248], 35: [250], 63: [267], 65: [269], 75: [284], 88: [294], 89: [295], 90: [296], 96: [303], 102: [309], 108: [314], 115: [318], 117: [326], 134: [342], 135: [343], 145: [358], 148: [361], 154: [370], 155: [373], 160: [379], 162: [388], 164: [390], 171: [395], 173: [398], 175: [401], 176: [402], 177: [403], 183: [409], 186: [412], 193: [420], 201: [427], 207: [432], 208: [433], 210: [439], 215: [444], 226: [456], 228: [457], 232: [464], 237: [470], 240: [473], 241: [476], 242: [478], 245: [482], 258: [496], 259: [497], 262: [501], 270: [509], 275: [514], 280: [519], 288: [532], 295: [539], 296: [541], 297: [542], 302: [548], 304: [550], 309: [556], 314: [561]}
(1, 2)
{2: [167], 18: [186], 20: [188], 25: [193], 39: [209], 42: [213], 49: [228], 52: [232], 54: [234], 60: [241], 61: [244], 64: [249], 65: [252], 71: [261], 72: [262], 82: [281], 90: [288], 93: [292], 94: [293], 105: [299], 115: [309], 116: [310], 1

In [84]:
alignment_seeds = deepcopy(further_alignment)

In [85]:
# Clean Alignment
cleaned_alignment_seeds = {}
for x in alignment_seeds:
    temp = {}
    sub2epi = alignment_seeds[x]
    for sub_id in sub2epi:
        if len(sub2epi[sub_id])==1:
            temp[sub_id] = sub2epi[sub_id]
    epi2sub = turn_sub2epi_into_epi2sub(temp)
    cleaned_alignment_seeds[x] = epi2sub

In [86]:
with open('alignment_results/zh/cleaned_final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [87]:
with open('alignment_results/zh/cleaned_final_stage_alignment.pkl', 'rb') as f:
    final_stage_alignment = pkl.load(f)

In [88]:
def get_alignment_extend_neighbors(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    epi2sub = alignment_seeds[(season_id,episode_id)]
    # Extend the neighbors
    while True:
        temp = extend_neighbors(en_subset, epi2sub, tbbt_episode)
        if len(temp)==len(epi2sub) and len(turn_sub2epi_into_epi2sub(temp))==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break
        else:
            epi2sub = temp

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # # Add within the gap
    # gaps, abandons = get_subset_in_gaps(epi2sub)
    # epi2sub = add_one_size_gap(gaps, epi2sub)

    # # Further extend neighbors
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    # epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub


In [89]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment_extend_neighbors(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 63 63 Subtitle Number: 63 63
Season: 1 Episode: 2 Episode Number: 54 54 Subtitle Number: 54 54
Season: 1 Episode: 3 Episode Number: 40 40 Subtitle Number: 40 40
Season: 1 Episode: 4 Episode Number: 45 45 Subtitle Number: 45 45
Season: 1 Episode: 5 Episode Number: 50 50 Subtitle Number: 50 50
Season: 1 Episode: 6 Episode Number: 64 64 Subtitle Number: 64 64
Season: 1 Episode: 7 Episode Number: 60 60 Subtitle Number: 60 60
Season: 1 Episode: 8 Episode Number: 62 61 Subtitle Number: 61 62
Season: 1 Episode: 9 Episode Number: 40 40 Subtitle Number: 40 40
Season: 1 Episode: 10 Episode Number: 64 64 Subtitle Number: 64 64
Season: 1 Episode: 11 Episode Number: 49 49 Subtitle Number: 49 49
Season: 1 Episode: 12 Episode Number: 39 39 Subtitle Number: 39 39
Season: 1 Episode: 13 Episode Number: 69 69 Subtitle Number: 69 69
Season: 1 Episode: 14 Episode Number: 59 58 Subtitle Number: 58 59
Season: 1 Episode: 15 Episode Number: 58 56 Subtitle Number: 56 58
Seas

## Part 3: Final Stage Alignment

In [94]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(" ")
    substrings = []
    for i in range(len(input_tokens)-window_size):
        substrings.append(" ".join(input_tokens[i: i+window_size]))
    return substrings

In [95]:
def get_final_stage_gap_pairs(epi2sub):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Perform string match and CER Scoring
    return subtitle_gaps

In [96]:
"""
Explore the neighbor subtitles of a episode.

Given a episode utterance (epi_id), then we fetch the unaligned subtitle (sub_id)
[epi_id_0, epi_id_1, etc., epi_id_n] - [sub_id_0, sub_id_1, etc. sub_id_m]

Then, we search within the subset-pair

For each subtitle, we use sliding window to fetch a set of substrings in each episode utterance and calculate the CER
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    count = 0
    # Gather the gap of subtitle corresponding to episode utterance
    subtitle_gaps = get_final_stage_gap_pairs(epi2sub)

    # Iterate the whole subtitle gaps to perform substring match
    temp_epi2sub = deepcopy(epi2sub)
    for item in subtitle_gaps:
        epi_ids = [i for i in range(item[0], item[1]+1)]
        sub_ids = [i for i in subtitle_gaps[item]]

        # Fetch all episodes and subtitles
        epis = [transformation(tbbt_episode[i][0].replace("’", " ").replace('…', " ")) for i in epi_ids]
        subs = [transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids]

        # print(epi_ids)
        # print(epis)
        # print(sub_ids)
        # print(subs)
        for sub_id, sub in zip(sub_ids, subs):
            sub_len = len(sub.strip().split(" "))
            if sub_len <=3:
                continue
            min_score = float('inf')
            min_substring = ""
            source_episode = ""
            source_sub_id = float('inf')
            source_epi_id = float('inf')
            for epi, epi_id in zip(epis, epi_ids):
                # min_score = float('inf')
                # min_substring = ""
                epi_substrings = get_sliding_window_substrings(epi, window_size=sub_len)
                for substring in epi_substrings:
                    wer = jiwer.wer(sub, substring)
                    if wer <= min_score:
                        min_score = wer
                        min_substring = substring
                        source_episode = epi
                        source_epi_id = epi_id
            if min_score <= 0.5:
                if source_epi_id not in temp_epi2sub:
                    temp_epi2sub[source_epi_id] = [sub_id]
                else:
                    temp_epi2sub[source_epi_id].append(sub_id)
                # print(sub_id+2, source_epi_id+2)
                # print("Subtitle:", sub_id+2, sub, len(sub.strip().split(" ")))
                # print("All Episode to compare:")
                # print(min_score, '|', min_substring, '|', source_episode)
                # print('=='*50)
                # count += 1
    # print(temp_epi2sub)
    # print(len(temp_epi2sub), len(turn_sub2epi_into_epi2sub(temp_epi2sub)))
    # print(count)
    output = {}
    for epi_id in sorted(list(temp_epi2sub.keys())):
        output[epi_id] = sorted(list(set(temp_epi2sub[epi_id])))
    return output

# epi2sub_0 = epi2sub
# print(len(epi2sub_0), len(turn_sub2epi_into_epi2sub(epi2sub_0)))
# epi2sub_1 = extend_neighbors_episode_sliding(en_subset, epi2sub_0, tbbt_episode)
# print(len(epi2sub_1), len(turn_sub2epi_into_epi2sub(epi2sub_1)))
# epi2sub_2 = extend_neighbors_episode_sliding(en_subset, epi2sub_1, tbbt_episode)
# print(len(epi2sub_2), len(turn_sub2epi_into_epi2sub(epi2sub_2)))

In [97]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    epi2sub = alignment_seeds[(season_id,episode_id)]
    # Extend the neighbors
    while True:
        temp = extend_neighbors(en_subset, epi2sub, tbbt_episode)
        if len(temp)==len(epi2sub) and len(turn_sub2epi_into_epi2sub(temp))==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break
        else:
            epi2sub = temp

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [99]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:", len(alignment_seeds[(i,j)]),len(temp), "|", "Subtitle Number:", len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])),len(turn_sub2epi_into_epi2sub(temp)))

Season: 1 Episode: 1 Episode Number: 63 66 | Subtitle Number: 63 68
Season: 1 Episode: 2 Episode Number: 54 55 | Subtitle Number: 54 55
Season: 1 Episode: 3 Episode Number: 40 42 | Subtitle Number: 40 43
Season: 1 Episode: 4 Episode Number: 45 45 | Subtitle Number: 45 45
Season: 1 Episode: 5 Episode Number: 50 51 | Subtitle Number: 50 52
Season: 1 Episode: 6 Episode Number: 64 64 | Subtitle Number: 64 64
Season: 1 Episode: 7 Episode Number: 60 61 | Subtitle Number: 60 62
Season: 1 Episode: 8 Episode Number: 61 64 | Subtitle Number: 62 67
Season: 1 Episode: 9 Episode Number: 40 40 | Subtitle Number: 40 41
Season: 1 Episode: 10 Episode Number: 64 65 | Subtitle Number: 64 67
Season: 1 Episode: 11 Episode Number: 49 49 | Subtitle Number: 49 49
Season: 1 Episode: 12 Episode Number: 39 39 | Subtitle Number: 39 39
Season: 1 Episode: 13 Episode Number: 69 69 | Subtitle Number: 69 69
Season: 1 Episode: 14 Episode Number: 58 58 | Subtitle Number: 59 61
Season: 1 Episode: 15 Episode Number: 56 58

In [100]:
with open('alignment_results/fa/final_stage_alignment_1.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [102]:
with open('alignment_results/fa/final_stage_alignment_1.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [None]:
for x in further_alignment:
    print(x, len(further_alignment[x]), len(turn_sub2epi_into_epi2sub(further_alignment[x])))

## Part 4: Ultimate Stage
In this stage:
1. Add the head and tail part
2. add the alignment in the gap

In [408]:
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [411]:
for x in alignment_seeds[(1,1)]:
    print(x, alignment_seeds[(1,1)][x])

0 [200, 201, 202, 203]
1 [204]
2 [205]
5 [206, 207, 208, 209, 210, 211]
6 [212]
7 [213]
8 [214]
9 [215]
10 [216]
11 [217]
12 [218, 219]
13 [220]
14 [221]
15 [222, 223, 224]
16 [225, 226]
17 [227, 228, 229]
18 [230]
19 [231]
20 [232]
21 [233]
22 [234]
23 [235]
24 [236]
28 [239]
30 [240]
31 [241]
32 [242]
33 [243, 244]
34 [246, 247]
35 [248]
36 [249]
39 [252]
40 [253]
45 [256]
46 [257]
47 [258, 259]
54 [262]
63 [266]
64 [267]
65 [268]
66 [269]
67 [270]
68 [271]
69 [272, 273]
70 [274, 275]
71 [276]
72 [277]
73 [278, 279]
74 [280]
75 [281, 282]
76 [283]
82 [285, 286]
83 [287]
84 [288]
86 [289]
88 [290]
89 [291]
90 [292]
93 [293]
94 [294]
96 [295, 296]
97 [297]
98 [298]
99 [299]
100 [300]
102 [301, 302]
104 [303]
105 [304]
106 [305]
107 [306]
108 [307]
109 [308]
111 [311]
112 [312]
115 [313]
116 [314, 315, 316, 317]
117 [318]
119 [319]
123 [321, 322]
124 [323]
126 [325]
127 [326]
128 [327]
131 [329]
132 [330]
133 [331, 332]
134 [333]
135 [334]
136 [335]
137 [336]
138 [337, 338, 339, 340]
13

In [None]:
"""
Add the head and tail alignment if it is not complete
"""


In [538]:
def fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub):
    gap_pairs = []
    # Fetch the Episode-Subtitle Pair before the episode start
    epi_ids = list(range(0, min(epi2sub.keys())))
    num_epi_token = 0
    for i in epi_ids:
        num_epi_token += len(tbbt_episode[i][0].strip().split(" "))

    sub_ids = [min(epi2sub[min(epi2sub.keys())])]
    num_sub_token = len(en_subset[sub_ids[-1]].strip().split(" "))
    while num_sub_token <= num_epi_token*3:
        sub_ids.append(sub_ids[-1]-1)
        num_sub_token += len(en_subset[sub_ids[-1]].strip().split(" "))
    gap_pairs.append([epi_ids, sorted(sub_ids)])

    # Fetch the Episode-Subtitle Pair after the episode end
    epi_ids = list(range(max(epi2sub.keys())+1, len(tbbt_episode)))
    num_epi_token = 0
    for i in epi_ids:
        num_epi_token += len(tbbt_episode[i][0].strip().split(" "))

    sub_ids = [max(epi2sub[max(epi2sub.keys())])+1]
    num_sub_token = len(en_subset[sub_ids[-1]].strip().split(" "))
    while num_sub_token <= num_epi_token*3:
        sub_ids.append(sub_ids[-1]+1)
        num_sub_token += len(en_subset[sub_ids[-1]].strip().split(" "))
    gap_pairs.append([epi_ids, sub_ids])

    return gap_pairs

In [539]:
def before_after_exact_match(en_subset, episode, epi_ids, sub_ids):
    res = {}
    for sub_id in sub_ids:
        subtitle = transformation(en_subset[sub_id])
        if len(subtitle.strip().split(" ")) <=5:
            continue
        # Exact Match for short sentences
        for epi_id in epi_ids:
            utt = transformation(episode[epi_id][0])
            speaker = episode[epi_id][1]
            if subtitle == utt:
                if i not in res:
                    res[i] = set()
                    res[i].add(j)
                else:
                    res[i].add(j)
    output = {}
    for x in res:
        output[x] = sorted(list(res[x]))

    return output

In [540]:
def before_after_string_match_sliding_window_no_filter(en_subset, episode, epi_ids, sub_ids, window_size=5):
    res = {}
    for sub_id in sub_ids:
        subtitle = transformation(en_subset[sub_id])
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for epi_id in epi_ids:
            utt = transformation(episode[epi_id][0])
            speaker = episode[epi_id][1]
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [None]:
def add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    pattern = r'\.|\?|\!|\;|- '
    temp_sub2epi = {}
    for gap in gaps:
        # Build substrings
        sub_ids = gap[1]
        epi_ids = gap[0]
        sub_lists = []
        epi_lists = []

        for epi_id in epi_ids:
            epi = tbbt_episode[epi_id][0].replace("’", "'")
            epi_substring = re.split(pattern, epi)
            for item in epi_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                epi_lists.append([temp_item, epi_id])

        for sub_id in sub_ids:
            sub = en_subset[sub_id].replace("’", "'")
            sub_substring = re.split(pattern, sub)
            for item in sub_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                sub_lists.append([temp_item, sub_id])


        # Calculate WER Similarity
        for (sub, sub_id) in sub_lists:
            for (epi, epi_id) in epi_lists:
                cer = jiwer.cer(epi, sub)
                if cer <= 0.2:
                    count += 1
                    if sub_id not in temp_sub2epi:
                        temp_sub2epi[sub_id] = set()
                        temp_sub2epi[sub_id].add(epi_id)
                    else:
                        temp_sub2epi[sub_id].add(epi_id)

    for sub_id in temp_sub2epi:
        epi_ids = list(temp_sub2epi[sub_id])
        if len(epi_ids)!=1:
            continue
        epi_id = epi_ids[0]
        if epi_id not in epi2sub:
            epi2sub[epi_id] = [sub_id]
        else:
            epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [622]:
def get_optimal_wer_from_episode(ground_truth, hypothesis_pool):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [662]:
def get_optimal_cer_from_episode(ground_truth, hypothesis_pool, utt_ids):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.cer(ground_truth, hypothesis))
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, utt_ids[scores.index(min(scores))]

In [781]:
def before_after_wer_match(en_subset, episode, epi_ids, sub_ids):
    # Load Sub2Epi
    temp_sub2epi = {}
    temp_epi2sub = {}
    for sub_id in sub_ids:
        subtitle = transformation(en_subset[sub_id].replace("’", " ").replace('…', " "))
        subtitle_tokens = subtitle.strip().split(" ")
        sub_len = len(subtitle_tokens)
        if sub_len <= 4:
            continue

        utt_segments = []
        utt_ids = []
        for epi_id in epi_ids:
            utt = transformation(episode[epi_id][0].replace("’", " ").replace('…', " "))
            utt_tokens = utt.strip().split(" ")
            for j in range(len(utt_tokens) - sub_len+1):
                utt_segments.append(" ".join(utt_tokens[j: j + sub_len]))
                utt_ids.append(epi_id)
        if utt_segments != [] and subtitle not in [" ", ""]:
            score, hypo, truth, index = get_optimal_cer_from_episode(subtitle, utt_segments, utt_ids)
            if score >= 0.5:
                continue
            # print("**********")
            # print("Score:", score)
            # print("Subtitle:", truth)
            # print("Episode:", hypo)
            # print("Epi ID:", index, "Sub ID:", sub_id)
            temp_sub2epi[sub_id] = [index]
            if index not in temp_epi2sub:
                temp_epi2sub[index] = [sub_id]
            else:
                temp_epi2sub[index].append(sub_id)
    # print("Sub2Epi:", temp_sub2epi)
    # print("Epi2Sub:", temp_epi2sub)
    return temp_sub2epi

In [785]:
def filter_alignment(epi2sub):
    epi_ids = list(epi2sub.keys())
    if len(epi_ids)<2:
        return epi2sub

    temp = {}
    for i in range(len(epi_ids)-1):
        if max(further_alignment[x][epi_ids[i]]) <= min(further_alignment[x][epi_ids[i+1]]):
            temp[epi_ids[i]] = further_alignment[x][epi_ids[i]]
        # print(epi_id, further_alignment[x][epi_id], min(further_alignment[x][epi_ids[i+1]]) >= max(further_alignment[x][epi_ids[i]]))
    temp[epi_ids[-1]] = epi2sub[epi_ids[-1]]
    return temp

In [794]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    head_tail_sub2epi = {}

    epi2sub = alignment_seeds[(season_id,episode_id)]
    print(min(epi2sub.keys()), max(epi2sub.keys()), len(tbbt_episode)-1, len(tbbt_episode)-max(epi2sub.keys())-1)

    # Fetch episode-subtitle alignment pairs before and after the head and tail
    gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
    for item in gap_pairs:
        # print(item)
        # print("Episode:")
        # for i in item[0]:
        #     print(transformation(tbbt_episode[i][0].replace("’", " ").replace('…', " ")))
        # print()
        # print("Subtitle:")
        # for i in item[1]:
        #     print(transformation(en_subset[i].replace("’", " ").replace('…', " ")))
        # print("------")
        temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
        for x in temp:
            head_tail_sub2epi[x] = temp[x]

    # Clean Alignment
    head_tail_epi2sub = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
    print(head_tail_epi2sub)
    # head_tail_epi2sub = filter_alignment(head_tail_epi2sub)
    # print(head_tail_epi2sub)


    return head_tail_sub2epi

    # for item in gap_pairs:
    #     temp = before_after_string_match_sliding_window_no_filter(en_subset, tbbt_episode, item[0], item[1], 200)
    #     print(temp)


    # # Extend the neighbors
    # while True:
    #     temp = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    #     if len(temp)==len(epi2sub) and len(turn_sub2epi_into_epi2sub(temp))==len(turn_sub2epi_into_epi2sub(epi2sub)):
    #         break
    #     else:
    #         epi2sub = temp
    #
    # # Extend within gap using strict string match
    # gaps, abandons = get_subset_in_gaps(epi2sub)
    # epi2sub = add_strict_match_within_gaps(gaps, epi2sub)
    #
    # # Extend within gap using wer
    # gaps, abandons = get_subset_in_gaps(epi2sub)
    # epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)
    #
    # # Extend within gap using substring cer
    # gaps, abandons = get_subset_in_gaps(epi2sub)
    # epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)
    #
    # # Extend the neighbors
    # epi2sub = extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

    # return epi2sub

In [807]:
further_alignment = {}
for (i, j) in list(alignment_seeds.keys()):
    print(i, j)
    if i==1 and j==12:
        continue
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = turn_sub2epi_into_epi2sub(temp)
    # print("Season:", i,"Episode:", j, "Episode Number:", len(alignment_seeds[(i,j)]),len(temp), "|", "Subtitle Number:", len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])),len(turn_sub2epi_into_epi2sub(temp)))
    print("=="*50)

head_tail_alignment = {}
for x in further_alignment:
    print(x)
    print(further_alignment[x])
    print(filter_alignment(further_alignment[x]))
    print()
    head_tail_alignment[x] = filter_alignment(further_alignment[x])

1 1
0 314 314 0
{}
1 2
2 232 234 2
{0: [166], 1: [167]}
1 3
3 240 247 7
{0: [189, 190], 241: [506], 242: [507, 509, 510], 243: [511], 245: [513]}
1 4
1 215 223 8
{217: [485, 486], 220: [488]}
1 5
0 220 222 2
{221: [475, 476], 222: [477]}
1 6
2 238 250 12
{0: [197, 198], 242: [495, 496], 244: [498], 245: [499], 246: [500], 247: [501], 248: [502], 249: [503], 250: [505, 506, 507]}
1 7
10 275 283 8
{0: [180], 1: [182], 3: [183], 8: [189, 190, 191], 9: [193], 276: [514], 279: [519], 280: [520, 521], 281: [522]}
1 8
2 263 263 0
{}
1 9
1 218 224 6
{}
1 10
0 223 224 1
{224: [481, 482, 483]}
1 11
4 245 245 0
{2: [190, 191]}
1 12
1 13
2 246 254 8
{0: [200, 201], 1: [202], 248: [512], 249: [513], 252: [516], 254: [518]}
1 14
2 230 233 3
{0: [200, 201, 203], 231: [515], 232: [516, 517]}
1 15
0 230 237 7
{231: [496], 233: [498], 237: [503, 504]}
1 16
0 221 236 15
{222: [468], 226: [471, 472], 227: [473, 477, 494], 228: [474], 229: [478], 230: [480, 481, 482, 483], 231: [484, 485], 232: [487], 233:

In [810]:
for x in head_tail_alignment:
    print(x)
    print(head_tail_alignment)
    print("=="*50)

(1, 1)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 10)
(1, 11)
(1, 13)
(1, 14)
(1, 15)
(1, 16)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(2, 5)
(2, 6)
(2, 7)
(2, 8)
(2, 9)
(2, 10)
(2, 11)
(2, 12)
(2, 13)
(2, 14)
(2, 15)
(2, 16)
(2, 17)
(2, 18)
(2, 19)
(2, 20)
(2, 21)
(2, 22)
(2, 23)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(3, 5)
(3, 6)
(3, 7)
(3, 8)
(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)
(3, 14)
(3, 15)
(3, 16)
(3, 17)
(3, 18)
(3, 19)
(3, 20)
(3, 21)
(3, 22)
(3, 23)
(4, 1)
(4, 2)
(4, 3)
(4, 4)
(4, 5)
(4, 6)
(4, 7)
(4, 8)
(4, 9)
(4, 10)
(4, 11)
(4, 12)
(4, 13)
(4, 14)
(4, 15)
(4, 17)
(4, 18)
(4, 19)
(4, 20)
(4, 21)
(4, 22)
(4, 23)
(4, 24)
(5, 1)
(5, 2)
(5, 3)
(5, 4)
(5, 5)
(5, 6)
(5, 7)
(5, 8)
(5, 9)
(5, 10)
(5, 11)
(5, 12)
(5, 13)
(5, 14)
(5, 15)
(5, 16)
(5, 17)
(5, 18)
(5, 19)
(5, 20)
(5, 21)
(5, 22)
(5, 23)
(6, 1)
(6, 2)
(6, 3)
(6, 4)
(6, 5)
(6, 6)
(6, 7)
(6, 8)
(6, 9)
(6, 10)
(6, 11)
(6, 12)
(6, 13)
(6, 14)
(6, 15)
(6, 16)
(6, 17)
(6, 18)
(6, 19)
(6, 20)
(6, 21)
(6, 22)
(6, 23)
(6, 24)
(7, 1)

In [806]:
for x in further_alignment:
    print(x)
    print(further_alignment[x])
    print(filter_alignment(further_alignment[x]))
    print()

(1, 1)
{}
{}

(1, 2)
{0: [166], 1: [167]}
{0: [166], 1: [167]}

(1, 3)
{0: [189, 190], 241: [506], 242: [507, 509, 510], 243: [511], 245: [513]}
{0: [189, 190], 241: [506], 242: [507, 509, 510], 243: [511], 245: [513]}

(1, 4)
{217: [485, 486], 220: [488]}
{217: [485, 486], 220: [488]}

(1, 5)
{221: [475, 476], 222: [477]}
{221: [475, 476], 222: [477]}

(1, 6)
{0: [197, 198], 242: [495, 496], 244: [498], 245: [499], 246: [500], 247: [501], 248: [502], 249: [503], 250: [505, 506, 507]}
{0: [197, 198], 242: [495, 496], 244: [498], 245: [499], 246: [500], 247: [501], 248: [502], 249: [503], 250: [505, 506, 507]}

(1, 7)
{0: [180], 1: [182], 3: [183], 8: [189, 190, 191], 9: [193], 276: [514], 279: [519], 280: [520, 521], 281: [522]}
{0: [180], 1: [182], 3: [183], 8: [189, 190, 191], 9: [193], 276: [514], 279: [519], 280: [520, 521], 281: [522]}

(1, 8)
{}
{}

(1, 9)
{}
{}

(1, 10)
{224: [481, 482, 483]}
{224: [481, 482, 483]}

(1, 11)
{2: [190, 191]}
{2: [190, 191]}

(1, 13)
{0: [200, 201]

In [791]:
for x in further_alignment:
    epi_ids = sorted(list(further_alignment[x].keys()))
    if len(epi_ids)<2:
        continue

    for i in range(len(epi_ids)-1):
        epi_id = epi_ids[i]
        print(epi_id, further_alignment[x][epi_id], min(further_alignment[x][epi_ids[i+1]]) >= max(further_alignment[x][epi_ids[i]]))
    print("=="*50)

In [738]:
with open('alignment_results/zh/head_tail_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [None]:
# Check the result
def check_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    print(season_id, episode_id)
    alignment = alignment_seeds[(season_id,episode_id)]
    for epi_id in alignment:
        if len(alignment[epi_id])!=max(alignment[epi_id])-min(alignment[epi_id])+1:
        # if max(temp[epi_id])-min(temp[epi_id])+1-len(temp[epi_id])==2:
            epi = transformation(tbbt_episode[epi_id][0].replace("’", " ").replace('…', " "))
            sub_ids = [i for i in range(min(alignment[epi_id]), max(alignment[epi_id])+1)]
            sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids])
            print(epi)
            print(sub)
            print(epi_id, alignment[epi_id], max(alignment[epi_id])-min(alignment[epi_id])+1-len(alignment[epi_id]), len(epi.strip().split(" ")), len(sub.strip().split(" ")), jiwer.wer(sub, epi))
            print('--')


    print("=="*50)

    # print(season_id, episode_id)
    # alignment = alignment_seeds[(season_id,episode_id)]
    # for item in alignment:
    #     epi = transformation(tbbt_episode[item][0].replace("’", " ").replace('…', " "))
    #     sub = " ".join([transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in alignment[item]])
    #     print(item, alignment[item], len(epi.strip().split(" ")), len(sub.strip().split(" ")))
    # print("=="*50)

    return epi2sub


further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = check_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)

In [None]:
alignment_seeds = further_alignment

In [None]:
for (i, j) in further_alignment:
    print(i, j)
    alignment = further_alignment[(i, j)]
    sub2epi = turn_sub2epi_into_epi2sub(alignment)
    for sub_id in sub2epi:
        if len(sub2epi[sub_id])!=1:
            print(sub_id, sub2epi[sub_id])
    print('=='*50)

In [None]:
for x in further_alignment:
    print(x, len(further_alignment[x]), len(turn_sub2epi_into_epi2sub(further_alignment[x])))

In [None]:
alignment_seeds = further_alignment

In [None]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]

In [None]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(" ")
    substrings = []
    for i in range(len(input_tokens)-window_size):
        substrings.append(" ".join(input_tokens[i: i+window_size]))
    return substrings

In [None]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [None]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [None]:
def sliding_within_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        for epi_id in epi_ids:
            epi = tbbt_episode

        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]


    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [None]:
"""
Add neighbor subtitle from the utterance
"""
def extend_subtitles_to_episode_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

extend_subtitles_to_episode_sliding_window(en_subset, epi2sub, tbbt_transcripts)

In [None]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [None]:
alignment_seeds = final_stage_alignment

In [None]:
# for x in alignment_seeds:
#     print(x)
#     print(alignment_seeds[x])
#     print("=="*50)

In [None]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]

In [None]:
"""
Extend the former subtitle and latter subtitle near the episode
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]
    print(subtitle_gaps)

    # Check whether the subtitle could be merged into utterances using sliding window
    for start_epi_id, end_epi_id in subtitle_gaps:
        start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
        end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
        # print(start_epi_id, start_epi)
        # print(end_epi_id, end_epi)
        # print('--')

        for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
            sub = transformation(en_subset[sub_id])
            sub_substrings = get_sliding_window_substrings(sub, 6)
            sub_substrings.append(sub)
            # print(sub_id, sub)
            # print(sub_substrings)
            temp_start = [substring in start_epi for substring in sub_substrings]
            signal_start = True in temp_start
            temp_end = [substring in end_epi for substring in sub_substrings]
            signal_end = True in temp_end
            # print(temp_start, signal_start)
            # print(temp_end, signal_end)
            if signal_start==True and signal_end==True:
                print(start_epi_id, start_epi)
                print(end_epi_id, end_epi)
                print(sub, "|", signal_start, "|", signal_end)
                print('--')

        # print('=='*50)

extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

In [None]:
def get_final_stage_gap_pairs(epi2sub):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Perform string match and CER Scoring
    return subtitle_gaps

final_gap = get_final_stage_gap_pairs(epi2sub)

for item in final_gap:
    epi_ids = [i+2 for i in range(item[0], item[1]+1)]
    sub_ids = [x+2 for x in final_gap[item]]
    print(epi_ids)
    print(sub_ids)
    print('=='*50)

In [None]:
with open('alignment_results/zh/final_stage_alignment.pkl', 'rb') as f:
    final_stage_alignment = pkl.load(f)

# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

alignment_seeds = final_stage_alignment
epi2sub = alignment_seeds[(1,1)]

In [None]:
"""
Explore the neighbor subtitles of a episode.

Given a episode utterance (epi_id), then we fetch the unaligned subtitle (sub_id)
[epi_id_0, epi_id_1, etc., epi_id_n] - [sub_id_0, sub_id_1, etc. sub_id_m]

Then, we search within the subset-pair

For each subtitle, we use sliding window to fetch a set of substrings in each episode utterance and calculate the CER
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    count = 0
    # Gather the gap of subtitle corresponding to episode utterance
    subtitle_gaps = get_final_stage_gap_pairs(epi2sub)

    # Iterate the whole subtitle gaps to perform substring match
    temp_epi2sub = deepcopy(epi2sub)
    for item in subtitle_gaps:
        epi_ids = [i for i in range(item[0], item[1]+1)]
        sub_ids = [i for i in subtitle_gaps[item]]

        # Fetch all episodes and subtitles
        epis = [transformation(tbbt_episode[i][0].replace("’", " ").replace('…', " ")) for i in epi_ids]
        subs = [transformation(en_subset[i].replace("’", " ").replace('…', " ")) for i in sub_ids]

        # print(epi_ids)
        # print(epis)
        # print(sub_ids)
        # print(subs)
        for sub_id, sub in zip(sub_ids, subs):
            sub_len = len(sub.strip().split(" "))
            if sub_len <=3:
                continue
            min_score = float('inf')
            min_substring = ""
            source_episode = ""
            source_sub_id = float('inf')
            source_epi_id = float('inf')
            for epi, epi_id in zip(epis, epi_ids):
                # min_score = float('inf')
                # min_substring = ""
                epi_substrings = get_sliding_window_substrings(epi, window_size=sub_len)
                for substring in epi_substrings:
                    wer = jiwer.wer(sub, substring)
                    if wer <= min_score:
                        min_score = wer
                        min_substring = substring
                        source_episode = epi
                        source_epi_id = epi_id
            if min_score <= 0.5:
                if source_epi_id not in temp_epi2sub:
                    temp_epi2sub[source_epi_id] = [sub_id]
                else:
                    temp_epi2sub[source_epi_id].append(sub_id)
                # print(sub_id+2, source_epi_id+2)
                # print("Subtitle:", sub_id+2, sub, len(sub.strip().split(" ")))
                # print("All Episode to compare:")
                # print(min_score, '|', min_substring, '|', source_episode)
                # print('=='*50)
                # count += 1
    # print(temp_epi2sub)
    # print(len(temp_epi2sub), len(turn_sub2epi_into_epi2sub(temp_epi2sub)))
    # print(count)
    output = {}
    for epi_id in sorted(list(temp_epi2sub.keys())):
        output[epi_id] = sorted(list(set(temp_epi2sub[epi_id])))
    return output

# epi2sub_0 = epi2sub
# print(len(epi2sub_0), len(turn_sub2epi_into_epi2sub(epi2sub_0)))
# epi2sub_1 = extend_neighbors_episode_sliding(en_subset, epi2sub_0, tbbt_episode)
# print(len(epi2sub_1), len(turn_sub2epi_into_epi2sub(epi2sub_1)))
# epi2sub_2 = extend_neighbors_episode_sliding(en_subset, epi2sub_1, tbbt_episode)
# print(len(epi2sub_2), len(turn_sub2epi_into_epi2sub(epi2sub_2)))

In [None]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, alignment_seeds[(i,j)], tbbt_episode)

    return epi2sub

In [None]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:", len(alignment_seeds[(i,j)]),len(temp), "|", "Subtitle Number:", len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])),len(turn_sub2epi_into_epi2sub(temp)))

In [None]:
with open('alignment_results/zh/final_stage_alignment_0.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [None]:
with open('alignment_results/zh/final_stage_alignment_0.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [None]:
for x in further_alignment:
    aligned_epi_num = len(alignment_seeds[x])
    aligned_sub_num = len(turn_sub2epi_into_epi2sub(alignment_seeds[x]))
    all_epi_num = max(alignment_seeds[x])-min(alignment_seeds[x])
    all_sub_num = max(turn_sub2epi_into_epi2sub(alignment_seeds[x]))-min(turn_sub2epi_into_epi2sub(alignment_seeds[x]))
    print(x, aligned_epi_num, all_epi_num, int(100*(aligned_epi_num/all_epi_num)), "|", aligned_sub_num, all_sub_num, int(100*(aligned_sub_num/all_sub_num)))

In [None]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors_episode_sliding(en_subset, alignment_seeds[(i,j)], tbbt_episode)

    return epi2sub

In [None]:
for (i, j) in alignment_seeds.keys():
    temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    # print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

In [None]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(season_id,episode_id)]), tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Add within the gap
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_one_size_gap(gaps, epi2sub)

    # Further extend neighbors
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [None]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
        further_alignment[(i,j)] = temp
        print("Season:", i,"Episode:", j, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
    except:
        pass

In [None]:
further_alignment = {}
for i in tqdm(range(3)):
    for j in tqdm(range(4)):
        try:
            temp = get_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, alignment_seeds, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

In [None]:
with open('final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [None]:
with open('final_stage_alignment.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [None]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(' ')
    substrings = []
    for i in range(len(input_tokens)-3):
        substrings.append(" ".join(input_tokens[i: i+4]))
    return substrings

In [None]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [None]:
def sliding_within_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        for epi_id in epi_ids:
            epi = tbbt_episode

        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]


    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [None]:
"""
Add neighbor subtitle from the utterance
"""
def extend_subtitles_to_episode_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [None]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [None]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

In [None]:
"""
Extend the former subtitle and latter subtitle near the episode
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Check whether the subtitle could be merged into utterances using sliding window
    for start_epi_id, end_epi_id in subtitle_gaps:
        start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
        end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
        # print(start_epi_id, start_epi)
        # print(end_epi_id, end_epi)
        # print('--')

        for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
            sub = transformation(en_subset[sub_id])
            sub_substrings = get_sliding_window_substrings(sub, 6)
            sub_substrings.append(sub)
            # print(sub_id, sub)
            # print(sub_substrings)
            temp_start = [substring in start_epi for substring in sub_substrings]
            signal_start = True in temp_start
            temp_end = [substring in end_epi for substring in sub_substrings]
            signal_end = True in temp_end
            # print(temp_start, signal_start)
            # print(temp_end, signal_end)
            if signal_start==True and signal_end==True:
                print(start_epi_id, start_epi)
                print(end_epi_id, end_epi)
                print(sub, "|", signal_start, "|", signal_end)
            # print('--')

        print('=='*50)

extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

In [None]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]


# Check whether the subtitle could be merged into utterances using sliding window
for start_epi_id, end_epi_id in subtitle_gaps:
    start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
    end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
    print(start_epi_id, start_epi)
    print(end_epi_id, end_epi)
    print('--')

    for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
        sub = transformation(en_subset[sub_id])
        sub_substrings = get_sliding_window_substrings(sub, 4)
        sub_substrings.append(sub)
        print(sub_id, sub)
        print(sub_substrings)
        temp_start = [substring in start_epi for substring in sub_substrings]
        temp_end = [substring in end_epi for substring in sub_substrings]
        print(temp_start)
        print(temp_end)
        print('--')

    print('=='*50)

In [None]:
input_string = transformation("One across is Aegean")
substrings = get_sliding_window_substrings(input_string, 7)
print(substrings)

In [None]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    print(epi_start, epi2sub[epi_start])
    print(epi_end, epi2sub[epi_end])
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        # value = (max(epi2sub[epi_start])+1, min(epi2sub[epi_end])-1)
        value = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]
        print(key, value)
    else:
        print("False")
    print()

In [None]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))
start = min(epi2sub[epi_keys[0]])
end = max(epi2sub[epi_keys[-1]])
for j in range(start, end+1):
    print(j)

In [None]:
print(start, end)
print(epi_keys)
print(sub_keys)

In [None]:
# Extend the neighbor subtitles to episode utterance


In [None]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

In [None]:
# Add within the gap
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_one_size_gap(gaps, epi2sub)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

In [None]:
for x in gaps:
    print(x)

In [None]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

In [None]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

In [367]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

[235] [23]
Episode:
12  I don’t know, I’ve never reneged on a proffer of sperm before.

Subtitle:
10 I've never reneged on a proffer of sperm before.
[241] [31]
Episode:
2  Not really.

Subtitle:
2 Not really.
[287] [83]
Episode:
25  Leonard, I’m not expert here but I believe in the context of a luncheon invitation, you might want to skip the reference to bowel movements.

Subtitle:
21 I'm no expert, but in the context of a lunch invitation you might wanna skip the reference to bowel movements.
[298] [98]
Episode:
1  Yeah. 

Subtitle:
1 Yeah.
[324] [125]
Episode:
4  Yes I now, but…

Subtitle:
3 Yes, I know.
[385] [172]
Episode:
2  How so?

Subtitle:
3 LEONARD: How so?
[407] [191]
Episode:
2  See what?

Subtitle:
2 See what?
[410] [194]
Episode:
8  It’s before he became a creepy computer voice:.

Subtitle:
12 [IMITATING COMPUTERIZED VOICE] It's before he became a creepy computer voice.
[418] [202]
Episode:
1  Uh-huh.

Subtitle:
1 - Uh-huh.
[451] [229]
Episode:
2  Must we?

Subtitle:
2 -

In [503]:
print(alignment)

{(1, 2): {168: [2], 169: [3], 170: [4], 171: [5], 172: [6], 173: [7], 174: [7], 175: [8], 176: [9], 177: [9], 178: [11], 179: [12], 180: [14], 181: [15], 182: [16], 183: [17], 184: [17], 185: [18], 186: [19], 187: [20], 188: [20], 190: [22], 191: [22], 192: [24], 193: [25], 194: [26], 195: [27], 196: [28], 200: [31], 201: [32], 202: [33], 204: [35], 205: [36], 206: [36], 208: [39], 209: [40], 210: [40], 211: [41], 212: [42], 213: [42], 214: [42], 215: [44], 216: [45], 217: [46], 218: [46], 219: [46], 220: [46], 221: [47], 222: [48], 223: [48], 224: [48], 225: [48], 226: [49], 231: [52], 232: [53], 233: [54], 234: [54], 235: [55], 236: [55], 237: [56], 240: [57], 241: [57], 242: [58], 243: [60], 245: [61], 246: [61], 250: [62], 251: [63], 252: [64], 253: [65], 254: [65], 255: [65], 260: [68], 263: [70], 264: [71], 265: [72], 266: [73], 267: [74], 268: [74], 273: [77], 274: [77], 278: [80, 81], 279: [80, 81], 280: [81], 282: [81], 283: [81], 284: [82], 285: [82], 287: [84], 288: [84], 28

In [17]:
def temp_string_match_sliding_window(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)

    temp = filter_alignment_by_gap(res)

    final = {}
    for x in temp:
        if len(temp[x])==1:
            final[x] = temp[x]

    return final

In [24]:
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [15]:
# Check one episode and adapt to the new tbbt transcript corpus

(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=3,
        bias=200
    )

In [60]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

155 120
112 91
81 68


In [58]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

95 72
6 5
4 4


In [42]:
temp = sub2epi
sub2epi = {}
for x in sorted(list(temp.keys())):
    sub2epi[x+2] = [item+2 for item in temp[x]]
for x in sub2epi:
    print(x, sub2epi[x])

191 [2]
192 [2]
194 [5]
196 [6]
197 [7]
208 [15]
221 [26]
227 [30]
229 [31]
231 [34]
233 [36]
236 [38]
242 [42]
245 [44]
258 [53]
260 [54]
263 [57]
264 [58]
265 [58]
267 [59]
271 [62]
274 [64]
275 [65]
279 [68]
280 [68]
284 [74, 72]
286 [74, 72]
287 [75]
288 [76]
289 [76]
291 [78]
293 [79]
294 [80]
295 [80]
296 [80]
297 [81]
299 [83]
304 [89]
306 [91]
307 [92]
313 [98]
314 [99]
315 [100]
317 [101]
318 [102]
320 [103]
321 [105]
327 [111]
328 [112]
329 [112]
331 [113]
334 [115]
335 [115]
344 [120]
345 [120]
347 [121]
350 [123]
351 [124]
356 [127]
358 [128]
359 [128]
365 [131]
370 [134]
372 [136]
373 [136]
374 [136]
377 [138]
379 [141]
381 [143]
383 [144]
384 [144]
386 [144]
398 [158]
400 [160]
403 [163]
405 [165]
406 [166]
408 [168]
412 [171]
413 [171]
416 [171]
418 [171]
427 [175]
431 [180]
435 [182]
437 [183]
438 [183]
442 [189]
444 [190]
450 [196]
452 [197]
453 [198]
455 [201]
458 [205]
459 [205]
463 [208]
467 [211]
471 [214]
473 [215]
474 [215]
477 [217]
479 [218]
481 [219]
486 [222]

In [43]:
temp = epi2sub
epi2sub = {}
for x in sorted(list(temp.keys())):
    epi2sub[x+2] = [item+2 for item in temp[x]]

for x in epi2sub:
    print(x, epi2sub[x])

2 [191, 192]
5 [194]
6 [196]
7 [197]
15 [208]
26 [221]
30 [227]
31 [229]
34 [231]
36 [233]
38 [236]
42 [242]
44 [245]
53 [258]
54 [260]
57 [263]
58 [264, 265]
59 [267]
62 [271]
64 [274]
65 [275]
68 [279, 280]
72 [284, 286]
74 [284, 286]
75 [287]
76 [288, 289]
78 [291]
79 [293]
80 [294, 295, 296]
81 [297]
83 [299]
89 [304]
91 [306]
92 [307]
98 [313]
99 [314]
100 [315]
101 [317]
102 [318]
103 [320]
105 [321]
111 [327]
112 [328, 329]
113 [331]
115 [334, 335]
120 [344, 345]
121 [347]
123 [350]
124 [351]
127 [356]
128 [358, 359]
131 [365]
134 [370]
136 [372, 373, 374]
138 [377]
141 [379]
143 [381]
144 [383, 384, 386]
158 [398]
160 [400]
163 [403]
165 [405]
166 [406]
168 [408]
171 [412, 413, 416, 418]
175 [427]
180 [431]
182 [435]
183 [437, 438]
189 [442]
190 [444]
196 [450]
197 [452]
198 [453]
201 [455]
205 [458, 459]
208 [463]
211 [467]
214 [471]
215 [473, 474]
217 [477]
218 [479]
219 [481]
222 [486]
226 [490]
236 [499]
237 [500]
241 [505]
242 [507]
244 [509, 512]
247 [515]


In [44]:
print(len(sub2epi))
print(len(epi2sub))

112
91


In [8]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)

    return temp

In [9]:
further_alignment = {}
for i in tqdm(range(2)):
    for j in tqdm(range(4)):
        try:
            temp = look_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:29<01:27, 29.08s/it][A
 50%|█████     | 2/4 [00:43<00:41, 20.61s/it][A
 75%|███████▌  | 3/4 [00:58<00:17, 17.78s/it][A
100%|██████████| 4/4 [01:09<00:00, 17.45s/it][A
 50%|█████     | 1/2 [01:09<01:09, 69.81s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:13<00:39, 13.30s/it][A
 50%|█████     | 2/4 [00:30<00:31, 15.66s/it][A
 75%|███████▌  | 3/4 [00:43<00:14, 14.20s/it][A
100%|██████████| 4/4 [00:57<00:00, 14.41s/it][A
100%|██████████| 2/2 [02:07<00:00, 63.73s/it]


In [10]:
for x in further_alignment:
    print(x, len(further_alignment[x][0]), len(further_alignment[x][1]),len(further_alignment[x][2]),len(further_alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [14]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [11]:
alignment = further_alignment

In [12]:
for x in alignment:
    print(x, len(alignment[x][0]), len(alignment[x][1]),len(alignment[x][2]),len(alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [47]:
print(epi2sub)

{2: [191, 192], 5: [194], 6: [196], 7: [197], 15: [208], 26: [221], 30: [227], 31: [229], 34: [231], 36: [233], 38: [236], 42: [242], 44: [245], 53: [258], 54: [260], 57: [263], 58: [264, 265], 59: [267], 62: [271], 64: [274], 65: [275], 68: [279, 280], 72: [284, 286], 74: [284, 286], 75: [287], 76: [288, 289], 78: [291], 79: [293], 80: [294, 295, 296], 81: [297], 83: [299], 89: [304], 91: [306], 92: [307], 98: [313], 99: [314], 100: [315], 101: [317], 102: [318], 103: [320], 105: [321], 111: [327], 112: [328, 329], 113: [331], 115: [334, 335], 120: [344, 345], 121: [347], 123: [350], 124: [351], 127: [356], 128: [358, 359], 131: [365], 134: [370], 136: [372, 373, 374], 138: [377], 141: [379], 143: [381], 144: [383, 384, 386], 158: [398], 160: [400], 163: [403], 165: [405], 166: [406], 168: [408], 171: [412, 413, 416, 418], 175: [427], 180: [431], 182: [435], 183: [437, 438], 189: [442], 190: [444], 196: [450], 197: [452], 198: [453], 201: [455], 205: [458, 459], 208: [463], 211: [467]

In [244]:
alignment = alignment_seeds

In [248]:
for x in alignment:
    print(x)

(1, 1)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 10)
(1, 11)
(1, 12)
(1, 13)
(1, 14)
(1, 15)
(1, 16)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(2, 5)
(2, 6)
(2, 7)
(2, 8)
(2, 9)
(2, 10)
(2, 11)
(2, 12)
(2, 13)
(2, 14)
(2, 15)
(2, 16)
(2, 17)
(2, 18)
(2, 19)
(2, 20)
(2, 21)
(2, 22)
(2, 23)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(3, 5)
(3, 6)
(3, 7)
(3, 8)
(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)
(3, 14)
(3, 15)
(3, 16)
(3, 17)
(3, 18)
(3, 19)
(3, 20)
(3, 21)
(3, 22)
(3, 23)
(4, 1)
(4, 2)
(4, 3)
(4, 4)
(4, 5)
(4, 6)
(4, 7)
(4, 8)
(4, 9)
(4, 10)
(4, 11)
(4, 12)
(4, 13)
(4, 14)
(4, 15)
(4, 17)
(4, 18)
(4, 19)
(4, 20)
(4, 21)
(4, 22)
(4, 23)
(4, 24)
(5, 1)
(5, 2)
(5, 3)
(5, 4)
(5, 5)
(5, 6)
(5, 7)
(5, 8)
(5, 9)
(5, 10)
(5, 11)
(5, 12)
(5, 13)
(5, 14)
(5, 15)
(5, 16)
(5, 17)
(5, 18)
(5, 19)
(5, 20)
(5, 21)
(5, 22)
(5, 23)
(6, 1)
(6, 2)
(6, 3)
(6, 4)
(6, 5)
(6, 6)
(6, 7)
(6, 8)
(6, 9)
(6, 10)
(6, 11)
(6, 12)
(6, 13)
(6, 14)
(6, 15)
(6, 16)
(6, 17)
(6, 18)
(6, 19)
(6, 20)
(6, 21)
(6, 22)
(6, 23)
(6, 24

In [290]:
alignment = {(1,1): turn_sub2epi_into_epi2sub(temp)}

TypeError: list indices must be integers or slices, not str

In [294]:
# Write into xlsx file
for x in alignment:
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/test_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/test_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    sub2epi = alignment[x]
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

{0: [200], 9: [215], 15: [224], 16: [225, 226], 17: [229], 28: [239], 34: [246, 247], 35: [248], 65: [268], 75: [282], 82: [285, 286], 88: [290], 89: [291], 90: [292], 93: [293], 96: [295], 100: [300], 102: [301, 302], 108: [307], 116: [314, 316], 117: [318], 134: [333], 135: [334], 145: [348], 148: [351], 154: [360], 155: [362], 160: [368, 369], 161: [373, 374, 375], 162: [378], 171: [384], 173: [386], 174: [387], 175: [388], 176: [389], 177: [390], 183: [396], 186: [400, 401], 193: [409], 201: [417], 207: [423], 208: [424, 425], 210: [430], 215: [435], 226: [449], 228: [450], 230: [452, 453, 454, 455, 456], 232: [458], 237: [464], 240: [467], 241: [468, 469, 470], 242: [472], 245: [476], 258: [489, 490], 259: [491], 262: [494, 495], 263: [496], 270: [503], 275: [506], 280: [510, 511], 288: [523, 524], 295: [531], 296: [532], 297: [533], 302: [542], 309: [548], 311: [550, 551], 314: [555]}


In [None]:
# Check current alignments
for x in alignment:
    # Load Data
    epi2sub = alignment[x][-1]

    pass


In [36]:
print(further_alignment[(0,0)][-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [35]:
print(len(further_alignment[(0,0)]))

4


In [25]:
for x in further_alignment:
    print(x, len(further_alignment[x][-1]), len(further_alignment[x][-2]), len(further_alignment[x][-3]))
    # print(further_alignment[x])

(0, 0) 171 171 133
(0, 1) 130 130 109
(0, 2) 90 90 72
(0, 3) 134 134 112
(0, 4) 132 132 106
(0, 5) 24 24 16
(0, 6) 75 75 66
(0, 7) 7 7 7
(0, 8) 130 130 100
(0, 9) 57 57 50
(0, 10) 46 46 32
(0, 11) 93 93 75
(0, 12) 137 137 114
(0, 13) 10 10 9
(0, 14) 124 124 103
(0, 15) 130 130 102
(1, 0) 1 1 1
(1, 1) 104 104 84
(1, 2) 135 135 113
(1, 3) 159 159 124
(1, 4) 74 74 60
(1, 5) 60 60 43
(1, 6) 29 29 24
(1, 7) 110 110 98
(1, 8) 70 70 61
(1, 9) 80 80 68
(1, 11) 133 133 112
(1, 12) 136 136 108
(1, 13) 161 161 131
(1, 14) 31 31 26
(1, 15) 106 106 83
(1, 16) 140 140 119
(1, 17) 127 127 102
(1, 18) 134 134 111
(1, 19) 46 46 40
(1, 20) 95 95 74
(1, 21) 137 137 104
(1, 22) 56 56 47
(2, 0) 125 125 102
(2, 1) 134 134 115
(2, 2) 103 103 81
(2, 3) 105 105 84
(2, 4) 131 131 105
(2, 5) 17 17 13
(2, 6) 159 159 131
(2, 7) 109 109 93
(2, 8) 133 133 106
(2, 9) 162 162 125
(2, 10) 129 129 112
(2, 11) 125 125 105
(2, 12) 109 109 86
(2, 13) 106 106 83
(2, 14) 124 124 103
(2, 15) 142 142 119
(2, 16) 100 100 81
(2,

In [27]:
old_alignment = further_alignment

In [34]:
print(old_alignment[(1,1)][-1])

{2: [200, 201, 202, 203], 4: [206], 5: [208], 6: [209, 210], 7: [211], 10: [217, 218], 14: [220], 18: [223], 19: [224, 225], 20: [229], 21: [230], 22: [232], 23: [233, 234], 26: [238], 28: [240, 241], 30: [242], 31: [243], 32: [244], 33: [245], 34: [246], 36: [249, 250], 37: [251], 40: [253], 41: [254, 255], 44: [258], 45: [259, 260, 261], 46: [262, 263], 47: [264], 48: [266], 49: [267], 52: [269], 56: [274], 57: [276, 277], 58: [278], 59: [279], 60: [280, 281], 61: [282], 63: [283], 64: [284], 65: [285], 67: [286], 68: [287, 288, 289], 70: [291, 292], 73: [295], 74: [296], 75: [297, 298, 299], 76: [300], 77: [302], 79: [304], 80: [305], 81: [306], 82: [308], 83: [310], 84: [311], 85: [312, 313], 87: [315], 88: [317], 89: [319], 90: [320, 321], 91: [322], 92: [325, 326], 94: [328], 95: [329], 96: [330], 97: [331], 99: [334], 100: [335, 336], 101: [337], 102: [338], 104: [339], 107: [342, 343], 109: [344], 110: [345], 111: [346], 112: [347], 113: [348, 349, 350], 115: [351], 116: [352, 

In [33]:
print(len(old_alignment[(1,1)][-1]))
print(len(turn_sub2epi_into_epi2sub(old_alignment[(1,1)][-1])))

104
140


In [38]:
alignment = further_alignment
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][-1]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(season_id, episode_id)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [22]:
print(further_alignment.keys())

dict_keys([(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)])


In [14]:
temp = []
# Perform string match with sliding window
sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 4)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [18]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [20]:
# Perform exact match and add it to the whole alignment
exact_match_result = exact_match(en_subset, tbbt_episode)
sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: [126], 322: [126

In [24]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 255: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104, 105], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: 

In [22]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 |||| [200, 202, 203] Sheldon  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.
1 Leonard  Agreed, what’s your point?
2 |||| [205] Sheldon  There’s no point, I just think it’s a good idea for a tee-shirt. 
3 Leonard  Excuse me?
4 Receptionist  Hang on. 
5 |||| [206, 207] Leonard  One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. 
6 |||| [212] Receptionist  Can I help you?
7 |||| [213] Leonard  Yes. Um, is this the High IQ sperm bank?
8 |||| [214] Receptionist  If you have to ask, maybe you shouldn’t be here.
9 |||| [215] Sheldon  I think this is the place.
10 Receptionist  Fill these out.
11 

In [23]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子


In [90]:
print(temp)

[{0: [200, 203], 5: [206], 8: [214], 15: [224], 16: [226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 15: [224], 16: [225, 226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246, 247], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 9: [215], 15: [224], 16: [225, 226], 17: [227, 229], 21: [233], 24: [235], 29: [239], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 41: [253], 42: [254], 48: [256], 50: [259]}]


In [91]:
for x in temp:
    print(x)

3


In [None]:
    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

In [56]:
def extend_with_wer(en_subset, epi2sub_alignment_2, episode):
    temp = {}

    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        # sub = transformation(en_subset[sub_id])
        epi = transformation(episode[epi_id][0])

        # Fetch all relevant sentences
        epi_sentences = [episode[idx][0] for idx in [epi_id_former, epi_id, epi_id_latter] if idx>=0]
        print(epi_id_former, epi_id, epi_id_latter)
        print(epi_sentences)

        sub_sentences = [en_subset[sub_id_former, sub_id_latter] for idx in [sub_former, sub_id_latter] if idx>=0]
        print(sub_id_former, sub_id_latter)
        print(sub_sentences)

        print("=="*50)


        if sub_former in epi:
            sub_ids.append(sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [26]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(1,1)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # # Extend with WER
    # # for x in sub2epi:
    # #     print(x, sub2epi[x])
    # extend_with_wer(en_subset, epi2sub, tbbt_episode)




    return temp

In [None]:
further_alignment = {}
for i in tqdm(range(12)):
    for j in tqdm(range(30)):
        try:
            temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

In [28]:
temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, 1, 1, 200)

In [35]:
print(temp[-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [46]:
for x in further_alignment:
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=x[0]+1,
        episode_id=x[1]+1,
        bias=200
    )
    total = len(tbbt_episode)
    print(x, total, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))
    # print(further_alignment[x])
    # print("=="*50)

(0, 0) 133 80 80 100 100
