In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_coarse_alignment_by_seasons
from utils.preprocessing import fetch_subsets

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Source file

In [4]:
# Set File Path
en_subtitle_path = "../../source_data/subtitles/en_zh/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_zh/zh_subtitles.pkl"
transcript_path = "../../source_data/transcripts/friends/friends_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/friends_en_zh.pkl"
root_path = "results/friends_en_zh/"


# Load Data
with open(en_subtitle_path, 'rb') as f:
    all_en_subtitles = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    all_other_subtitles = pkl.load(f)
with open(transcript_path, 'rb') as f:
    all_transcripts = pkl.load(f)

# Section 1: Coarse-Grain Alignment
Use string match to fetch relevant index of open subtitle for each episode
The code for coarse alignment locates at "../coarse_alignment/align_transcript_subtitle.py"
The alignment results locates at "../coarse_alignment/results/"

In [5]:
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
    coarse_alignments = organize_coarse_alignment_by_seasons(temp)

# Section 2: Fine-Grained Alignment

In this section, we fetch alignment within one episode using the course alignment indexs fetch in the last section.

This include the following parts:

2.1: Fetch transcript episode and open subtitle subset
2.2: Exact String Match (Use as the seed, since it is more accurate)
2.3: Substring Exact Match (Sliding Window Algorithm)
2.4: Merge the alignment result of 2.2, 2.3 and filter by the index

## Part 2.1: Fetch Subsets

In [43]:
# An example of fetch_subset function
(en_subtitle, other_subtitle, tbbt_episode) = fetch_subsets(
        episode=all_transcripts,
        en_subtitle=all_en_subtitles,
        zh_subtitle=all_other_subtitles,
        results=coarse_alignments,
        season_id=3,
        episode_id=10,
        bias=200,
        zh_split=True
    )

for a, b in zip(en_subtitle, other_subtitle):
    print(a, b)

IndexError: list index out of range

## Part2: Generate Alignment Seeds

In [6]:
from utils.alignment_seeds import *

In [47]:
# Check the source alignment
results = {}
for i in sorted(list(coarse_alignments.keys())):
    for j in sorted(list(coarse_alignments[i].keys())):
        if (i, j) not in all_transcripts:
            continue
        try:
            print("Season:", i, "  Episode:", j)
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
            temp = generate_alignment_seeds(en_subset, tbbt_episode, window_size=12)
            if temp != {}:
                results[(i, j)] = temp
        except:
            print("Pass Season", i, "Episode:", j)

Season: 3   Episode: 6
Pass Season 3 Episode: 6
Season: 3   Episode: 7
Pass Season 3 Episode: 7
Season: 3   Episode: 8
Pass Season 3 Episode: 8
Season: 3   Episode: 9
Pass Season 3 Episode: 9
Season: 3   Episode: 10
Pass Season 3 Episode: 10
Season: 3   Episode: 11
Pass Season 3 Episode: 11
Season: 3   Episode: 12
Pass Season 3 Episode: 12
Season: 3   Episode: 13
Pass Season 3 Episode: 13
Season: 3   Episode: 14
Pass Season 3 Episode: 14
Season: 3   Episode: 15
Season: 3   Episode: 16
Season: 3   Episode: 17
Pass Season 3 Episode: 17
Season: 3   Episode: 18
Pass Season 3 Episode: 18
Season: 3   Episode: 19
Pass Season 3 Episode: 19
Season: 3   Episode: 20
Pass Season 3 Episode: 20
Season: 3   Episode: 21
Pass Season 3 Episode: 21
Season: 3   Episode: 22
Pass Season 3 Episode: 22
Season: 3   Episode: 23
Pass Season 3 Episode: 23
Season: 3   Episode: 24
Pass Season 3 Episode: 24
Season: 3   Episode: 25
Pass Season 3 Episode: 25


In [48]:
print(results)

{(3, 16): {200: [151, 151]}}


In [17]:
with open(root_path + "0_alignment_seeds_12.pkl", "wb") as f:
    pkl.dump(results, f)

In [13]:
with open(root_path + "0_alignment_seeds_temp.pkl", "wb") as f:
    pkl.dump(results, f)

## Part 3: Incrementally Extend from Alignment Seeds

In [18]:
with open(root_path + "0_alignment_seeds_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

In [None]:
# The input to each extension function should be epi2sub

In [7]:
from utils.alignment_extension import *
from utils.helper_functions import *
from utils.ultimate_alignment import *

In [13]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
    epi2sub = filter_by_idx(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)]))
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

In [20]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 429 0.988479262672811 || Episode: 231 73.33333333333333 5
(1, 2) Subtitle: 294 0.8936170212765957 || Episode: 182 71.65354330708661 9
(1, 3) Subtitle: 216 0.9113924050632911 || Episode: 142 52.20588235294118 56
(1, 4) Subtitle: 295 0.855072463768116 || Episode: 199 74.25373134328358 3
(1, 5) Subtitle: 320 0.9384164222873901 || Episode: 189 74.40944881889764 2
(1, 6) Subtitle: 224 0.9068825910931174 || Episode: 140 61.135371179039296 49
(1, 7) Subtitle: 256 0.8648648648648649 || Episode: 159 63.095238095238095 12
(1, 8) Subtitle: 256 0.8737201365187713 || Episode: 187 76.01626016260163 12
(1, 9) Subtitle: 304 0.9047619047619048 || Episode: 181 74.79338842975206 4
(1, 10) Subtitle: 262 0.8704318936877077 || Episode: 150 62.24066390041494 40
(1, 11) Subtitle: 328 0.9291784702549575 || Episode: 220 74.07407407407408 0
(1, 12) Subtitle: 268 0.9337979094076655 || Episode: 173 64.07407407407408 2
(1, 13) Subtitle: 305 0.9214501510574018 || Episode: 193 75.390625 5
(1, 14) Sub

In [18]:
print(results[(1,1)])

{190: [0], 193: [0], 195: [2], 196: [5], 204: [8], 205: [9], 213: [15], 214: [15], 216: [16], 217: [17], 219: [17], 225: [23], 229: [29], 232: [33], 236: [35], 238: [36], 239: [37], 243: [41], 246: [46], 249: [48], 253: [55], 256: [63], 258: [65], 260: [67], 272: [75], 275: [81], 276: [81], 277: [82], 280: [87], 281: [88], 282: [89], 283: [92], 285: [95], 287: [96], 290: [99], 292: [101], 294: [104], 297: [107], 298: [108], 304: [115], 306: [115], 307: [115], 308: [116], 315: [125], 317: [127], 323: [133], 324: [134], 326: [136], 327: [137], 330: [137], 332: [139], 338: [144], 341: [147], 348: [150], 350: [153], 352: [154], 358: [159], 364: [160], 365: [160], 368: [161], 370: [163], 374: [170], 376: [172], 378: [172, 174], 379: [175], 380: [176], 381: [178], 382: [179], 384: [181], 386: [182], 388: [183], 390: [185], 392: [186], 398: [191], 399: [192], 400: [193], 404: [198], 407: [200], 409: [202], 413: [206], 414: [207], 417: [208], 418: [208], 420: [209], 421: [210], 424: [213], 425

In [21]:
with open(root_path+'1_alignment_extension_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

# Part 4: Add Head Tail

In [8]:
from utils.head_tail_alignment import *

root_path = "results/friends_en_zh/"
with open(root_path + "1_alignment_extension_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])

        head_tail_sub2epi = {}

        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)

        # print(temp)
        # print(filter_by_idx(epi2sub))
        # print("=="*50)
    except:
        print("Pass i j")

with open(root_path+'1_alignment_head_tail_12.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

with open(root_path+'1_alignment_ultimate_12.pkl', 'wb') as f:
    pkl.dump(ultimate_data, f)

1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 22
1 24
3 16
Pass i j
4 19
Pass i j
6 4
6 19
Pass i j
7 24
Pass i j
9 12


In [11]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()
    print(ultimate_data[item])
    print("=="*50)

(1, 1)
{0: [199, 200], 1: [201, 202], 2: [203, 204], 3: [205], 4: [206], 280: [644], 281: [645], 282: [646, 647], 287: [651], 289: [652, 653], 290: [654], 293: [656], 294: [657], 295: [658], 296: [660], 298: [662]}

{0: [199, 200], 1: [201, 202], 2: [203, 204], 3: [205], 4: [206], 5: [207, 208], 6: [209], 7: [210, 211], 8: [212], 9: [213, 214], 10: [215], 11: [216], 12: [217], 14: [218, 219], 15: [220], 16: [221, 222, 223], 18: [224], 20: [226], 21: [227], 22: [228], 24: [229], 27: [230, 231, 232], 28: [233], 29: [234, 235], 30: [236], 31: [237, 238], 32: [239], 33: [240, 241, 242, 243], 34: [244, 245], 35: [246], 36: [247, 248, 249, 250], 38: [251, 252, 253, 254], 39: [255, 256, 257], 40: [258, 259], 41: [260], 42: [261], 43: [262, 263, 264], 44: [265], 45: [266, 267, 268, 269, 270], 46: [271], 48: [272], 49: [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 53: [289, 290], 54: [291], 55: [292, 293, 294], 56: [295], 57: [296], 59: [297

In [15]:
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
            )
    epi2sub = filter_by_idx(alignment_seeds[(i,j)])
    # Extend the neighbor
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))
        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))
        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform a set of extension
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(add_neighbor_subtitles_to_episode(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Perform alignment within gaps
    while True:
        temp_len = len(turn_sub2epi_into_epi2sub(epi2sub))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_strict_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        gaps, _ = get_subset_in_gaps(epi2sub)
        epi2sub = filter_by_idx(add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode))

        epi2sub = filter_by_idx(extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode))

        if temp_len==len(turn_sub2epi_into_epi2sub(epi2sub)):
            break

    # Extend Subtitle ids with its min and max index
    for x in epi2sub:
        epi2sub[x] = [i for i in range(min(epi2sub[x]), max(epi2sub[x])+1)]

    # Perform ultimate alignment
    gaps = get_final_stage_gap_pairs(epi2sub)
    epi2sub = ultimate_alignment(gaps, epi2sub, en_subset, tbbt_episode)

    further_alignment[(i,j)] = epi2sub

with open(root_path+'1_alignment_extension_12_again.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [16]:
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    try:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    except:
        print("Wrong")
    # print(epi2sub)

(1, 1) Subtitle: 459 0.9870967741935484 || Episode: 251 79.68253968253968 0
(1, 2) Subtitle: 375 0.9191176470588235 || Episode: 194 76.37795275590551 0
(1, 3) Subtitle: 223 0.9409282700421941 || Episode: 142 52.20588235294118 56
(1, 4) Subtitle: 340 0.8923884514435696 || Episode: 217 80.97014925373134 1
(1, 5) Subtitle: 338 0.9548022598870056 || Episode: 193 75.98425196850394 0
(1, 6) Subtitle: 297 0.8027027027027027 || Episode: 178 77.72925764192141 0
(1, 7) Subtitle: 291 0.8818181818181818 || Episode: 175 69.44444444444444 0
(1, 8) Subtitle: 276 0.8761904761904762 || Episode: 198 80.48780487804879 0
(1, 9) Subtitle: 332 0.9222222222222223 || Episode: 187 77.27272727272727 0
(1, 10) Subtitle: 341 0.9291553133514986 || Episode: 186 77.17842323651453 0
(1, 11) Subtitle: 334 0.9226519337016574 || Episode: 224 75.42087542087542 0
(1, 12) Subtitle: 309 0.9507692307692308 || Episode: 192 71.11111111111111 1
(1, 13) Subtitle: 314 0.9373134328358209 || Episode: 195 76.171875 0
(1, 14) Subtitl

In [17]:
with open(root_path+'1_alignment_extension_12_again.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [12]:
from utils.head_tail_alignment import *

root_path = "results/friends_en_zh/"
with open(root_path + "1_alignment_ultimate_12.pkl", "rb") as f:
    alignment_seeds = pkl.load(f)

further_alignment = {}
for (i, j) in alignment_seeds.keys():
    try:
        print(i, j)
        (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=all_transcripts,
                en_subtitle=all_en_subtitles,
                zh_subtitle=all_other_subtitles,
                results=coarse_alignments,
                season_id=i,
                episode_id=j,
                bias=200,
                zh_split=True
                )
        epi2sub = filter_by_idx(alignment_seeds[(i,j)])

        head_tail_sub2epi = {}

        gap_pairs = fetch_before_after(en_subset, zh_subset, tbbt_episode, epi2sub)
        for item in gap_pairs:
            temp = before_after_wer_match(en_subset, tbbt_episode, item[0], item[1])
            for x in temp:
                head_tail_sub2epi[x] = temp[x]

        temp = turn_sub2epi_into_epi2sub(head_tail_sub2epi)
        # print(temp)
        if temp!={}:
            if min(list(turn_sub2epi_into_epi2sub(temp).keys()))<0:
                temp = {}
        further_alignment[(i,j)] = filter_by_idx(temp)

        # print(temp)
        # print(filter_by_idx(epi2sub))
        # print("=="*50)
    except:
        print("Pass i j")

ultimate_data = {}
for item in alignment_seeds:
    temp = alignment_seeds[item]
    if item in further_alignment:
        temp = merge_episode_alignment(temp, further_alignment[item])
    ultimate_data[item] = temp

1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 22
1 24
3 16
Pass i j
4 19
Pass i j
6 4
6 19
Pass i j
7 24
Pass i j
9 12


In [13]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()
    print(ultimate_data[item])
    print("=="*50)

(1, 1)
{}

{0: [199, 200], 1: [201, 202], 2: [203, 204], 3: [205], 4: [206], 5: [207, 208], 6: [209], 7: [210, 211], 8: [212], 9: [213, 214], 10: [215], 11: [216], 12: [217], 14: [218, 219], 15: [220], 16: [221, 222, 223], 18: [224], 20: [226], 21: [227], 22: [228], 24: [229], 27: [230, 231, 232], 28: [233], 29: [234, 235], 30: [236], 31: [237, 238], 32: [239], 33: [240, 241, 242, 243], 34: [244, 245], 35: [246], 36: [247, 248, 249, 250], 38: [251, 252, 253, 254], 39: [255, 256, 257], 40: [258, 259], 41: [260], 42: [261], 43: [262, 263, 264], 44: [265], 45: [266, 267, 268, 269, 270], 46: [271], 48: [272], 49: [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 53: [289, 290], 54: [291], 55: [292, 293, 294], 56: [295], 57: [296], 59: [297, 298, 299], 60: [300, 301, 302, 303, 304, 305, 306, 307], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313, 314], 65: [315, 316], 66: [317, 318, 319], 67: [320], 68: [321], 69: [322, 323, 324], 70: [32

In [11]:
further_alignment = deepcopy(ultimate_data)
for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(all_transcripts[item]),  len(further_alignment[item])/len(all_transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 1) Subtitle: 429 0.988479262672811 || Episode: 231 315 73.33333333333333 5
(1, 2) Subtitle: 294 0.8936170212765957 || Episode: 182 254 71.65354330708661 9
(1, 3) Subtitle: 216 0.9113924050632911 || Episode: 142 272 52.20588235294118 56
(1, 4) Subtitle: 295 0.855072463768116 || Episode: 199 268 74.25373134328358 3
(1, 5) Subtitle: 320 0.9384164222873901 || Episode: 189 254 74.40944881889764 2
(1, 6) Subtitle: 224 0.9068825910931174 || Episode: 140 229 61.135371179039296 49
(1, 7) Subtitle: 256 0.8648648648648649 || Episode: 159 252 63.095238095238095 12
(1, 8) Subtitle: 256 0.8737201365187713 || Episode: 187 246 76.01626016260163 12
(1, 9) Subtitle: 304 0.9047619047619048 || Episode: 181 242 74.79338842975206 4
(1, 10) Subtitle: 262 0.8704318936877077 || Episode: 150 241 62.24066390041494 40
(1, 11) Subtitle: 328 0.9291784702549575 || Episode: 220 297 74.07407407407408 0
(1, 12) Subtitle: 268 0.9337979094076655 || Episode: 173 270 64.07407407407408 2
(1, 13) Subtitle: 305 0.92145015

ZeroDivisionError: division by zero

In [59]:
for item in further_alignment:
    print(item)
    print(further_alignment[item])
    print()

(1, 1)
{5: [207, 208], 6: [209], 7: [210, 211], 8: [212], 9: [213, 214], 10: [215], 11: [216], 12: [217], 14: [218, 219], 15: [220], 16: [221, 222, 223], 18: [224], 20: [226], 21: [227], 22: [228], 24: [229], 27: [230, 231, 232], 28: [233], 29: [234, 235], 30: [236], 31: [237, 238], 32: [239], 33: [240, 241, 242, 243], 34: [244, 245], 35: [246], 36: [247, 248, 249, 250], 38: [251, 252, 253, 254], 39: [255, 256, 257], 40: [258, 259], 41: [260], 42: [261], 43: [262, 263, 264], 44: [265], 45: [266, 267, 268, 269, 270], 46: [271], 48: [272], 49: [273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 53: [289, 290], 54: [291], 55: [292, 293, 294], 56: [295], 57: [296], 59: [297, 298, 299], 60: [300, 301, 302, 303, 304, 305, 306, 307], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313, 314], 65: [315, 316], 66: [317, 318, 319], 67: [320], 68: [321], 69: [322, 323, 324], 70: [325, 326, 327], 71: [328, 329], 72: [330], 73: [331], 74: [332], 75: [3