In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [5]:
with open('data/friends_transformed.pkl', 'rb') as f:
    friends = pkl.load(f)

In [6]:
with open('en_subtitles.txt', 'w', encoding='utf-8') as f:
    for i, x in enumerate(en_subtitle):
        f.write(str(i) + x)

### Start Alignment

In [7]:
# Load alignment results after stage-2
with open('alignment_results/transformed_friends_index.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [39]:
for i in results:
    for j in results[i]:
        print(i, j)
        print(results[i][j])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [49]:
for season_id in sorted(list(results.keys())):
    season = results[season_id]
    for episode_id in sorted(list(season.keys())):
        idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
        subsets = find_all_continuous_subsets(idx_list, gaps, 5, 50)
        print(season_id, episode_id, subsets)

1 1 [[1664580, 1664584, 1664586, 1664587, 1664588, 1664589, 1664591, 1664593, 1664598, 1664601, 1664608, 1664609, 1664624, 1664626, 1664631, 1664634, 1664638, 1664640, 1664645, 1664652, 1664655, 1664662, 1664663, 1664665, 1664666, 1664668, 1664672, 1664675, 1664681, 1664683, 1664688, 1664689, 1664690, 1664692, 1664706, 1664711, 1664723, 1664729, 1664743, 1664746, 1664747, 1664749, 1664761, 1664766, 1664767, 1664773, 1664778, 1664779, 1664790, 1664791, 1664801, 1664802, 1664806, 1664807, 1664811, 1664813, 1664814, 1664819, 1664820, 1664822, 1664825, 1664826, 1664828, 1664833, 1664838, 1664845, 1664860, 1664861, 1664870, 1664876, 1664892, 1664893, 1664894, 1664896, 1664897, 1664899, 1664908, 1664909, 1664910, 1664911, 1664917, 1664918, 1664927, 1664928, 1664929, 1664931, 1664935, 1664943, 1664946, 1664947, 1664952, 1664954, 1664957, 1664958, 1664965, 1664977, 1664978, 1664979, 1664982, 1664986, 1664998, 1664999, 1665001, 1665008, 1665021, 1665026, 1665042]]
1 2 [[1667536, 1667544, 166754

In [52]:
print(en_subtitle[4563937])
print(en_subtitle[4563938])
print(en_subtitle[4563943])

Papa, I don't want to go, Papa. I don't want to go.
Mama, please tell Papa. I don't want to go to boarding school.
I don't want to go.


In [8]:
# Check all substrings in each episode
for season_id in sorted(list(results.keys())):
    season = results[season_id]
    for episode_id in sorted(list(season.keys())):
        idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
        subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)
        try:
            gaps = calculate_gaps(subsets[-1])
            print(gaps)
            print()
            print(subsets)
            print("Season:", season_id, "|Episode:", episode_id, "|Subset Length:", len(subsets[-1]), "|Sum:", sum(gaps), "|Maximum:", max(gaps))
        except:
            print("Season:", season_id, "Episode:", episode_id, "Subset Length:", subsets)
        print('=='*50)

[4, 2, 1, 1, 1, 2, 2, 5, 3, 7, 1, 15, 2, 5, 3, 4, 2, 5, 7, 3, 7, 1, 2, 1, 2, 4, 3, 6, 2, 5, 1, 1, 2, 14, 5, 12, 6, 14, 3, 1, 2, 12, 5, 1, 6, 5, 1, 11, 1, 10, 1, 4, 1, 4, 2, 1, 5, 1, 2, 3, 1, 2, 5, 5, 7, 15, 1, 9, 6, 16, 1, 1, 2, 1, 2, 9, 1, 1, 1, 6, 1, 9, 1, 1, 2, 4, 8, 3, 1, 5, 2, 3, 1, 7, 12, 1, 1, 3, 4, 12, 1, 2, 7, 13, 5, 16]

[[1664580, 1664584, 1664586, 1664587, 1664588, 1664589, 1664591, 1664593, 1664598, 1664601, 1664608, 1664609, 1664624, 1664626, 1664631, 1664634, 1664638, 1664640, 1664645, 1664652, 1664655, 1664662, 1664663, 1664665, 1664666, 1664668, 1664672, 1664675, 1664681, 1664683, 1664688, 1664689, 1664690, 1664692, 1664706, 1664711, 1664723, 1664729, 1664743, 1664746, 1664747, 1664749, 1664761, 1664766, 1664767, 1664773, 1664778, 1664779, 1664790, 1664791, 1664801, 1664802, 1664806, 1664807, 1664811, 1664813, 1664814, 1664819, 1664820, 1664822, 1664825, 1664826, 1664828, 1664833, 1664838, 1664845, 1664860, 1664861, 1664870, 1664876, 1664892, 1664893, 1664894, 1664896,

In [40]:
print(en_subtitle[3279030])

Yes, bravo on the hot nanny.


In [9]:
def fetch_friends_subsets(episode, en_subtitle, zh_subtitle, results, season_id, episode_id, bias):
    idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
    subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)[-1]
    # Calculate gaps within the subset
    gaps_subsets = calculate_gaps(subsets)

    # Prepare Subtitle Subset
    start = subsets[0] - bias
    end = subsets[-1] + bias
    en_subset = en_subtitle[start: end]
    zh_subset = zh_subtitle[start: end]

    # Prepare utterances of one episode
    friends_episode = []
    for item in episode:
        if int(item[1:3]) == season_id and int(item[5:]) == episode_id:
            for (sentence, id, speaker) in episode[item]:
                if transformation(sentence) in [' ', '']:
                    continue
                friends_episode.append([sentence, speaker])

    return en_subset, zh_subset, friends_episode

In [12]:
def look_alignment(friends, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, friends_episode) = fetch_friends_subsets(
        episode=friends,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, friends_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, friends_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, friends_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    return temp

In [36]:
further_alignment = {}
for i in range(12):
    for j in range(30):
        try:
            temp = look_alignment(friends, en_subtitle, zh_subtitle, results, i, j, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

In [34]:
(en_subset, zh_subset, friends_episode) = fetch_friends_subsets(
        episode=friends,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=9,
        episode_id=24,
        bias=200
    )

In [35]:
print(friends_episode)

[["Oh, ain't this nice? It's so quiet, I could just lie here all day.", ['Chandler Bing']], ['I know.', ['Monica Geller']], ['Open your drapes! Open your drapes!', ['Rachel Green']], ["I'm so glad we've got adjoining rooms!", ['Chandler Bing']], ['The sun is out!', ['Monica Geller']], ['Hey! Remember when I had corneas?', ['Chandler Bing']], ["Ok listen, you go down to the pool and reserve the chairs, and I'll get the magazines and the lotion.", ['Monica Geller']], ["Ladies? Ross's speech is in 45 minutes.", ['Chandler Bing']], ['Nooo!', ['Rachel Green']], ['Damn it!', ['Monica Geller']], ['Walls are pretty thin, guys!', ['Ross Geller']], ['Then we have to await the data from recent MRI scans and DNA testing which call into question information gathered from years of simple carbon dating.', ['Ross Geller']], ["Look at that woman sitting by the pool getting tan... so leathery and wrinkled, I'm so jealous!", ['Rachel Green']], ['Finally, factoring the profusion of new species recently di

In [31]:
for x in further_alignment:
    print(x, len(further_alignment[x][2]))

(1, 1) 113
(1, 2) 12
(1, 3) 10
(1, 4) 157
(1, 5) 135
(1, 6) 27
(1, 7) 11
(1, 8) 142
(1, 9) 44
(1, 10) 29
(1, 11) 21
(1, 12) 122
(1, 13) 36
(1, 14) 115
(1, 15) 55
(1, 16) 10
(1, 17) 107
(1, 18) 55
(1, 19) 87
(1, 20) 156
(1, 21) 1
(1, 22) 73
(1, 23) 1
(1, 24) 107
(2, 7) 1
(3, 2) 1
(3, 15) 1
(3, 16) 1
(4, 8) 1
(4, 9) 1
(4, 16) 1
(4, 19) 1
(4, 21) 2
(5, 6) 1
(5, 21) 1
(6, 4) 1
(6, 15) 1
(6, 19) 1
(7, 15) 1
(7, 24) 1
(9, 11) 2
(9, 12) 160
(9, 13) 1
(9, 15) 1
(9, 17) 1
(9, 18) 2
(9, 24) 4
(10, 8) 1
(10, 10) 1


In [26]:
for x in further_alignment:
    print(x, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))

(1, 1) 88 88 113 113
(1, 2) 9 9 12 12
(3, 2) 1 1 1 1


In [150]:
for x in further_alignment:
    print(x, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))

(0, 1) 88 88 113 113
(1, 0) 88 88 113 113
(1, 2) 88 88 113 113
(2, 0) 88 88 113 113
(2, 1) 88 88 113 113
(2, 2) 88 88 113 113


In [151]:
print(further_alignment[(0,1)])

[{0: [200], 1: [201], 2: [204], 4: [206], 5: [207, 208], 6: [209], 7: [211], 9: [213], 14: [218], 15: [220], 16: [221, 222, 223], 20: [226], 22: [228], 24: [229], 27: [232], 29: [235], 31: [238], 32: [239], 33: [242, 243], 34: [244, 245], 35: [246], 36: [248, 249], 38: [251, 254], 40: [258], 41: [260], 44: [265], 45: [267], 48: [272], 49: [273, 274, 275, 278, 280, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 55: [292, 293], 56: [295], 57: [296], 60: [301, 303, 306], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313], 65: [316], 69: [322], 70: [325, 326], 72: [330], 73: [331], 78: [337], 84: [343], 87: [346], 88: [347], 89: [349], 91: [352], 95: [360], 98: [363, 364], 100: [366], 101: [367], 102: [369, 370], 103: [375], 107: [380], 108: [381, 382], 109: [386, 387, 388, 389], 113: [393], 118: [398, 399], 120: [401], 121: [403], 127: [408, 410], 128: [411], 135: [421, 422, 423, 424, 426, 427], 136: [429], 139: [431], 140: [432, 433, 434, 435, 436], 141: [438, 439, 440], 142: [4

In [152]:
print(further_alignment[(1,0)])

[{0: [200], 1: [201], 2: [204], 4: [206], 5: [207, 208], 6: [209], 7: [211], 9: [213], 14: [218], 15: [220], 16: [221, 222, 223], 20: [226], 22: [228], 24: [229], 27: [232], 29: [235], 31: [238], 32: [239], 33: [242, 243], 34: [244, 245], 35: [246], 36: [248, 249], 38: [251, 254], 40: [258], 41: [260], 44: [265], 45: [267], 48: [272], 49: [273, 274, 275, 278, 280, 282, 283, 284, 285], 50: [286], 51: [287], 52: [288], 55: [292, 293], 56: [295], 57: [296], 60: [301, 303, 306], 61: [308], 62: [309, 310], 63: [311], 64: [312, 313], 65: [316], 69: [322], 70: [325, 326], 72: [330], 73: [331], 78: [337], 84: [343], 87: [346], 88: [347], 89: [349], 91: [352], 95: [360], 98: [363, 364], 100: [366], 101: [367], 102: [369, 370], 103: [375], 107: [380], 108: [381, 382], 109: [386, 387, 388, 389], 113: [393], 118: [398, 399], 120: [401], 121: [403], 127: [408, 410], 128: [411], 135: [421, 422, 423, 424, 426, 427], 136: [429], 139: [431], 140: [432, 433, 434, 435, 436], 141: [438, 439, 440], 142: [4

In [135]:
further_alignment = {}
for i in tqdm(range(3)):
    for j in tqdm(range(3)):
        try:
            temp = look_alignment(friends, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/3 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<00:00, 733.31it/s]


Yes
Yes
Yes



100%|██████████| 3/3 [00:00<00:00, 1335.77it/s]


Yes
Yes
Yes



100%|██████████| 3/3 [00:00<00:00, 522.63it/s]
100%|██████████| 3/3 [00:00<00:00, 116.33it/s]

Yes
Yes
Yes





In [127]:
print(further_alignment)

{}


In [26]:
for x in further_alignment:
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=x[0]+1,
        episode_id=x[1]+1,
        bias=200
    )
    total = len(tbbt_episode)
    print(x, total, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))
    # print(further_alignment[x])
    # print("=="*50)

(0, 0) 133 80 80 100 100
(0, 1) 84 51 51 61 61
(0, 2) 118 52 52 63 63
(0, 3) 76 49 49 56 56
(0, 4) 98 17 17 17 17
(0, 5) 90 54 54 69 69
(0, 6) 82 51 51 61 61
(0, 7) 93 27 27 35 35
(0, 8) 56 11 11 12 12
(0, 9) 71 43 43 51 51
(0, 10) 44 29 29 31 31
(0, 11) 77 19 19 26 26
(0, 12) 88 48 48 60 60
(0, 13) 76 46 46 58 58
(0, 14) 84 42 42 56 56
(0, 15) 48 27 27 35 35
(1, 0) 77 54 54 65 65
(1, 1) 44 26 26 31 31
(1, 2) 59 36 36 43 43
(1, 3) 74 47 47 60 60
(1, 4) 62 18 18 21 21
(1, 5) 75 39 39 49 49
(1, 6) 82 33 33 41 41
(1, 7) 49 32 32 37 37
(1, 8) 49 8 8 8 8
(1, 9) 35 23 23 25 25
(1, 10) 62 16 16 21 21
(1, 11) 63 46 46 52 52
(1, 12) 48 27 27 38 38
(1, 13) 57 19 19 22 22
(1, 14) 61 36 36 45 45
(1, 15) 93 51 51 67 67
(1, 16) 51 34 34 42 42
(1, 17) 44 29 29 31 31
(1, 18) 79 49 49 64 64
(1, 19) 74 33 33 43 43
(1, 20) 91 47 47 63 63
(1, 21) 86 51 51 67 67
(1, 22) 56 36 36 44 44
(2, 0) 91 58 58 71 71
(2, 1) 72 37 37 49 49
(2, 2) 79 37 37 53 53
(2, 3) 82 54 54 65 65
(2, 4) 62 34 34 50 50
(2, 5) 68 37 

In [13]:
further_alignment[(i,j)] = temp

In [14]:
print(further_alignment)

{(1, 2): [{2: [200], 3: [201], 4: [204], 8: [209, 210, 211], 9: [212], 11: [215, 217, 218], 15: [225], 16: [226], 18: [228], 19: [229], 22: [248], 23: [264, 265], 25: [268], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [441], 57: [443], 61: [462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 15: [225], 16: [226], 18: [227, 228], 19: [229], 22: [248], 23: [264, 265, 266], 25: [268, 269], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [340, 341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [440, 441], 57: [443], 61: [461, 462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 14: [221], 15: [225]

In [13]:
with open('further_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

## Check the further alignment

In [114]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [17]:
alignment = further_alignment

In [18]:
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][2]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [None]:
"""
Check alignment based on Season-Episode
Write into xlsx file
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [109]:
"""
Check alignment based on Season-Episode
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [105]:
# Show TBBT Episode
for i, x in enumerate(friends_episode):
    print(x)

["There's nothing to tell! He's just some guy I work with!", ['Monica Geller']]
["C'mon, you're going out with the guy! There's gotta be something wrong with him!", ['Joey Tribbiani']]
['All right Joey, be nice. So does he have a hump? A hump and a hairpiece?', ['Chandler Bing']]
['Wait, does he eat chalk?', ['Phoebe Buffay']]
["Just, 'cause, I don't want her to go through what I went through with Carl- oh!", ['Phoebe Buffay']]
["Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.", ['Monica Geller']]
['Sounds like a date to me.', ['Chandler Bing']]
["Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked.", ['Chandler Bing']]
['Oh, yeah. Had that dream.', ['#ALL#']]
["Then I look down, and I realize there's a phone... there.", ['Chandler Bing']]
['Instead of...?', ['Joey Tribbiani']]
["That's right.", ['Chandler Bing']]
['Never had that dream.', ['Joey Tribbiani']]
['N

In [106]:
for i, x in enumerate(en_subset):
    print(i, x)

0 "The project was created because the metaphors, the story line, that existed in our culture, for dying, in which dying was seen as a failure and the enemy, and an error of the universe somehow.
1 And therefore, a hospital intense of care ward was the ultimate inner chamber of the temple about life.
2 And the whole fear that surrounded the process of the doctor's seeing death as a failure.
3 We felt that it was needed for another metaphorical system."
4 "Please begin by picturing in front of you your client or some other person you know who are suffering right now.
5 And feel in an open and direct way, how this person actually suffers.
6 Feel the quality of it."
7 "The Dying project was created in order to create an environment, in which people that were deep in philosophical materialism could explore another metaphor for life and thus for death.
8 And the people we found most willing to do that, were people that were themselves facing death."
9 "What do you think gonna happen when yo

In [108]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子
