In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [23]:
with open('data/friends_transformed.pkl', 'rb') as f:
    friends = pkl.load(f)

### Start Alignment

In [24]:
# Load alignment results after stage-2
with open('alignment_results/indexs_friends.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [25]:
# Check all substrings in each episode
for season_id in sorted(list(results.keys())):
    season = results[season_id]
    for episode_id in sorted(list(season.keys())):
        idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
        subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)
        try:
            gaps = calculate_gaps(subsets[-1])
            print(gaps)
            print("Season:", season_id, "|Episode:", episode_id, "|Subset Length:", len(subsets[-1]), "|Sum:", sum(gaps), "|Maximum:", max(gaps))
        except:
            print("Season:", season_id, "Episode:", episode_id, "Subset Length:", subsets)
        print('=='*50)

[4, 2, 3, 1, 1, 2, 4, 1, 2, 3, 2, 3, 10, 3, 2, 2, 4, 3, 4, 2, 6, 6, 1, 1, 3, 4, 1, 3, 3, 1, 3, 2, 4, 4, 5, 1, 1, 2, 1, 4, 11, 1, 5, 12, 6, 13, 2, 1, 1, 2, 2, 5, 1, 4, 1, 3, 5, 1, 11, 1, 1, 8, 2, 1, 4, 1, 1, 4, 1, 1, 2, 4, 1, 2, 2, 1, 2, 3, 2, 5, 5, 10, 9, 2, 3, 1, 14, 1, 1, 2, 1, 2, 4, 3, 1, 1, 1, 6, 1, 9, 1, 3, 4, 1, 6, 2, 1, 2, 1, 3, 2, 1, 6, 14, 3, 2, 1, 3, 1, 11, 2, 5, 4, 5, 1, 7, 1, 6, 10]
Season: 1 |Episode: 1 |Subset Length: 130 |Sum: 436 |Maximum: 14
[4, 1, 2, 4, 2, 1, 3, 4, 4, 13, 6, 3, 1, 9, 1, 7, 4, 1, 1, 7, 1, 5, 4, 2, 2, 3, 7, 1, 3, 7, 2, 1, 1, 3, 1, 1, 3, 1, 1, 2, 1, 3, 2, 2, 2, 9, 1, 1, 8, 2, 1, 1, 1, 2, 2, 1, 13, 7, 7, 4, 29, 3, 11, 2, 5, 5, 9, 3, 6, 3, 5, 1, 14, 1]
Season: 1 |Episode: 2 |Subset Length: 75 |Sum: 296 |Maximum: 29
[3, 5, 10, 1, 6, 2, 9, 7, 15, 4, 3, 1, 2, 5, 1, 4, 1, 1, 8, 2, 8, 1, 1, 1, 4, 1, 1, 4, 7, 9, 7, 3, 1, 4, 3, 8, 1, 2, 6, 3, 7, 16, 15, 7, 1, 3, 1, 1, 4, 4, 5, 6, 4, 2, 2, 2, 2, 3, 4, 1, 8, 16, 1, 3, 2, 7, 8, 4, 1]
Season: 1 |Episode: 3 |Subset Le

In [45]:
def fetch_friends_subsets(episode, en_subtitle, zh_subtitle, results, season_id, episode_id, bias):
    idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
    subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)[-1]
    # Calculate gaps within the subset
    gaps_subsets = calculate_gaps(subsets)

    # Prepare Subtitle Subset
    start = subsets[0] - bias
    end = subsets[-1] + bias
    en_subset = en_subtitle[start: end]
    zh_subset = zh_subtitle[start: end]

    # Prepare utterances of one episode
    tbbt_episode = []
    for item in episode:
        if int(item[1:3]) == season_id and int(item[5:]) == episode_id:
            for x in episode[item]:
                print(x)
            print()
            sentences = episode[item]['sentences']
            speakers = episode[item]['speakers']
            for sentence, speaker in zip(sentences, speakers):
                tbbt_episode.append([sentence, speaker])

    return en_subset, zh_subset, tbbt_episode

In [46]:
# Fetch subset located by the stage-1 alignment
(en_subset, zh_subset, friends_episode) = fetch_friends_subsets(
    episode=friends,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=1,
    episode_id=1,
    bias=200
)

["There's nothing to tell! He's just some guy I work with!", 'there is nothing to tell he is just some guy i work with', 's01_e01_c01_u001']
["C'mon, you're going out with the guy! There's gotta be something wrong with him!", 'c amon you are going out with the guy there is gotta be something wrong with him', 's01_e01_c01_u002']
['All right Joey, be nice. So does he have a hump? A hump and a hairpiece?', 'all right joey be nice so does he have a hump a hump and a hairpiece', 's01_e01_c01_u003']
['Wait, does he eat chalk?', 'wait does he eat chalk', 's01_e01_c01_u004']
['', '', 's01_e01_c01_u005']
["Just, 'cause, I don't want her to go through what I went through with Carl- oh!", 'just cause i do not want her to go through what i went through with carl oh', 's01_e01_c01_u006']
["Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.", 'okay everybody relax this is not even a date it is just two people going out to dinner and not havi

TypeError: list indices must be integers or slices, not str

In [5]:
# Load Memor Dataset
with open('memor/data.json') as f:
    tbbt = json.load(f)

In [6]:
# Load alignment results after stage-2
with open('alignment_results/indexs_tbbt_zh.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [7]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=3,
        bias=200
    )

In [9]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)



    return temp

In [10]:
further_alignment = {}
for i in tqdm(range(10)):
    for j in tqdm(range(25)):
        try:
            temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            # print(temp)
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:02<00:50,  2.10s/it][A
  8%|▊         | 2/25 [00:03<00:36,  1.58s/it][A
 12%|█▏        | 3/25 [00:05<00:37,  1.68s/it][A
 16%|█▌        | 4/25 [00:06<00:30,  1.47s/it][A
 20%|██        | 5/25 [00:07<00:28,  1.43s/it][A
 24%|██▍       | 6/25 [00:08<00:26,  1.38s/it][A
 28%|██▊       | 7/25 [00:10<00:23,  1.31s/it][A
 32%|███▏      | 8/25 [00:11<00:22,  1.32s/it][A
 36%|███▌      | 9/25 [00:12<00:18,  1.17s/it][A
 40%|████      | 10/25 [00:13<00:16,  1.12s/it][A
 44%|████▍     | 11/25 [00:13<00:13,  1.04it/s][A
 48%|████▊     | 12/25 [00:14<00:13,  1.01s/it][A
 52%|█████▏    | 13/25 [00:16<00:13,  1.10s/it][A
 56%|█████▌    | 14/25 [00:17<00:12,  1.10s/it][A
 60%|██████    | 15/25 [00:18<00:11,  1.13s/it][A
100%|██████████| 25/25 [00:19<00:00,  1.29it/s][A
 10%|█         | 1/10 [00:19<02:54, 19.36s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [

In [26]:
for x in further_alignment:
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=x[0]+1,
        episode_id=x[1]+1,
        bias=200
    )
    total = len(tbbt_episode)
    print(x, total, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))
    # print(further_alignment[x])
    # print("=="*50)

(0, 0) 133 80 80 100 100
(0, 1) 84 51 51 61 61
(0, 2) 118 52 52 63 63
(0, 3) 76 49 49 56 56
(0, 4) 98 17 17 17 17
(0, 5) 90 54 54 69 69
(0, 6) 82 51 51 61 61
(0, 7) 93 27 27 35 35
(0, 8) 56 11 11 12 12
(0, 9) 71 43 43 51 51
(0, 10) 44 29 29 31 31
(0, 11) 77 19 19 26 26
(0, 12) 88 48 48 60 60
(0, 13) 76 46 46 58 58
(0, 14) 84 42 42 56 56
(0, 15) 48 27 27 35 35
(1, 0) 77 54 54 65 65
(1, 1) 44 26 26 31 31
(1, 2) 59 36 36 43 43
(1, 3) 74 47 47 60 60
(1, 4) 62 18 18 21 21
(1, 5) 75 39 39 49 49
(1, 6) 82 33 33 41 41
(1, 7) 49 32 32 37 37
(1, 8) 49 8 8 8 8
(1, 9) 35 23 23 25 25
(1, 10) 62 16 16 21 21
(1, 11) 63 46 46 52 52
(1, 12) 48 27 27 38 38
(1, 13) 57 19 19 22 22
(1, 14) 61 36 36 45 45
(1, 15) 93 51 51 67 67
(1, 16) 51 34 34 42 42
(1, 17) 44 29 29 31 31
(1, 18) 79 49 49 64 64
(1, 19) 74 33 33 43 43
(1, 20) 91 47 47 63 63
(1, 21) 86 51 51 67 67
(1, 22) 56 36 36 44 44
(2, 0) 91 58 58 71 71
(2, 1) 72 37 37 49 49
(2, 2) 79 37 37 53 53
(2, 3) 82 54 54 65 65
(2, 4) 62 34 34 50 50
(2, 5) 68 37 

In [13]:
further_alignment[(i,j)] = temp

In [14]:
print(further_alignment)

{(1, 2): [{2: [200], 3: [201], 4: [204], 8: [209, 210, 211], 9: [212], 11: [215, 217, 218], 15: [225], 16: [226], 18: [228], 19: [229], 22: [248], 23: [264, 265], 25: [268], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [441], 57: [443], 61: [462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 15: [225], 16: [226], 18: [227, 228], 19: [229], 22: [248], 23: [264, 265, 266], 25: [268, 269], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [340, 341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [440, 441], 57: [443], 61: [461, 462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 14: [221], 15: [225]

In [13]:
with open('further_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

## Check the further alignment

In [114]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [17]:
alignment = further_alignment

In [18]:
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][2]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [None]:
"""
Check alignment based on Season-Episode
Write into xlsx file
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [109]:
"""
Check alignment based on Season-Episode
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [110]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 4 no, i haven't.
1 2 get used to it.
2 |||| [196] 4 yeah, i probably won't, but... hey sheldon.
3 |||| [197] 1 hi.
4 |||| [198, 199] 4 hey raj!  still not talking to me, huh?
5 1 don't take it personally, it's his pathology, he can't talk to women.
6 |||| [201] 2 he can't talk to attractive women, or in your case a cheesecake-scented goddess!
7 |||| [202] 0 so, there's gonna be some furniture delivered?
8 |||| [217, 218, 219, 220] 1 oh no, let's assume that they can. lois lane is falling, accelerating at an initial rate of 32 feet per second per second. superman swoops down to save her by reaching out two arms of steel. miss lane, who is now travelling at approximately 120 miles per hour, hits them, and is immediately sliced into three equal pieces.
9 |||| [226, 228, 229] 1 are you listening to yourself, it is well established that superman's flight is a feat of strength, it is an extension of his ability to leap tall buildings, an ability he derives from earth's yellow sun.
10 |||| 

In [108]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子
