In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
print(transformation("agreed what’s your point"))
print(transformation("agreed what’s your point".replace("’", "'")))
print(transformation("agreed what's your point"))

agreed what’s your point
agreed what is your point
agreed what is your point


In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_fa/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_fa/fa_subtitles.pkl', 'rb') as f:
    fa_subtitle = pkl.load(f)

In [5]:
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

In [6]:
# Load alignment results after stage-2
with open('alignment_results/indexs_tbbt_fa.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [19]:
print(len(results))

9


In [20]:
temp = []
for i in sorted(list(results.keys())):
    for j in sorted(list(results[i].keys())):
        print(i, j, len(results[i][j]))
        temp.append((i,j))

1 1 686
1 2 145
1 3 113
1 4 137
1 5 91
1 6 234
1 7 136
1 8 806
1 9 82
1 10 417
1 11 40
1 12 167
1 13 146
1 14 2073
1 15 498
1 16 141
2 1 248
2 2 16
2 3 255
2 4 417
2 5 83
2 6 605
2 7 131
2 8 1200
2 9 139
2 10 455
2 11 112
2 12 378
2 13 12
2 14 258
2 15 298
2 16 462
2 17 54
2 18 24
2 19 19
2 20 47
2 21 601
2 22 582
2 23 19
3 1 169
3 2 345
3 3 76
3 4 1047
3 5 62
3 6 124
3 7 76
3 8 126
3 9 416
3 10 347
3 11 2512
3 12 583
3 13 30
3 14 499
3 15 236
3 16 61
3 17 381
3 18 450
3 19 382
3 20 62
3 21 880
3 22 286
3 23 86
4 1 130
4 2 86
4 3 435
4 4 58
4 5 300
4 6 437
4 7 92
4 8 63
4 9 713
4 10 84
4 11 357
4 12 89
4 13 148
4 14 415
4 15 360
4 16 480
4 17 28
4 18 14
4 19 510
4 20 772
4 21 127
4 22 241
4 23 182
4 24 138
5 1 762
5 2 70
5 3 43
5 4 36
5 5 441
5 6 31
5 7 69
5 8 133
5 9 711
5 10 671
5 11 382
5 12 241
5 13 331
5 14 855
5 15 168
5 16 928
5 17 554
5 18 178
5 19 106
5 20 276
5 21 363
5 22 1330
5 23 261
6 1 193
6 2 396
6 3 504
6 4 112
6 5 126
6 6 1996
6 7 192
6 8 57
6 9 213
6 10 567
6 11 27
6

In [17]:
print(temp)

[(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (3, 15), (3, 16), (3, 17), (3, 18), (3, 19), (3, 20), (3, 21), (3, 22), (3, 23), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9), (4, 10), (4, 11), (4, 12), (4, 13), (4, 14), (4, 15), (4, 16), (4, 17), (4, 18), (4, 19), (4, 20), (4, 21), (4, 22), (4, 23), (4, 24), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (5, 15), (5, 16), (5, 17), (5, 18), (5, 19), (5, 20), (5, 21), (5, 22), (5, 23), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8),

## Perform fine-grained alignment

### Part 0: Load Data

In [7]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=fa_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

In [8]:
print(en_subset)

['Pretty good. Uh, remember Nikki?', 'She-she came by today.', 'Uh-huh. Yeah.', "Yeah, she's getting married to some guy.", 'Yeah, I hope he has insurance.', 'Uh, Rebecca and Brian', '- are waiting for you. - Thank you.', "I'm gonna get Kelly.", 'Yeah, I am so glad you two are making such progress.', "I wasn't sure we'd make it this far.", "Yeah, I just decided to stop complaining about all the things I didn't have and start appreciating the things I did.", 'That is what I want to hear from both of you:', 'getting along and appreciating what you have.', 'So, how are things physically?', "Oh, well, um, well, well, we're not quite having sex yet.", "Um, but-but, hey, we are getting along, and-and it's not all about sex, right?", "Yeah, I-I, I'm not complaining.", "Uh, I'm just trying to appreciate the non-sex that we're having.", "You know, you don't have to have sex to show somebody you love them.", 'What you writing?', "Guys, I am so sorry I'm late.", 'Something came up at the office.'

#### Part 1: String Match with sliding window

In [26]:
# Part 1: String Match with sliding window
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [27]:
result_0_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=fa_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=9)
            result_0_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 48 Subtitle Number: 42
Season: 1 Episode: 2 Episode Number: 55 Subtitle Number: 44
Season: 1 Episode: 3 Episode Number: 37 Subtitle Number: 36
Season: 1 Episode: 4 Episode Number: 52 Subtitle Number: 42
Season: 1 Episode: 5 Episode Number: 50 Subtitle Number: 42
Season: 1 Episode: 6 Episode Number: 49 Subtitle Number: 48
Season: 1 Episode: 7 Episode Number: 53 Subtitle Number: 50
Season: 1 Episode: 8 Episode Number: 64 Subtitle Number: 54
Season: 1 Episode: 9 Episode Number: 40 Subtitle Number: 35
Season: 1 Episode: 10 Episode Number: 61 Subtitle Number: 53
Season: 1 Episode: 11 Episode Number: 35 Subtitle Number: 34
Season: 1 Episode: 12 Episode Number: 69 Subtitle Number: 57
Season: 1 Episode: 13 Episode Number: 58 Subtitle Number: 53
Season: 1 Episode: 14 Episode Number: 61 Subtitle Number: 52
Season: 1 Episode: 15 Episode Number: 55 Subtitle Number: 49
Season: 1 Episode: 16 Episode Number: 57 Subtitle Number: 49
Season: 2 Episode: 4 Episode Numb

In [28]:
with open('alignment_results/fa/alignment_part_1_string_match.pkl', 'wb') as f:
    pkl.dump(result_0_all, f)

In [29]:
with open('alignment_results/fa/alignment_part_1_string_match.pkl', 'rb') as f:
    result_0_all = pkl.load(f)

### Filter the indexs obtained using sliding window string match

In [90]:
temp = result_0_all[(1,4)]
for x in temp:
    print(x, temp[x])

194 {2}
195 {2}
205 {10}
211 {16}
223 {25}
236 {35}
242 {42}
248 {46}
249 {46}
250 {46}
257 {52}
272 {64}
275 {67}
287 {77}
291 {79}
292 {79}
295 {80}
300 {84}
306 {92}
313 {96}
314 {97}
317 {99}
322 {103}
336 {111}
345 {119}
359 {129}
371 {137}
373 {139}
385 {151}
386 {152}
389 {153}
390 {153}
392 {153}
406 {159}
407 {159}
408 {159}
411 {161}
425 {167}
428 {168}
434 {173}
436 {174}
441 {179}
444 {182}
454 {190}
459 {193}
461 {196}
462 {196}
468 {203}
469 {203}
474 {206}
483 {215}


### Part 2: Strict Match

In [327]:
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=fa_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )

In [328]:
for x in tbbt_episode:
    print(x)

[' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'a', ' ', 'p', 'l', 'a', 'n', 'e', ' ', 'w', 'i', 't', 'h', ' ', 't', 'w', 'o', ' ', 's', 'l', 'i', 't', 's', ' ', 'i', 'n', ' ', 'i', 't', ' ', 'a', 'n', 'd', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'i', 's', ' ', 'o', 'b', 's', 'e', 'r', 'v', 'e', 'd', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'n', 'o', 't', ' ', 'g', 'o', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'b', 'o', 't', 'h', '.', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u',

In [329]:
for x in tbbt_transcripts[(1,1)]:
    print(x)

[' A corridor at a sperm bank.', 'Scene']
[' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't', 'e', 'd', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'a', ' ', 'p', 'l', 'a', 'n', 'e', ' ', 'w', 'i', 't', 'h', ' ', 't', 'w', 'o', ' ', 's', 'l', 'i', 't', 's', ' ', 'i', 'n', ' ', 'i', 't', ' ', 'a', 'n', 'd', ' ', 'e', 'i', 't', 'h', 'e', 'r', ' ', 'i', 's', ' ', 'o', 'b', 's', 'e', 'r', 'v', 'e', 'd', ' ', 'i', 't', ' ', 'w', 'i', 'l', 'l', ' ', 'n', 'o', 't', ' ', 'g', 'o', ' ', 't', 'h', 'r', 'o', 'u', 'g', 'h', ' ', 'b', 'o', 't', 'h', '.', 'I', 'f', ' ', 'a', ' ', 'p', 'h', 'o', 't', 'o', 'n', ' ', 'i', 's', ' ', 'd', 'i', 'r', 'e', 'c', 't

In [32]:
def exact_match(en_subset, episode):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        if len(subtitle.strip().split(" ")) <=5:
            continue
        # Exact Match for short sentences
        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            if subtitle == utt:
                if i not in res:
                    res[i] = set()
                    res[i].add(j)
                else:
                    res[i].add(j)
    output = {}
    for x in res:
        output[x] = sorted(list(res[x]))

    return output

In [33]:
result_1_all = {}
for i in range(12):
    for j in range(30):
        try:
            (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
                episode=tbbt_transcripts,
                en_subtitle=en_subtitle,
                zh_subtitle=fa_subtitle,
                results=results,
                season_id=i+1,
                episode_id=j+1,
                bias=200
            )
            temp = exact_match(en_subset, tbbt_episode)
            result_1_all[(i+1,j+1)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(turn_sub2epi_into_epi2sub(temp)), "Subtitle Number:",len(temp))
        except:
            pass

Season: 1 Episode: 1 Episode Number: 32 Subtitle Number: 32
Season: 1 Episode: 2 Episode Number: 21 Subtitle Number: 21
Season: 1 Episode: 3 Episode Number: 8 Subtitle Number: 8
Season: 1 Episode: 4 Episode Number: 14 Subtitle Number: 14
Season: 1 Episode: 5 Episode Number: 16 Subtitle Number: 16
Season: 1 Episode: 6 Episode Number: 26 Subtitle Number: 26
Season: 1 Episode: 7 Episode Number: 22 Subtitle Number: 22
Season: 1 Episode: 8 Episode Number: 26 Subtitle Number: 26
Season: 1 Episode: 9 Episode Number: 18 Subtitle Number: 18
Season: 1 Episode: 10 Episode Number: 24 Subtitle Number: 24
Season: 1 Episode: 11 Episode Number: 22 Subtitle Number: 22
Season: 1 Episode: 12 Episode Number: 25 Subtitle Number: 26
Season: 1 Episode: 13 Episode Number: 28 Subtitle Number: 28
Season: 1 Episode: 14 Episode Number: 18 Subtitle Number: 18
Season: 1 Episode: 15 Episode Number: 18 Subtitle Number: 18
Season: 1 Episode: 16 Episode Number: 22 Subtitle Number: 22
Season: 2 Episode: 4 Episode Number

In [34]:
with open('alignment_results/fa/alignment_part_2_strict_match.pkl', 'wb') as f:
    pkl.dump(result_1_all, f)

In [35]:
with open('alignment_results/fa/alignment_part_2_strict_match.pkl', 'rb') as f:
    result_1_all = pkl.load(f)

In [39]:
for x in result_1_all:
    print(x)
    print(result_1_all[x])
    print()

(1, 1)
{217: [9], 242: [28], 250: [35], 267: [63], 269: [65], 294: [88], 295: [89], 296: [90], 314: [108], 326: [117], 373: [155], 390: [164], 395: [171], 402: [176], 403: [177], 409: [183], 420: [193], 427: [201], 432: [207], 439: [210], 444: [215], 457: [228], 470: [237], 473: [240], 478: [242], 482: [245], 509: [270], 514: [275], 539: [295], 542: [297], 548: [302], 550: [304]}

(1, 2)
{249: [64], 261: [71], 262: [72], 288: [90], 293: [94], 311: [117], 315: [120], 316: [121], 324: [129], 337: [142], 343: [147], 352: [157], 354: [159], 359: [161], 405: [186], 410: [190], 434: [208], 440: [213], 448: [219], 451: [221], 466: [232]}

(1, 3)
{252: [60], 272: [77], 281: [84], 301: [100], 305: [103], 430: [192], 433: [194], 485: [239]}

(1, 4)
{200: [7], 222: [19], 231: [25], 273: [63], 329: [104], 334: [108], 338: [111], 351: [121], 371: [138], 376: [142], 378: [146], 399: [158], 438: [180], 440: [182]}

(1, 5)
{209: [17], 216: [22], 218: [24], 227: [33], 238: [41], 259: [57], 284: [79], 3

In [148]:
temp = result_0_all[(1,3)]
for x in temp:
    print(x, temp[x])
    print(en_subset[x])
    for item in temp[x]:
        print(tbbt_episode[item])
    print("=="*50)

192 {3}
This is what the last 97 hours have been about.
[' Don’t panic, this is what the last 97 hours have been about.', 'Leonard']
243 {42}
Civil servants have a documented propensity to, you know, snap, so...
[' Oh no, that’s probably not such a good idea. Civil servants have a documented propensity to, you know, snap. ', 'Leonard']
256 {51}
Don't tell me that your hopeless infatuation is devolving into pointless jealousy.
[' Please don’t tell me that your hopeless infatuation is devolving into pointless jealousy.', 'Sheldon']
261 {55}
At least now you can retrieve the black box from the smoldering wreckage that was once your fantasy of dating her and analyze the data so that you don't crash into geek mountain again.
[' Well, at least now you can retrieve the black box from the twisted smouldering wreckage that was once your fantasy of dating her, and analyse the data so that you don’t crash into geek mountain again.', 'Sheldon']
263 {56}
A relentless pursuit that only ends when she

In [36]:
temp = result_1_all[(1,3)]
for x in temp:
    print(x, temp[x])
    print(en_subset[x])
    for item in temp[x]:
        print(tbbt_episode[item])
    print("=="*50)

252 [60]
"and thought you'd like to have it.
[' I know, she’s so thoughtful.', 'Raj']
272 [77]
What a nice gift for an astrophysicist.
[' But which one of us should be the party who…', 'Sheldon']
281 [84]
Well, our choices are we do this with the university or we don't do it at all.
[' Yeah, yeah, and beauty’s on the inside, size doesn’t matter, how much she spend?', 'Penny']
301 [100]
Ooh, let's find out.
[' Um, you think you’d want to grab a cup of coffee? Hello?', 'Emily']
305 [103]
Oh, my.
[' I’m sorry, I don’t think that’s a good idea.', 'Raj']
430 [192]
Well, I believe I'm treating you generously.
[' Oh. Okay. I understand.', 'Emily']
433 [194]
No, I am not saying that, because I kept saying that this morning and Leonard said, "Stop saying that""
[' Okay, I’ll stop.', 'Emily']
485 [239]
Sheldon, that's so nice.


IndexError: list index out of range

### Converge part 1 and part 2 result as the seeds

In [40]:
def merge_two_dict(dict_1, dict_2):
    res = deepcopy(dict_1)
    for sea_epi in dict_2:
        if sea_epi not in res:
            res[sea_epi] = dict_2[sea_epi]
        else:
            temp = {}
            for x in res[sea_epi]:
                temp[x] = list(res[sea_epi][x])
            # add dict 2
            if sea_epi in dict_2:
                for x in dict_2[sea_epi]:
                    if x not in temp:
                        temp[x] = sorted(list(set(dict_2[sea_epi][x])))
                    else:
                        temp[x].extend(dict_2[sea_epi][x])
                    temp[x] = sorted(list(set(temp[x])))
            res[sea_epi] = temp

    output = {}
    for x in res:
        if res[x]!={}:
            output[x] = res[x]
    return output

In [41]:
# Perform index filtering on the alignment seeds
"""
Filter indexs based on the index before and after
"""
def filter_by_idx(sub2epi):
    paris = []
    for x in sorted(list(sub2epi.keys())):
        for y in sorted(sub2epi[x]):
            paris.append([x,y])

    res = [paris[0]]
    for i in range(1, len(paris)-1):
        former = res[-1]
        current = paris[i]
        after = paris[i+1]
        if former[0] <= current[0] <= after[0]:
            if former[1]<=current[1]<=after[1]:
                res.append(current)
    if paris[-1][0] >= res[-1][0]:
        if paris[-1][1] >= res[-1][1]:
            res.append(paris[-1])

    output = {}
    for x in res:
        sub = x[0]
        epi = x[1]
        if sub not in output:
            output[sub] = [epi]
        else:
            output[sub].append(epi)

    return output

In [42]:
alignment_seeds = {}
temp = merge_two_dict(result_0_all, result_1_all)
for x in temp:
    alignment_seeds[x] = filter_by_idx(temp[x])

In [44]:
with open('alignment_results/fa/alignment_seeds.pkl', 'wb') as f:
    pkl.dump(alignment_seeds, f)

In [45]:
with open('alignment_results/fa/alignment_seeds.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [72]:
for x in alignment_seeds:
    print(x)
    print(alignment_seeds[x])
    print()

(1, 1)
{200: [0], 217: [9], 229: [15], 231: [16], 234: [17], 242: [28], 248: [34], 250: [35], 267: [63], 269: [65], 284: [75], 289: [82], 290: [82], 294: [88], 295: [89], 296: [90], 303: [96], 309: [102], 314: [108], 322: [116], 324: [116], 326: [117], 342: [134], 343: [135], 358: [145], 361: [148], 370: [154], 373: [155], 379: [160], 384: [161], 385: [161], 388: [162], 390: [164], 395: [171], 398: [173], 401: [175], 402: [176], 403: [177], 409: [183], 412: [186], 420: [193], 427: [201], 432: [207], 433: [208], 439: [210], 444: [215], 456: [226], 457: [228], 459: [230], 461: [230], 462: [230], 464: [232], 470: [237], 473: [240], 476: [241], 478: [242], 482: [245], 496: [258], 497: [259], 501: [262], 509: [270], 514: [275], 519: [280], 532: [288], 539: [295], 541: [296], 542: [297], 548: [302], 550: [304], 556: [309], 558: [311], 559: [311], 561: [314]}

(1, 2)
{167: [2], 172: [7], 173: [7], 186: [18], 188: [20], 193: [25], 209: [39], 213: [42], 220: [46], 221: [46], 222: [46], 228: [49

In [73]:
temp = []
for x in alignment_seeds:
    temp.append(x)

In [74]:
print(temp)

[(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (2, 4), (2, 6), (2, 8), (2, 9), (3, 21), (3, 23), (4, 1), (4, 3), (4, 4), (4, 5), (4, 6), (4, 8), (4, 16), (4, 19), (4, 21), (4, 22), (4, 23), (4, 24), (5, 2), (5, 6), (5, 12), (5, 14), (5, 15), (5, 20), (5, 23), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (6, 14), (6, 15), (6, 16), (6, 17), (6, 19), (6, 20), (6, 21), (6, 24), (7, 1), (7, 2), (7, 6), (7, 10), (7, 11), (7, 15), (7, 16), (7, 17), (7, 18), (7, 19), (7, 20), (7, 22), (7, 23), (8, 1), (8, 2), (8, 3), (8, 5), (8, 6), (8, 7), (8, 9), (8, 11), (8, 12), (8, 14), (8, 15), (8, 16), (8, 17), (8, 18), (8, 19), (8, 20), (8, 21), (8, 22), (8, 23), (8, 24), (9, 1), (9, 3), (9, 4), (9, 7), (9, 8), (9, 9), (9, 10), (9, 11), (9, 12), (9, 16), (9, 17), (9, 18)]


### Expand alignment from seeds

In [47]:
def extend_neighbors(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        epi = transformation(episode[epi_id][0])

        if sub_former in epi:
            sub_ids.append(sub_id_former)
            # print(epi_id, sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
            # print(epi_id, sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [48]:
def add_strict_match_within_gaps(gaps, epi2sub):
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for sub_id in sub_ids:
            sub = transformation(en_subset[sub_id].replace("’", "'"))
            for epi_id in epi_ids:
                epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
                if len(epi.strip().split(" "))<=2:
                    continue
                if sub == epi:

                    if epi_id not in epi2sub:
                        epi2sub[epi_id] = [sub_id]
                    else:
                        epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [49]:
def get_optimal_wer_from_episode(ground_truth, hypothesis_pool):
    scores = []
    for i, hypothesis in enumerate(hypothesis_pool):
        scores.append(jiwer.compute_measures(ground_truth, hypothesis)['wer'])
    return min(scores), hypothesis_pool[scores.index(min(scores))], ground_truth, scores.index(min(scores))

In [50]:
def add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]

        for epi_id in epi_ids:
            best_score = 100
            best_pair = [None, None]
            epi = transformation(tbbt_episode[epi_id][0].replace("’", "'"))
            if len(epi.strip().split(" "))<=2:
                continue
            for sub_id in sub_ids:
                sub = transformation(en_subset[sub_id].replace("’", "'"))
                score = jiwer.compute_measures(epi, sub)['wer']
                if score < best_score:
                    best_score = score
                    best_pair = [epi_id, sub_id]
            if best_score < 0.15:
                count += 1
                if best_pair[0] not in epi2sub:
                    epi2sub[best_pair[0]] = [best_pair[1]]
                else:
                    epi2sub[best_pair[0]].append([best_pair[1]])
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))

    return output

In [51]:
def add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode):
    count = 0
    pattern = r'\.|\?|\!|\;|- '
    temp_sub2epi = {}
    for gap in gaps:
        # Build substrings
        sub_ids = gap[1]
        epi_ids = gap[0]
        sub_lists = []
        epi_lists = []

        for epi_id in epi_ids:
            epi = tbbt_episode[epi_id][0].replace("’", "'")
            epi_substring = re.split(pattern, epi)
            for item in epi_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                epi_lists.append([temp_item, epi_id])

        for sub_id in sub_ids:
            sub = en_subset[sub_id].replace("’", "'")
            sub_substring = re.split(pattern, sub)
            for item in sub_substring:
                if len(item.strip().split(" "))<=2:
                    continue
                temp_item = transformation(item.replace("-", " ").strip())
                sub_lists.append([temp_item, sub_id])


        # Calculate WER Similarity
        for (sub, sub_id) in sub_lists:
            for (epi, epi_id) in epi_lists:
                cer = jiwer.cer(epi, sub)
                if cer <= 0.2:
                    count += 1
                    if sub_id not in temp_sub2epi:
                        temp_sub2epi[sub_id] = set()
                        temp_sub2epi[sub_id].add(epi_id)
                    else:
                        temp_sub2epi[sub_id].add(epi_id)

    for sub_id in temp_sub2epi:
        epi_ids = list(temp_sub2epi[sub_id])
        if len(epi_ids)!=1:
            continue
        epi_id = epi_ids[0]
        if epi_id not in epi2sub:
            epi2sub[epi_id] = [sub_id]
        else:
            epi2sub[epi_id].append(sub_id)

    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [52]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [53]:
def extend_neighbors_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(epi2sub_alignment_2[epi_id])
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [54]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_neighbors_subtitle_sliding(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [71]:
print(len(alignment_seeds))

102


In [69]:
print(turn_sub2epi_into_epi2sub(alignment_seeds[(1,1)]))

{0: [200], 9: [217], 15: [229], 16: [231], 17: [234], 28: [242], 34: [248], 35: [250], 63: [267], 65: [269], 75: [284], 82: [289, 290], 88: [294], 89: [295], 90: [296], 96: [303], 102: [309], 108: [314], 116: [322, 324], 117: [326], 134: [342], 135: [343], 145: [358], 148: [361], 154: [370], 155: [373], 160: [379], 161: [384, 385], 162: [388], 164: [390], 171: [395], 173: [398], 175: [401], 176: [402], 177: [403], 183: [409], 186: [412], 193: [420], 201: [427], 207: [432], 208: [433], 210: [439], 215: [444], 226: [456], 228: [457], 230: [459, 461, 462], 232: [464], 237: [470], 240: [473], 241: [476], 242: [478], 245: [482], 258: [496], 259: [497], 262: [501], 270: [509], 275: [514], 280: [519], 288: [532], 295: [539], 296: [541], 297: [542], 302: [548], 304: [550], 309: [556], 311: [558, 559], 314: [561]}


In [70]:
for x in tbbt_episode:
    print(x)

[' Okay, I gotta ask, why are you wearing a bow tie?', 'Howard']
[' I’ve never applied for a patent before. I wanted to make a good impression.', 'Sheldon']
[' Is the impression that your first name is Pee-Wee?', 'Howard']
[' Yeah, well, you’re an engineer. End of joke, burn.', 'Sheldon']
[' Come on in, fellas.', 'Attorney']
[' See, he’s not wearing a tie.', 'Howard']
[' Well, he’s a patent attorney. Maybe his tie is pending.', 'Leonard']
[' So, I’ve reviewed your paperwork, and it seems like we’ve got everything we need to file a patent for your infinite persistence gyroscope.', 'Attorney']
[' That’s great.', 'Sheldon']
[' Excellent.', 'Howard']
[' So what happens next?', 'Leonard']
[' Well, the legal team needs to review existing patents to avoid overlap.', 'Attorney']
[' Oh, I don’t think there will be.', 'Howard']
[' Yeah, we did our own search.', 'Leonard']
[' That’s nice, but I think ours might be a bit more thorough.', 'Attorney']
[' Get a load of this guy.', 'Sheldon']
[' Can y

In [67]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(1,1)]), tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

# Extend within gap using strict string match
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

# Extend within gap using wer
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

# Extend within gap using substring cer
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

# Add within the gap
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_one_size_gap(gaps, epi2sub)

IndexError: list index out of range

In [61]:
def get_alignment(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, turn_sub2epi_into_epi2sub(alignment_seeds[(season_id,episode_id)]), tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)

    # Extend within gap using strict string match
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_strict_match_within_gaps(gaps, epi2sub)

    # Extend within gap using wer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Extend within gap using substring cer
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_wer_substring_match_within_gaps(gaps, epi2sub, en_subset, tbbt_episode)

    # Add within the gap
    gaps, abandons = get_subset_in_gaps(epi2sub)
    epi2sub = add_one_size_gap(gaps, epi2sub)

    # Further extend neighbors
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
    epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)

    return epi2sub

In [62]:
print(alignment_seeds.keys())
print(len(alignment_seeds))

dict_keys([(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (2, 4), (2, 6), (2, 8), (2, 9), (3, 21), (3, 23), (4, 1), (4, 3), (4, 4), (4, 5), (4, 6), (4, 8), (4, 16), (4, 19), (4, 21), (4, 22), (4, 23), (4, 24), (5, 2), (5, 6), (5, 12), (5, 14), (5, 15), (5, 20), (5, 23), (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9), (6, 14), (6, 15), (6, 16), (6, 17), (6, 19), (6, 20), (6, 21), (6, 24), (7, 1), (7, 2), (7, 6), (7, 10), (7, 11), (7, 15), (7, 16), (7, 17), (7, 18), (7, 19), (7, 20), (7, 22), (7, 23), (8, 1), (8, 2), (8, 3), (8, 5), (8, 6), (8, 7), (8, 9), (8, 11), (8, 12), (8, 14), (8, 15), (8, 16), (8, 17), (8, 18), (8, 19), (8, 20), (8, 21), (8, 22), (8, 23), (8, 24), (9, 1), (9, 3), (9, 4), (9, 7), (9, 8), (9, 9), (9, 10), (9, 11), (9, 12), (9, 16), (9, 17), (9, 18)])
102


In [65]:
for x in tbbt_transcripts:
    print(x)

(1, 1)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 10)
(1, 11)
(1, 12)
(1, 13)
(1, 14)
(1, 15)
(1, 16)
(1, 17)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(2, 5)
(2, 6)
(2, 7)
(2, 8)
(2, 9)
(2, 10)
(2, 11)
(2, 12)
(2, 13)
(2, 14)
(2, 15)
(2, 16)
(2, 17)
(2, 18)
(2, 19)
(2, 20)
(2, 21)
(2, 22)
(2, 23)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(3, 5)
(3, 6)
(3, 7)
(3, 8)
(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)
(3, 14)
(3, 15)
(3, 16)
(3, 17)
(3, 18)
(3, 19)
(3, 20)
(3, 21)
(3, 22)
(3, 23)
(4, 1)
(4, 2)
(4, 3)
(4, 4)
(4, 5)
(4, 6)
(4, 7)
(4, 8)
(4, 9)
(4, 10)
(4, 11)
(4, 12)
(4, 13)
(4, 14)
(4, 15)
(4, 16)
(4, 17)
(4, 18)
(4, 19)
(4, 20)
(4, 21)
(4, 22)
(4, 23)
(4, 24)
(5, 1)
(5, 2)
(5, 3)
(5, 4)
(5, 5)
(5, 6)
(5, 7)
(5, 8)
(5, 9)
(5, 10)
(5, 11)
(5, 12)
(5, 13)
(5, 14)
(5, 15)
(5, 16)
(5, 17)
(5, 18)
(5, 19)
(5, 20)
(5, 21)
(5, 22)
(5, 23)
(5, 24)
(6, 1)
(6, 2)
(6, 3)
(6, 4)
(6, 5)
(6, 6)
(6, 7)
(6, 8)
(6, 9)
(6, 10)
(6, 11)
(6, 12)
(6, 13)
(6, 14)
(6, 15)
(6, 16)
(6, 17)
(6, 18)
(6, 19)
(6, 20)
(6, 21

In [64]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    print(i, j)
    temp = get_alignment(tbbt_transcripts, en_subtitle, fa_subtitle, results, i, j, 200)
    further_alignment[(i,j)] = temp
    print("Season:", i,"Episode:", j, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))

1 1


IndexError: list index out of range

In [517]:
further_alignment = {}
for i in tqdm(range(3)):
    for j in tqdm(range(4)):
        try:
            temp = get_alignment(tbbt_transcripts, en_subtitle, fa_subtitle, alignment_seeds, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/3 [00:00<?, ?it/s]
100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A

100%|██████████| 4/4 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:00<00:00, 125.33it/s]


In [404]:
with open('final_stage_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

In [10]:
with open('final_stage_alignment.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [11]:
for x in further_alignment:
    print(x)
    print(further_alignment[x])

(0, 0)
{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 13

In [80]:
alignment_seeds = {}
for i, j in further_alignment:
    alignment_seeds[(i+1, j+1)] = further_alignment[(i, j)]

In [28]:
for item in alignment_seeds:
    print(item, len(alignment_seeds[item]))
    # print(alignment_seeds[item])
    # print('=='*50)

(1, 1) 226
(1, 2) 161
(1, 3) 163
(1, 4) 143
(1, 5) 144
(1, 6) 165
(1, 7) 166
(1, 8) 195
(1, 9) 146
(1, 10) 144
(1, 11) 164
(1, 12) 95
(1, 13) 145
(1, 14) 151
(1, 15) 157
(1, 16) 144
(2, 1) 161
(2, 2) 166
(2, 3) 152
(2, 4) 162
(2, 5) 138
(2, 6) 171
(2, 7) 173
(2, 8) 135
(2, 9) 150
(2, 10) 144
(2, 11) 162
(2, 12) 149
(2, 13) 156
(2, 14) 190
(2, 15) 183
(2, 16) 183
(2, 17) 169
(2, 18) 142
(2, 19) 166
(2, 20) 164
(2, 21) 166
(2, 22) 148
(2, 23) 168
(3, 1) 156
(3, 2) 158
(3, 3) 157
(3, 4) 175
(3, 5) 175
(3, 6) 156
(3, 7) 185
(3, 8) 149
(3, 9) 155
(3, 10) 191
(3, 11) 160
(3, 12) 158
(3, 13) 133
(3, 14) 141
(3, 15) 159
(3, 16) 161
(3, 17) 145
(3, 18) 136
(3, 19) 173
(3, 20) 196
(3, 21) 185
(3, 22) 180
(3, 23) 181
(4, 1) 192
(4, 2) 175
(4, 3) 195
(4, 4) 178
(4, 5) 175
(4, 6) 182
(4, 7) 167
(4, 8) 154
(4, 9) 192
(4, 10) 164
(4, 11) 173
(4, 12) 176
(4, 13) 184
(4, 14) 156
(4, 15) 169
(4, 17) 185
(4, 18) 172
(4, 19) 191
(4, 20) 193
(4, 21) 164
(4, 22) 202
(4, 23) 192
(4, 24) 179
(5, 1) 143
(5, 2)

In [38]:
instance = alignment_seeds[(1,1)]

In [39]:
for x in instance:
    print(x, instance[x])

0 [200]
1 [204]
2 [205]
5 [210]
6 [212]
7 [213]
8 [214]
9 [215]
10 [216]
11 [217]
12 [218]
13 [220]
14 [221]
15 [224]
16 [225, 226]
17 [229]
18 [230]
19 [231]
20 [232]
21 [233]
22 [234]
24 [236]
28 [239]
30 [240]
31 [241]
32 [242]
33 [243, 244]
34 [246, 247]
35 [248]
36 [249]
39 [252]
40 [253]
45 [256]
46 [257]
63 [266]
64 [267]
65 [268]
66 [269]
67 [270]
68 [271]
69 [272, 273]
70 [274, 275]
71 [276]
72 [277]
73 [278, 279]
74 [280]
75 [282]
76 [283]
82 [285, 286]
84 [288]
86 [289]
88 [290]
89 [291]
90 [292]
93 [293]
94 [294]
96 [295]
97 [297]
98 [298]
99 [299]
100 [300]
102 [301, 302]
104 [303]
105 [304]
106 [305]
107 [306]
108 [307]
109 [308]
111 [311]
112 [312]
115 [313]
116 [314, 316]
117 [318]
119 [319]
123 [321, 322]
124 [323]
126 [325]
127 [326]
128 [327]
131 [329]
132 [330]
133 [331]
134 [333]
135 [334]
136 [335]
137 [336]
138 [337]
139 [341]
140 [342]
141 [343]
142 [344]
143 [345]
144 [346]
145 [348]
146 [349]
147 [350]
148 [351]
149 [352, 353, 355]
150 [356]
151 [358]
154 [360

In [33]:
results = further_alignment

### Part 4: Final Stage Alignment in the gap

In [103]:
def get_sliding_window_substrings(input_string, window_size):
    input_tokens = input_string.strip().split(' ')
    substrings = []
    for i in range(len(input_tokens)-3):
        substrings.append(" ".join(input_tokens[i: i+4]))
    return substrings

In [None]:
def add_one_size_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]
        if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
            epi2sub[epi_id] = [sub_id]
    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [23]:
def sliding_within_gap(gaps, epi2sub):
    temp = {}
    for gap in gaps:
        sub_ids = gap[1]
        epi_ids = gap[0]
        for epi_id in epi_ids:
            epi = tbbt_episode

        if not(len(sub_ids)==1 and len(epi_ids)==1):
            continue
        sub_id = sub_ids[0]
        epi_id = epi_ids[0]


    output = {}
    for epi_id in sorted(list(epi2sub.keys())):
        output[epi_id] = sorted(list(set(epi2sub[epi_id])))
    return output

In [117]:
"""
Add neighbor subtitle from the utterance
"""
def extend_subtitles_to_episode_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1
        epi = transformation(episode[epi_id][0])

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_former_tokens = sub_former.strip().split(' ')
        sub_former_strings = []
        for i in range(len(sub_former_tokens)-3):
            sub_former_strings.append(" ".join(sub_former_tokens[i: i+4]))
        sub_former_strings.append(sub_former)
        for item in sub_former_strings:
            if item in epi:
                sub_ids.append(sub_id_former)
                break


        sub_latter = transformation(en_subset[sub_id_latter])
        sub_latter_tokens = sub_latter.strip().split(' ')
        sub_latter_strings = []
        for i in range(len(sub_latter_tokens)-3):
            sub_latter_strings.append(" ".join(sub_latter_tokens[i: i+4]))
        sub_latter_strings.append(sub_latter)
        for item in sub_latter_strings:
            if item in epi:
                sub_ids.append(sub_id_latter)
                break

        temp[epi_id] = sorted(sub_ids)
    return temp

In [118]:
"""
Extend the former episode and latter episode near the substitle
"""
def extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub_alignment_2, episode):
    temp = {}
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub_alignment_2)

    for sub_id in sub2epi:
        sub = transformation(en_subset[sub_id])

        # Check whether the epi nearby is in the subtitle
        epi_id_former = min(sub2epi[sub_id])-1
        epi_former = transformation(episode[epi_id_former][0])
        epi_former_tokens = epi_former.strip().split(' ')
        epi_former_strings = []
        for i in range(len(epi_former_tokens)-3):
            epi_former_strings.append(" ".join(epi_former_tokens[i: i+4]))
        epi_former_strings.append(epi_former)
        for item in epi_former_strings:
            if item in sub:
                epi_ids.append(epi_id_former)
                break


        epi_id_latter = max(sub2epi[sub_id])+1
        epi_latter = transformation(episode[epi_id_latter][0])
        epi_latter_tokens = epi_latter.strip().split(' ')
        epi_latter_strings = []
        for i in range(len(epi_latter_tokens)-3):
            epi_latter_strings.append(" ".join(epi_latter_tokens[i: i+4]))
        epi_latter_strings.append(epi_latter)
        for item in epi_latter_strings:
            if item in sub:
                epi_ids.append(epi_id_latter)
                break
        print(epi_former_strings)
        print(epi_latter_strings)
        print(sub)
        print('=='*50)
        temp[sub_id] = sorted(epi_ids)
    return temp

In [81]:
# Load Data
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=1,
        bias=200
    )
epi2sub = alignment_seeds[(1,1)]
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 136: [335

In [119]:
print(epi2sub)

{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 136: [335

In [121]:
print(extend_subtitles_to_episode_sliding_window(en_subset, epi2sub, tbbt_episode))

{0: [200], 1: [204], 2: [205], 5: [210, 211], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218, 219], 13: [220], 14: [221], 15: [223, 224], 16: [225, 226], 17: [228, 229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295, 296], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316, 317], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331, 332]

In [122]:
print(extend_episodes_to_subtitle_sliding_window(en_subset, epi2sub, tbbt_episode))

['i don’t know what', 'don’t know what your', 'know what your odds', 'what your odds are', 'your odds are in', 'odds are in the', 'are in the world', 'in the world as', 'the world as a', 'world as a whole', 'as a whole but', 'a whole but as', 'whole but as far', 'but as far as', 'as far as the', 'far as the population', 'as the population of', 'the population of this', 'population of this car', 'of this car goes', 'this car goes you’re', 'car goes you’re a', 'goes you’re a veritable', 'you’re a veritable mack', 'a veritable mack daddy', 'i don’t know what your odds are in the world as a whole but as far as the population of this car goes you’re a veritable mack daddy']
['agreed what’s your point', 'agreed what’s your point']
if a photon is directed through a plane with two slits in it and either is observed it will not go through both


NameError: name 'epi_ids' is not defined

In [135]:
"""
Extend the former subtitle and latter subtitle near the episode
"""
def extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode):
    # Gather the gap of subtitle corresponding to episode utterance
    epi_keys = sorted(list(epi2sub.keys()))
    sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

    subtitle_gaps = {}
    for i in range(len(epi_keys)-1):
        epi_start = epi_keys[i]
        epi_end = epi_keys[i+1]
        key = (epi_start, epi_end)
        if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
            subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]

    # Check whether the subtitle could be merged into utterances using sliding window
    for start_epi_id, end_epi_id in subtitle_gaps:
        start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
        end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
        # print(start_epi_id, start_epi)
        # print(end_epi_id, end_epi)
        # print('--')

        for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
            sub = transformation(en_subset[sub_id])
            sub_substrings = get_sliding_window_substrings(sub, 6)
            sub_substrings.append(sub)
            # print(sub_id, sub)
            # print(sub_substrings)
            temp_start = [substring in start_epi for substring in sub_substrings]
            signal_start = True in temp_start
            temp_end = [substring in end_epi for substring in sub_substrings]
            signal_end = True in temp_end
            # print(temp_start, signal_start)
            # print(temp_end, signal_end)
            if signal_start==True and signal_end==True:
                print(start_epi_id, start_epi)
                print(end_epi_id, end_epi)
                print(sub, "|", signal_start, "|", signal_end)
            # print('--')

        print('=='*50)

extend_neighbors_episode_sliding(en_subset, epi2sub, tbbt_episode)

241 to mend her broken heart this situation is much less complicated there s some kind of dispute between penny and her exboyfriend as to who gets custody of the tv she just wanted to avoid having a scene with him
242 so we get to have a scene with him
she just wanted to avoid a scene with him | True | True
297 no it was a valid hypothesis
298 that was a valid hypothesis what is happening to you
that was a valid hypo | True | True
299 really thank you so much for going and trying you re uh you re so terrific why don t you put some clothes on i ll get my purse and dinner is on me okay
301 thank you you re not done with her are you
thank you | True | True


In [116]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        subtitle_gaps[(epi_start, epi_end)] = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]


# Check whether the subtitle could be merged into utterances using sliding window
for start_epi_id, end_epi_id in subtitle_gaps:
    start_epi = transformation(tbbt_episode[start_epi_id][0].replace("’", " "))
    end_epi = transformation(tbbt_episode[end_epi_id][0].replace("’", " "))
    print(start_epi_id, start_epi)
    print(end_epi_id, end_epi)
    print('--')

    for sub_id in subtitle_gaps[(start_epi_id, end_epi_id)]:
        sub = transformation(en_subset[sub_id])
        sub_substrings = get_sliding_window_substrings(sub, 4)
        sub_substrings.append(sub)
        print(sub_id, sub)
        print(sub_substrings)
        temp_start = [substring in start_epi for substring in sub_substrings]
        temp_end = [substring in end_epi for substring in sub_substrings]
        print(temp_start)
        print(temp_end)
        print('--')

    print('=='*50)

0 so if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it s unobserved it will however if it s observed after it s left the plane but before it hits its target it will not have gone through both slits
1 agreed what s your point
--
201 if unobserved it will
['if unobserved it will', 'if unobserved it will']
[False, False]
[False, False]
--
202 if it is observed after it left the plane before it hits its target
['if it is observed', 'it is observed after', 'is observed after it', 'observed after it left', 'after it left the', 'it left the plane', 'left the plane before', 'the plane before it', 'plane before it hits', 'before it hits its', 'it hits its target', 'if it is observed after it left the plane before it hits its target']
[False, False, False, False, False, False, False, False, False, True, True, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
--
203 it will not

In [104]:
input_string = transformation("One across is Aegean")
substrings = get_sliding_window_substrings(input_string, 7)
print(substrings)

['one across is aegean']


{(0, 1): [201, 202, 203], (2, 5): [206, 207, 208, 209], (5, 6): [211], (12, 13): [219], (14, 15): [222, 223], (16, 17): [227, 228], (22, 24): [235], (24, 28): [237, 238], (33, 34): [245], (36, 39): [250, 251], (40, 45): [254, 255], (46, 63): [258, 259, 260, 261, 262, 263, 264, 265], (74, 75): [281], (76, 82): [284], (82, 84): [287], (96, 97): [296], (109, 111): [309, 310], (116, 117): [317], (119, 123): [320], (124, 126): [324], (128, 131): [328], (133, 134): [332], (138, 139): [338, 339, 340], (144, 145): [347], (150, 151): [357], (151, 154): [359], (154, 155): [361], (159, 160): [367], (160, 161): [370, 371, 372], (161, 162): [376, 377], (164, 169): [381], (171, 173): [385], (181, 183): [394, 395], (183, 185): [397, 398], (193, 195): [410], (198, 200): [414, 415], (208, 209): [426], (209, 210): [428, 429], (211, 214): [432], (215, 221): [436, 437, 438, 439, 440, 441], (221, 222): [443], (222, 224): [445, 446], (235, 236): [462], (241, 242): [471], (249, 250): [480], (250, 252): [482]

In [79]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))

subtitle_gaps = {}
for i in range(len(epi_keys)-1):
    epi_start = epi_keys[i]
    epi_end = epi_keys[i+1]
    print(epi_start, epi2sub[epi_start])
    print(epi_end, epi2sub[epi_end])
    key = (epi_start, epi_end)
    if max(epi2sub[epi_start])+1<min(epi2sub[epi_end]):
        # value = (max(epi2sub[epi_start])+1, min(epi2sub[epi_end])-1)
        value = [item for item in range(max(epi2sub[epi_start])+1, min(epi2sub[epi_end]))]
        print(key, value)
    else:
        print("False")
    print()

0 [200]
1 [204]
(0, 1) [201, 202, 203]

1 [204]
2 [205]
False

2 [205]
5 [210]
(2, 5) [206, 207, 208, 209]

5 [210]
6 [212]
(5, 6) [211]

6 [212]
7 [213]
False

7 [213]
8 [214]
False

8 [214]
9 [215]
False

9 [215]
10 [216]
False

10 [216]
11 [217]
False

11 [217]
12 [218]
False

12 [218]
13 [220]
(12, 13) [219]

13 [220]
14 [221]
False

14 [221]
15 [224]
(14, 15) [222, 223]

15 [224]
16 [225, 226]
False

16 [225, 226]
17 [229]
(16, 17) [227, 228]

17 [229]
18 [230]
False

18 [230]
19 [231]
False

19 [231]
20 [232]
False

20 [232]
21 [233]
False

21 [233]
22 [234]
False

22 [234]
24 [236]
(22, 24) [235]

24 [236]
28 [239]
(24, 28) [237, 238]

28 [239]
30 [240]
False

30 [240]
31 [241]
False

31 [241]
32 [242]
False

32 [242]
33 [243, 244]
False

33 [243, 244]
34 [246, 247]
(33, 34) [245]

34 [246, 247]
35 [248]
False

35 [248]
36 [249]
False

36 [249]
39 [252]
(36, 39) [250, 251]

39 [252]
40 [253]
False

40 [253]
45 [256]
(40, 45) [254, 255]

45 [256]
46 [257]
False

46 [257]
63 [266]

In [65]:
# Gather the gap of subtitle corresponding to episode utterance
epi_keys = sorted(list(epi2sub.keys()))
sub_keys = sorted(list(turn_sub2epi_into_epi2sub(epi2sub).keys()))
start = min(epi2sub[epi_keys[0]])
end = max(epi2sub[epi_keys[-1]])
for j in range(start, end+1):
    print(j)

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449


In [61]:
print(start, end)
print(epi_keys)
print(sub_keys)

200 555
[0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 28, 30, 31, 32, 33, 34, 35, 36, 39, 40, 45, 46, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 82, 84, 86, 88, 89, 90, 93, 94, 96, 97, 98, 99, 100, 102, 104, 105, 106, 107, 108, 109, 111, 112, 115, 116, 117, 119, 123, 124, 126, 127, 128, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 154, 155, 157, 158, 159, 160, 161, 162, 163, 164, 169, 170, 171, 173, 174, 175, 176, 177, 179, 180, 181, 183, 185, 186, 187, 188, 189, 190, 191, 192, 193, 195, 197, 198, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 214, 215, 221, 222, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 252, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 268, 269, 270, 272, 275, 277, 278, 279, 280, 281, 282, 283, 284, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297,

In [None]:
# Extend the neighbor subtitles to episode utterance


In [25]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

NameError: name 'extend_neighbors_sliding' is not defined

In [540]:
# Add within the gap
gaps, abandons = get_subset_in_gaps(epi2sub)
epi2sub = add_one_size_gap(gaps, epi2sub)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{2: [168], 3: [169], 5: [171], 6: [172], 7: [173, 174], 8: [175], 9: [176, 177], 11: [178], 12: [179], 14: [180], 15: [181], 16: [182], 17: [183, 184], 18: [185], 19: [186], 20: [187, 188], 22: [190, 191], 24: [192], 25: [193], 26: [194], 27: [195], 28: [196], 31: [200], 32: [201], 33: [202], 35: [204], 36: [205, 206], 39: [208], 40: [209, 210], 41: [211], 42: [212, 213, 214], 44: [215], 45: [216], 46: [217, 218, 219, 220], 47: [221], 48: [222, 223, 224, 225], 49: [226], 52: [231], 53: [232], 54: [233, 234], 55: [235, 236], 56: [237], 57: [240, 241], 58: [242], 60: [243], 61: [245, 246], 62: [250], 63: [251], 64: [252], 65: [253, 254, 255], 68: [260], 70: [263], 71: [264], 72: [265], 73: [266], 74: [267, 268], 77: [273, 274], 80: [278, 279], 81: [278, 279, 280, 282, 283], 82: [284, 285], 84: [286, 287], 85: [288], 86: [289], 90: [291], 91: [292], 92: [293], 93: [294, 295], 94: [296], 95: [297], 98: [300, 301], 103: [303], 105: [305, 306], 107: [308], 108: [309], 109: [310], 114: [314],

In [541]:
for x in gaps:
    print(x)

[[4], [170]]
[[21], [189]]
[[29, 30], [197, 198, 199]]
[[34], [203]]
[[37, 38], [207]]
[[50, 51], [227, 228, 229, 230]]
[[66, 67], [256, 257, 258, 259]]
[[69], [261, 262]]
[[75, 76], [269, 270, 271, 272]]
[[78, 79], [275, 276, 277]]
[[87, 88, 89], [290]]
[[96, 97], [298, 299]]
[[99, 100, 101, 102], [302]]
[[104], [304]]
[[106], [307]]
[[110, 111, 112, 113], [311, 312, 313]]
[[124, 125], [326]]
[[135, 136], [335]]
[[138], [337]]
[[149, 150], [349, 350]]
[[152], [352]]
[[160], [359, 360, 361, 362]]
[[170], [375]]
[[199, 200, 201], [425]]
[[212], [438, 439]]
[[216, 217], [443, 444]]
[[220], [449, 450, 451]]
[[224], [457]]
[[227, 228, 229, 230, 231], [461, 462, 463, 464, 465, 466]]


In [542]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

[170] [4]
Episode:
4  Are there any chopsticks?

Subtitle:
2 - Any chopsticks?
[189] [21]
Episode:
1  Yes.

Subtitle:
3 - Yes. - Oh.
[203] [34]
Episode:
25  Yeah, yeah, if it gets here and I’m not here tomorrow could you just sign for it and have them put it in my apartment.

Subtitle:
18 If it gets here and I'm not here, could you sign for it and put it in?
[304] [104]
Episode:
15  Oh, great, thank you again (she throws her jacket over the back of the sofa).

Subtitle:
3 Thank you again.
[307] [106]
Episode:
4  What’s he talking about?

Subtitle:
5 I'm here for you.
[337] [138]
Episode:
16  You know what, you’ve convinced me, maybe tonight we should sneak in and shampoo her carpet.

Subtitle:
13 You've convinced me, maybe we should sneak in and shampoo her carpet.
[352] [152]
Episode:
11  You came into my apartment last night when I was sleeping?

Subtitle:
9 You came into my apartment while I was sleeping?
[375] [170]
Episode:
1  No.

Subtitle:
3 - No! - Whoo.
[457] [224]
Episode:
17

In [497]:
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
epi2sub = extend_neighbors_sliding(en_subset, epi2sub, tbbt_episode)
print(epi2sub)
print(len(epi2sub), len(turn_sub2epi_into_epi2sub(epi2sub)))

{2: [168], 3: [169], 4: [170], 5: [171], 6: [172], 7: [173, 174], 8: [175], 9: [176, 177], 11: [178], 12: [179], 14: [180], 15: [181], 16: [182], 17: [183, 184], 18: [185], 19: [186], 20: [187, 188], 22: [190, 191], 24: [192], 25: [193], 26: [194], 27: [195], 28: [196], 31: [200], 32: [201], 33: [202], 35: [204], 36: [205, 206], 39: [208], 40: [209, 210], 41: [211], 42: [212, 213, 214], 44: [215], 45: [216], 46: [217, 218, 219, 220], 47: [221], 48: [222, 223, 224, 225], 49: [226], 52: [231], 53: [232], 54: [233, 234], 55: [235, 236], 56: [237], 57: [240, 241], 58: [242], 60: [243], 61: [245, 246], 62: [250], 63: [251], 64: [252], 65: [254, 255], 68: [260], 70: [263], 71: [264], 72: [265], 73: [266], 74: [267, 268], 77: [273, 274], 80: [278, 279], 81: [278, 279, 280, 282, 283], 82: [284, 285], 84: [287, 288], 86: [289], 88: [290], 90: [291], 91: [292], 92: [293], 93: [294, 295], 94: [296], 97: [297, 298], 98: [298], 99: [299], 100: [300], 103: [303], 105: [305, 306], 108: [307], 109: [3

In [425]:
# Merge neighbors into the current sentence
# From Episode side
for epi_id in seeds_final:
    epi = transformation(tbbt_episode[epi_id][0])
    sub = transformation("".join([en_subset[item] for item in seeds_final[epi_id]]))
    print(epi_id, seeds_final[epi_id])
    print(epi)
    print(sub)
    print(len(epi.split(' ')), len(sub.split(' ')), abs(len(epi.split(' '))-len(sub.split(' '))))

    print('=='*50)

2 [168]
uh i’m not sure everyone keep an eye on howard in case he starts to swell up
everyone keep an eye on howard in case he starts to swell up
17 13 4
3 [169]
since it’s not bee season you can have my epinephrine
since it is not bee season you can have my epinephrine
10 11 1
4 [170]
are there any chopsticks
any chopsticks
4 2 2
5 [171]
you don’t need chopsticks this is thai food
do not need chopsticks this is thai food
8 8 0
6 [172]
here we go
here we go
3 3 0
7 [173, 174]
thailand has had the fork since the latter half of the nineteenth century interestingly they don’t actually put the fork in their mouth they use it to put the food on a spoon which then goes into their mouth
thailand has had the fork since the latter half of the 19th centurythey do not put the fork in their mouth they use it to put the food on a spoon which then goes into their mouth
39 37 2
8 [175]
ask him for a napkin i dare you there is a knock on the door i’ll get it
ask him for a napkin i dare you
18 8 10
9 [

In [428]:
for epi_id in seeds_final:
    epi = transformation(tbbt_episode[epi_id][0])
    sub = transformation("".join([en_subset[item] for item in seeds_final[epi_id]]))
    former_sub = transformation(en_subset[min(seeds_final[epi_id])-1])
    after_sub = transformation(en_subset[max(seeds_final[epi_id])+1])
    print(epi_id, seeds_final[epi_id])
    print(epi)
    print("Former:", former_sub)
    print("Current:", sub)
    print("After:", after_sub)
    print(len(epi.split(' ')), len(sub.split(' ')), abs(len(epi.split(' '))-len(sub.split(' '))))

    print('=='*50)

2 [168]
uh i’m not sure everyone keep an eye on howard in case he starts to swell up
Former: does it have peanut oil
Current: everyone keep an eye on howard in case he starts to swell up
After: since it is not bee season you can have my epinephrine
17 13 4
3 [169]
since it’s not bee season you can have my epinephrine
Former: everyone keep an eye on howard in case he starts to swell up
Current: since it is not bee season you can have my epinephrine
After: any chopsticks
10 11 1
4 [170]
are there any chopsticks
Former: since it is not bee season you can have my epinephrine
Current: any chopsticks
After: do not need chopsticks this is thai food
4 2 2
5 [171]
you don’t need chopsticks this is thai food
Former: any chopsticks
Current: do not need chopsticks this is thai food
After: here we go
8 8 0
6 [172]
here we go
Former: do not need chopsticks this is thai food
Current: here we go
After: thailand has had the fork since the latter half of the 19th century
3 3 0
7 [173, 174]
thailand has 

In [367]:
count = 0
for gap in gaps:
    sub_ids = gap[1]
    epi_ids = gap[0]
    if not(len(sub_ids)==1 and len(epi_ids)==1):
        continue
    print(sub_ids, epi_ids)
    print("Episode:")
    for epi_id in epi_ids:
        print(len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')), tbbt_episode[epi_id][0])

    print()
    print("Subtitle:")
    for sub_id in sub_ids:
        print(len(transformation(en_subset[sub_id]).strip().split(' ')), en_subset[sub_id])

    # if len(transformation(en_subset[sub_id]).strip().split(' '))==len(transformation(tbbt_episode[epi_id][0]).strip().split(' ')):
    #     count += 1
    count += 1
    print('=='*50)

[235] [23]
Episode:
12  I don’t know, I’ve never reneged on a proffer of sperm before.

Subtitle:
10 I've never reneged on a proffer of sperm before.
[241] [31]
Episode:
2  Not really.

Subtitle:
2 Not really.
[287] [83]
Episode:
25  Leonard, I’m not expert here but I believe in the context of a luncheon invitation, you might want to skip the reference to bowel movements.

Subtitle:
21 I'm no expert, but in the context of a lunch invitation you might wanna skip the reference to bowel movements.
[298] [98]
Episode:
1  Yeah. 

Subtitle:
1 Yeah.
[324] [125]
Episode:
4  Yes I now, but…

Subtitle:
3 Yes, I know.
[385] [172]
Episode:
2  How so?

Subtitle:
3 LEONARD: How so?
[407] [191]
Episode:
2  See what?

Subtitle:
2 See what?
[410] [194]
Episode:
8  It’s before he became a creepy computer voice:.

Subtitle:
12 [IMITATING COMPUTERIZED VOICE] It's before he became a creepy computer voice.
[418] [202]
Episode:
1  Uh-huh.

Subtitle:
1 - Uh-huh.
[451] [229]
Episode:
2  Must we?

Subtitle:
2 -

In [503]:
print(alignment)

{(1, 2): {168: [2], 169: [3], 170: [4], 171: [5], 172: [6], 173: [7], 174: [7], 175: [8], 176: [9], 177: [9], 178: [11], 179: [12], 180: [14], 181: [15], 182: [16], 183: [17], 184: [17], 185: [18], 186: [19], 187: [20], 188: [20], 190: [22], 191: [22], 192: [24], 193: [25], 194: [26], 195: [27], 196: [28], 200: [31], 201: [32], 202: [33], 204: [35], 205: [36], 206: [36], 208: [39], 209: [40], 210: [40], 211: [41], 212: [42], 213: [42], 214: [42], 215: [44], 216: [45], 217: [46], 218: [46], 219: [46], 220: [46], 221: [47], 222: [48], 223: [48], 224: [48], 225: [48], 226: [49], 231: [52], 232: [53], 233: [54], 234: [54], 235: [55], 236: [55], 237: [56], 240: [57], 241: [57], 242: [58], 243: [60], 245: [61], 246: [61], 250: [62], 251: [63], 252: [64], 253: [65], 254: [65], 255: [65], 260: [68], 263: [70], 264: [71], 265: [72], 266: [73], 267: [74], 268: [74], 273: [77], 274: [77], 278: [80, 81], 279: [80, 81], 280: [81], 282: [81], 283: [81], 284: [82], 285: [82], 287: [84], 288: [84], 28

In [85]:
alignment = {(1,1):turn_sub2epi_into_epi2sub(epi2sub)}
# alignment = further_alignment
# Write into xlsx file
for x in alignment:
    print(x)
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/test_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/test_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

(1, 1)
{0: [200], 1: [204], 2: [205], 5: [210], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218], 13: [220], 14: [221], 15: [224], 16: [225, 226], 17: [229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [282], 76: [283], 82: [285, 286], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 316], 117: [318], 119: [319], 123: [321, 322], 124: [323], 126: [325], 127: [326], 128: [327], 131: [329], 132: [330], 133: [331], 134: [333], 135: [334], 13

In [24]:
epi2sub = turn_sub2epi_into_epi2sub(alignment[(1,1)])
for x in epi2sub:
    utt = transformation(tbbt_episode[x][0])
    sub = transformation("".join([en_subset[item] for item in epi2sub[x]]))

    print("Utt:", len(utt.strip().split(" ")), utt)
    print("Sub:", len(sub.strip().split(" ")), sub)
    print("=="*50)
    # print(tbbt_episode[x])

Utt: 54 so if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits if it’s unobserved it will however if it’s observed after it’s left the plane but before it hits its target it will not have gone through both slits
Sub: 23 if a photon is directed through a plane with two slits in it and either is observed it will not go through both
Utt: 6 i think this is the place
Sub: 6 i think this is the place
Utt: 36 no we are committing genetic fraud there’s no guarantee that our sperm is going to generate high iq offspring think about that i have a sister with the same basic dna mix who hostesses at fuddruckers
Sub: 14 i have a sister with the same basic dna mix who hostesses at fuddruckers
Utt: 17 sheldon this was your idea a little extra money to get fractional t1 bandwidth in the apartment
Sub: 16 sheldon this was your ideaa little extra money to get fractional t1 bandwidth in the apartment
Utt: 50 i know and i do yearn for f

In [17]:
def temp_string_match_sliding_window(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)

    temp = filter_alignment_by_gap(res)

    final = {}
    for x in temp:
        if len(temp[x])==1:
            final[x] = temp[x]

    return final

In [24]:
def temp_string_match_sliding_window_no_filter(en_subset, episode, window_size=5):
    res = {}
    for i, subtitle in enumerate(en_subset):
        subtitle = transformation(subtitle)
        subtitle_tokens = subtitle.strip().split(" ")
        if len(subtitle_tokens) < window_size:
            continue

        subtitle_segments = []
        for j in range(len(subtitle_tokens) - window_size):
            subtitle_segments.append(" ".join(subtitle_tokens[j: j + window_size]))
            # print(" ".join(subtitle_tokens[j: j+5]))

        for j, (utt, speaker) in enumerate(episode):
            utt = transformation(utt)
            for sub_seg in subtitle_segments:
                if sub_seg in utt:
                    if i not in res:
                        res[i] = set()
                        res[i].add(j)
                    else:
                        res[i].add(j)
    return res

In [15]:
# Check one episode and adapt to the new tbbt transcript corpus

(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=1,
        episode_id=3,
        bias=200
    )

In [60]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window_no_filter(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

155 120
112 91
81 68


In [58]:
# Firstly perform exact match
sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=6)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

sub2epi = temp_string_match_sliding_window(en_subset, tbbt_episode, window_size=7)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(len(sub2epi), len(epi2sub))

95 72
6 5
4 4


In [42]:
temp = sub2epi
sub2epi = {}
for x in sorted(list(temp.keys())):
    sub2epi[x+2] = [item+2 for item in temp[x]]
for x in sub2epi:
    print(x, sub2epi[x])

191 [2]
192 [2]
194 [5]
196 [6]
197 [7]
208 [15]
221 [26]
227 [30]
229 [31]
231 [34]
233 [36]
236 [38]
242 [42]
245 [44]
258 [53]
260 [54]
263 [57]
264 [58]
265 [58]
267 [59]
271 [62]
274 [64]
275 [65]
279 [68]
280 [68]
284 [74, 72]
286 [74, 72]
287 [75]
288 [76]
289 [76]
291 [78]
293 [79]
294 [80]
295 [80]
296 [80]
297 [81]
299 [83]
304 [89]
306 [91]
307 [92]
313 [98]
314 [99]
315 [100]
317 [101]
318 [102]
320 [103]
321 [105]
327 [111]
328 [112]
329 [112]
331 [113]
334 [115]
335 [115]
344 [120]
345 [120]
347 [121]
350 [123]
351 [124]
356 [127]
358 [128]
359 [128]
365 [131]
370 [134]
372 [136]
373 [136]
374 [136]
377 [138]
379 [141]
381 [143]
383 [144]
384 [144]
386 [144]
398 [158]
400 [160]
403 [163]
405 [165]
406 [166]
408 [168]
412 [171]
413 [171]
416 [171]
418 [171]
427 [175]
431 [180]
435 [182]
437 [183]
438 [183]
442 [189]
444 [190]
450 [196]
452 [197]
453 [198]
455 [201]
458 [205]
459 [205]
463 [208]
467 [211]
471 [214]
473 [215]
474 [215]
477 [217]
479 [218]
481 [219]
486 [222]

In [43]:
temp = epi2sub
epi2sub = {}
for x in sorted(list(temp.keys())):
    epi2sub[x+2] = [item+2 for item in temp[x]]

for x in epi2sub:
    print(x, epi2sub[x])

2 [191, 192]
5 [194]
6 [196]
7 [197]
15 [208]
26 [221]
30 [227]
31 [229]
34 [231]
36 [233]
38 [236]
42 [242]
44 [245]
53 [258]
54 [260]
57 [263]
58 [264, 265]
59 [267]
62 [271]
64 [274]
65 [275]
68 [279, 280]
72 [284, 286]
74 [284, 286]
75 [287]
76 [288, 289]
78 [291]
79 [293]
80 [294, 295, 296]
81 [297]
83 [299]
89 [304]
91 [306]
92 [307]
98 [313]
99 [314]
100 [315]
101 [317]
102 [318]
103 [320]
105 [321]
111 [327]
112 [328, 329]
113 [331]
115 [334, 335]
120 [344, 345]
121 [347]
123 [350]
124 [351]
127 [356]
128 [358, 359]
131 [365]
134 [370]
136 [372, 373, 374]
138 [377]
141 [379]
143 [381]
144 [383, 384, 386]
158 [398]
160 [400]
163 [403]
165 [405]
166 [406]
168 [408]
171 [412, 413, 416, 418]
175 [427]
180 [431]
182 [435]
183 [437, 438]
189 [442]
190 [444]
196 [450]
197 [452]
198 [453]
201 [455]
205 [458, 459]
208 [463]
211 [467]
214 [471]
215 [473, 474]
217 [477]
218 [479]
219 [481]
222 [486]
226 [490]
236 [499]
237 [500]
241 [505]
242 [507]
244 [509, 512]
247 [515]


In [44]:
print(len(sub2epi))
print(len(epi2sub))

112
91


In [8]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)

    return temp

In [9]:
further_alignment = {}
for i in tqdm(range(2)):
    for j in tqdm(range(4)):
        try:
            temp = look_alignment(tbbt_transcripts, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:29<01:27, 29.08s/it][A
 50%|█████     | 2/4 [00:43<00:41, 20.61s/it][A
 75%|███████▌  | 3/4 [00:58<00:17, 17.78s/it][A
100%|██████████| 4/4 [01:09<00:00, 17.45s/it][A
 50%|█████     | 1/2 [01:09<01:09, 69.81s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:13<00:39, 13.30s/it][A
 50%|█████     | 2/4 [00:30<00:31, 15.66s/it][A
 75%|███████▌  | 3/4 [00:43<00:14, 14.20s/it][A
100%|██████████| 4/4 [00:57<00:00, 14.41s/it][A
100%|██████████| 2/2 [02:07<00:00, 63.73s/it]


In [10]:
for x in further_alignment:
    print(x, len(further_alignment[x][0]), len(further_alignment[x][1]),len(further_alignment[x][2]),len(further_alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [14]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [11]:
alignment = further_alignment

In [12]:
for x in alignment:
    print(x, len(alignment[x][0]), len(alignment[x][1]),len(alignment[x][2]),len(alignment[x][3]))

(0, 0) 133 133 171 171
(0, 1) 109 109 130 130
(0, 2) 72 72 90 90
(0, 3) 112 112 134 134
(1, 0) 1 1 1 1
(1, 1) 84 84 104 104
(1, 2) 113 113 135 135
(1, 3) 124 124 159 159


In [47]:
print(epi2sub)

{2: [191, 192], 5: [194], 6: [196], 7: [197], 15: [208], 26: [221], 30: [227], 31: [229], 34: [231], 36: [233], 38: [236], 42: [242], 44: [245], 53: [258], 54: [260], 57: [263], 58: [264, 265], 59: [267], 62: [271], 64: [274], 65: [275], 68: [279, 280], 72: [284, 286], 74: [284, 286], 75: [287], 76: [288, 289], 78: [291], 79: [293], 80: [294, 295, 296], 81: [297], 83: [299], 89: [304], 91: [306], 92: [307], 98: [313], 99: [314], 100: [315], 101: [317], 102: [318], 103: [320], 105: [321], 111: [327], 112: [328, 329], 113: [331], 115: [334, 335], 120: [344, 345], 121: [347], 123: [350], 124: [351], 127: [356], 128: [358, 359], 131: [365], 134: [370], 136: [372, 373, 374], 138: [377], 141: [379], 143: [381], 144: [383, 384, 386], 158: [398], 160: [400], 163: [403], 165: [405], 166: [406], 168: [408], 171: [412, 413, 416, 418], 175: [427], 180: [431], 182: [435], 183: [437, 438], 189: [442], 190: [444], 196: [450], 197: [452], 198: [453], 201: [455], 205: [458, 459], 208: [463], 211: [467]

In [244]:
alignment = alignment_seeds

In [248]:
for x in alignment:
    print(x)

(1, 1)
(1, 2)
(1, 3)
(1, 4)
(1, 5)
(1, 6)
(1, 7)
(1, 8)
(1, 9)
(1, 10)
(1, 11)
(1, 12)
(1, 13)
(1, 14)
(1, 15)
(1, 16)
(2, 1)
(2, 2)
(2, 3)
(2, 4)
(2, 5)
(2, 6)
(2, 7)
(2, 8)
(2, 9)
(2, 10)
(2, 11)
(2, 12)
(2, 13)
(2, 14)
(2, 15)
(2, 16)
(2, 17)
(2, 18)
(2, 19)
(2, 20)
(2, 21)
(2, 22)
(2, 23)
(3, 1)
(3, 2)
(3, 3)
(3, 4)
(3, 5)
(3, 6)
(3, 7)
(3, 8)
(3, 9)
(3, 10)
(3, 11)
(3, 12)
(3, 13)
(3, 14)
(3, 15)
(3, 16)
(3, 17)
(3, 18)
(3, 19)
(3, 20)
(3, 21)
(3, 22)
(3, 23)
(4, 1)
(4, 2)
(4, 3)
(4, 4)
(4, 5)
(4, 6)
(4, 7)
(4, 8)
(4, 9)
(4, 10)
(4, 11)
(4, 12)
(4, 13)
(4, 14)
(4, 15)
(4, 17)
(4, 18)
(4, 19)
(4, 20)
(4, 21)
(4, 22)
(4, 23)
(4, 24)
(5, 1)
(5, 2)
(5, 3)
(5, 4)
(5, 5)
(5, 6)
(5, 7)
(5, 8)
(5, 9)
(5, 10)
(5, 11)
(5, 12)
(5, 13)
(5, 14)
(5, 15)
(5, 16)
(5, 17)
(5, 18)
(5, 19)
(5, 20)
(5, 21)
(5, 22)
(5, 23)
(6, 1)
(6, 2)
(6, 3)
(6, 4)
(6, 5)
(6, 6)
(6, 7)
(6, 8)
(6, 9)
(6, 10)
(6, 11)
(6, 12)
(6, 13)
(6, 14)
(6, 15)
(6, 16)
(6, 17)
(6, 18)
(6, 19)
(6, 20)
(6, 21)
(6, 22)
(6, 23)
(6, 24

In [290]:
alignment = {(1,1): turn_sub2epi_into_epi2sub(temp)}

TypeError: list indices must be integers or slices, not str

In [294]:
# Write into xlsx file
for x in alignment:
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/test_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/test_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    sub2epi = alignment[x]
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

{0: [200], 9: [215], 15: [224], 16: [225, 226], 17: [229], 28: [239], 34: [246, 247], 35: [248], 65: [268], 75: [282], 82: [285, 286], 88: [290], 89: [291], 90: [292], 93: [293], 96: [295], 100: [300], 102: [301, 302], 108: [307], 116: [314, 316], 117: [318], 134: [333], 135: [334], 145: [348], 148: [351], 154: [360], 155: [362], 160: [368, 369], 161: [373, 374, 375], 162: [378], 171: [384], 173: [386], 174: [387], 175: [388], 176: [389], 177: [390], 183: [396], 186: [400, 401], 193: [409], 201: [417], 207: [423], 208: [424, 425], 210: [430], 215: [435], 226: [449], 228: [450], 230: [452, 453, 454, 455, 456], 232: [458], 237: [464], 240: [467], 241: [468, 469, 470], 242: [472], 245: [476], 258: [489, 490], 259: [491], 262: [494, 495], 263: [496], 270: [503], 275: [506], 280: [510, 511], 288: [523, 524], 295: [531], 296: [532], 297: [533], 302: [542], 309: [548], 311: [550, 551], 314: [555]}


In [None]:
# Check current alignments
for x in alignment:
    # Load Data
    epi2sub = alignment[x][-1]

    pass


In [36]:
print(further_alignment[(0,0)][-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [35]:
print(len(further_alignment[(0,0)]))

4


In [25]:
for x in further_alignment:
    print(x, len(further_alignment[x][-1]), len(further_alignment[x][-2]), len(further_alignment[x][-3]))
    # print(further_alignment[x])

(0, 0) 171 171 133
(0, 1) 130 130 109
(0, 2) 90 90 72
(0, 3) 134 134 112
(0, 4) 132 132 106
(0, 5) 24 24 16
(0, 6) 75 75 66
(0, 7) 7 7 7
(0, 8) 130 130 100
(0, 9) 57 57 50
(0, 10) 46 46 32
(0, 11) 93 93 75
(0, 12) 137 137 114
(0, 13) 10 10 9
(0, 14) 124 124 103
(0, 15) 130 130 102
(1, 0) 1 1 1
(1, 1) 104 104 84
(1, 2) 135 135 113
(1, 3) 159 159 124
(1, 4) 74 74 60
(1, 5) 60 60 43
(1, 6) 29 29 24
(1, 7) 110 110 98
(1, 8) 70 70 61
(1, 9) 80 80 68
(1, 11) 133 133 112
(1, 12) 136 136 108
(1, 13) 161 161 131
(1, 14) 31 31 26
(1, 15) 106 106 83
(1, 16) 140 140 119
(1, 17) 127 127 102
(1, 18) 134 134 111
(1, 19) 46 46 40
(1, 20) 95 95 74
(1, 21) 137 137 104
(1, 22) 56 56 47
(2, 0) 125 125 102
(2, 1) 134 134 115
(2, 2) 103 103 81
(2, 3) 105 105 84
(2, 4) 131 131 105
(2, 5) 17 17 13
(2, 6) 159 159 131
(2, 7) 109 109 93
(2, 8) 133 133 106
(2, 9) 162 162 125
(2, 10) 129 129 112
(2, 11) 125 125 105
(2, 12) 109 109 86
(2, 13) 106 106 83
(2, 14) 124 124 103
(2, 15) 142 142 119
(2, 16) 100 100 81
(2,

In [27]:
old_alignment = further_alignment

In [34]:
print(old_alignment[(1,1)][-1])

{2: [200, 201, 202, 203], 4: [206], 5: [208], 6: [209, 210], 7: [211], 10: [217, 218], 14: [220], 18: [223], 19: [224, 225], 20: [229], 21: [230], 22: [232], 23: [233, 234], 26: [238], 28: [240, 241], 30: [242], 31: [243], 32: [244], 33: [245], 34: [246], 36: [249, 250], 37: [251], 40: [253], 41: [254, 255], 44: [258], 45: [259, 260, 261], 46: [262, 263], 47: [264], 48: [266], 49: [267], 52: [269], 56: [274], 57: [276, 277], 58: [278], 59: [279], 60: [280, 281], 61: [282], 63: [283], 64: [284], 65: [285], 67: [286], 68: [287, 288, 289], 70: [291, 292], 73: [295], 74: [296], 75: [297, 298, 299], 76: [300], 77: [302], 79: [304], 80: [305], 81: [306], 82: [308], 83: [310], 84: [311], 85: [312, 313], 87: [315], 88: [317], 89: [319], 90: [320, 321], 91: [322], 92: [325, 326], 94: [328], 95: [329], 96: [330], 97: [331], 99: [334], 100: [335, 336], 101: [337], 102: [338], 104: [339], 107: [342, 343], 109: [344], 110: [345], 111: [346], 112: [347], 113: [348, 349, 350], 115: [351], 116: [352, 

In [33]:
print(len(old_alignment[(1,1)][-1]))
print(len(turn_sub2epi_into_epi2sub(old_alignment[(1,1)][-1])))

104
140


In [38]:
alignment = further_alignment
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][-1]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(season_id, episode_id)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [22]:
print(further_alignment.keys())

dict_keys([(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)])


In [14]:
temp = []
# Perform string match with sliding window
sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 4)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [18]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

In [20]:
# Perform exact match and add it to the whole alignment
exact_match_result = exact_match(en_subset, tbbt_episode)
sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: [126], 322: [126

In [24]:
# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
temp.append(epi2sub)

print(sub2epi)
print(len(sub2epi))
print(len(epi2sub), len(tbbt_episode))

{200: [0], 202: [0], 203: [0], 205: [2], 206: [5], 207: [5], 212: [6], 213: [7], 214: [8], 215: [9], 218: [12], 219: [12], 220: [13], 222: [15], 223: [15], 224: [15], 225: [16], 226: [16], 227: [17], 228: [17], 229: [17], 232: [20], 233: [21], 234: [23], 235: [24], 239: [29], 240: [31], 241: [32], 242: [33], 246: [35], 247: [35], 248: [36], 249: [37], 251: [39], 252: [40], 253: [41], 254: [42], 255: [42], 256: [48], 259: [50], 260: [51], 261: [52], 262: [57], 263: [58], 264: [59], 266: [66], 267: [67], 268: [68], 269: [69], 270: [70], 272: [72], 273: [72], 274: [73], 275: [73], 276: [74], 279: [76], 280: [77], 282: [78], 283: [79], 285: [85], 286: [85], 287: [86], 288: [87], 290: [91], 291: [92], 292: [93], 293: [96], 295: [99], 296: [99], 297: [100], 299: [102], 300: [103], 301: [104, 105], 302: [105], 303: [107], 304: [108], 306: [110], 307: [111], 308: [112], 311: [114], 312: [115], 313: [118], 314: [119], 315: [119], 316: [119], 317: [119], 318: [120], 319: [122], 320: [124], 321: 

In [22]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 |||| [200, 202, 203] Sheldon  So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.
1 Leonard  Agreed, what’s your point?
2 |||| [205] Sheldon  There’s no point, I just think it’s a good idea for a tee-shirt. 
3 Leonard  Excuse me?
4 Receptionist  Hang on. 
5 |||| [206, 207] Leonard  One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. 
6 |||| [212] Receptionist  Can I help you?
7 |||| [213] Leonard  Yes. Um, is this the High IQ sperm bank?
8 |||| [214] Receptionist  If you have to ask, maybe you shouldn’t be here.
9 |||| [215] Sheldon  I think this is the place.
10 Receptionist  Fill these out.
11 

In [23]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子


In [90]:
print(temp)

[{0: [200, 203], 5: [206], 8: [214], 15: [224], 16: [226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 15: [224], 16: [225, 226], 17: [227, 229], 24: [235], 29: [239], 33: [242], 35: [246, 247], 36: [248], 37: [249], 41: [253], 48: [256], 50: [259]}, {0: [200, 203], 5: [206, 207], 8: [214], 9: [215], 15: [224], 16: [225, 226], 17: [227, 229], 21: [233], 24: [235], 29: [239], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 41: [253], 42: [254], 48: [256], 50: [259]}]


In [91]:
for x in temp:
    print(x)

3


In [None]:
    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

In [56]:
def extend_with_wer(en_subset, epi2sub_alignment_2, episode):
    temp = {}

    for epi_id in epi2sub_alignment_2:
        # Define episode utterance id
        epi_id_former = epi_id - 1
        epi_id = epi_id
        epi_id_latter = epi_id + 1

        # Define subtitle id
        sub_id_former = min(epi2sub_alignment_2[epi_id]) - 1
        sub_id_latter = max(epi2sub_alignment_2[epi_id]) + 1
        sub_ids = sorted(list(epi2sub_alignment_2[epi_id]))
        # print(epi_id)
        # print(sub_ids)

        # Check whether subtitle nearby is in the utterance
        sub_former = transformation(en_subset[sub_id_former])
        sub_latter = transformation(en_subset[sub_id_latter])
        # sub = transformation(en_subset[sub_id])
        epi = transformation(episode[epi_id][0])

        # Fetch all relevant sentences
        epi_sentences = [episode[idx][0] for idx in [epi_id_former, epi_id, epi_id_latter] if idx>=0]
        print(epi_id_former, epi_id, epi_id_latter)
        print(epi_sentences)

        sub_sentences = [en_subset[sub_id_former, sub_id_latter] for idx in [sub_former, sub_id_latter] if idx>=0]
        print(sub_id_former, sub_id_latter)
        print(sub_sentences)

        print("=="*50)


        if sub_former in epi:
            sub_ids.append(sub_id_former)
        if sub_latter in epi:
            sub_ids.append(sub_id_latter)
        # print(sorted(sub_ids))
        temp[epi_id] = sorted(sub_ids)
        # epi2sub_alignment_2[epi_id] = sorted(sub_ids)
        # print("=="*50)
    return temp

In [26]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    tbbt_episode = []
    for x in tbbt_transcripts[(1,1)]:
        if x[1]!='Scene':
            tbbt_episode.append(x)

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)

    # # Extend with WER
    # # for x in sub2epi:
    # #     print(x, sub2epi[x])
    # extend_with_wer(en_subset, epi2sub, tbbt_episode)




    return temp

In [None]:
further_alignment = {}
for i in tqdm(range(12)):
    for j in tqdm(range(30)):
        try:
            temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

In [28]:
temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, 1, 1, 200)

In [35]:
print(temp[-1])

{0: [200, 201, 202, 203], 2: [205], 5: [206, 207], 8: [214], 9: [215], 12: [218, 219], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 21: [233], 24: [235], 29: [239], 31: [240], 32: [241], 33: [242], 35: [246, 247], 36: [248], 37: [249], 39: [251], 40: [252], 41: [253], 42: [254], 48: [256], 50: [259], 52: [261], 58: [263], 59: [264], 66: [266], 67: [267], 68: [268], 69: [269], 70: [270], 72: [273], 73: [274, 275], 76: [279], 78: [282], 79: [283], 85: [285, 286], 86: [287], 91: [290], 92: [291], 93: [292], 96: [293], 99: [295], 100: [297], 102: [299], 103: [300], 104: [301], 105: [302], 108: [304], 110: [306], 111: [307], 112: [308], 114: [311], 115: [312], 118: [313], 119: [314, 315, 316, 317], 120: [318], 122: [319], 124: [320], 126: [321, 322], 129: [325], 131: [327], 135: [330], 136: [331], 137: [333], 138: [334], 139: [335], 140: [336], 141: [337, 338, 339, 340], 143: [342], 151: [351], 153: [356], 154: [357, 358], 157: [360, 361], 158: [362], 163: [368, 369], 165: [373

In [46]:
for x in further_alignment:
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=x[0]+1,
        episode_id=x[1]+1,
        bias=200
    )
    total = len(tbbt_episode)
    print(x, total, len(further_alignment[x][0]), len(further_alignment[x][1]), len(further_alignment[x][2]), len(further_alignment[x][3]))
    # print(further_alignment[x])
    # print("=="*50)

(0, 0) 133 80 80 100 100


In [13]:
further_alignment[(i,j)] = temp

In [14]:
print(further_alignment)

{(1, 2): [{2: [200], 3: [201], 4: [204], 8: [209, 210, 211], 9: [212], 11: [215, 217, 218], 15: [225], 16: [226], 18: [228], 19: [229], 22: [248], 23: [264, 265], 25: [268], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [441], 57: [443], 61: [462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 15: [225], 16: [226], 18: [227, 228], 19: [229], 22: [248], 23: [264, 265, 266], 25: [268, 269], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [340, 341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [440, 441], 57: [443], 61: [461, 462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 14: [221], 15: [225]

In [13]:
with open('further_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)

## Check the further alignment

In [114]:
with open('further_alignment.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [17]:
alignment = further_alignment

In [18]:
for x in alignment:
    # Define season and episode
    season_id = x[0]+1
    episode_id = x[1]+1

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('xlsx_files/episodes/episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('xlsx_files/subtitles/subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x][2]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, "", " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

In [None]:
"""
Check alignment based on Season-Episode
Write into xlsx file
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [109]:
"""
Check alignment based on Season-Episode
"""

# Define Season and Episode
season_id = 0
episode_id = 1

# Fetch alignment result
epi2sub = alignment[(season_id, episode_id)][2]
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

# Fetch subset located by the stage-2 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=season_id+1,
    episode_id=episode_id+1,
    bias=200
)

In [110]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 4 no, i haven't.
1 2 get used to it.
2 |||| [196] 4 yeah, i probably won't, but... hey sheldon.
3 |||| [197] 1 hi.
4 |||| [198, 199] 4 hey raj!  still not talking to me, huh?
5 1 don't take it personally, it's his pathology, he can't talk to women.
6 |||| [201] 2 he can't talk to attractive women, or in your case a cheesecake-scented goddess!
7 |||| [202] 0 so, there's gonna be some furniture delivered?
8 |||| [217, 218, 219, 220] 1 oh no, let's assume that they can. lois lane is falling, accelerating at an initial rate of 32 feet per second per second. superman swoops down to save her by reaching out two arms of steel. miss lane, who is now travelling at approximately 120 miles per hour, hits them, and is immediately sliced into three equal pieces.
9 |||| [226, 228, 229] 1 are you listening to yourself, it is well established that superman's flight is a feat of strength, it is an extension of his ability to leap tall buildings, an ability he derives from earth's yellow sun.
10 |||| 

In [108]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle, zh_subset[i])
    else:
        print(i, subtitle, zh_subset[i])

0 Thank you very much. Good day to you. 多谢了 日安
1 Good day to you. 日安
2 Come and buy a dresser! 来买梳妆台了
3 The years with Iisakki passed quickly. 由丽萨奇的日子过的很快
4 Before I knew it, I was all grown up, with a beard and all. 在我知道之前 我已经长大 还有着胡须
5 The village had grown. 村子也大了 有很多新的小孩
6 There were so many new children - that me and Iisakki could not keep count. 我和丽萨奇都无法数过来
7 But we had a secret helper. 但是我们有个秘密
8 Nikolas. -尼古拉斯
9 -Eemeli. -艾美利
10 Long time no see. You should come more often. 很久没见了 你应该常来
11 I've been busy. Iisakki is no longer young. 我一直很忙 丽萨奇已经不再年轻了
12 Do you have the list? 你有名单吗
13 Well, I'll be... So many new children. 呃 我 好多新生的孩子啊
14 As a matter of fact, one name is missing from that list. Elsa? 事实上 有一个名字漏掉了 埃尔莎
15 Is that... -是吗
16 -A girl, three months. -一个女孩 三个月大
17 Let's add her to the list. 那我们加上她的名字吧
18 What is the name of this little princess? 这个小公主叫什么名字
19 Aada. 亚达
20 Aada? 亚达
21 Hello, Aada. 你好 亚达
22 Nikolas, meet Henrik and Hermanni. 尼古拉斯 见一下汉瑞克和赫曼尼
23 My sons. 我的儿子
