In [14]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy

## Stage-1 Alignment: Locate Subset in Open Subtitle

In the rough alignment phrase, we try to align each utterance in tbbt episode with a open subtitle index. Specifically, we do the following:

1.Clean strings and then perform matching
2.assign index to each utterance. So each utterance is assigned a set of subtitles [index, segment, en_subtitle, zh_subtitle]
--index is the subtitle's index in open subtitle dataset
--segment is the utterance segment for alignment
--en_subtitle and zh_subtitle is the whole subtitle in English and Chinese language.


In [1]:
from ..utils.preprocessing import get_epi_indexs_gaps
from ..utils.preprocessing import find_all_continuous_subsets
from ..utils.preprocessing import calculate_gaps
from ..utils.preprocessing import fetch_subsets
from ..utils.preprocessing import organize_tbbt_by_seasons

ImportError: attempted relative import with no known parent package

In [16]:
# Load search result
with open('data/episode_indexs_transformed.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [17]:
# Check all substrings in each episode
for season_id in sorted(list(results.keys())):
    season = results[season_id]
    for episode_id in sorted(list(season.keys())):
        idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
        subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)
        try:
            gaps = calculate_gaps(subsets[-1])
            print(gaps)
            print("Season:", season_id, "|Episode:", episode_id, "|Subset Length:", len(subsets[-1]), "|Sum:", sum(gaps), "|Maximum:", max(gaps))
        except:
            print("Season:", season_id, "Episode:", episode_id, "Subset Length:", subsets)
        print('=='*50)

[3, 2, 1, 2, 2, 13, 1, 2, 1, 1, 1, 1, 4, 1, 7, 4, 10, 7, 25, 1, 1, 7, 3, 4, 1, 2, 7, 2, 1, 1, 9, 7, 2, 1, 1, 2, 1, 1, 1, 3, 7, 7, 1, 1, 1, 5, 14, 1, 10, 1, 1, 1, 20, 1, 2, 5, 4, 1, 1, 3, 1, 1, 4, 3, 13, 1, 1, 4, 1, 2, 3, 3, 8, 1, 14, 13, 16, 2, 2, 1, 4, 1, 1, 3, 2]
Season: 1 |Episode: 1 |Subset Length: 86 |Sum: 338 |Maximum: 25
[1, 17, 1, 1, 6, 2, 3, 3, 44, 2, 2, 1, 23, 4, 4, 14, 1, 2, 7, 1, 8, 1, 8, 2, 5, 1, 3, 1, 2, 1, 7, 2, 2, 1, 5, 2, 4, 1, 2, 1, 1, 1, 8, 2, 46, 1, 1, 2, 7]
Season: 1 |Episode: 2 |Subset Length: 50 |Sum: 267 |Maximum: 46
[1, 5, 3, 2, 8, 3, 5, 2, 5, 7, 10, 5, 1, 5, 1, 2, 27, 1, 1, 1, 16, 15, 1, 1, 1, 1, 19, 1, 4, 9, 4, 10, 2, 2, 3, 14, 2, 1, 3, 2, 1, 1, 1, 1, 1, 3, 2, 7, 12, 1, 4, 2, 3, 1, 1, 5, 1, 1, 7, 8, 4, 11, 13, 1, 1, 8, 6]
Season: 1 |Episode: 3 |Subset Length: 68 |Sum: 313 |Maximum: 27
[2, 14, 1, 8, 3, 7, 1, 6, 3, 18, 12, 10, 2, 8, 8, 8, 2, 1, 1, 2, 3, 2, 1, 2, 20, 11, 9, 2, 5, 1, 1, 3, 7, 41, 1, 1, 9, 6, 2, 6, 1, 4, 4, 9, 1, 16]
Season: 1 |Episode: 4 |Subset 

## Fine-Level Alignment: Search within the Open Subtitle Subset

## 1.Fetch subset after stage-1 Alignment

In [18]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh//zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [19]:
# Load Memor Dataset
with open('memor/data.json') as f:
    tbbt = json.load(f)

In [20]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [21]:
# Fetch subset located by the stage-1 alignment
(en_subset, zh_subset, tbbt_episode) = fetch_subsets(
    episode=tbbt,
    en_subtitle=en_subtitle,
    zh_subtitle=zh_subtitle,
    results=results,
    season_id=1,
    episode_id=1,
    bias=200
)

## 2.Stage-2 Alignment within the subset

In [22]:
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

#### Align with Sliding window based string match and extend the neighbors

In [23]:
# Perform string match with sliding window
count = 0
sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(epi2sub)
print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
print("=="*50)


# Extend the neighbors
epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
print(epi2sub)
print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
print("=="*50)


# Perform exact match and add it to the whole alignment
exact_match_result = exact_match(en_subset, tbbt_episode)
sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(epi2sub)
print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
print("=="*50)


# Extend the gap
sub2epi = full_fill_gap(sub2epi)
epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
print(epi2sub)
print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
print("=="*50)

{0: [200, 202, 203], 2: [205], 3: [206, 208, 210], 4: [223, 224], 5: [226], 6: [227, 228, 229], 14: [234], 15: [235], 16: [242], 18: [246], 21: [252], 22: [253], 27: [256], 29: [263], 37: [287], 38: [288], 40: [289], 42: [290], 43: [297], 46: [300], 47: [304], 48: [305], 50: [307], 51: [308], 53: [314, 315, 316, 317], 54: [318], 63: [327], 68: [334], 70: [336], 71: [337, 340], 74: [342], 75: [343, 348], 76: [351], 77: [355], 79: [358], 82: [360, 361], 83: [362], 85: [363], 87: [366], 88: [368], 89: [382], 90: [383], 91: [393], 92: [394, 395], 93: [396], 94: [398], 95: [416], 96: [417], 98: [419], 99: [424], 102: [430], 103: [431], 105: [433, 434], 106: [435], 107: [439], 109: [442], 110: [444], 113: [453, 455, 456], 114: [457], 115: [461], 116: [462, 463], 117: [464], 118: [467], 119: [469, 470, 471], 120: [477], 121: [478], 123: [479], 125: [493], 126: [506], 129: [522, 524], 139: [531], 140: [532], 141: [533], 142: [536, 538], 144: [541]}
Episode Number: 75 Subtitle Number: 97
{0: [2

In [24]:
subset_pairs, abandon = get_subset_in_gaps(epi2sub)

In [35]:
for x in subset_pairs:
    epi_ids = x[0]
    sub_ids = x[1]
    print("Episode Subset:")
    for epi_id in epi_ids:
        print(epi_id, tbbt_episode[epi_id], transformation(tbbt_episode[epi_id][0]))
    print()
    print("Subtitle Subset:")
    for sub_id in sub_ids:
        print(sub_id, en_subset[sub_id], transformation(en_subset[sub_id]))
    print("=="*50)

Episode Subset:
7 ["i'm sure she'll still love him.", 0] i am sure she will still love him
8 ["i wouldn't.", 1] i would not
9 ["i'm sure she'll still love him.", 0] i am sure she will still love him
10 ["i wouldn't.", 1] i would not
11 ['well, what do you want to do?', 0] well what do you want to do

Subtitle Subset:
230 - I'm sure she'll still love him. i am sure she will still love him
231 - I wouldn't. i would not
232 Well, what do you wanna do? well what do you wanna do
Episode Subset:
17 ["i don't care. two millimetres? that doesn't seem right.", 0] i do not care two millimetres that does not seem right

Subtitle Subset:
243 - I don't care. i do not care
244 Two millime...? That doesn't seem right. two millime that does not seem right
Episode Subset:
19 ['new neighbour?', 0] new neighbour

Subtitle Subset:
248 Is that why they sent you to boarding school? No. is that why they sent you to boarding school no
249 That was the result of my work with lasers. that was the result of my w

In [42]:
temp = {}
for x in subset_pairs:
    epi_ids = x[0]
    sub_ids = x[1]
    for epi_id in epi_ids:
        epi = tbbt_episode[epi_id]
        epi_transformed = transformation(epi[0])
        for sub_id in sub_ids:
            sub = en_subset[sub_id]
            sub_transformed = transformation(sub)
            if epi_transformed==sub_transformed:
                print("epi_id:", epi_id)
                print(epi_transformed)
                print("sub_id:", sub_id)
                print(sub_transformed)
                print("=="*50)

epi_id: 7
i am sure she will still love him
sub_id: 230
i am sure she will still love him
epi_id: 8
i would not
sub_id: 231
i would not
epi_id: 9
i am sure she will still love him
sub_id: 230
i am sure she will still love him
epi_id: 10
i would not
sub_id: 231
i would not
epi_id: 23
hi
sub_id: 255
hi
epi_id: 24
hi
sub_id: 255
hi
epi_id: 25
hi
sub_id: 255
hi
epi_id: 26
hi
sub_id: 255
hi
epi_id: 72
well that is interesting leonard can not process corn
sub_id: 341
well that is interesting leonard can not process corn
epi_id: 73
well that is interesting leonard can not process corn
sub_id: 341
well that is interesting leonard can not process corn
epi_id: 131
so you will think about it
sub_id: 526
so you will think about it
epi_id: 132
oh i do not think i will be able to stop thinking about it
sub_id: 527
oh i do not think i will be able to stop thinking about it
epi_id: 134
so you will think about it
sub_id: 526
so you will think about it
epi_id: 135
oh i do not think i will be able to sto

In [25]:
# Show TBBT Episode
for i, (utt, speaker) in enumerate(tbbt_episode):
    if i in abandon:
        continue
    if i in epi2sub:
        print(i, "||||",epi2sub[i], speaker, utt)
    else:
        print(i, speaker, utt)

0 |||| [200, 201, 202, 203] 1 if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. if it's unobserved it will, however, if it's observed after it's left the plane but before it hits its target, it will not have gone through both slits.
1 |||| [204] 0 agreed, what's your point?
2 |||| [205] 1 there's no point, i just think it's a good idea for a t-shirt.
3 |||| [206, 207, 208, 209, 210, 211] 0 one across is aegean, eight down is nabakov, twenty-six across is mcm, fourteen down is... move your finger... phylum, which makes fourteen across port-au-prince. see, papa doc's capital idea, that's port-au-prince. haiti.
4 |||| [222, 223, 224] 1 no. we are committing genetic fraud. there's no guarantee that our sperm is going to generate high iq offspring, think about that. i have a sister with the same basic dna mix who hostesses at fuddruckers.
5 |||| [225, 226] 0 sheldon, this was your idea. a little extra money to get fra

In [26]:
# Show Open Subtitle
for i, subtitle in enumerate(en_subset):
    if i in sub2epi:
        print(i, "||||", sub2epi[i], subtitle)
    else:
        print(i, subtitle)

0 Thank you very much. Good day to you.
1 Good day to you.
2 Come and buy a dresser!
3 The years with Iisakki passed quickly.
4 Before I knew it, I was all grown up, with a beard and all.
5 The village had grown.
6 There were so many new children - that me and Iisakki could not keep count.
7 But we had a secret helper.
8 Nikolas.
9 -Eemeli.
10 Long time no see. You should come more often.
11 I've been busy. Iisakki is no longer young.
12 Do you have the list?
13 Well, I'll be... So many new children.
14 As a matter of fact, one name is missing from that list. Elsa?
15 Is that...
16 -A girl, three months.
17 Let's add her to the list.
18 What is the name of this little princess?
19 Aada.
20 Aada?
21 Hello, Aada.
22 Nikolas, meet Henrik and Hermanni.
23 My sons.
24 I sought them out and asked them here.
25 We were wrong when we...
26 We want to make it up to our father.
27 We came to take him to live with us.
28 -To live with you? Where?
29 Away from here. Father is too old to be living 