In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_fa/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_fa/fa_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [23]:
print(len(zh_subtitle))

9969005


In [5]:
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

In [6]:
# Load alignment results after stage-2
with open('alignment_results/zh/indexs_tbbt_zh.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [6]:
# Load alignment results after stage-2
with open('alignment_results/fa/indexs_tbbt_fa.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [188]:
with open('alignment_results/zh/head_tail_alignment.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [27]:
with open('alignment_results/zh/alignment_seeds.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [172]:
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [7]:
with open('alignment_results/zh/final_stage_alignment_1_with_head_tail.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [19]:
with open('alignment_results/zh/ultimate_data.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [7]:
with open('alignment_results/fa/ultimate_data.pkl', 'rb') as f:
    further_alignment = pkl.load(f)

In [8]:
alignment = further_alignment

In [21]:
print(alignment)

{(1, 1): {0: [200, 201, 202, 203], 1: [204], 2: [205], 5: [206, 207, 208, 209, 210, 211], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218, 219], 13: [220], 14: [221], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 23: [235], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [245, 246, 247], 35: [248], 36: [249], 37: [250], 38: [251], 39: [252], 40: [253], 41: [254, 255], 45: [256], 46: [257], 47: [258, 259], 48: [260], 49: [261], 54: [262], 56: [264], 59: [265], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [281, 282], 76: [283], 82: [285, 286], 83: [287], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295, 296], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109

In [9]:
# Write into xlsx file
for x in alignment:
    print(x)
    print(alignment[x])
    # Define season and episode
    season_id = x[0]
    episode_id = x[1]

    # Define xlsx file
    episode_book = xlsxwriter.Workbook('alignment_results/fa/xlsx_files/episodes/f_episode_s%de%d.xlsx'%(season_id, episode_id))
    episode_sheet = episode_book.add_worksheet()
    episode_bold = episode_book.add_format({'bold':1})
    for j, item in enumerate(['utterance', 'speaker', 'subtitle id']):
        episode_sheet.write(0, j, item, episode_bold)

    subtitle_book = xlsxwriter.Workbook('alignment_results/fa/xlsx_files/subtitles/f_subtitle_s%de%d.xlsx'%(season_id, episode_id))
    subtitle_sheet = subtitle_book.add_worksheet()
    subtitle_bold = subtitle_book.add_format({'bold':1})
    for j, item in enumerate(['subtitle_en', 'subtitle_zh', 'episode id']):
        subtitle_sheet.write(0, j, item, subtitle_bold)

    # Load Data
    epi2sub = alignment[x]
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)

    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200
    )

    # Write into file
    for i, (utt, speaker) in enumerate(tbbt_episode):
        if i in epi2sub:
            temp = [utt, str(speaker), " ".join([str(item+2) for item in epi2sub[i]])]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item, episode_bold)
        else:
            temp = [utt, str(speaker), " "]
            for j, item in enumerate(temp):
                episode_sheet.write(i+1, j, item)

    for i, subtitle in enumerate(en_subset):
        if i in sub2epi:
            temp = [subtitle, zh_subset[i], " ".join([str(item+2) for item in sub2epi[i]])]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item, subtitle_bold)
        else:
            temp = [subtitle, zh_subset[i], " "]
            for j, item in enumerate(temp):
                subtitle_sheet.write(i+1, j, item)

    episode_book.close()
    subtitle_book.close()

(1, 1)
{0: [200, 201, 202, 203], 1: [204], 2: [205], 3: [206], 5: [207, 208, 209, 210, 211, 212, 213], 6: [214], 7: [215], 8: [216], 9: [217], 10: [218], 11: [219], 12: [220, 221, 222], 13: [223], 14: [224, 225], 15: [226, 227, 228, 229], 16: [230, 231], 17: [232, 233, 234], 18: [235], 20: [236], 21: [237], 22: [238], 23: [239], 24: [240], 28: [242], 30: [243], 32: [244, 245, 246], 33: [247], 34: [248, 249], 35: [250], 36: [251], 39: [254], 40: [255], 41: [256], 45: [257], 46: [258], 47: [259, 260], 48: [261], 49: [262], 54: [263], 59: [266], 63: [267], 64: [268], 65: [269], 66: [270], 67: [271, 272], 68: [273], 69: [274, 275], 70: [276, 277], 71: [278], 72: [279], 73: [280, 281], 74: [282], 75: [283, 284], 76: [285], 77: [286, 287], 82: [289, 290], 83: [291], 84: [292], 86: [293], 88: [294], 89: [295], 90: [296], 92: [298], 93: [299, 300], 94: [301], 95: [302], 96: [303, 304], 97: [305], 99: [306], 100: [307], 102: [308, 309], 104: [310], 105: [311], 106: [312], 107: [313], 108: [314]

In [44]:
# Collect Data as Multi-Coref Source Data
for epi_id, sub_id in further_alignment:
    print((epi_id, sub_id), len(further_alignment[(epi_id, sub_id)]), len(turn_sub2epi_into_epi2sub(further_alignment[(epi_id, sub_id)])))
    # print(further_alignment[(epi_id, sub_id)])

(1, 1) 235 317
(1, 2) 177 264
(1, 3) 188 268
(1, 4) 156 240
(1, 5) 160 236
(1, 6) 178 238
(1, 7) 187 253
(1, 8) 211 310
(1, 9) 177 251
(1, 10) 171 253
(1, 11) 182 243
(1, 12) 107 165
(1, 13) 160 230
(1, 14) 170 250
(1, 15) 179 259
(1, 16) 171 248
(2, 1) 176 265
(2, 2) 186 255
(2, 3) 172 243
(2, 4) 180 249
(2, 5) 151 226
(2, 6) 177 240
(2, 7) 179 252
(2, 8) 143 199
(2, 9) 158 249
(2, 10) 151 225
(2, 11) 175 258
(2, 12) 158 252
(2, 13) 161 270
(2, 14) 195 273
(2, 15) 185 245
(2, 16) 187 257
(2, 17) 174 261
(2, 18) 147 203
(2, 19) 178 259
(2, 20) 172 237
(2, 21) 179 251
(2, 22) 156 222
(2, 23) 176 263
(3, 1) 164 249
(3, 2) 167 270
(3, 3) 171 234
(3, 4) 187 272
(3, 5) 177 250
(3, 6) 159 248
(3, 7) 195 291
(3, 8) 154 212
(3, 9) 158 238
(3, 10) 194 290
(3, 11) 171 242
(3, 12) 164 240
(3, 13) 141 223
(3, 14) 149 211
(3, 15) 161 268
(3, 16) 174 267
(3, 17) 157 262
(3, 18) 139 264
(3, 19) 184 252
(3, 20) 200 263
(3, 21) 187 296
(3, 22) 185 260
(3, 23) 186 271
(4, 1) 196 292
(4, 2) 179 248
(4, 3

In [47]:
for x in tbbt_transcripts[(1,1)]:
    print(x)

[' A corridor at a sperm bank.', 'Scene']
[' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon']
[' Agreed, what’s your point?', 'Leonard']
[' There’s no point, I just think it’s a good idea for a tee-shirt. ', 'Sheldon']
[' Excuse me?', 'Leonard']
[' Hang on. ', 'Receptionist']
[' One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. ', 'Leonard']
[' Can I help you?', 'Receptionist']
[' Yes. Um, is this the High IQ sperm bank?', 'Leonard']
[' If you have to ask, maybe you shouldn’t be here.', 'Receptionist']
[' I think this is the place.', 'Sheldon']
[' Fill these out.', 'Receptionist']

## Part 2: Construct the Final Corpus for Multi_Coref

In [53]:
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'rb') as f:
    alignment_seeds = pkl.load(f)

In [155]:
def fetch_index_information(episode, en_subtitle, zh_subtitle, results, season_id, episode_id, bias):
    idx_list, gaps = get_epi_indexs_gaps(results[season_id][episode_id])
    subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)[-1]
    # Calculate gaps within the subset
    gaps_subsets = calculate_gaps(subsets)

    # Prepare Subtitle Subset
    start = subsets[0] - bias
    end = subsets[-1] + bias
    en_subset = en_subtitle[start: end]
    zh_subset = zh_subtitle[start: end]


    tbbt_episode = []
    count = 0
    temp = []
    for x in episode[(season_id, episode_id)]:
        if x[1] != 'Scene':
            temp.append([count, x])
            count += 1
        else:
            tbbt_episode.append(temp)
            temp = []

    print(len(tbbt_episode))

    for item in tbbt_episode:
        print(item)
        print("=="*50)

    # # Clean the episode data
    # # 1. Remove empty string
    # # 2. Remove duplicate stings
    # temp_tbbt_episode = []
    # abandon_idx = set()
    # for i, x in enumerate(tbbt_episode):
    #     if transformation(x[0]) in [" ", ""]:
    #         abandon_idx.add(i)
    # for length in range(6):
    #     length += 1
    #     for i in range(len(tbbt_episode)-length):
    #         if tbbt_episode[i][0]==tbbt_episode[i+length][0] and tbbt_episode[i][1]==tbbt_episode[i+length][1]:
    #             abandon_idx.add(i)
    #
    # for i, item in enumerate(tbbt_episode):
    #     if i not in abandon_idx:
    #         temp_tbbt_episode.append(item)


    # tbbt_episode = []
    # for x in episode[(season_id, episode_id)]:
    #     if x[1] != 'Scene':
    #         tbbt_episode.append(x)
    #
    #
    # # Clean the episode data
    # # 1. Remove empty string
    # # 2. Remove duplicate stings
    # temp_tbbt_episode = []
    # abandon_idx = set()
    # for i, x in enumerate(tbbt_episode):
    #     if transformation(x[0]) in [" ", ""]:
    #         abandon_idx.add(i)
    # for length in range(6):
    #     length += 1
    #     for i in range(len(tbbt_episode)-length):
    #         if tbbt_episode[i][0]==tbbt_episode[i+length][0] and tbbt_episode[i][1]==tbbt_episode[i+length][1]:
    #             abandon_idx.add(i)
    #
    # for i, item in enumerate(tbbt_episode):
    #     if i not in abandon_idx:
    #         temp_tbbt_episode.append(item)

    return en_subset, zh_subset, temp_tbbt_episode


In [156]:
def collect_multi_coref_corpus(tbbt, en_subtitle, other_subtitle, results, season_id, episode_id, bias):
    print(season_id, episode_id)
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_index_information(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    print(len(tbbt_episode))
    # Load Abandon Idx
    abandon_idx = set()
    for i, x in enumerate(tbbt_episode):
        if transformation(x[0]) in [" ", ""]:
            abandon_idx.add(i)
    for length in range(6):
        length += 1
        for i in range(len(tbbt_episode)-length):
            if tbbt_episode[i][0]==tbbt_episode[i+length][0] and tbbt_episode[i][1]==tbbt_episode[i+length][1]:
                abandon_idx.add(i)

    one_episode = []
    temp = []
    for i, x in enumerate(tbbt_transcripts[(season_id, episode_id)]):
        if x[1]=='Scene':
            one_episode.append(temp)
            temp = []
        else:
            if i in abandon_idx:
                continue
            temp.append(x)

In [157]:
further_alignment = {}
for (i, j) in alignment_seeds.keys():
    if (i,j)!=(1,1):
        continue
    temp = collect_multi_coref_corpus(tbbt_transcripts, en_subtitle, zh_subtitle, results, i, j, 200)
    # further_alignment[(i,j)] = temp
    # print("Season:", i,"Episode:", j, "Episode Number:",len(turn_sub2epi_into_epi2sub(alignment_seeds[(i,j)])), len(temp), "Subtitle Number:", len(alignment_seeds[(i,j)]),len(turn_sub2epi_into_epi2sub(temp)))

1 1
12
[]
[[0, [' So if a photon is directed through a plane with two slits in it and either slit is observed it will not go through both slits. If it’s unobserved it will, however, if it’s observed after it’s left the plane but before it hits its target, it will not have gone through both slits.', 'Sheldon']], [1, [' Agreed, what’s your point?', 'Leonard']], [2, [' There’s no point, I just think it’s a good idea for a tee-shirt. ', 'Sheldon']], [3, [' Excuse me?', 'Leonard']], [4, [' Hang on. ', 'Receptionist']], [5, [' One across is Aegean, eight down is Nabakov, twenty-six across is MCM, fourteen down is… move your finger… phylum, which makes fourteen across Port-au-Prince. See, Papa Doc’s capital idea, that’s Port-au-Prince. Haiti. ', 'Leonard']], [6, [' Can I help you?', 'Receptionist']], [7, [' Yes. Um, is this the High IQ sperm bank?', 'Leonard']], [8, [' If you have to ask, maybe you shouldn’t be here.', 'Receptionist']], [9, [' I think this is the place.', 'Sheldon']], [10, ['

NameError: name 'temp_tbbt_episode' is not defined