In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

from data_construction.alignment.fine_alignment.utils.visulization import generate_xlxs_for_episode
from data_construction.alignment.fine_alignment.utils.preprocessing import organize_coarse_alignment_by_seasons

In [2]:
# Sentence normalisation pipeline. The steps are handed to jiwer.Compose in the
# order listed here.
# NOTE(review): punctuation is removed after multiple spaces are collapsed, so
# removing a free-standing punctuation mark may leave repeated spaces behind;
# confirm this ordering is intended.
_normalization_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(_normalization_steps)

In [3]:
# Input/output locations; all of them are relative paths.
en_subtitle_path = "../../source_data/subtitles/en_zh/en_subtitles.pkl"
other_subtitle_path = "../../source_data/subtitles/en_zh/zh_subtitles.pkl"
transcript_path = "../../source_data/transcripts/tbbt/tbbt_transcripts.pkl"
coarse_alignment_path = "../coarse_alignment/results/tbbt_en_zh.pkl"
results_root_path = "results/tbbt_en_zh/"


def _read_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code, so only point this at
    trusted, locally produced files.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Load the English subtitles, the other-language subtitles and the transcripts.
all_en_subtitles = _read_pickle(en_subtitle_path)
all_other_subtitles = _read_pickle(other_subtitle_path)
all_transcripts = _read_pickle(transcript_path)

In [4]:
# Unpickle the raw coarse alignment first, then hand it to
# organize_coarse_alignment_by_seasons once the file has been closed.
with open(coarse_alignment_path, 'rb') as coarse_file:
    temp = pkl.load(coarse_file)
coarse_alignments = organize_coarse_alignment_by_seasons(temp)

In [5]:
# Load the pickled alignment seeds from the fine-alignment results directory.
alignment_root = "../fine_alignment/results/tbbt_en_zh/"
seeds_path = alignment_root + "0_alignment_seeds_12.pkl"
with open(seeds_path, 'rb') as seeds_file:
    seeds = pkl.load(seeds_file)

In [12]:
# Load the pickled head/tail alignment from the fine-alignment results directory.
alignment_root = "../fine_alignment/results/tbbt_en_zh/"
head_tail_path = alignment_root + "1_alignment_head_tail_12.pkl"
with open(head_tail_path, 'rb') as head_tail_file:
    alignment = pkl.load(head_tail_file)

In [7]:
# Sanity check: how many top-level entries the loaded alignment holds.
num_alignment_entries = len(alignment)
print(num_alignment_entries)

184


In [8]:
# NOTE(review): wildcard import placed in the middle of the notebook.
# `filter_by_idx`, called in a later cell, is presumably provided by this
# module — confirm, then import the required names explicitly in the first cell.
from data_construction.alignment.fine_alignment.utils.alignment_seeds import *

In [9]:
# Relative directory passed as `xlsx_path` to generate_xlxs_for_episode further down.
# NOTE(review): the folder name is spelled "xlxs" (not "xlsx"); presumably it matches
# an existing directory — confirm before renaming.
xlsx_path = "xlxs/"

In [10]:
# For every entry whose mapping differs from what filter_by_idx returns for it,
# print the key, the original mapping and the filtered mapping, followed by a
# blank line.
for item, former in alignment.items():
    latter = filter_by_idx(former)
    if former == latter:
        continue
    print(item, former, latter, "", sep="\n")

(1, 6)
{0: [197, 198], 235: [488], 236: [489], 237: [490, 491], 238: [492], 242: [495, 496], 244: [498], 245: [499], 246: [500], 247: [501], 248: [502], 249: [503], 250: [505, 506, 507]}
{0: [197, 198], 235: [488], 236: [489], 237: [490, 491], 238: [492], 242: [496, 495], 244: [498], 245: [499], 246: [500], 247: [501], 248: [502], 249: [503], 250: [505, 506, 507]}

(1, 13)
{0: [200, 201], 1: [202], 2: [203, 204], 3: [205, 206], 4: [207, 208], 244: [506], 246: [510], 248: [512], 249: [513], 252: [516], 254: [518]}
{0: [200, 201], 1: [202], 2: [203, 204], 3: [205, 206], 4: [208, 207], 244: [506], 246: [510], 248: [512], 249: [513], 252: [516], 254: [518]}

(1, 15)
{226: [489], 227: [490, 491], 228: [493], 230: [495], 231: [496], 233: [498], 237: [503, 504]}
{226: [489], 227: [490, 491], 228: [493], 230: [495], 231: [496], 233: [498], 237: [504, 503]}

(1, 16)
{0: [182], 221: [465, 466, 467, 535], 222: [468], 226: [471, 472], 227: [473, 477, 494], 228: [474], 229: [478], 230: [480, 481, 4

In [13]:
# Call generate_xlxs_for_episode once per key of `alignment`; each key is indexed
# as (season_id, episode_id). The loop is best-effort: a failure on one episode is
# reported and skipped so the remaining episodes are still processed.
for epi_key in tqdm(list(alignment.keys())):
    try:
        generate_xlxs_for_episode(
            season_id=epi_key[0],
            episode_id=epi_key[1],
            tbbt_transcripts=all_transcripts,
            en_subtitle=all_en_subtitles,
            zh_subtitle=all_other_subtitles,
            results=coarse_alignments,
            xlsx_path=xlsx_path,
            alignment=alignment,
            zh_split=True,
        )
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still stop the loop, and say which
        # episode failed and why instead of printing an anonymous "Pass".
        print(f"Pass: episode {epi_key} failed with {type(exc).__name__}: {exc}")

100%|██████████| 184/184 [00:10<00:00, 17.82it/s]


In [13]:
# Inspect the seed mapping stored under the key (1, 1).
print(seeds[1, 1])

{190: [0], 204: [8], 205: [9], 214: [15], 216: [16], 217: [17], 219: [17], 229: [29], 232: [33], 236: [35], 238: [36], 239: [37], 243: [41], 258: [65], 272: [75], 275: [81], 276: [81], 280: [87], 281: [88], 282: [89], 283: [92], 285: [95], 290: [99], 292: [101], 297: [107], 304: [115], 306: [115], 307: [115], 308: [116], 323: [133], 324: [134], 327: [137], 338: [144], 341: [147], 350: [153], 352: [154], 358: [159], 364: [160], 365: [160], 368: [161], 370: [163], 374: [170], 376: [172], 377: [173], 378: [174], 379: [175], 380: [176], 386: [182], 390: [185], 392: [186], 399: [192], 404: [198], 407: [200], 413: [206], 414: [207], 418: [208], 420: [209], 425: [214], 439: [225], 440: [227], 445: [229], 446: [229], 448: [231], 454: [236], 457: [239], 460: [240], 462: [241], 466: [244], 480: [257], 481: [258], 483: [260], 484: [261], 485: [261], 486: [262], 493: [269], 501: [279], 502: [280], 514: [287], 521: [294], 522: [295], 523: [296], 526: [298], 528: [298], 532: [301], 534: [303], 538: 