In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from data_construction.alignment.fine_alignment.utils.preprocessing import fetch_subsets
from data_construction.alignment.fine_alignment.utils.preprocessing import organize_coarse_alignment_by_seasons
from data_construction.parallel_corpus.utils import clean_sentences
from data_construction.parallel_corpus.utils import clean_sentence_brackets
from data_construction.alignment.fine_alignment.utils.helper_functions import *

from data_construction.parallel_corpus.utils import ParallelCorpusCollector

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Load Transcript and Subtitles

In [7]:
collector_tbbt_en_zh = ParallelCorpusCollector(
    en_subtitle_path = '../source_data/subtitles/en_zh/en_subtitles.pkl',
    other_subtitle_path = '../source_data/subtitles/en_zh/zh_subtitles.pkl',
    transcript_path = '../source_data/transcripts/tbbt/tbbt_transcripts.pkl',
    coarse_alignment_path = '../alignment/coarse_alignment/results/tbbt_en_zh.pkl',
    fine_alignment_path = '../alignment/fine_alignment/results/tbbt_en_zh/1_alignment_ultimate_12.pkl',
    root_path = "results/tbbt_en_zh/",
    output_path = 'parallel_data/tbbt_en_zh_12.pkl'
)

In [8]:
collector_tbbt_en_fa = ParallelCorpusCollector(
    en_subtitle_path = '../source_data/subtitles/en_fa/en_subtitles.pkl',
    other_subtitle_path = '../source_data/subtitles/en_fa/fa_subtitles.pkl',
    transcript_path = '../source_data/transcripts/tbbt/tbbt_transcripts.pkl',
    coarse_alignment_path = '../alignment/coarse_alignment/results/tbbt_en_fa.pkl',
    fine_alignment_path = '../alignment/fine_alignment/results/tbbt_en_fa/1_alignment_ultimate_12.pkl',
    root_path = "results/tbbt_en_fa/",
    output_path = 'parallel_data/tbbt_en_fa_12.pkl'
)

In [9]:
collector_friends_en_zh = ParallelCorpusCollector(
    en_subtitle_path = '../source_data/subtitles/en_zh/en_subtitles.pkl',
    other_subtitle_path = '../source_data/subtitles/en_zh/zh_subtitles.pkl',
    transcript_path = '../source_data/transcripts/friends/friends_transcripts.pkl',
    coarse_alignment_path = '../alignment/coarse_alignment/results/friends_en_zh.pkl',
    fine_alignment_path = '../alignment/fine_alignment/results/friends_en_zh/1_alignment_ultimate_12.pkl',
    root_path = "results/friends_en_zh/",
    output_path = 'parallel_data/friends_en_zh_12.pkl'
)

In [10]:
collector_friends_en_fa = ParallelCorpusCollector(
    en_subtitle_path = '../source_data/subtitles/en_fa/en_subtitles.pkl',
    other_subtitle_path = '../source_data/subtitles/en_fa/fa_subtitles.pkl',
    transcript_path = '../source_data/transcripts/friends/friends_transcripts.pkl',
    coarse_alignment_path = '../alignment/coarse_alignment/results/friends_en_fa.pkl',
    fine_alignment_path = '../alignment/fine_alignment/results/friends_en_fa/1_alignment_ultimate_12.pkl',
    root_path = "results/friends_en_zh/",
    output_path = 'parallel_data/friends_en_fa_12.pkl'
)

In [22]:
intersections = set(collector_tbbt_en_zh.all_alignment.keys()) & set(collector_tbbt_en_fa.all_alignment.keys())
print("TBBT Intersection", len(intersections))
for i, j in collector_tbbt_en_zh.all_alignment:
    if (i, j) not in intersections:
        continue
    print(i, j)
    print(collector_tbbt_en_zh.show_episode_alignment_result(i, j))
    print(collector_tbbt_en_fa.show_episode_alignment_result(i, j))

TBBT Intersection 89
1 1
(334, 258, 355, 350)
(334, 255, 361, 353)
1 3
(259, 213, 323, 315)
(259, 193, 313, 293)
1 4
(238, 182, 294, 286)
(238, 178, 292, 285)
1 5
(232, 188, 286, 281)
(232, 189, 279, 273)
1 7
(291, 239, 342, 330)
(291, 234, 328, 307)
1 9
(234, 200, 308, 302)
(234, 185, 281, 277)
1 10
(235, 188, 292, 291)
(235, 182, 275, 271)
1 11
(257, 209, 296, 289)
(257, 209, 297, 287)
1 12
(243, 113, 481, 252)
(243, 112, 464, 464)
1 13
(265, 203, 318, 294)
(265, 204, 326, 310)
1 14
(243, 195, 317, 303)
(243, 196, 316, 303)
1 15
(248, 209, 328, 317)
(248, 200, 315, 301)
1 16
(248, 196, 308, 292)
(248, 192, 289, 272)
2 4
(269, 222, 316, 305)
(269, 206, 422, 403)
2 6
(245, 213, 307, 301)
(245, 198, 289, 284)
3 21
(268, 211, 341, 330)
(268, 205, 355, 344)
3 23
(259, 215, 314, 310)
(259, 219, 319, 318)
4 1
(257, 213, 332, 322)
(257, 184, 303, 280)
4 3
(283, 231, 331, 321)
(283, 227, 370, 353)
4 4
(271, 210, 297, 287)
(271, 230, 351, 343)
4 5
(238, 193, 299, 291)
(238, 190, 323, 316)
4 6


In [23]:
intersections = set(collector_friends_en_zh.all_alignment.keys()) & set(collector_friends_en_fa.all_alignment.keys())
print("Friends Intersection", len(intersections))
for i, j in collector_friends_en_zh.all_alignment:
    if (i, j) not in intersections:
        continue
    print(i, j)
    print(collector_friends_en_zh.show_episode_alignment_result(i, j))
    print(collector_friends_en_fa.show_episode_alignment_result(i, j))

Friends Intersection 25
1 1
(315, 231, 434, 429)
(315, 251, 504, 425)
1 2
(254, 182, 329, 294)
(254, 194, 297, 285)
1 3
(272, 142, 237, 216)
(272, 183, 313, 276)
1 4
(268, 199, 345, 295)
(268, 191, 365, 318)
1 6
(229, 140, 247, 224)
(229, 170, 346, 317)
1 7
(252, 159, 296, 256)
(252, 169, 281, 267)
1 8
(246, 187, 293, 256)
(246, 181, 290, 260)
1 9
(242, 181, 336, 304)
(242, 191, 309, 288)
1 10
(241, 150, 301, 262)
(241, 181, 329, 311)
1 11
(297, 220, 353, 328)
(297, 1, 7, 8)
1 12
(270, 173, 287, 268)
(270, 161, 282, 253)
1 13
(256, 193, 331, 305)
(256, 178, 315, 290)
1 14
(206, 136, 271, 243)
(206, 166, 314, 301)
1 15
(258, 186, 318, 289)
(258, 176, 333, 310)
1 16
(287, 195, 299, 268)
(287, 214, 328, 305)
1 17
(296, 212, 684, 639)
(296, 205, 327, 308)
1 18
(275, 154, 253, 221)
(275, 180, 314, 277)
1 19
(253, 154, 262, 246)
(253, 178, 330, 302)
1 20
(256, 190, 318, 299)
(256, 182, 341, 302)
1 22
(290, 198, 316, 299)
(290, 188, 329, 303)
1 24
(269, 191, 311, 276)
(269, 181, 375, 281)
3 1

In [106]:
# TBBT EN-ZH
en_subtitle_path = '../source_data/subtitles/en_zh/en_subtitles.pkl'
other_subtitle_path = '../source_data/subtitles/en_zh/zh_subtitles.pkl'
transcript_path = '../source_data/transcripts/tbbt/tbbt_transcripts.pkl'
coarse_alignment_path = '../alignment/coarse_alignment/results/tbbt_en_zh.pkl'
fine_alignment_path = '../alignment/fine_alignment/results/tbbt_en_zh/1_alignment_ultimate_12.pkl'
root_path = "results/tbbt_en_zh/"
output_path = 'parallel_data/tbbt_en_zh_12.pkl'

collector_tbbt_en_zh = ParallelCorpusCollector(
    en_subtitle_path = '../source_data/subtitles/en_zh/en_subtitles.pkl',
    other_subtitle_path = '../source_data/subtitles/en_zh/zh_subtitles.pkl',
    transcript_path = '../source_data/transcripts/tbbt/tbbt_transcripts.pkl',
    coarse_alignment_path = '../alignment/coarse_alignment/results/tbbt_en_zh.pkl',
    fine_alignment_path = '../alignment/fine_alignment/results/tbbt_en_zh/1_alignment_ultimate_12.pkl',
    root_path = "results/tbbt_en_zh/",
    output_path = 'parallel_data/tbbt_en_zh_12.pkl'
)

with open(en_subtitle_path, 'rb') as f:
    en_subtitle = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    zh_subtitle = pkl.load(f)
with open(transcript_path, 'rb') as f:
    transcripts = pkl.load(f)
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
results = organize_coarse_alignment_by_seasons(temp)
with open(fine_alignment_path, 'rb') as f:
    all_alignment = pkl.load(f)

for item in all_alignment:
    epi2sub = all_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(all_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 0.6:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(all_alignment[item]), len(transcripts[item]),  len(all_alignment[item])/len(transcripts[item])*100, min(all_alignment[item].keys()))

(1, 12) Subtitle: 252 0.5239085239085239 || Episode: 113 243 46.50205761316872 102


In [97]:
# TBBT EN-FA
en_subtitle_path = '../source_data/subtitles/en_fa/en_subtitles.pkl'
other_subtitle_path = '../source_data/subtitles/en_fa/fa_subtitles.pkl'
transcript_path = '../source_data/transcripts/tbbt/tbbt_transcripts.pkl'
coarse_alignment_path = '../alignment/coarse_alignment/results/tbbt_en_fa.pkl'
fine_alignment_path = '../alignment/fine_alignment/results/tbbt_en_fa/1_alignment_ultimate_12.pkl'
root_path = "results/tbbt_en_fa/"
output_path = 'parallel_data/tbbt_en_fa_12.pkl'

with open(en_subtitle_path, 'rb') as f:
    en_subtitle = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    zh_subtitle = pkl.load(f)
with open(transcript_path, 'rb') as f:
    transcripts = pkl.load(f)
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
results = organize_coarse_alignment_by_seasons(temp)
with open(fine_alignment_path, 'rb') as f:
    all_alignment = pkl.load(f)

for item in all_alignment:
    epi2sub = all_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(all_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 0.6:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(all_alignment[item]), len(transcripts[item]),  len(all_alignment[item])/len(transcripts[item])*100, min(all_alignment[item].keys()))

In [98]:
# Friends EN-ZH
en_subtitle_path = '../source_data/subtitles/en_zh/en_subtitles.pkl'
other_subtitle_path = '../source_data/subtitles/en_zh/zh_subtitles.pkl'
transcript_path = '../source_data/transcripts/friends/friends_transcripts.pkl'
coarse_alignment_path = '../alignment/coarse_alignment/results/friends_en_zh.pkl'
fine_alignment_path = '../alignment/fine_alignment/results/friends_en_zh/1_alignment_ultimate_12.pkl'
root_path = "results/friends_en_zh/"
output_path = 'parallel_data/friends_en_zh_12.pkl'

with open(en_subtitle_path, 'rb') as f:
    en_subtitle = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    zh_subtitle = pkl.load(f)
with open(transcript_path, 'rb') as f:
    transcripts = pkl.load(f)
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
results = organize_coarse_alignment_by_seasons(temp)
with open(fine_alignment_path, 'rb') as f:
    all_alignment = pkl.load(f)

for item in all_alignment:
    epi2sub = all_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(all_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 100:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(all_alignment[item]), len(transcripts[item]),  len(all_alignment[item])/len(transcripts[item])*100, min(all_alignment[item].keys()))

(1, 1) Subtitle: 429 0.988479262672811 || Episode: 231 315 73.33333333333333 5
(1, 2) Subtitle: 294 0.8936170212765957 || Episode: 182 254 71.65354330708661 9
(1, 3) Subtitle: 216 0.9113924050632911 || Episode: 142 272 52.20588235294118 56
(1, 4) Subtitle: 295 0.855072463768116 || Episode: 199 268 74.25373134328358 3
(1, 5) Subtitle: 320 0.9384164222873901 || Episode: 189 254 74.40944881889764 2
(1, 6) Subtitle: 224 0.9068825910931174 || Episode: 140 229 61.135371179039296 49
(1, 7) Subtitle: 256 0.8648648648648649 || Episode: 159 252 63.095238095238095 12
(1, 8) Subtitle: 256 0.8737201365187713 || Episode: 187 246 76.01626016260163 12
(1, 9) Subtitle: 304 0.9047619047619048 || Episode: 181 242 74.79338842975206 4
(1, 10) Subtitle: 262 0.8704318936877077 || Episode: 150 241 62.24066390041494 40
(1, 11) Subtitle: 328 0.9291784702549575 || Episode: 220 297 74.07407407407408 0
(1, 12) Subtitle: 268 0.9337979094076655 || Episode: 173 270 64.07407407407408 2
(1, 13) Subtitle: 305 0.92145015

ZeroDivisionError: division by zero

In [105]:
# Friends EN-ZH
en_subtitle_path = '../source_data/subtitles/en_fa/en_subtitles.pkl'
other_subtitle_path = '../source_data/subtitles/en_fa/fa_subtitles.pkl'
transcript_path = '../source_data/transcripts/friends/friends_transcripts.pkl'
coarse_alignment_path = '../alignment/coarse_alignment/results/friends_en_fa.pkl'
fine_alignment_path = '../alignment/fine_alignment/results/friends_en_fa/1_alignment_ultimate_12.pkl'
root_path = "results/friends_en_zh/"
output_path = 'parallel_data/friends_en_fa_12.pkl'

with open(en_subtitle_path, 'rb') as f:
    en_subtitle = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    zh_subtitle = pkl.load(f)
with open(transcript_path, 'rb') as f:
    transcripts = pkl.load(f)
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
results = organize_coarse_alignment_by_seasons(temp)
with open(fine_alignment_path, 'rb') as f:
    all_alignment = pkl.load(f)

for item in all_alignment:
    epi2sub = all_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(all_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 100:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(all_alignment[item]), len(transcripts[item]),  len(all_alignment[item])/len(transcripts[item])*100, min(all_alignment[item].keys()))

(1, 1) Subtitle: 425 0.8432539682539683 || Episode: 251 315 79.68253968253968 0
(1, 2) Subtitle: 285 0.9595959595959596 || Episode: 194 254 76.37795275590551 0
(1, 3) Subtitle: 276 0.8817891373801917 || Episode: 183 272 67.27941176470588 3
(1, 4) Subtitle: 318 0.8712328767123287 || Episode: 191 268 71.26865671641791 1
(1, 6) Subtitle: 317 0.9161849710982659 || Episode: 170 229 74.235807860262 0
(1, 7) Subtitle: 267 0.9501779359430605 || Episode: 169 252 67.06349206349206 0
(1, 8) Subtitle: 260 0.896551724137931 || Episode: 181 246 73.57723577235772 0
(1, 9) Subtitle: 288 0.9320388349514563 || Episode: 191 242 78.92561983471074 0
(1, 10) Subtitle: 311 0.9452887537993921 || Episode: 181 241 75.10373443983403 3
(1, 11) Subtitle: 8 1.1428571428571428 || Episode: 1 297 0.33670033670033667 13
(1, 12) Subtitle: 253 0.8971631205673759 || Episode: 161 270 59.62962962962963 1
(1, 13) Subtitle: 290 0.9206349206349206 || Episode: 178 256 69.53125 0
(1, 14) Subtitle: 301 0.9585987261146497 || Episo

In [101]:
with open('../alignment/fine_alignment/results/friends_en_zh/1_alignment_ultimate_12.pkl', 'rb') as f:
    all_alignment = pkl.load(f)
with open('../alignment/fine_alignment/results/friends_en_zh/1_alignment_extension_12.pkl', 'rb') as f:
    extension = pkl.load(f)
with open('../alignment/fine_alignment/results/friends_en_zh/1_alignment_head_tail_12.pkl', 'rb') as f:
    head_tail = pkl.load(f)

for item in all_alignment:
    print(head_tail[item])
    print()
    print(extension[item])
    print()
    print(all_alignment[item])

KeyError: (1, 1)

In [99]:
# Friends EN-ZH
en_subtitle_path = '../source_data/subtitles/en_fa/en_subtitles.pkl'
other_subtitle_path = '../source_data/subtitles/en_fa/fa_subtitles.pkl'
transcript_path = '../source_data/transcripts/friends/friends_transcripts.pkl'
coarse_alignment_path = '../alignment/coarse_alignment/results/friends_en_fa.pkl'
fine_alignment_path = '../alignment/fine_alignment/results/friends_en_fa/1_alignment_ultimate_12.pkl'
root_path = "results/friends_en_fa/"
output_path = 'parallel_data/friends_en_fa_12.pkl'

with open(en_subtitle_path, 'rb') as f:
    en_subtitle = pkl.load(f)
with open(other_subtitle_path, 'rb') as f:
    zh_subtitle = pkl.load(f)
with open(transcript_path, 'rb') as f:
    transcripts = pkl.load(f)
with open(coarse_alignment_path, 'rb') as f:
    temp = pkl.load(f)
results = organize_coarse_alignment_by_seasons(temp)
with open(fine_alignment_path, 'rb') as f:
    all_alignment = pkl.load(f)

for item in all_alignment:
    epi2sub = all_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(all_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 100:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(all_alignment[item]), len(transcripts[item]),  len(all_alignment[item])/len(transcripts[item])*100, min(all_alignment[item].keys()))

(1, 1) Subtitle: 425 0.8432539682539683 || Episode: 251 315 79.68253968253968 0
(1, 2) Subtitle: 285 0.9595959595959596 || Episode: 194 254 76.37795275590551 0
(1, 3) Subtitle: 276 0.8817891373801917 || Episode: 183 272 67.27941176470588 3
(1, 4) Subtitle: 318 0.8712328767123287 || Episode: 191 268 71.26865671641791 1
(1, 6) Subtitle: 317 0.9161849710982659 || Episode: 170 229 74.235807860262 0
(1, 7) Subtitle: 267 0.9501779359430605 || Episode: 169 252 67.06349206349206 0
(1, 8) Subtitle: 260 0.896551724137931 || Episode: 181 246 73.57723577235772 0
(1, 9) Subtitle: 288 0.9320388349514563 || Episode: 191 242 78.92561983471074 0
(1, 10) Subtitle: 311 0.9452887537993921 || Episode: 181 241 75.10373443983403 3
(1, 11) Subtitle: 8 1.1428571428571428 || Episode: 1 297 0.33670033670033667 13
(1, 12) Subtitle: 253 0.8971631205673759 || Episode: 161 270 59.62962962962963 1
(1, 13) Subtitle: 290 0.9206349206349206 || Episode: 178 256 69.53125 0
(1, 14) Subtitle: 301 0.9585987261146497 || Episo

In [None]:
# """
# Collect the data of a certain episode by scene
# """
# def collect_parallel_corpus(tbbt_transcripts, en_subtitle, zh_subtitle, results, season_id, episode_id, all_alignment):
#     # Fetch subset data
#     (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
#         episode=tbbt_transcripts,
#         en_subtitle=en_subtitle,
#         zh_subtitle=zh_subtitle,
#         results=results,
#         season_id=season_id,
#         episode_id=episode_id,
#         bias=200
#     )
#
#     # Construct the index dictionary from original index to the collected index
#     idx_dict = {}
#
#     idx = 0
#     for i, x in enumerate(tbbt_episode):
#         while True:
#             if x[0]==tbbt_transcripts[(season_id,episode_id)][idx][0] and x[1]==tbbt_transcripts[(season_id,episode_id)][idx][1]:
#                 idx_dict[idx] = i
#                 idx += 1
#                 break
#             else:
#                 idx += 1
#
#
#     ## Collect ZH subtitles to episodes
#     alignment = all_alignment[(season_id,episode_id)]
#     en_subset = en_subset
#     zh_subset = zh_subset
#
#     one_episode = []
#     # Turn episode into a dictionary form
#     for x in tbbt_episode:
#         print(x)
#         temp = {}
#         temp['utterance'] = x[0]
#         temp['speaker'] = x[1]
#         one_episode.append(temp)
#
#     # Add subtitles into episode
#     for x in alignment:
#         en_subs = []
#         zh_subs = []
#         for item in alignment[x]:
#             en_subs.append(en_subset[item])
#             zh_subs.append(zh_subset[item])
#         one_episode[x]['en_subtitles'] = en_subs
#         one_episode[x]['zh_subtitles'] = zh_subs
#
#
#     # Store all scenes
#     scenes = []
#
#     # Iterate all episodes into one scene
#     temp = []
#     for i, x in enumerate(tbbt_transcripts[(season_id,episode_id)]):
#         if x[1]=='Scene':
#             scenes.append(temp)
#             temp = []
#         elif i in idx_dict:
#             temp.append(one_episode[idx_dict[i]])
#     scenes.pop(0)
#
#     return scenes

## Collect Parallel Corpus

In [86]:
"""
Collect the data of a certain episode by scene
"""
def collect_parallel_corpus(tbbt_transcripts, en_subtitle, zh_subtitle, results, season_id, episode_id, all_alignment):
    # Fetch subset data
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=200,
        zh_split=True
    )

    # Construct the index dictionary from original index to the collected index
    idx_dict = {}

    idx = 0
    for i, x in enumerate(tbbt_episode):
        while True:
            if x[0]==tbbt_transcripts[(season_id,episode_id)][idx][0] and x[1]==tbbt_transcripts[(season_id,episode_id)][idx][1]:
                idx_dict[idx] = i
                idx += 1
                break
            else:
                idx += 1


    ## Collect ZH subtitles to episodes
    alignment = all_alignment[(season_id,episode_id)]
    en_subset = en_subset
    zh_subset = zh_subset

    one_episode = []
    # Turn episode into a dictionary form
    for x in tbbt_episode:
        temp = {}
        temp['speaker'] = x[1]
        temp['utterance'] = clean_sentence_brackets(x[0])
        temp['utterance_with_info'] = x[0]
        temp['en_subtitles'] = ""
        temp['zh_subtitles'] = ""
        one_episode.append(temp)

    # Add subtitles into episode
    for x in alignment:
        en_subs = []
        zh_subs = []
        for item in alignment[x]:
            en_subs.append(en_subset[item])
            zh_subs.append(zh_subset[item])
        one_episode[x]['en_subtitles'] = " ".join(en_subs)
        one_episode[x]['zh_subtitles'] = " ".join(zh_subs)

    return one_episode

In [90]:
count = 0
parallel_corpus = {}
for x in all_alignment:
    print(all_alignment[x])
    # print((max(list(all_alignment[x].keys()))-min(list(all_alignment[x].keys))))
    print()
    count += len(all_alignment[x])
    temp = collect_parallel_corpus(transcripts, en_subtitle, zh_subtitle, results, x[0], x[1], all_alignment)
    parallel_corpus[x] = temp
with open(output_path, 'wb') as f:
    pkl.dump(parallel_corpus, f)
print(count)

{0: [190, 191, 192, 193], 1: [194], 2: [195], 5: [196, 197, 198, 199, 200, 201], 6: [202], 7: [203], 8: [204], 9: [205], 10: [206], 11: [207], 12: [208, 209], 13: [210], 14: [211], 15: [212, 213, 214], 16: [215, 216], 17: [217, 218, 219], 18: [220], 19: [221], 20: [222], 21: [223], 22: [224], 23: [225], 24: [226], 26: [227], 29: [229], 31: [230], 32: [231], 33: [232], 34: [233, 234], 35: [235, 236, 237], 36: [238], 37: [239], 38: [240], 39: [241], 40: [242], 41: [243], 42: [244, 245], 46: [246], 47: [247], 48: [248, 249], 49: [250], 54: [252], 56: [254], 59: [255], 63: [256], 64: [257], 65: [258], 66: [259], 67: [260], 68: [261], 69: [262, 263], 70: [264, 265], 71: [266], 72: [267], 73: [268, 269], 74: [270], 75: [271, 272], 76: [273], 77: [274], 81: [275, 276], 82: [277], 83: [278], 85: [279], 87: [280], 88: [281], 89: [282], 92: [283], 93: [284], 95: [285, 286], 96: [287], 97: [288], 98: [289], 99: [290], 101: [291, 292], 103: [293], 104: [294], 105: [295], 106: [296], 107: [297], 10

In [89]:
count  = 0
for x in parallel_corpus:
    count += len(parallel_corpus[x])
print(count)

40692


In [94]:
with open(fine_alignment_path, 'rb') as f:
    further_alignment = pkl.load(f)

for item in further_alignment:
    epi2sub = further_alignment[item]
    sub2epi = turn_sub2epi_into_epi2sub(further_alignment[item])
    if len(sub2epi)/(max(sub2epi)-min(sub2epi)) < 0.6:
        print(item, "Subtitle:", len(sub2epi), len(sub2epi)/(max(sub2epi)-min(sub2epi)), "||", "Episode:", len(further_alignment[item]), len(transcripts[item]),  len(further_alignment[item])/len(transcripts[item])*100, min(further_alignment[item].keys()))
    # print(epi2sub)

(1, 12) Subtitle: 252 0.5239085239085239 || Episode: 113 243 46.50205761316872 102
