In [3]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [4]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [5]:
# Define sentence transformation
# Text-normalisation pipeline built from jiwer transforms; jiwer.Compose
# applies the listed steps in order.
# NOTE(review): RemoveMultipleSpaces runs before RemovePunctuation, so any
# double spaces left behind once punctuation is stripped (e.g. "a - b") are
# presumably not collapsed -- confirm this ordering is intended.
# NOTE(review): `transformation` is not referenced by any other cell visible
# in this part of the notebook.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    # Contractions are expanded while apostrophes are still present.
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

## Part 0: Load Data

In [6]:
# Load Open Subtitle
# Both pickles come from the `en_fa` folder of the OpenSubtitles dump.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
with open('../open_subtitle/en_fa/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
# NOTE(review): despite the `zh_` prefix, this variable holds the contents of
# `fa_subtitles.pkl`. The name is kept because later cells pass `zh_subtitle`
# on to collect_parallel_corpus.
with open('../open_subtitle/en_fa/fa_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [7]:
# Load Source Transcript
# `tbbt_transcripts` is later indexed with (season_id, episode_id) tuples.
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

# Use the original index information to fetch the en_subset
# `temp` is only an intermediate: it is handed straight to
# organize_tbbt_by_seasons, whose output (`results`) is what later cells
# forward to fetch_subsets.
# NOTE(review): presumably `results` groups the index information by season --
# confirm against utils.preprocessing.organize_tbbt_by_seasons.
with open('alignment_results/fa/indexs_tbbt_fa.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [8]:
# Exact alignment information
# `all_alignment` is iterated over later; each key is used as
# (season_id, episode_id) via key[0] / key[1], and each value is the
# per-episode alignment consumed by collect_parallel_corpus.
with open('alignment_results/fa/final_stage_alignment_1.pkl', 'rb') as f:
    all_alignment = pkl.load(f)

## Part 1: Experiment with one set

In this part, we need to do the following things:
1. Attach the ZH subtitles to the transcript
2. Divide the transcript by scene

In [9]:
"""
Collect the data of a certain episode by scene
"""
def collect_parallel_corpus(tbbt_transcripts, en_subtitle, zh_subtitle, results, season_id, episode_id, all_alignment, bias=200):
    """Collect one episode's utterances, with their aligned subtitles, by scene.

    Args:
        tbbt_transcripts: Mapping from ``(season_id, episode_id)`` to the
            episode transcript. Each transcript entry is indexable; ``[0]`` is
            compared against the utterance text and ``[1]`` against the
            speaker. An entry whose ``[1]`` equals ``'Scene'`` is treated as a
            scene boundary.
        en_subtitle: Forwarded unchanged to ``fetch_subsets``.
        zh_subtitle: Forwarded unchanged to ``fetch_subsets``.
        results: Forwarded unchanged to ``fetch_subsets``.
        season_id: Season part of the episode key.
        episode_id: Episode part of the episode key.
        all_alignment: Mapping from ``(season_id, episode_id)`` to a mapping
            ``{index into the collected episode: iterable of indices into the
            subtitle subsets}``.
        bias: Forwarded to ``fetch_subsets``; defaults to the previously
            hard-coded value of 200.

    Returns:
        A list of scenes. Each scene is a list of dicts with the keys
        ``'utterance'`` and ``'speaker'`` and, for aligned utterances,
        ``'en_subtitles'`` and ``'zh_subtitles'``. Entries that appear before
        the first ``'Scene'`` marker are discarded. The scene that runs up to
        the end of the transcript is included. If the transcript has no
        ``'Scene'`` marker at all, the whole episode is returned as a single
        scene.

    Raises:
        KeyError: If the episode key is missing from ``tbbt_transcripts`` or
            ``all_alignment``.
        IndexError: If an entry of the collected episode cannot be found in
            the original transcript at or after the previous match.
    """
    episode_key = (season_id, episode_id)

    # Fetch subset data
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )
    transcript = tbbt_transcripts[episode_key]

    # Map each original transcript index to its index in the collected episode.
    # The collected episode is matched against the transcript in order, so a
    # single forward-moving cursor is enough.
    idx_dict = {}
    cursor = 0
    for collected_idx, entry in enumerate(tbbt_episode):
        while not (entry[0] == transcript[cursor][0] and entry[1] == transcript[cursor][1]):
            cursor += 1
        idx_dict[cursor] = collected_idx
        cursor += 1

    # Turn the episode into a list of dictionaries.
    one_episode = [
        {'utterance': entry[0], 'speaker': entry[1]}
        for entry in tbbt_episode
    ]

    # Attach the aligned subtitles to their utterances.
    alignment = all_alignment[episode_key]
    for episode_idx, subtitle_idxs in alignment.items():
        one_episode[episode_idx]['en_subtitles'] = [en_subset[j] for j in subtitle_idxs]
        one_episode[episode_idx]['zh_subtitles'] = [zh_subset[j] for j in subtitle_idxs]

    # Split into scenes. segments[0] collects whatever precedes the first
    # 'Scene' marker; every marker then opens a new segment. Because the
    # segment being filled is always part of the list, the final scene is kept
    # (it was previously lost because it was never appended after the loop).
    segments = [[]]
    for original_idx, entry in enumerate(transcript):
        if entry[1] == 'Scene':
            segments.append([])
        elif original_idx in idx_dict:
            segments[-1].append(one_episode[idx_dict[original_idx]])

    if len(segments) > 1:
        # Drop the pre-first-marker segment, as before.
        return segments[1:]
    # No scene markers: return the episode as one scene instead of raising.
    return segments

In [10]:
# Build the scene-segmented corpus for every episode that has an alignment.
parallel_corpus = {}
for episode_key in tqdm(all_alignment):
    season_id, episode_id = episode_key[0], episode_key[1]
    parallel_corpus[episode_key] = collect_parallel_corpus(
        tbbt_transcripts,
        en_subtitle,
        zh_subtitle,
        results,
        season_id,
        episode_id,
        all_alignment,
    )

100%|██████████| 102/102 [00:00<00:00, 262.74it/s]


In [11]:
# Persist the corpus built above; it maps each key of `all_alignment` to the
# list of scenes returned by collect_parallel_corpus.
with open('parallel_corpus/tbbt_en_fa.pkl', 'wb') as f:
    pkl.dump(parallel_corpus, f)

## Part 3: Calculate Statistics

In [15]:
# Load the en-zh corpus.
# NOTE(review): this file is not written by any cell visible in this part of
# the notebook; presumably it was produced by an equivalent run on the zh
# data -- confirm it exists before running this cell.
with open('parallel_corpus/tbbt_en_zh.pkl', 'rb') as f:
    zh_data = pkl.load(f)

In [16]:
# Reload the en-fa corpus from the same path the earlier cell wrote it to.
with open('parallel_corpus/tbbt_en_fa.pkl', 'rb') as f:
    fa_data = pkl.load(f)

In [17]:
# Keys present in both the zh and the fa corpus.
inter_keys = set(zh_data).intersection(fa_data)

In [19]:
# Restrict the zh corpus to the keys that also exist in the fa corpus.
data = {shared_key: zh_data[shared_key] for shared_key in inter_keys}

In [22]:
# Total number of utterances over every scene of every shared key.
count = sum(
    len(scene)
    for scenes in data.values()
    for scene in scenes
)
print(count)

20517


In [None]:
# NOTE(review): a bare `zh_data` displays the entire corpus as the cell
# output; consider inspecting a single key instead when sharing the notebook.
zh_data

In [21]:
# Number of keys shared by the zh and fa corpora (size of `data`).
print(len(data))

96
