In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm
import xlsxwriter
import re

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Sentence normalization pipeline. The steps are applied in list order:
# lowercase, collapse repeated spaces, expand common English contractions,
# remove punctuation, then strip leading/trailing whitespace.
_normalization_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(_normalization_steps)

## Part 0: Load Data (EN-FA)

In [4]:
# Load the OpenSubtitles EN-FA pair.
# NOTE: the Farsi subtitles are bound to the name `zh_subtitle`; that is the
# name later passed to collect_parallel_corpus, so it is kept as-is here.
# NOTE(review): pickle can execute arbitrary code while loading; only load
# files from a trusted source.
with open('../open_subtitle/en_fa/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_fa/fa_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

# Load the source transcript (indexed elsewhere in this notebook by
# (season_id, episode_id) tuples).
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

# Use the original index information to fetch the en_subset
with open('alignment_results/fa/indexs_tbbt_fa.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

# Exact alignment information for the FA pair; iterated per episode key when
# the parallel corpus is built.
with open('alignment_results/fa/ultimate_data.pkl', 'rb') as f:
    all_alignment = pkl.load(f)

## Part 0: Load Data (EN-ZH)

In [4]:
# Load the OpenSubtitles EN-ZH pair.
# NOTE(review): these assignments rebind `en_subtitle` / `zh_subtitle`, which
# the EN-FA load cell also assigns; whichever cell ran last wins.
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [5]:
# Load the source transcript (indexed elsewhere in this notebook by
# (season_id, episode_id) tuples).
with open('original_transcript/tbbt_transcripts.pkl', 'rb') as f:
    tbbt_transcripts = pkl.load(f)

# Use the original index information to fetch the en_subset.
# `temp` is only an intermediate; `results` is what fetch_subsets receives.
with open('alignment_results/zh/indexs_tbbt_zh.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [8]:
# Exact alignment information for the ZH pair.
# NOTE(review): rebinds `all_alignment`, which the EN-FA load cell also
# assigns; whichever cell ran last determines what is processed downstream.
with open('alignment_results/zh/final_stage_alignment_1_with_head_tail.pkl', 'rb') as f:
    all_alignment = pkl.load(f)

In [62]:
# Sum the length of every entry stored in the transcript mapping.
count = sum(len(tbbt_transcripts[episode_key]) for episode_key in tbbt_transcripts)

In [63]:
# Show the summed length of all transcript entries computed in the previous cell.
print(count)

54406


In [59]:
# Distinct values of field 1 (the speaker label) across all transcript rows,
# skipping any label that contains an opening or closing parenthesis.
speakers = {
    row[1]
    for episode_key in tbbt_transcripts
    for row in tbbt_transcripts[episode_key]
    if "(" not in row[1] and ")" not in row[1]
}

In [60]:
# Dump the full set of collected speaker labels (unordered, since it is a set).
print(speakers)

{'Leonard', 'Korean', 'Warrior', 'machine', 'Clerk', 'Elon', 'Announcer', 'Isabella', 'hat', '12', 'Sehldon', 'Audiitoner', 'Mary', 'Teleplay', 'Bill', 'Ramona', 'queue', 'Burton', 'Costume', 'Rai', 'Rajj', 'sequence', 'Shldon', 'Apartment', 'Psychic', 'M', 'Santa', 'Beverly', 'Registrar', 'Venkatesh', 'Kim', 'Paramedic', 'Official', 'Jones', 'Laura', 'Joy', 'Bermadette', 'Caption', 'Wheaton', 'Bert', 'Tattooist', 'Jenson', 'Suit', 'Elizabeth', 'Lakshmi', 'Hernandez', 'Dave', 'Guard', 'Brent', 'GPS', 'LeVar', 'Priya', 'Alien', 'Col', 'Emily', 'Screen', 'Goldfarb', 'Child', 'Seibert', 'Winkle', 'Policeman', 'Vanessa', 'Answerphone', 'Fillion', 'box', 'Photographer', 'man', 'Roeger', 'Manager', 'Sackhoff', 'Wolowitz', 'Page', 'Daniel', 'Randall', 'R', 'James', 'Penny', 'Story', 'Lalita', 'Both', 'Dan', 'Doug', 'Mandel', 'Kid', 'Ellen', 'Barman', 'Stuart', 'Everyone', 'Koothrappali', 'assistant', 'Leonard-warrior', 'employee', 'Officer', 'patrons', 'Summer', 'scenes', 'Sheldon-bot', 'Tren

In [61]:
# Number of distinct speaker labels collected above.
print(len(speakers))

302


## Part 1: Experiment with one set

In this part, we need to do the following things:
1. Attach the ZH subtitles to the transcript
2. Divide the transcript by scene

In [5]:
def collect_parallel_corpus(tbbt_transcripts, en_subtitle, zh_subtitle, results, season_id, episode_id, all_alignment, bias=200):
    """
    Collect the data of a certain episode by scene.

    Args:
        tbbt_transcripts: mapping indexed by ``(season_id, episode_id)``. Each
            value is a sequence of rows where ``row[0]`` is the utterance text
            and ``row[1]`` is the speaker label. A row whose speaker label is
            ``'Scene'`` marks the start of a new scene.
        en_subtitle: forwarded unchanged to ``fetch_subsets``.
        zh_subtitle: forwarded unchanged to ``fetch_subsets``.
        results: forwarded unchanged to ``fetch_subsets``.
        season_id: season part of the episode key.
        episode_id: episode part of the episode key.
        all_alignment: mapping indexed by ``(season_id, episode_id)``. Each
            value maps a position in the fetched episode rows to a list of
            positions in the fetched subtitle subsets.
        bias: forwarded to ``fetch_subsets``; the default keeps the value that
            was previously hard-coded.

    Returns:
        A list of scenes. Each scene is a list of dicts with the keys
        ``'utterance'`` and ``'speaker'`` and, for aligned rows only,
        ``'en_subtitles'`` and ``'zh_subtitles'``. Rows that appear before the
        first ``'Scene'`` marker are discarded. The rows after the last marker
        form the final scene. An empty list is returned when the transcript
        contains no ``'Scene'`` marker.

    Raises:
        KeyError: if the episode key is missing from ``tbbt_transcripts`` or
            ``all_alignment``.
        IndexError: if a fetched episode row cannot be located in the
            transcript while scanning forward.
    """
    episode_key = (season_id, episode_id)
    transcript = tbbt_transcripts[episode_key]

    # Fetch subset data
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt_transcripts,
        en_subtitle=en_subtitle,
        zh_subtitle=zh_subtitle,
        results=results,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    # Map each transcript position to the position of the same row in
    # tbbt_episode. The scan only ever moves forward, so tbbt_episode is
    # expected to be an in-order subsequence of the transcript.
    idx_dict = {}
    idx = 0
    for i, row in enumerate(tbbt_episode):
        while not (row[0] == transcript[idx][0] and row[1] == transcript[idx][1]):
            idx += 1
        idx_dict[idx] = i
        idx += 1

    # Turn every episode row into a dictionary
    one_episode = [{'utterance': row[0], 'speaker': row[1]} for row in tbbt_episode]

    # Attach the aligned subtitles to the rows that have an alignment
    alignment = all_alignment[episode_key]
    for row_idx in alignment:
        subtitle_indices = alignment[row_idx]
        one_episode[row_idx]['en_subtitles'] = [en_subset[j] for j in subtitle_indices]
        one_episode[row_idx]['zh_subtitles'] = [zh_subset[j] for j in subtitle_indices]

    # Split the episode into scenes. `current_scene` stays None until the first
    # 'Scene' marker so that anything before it is dropped.
    scenes = []
    current_scene = None
    for i, row in enumerate(transcript):
        if row[1] == 'Scene':
            if current_scene is not None:
                scenes.append(current_scene)
            current_scene = []
        elif current_scene is not None and i in idx_dict:
            current_scene.append(one_episode[idx_dict[i]])

    # Flush the last scene: no further 'Scene' marker follows it, so it would
    # otherwise be lost.
    if current_scene:
        scenes.append(current_scene)

    return scenes

In [6]:
# Build the per-episode scene lists for every episode key present in the
# alignment mapping; each key is a (season_id, episode_id) pair.
parallel_corpus = {
    episode_key: collect_parallel_corpus(
        tbbt_transcripts,
        en_subtitle,
        zh_subtitle,
        results,
        episode_key[0],
        episode_key[1],
        all_alignment,
    )
    for episode_key in tqdm(all_alignment)
}

100%|██████████| 102/102 [00:00<00:00, 176.25it/s]


In [10]:
# Persist the corpus built above.
# NOTE(review): the target file is named for the EN-FA pair, but this notebook
# has both an en_fa and an en_zh load section that assign the same variable
# names. `parallel_corpus` reflects whichever section ran last, so confirm the
# FA data was the one loaded before writing this file.
with open('parallel_corpus/tbbt_en_fa.pkl', 'wb') as f:
    pkl.dump(parallel_corpus, f)

## Part 3: Calculate Statistics

In [27]:
# Load the ZH alignment result used for the statistics below.
# NOTE(review): this is `final_stage_alignment_1.pkl`, not the
# `..._with_head_tail.pkl` file loaded into `all_alignment` earlier.
with open('alignment_results/zh/final_stage_alignment_1.pkl', 'rb') as f:
    alignment = pkl.load(f)

In [28]:
# Number of top-level entries in the alignment mapping (one per episode key).
print(len(alignment))

193


In [34]:
# Accumulate the number of entries in each episode's alignment mapping while
# printing every episode key followed by its full mapping.
count = 0
for episode_key, episode_alignment in alignment.items():
    count += len(episode_alignment)
    print(episode_key)
    print(episode_alignment)

(1, 1)
{0: [200, 201, 202, 203], 1: [204], 2: [205], 5: [206, 207, 208, 209, 210, 211], 6: [212], 7: [213], 8: [214], 9: [215], 10: [216], 11: [217], 12: [218, 219], 13: [220], 14: [221], 15: [222, 223, 224], 16: [225, 226], 17: [227, 228, 229], 18: [230], 19: [231], 20: [232], 21: [233], 22: [234], 23: [235], 24: [236], 28: [239], 30: [240], 31: [241], 32: [242], 33: [243, 244], 34: [246, 247], 35: [248], 36: [249], 39: [252], 40: [253], 45: [256], 46: [257], 47: [258, 259], 54: [262], 63: [266], 64: [267], 65: [268], 66: [269], 67: [270], 68: [271], 69: [272, 273], 70: [274, 275], 71: [276], 72: [277], 73: [278, 279], 74: [280], 75: [281, 282], 76: [283], 82: [285, 286], 83: [287], 84: [288], 86: [289], 88: [290], 89: [291], 90: [292], 93: [293], 94: [294], 96: [295, 296], 97: [297], 98: [298], 99: [299], 100: [300], 102: [301, 302], 104: [303], 105: [304], 106: [305], 107: [306], 108: [307], 109: [308], 111: [311], 112: [312], 115: [313], 116: [314, 315, 316, 317], 117: [318], 119: 

In [39]:
# Total size of the inverted (via turn_sub2epi_into_epi2sub) alignment over all
# episodes.
count_sentence = sum(
    len(turn_sub2epi_into_epi2sub(alignment[episode_key]))
    for episode_key in alignment
)

In [40]:
# Show the total computed in the previous cell.
print(count_sentence)

51998


In [12]:
# Load the previously saved EN-ZH parallel corpus (the with-head-tail variant).
# NOTE(review): pickle can execute arbitrary code while loading; only load
# files from a trusted source.
with open('parallel_corpus/tbbt_en_zh_with_head_tail.pkl', 'rb') as f:
    zh_data = pkl.load(f)

In [17]:
# Number of top-level entries (episode keys) in the EN-ZH corpus.
print(len(zh_data))

192


In [44]:
# Print each episode key with the length of its entry and accumulate the total.
count_scene = 0
for episode_key, episode_scenes in zh_data.items():
    n_scenes = len(episode_scenes)
    print(episode_key, n_scenes)
    count_scene += n_scenes

(1, 1) 11
(1, 2) 10
(1, 3) 10
(1, 4) 13
(1, 5) 8
(1, 6) 6
(1, 7) 6
(1, 8) 5
(1, 9) 8
(1, 10) 8
(1, 11) 9
(1, 13) 6
(1, 14) 8
(1, 15) 9
(1, 16) 10
(2, 1) 9
(2, 2) 9
(2, 3) 12
(2, 4) 6
(2, 5) 12
(2, 6) 11
(2, 7) 12
(2, 8) 10
(2, 9) 9
(2, 10) 9
(2, 11) 7
(2, 12) 10
(2, 13) 8
(2, 14) 8
(2, 15) 10
(2, 16) 8
(2, 17) 4
(2, 18) 6
(2, 19) 10
(2, 20) 9
(2, 21) 13
(2, 22) 9
(2, 23) 8
(3, 1) 11
(3, 2) 11
(3, 3) 9
(3, 4) 10
(3, 5) 9
(3, 6) 8
(3, 7) 11
(3, 8) 13
(3, 9) 9
(3, 10) 8
(3, 11) 8
(3, 12) 10
(3, 13) 11
(3, 14) 10
(3, 15) 11
(3, 16) 10
(3, 17) 8
(3, 18) 7
(3, 19) 8
(3, 20) 10
(3, 21) 9
(3, 22) 3
(3, 23) 11
(4, 1) 10
(4, 2) 10
(4, 3) 10
(4, 4) 8
(4, 5) 11
(4, 6) 8
(4, 7) 11
(4, 8) 12
(4, 9) 12
(4, 10) 12
(4, 11) 11
(4, 12) 12
(4, 13) 14
(4, 14) 9
(4, 15) 10
(4, 17) 10
(4, 18) 11
(4, 19) 13
(4, 20) 12
(4, 21) 11
(4, 22) 10
(4, 23) 10
(4, 24) 11
(5, 1) 9
(5, 2) 9
(5, 3) 11
(5, 4) 12
(5, 5) 9
(5, 6) 10
(5, 7) 10
(5, 8) 10
(5, 9) 10
(5, 10) 9
(5, 11) 9
(5, 12) 9
(5, 13) 11
(5, 14) 10
(5, 15) 8
(

In [11]:
# Load the previously saved EN-FA parallel corpus.
# NOTE(review): pickle can execute arbitrary code while loading; only load
# files from a trusted source.
with open('parallel_corpus/tbbt_en_fa.pkl', 'rb') as f:
    fa_data = pkl.load(f)

In [12]:
# Episode keys present in both the EN-ZH and EN-FA corpora. Requires both
# `zh_data` and `fa_data` to have been loaded first.
inter_keys = zh_data.keys() & fa_data.keys()

NameError: name 'zh_data' is not defined

In [19]:
# Restrict the EN-ZH corpus to the episode keys shared with the EN-FA corpus.
data = {episode_key: zh_data[episode_key] for episode_key in inter_keys}

In [22]:
# Sum the length of every scene of every episode kept in `data`.
count = sum(
    len(scene)
    for episode_key in data
    for scene in data[episode_key]
)
print(count)

20517


In [None]:
# Bare expression: displays the whole `zh_data` object as the cell output.
# NOTE(review): this can be very large; consider a bounded preview instead.
zh_data

In [21]:
# Number of episode keys shared by the EN-ZH and EN-FA corpora.
print(len(data))

96
