In [1]:
import pickle as pkl
import json
from collections import defaultdict
import jiwer
from copy import deepcopy
from tqdm import tqdm

In [2]:
from utils.preprocessing import organize_tbbt_by_seasons
from utils.preprocessing import get_epi_indexs_gaps
from utils.preprocessing import find_all_continuous_subsets
from utils.preprocessing import calculate_gaps
from utils.preprocessing import fetch_subsets
from utils.alignment import string_match_sliding_window
from utils.alignment import filter_alignment_by_gap
from utils.alignment import turn_sub2epi_into_epi2sub
from utils.alignment import extend_neighbors
from utils.alignment import exact_match
from utils.alignment import add_cleaned_exact_match_result
from utils.alignment import full_fill_gap
from utils.alignment import get_subset_in_gaps

In [3]:
# Define sentence transformation
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemovePunctuation(),
    jiwer.Strip()
])

In [4]:
# Load Open Subtitle
with open('../open_subtitle/en_zh/en_subtitles.pkl', 'rb') as f:
    en_subtitle = pkl.load(f)
with open('../open_subtitle/en_zh/zh_subtitles.pkl', 'rb') as f:
    zh_subtitle = pkl.load(f)

In [5]:
# Load Memor Dataset
with open('memor/data.json') as f:
    tbbt = json.load(f)

In [6]:
# Test Farsi
with open('alignment_results/indexs_tbbt_zh.pkl', 'rb') as f:
    temp = pkl.load(f)
results = organize_tbbt_by_seasons(temp)

In [7]:
def look_alignment(tbbt, en_subtitle, other_subtitle, alignment, season_id, episode_id, bias):
    # Fetch subset located by the stage-1 alignment
    (en_subset, zh_subset, tbbt_episode) = fetch_subsets(
        episode=tbbt,
        en_subtitle=en_subtitle,
        zh_subtitle=other_subtitle,
        results=alignment,
        season_id=season_id,
        episode_id=episode_id,
        bias=bias
    )

    temp = []

    # Perform string match with sliding window
    count = 0
    sub2epi = string_match_sliding_window(en_subset, tbbt_episode, 5)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the neighbors
    epi2sub = extend_neighbors(en_subset, epi2sub, tbbt_episode)
    sub2epi = turn_sub2epi_into_epi2sub(epi2sub)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Perform exact match and add it to the whole alignment
    exact_match_result = exact_match(en_subset, tbbt_episode)
    sub2epi = add_cleaned_exact_match_result(sub2epi, exact_match_result)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)


    # Extend the gap
    sub2epi = full_fill_gap(sub2epi)
    epi2sub = turn_sub2epi_into_epi2sub(sub2epi)
    temp.append(epi2sub)
    # print(epi2sub)
    # print("Episode Number:", len(epi2sub), "Subtitle Number:", len(sub2epi))
    # print("=="*50)
    return temp

In [12]:
further_alignment = {}
for i in tqdm(range(10)):
    for j in tqdm(range(25)):
        try:
            temp = look_alignment(tbbt, en_subtitle, zh_subtitle, results, i+1, j+1, 200)
            further_alignment[(i,j)] = temp
            # print(temp)
            print("Season:", i+1,"Episode:", j+1, "Episode Number:",len(temp), "Subtitle Number:",len(turn_sub2epi_into_epi2sub(temp)))
        except:
            pass

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [00:02<00:53,  2.23s/it][A
  8%|▊         | 2/25 [00:03<00:41,  1.81s/it][A
 12%|█▏        | 3/25 [00:05<00:42,  1.92s/it][A
 16%|█▌        | 4/25 [00:07<00:35,  1.67s/it][A
 20%|██        | 5/25 [00:08<00:32,  1.60s/it][A
 24%|██▍       | 6/25 [00:10<00:29,  1.55s/it][A
 28%|██▊       | 7/25 [00:11<00:26,  1.46s/it][A
 32%|███▏      | 8/25 [00:12<00:24,  1.45s/it][A
 36%|███▌      | 9/25 [00:13<00:20,  1.28s/it][A
 40%|████      | 10/25 [00:14<00:18,  1.22s/it][A
 44%|████▍     | 11/25 [00:15<00:14,  1.05s/it][A
 48%|████▊     | 12/25 [00:16<00:14,  1.11s/it][A
 52%|█████▏    | 13/25 [00:18<00:14,  1.20s/it][A
 56%|█████▌    | 14/25 [00:19<00:13,  1.22s/it][A
 60%|██████    | 15/25 [00:20<00:12,  1.27s/it][A
100%|██████████| 25/25 [00:21<00:00,  1.16it/s][A
 10%|█         | 1/10 [00:21<03:13, 21.52s/it]
  0%|          | 0/25 [00:00<?, ?it/s][A
  4%|▍         | 1/25 [

In [10]:
for x in further_alignment:
    print(x)
    print(further_alignment[x])
    print("=="*50)

(0, 0)
[{0: [200, 202, 203], 2: [205], 3: [206, 208, 210], 4: [223, 224], 5: [226], 6: [227, 228, 229], 14: [234], 15: [235], 16: [242], 18: [246], 21: [252], 22: [253], 27: [256], 29: [263], 37: [287], 38: [288], 40: [289], 42: [290], 43: [297], 46: [300], 47: [304], 48: [305], 50: [307], 51: [308], 53: [314, 315, 316, 317], 54: [318], 63: [327], 68: [334], 70: [336], 71: [337, 340], 74: [342], 75: [343, 348], 76: [351], 77: [355], 79: [358], 82: [360, 361], 83: [362], 85: [363], 87: [366], 88: [368], 89: [382], 90: [383], 91: [393], 92: [394, 395], 93: [396], 94: [398], 95: [416], 96: [417], 98: [419], 99: [424], 102: [430], 103: [431], 105: [433, 434], 106: [435], 107: [439], 109: [442], 110: [444], 113: [453, 455, 456], 114: [457], 115: [461], 116: [462, 463], 117: [464], 118: [467], 119: [469, 470, 471], 120: [477], 121: [478], 123: [479], 125: [493], 126: [506], 129: [522, 524], 139: [531], 140: [532], 141: [533], 142: [536, 538], 144: [541]}, {0: [200, 202, 203], 2: [205], 3: [2

In [13]:
further_alignment[(i,j)] = temp

In [14]:
print(further_alignment)

{(1, 2): [{2: [200], 3: [201], 4: [204], 8: [209, 210, 211], 9: [212], 11: [215, 217, 218], 15: [225], 16: [226], 18: [228], 19: [229], 22: [248], 23: [264, 265], 25: [268], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [441], 57: [443], 61: [462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 15: [225], 16: [226], 18: [227, 228], 19: [229], 22: [248], 23: [264, 265, 266], 25: [268, 269], 30: [293], 32: [295], 35: [297], 36: [298], 38: [318], 41: [328], 43: [340, 341], 44: [349, 350], 46: [360, 361, 362], 47: [365, 366], 51: [368, 369, 370], 52: [371], 55: [440, 441], 57: [443], 61: [461, 462], 62: [466, 467], 63: [468], 64: [469], 66: [470, 471]}, {2: [200], 3: [201], 4: [203, 204], 8: [208, 209, 210, 211], 9: [212], 11: [214, 215, 217, 218], 14: [221], 15: [225]

In [13]:
with open('further_alignment.pkl', 'wb') as f:
    pkl.dump(further_alignment, f)