### In this program, I first use the large aligned results to locate subsets, then match within each subset

In [8]:
import pickle as pkl
import json
from collections import defaultdict

In [13]:
"""
Organize index within one episode
Input: ([[]])
Output: {index: []}
"""
def get_index_dict(episode):
    """Group the rows of one episode by their index.

    Every row of ``episode`` is unpacked into exactly four values:
    ``(idx, segment, en_sub, zh_sub)``. Rows that share the same ``idx``
    are collected, in input order, as ``[segment, en_sub, zh_sub]`` lists.

    Returns:
        A mapping ``{idx: [[segment, en_sub, zh_sub], ...]}`` whose keys
        appear in order of first occurrence.
    """
    # The defaultdict has no default factory, so looking up a missing
    # index on the returned mapping still raises KeyError.
    grouped = defaultdict()
    for row in episode:
        idx, segment, en_sub, zh_sub = row
        grouped.setdefault(idx, []).append([segment, en_sub, zh_sub])
    return grouped

In [51]:
# Organize data
def organize_by_seasons(all_data):
    """Regroup per-episode data into a nested season -> episode mapping.

    Each key of ``all_data`` is sliced to obtain two integers: characters
    ``[1:3]`` give the season number and the last two characters give the
    episode number (presumably keys look like ``"S01E02"`` -- verify
    against the producer of the data). The value stored for every episode
    is the result of ``get_index_dict`` applied to that episode's rows.

    Returns:
        ``{season: {episode: index_dict}}``
    """
    by_season = {}
    for key in all_data.keys():
        season_no = int(key[1:3])
        episode_no = int(key[-2:])
        # Create the season bucket on first sight, then file the episode.
        by_season.setdefault(season_no, {})[episode_no] = get_index_dict(all_data[key])
    return by_season

In [94]:
"""
Get all the indexes within one episode, together with the gaps between them
Input: {index: [[]]}
Output: (sorted index list, list of gaps between consecutive indexes)
"""
def get_epi_indexs_gaps(episode):
    """Return the sorted indexes of one episode and the gaps between them.

    Args:
        episode: an iterable of indexes; when a dict is passed, its keys
            are used.

    Returns:
        A tuple ``(idx_list, gaps)`` where ``idx_list`` is a new list in
        ascending order and ``gaps`` is whatever ``calculate_gaps``
        returns for that list.
    """
    ordered = sorted(episode)
    return ordered, calculate_gaps(ordered)

In [95]:
"""
Calculate the gaps between consecutive elements of a list of integers
"""
def calculate_gaps(idx_list):
    """Return the differences between consecutive values, in ascending order.

    The values are sorted into a new list first, so the caller's sequence
    is left untouched and any iterable of numbers (list, tuple, dict
    keys, ...) is accepted.

    Args:
        idx_list: iterable of numbers.

    Returns:
        A list with one entry per adjacent pair of the sorted values
        (``sorted[i + 1] - sorted[i]``); empty when fewer than two values
        are given.
    """
    # Sort a copy: calling idx_list.sort() would reorder the caller's list.
    ordered = sorted(idx_list)
    return [nxt - cur for cur, nxt in zip(ordered, ordered[1:])]

In [96]:
"""
Locate continuous subsets in which the gap between two adjacent indexes is no larger than the gap threshold
Input: indexes, gaps, length threshold, gap threshold
Output: list of continuous subsets of indexes
"""
def find_all_continuous_subsets(idx_list, gaps, len_threshold, gap_threshold):
    """Split ``idx_list`` into runs of closely spaced indexes.

    A run is extended while the gap to the next index is at most
    ``gap_threshold``; a larger gap closes the current run and starts a
    new one. Only runs holding at least ``len_threshold`` indexes are
    returned.

    Args:
        idx_list: indexes in the order they should be walked.
        gaps: ``gaps[i]`` is paired with ``idx_list[i + 1]`` and is
            expected to be the distance from ``idx_list[i]`` to it.
        len_threshold: minimum number of indexes a run needs to be kept.
        gap_threshold: largest gap (inclusive) allowed inside a run.

    Returns:
        A list of runs (each a list of indexes) in their original order;
        empty when ``idx_list`` is empty or no run is long enough.
    """
    if not idx_list:
        # Nothing to walk; avoids an IndexError on idx_list[0].
        return []

    res = []
    path = [idx_list[0]]
    for gap, idx in zip(gaps, idx_list[1:]):
        if gap <= gap_threshold:
            path.append(idx)
            continue
        if len(path) >= len_threshold:
            res.append(path)
        path = [idx]

    # Flush the run that is still open when the input ends; without this
    # the last subset would be silently dropped.
    if len(path) >= len_threshold:
        res.append(path)
    return res

In [56]:
# Load search result
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
with open('episode_indexs_transformed.pkl', 'rb') as f:
    temp = pkl.load(f)
# Regroup the loaded mapping as {season: {episode: {index: [...]}}}.
results = organize_by_seasons(temp)

In [103]:
# Check the continuous subsets in each episode and report the last one found
for season_id in sorted(results):
    season = results[season_id]
    for episode_id in sorted(season):
        idx_list, gaps = get_epi_indexs_gaps(season[episode_id])
        # Thresholds: a subset needs at least 6 indexes, with gaps of at most 100.
        subsets = find_all_continuous_subsets(idx_list, gaps, 6, 100)
        if subsets:
            # Only the last subset is reported; `gaps` is rebound to the
            # gaps inside that subset.
            gaps = calculate_gaps(subsets[-1])
            print(gaps)
            print("Season:", season_id, "|Episode:", episode_id, "|Subset Length:", len(subsets[-1]), "|Sum:", sum(gaps), "|Maximum:", max(gaps))
        else:
            # No subset met the thresholds. This is checked explicitly
            # rather than with a bare `except`, which would also hide
            # unrelated errors and KeyboardInterrupt.
            print("Season:", season_id, "Episode:", episode_id, "Subset Length:", subsets)
        print('=='*50)

[3, 2, 1, 2, 2, 13, 1, 2, 1, 1, 1, 1, 4, 1, 7, 4, 10, 7, 25, 1, 1, 7, 3, 4, 1, 2, 7, 2, 1, 1, 9, 7, 2, 1, 1, 2, 1, 1, 1, 3, 7, 7, 1, 1, 1, 5, 14, 1, 10, 1, 1, 1, 20, 1, 2, 5, 4, 1, 1, 3, 1, 1, 4, 3, 13, 1, 1, 4, 1, 2, 3, 3, 8, 1, 14, 13, 16, 2, 2, 1, 4, 1, 1, 3, 2]
Season: 1 |Episode: 1 |Subset Length: 86 |Sum: 338 |Maximum: 25
[1, 17, 1, 1, 6, 2, 3, 3, 44, 2, 2, 1, 23, 4, 4, 14, 1, 2, 7, 1, 8, 1, 8, 2, 5, 1, 3, 1, 2, 1, 7, 2, 2, 1, 5, 2, 4, 1, 2, 1, 1, 1, 8, 2, 46, 1, 1, 2, 7]
Season: 1 |Episode: 2 |Subset Length: 50 |Sum: 267 |Maximum: 46
[1, 5, 3, 2, 8, 3, 5, 2, 5, 7, 10, 5, 1, 5, 1, 2, 27, 1, 1, 1, 16, 15, 1, 1, 1, 1, 19, 1, 4, 9, 4, 10, 2, 2, 3, 14, 2, 1, 3, 2, 1, 1, 1, 1, 1, 3, 2, 7, 12, 1, 4, 2, 3, 1, 1, 5, 1, 1, 7, 8, 4, 11, 13, 1, 1, 8, 6]
Season: 1 |Episode: 3 |Subset Length: 68 |Sum: 313 |Maximum: 27
[2, 14, 1, 8, 3, 7, 1, 6, 3, 18, 12, 10, 2, 8, 8, 8, 2, 1, 1, 2, 3, 2, 1, 2, 20, 11, 9, 2, 5, 1, 1, 3, 7, 41, 1, 1, 9, 6, 2, 6, 1, 4, 4, 9, 1, 16]
Season: 1 |Episode: 4 |Subset 