## Common Parameters and Function Definitions

In [2]:
import hashlib
import os

from element_type import TYPE_DICT

In [3]:
# Extension an image file must have to be paired with an info file of the same stem.
IMG_EXT = ".png"
# Extension identifying the info file that drives the image/info pairing.
INFO_EXT = ".json"
# Root directory; every entry listed inside it is treated as one episode.
DEMO_RECORD_DIR = "./_completed/"

In [4]:
import json

def search_filenames_having_both_of_image_and_json_information(
    record_dir=None, img_ext=None, info_ext=None
):
    """Collect, per episode directory, the file stems that have both an info file and an image.

    Args:
        record_dir: Directory whose sub-directories are scanned as episodes.
            Defaults to the notebook-level ``DEMO_RECORD_DIR``.
        img_ext: Image extension including the leading dot. Defaults to ``IMG_EXT``.
        info_ext: Info-file extension including the leading dot. Defaults to ``INFO_EXT``.

    Returns:
        dict mapping ``(episode, episode_dir)`` to the sorted list of stems for
        which both ``stem + info_ext`` and ``stem + img_ext`` exist in that
        episode directory. Episodes without any complete pair map to ``[]``.
    """
    # Resolve the defaults at call time so edits to the config cell are picked up
    # without having to re-run this definition.
    record_dir = DEMO_RECORD_DIR if record_dir is None else record_dir
    img_ext = IMG_EXT if img_ext is None else img_ext
    info_ext = INFO_EXT if info_ext is None else info_ext

    tgt_files = dict()
    # os.listdir() order is arbitrary; sorting makes the result reproducible.
    for episode in sorted(os.listdir(record_dir)):
        episode_dir = os.path.join(record_dir, episode)
        if not os.path.isdir(episode_dir):
            # Stray files next to the episode folders would make listdir() raise.
            continue

        # A set gives constant-time lookups for the matching image name.
        ep_files = set(os.listdir(episode_dir))
        paired_stems = list()
        for entry in ep_files:
            stem, ext = os.path.splitext(entry)
            if ext == info_ext and stem + img_ext in ep_files:
                paired_stems.append(stem)
        tgt_files[(episode, episode_dir)] = sorted(paired_stems)
    return tgt_files

def _save_json(data, filepath):
    """Serialize ``data`` as JSON into ``filepath``, replacing any existing file."""
    with open(filepath, mode="w") as sink:
        json.dump(data, sink)
### Duplicate Image Removal Function

In [5]:
from PIL import Image, ImageChops

def _group_image_paths_by_prefix(tgt_files):
    """Map each group name (episode name up to its first "_") to that group's image paths."""
    grouped = dict()
    for (episode, episode_dir), stems in tgt_files.items():
        # split() keeps the whole name when there is no "_"; slicing with the -1
        # that str.find() returns would silently drop the last character instead.
        grp_name = episode.split("_", 1)[0]
        grouped.setdefault(grp_name, list()).extend(
            os.path.join(episode_dir, stem + IMG_EXT) for stem in stems
        )
    return grouped


def _cropped_region_signature(filepath, box):
    """Return ``(mode, sha256 digest)`` of the pixels inside ``box`` of the image at ``filepath``."""
    # The context manager releases the file handle; the pixel data is read
    # inside it so nothing depends on the file after it is closed.
    with Image.open(filepath) as img:
        region = img.crop(box)
        return region.mode, hashlib.sha256(region.tobytes()).digest()


def get_unique_images(tgt_files: dict, compare_area_box=(0, 50, 160, 210)):
    """Drop images whose compared region duplicates an earlier image of the same group.

    Episodes are grouped by the part of the episode name before its first "_"
    (the whole name when it has none). Within a group, files are visited in
    order and the first file of every distinct region is kept.

    Args:
        tgt_files: dict mapping ``(episode, episode_dir)`` to a list of file
            stems; ``IMG_EXT`` is appended to build each image path.
        compare_area_box: ``(left, upper, right, lower)`` box passed to
            ``Image.crop``; only this region takes part in the comparison.

    Returns:
        dict mapping group name to the list of surviving image paths.

    Notes:
        Regions are compared through their mode and raw pixel bytes rather than
        ``ImageChops.difference(...).getbbox()``: for modes with an alpha
        channel ``getbbox()`` looks at alpha only by default, so fully opaque
        frames with different colours would all be reported as duplicates, and
        images of different modes would raise instead of comparing unequal.
    """
    epi_grp_files = _group_image_paths_by_prefix(tgt_files)

    print("\n[Duplicated image removal]")
    unique_imgs = dict()
    total_n_compares = 0
    for grp, files in epi_grp_files.items():
        # Decode and crop every file once instead of once per comparison; only
        # a small digest per file is kept in memory.
        candidates = [
            (path, _cropped_region_signature(path, compare_area_box)) for path in files
        ]
        unique_imgs[grp] = list()
        n_compares = 0
        # Pairwise elimination is kept so the reported compare count keeps its meaning.
        while candidates:
            (kept_path, kept_signature), rest = candidates[0], candidates[1:]
            unique_imgs[grp].append(kept_path)
            n_compares += len(rest)
            candidates = [
                (path, signature) for path, signature in rest if signature != kept_signature
            ]

        print(f"- {grp}: {n_compares} compares, {len(unique_imgs[grp])} files survived out of {len(files)}")
        total_n_compares += n_compares

    n_survived = sum(len(_files) for _files in unique_imgs.values())
    n_total = sum(len(_files) for _files in epi_grp_files.values())
    print(f"* total: {total_n_compares} compares, {n_survived} files survived out of {n_total}")

    return unique_imgs

### Execute

In [None]:
%%time

# Collect, per episode, the file stems that have both a JSON file and an image.
tgt_files = search_filenames_having_both_of_image_and_json_information()

# Keep one image per distinct compared region within each episode group.
unique_imgs = get_unique_images(tgt_files=tgt_files)

# Persist the surviving image paths, keyed by group name.
_save_json(data=unique_imgs, filepath="./unique_imgs.json")