In [39]:
import os
import os.path as osp
import json
from tqdm import tqdm
import pickle

import numpy as np
import cv2

# Switch the working directory before importing the project-local `dataset` module.
# NOTE(review): presumably the chdir is what makes `dataset` importable (the current
# directory is on sys.path in a notebook kernel) — confirm before reordering these lines.
os.chdir('/opt/ml/input/local')
from dataset import filter_vertices, resize_img, adjust_height, rotate_img, crop_img

In [40]:
def preprocessing(
    root_dir,
    split="train",
    num=0,
    image_size=2048,
    crop_size=1024,
    ignore_tags=("masked", "excluded-region", "maintable", "stamp"),
    ignore_under_threshold=10,
    drop_under_threshold=1,
):
    """Preprocess every image of one annotation file and cache the result as a pickle.

    Reads ``<root_dir>/ufo/<split>[<num>].json``, collects the word vertices and
    labels of each image, runs ``filter_vertices``, ``resize_img``,
    ``adjust_height``, ``rotate_img`` and ``crop_img`` (all imported from
    ``dataset``) and dumps the results to ``<root_dir>/ufo/<split>[<num>].pickle``
    as a dict with the keys ``"images"``, ``"vertices"`` and ``"labels"``
    (one list entry per image, in sorted file-name order).

    Args:
        root_dir: Dataset root that contains the ``ufo`` and ``img`` directories.
        split: Annotation split name. For ``"val"`` the images are still read
            from ``img/train``.
        num: Numeric suffix of the annotation file name; ``0`` means no suffix.
        image_size: Forwarded to ``resize_img``.
        crop_size: Forwarded to ``crop_img``; must be a multiple of 32.
        ignore_tags: Words carrying any of these tags are skipped.
        ignore_under_threshold: Forwarded to ``filter_vertices`` as ``ignore_under``.
        drop_under_threshold: Forwarded to ``filter_vertices`` as ``drop_under``.

    Raises:
        ValueError: If ``crop_size`` is not a multiple of 32.
        OSError: If an image listed in the annotation file cannot be read.
    """
    if crop_size % 32 != 0:
        raise ValueError("`crop_size` must be a multiple of 32")

    # The annotation file and the output pickle share the same stem.
    stem = split if num == 0 else split + str(num)
    json_path = osp.join(root_dir, "ufo/{}.json".format(stem))
    pkl_path = osp.join(root_dir, "ufo/{}.pickle".format(stem))

    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(json_path, "r", encoding="utf-8") as f:
        anno = json.load(f)

    # Validation annotations point at images stored in the training image folder.
    image_split = "train" if split == "val" else split
    image_dir = osp.join(root_dir, "img", image_split)

    total = {"images": [], "vertices": [], "labels": []}
    for image_fname in tqdm(sorted(anno["images"].keys())):
        image_fpath = osp.join(image_dir, image_fname)

        vertices, labels = _collect_word_annotations(
            anno["images"][image_fname]["words"], ignore_tags
        )
        vertices, labels = filter_vertices(
            vertices,
            labels,
            ignore_under=ignore_under_threshold,
            drop_under=drop_under_threshold,
        )

        image = cv2.imread(image_fpath)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(
                f"Could not read image (missing or unsupported file): {image_fpath}"
            )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Same preprocessing steps, in the same order, as the original cell.
        image, vertices = resize_img(image, vertices, image_size)
        image, vertices = adjust_height(image, vertices)
        image, vertices = rotate_img(image, vertices)
        image, vertices = crop_img(image, vertices, labels, crop_size)

        total["images"].append(image)
        total["vertices"].append(vertices)
        total["labels"].append(labels)

    print(f"Save path >> {pkl_path}")
    with open(pkl_path, "wb") as fw:
        pickle.dump(total, fw)


def _collect_word_annotations(words, ignore_tags):
    """Return ``(vertices, labels)`` arrays for the usable words of one image.

    A word is skipped when it carries a tag listed in ``ignore_tags`` or when it
    has more than four points. Each kept word contributes its flattened points
    and a label of 1 (legible) or 0 (``illegibility`` is truthy).
    """
    vertices, labels = [], []
    for word_info in words.values():
        points = np.array(word_info["points"])
        has_ignored_tag = any(tag in ignore_tags for tag in word_info["tags"])
        if has_ignored_tag or points.shape[0] > 4:
            continue
        vertices.append(points.flatten())
        labels.append(int(not word_info["illegibility"]))
    return (
        np.array(vertices, dtype=np.float32),
        np.array(labels, dtype=np.int64),
    )

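- 수정 가능한 부분
    - root_dir : `ufo/`와 `img/` 폴더가 들어 있는 데이터셋 루트 경로
    - split : "train" 또는 "val" ("val"을 선택해도 이미지는 `img/train`에서 읽음)
    - num : json 파일 이름에 붙은 숫자를 그대로 사용 (예: `val2.json` → `num=2`, 숫자가 없으면 `num=0`)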

In [42]:
# Build the cached pickle for the "val" annotations with suffix 2 (ufo/val2.json).
preprocessing(
    "/opt/ml/input/data/medical",
    split="val",
    num=2,
    image_size=2048,
    crop_size=1024,
    ignore_tags=[
        "masked",
        "excluded-region",
        "maintable",
        "stamp",
    ],
    ignore_under_threshold=10,
    drop_under_threshold=1,
)

100%|██████████| 61/61 [04:05<00:00,  4.02s/it]


Save path >> /opt/ml/input/data/medical/ufo/val2.pickle


- 저장되었는지 확인

In [31]:
# Reload the pickle written by the `preprocessing(...)` call above (see its
# "Save path" output) so this cell really confirms that the save succeeded.
# NOTE: pickle.load can execute arbitrary code; only open files produced by this notebook.
with open('/opt/ml/input/data/medical/ufo/val2.pickle', 'rb') as fr:
    total = pickle.load(fr)

# Number of cached entries per key ("images", "vertices", "labels").
{key: len(entries) for key, entries in total.items()}