In [1]:
from glob import glob
import cv2
import json
import numpy as np
from tqdm import tqdm
from pathlib import Path

from matplotlib import pyplot as plt

In [2]:
# Input locations: path_json holds one annotation file per image (read as
# "<image stem>.json" by save_char_box_images), path_img holds the .jpg images.
path_json = "D:/RRC2019_ReCTS/gt_unicode/"
path_img = "D:/RRC2019_ReCTS/img/"

In [3]:
# Collect every .jpg file that sits directly inside the image folder.
fns = glob(path_img + "*.jpg")
# Bare trailing expression so the notebook displays how many images were found.
len(fns)

20000

In [4]:
# Shared CLAHE operator (clip limit 2.0, 4x4 tile grid); clahe_rgb applies it per channel.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(4, 4))

In [5]:
def clahe_rgb(img):
    """Run the module-level ``clahe`` operator on each of the first three channels.

    Channels 0, 1 and 2 of ``img`` are equalized one at a time; each result
    gets a new trailing axis and the three are concatenated along that axis.
    """
    equalized_channels = []
    for channel_idx in range(3):
        # Range-slice so the single channel keeps its trailing axis when handed to CLAHE.
        single_channel = img[..., channel_idx:channel_idx + 1]
        equalized = clahe.apply(single_channel)
        equalized_channels.append(np.expand_dims(equalized, -1))
    return np.concatenate(equalized_channels, axis=-1)

# Transcription -> folder-label substitutions, checked in order (first match wins).
# The first entries cover \ / : * ? " < > |, which Windows does not allow in file
# names; the remaining groups merge visually similar punctuation into one class.
# NOTE(review): the original colon list named ":" twice; a full-width colon may
# have been intended as one of the entries -- confirm against the dataset.
_LABEL_GROUPS = (
    ("colon", (":", "；")),
    ("back_slash", ("\\",)),
    ("slash", ("/",)),
    ("question_mark", ("?",)),
    ("star", ("*",)),
    ("less_than_sign", ("<",)),
    ("greater_than_sign", (">",)),
    ("vertical_bar", ("|",)),
    ("dot", (",", ".", "，", "．", "、", "。", "·")),
    ("quotation_mark", ("\"", "'", "＇", "＂", "〃", "‘", "’", "“", "”", "｀", "′", "″")),
    ("right_bracket", (")", "）", "〕", "〉", "］", "｝", "〗", "】")),
    ("left_bracket", ("(", "（", "〔", "〈", "［", "｛", "〖", "【")),
    # NOTE(review): "soft_hypen" is misspelled, but it is an output folder name;
    # kept as-is so already-generated data keeps the same layout.
    ("soft_hypen", ("\xad",)),
    ("dash", ("_", "-", "＿", "－", "ˉ", "—")),
)
_LOWERCASE = tuple("abcdefghijklmnopqrstuvwxyz")
_UPPERCASE = tuple("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
_DIGITS = tuple("0123456789")

# Crops whose clamped bounding-box area is at most this many pixels are skipped.
_MIN_BOX_AREA = 14 * 14


def _folder_label(transcription):
    """Return the folder label used for a character transcription."""
    for label, members in _LABEL_GROUPS:
        if transcription in members:
            return label
    # Prefix letters with their case so "a" and "A" get distinct folder names
    # (presumably because the target file system is case-insensitive).
    if transcription in _LOWERCASE:
        return "lower_" + transcription
    if transcription in _UPPERCASE:
        return "upper_" + transcription
    return transcription


def save_char_box_images(fn, dst="D:/RRC2019_ReCTS/char_imgs_case_sensitive/", dst_clahe="D:/RRC2019_ReCTS/char_imgs_clahe/"):
    """Crop every annotated character out of one image and save it by class.

    The annotation file ``{path_json}/{stem of fn}.json`` is read and, for each
    entry of its ``chars`` list, the axis-aligned bounding box of the four
    corner points is clamped to the image and cut out. Boxes that are empty
    after clamping, or whose area is at most 14*14 pixels, are skipped.
    Each remaining crop is written to ``{dst}{label}/{stem}_char{index}.jpg``.
    For every label except the digits 0-9, a CLAHE-equalized copy is written
    next to it with a ``_clahe`` suffix.

    Args:
        fn: Path of the source image; its stem selects the annotation file.
        dst: Output root. The label is appended by plain string concatenation,
            so it is expected to end with a path separator.
        dst_clahe: Second output root. A per-label folder is created under it,
            but no file is written there.
            NOTE(review): the CLAHE copies are saved under ``dst``; confirm
            whether they were meant to go under ``dst_clahe`` instead.

    Raises:
        OSError: If OpenCV cannot read ``fn``.
    """
    raw_fn = Path(fn).stem
    # Close the annotation file deterministically instead of leaking the handle.
    with open(f"{path_json}/{raw_fn}.json", encoding="utf-8_sig") as json_file:
        json_data = json.load(json_file)

    bgr = cv2.imread(fn)
    if bgr is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise OSError(f"cv2.imread could not read image: {fn}")
    img = bgr[..., ::-1].astype(np.uint8)  # reverse channel order (BGR -> RGB)
    h, w, _ = img.shape

    for i, c in enumerate(json_data['chars']):
        # Points are stored as (column, row) pairs; the box below uses corners
        # 0/1 for the top edge, 2/3 for the bottom, 0/3 for the left, 1/2 for the right.
        col0, row0, col1, row1, col2, row2, col3, row3 = c['points']
        label = _folder_label(c['transcription'])

        # Axis-aligned bounding box, clamped to the image.
        top = max(min(row0, row1), 0)
        left = max(min(col0, col3), 0)
        bottom = min(max(row2, row3), h)
        right = min(max(col1, col2), w)

        height, width = bottom - top, right - left
        # Test the sign of each side separately: two negative sides multiply to
        # a positive area and would otherwise let an empty crop through.
        if height <= 0 or width <= 0 or height * width <= _MIN_BOX_AREA:
            continue

        char_img = img[top:bottom, left:right].copy()

        dst_folder = f"{dst}{label}/"
        dst_clahe_folder = f"{dst_clahe}{label}/"
        Path(dst_folder).mkdir(parents=True, exist_ok=True)
        # Created for compatibility with the original layout; see NOTE(review) above.
        Path(dst_clahe_folder).mkdir(parents=True, exist_ok=True)
        plt.imsave(fname=f"{dst_folder}{raw_fn}_char{i}.jpg", arr=char_img, format="jpg")
        if label not in _DIGITS:
            char_img_clahe = clahe_rgb(char_img)
            plt.imsave(fname=f"{dst_folder}{raw_fn}_char{i}_clahe.jpg", arr=char_img_clahe, format="jpg")
    

In [6]:
# Extract character crops from every image found above. This is the long-running
# step: the recorded run took a little over two hours for 20000 images.
for fn in tqdm(fns):
    save_char_box_images(fn)

100%|██████████████████████████████████████████████████████████████████████████| 20000/20000 [2:08:12<00:00,  5.14it/s]
