In [5]:
import os
import pandas as pd
import json
import numpy as np
import random
import seaborn as sns
import matplotlib as mpl

from pathlib import Path
from sklearn.model_selection import train_test_split
from copy import deepcopy
from matplotlib import pyplot as plt

In [6]:
# One shared seed for every random number generator this notebook relies on.
seed = 214

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# this assignment presumably only reaches child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
random.seed(seed)

In [7]:
def read_json(filename):
    """Parse a UTF-8 encoded JSON file and return the decoded object.

    Args:
        filename: path of the JSON file (anything accepted by pathlib.Path).
    Return:
        The Python object decoded from the file contents.
    """
    raw_text = Path(filename).read_text(encoding='utf8')
    return json.loads(raw_text)

def save_json(data: dict, file_nm: str, dir_path: str):
    """Write `data` as JSON (4-space indent) to the file `file_nm` in `dir_path`.

    Args:
        data: JSON-serialisable object to store.
        file_nm: name of the output file.
        dir_path: directory the file is written into.
    """
    target_path = os.path.join(dir_path, file_nm)
    with open(target_path, 'w') as sink:
        json.dump(data, sink, indent=4)

# Dataset root folder (hard-coded absolute path); the split files written at
# the end of the notebook go into its 'ufo' sub-folder as well.
root_dir = '/opt/ml/input/data/ICDAR19_ArT'

# Full annotation dict, loaded once; later cells read it through `data_org`.
data_org = read_json(os.path.join(root_dir, 'ufo/train.json'))

In [12]:
def get_box_size(quads):
    """Compute the height and width of quadrilateral word boxes.

    Each dimension is the mean length of the two opposite sides (right/left
    for the height, upper/lower for the width), rounded to the nearest
    integer.
    TODO: averaging opposite sides is only an approximation, so boxes that are
    far from rectangular come out slightly distorted.
    Args:
        quads: np.ndarray(n, 4, 2), the 4 corner coordinates of n word
            bounding boxes (in pixels).
    Return:
        sizes: np.ndarray(n, 2) of ints, one (height, width) pair per box.
    """
    # Corner index pairs per side, in the order:
    # right (height), left (height), upper (width), lower (width).
    side_corner_pairs = ((1, 2), (3, 0), (0, 1), (2, 3))
    side_lengths = np.stack(
        [np.linalg.norm(quads[:, a] - quads[:, b], ord=2, axis=1)
         for a, b in side_corner_pairs],
        axis=-1,
    )  # shape (n, 4)

    # Group as (n, 2, 2): [[right, left], [upper, lower]] so that averaging
    # the last axis yields (height, width).
    paired_sides = side_lengths.reshape(-1, 2, 2)
    mean_sizes = paired_sides.mean(axis=-1)
    return np.rint(mean_sizes).astype(int)


def rectify_poly(poly, direction, img_w, img_h):
    """Estimate the image-normalised size of a polygonal word label.

    A 4-point polygon is measured directly with get_box_size. A longer polygon
    is split into adjacent quadrilaterals, each one is measured, and the
    pieces are combined according to the reading direction.

    Args:
        poly: np.ndarray(2n+4, 2) (n >= 0), i.e. 4, 6, 8, ... polygon points.
        direction: reading direction of the text. 'Horizontal' and 'Vertical'
            are handled explicitly; any other value (e.g. an irregular word)
            is split like a vertical word and reported per quadrilateral.
        img_w: image width, used to normalise widths.
        img_h: image height, used to normalise heights.
    Return:
        np.ndarray(2, k): row 0 holds heights / img_h and row 1 holds
        widths / img_w. k is 1 for 4-point polygons and for the 'Horizontal'
        and 'Vertical' directions; for any other direction there is one
        column per sub-quadrilateral.
    Raises:
        AssertionError: if the polygon has an odd number of points.
    """
    n_pts = poly.shape[0]
    assert n_pts % 2 == 0
    if n_pts == 4:
        size = get_box_size(poly[None])
        h = size[:, 0] / img_h
        w = size[:, 1] / img_w
        return np.stack((h, w))

    def unroll(indices):
        # Consecutive index pairs: [a, b, c] -> [(a, b), (b, c)].
        return list(zip(indices[:-1], indices[1:]))

    # Split the polygon into adjacent quadrilaterals.
    # The index examples below are for n_pts == 10.
    indices = list(range(n_pts))
    if direction == 'Horizontal':
        upper_pts = unroll(indices[:n_pts // 2])  # (0, 1), (1, 2), ... (3, 4)
        lower_pts = unroll(indices[n_pts // 2:])[::-1]  # (8, 9), (7, 8), ... (5, 6)

        quads = np.stack([poly[[i, j, k, l]] for (i, j), (k, l) in zip(upper_pts, lower_pts)])
    else:
        right_pts = unroll(indices[1:n_pts // 2 + 1])  # (1, 2), (2, 3), ... (4, 5)
        left_pts = unroll([0] + indices[:n_pts // 2:-1])  # (0, 9), (9, 8), ... (7, 6)

        quads = np.stack([poly[[i, j, k, l]] for (j, k), (i, l) in zip(right_pts, left_pts)])

    sizes = get_box_size(quads)
    heights = sizes[:, 0]
    widths = sizes[:, 1]
    if direction == 'Horizontal':
        # Pieces sit side by side: widths add up, the tallest piece sets the height.
        h = heights.max() / img_h
        w = np.sum(widths) / img_w
        return np.stack((h, w)).reshape(2, -1)
    if direction == 'Vertical':
        # Pieces are stacked: heights add up, the widest piece sets the width.
        w = widths.max() / img_w
        h = np.sum(heights) / img_h
        return np.stack((h, w)).reshape(2, -1)

    # Any other direction: keep one (height, width) column per quadrilateral.
    # Stack on axis 0 so the layout is (2, k) like every other branch; stacking
    # on the last axis produced (k, 2), which made size[0] / size[1] rows of
    # sub-quadrilaterals instead of the height row and the width row.
    return np.stack((heights / img_h, widths / img_w))
    
def get_image_dfs(data):
    """Summarise an annotation dict into a per-image and a per-word table.

    Only words whose 'illegibility' flag equals False contribute to the word
    counts and to the per-word table.

    Args:
        data: dict with an "images" mapping of image name -> annotation. Each
            annotation is read for 'img_w', 'img_h', 'tags' and 'words'; each
            word is read for 'illegibility', 'orientation' and 'points'.
    Return:
        image_df: pd.DataFrame with one row per image and the columns
            'image', 'word_counts' (legible words only), 'image_width',
            'image_height' and 'image_tags' (['None'] when the tags are None).
        bbox_df: pd.DataFrame with one row per legible word and the columns
            'image', 'size' (the array returned by rectify_poly),
            'orientation' ('Horizontal', 'Vertical', otherwise 'Irregular')
            and 'aspect_ratio', computed as (size[1] / size[0])[0]. For a
            (2, 1) size this is normalised width over normalised height, which
            differs from the pixel aspect ratio because the two are divided by
            different image dimensions.
    """
    # Key order defines the column order of image_df.
    image_columns = {
        'image': [],
        'word_counts': [],
        'image_width': [],
        'image_height': [],
        'image_tags': [],
    }
    bbox_properties = []
    aspect_ratios = []

    for image_key, image_value in data["images"].items():
        img_w = image_value['img_w']
        img_h = image_value['img_h']
        tags = image_value['tags']

        legible_count = 0
        for word in image_value['words'].values():
            # Compared against False on purpose: a None flag is treated as
            # illegible, exactly like True.
            if word['illegibility'] != False:  # noqa: E712
                continue
            legible_count += 1

            orientation = word['orientation']
            size = rectify_poly(np.int32(word['points']), orientation, img_w, img_h)
            if orientation not in ('Horizontal', 'Vertical'):
                orientation = 'Irregular'

            bbox_properties.append([image_key, size, orientation])
            aspect_ratios.append((size[1] / size[0])[0])

        image_columns['image'].append(image_key)
        image_columns['word_counts'].append(legible_count)
        image_columns['image_width'].append(img_w)
        image_columns['image_height'].append(img_h)
        # The data does not include multi-tag images; missing tags become ['None'].
        image_columns['image_tags'].append(['None'] if tags is None else tags)

    image_df = pd.DataFrame.from_dict(image_columns)
    bbox_df = pd.DataFrame(data=bbox_properties,
                           columns=['image', 'size', 'orientation'])
    bbox_df['aspect_ratio'] = aspect_ratios

    return image_df, bbox_df

In [13]:
# Build the per-image and per-word summary tables from the loaded annotations.
image_df, bbox_df = get_image_dfs(data_org)

In [14]:
# Per-image summary: legible word count, image size and tags.
image_df

Unnamed: 0,image,word_counts,image_width,image_height,image_tags
0,gt_4616.jpg,16,390,500,[None]
1,gt_1295.jpg,9,1280,960,[None]
2,gt_894.jpg,24,576,1024,[None]
3,gt_5361.jpg,25,959,1057,[None]
4,gt_3489.jpg,6,191,320,[None]
...,...,...,...,...,...
5044,gt_4588.jpg,7,503,301,[None]
5045,gt_4728.jpg,3,420,374,[None]
5046,gt_2851.jpg,10,719,1280,[None]
5047,gt_3057.jpg,12,1280,960,[None]


In [15]:
# Per-word summary: size, orientation and aspect ratio of every legible word.
bbox_df

Unnamed: 0,image,size,orientation,aspect_ratio
0,gt_4616.jpg,"[[0.048], [0.47692307692307695]]",Horizontal,9.935897
1,gt_4616.jpg,"[[0.024], [0.0641025641025641]]",Horizontal,2.670940
2,gt_4616.jpg,"[[0.026], [0.07179487179487179]]",Horizontal,2.761341
3,gt_4616.jpg,"[[0.026], [0.08974358974358974]]",Horizontal,3.451677
4,gt_4616.jpg,"[[0.026], [0.1282051282051282]]",Horizontal,4.930966
...,...,...,...,...
47877,gt_3057.jpg,"[[0.022916666666666665], [0.075]]",Horizontal,3.272727
47878,gt_3057.jpg,"[[0.026041666666666668], [0.140625]]",Horizontal,5.400000
47879,gt_1077.jpg,"[[0.08333333333333333], [0.20625]]",Horizontal,2.475000
47880,gt_1077.jpg,"[[0.07916666666666666], [0.128125]]",Horizontal,1.618421


In [16]:
# Random 80/20 split at image level, reproducible through `seed`.
# A single array is enough: the split depends only on the number of samples
# and on random_state.
X_train, X_test = train_test_split(image_df.image, test_size=0.2, shuffle=True, random_state=seed)

# Sets give constant-time membership tests; `k in Series.values` scans the
# whole array for every image key.
train_keys = set(X_train)
valid_keys = set(X_test)

# Only the 'images' mapping is carried over into the split files.
train = {'images': {k: v for k, v in data_org['images'].items() if k in train_keys}}
valid = {'images': {k: v for k, v in data_org['images'].items() if k in valid_keys}}

data_list = [train, valid]
file_nm_list = ['train1.json', 'valid1.json']

for data, file_nm in zip(data_list, file_nm_list):
    save_json(data, file_nm, dir_path=os.path.join(root_dir, 'ufo'))