# Data split version description
## Version 1
>>
가장 기본적인 random sampling (비복원 추출) 방식을 따랐습니다.\
Random seed는 42로 설정했습니다.\
생성되는 .json 파일은 ufo 폴더 안에 위치하도록 했으며 dataset 별로 모두 ufo 폴더가 있다고 가정하고 진행합니다.

# Load modules

In [1]:
import os
import pandas as pd
import json
import numpy as np
import random

from pathlib import Path
from sklearn.model_selection import train_test_split

## Fix random seed

In [2]:
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# Load original dataset

In [3]:
def read_json(filename):
    with Path(filename).open(encoding='utf8') as handle:
        ann = json.load(handle)
    return ann    

In [4]:
root_dir = '../input/data/'
path_ICDAR17 = 'ICDAR17_Korean/ufo/'
data_nm = 'train.json'

data = read_json(os.path.join(root_dir, path_ICDAR17, data_nm))

# Get image properties

In [5]:
def get_box_size(quads):
    """ 단어 영역의 사각형 좌표가 주어졌을 때 가로, 세로길이를 계산해주는 함수.
    TODO: 각 변의 길이를 단순히 max로 처리하기때문에 직사각형에 가까운 형태가 아니면 약간 왜곡이 있다.
    Args:
        quads: np.ndarray(n, 4, 2) n개 단어 bounding-box의 4개 점 좌표 (단위 pixel)
    Return:
        sizes: np.ndarray(n, 2) n개 box의 (height, width)쌍
    """
    dists = []
    for i, j in [(1, 2), (3, 0), (0, 1), (2, 3)]: # [right(height), left(height), upper(width), lower(width)] sides
        dists.append(np.linalg.norm(quads[:, i] - quads[:, j], ord=2, axis=1))

    dists = np.stack(dists, axis=-1).reshape(-1, 2, 2) # shape (n, 2, 2) widths, heights into separate dim
    return np.rint(dists.mean(axis=-1)).astype(int)


def rectify_poly(poly, direction, img_w, img_h):
    """일반 polygon형태인 라벨을 크롭하고 rectify해주는 함수.
    Args:
        poly: np.ndarray(2n+4, 2) (where n>0), 4, 6, 8
        image: np.ndarray opencv 포멧의 이미지
        direction: 글자의 읽는 방향과 진행 방향의 수평(Horizontal) 혹은 수직(Vertical) 여부
    Return:
        rectified: np.ndarray(2, ?) rectify된 단어 bbox의 사이즈.
    """
    
    n_pts = poly.shape[0]
    assert n_pts % 2 == 0
    if n_pts == 4:
        size = get_box_size(poly[None])
        h = size[:, 0] / img_h
        w = size[:, 1] / img_w
        return np.stack((h,w))

    def unroll(indices):
        return list(zip(indices[:-1], indices[1:]))

    # polygon하나를 인접한 사각형 여러개로 쪼갠다.
    indices = list(range(n_pts))
    if direction == 'Horizontal':
        upper_pts = unroll(indices[:n_pts // 2]) # (0, 1), (1, 2), ... (4, 5)
        lower_pts = unroll(indices[n_pts // 2:])[::-1] # (8, 9), (7, 8), ... (6, 7)

        quads = np.stack([poly[[i, j, k, l]] for (i, j), (k, l) in zip(upper_pts, lower_pts)])
    else:
        right_pts = unroll(indices[1:n_pts // 2 + 1]) # (1, 2), (2, 3), ... (4, 5)
        left_pts = unroll([0] + indices[:n_pts // 2:-1]) # (0, 9), (9, 8), ... (7, 6)

        quads = np.stack([poly[[i, j, k, l]] for (j, k), (i, l) in zip(right_pts, left_pts)])

    sizes = get_box_size(quads)
    if direction == 'Horizontal':
        h = sizes[:, 0].max() / img_h
        widths = sizes[:, 1]
        w = np.sum(widths) / img_w
        return np.stack((h,w)).reshape(2,-1)
        #return np.stack((h,w))
    elif direction == 'Vertical':
        heights = sizes[:, 0]
        w = sizes[:, 1].max() / img_w
        h = np.sum(heights) / img_h
        return np.stack((h,w)).reshape(2,-1)
    else:
        h = sizes[:, 0] / img_h
        w = sizes[:, 1] / img_w
        return np.stack((h,w),-1)
    
def get_image_df

In [6]:
df = {}
df['image'] = []
df['word_counts'] = []
df['image_width'] = []
df['image_height'] = []
df['image_tags'] = []
img_tags = []

quads = []
polys = []
seq_length = []
hor_sizes = []
ver_sizes = []
irr_sizes = []
languages = []
orientation = []
word_tags = []
aspect_ratio = []
ver_string = []

for image_key, image_value in data["images"].items():
    df['image'].append(image_key)
    img_w = image_value['img_w']
    img_h = image_value['img_h']
    df['image_width'].append(img_w)
    df['image_height'].append(img_h)
    df['image_tags'].append(image_value['tags'])
    df['image_tags']= [['None'] if v is None else v for v in df['image_tags']] # our data does not inlcude multi-tag images 
    word_ann = image_value['words']
    count_ill = 0 
    for word in word_ann.values():
        if word['illegibility']== False:
            orientation.append(word['orientation'])
            orientation = [v for v in orientation]
            seq_length.append(len(word['transcription']))
            languages.append(word['language'])
            languages = [['None'] if v is None else v for v in languages] # our data does not inlcude multi-language words
            if word['word_tags'] != None:
                word_tags.extend(word['word_tags'][:])
            elif word['word_tags']== None:
                word_tags.append('None')
            poly = np.int32(word['points'])
            size = rectify_poly(poly, word['orientation'], img_w, img_h)
            if word['orientation'] == 'Horizontal':
                hor_sizes.append(size)
            elif word['orientation'] == 'Vertical':
                ver_sizes.append(size)
            else:
                irr_sizes.append(size)
            
        else:
            count_ill += 1
        break
    
    df['word_counts'].append(len(word_ann)-count_ill)
    
    
all_sizes = hor_sizes + ver_sizes + irr_sizes
quad_area = [all_sizes[i][0]*all_sizes[i][1] for i in range(len(all_sizes))]
total_area = []
for s in quad_area:
    if s.shape[0] == 1:
        total_area.append(np.sum(s[0])) 
    else:
        total_area.append(np.sum(s))

hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]
ver_aspect_ratio = [ver_sizes[i][1]/ver_sizes[i][0] for i in range(len(ver_sizes))]

image_df = pd.DataFrame.from_dict(df)

In [7]:
# Check data frame
image_df

Unnamed: 0,image,word_counts,image_width,image_height,image_tags
0,img_4380.jpg,3,1836,2448,[None]
1,img_4583.jpg,2,2268,2268,[None]
2,img_4234.jpg,7,2592,3456,[None]
3,img_4345.jpg,7,1836,2448,[None]
4,img_4016.jpg,1,1836,2448,[None]
...,...,...,...,...,...
531,img_1048.jpg,4,3024,2268,[None]
532,img_1071.jpg,2,2448,1836,[None]
533,img_1122.jpg,9,2448,1836,[None]
534,img_1131.jpg,2,3144,2328,[None]


In [8]:
# Split images at random
X_train_v1, X_valid_v1, y_train_v1, y_valid_v1 = train_test_split(image_df.image, image_df.image, test_size=0.2, shuffle=True, random_state=seed)

In [9]:
# Simple check split result
print(sum(X_train_v1!=y_train_v1))
print(set(y_train_v1).intersection(set(y_valid_v1)))

0
set()


In [10]:
type(data['images'])

dict

In [11]:
train_v1 = {'images': {k: v for k, v in data['images'].items() if k in X_train_v1.values}}
valid_v1 = {'images': {k: v for k, v in data['images'].items() if k in X_valid_v1.values}}

In [12]:
# Just for check
print(len(train_v1['images']))
print(len(valid_v1['images']))

428
108


# Save validation version 1

In [20]:
def save_json(data: dict, file_nm: str, dir_path: str):
    with open(os.path.join(dir_path, file_nm), 'w') as outfile:
        json.dump(data, outfile)

In [21]:
data_list = [train_v1,
             valid_v1
            ]
file_nm_list = ['train_v1.json',
                'valid_v1.json',
               ]

for data, file_nm in zip(data_list, file_nm_list):
    save_json(data, file_nm, dir_path=os.path.join(root_dir, path_ICDAR17))