In [2]:
%matplotlib inline
import glob
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter
import numpy as np
from tqdm import tqdm
from collections import deque

import sklearn
from sklearn.model_selection import train_test_split,StratifiedGroupKFold

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
def read_json(filename):
    with Path(filename).open(encoding='utf8') as handle:
        ann = json.load(handle)
    return ann

In [5]:
ICDAR_data = read_json("/opt/ml/input/data/ICDAR17_Korean/ufo/train.json")
UPSTAGE_data = read_json("/opt/ml/input/data/upstage/ufo/annotation.json")

In [6]:
def get_box_size(quads):
    """ 단어 영역의 사각형 좌표가 주어졌을 때 가로, 세로길이를 계산해주는 함수.
    TODO: 각 변의 길이를 단순히 max로 처리하기때문에 직사각형에 가까운 형태가 아니면 약간 왜곡이 있다.
    Args:
        quads: np.ndarray(n, 4, 2) n개 단어 bounding-box의 4개 점 좌표 (단위 pixel)
    Return:
        sizes: np.ndarray(n, 2) n개 box의 (height, width)쌍
    """
    dists = []
    for i, j in [(1, 2), (3, 0), (0, 1), (2, 3)]: # [right(height), left(height), upper(width), lower(width)] sides
        dists.append(np.linalg.norm(quads[:, i] - quads[:, j], ord=2, axis=1))

    dists = np.stack(dists, axis=-1).reshape(-1, 2, 2) # shape (n, 2, 2) widths, heights into separate dim
    return np.rint(dists.mean(axis=-1)).astype(int)

In [7]:
def rectify_poly(poly, direction, img_w, img_h):
    """일반 polygon형태인 라벨을 크롭하고 rectify해주는 함수.
    Args:
        poly: np.ndarray(2n+4, 2) (where n>0), 4, 6, 8
        image: np.ndarray opencv 포멧의 이미지
        direction: 글자의 읽는 방향과 진행 방향의 수평(Horizontal) 혹은 수직(Vertical) 여부
    Return:
        rectified: np.ndarray(2, ?) rectify된 단어 bbox의 사이즈.
    """
    
    n_pts = poly.shape[0]
    assert n_pts % 2 == 0
    if n_pts == 4:
        size = get_box_size(poly[None])
        h = size[:, 0] / img_h
        w = size[:, 1] / img_w
        return np.stack((h,w))

    def unroll(indices):
        return list(zip(indices[:-1], indices[1:]))

    # polygon하나를 인접한 사각형 여러개로 쪼갠다.
    indices = list(range(n_pts))
    if direction == 'Horizontal':
        upper_pts = unroll(indices[:n_pts // 2]) # (0, 1), (1, 2), ... (4, 5)
        lower_pts = unroll(indices[n_pts // 2:])[::-1] # (8, 9), (7, 8), ... (6, 7)

        quads = np.stack([poly[[i, j, k, l]] for (i, j), (k, l) in zip(upper_pts, lower_pts)])
    else:
        right_pts = unroll(indices[1:n_pts // 2 + 1]) # (1, 2), (2, 3), ... (4, 5)
        left_pts = unroll([0] + indices[:n_pts // 2:-1]) # (0, 9), (9, 8), ... (7, 6)

        quads = np.stack([poly[[i, j, k, l]] for (j, k), (i, l) in zip(right_pts, left_pts)])

    sizes = get_box_size(quads)
    if direction == 'Horizontal':
        h = sizes[:, 0].max() / img_h
        widths = sizes[:, 1]
        w = np.sum(widths) / img_w
        return np.stack((h,w)).reshape(2,-1)
        #return np.stack((h,w))
    elif direction == 'Vertical':
        heights = sizes[:, 0]
        w = sizes[:, 1].max() / img_w
        h = np.sum(heights) / img_h
        return np.stack((h,w)).reshape(2,-1)
    else:
        h = sizes[:, 0] / img_h
        w = sizes[:, 1] / img_w
        return np.stack((h,w),-1)

In [8]:
def making_df(data):   
    df = dict()
    df['image'] = deque()
    df['word_counts'] = deque()
    df['image_width'] = deque()
    df['image_height'] = deque()
    df['image_tags'] = deque()
    img_tags = deque()

    quads = deque()
    polys = deque()
    seq_length = deque()
    hor_sizes = deque()
    ver_sizes = deque()
    irr_sizes = deque()
    languages = deque()
    orientation = deque()
    word_tags = deque()
    aspect_ratio = deque()
    ver_string = deque()

    for image_key, image_value in tqdm(data["images"].items()):
        df['image'].append(image_key)
        img_w = image_value['img_w']
        img_h = image_value['img_h']
        df['image_width'].append(img_w)
        df['image_height'].append(img_h)
        df['image_tags'].append(image_value['tags'])
        df['image_tags']= [['None'] if v is None else v for v in df['image_tags']] # our data does not inlcude multi-tag images 
        word_ann = image_value['words']
        count_ill = 0 
        for word in word_ann.values():
            if word['illegibility']== False:
                orientation.append(word['orientation'])
                orientation = [v for v in orientation]
                seq_length.append(len(word['transcription']))
                languages.append(word['language'])
                languages = [['None'] if v is None else v for v in languages] # our data does not inlcude multi-language words
                if word['tags'] != None:
                    word_tags.extend(word['tags'][:])
                elif word['tags']== None:
                    word_tags.append('None')
                poly = np.int32(word['points'])
                size = rectify_poly(poly, word['orientation'], img_w, img_h)
                if word['orientation'] == 'Horizontal':
                    hor_sizes.append(size)
                elif word['orientation'] == 'Vertical':
                    ver_sizes.append(size)
                else:
                    irr_sizes.append(size)
                
            else:
                count_ill += 1
        df['word_counts'].append(len(word_ann)-count_ill)

            
    all_sizes = hor_sizes + ver_sizes + irr_sizes
    quad_area = [all_sizes[i][0]*all_sizes[i][1] for i in range(len(all_sizes))]
    total_area = deque()
    for s in quad_area:
        if s.shape[0] == 1:
            total_area.append(np.sum(s[0])) 
        else:
            total_area.append(np.sum(s))

    hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]
    ver_aspect_ratio = [ver_sizes[i][1]/ver_sizes[i][0] for i in range(len(ver_sizes))]
    image_df = pd.DataFrame.from_dict(df)
    return image_df 

# 여기부터 시작


In [9]:
def add_word_counts_amount_to_df():
    _25percent = ICDAR_image_df.describe().word_counts.loc['25%']
    _50percent = ICDAR_image_df.describe().word_counts.loc['50%']
    _75percent = ICDAR_image_df.describe().word_counts.loc['75%']

    _33percent = (_25percent+_50percent)/2
    _66percent = (_50percent+_75percent)/2
    ICDAR_image_df['word_counts_amount'] = np.where(ICDAR_image_df["word_counts"] <= _33percent, 'few',
                                                np.where(ICDAR_image_df["word_counts"]<=_66percent,'normal','many'))


In [10]:
def check_ratio_and_dump(cv): 
    cv=cv
    var = [(ICDAR_image_df.loc[idx,'image'], ICDAR_image_df.loc[idx,'word_counts_amount']) for idx in range(len(ICDAR_image_df))]

    X = np.ones((len(var), ))
    y = np.array([v[1] for v in var])
    groups = np.array([v[0] for v in  var])

    for train_idx, test_idx in cv.split(X, y, groups):
        print("TRAIN:", groups[train_idx])
        print("      ", y[train_idx])
        print("TEST :", groups[test_idx])
        print("      ", y[test_idx])
        
    amounts = ['few','normal','many']
    wd = {'few': 0, 'many': 1, 'normal': 2}
    # check distribution
    def get_distribution(y):
        y_distr = Counter(y)
        y_vals_sum = sum(y_distr.values())
        
        return [f'{y_distr[i]/y_vals_sum:.2%}'  for i in amounts]
        
    distrs = [get_distribution(y)]
    index = ['training set']

    for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
        train_y, val_y = y[train_idx], y[val_idx]
        train_gr, val_gr = groups[train_idx], groups[val_idx]
        
        assert len(set(train_gr) & set(val_gr)) == 0
        
        distrs.append(get_distribution(train_y))
        distrs.append(get_distribution(val_y))
        
        index.append(f'train - fold{fold_ind}')
        index.append(f'val - fold{fold_ind}')
                    
    categories = [d for d in wd.keys()]

    print(pd.DataFrame(distrs, index=index, columns = [amounts[i] for i in range(3)]))

    for idx, (train_idx, valid_idx) in enumerate(cv.split(X,y, groups)):
        ICDAR_train_fold = dict(); ICDAR_valid_fold = dict()
        ICDAR_train_fold['images']={}; ICDAR_valid_fold['images']={}
        
        for i in range(len(groups[train_idx])):
            ICDAR_train_fold['images'][groups[train_idx][i]] = ICDAR_data['images'][groups[train_idx][i]]

        for i in range(len(groups[valid_idx])):
            ICDAR_valid_fold['images'][groups[valid_idx][i]] = ICDAR_data['images'][groups[valid_idx][i]]
        
        
        with open(f'../../../input/data/ICDAR19/ufo/ICDAR19_{language}_train_fold{idx}.json', 'w') as f:
            json.dump(ICDAR_train_fold, f, indent = 4)

        with open(f'../../../input/data/ICDAR19/ufo/ICDAR19_{language}_valid_fold{idx}.json', 'w') as f:
            json.dump(ICDAR_valid_fold, f, indent = 4)

In [13]:
cv = StratifiedGroupKFold(n_splits=2, shuffle = True, random_state=2022)
languages=['Latin','Arabic','Symbols','None','Chinese','Mixed','Japanese','Korean','Bangla','Hindi']

language='Total'
ICDAR_data = read_json(f'../../../input/data/ICDAR19/ufo/ICDAR19_{language}.json')
ICDAR_image_df = making_df(ICDAR_data)
add_word_counts_amount_to_df()
check_ratio_and_dump(cv)

100%|██████████| 9996/9996 [17:57<00:00,  9.27it/s]  
  hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]
  hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]


TRAIN: ['tr_img_00002.jpg' 'tr_img_00004.jpg' 'tr_img_00005.jpg' ...
 'tr_img_09996.jpg' 'tr_img_09998.jpg' 'tr_img_10000.jpg']
       ['normal' 'normal' 'few' ... 'few' 'few' 'normal']
TEST : ['tr_img_00001.jpg' 'tr_img_00003.jpg' 'tr_img_00006.jpg' ...
 'tr_img_09995.jpg' 'tr_img_09997.jpg' 'tr_img_09999.jpg']
       ['many' 'normal' 'few' ... 'normal' 'normal' 'normal']
TRAIN: ['tr_img_00001.jpg' 'tr_img_00003.jpg' 'tr_img_00006.jpg' ...
 'tr_img_09995.jpg' 'tr_img_09997.jpg' 'tr_img_09999.jpg']
       ['many' 'normal' 'few' ... 'normal' 'normal' 'normal']
TEST : ['tr_img_00002.jpg' 'tr_img_00004.jpg' 'tr_img_00005.jpg' ...
 'tr_img_09996.jpg' 'tr_img_09998.jpg' 'tr_img_10000.jpg']
       ['normal' 'normal' 'few' ... 'few' 'few' 'normal']
                  few  normal    many
training set   42.03%  25.30%  32.67%
train - fold0  42.18%  25.29%  32.53%
val - fold0    41.88%  25.31%  32.81%
train - fold1  41.88%  25.31%  32.81%
val - fold1    42.18%  25.29%  32.53%


In [None]:
for idx in range(10):
    language = languages[idx]
    ICDAR_data = read_json(f"../../../input/data/ICDAR19/ufo/ICDAR19_{language}.json")
    ICDAR_image_df = making_df(ICDAR_data)
    add_word_counts_amount_to_df()
    check_ratio_and_dump(cv)

# Merge df 관련

In [None]:
merge_df = pd.concat([ICDAR_image_df,UPSTAGE_image_df])