In [1]:
%matplotlib inline
import glob
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter
import numpy as np
from tqdm import tqdm
from collections import deque

import sklearn
from sklearn.model_selection import train_test_split,StratifiedGroupKFold

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
def read_json(filename):
    with Path(filename).open(encoding='utf8') as handle:
        ann = json.load(handle)
    return ann

In [4]:
ICDAR_data = read_json("/opt/ml/input/data/ICDAR17_Korean/ufo/train.json")
UPSTAGE_data = read_json("/opt/ml/input/data/upstage/ufo/annotation.json")

In [5]:
def get_box_size(quads):
    """ 단어 영역의 사각형 좌표가 주어졌을 때 가로, 세로길이를 계산해주는 함수.
    TODO: 각 변의 길이를 단순히 max로 처리하기때문에 직사각형에 가까운 형태가 아니면 약간 왜곡이 있다.
    Args:
        quads: np.ndarray(n, 4, 2) n개 단어 bounding-box의 4개 점 좌표 (단위 pixel)
    Return:
        sizes: np.ndarray(n, 2) n개 box의 (height, width)쌍
    """
    dists = []
    for i, j in [(1, 2), (3, 0), (0, 1), (2, 3)]: # [right(height), left(height), upper(width), lower(width)] sides
        dists.append(np.linalg.norm(quads[:, i] - quads[:, j], ord=2, axis=1))

    dists = np.stack(dists, axis=-1).reshape(-1, 2, 2) # shape (n, 2, 2) widths, heights into separate dim
    return np.rint(dists.mean(axis=-1)).astype(int)

In [6]:
def rectify_poly(poly, direction, img_w, img_h):
    """일반 polygon형태인 라벨을 크롭하고 rectify해주는 함수.
    Args:
        poly: np.ndarray(2n+4, 2) (where n>0), 4, 6, 8
        image: np.ndarray opencv 포멧의 이미지
        direction: 글자의 읽는 방향과 진행 방향의 수평(Horizontal) 혹은 수직(Vertical) 여부
    Return:
        rectified: np.ndarray(2, ?) rectify된 단어 bbox의 사이즈.
    """
    
    n_pts = poly.shape[0]
    assert n_pts % 2 == 0
    if n_pts == 4:
        size = get_box_size(poly[None])
        h = size[:, 0] / img_h
        w = size[:, 1] / img_w
        return np.stack((h,w))

    def unroll(indices):
        return list(zip(indices[:-1], indices[1:]))

    # polygon하나를 인접한 사각형 여러개로 쪼갠다.
    indices = list(range(n_pts))
    if direction == 'Horizontal':
        upper_pts = unroll(indices[:n_pts // 2]) # (0, 1), (1, 2), ... (4, 5)
        lower_pts = unroll(indices[n_pts // 2:])[::-1] # (8, 9), (7, 8), ... (6, 7)

        quads = np.stack([poly[[i, j, k, l]] for (i, j), (k, l) in zip(upper_pts, lower_pts)])
    else:
        right_pts = unroll(indices[1:n_pts // 2 + 1]) # (1, 2), (2, 3), ... (4, 5)
        left_pts = unroll([0] + indices[:n_pts // 2:-1]) # (0, 9), (9, 8), ... (7, 6)

        quads = np.stack([poly[[i, j, k, l]] for (j, k), (i, l) in zip(right_pts, left_pts)])

    sizes = get_box_size(quads)
    if direction == 'Horizontal':
        h = sizes[:, 0].max() / img_h
        widths = sizes[:, 1]
        w = np.sum(widths) / img_w
        return np.stack((h,w)).reshape(2,-1)
        #return np.stack((h,w))
    elif direction == 'Vertical':
        heights = sizes[:, 0]
        w = sizes[:, 1].max() / img_w
        h = np.sum(heights) / img_h
        return np.stack((h,w)).reshape(2,-1)
    else:
        h = sizes[:, 0] / img_h
        w = sizes[:, 1] / img_w
        return np.stack((h,w),-1)

In [7]:
def making_df(data):   
    df = dict()
    df['image'] = deque()
    df['word_counts'] = deque()
    df['image_width'] = deque()
    df['image_height'] = deque()
    df['image_tags'] = deque()
    img_tags = deque()

    quads = deque()
    polys = deque()
    seq_length = deque()
    hor_sizes = deque()
    ver_sizes = deque()
    irr_sizes = deque()
    languages = deque()
    orientation = deque()
    word_tags = deque()
    aspect_ratio = deque()
    ver_string = deque()

    for image_key, image_value in tqdm(data["images"].items()):
        df['image'].append(image_key)
        img_w = image_value['img_w']
        img_h = image_value['img_h']
        df['image_width'].append(img_w)
        df['image_height'].append(img_h)
        df['image_tags'].append(image_value['tags'])
        df['image_tags']= [['None'] if v is None else v for v in df['image_tags']] # our data does not inlcude multi-tag images 
        word_ann = image_value['words']
        count_ill = 0 
        for word in word_ann.values():
            if word['illegibility']== False:
                orientation.append(word['orientation'])
                orientation = [v for v in orientation]
                seq_length.append(len(word['transcription']))
                languages.append(word['language'])
                languages = [['None'] if v is None else v for v in languages] # our data does not inlcude multi-language words
                if word['tags'] != None:
                    word_tags.extend(word['tags'][:])
                elif word['tags']== None:
                    word_tags.append('None')
                poly = np.int32(word['points'])
                size = rectify_poly(poly, word['orientation'], img_w, img_h)
                if word['orientation'] == 'Horizontal':
                    hor_sizes.append(size)
                elif word['orientation'] == 'Vertical':
                    ver_sizes.append(size)
                else:
                    irr_sizes.append(size)
                
            else:
                count_ill += 1
        df['word_counts'].append(len(word_ann)-count_ill)

            
    all_sizes = hor_sizes + ver_sizes + irr_sizes
    quad_area = [all_sizes[i][0]*all_sizes[i][1] for i in range(len(all_sizes))]
    total_area = deque()
    for s in quad_area:
        if s.shape[0] == 1:
            total_area.append(np.sum(s[0])) 
        else:
            total_area.append(np.sum(s))

    hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]
    ver_aspect_ratio = [ver_sizes[i][1]/ver_sizes[i][0] for i in range(len(ver_sizes))]
    image_df = pd.DataFrame.from_dict(df)
    return image_df 

# 여기부터 시작


In [29]:
def add_word_counts_amount_to_df():
    _25percent = ICDAR_image_df.describe().word_counts.loc['25%']
    _50percent = ICDAR_image_df.describe().word_counts.loc['50%']
    _75percent = ICDAR_image_df.describe().word_counts.loc['75%']

    _33percent = (_25percent+_50percent)/2
    _66percent = (_50percent+_75percent)/2
    ICDAR_image_df['word_counts_amount'] = np.where(ICDAR_image_df["word_counts"] <= _33percent, 'few',
                                                np.where(ICDAR_image_df["word_counts"]<=_66percent,'normal','many'))


In [34]:
def check_ratio_and_dump(cv): 
    cv=cv
    var = [(ICDAR_image_df.loc[idx,'image'], ICDAR_image_df.loc[idx,'word_counts_amount']) for idx in range(len(ICDAR_image_df))]

    X = np.ones((len(var), ))
    y = np.array([v[1] for v in var])
    groups = np.array([v[0] for v in  var])

    for train_idx, test_idx in cv.split(X, y, groups):
        print("TRAIN:", groups[train_idx])
        print("      ", y[train_idx])
        print("TEST :", groups[test_idx])
        print("      ", y[test_idx])
        
    amounts = ['few','normal','many']
    wd = {'few': 0, 'many': 1, 'normal': 2}
    # check distribution
    def get_distribution(y):
        y_distr = Counter(y)
        y_vals_sum = sum(y_distr.values())
        
        return [f'{y_distr[i]/y_vals_sum:.2%}'  for i in amounts]
        
    distrs = [get_distribution(y)]
    index = ['training set']

    for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X,y, groups)):
        train_y, val_y = y[train_idx], y[val_idx]
        train_gr, val_gr = groups[train_idx], groups[val_idx]
        
        assert len(set(train_gr) & set(val_gr)) == 0
        
        distrs.append(get_distribution(train_y))
        distrs.append(get_distribution(val_y))
        
        index.append(f'train - fold{fold_ind}')
        index.append(f'val - fold{fold_ind}')
                    
    categories = [d for d in wd.keys()]

    print(pd.DataFrame(distrs, index=index, columns = [amounts[i] for i in range(3)]))

    for idx, (train_idx, valid_idx) in enumerate(cv.split(X,y, groups)):
        ICDAR_train_fold = dict(); ICDAR_valid_fold = dict()
        ICDAR_train_fold['images']={}; ICDAR_valid_fold['images']={}
        
        for i in range(len(groups[train_idx])):
            ICDAR_train_fold['images'][groups[train_idx][i]] = ICDAR_data['images'][groups[train_idx][i]]

        for i in range(len(groups[valid_idx])):
            ICDAR_valid_fold['images'][groups[valid_idx][i]] = ICDAR_data['images'][groups[valid_idx][i]]
        
        
        with open(f'../../../input/data/ICDAR19/ufo/ICDAR19_{language}_train_fold{idx}.json', 'w') as f:
            json.dump(ICDAR_train_fold, f, indent = 4)

        with open(f'../../../input/data/ICDAR19/ufo/ICDAR19_{language}_valid_fold{idx}.json', 'w') as f:
            json.dump(ICDAR_valid_fold, f, indent = 4)

In [35]:
cv = StratifiedGroupKFold(n_splits=2, shuffle = True, random_state=2022)
languages=['Latin','Arabic','Symbols','None','Chinese','Mixed','Japanese','Korean','Bangla','Hindi']

for idx in range(10):
    language = languages[idx]
    ICDAR_data = read_json(f"../../../input/data/ICDAR19/ufo/ICDAR19_{language}.json")
    ICDAR_image_df = making_df(ICDAR_data)
    add_word_counts_amount_to_df()
    check_ratio_and_dump(cv)

100%|██████████| 6982/6982 [09:57<00:00, 11.68it/s]
  hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]
  hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]


TRAIN: ['tr_img_07871.jpg' 'tr_img_05966.jpg' 'tr_img_01551.jpg' ...
 'tr_img_08382.jpg' 'tr_img_02829.jpg' 'tr_img_05995.jpg']
       ['many' 'few' 'few' ... 'normal' 'few' 'few']
TEST : ['tr_img_05011.jpg' 'tr_img_05173.jpg' 'tr_img_07920.jpg' ...
 'tr_img_01365.jpg' 'tr_img_06775.jpg' 'tr_img_05281.jpg']
       ['few' 'normal' 'few' ... 'few' 'many' 'normal']
TRAIN: ['tr_img_05011.jpg' 'tr_img_05173.jpg' 'tr_img_07920.jpg' ...
 'tr_img_01365.jpg' 'tr_img_06775.jpg' 'tr_img_05281.jpg']
       ['few' 'normal' 'few' ... 'few' 'many' 'normal']
TEST : ['tr_img_07871.jpg' 'tr_img_05966.jpg' 'tr_img_01551.jpg' ...
 'tr_img_08382.jpg' 'tr_img_02829.jpg' 'tr_img_05995.jpg']
       ['many' 'few' 'few' ... 'normal' 'few' 'few']
                  few  normal    many
training set   40.63%  26.81%  32.56%
train - fold0  41.19%  25.55%  33.26%
val - fold0    40.07%  28.07%  31.85%
train - fold1  40.07%  28.07%  31.85%
val - fold1    41.19%  25.55%  33.26%


100%|██████████| 1001/1001 [00:05<00:00, 184.97it/s]


TRAIN: ['tr_img_00831.jpg' 'tr_img_00213.jpg' 'tr_img_00113.jpg'
 'tr_img_00997.jpg' 'tr_img_00625.jpg' 'tr_img_00055.jpg'
 'tr_img_00887.jpg' 'tr_img_00848.jpg' 'tr_img_00013.jpg'
 'tr_img_00253.jpg' 'tr_img_00739.jpg' 'tr_img_00221.jpg'
 'tr_img_00295.jpg' 'tr_img_00509.jpg' 'tr_img_00627.png'
 'tr_img_00775.jpg' 'tr_img_00791.jpg' 'tr_img_00878.jpg'
 'tr_img_00194.jpg' 'tr_img_00617.jpg' 'tr_img_00985.jpg'
 'tr_img_00565.jpg' 'tr_img_00531.jpg' 'tr_img_00282.jpg'
 'tr_img_00528.jpg' 'tr_img_00020.jpg' 'tr_img_00377.jpg'
 'tr_img_00141.jpg' 'tr_img_00660.jpg' 'tr_img_00675.jpg'
 'tr_img_00267.jpg' 'tr_img_00305.jpg' 'tr_img_00459.jpg'
 'tr_img_00297.jpg' 'tr_img_00441.jpg' 'tr_img_00949.jpg'
 'tr_img_00633.jpg' 'tr_img_00861.jpg' 'tr_img_00453.jpg'
 'tr_img_00642.jpg' 'tr_img_00539.jpg' 'tr_img_00323.jpg'
 'tr_img_00600.jpg' 'tr_img_00555.jpg' 'tr_img_00753.jpg'
 'tr_img_00275.jpg' 'tr_img_00977.jpg' 'tr_img_00278.jpg'
 'tr_img_00313.jpg' 'tr_img_00751.jpg' 'tr_img_00169.jpg'
 'tr_im

100%|██████████| 1021/1021 [00:41<00:00, 24.53it/s]
  hor_aspect_ratio = [hor_sizes[i][1]/hor_sizes[i][0] for i in range(len(hor_sizes))]


TRAIN: ['tr_img_07871.jpg' 'tr_img_02350.jpg' 'tr_img_00055.jpg'
 'tr_img_01373.jpg' 'tr_img_01339.jpg' 'tr_img_01537.jpg'
 'tr_img_01679.jpg' 'tr_img_05670.jpg' 'tr_img_00295.jpg'
 'tr_img_06846.jpg' 'tr_img_01116.jpg' 'tr_img_09907.jpg'
 'tr_img_09944.jpg' 'tr_img_09335.png' 'tr_img_07653.jpg'
 'tr_img_04739.jpg' 'tr_img_02556.jpg' 'tr_img_09978.jpg'
 'tr_img_04548.jpg' 'tr_img_07913.jpg' 'tr_img_07048.jpg'
 'tr_img_05160.jpg' 'tr_img_06490.jpg' 'tr_img_01831.jpg'
 'tr_img_01748.jpg' 'tr_img_00274.jpg' 'tr_img_04590.jpg'
 'tr_img_04866.jpg' 'tr_img_07580.jpg' 'tr_img_02590.jpg'
 'tr_img_08571.jpg' 'tr_img_08311.jpg' 'tr_img_04408.jpg'
 'tr_img_05367.jpg' 'tr_img_00297.jpg' 'tr_img_05924.jpg'
 'tr_img_07564.jpg' 'tr_img_00842.jpg' 'tr_img_07435.jpg'
 'tr_img_08910.jpg' 'tr_img_04422.jpg' 'tr_img_05195.jpg'
 'tr_img_07806.jpg' 'tr_img_09872.jpg' 'tr_img_02464.jpg'
 'tr_img_04306.jpg' 'tr_img_04472.jpg' 'tr_img_09368.jpg'
 'tr_img_04070.jpg' 'tr_img_04186.jpg' 'tr_img_03691.jpg'
 'tr_im

100%|██████████| 2007/2007 [02:04<00:00, 16.11it/s]


TRAIN: ['tr_img_09763.jpg' 'tr_img_04645.jpg' 'tr_img_04813.jpg' ...
 'tr_img_07183.jpg' 'tr_img_08484.jpg' 'tr_img_05663.jpg']
       ['few' 'normal' 'few' ... 'many' 'normal' 'few']
TEST : ['tr_img_04541.jpg' 'tr_img_08398.jpg' 'tr_img_07871.jpg' ...
 'tr_img_05176.jpg' 'tr_img_06775.jpg' 'tr_img_05281.jpg']
       ['many' 'few' 'many' ... 'normal' 'many' 'normal']
TRAIN: ['tr_img_04541.jpg' 'tr_img_08398.jpg' 'tr_img_07871.jpg' ...
 'tr_img_05176.jpg' 'tr_img_06775.jpg' 'tr_img_05281.jpg']
       ['many' 'few' 'many' ... 'normal' 'many' 'normal']
TEST : ['tr_img_09763.jpg' 'tr_img_04645.jpg' 'tr_img_04813.jpg' ...
 'tr_img_07183.jpg' 'tr_img_08484.jpg' 'tr_img_05663.jpg']
       ['few' 'normal' 'few' ... 'many' 'normal' 'few']
                  few  normal    many
training set   41.50%  24.12%  34.38%
train - fold0  42.07%  21.83%  36.09%
val - fold0    40.94%  26.39%  32.67%
train - fold1  40.94%  26.39%  32.67%
val - fold1    42.07%  21.83%  36.09%


100%|██████████| 994/994 [00:02<00:00, 410.68it/s] 


TRAIN: ['tr_img_03799.jpg' 'tr_img_03955.jpg' 'tr_img_03730.jpg'
 'tr_img_03120.jpg' 'tr_img_03675.jpg' 'tr_img_03725.jpg'
 'tr_img_03873.jpg' 'tr_img_03571.jpg' 'tr_img_03224.jpg'
 'tr_img_03886.jpg' 'tr_img_03806.jpg' 'tr_img_03655.jpg'
 'tr_img_03026.jpg' 'tr_img_03186.jpg' 'tr_img_03266.jpg'
 'tr_img_03041.jpg' 'tr_img_03314.jpg' 'tr_img_03229.jpg'
 'tr_img_03625.jpg' 'tr_img_03040.jpg' 'tr_img_03150.jpg'
 'tr_img_03526.jpg' 'tr_img_03843.jpg' 'tr_img_03881.jpg'
 'tr_img_03794.jpg' 'tr_img_03373.jpg' 'tr_img_03786.jpg'
 'tr_img_03474.jpg' 'tr_img_03262.jpg' 'tr_img_03434.jpg'
 'tr_img_03127.jpg' 'tr_img_03718.jpg' 'tr_img_03717.jpg'
 'tr_img_03649.jpg' 'tr_img_03456.jpg' 'tr_img_03032.jpg'
 'tr_img_03691.jpg' 'tr_img_03510.jpg' 'tr_img_03521.jpg'
 'tr_img_03422.jpg' 'tr_img_03077.jpg' 'tr_img_03905.jpg'
 'tr_img_03789.jpg' 'tr_img_03201.jpg' 'tr_img_03826.jpg'
 'tr_img_03193.jpg' 'tr_img_03823.jpg' 'tr_img_03445.jpg'
 'tr_img_03932.jpg' 'tr_img_03327.jpg' 'tr_img_03641.jpg'
 'tr_im

100%|██████████| 167/167 [00:01<00:00, 145.60it/s]


TRAIN: ['tr_img_06897.jpg' 'tr_img_05381.jpg' 'tr_img_06314.jpg'
 'tr_img_05072.jpg' 'tr_img_06783.jpg' 'tr_img_03270.jpg'
 'tr_img_06135.jpg' 'tr_img_03834.jpg' 'tr_img_06019.jpg'
 'tr_img_06227.jpg' 'tr_img_05243.jpg' 'tr_img_06846.jpg'
 'tr_img_06408.jpg' 'tr_img_06982.jpg' 'tr_img_03987.jpg'
 'tr_img_06332.jpg' 'tr_img_03356.jpg' 'tr_img_03229.jpg'
 'tr_img_05009.jpg' 'tr_img_06554.jpg' 'tr_img_03581.jpg'
 'tr_img_06167.jpg' 'tr_img_06366.jpg' 'tr_img_05965.jpg'
 'tr_img_03717.jpg' 'tr_img_05556.jpg' 'tr_img_03719.jpg'
 'tr_img_05387.jpg' 'tr_img_06601.jpg' 'tr_img_06543.jpg'
 'tr_img_06828.jpg' 'tr_img_06819.jpg' 'tr_img_05192.jpg'
 'tr_img_06261.jpg' 'tr_img_03034.jpg' 'tr_img_06264.jpg'
 'tr_img_06956.jpg' 'tr_img_03160.jpg' 'tr_img_06536.jpg'
 'tr_img_06639.jpg' 'tr_img_03487.jpg' 'tr_img_03591.jpg'
 'tr_img_03721.jpg' 'tr_img_05730.jpg' 'tr_img_06488.jpg'
 'tr_img_06506.jpg' 'tr_img_05750.jpg' 'tr_img_03956.jpg'
 'tr_img_03963.jpg' 'tr_img_03237.jpg' 'tr_img_06211.jpg'
 'tr_im

100%|██████████| 1037/1037 [00:13<00:00, 76.85it/s]


TRAIN: ['tr_img_06375.jpg' 'tr_img_06756.jpg' 'tr_img_06070.jpg'
 'tr_img_06976.jpg' 'tr_img_06108.jpg' 'tr_img_06421.jpg'
 'tr_img_06252.jpg' 'tr_img_06094.jpg' 'tr_img_06721.jpg'
 'tr_img_06882.jpg' 'tr_img_06477.jpg' 'tr_img_06010.jpg'
 'tr_img_06693.jpg' 'tr_img_06226.jpg' 'tr_img_06157.jpg'
 'tr_img_06394.jpg' 'tr_img_06921.jpg' 'tr_img_06730.jpg'
 'tr_img_06304.jpg' 'tr_img_06389.jpg' 'tr_img_06332.jpg'
 'tr_img_06495.jpg' 'tr_img_06636.jpg' 'tr_img_06769.jpg'
 'tr_img_06814.jpg' 'tr_img_06737.jpg' 'tr_img_06381.jpg'
 'tr_img_06215.jpg' 'tr_img_06174.jpg' 'tr_img_06961.jpg'
 'tr_img_06345.jpg' 'tr_img_06372.jpg' 'tr_img_06310.jpg'
 'tr_img_06195.jpg' 'tr_img_06520.jpg' 'tr_img_06576.jpg'
 'tr_img_06716.jpg' 'tr_img_06922.jpg' 'tr_img_06278.jpg'
 'tr_img_06563.jpg' 'tr_img_06879.jpg' 'tr_img_06603.jpg'
 'tr_img_06999.jpg' 'tr_img_06299.jpg' 'tr_img_06867.jpg'
 'tr_img_06943.jpg' 'tr_img_06632.jpg' 'tr_img_06020.jpg'
 'tr_img_06164.jpg' 'tr_img_06670.jpg' 'tr_img_06294.jpg'
 'tr_im

100%|██████████| 1061/1061 [00:15<00:00, 68.62it/s]


TRAIN: ['tr_img_05011.jpg' 'tr_img_05811.jpg' 'tr_img_05966.jpg'
 'tr_img_05247.jpg' 'tr_img_05916.jpg' 'tr_img_05444.jpg'
 'tr_img_05058.jpg' 'tr_img_05465.jpg' 'tr_img_05184.jpg'
 'tr_img_05462.jpg' 'tr_img_05624.jpg' 'tr_img_05446.jpg'
 'tr_img_05160.jpg' 'tr_img_06490.jpg' 'tr_img_05093.jpg'
 'tr_img_05250.jpg' 'tr_img_05929.jpg' 'tr_img_05632.jpg'
 'tr_img_05282.jpg' 'tr_img_05861.jpg' 'tr_img_05188.jpg'
 'tr_img_05326.jpg' 'tr_img_05103.jpg' 'tr_img_05001.jpg'
 'tr_img_05074.jpg' 'tr_img_05486.jpg' 'tr_img_05965.jpg'
 'tr_img_05456.jpg' 'tr_img_05989.jpg' 'tr_img_05264.jpg'
 'tr_img_05588.jpg' 'tr_img_05213.jpg' 'tr_img_05180.jpg'
 'tr_img_05028.jpg' 'tr_img_05556.jpg' 'tr_img_05484.jpg'
 'tr_img_05683.jpg' 'tr_img_05196.jpg' 'tr_img_05225.jpg'
 'tr_img_05963.jpg' 'tr_img_06563.jpg' 'tr_img_06819.jpg'
 'tr_img_05626.jpg' 'tr_img_05232.jpg' 'tr_img_05414.jpg'
 'tr_img_05464.jpg' 'tr_img_05008.jpg' 'tr_img_05106.jpg'
 'tr_img_05241.jpg' 'tr_img_05825.jpg' 'tr_img_05777.jpg'
 'tr_im

100%|██████████| 995/995 [00:02<00:00, 454.38it/s] 


TRAIN: ['tr_img_08660.jpg' 'tr_img_08364.jpg' 'tr_img_08214.jpg'
 'tr_img_08247.jpg' 'tr_img_08185.jpg' 'tr_img_08436.jpg'
 'tr_img_08357.jpg' 'tr_img_08783.jpg' 'tr_img_08287.jpg'
 'tr_img_08868.jpg' 'tr_img_08929.jpg' 'tr_img_08955.jpg'
 'tr_img_08689.jpg' 'tr_img_08082.jpg' 'tr_img_08530.jpg'
 'tr_img_08301.jpg' 'tr_img_08687.jpg' 'tr_img_08031.jpg'
 'tr_img_08028.jpg' 'tr_img_08040.jpg' 'tr_img_08103.jpg'
 'tr_img_08860.jpg' 'tr_img_08769.jpg' 'tr_img_08234.jpg'
 'tr_img_08334.jpg' 'tr_img_08618.jpg' 'tr_img_08438.jpg'
 'tr_img_08571.jpg' 'tr_img_08605.jpg' 'tr_img_08733.jpg'
 'tr_img_08482.jpg' 'tr_img_08583.jpg' 'tr_img_08631.jpg'
 'tr_img_08517.jpg' 'tr_img_08321.jpg' 'tr_img_08577.jpg'
 'tr_img_08798.jpg' 'tr_img_08826.jpg' 'tr_img_08762.jpg'
 'tr_img_08780.jpg' 'tr_img_08528.jpg' 'tr_img_08799.jpg'
 'tr_img_08333.jpg' 'tr_img_08609.jpg' 'tr_img_08740.jpg'
 'tr_img_08465.jpg' 'tr_img_08897.jpg' 'tr_img_08298.jpg'
 'tr_img_08726.jpg' 'tr_img_08625.jpg' 'tr_img_08032.jpg'
 'tr_im

100%|██████████| 999/999 [00:01<00:00, 714.80it/s] 


TRAIN: ['tr_img_09534.jpg' 'tr_img_09339.jpg' 'tr_img_09258.jpg'
 'tr_img_09784.jpg' 'tr_img_09096.jpg' 'tr_img_09569.jpg'
 'tr_img_09944.jpg' 'tr_img_09768.jpg' 'tr_img_09296.jpg'
 'tr_img_09946.jpg' 'tr_img_09164.jpg' 'tr_img_09335.png'
 'tr_img_09505.jpg' 'tr_img_09822.jpg' 'tr_img_09412.jpg'
 'tr_img_09053.jpg' 'tr_img_09134.jpg' 'tr_img_09802.jpg'
 'tr_img_09123.jpg' 'tr_img_09978.jpg' 'tr_img_09179.jpg'
 'tr_img_09490.jpg' 'tr_img_09321.jpg' 'tr_img_09856.jpg'
 'tr_img_09593.jpg' 'tr_img_09121.jpg' 'tr_img_09474.jpg'
 'tr_img_09536.jpg' 'tr_img_09650.jpg' 'tr_img_09738.jpg'
 'tr_img_09905.png' 'tr_img_09932.jpg' 'tr_img_09448.jpg'
 'tr_img_09672.jpg' 'tr_img_09420.jpg' 'tr_img_09527.jpg'
 'tr_img_09243.jpg' 'tr_img_09408.jpg' 'tr_img_09415.jpg'
 'tr_img_09240.jpg' 'tr_img_09089.jpg' 'tr_img_09668.jpg'
 'tr_img_09838.jpg' 'tr_img_09646.png' 'tr_img_09390.jpg'
 'tr_img_09872.jpg' 'tr_img_09206.jpg' 'tr_img_09140.jpg'
 'tr_img_09194.jpg' 'tr_img_09552.jpg' 'tr_img_09686.jpg'
 'tr_im

# Merge df 관련

In [None]:
merge_df = pd.concat([ICDAR_image_df,UPSTAGE_image_df])