In [35]:
%matplotlib inline
import glob
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
from PIL import Image
from collections import Counter
import numpy as np

In [36]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [37]:
def read_json(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return its content."""
    raw_text = Path(filename).read_text(encoding='utf8')
    return json.loads(raw_text)

In [38]:
# Load the two annotation JSON files (ICDAR17 Korean first, then Upstage).
ICDAR_data, UPSTAGE_data = (
    read_json(annotation_path)
    for annotation_path in (
        "/opt/ml/input/data/ICDAR17_Korean/ufo/train.json",
        "/opt/ml/input/data/upstage/ufo/anootation.json",
    )
)

In [39]:
def get_box_size(quads):
    """Compute the height and width of word quadrilaterals.

    Each dimension is the mean of the two opposite side lengths, rounded to the
    nearest integer, so boxes that are far from rectangular are only approximated.

    Args:
        quads: np.ndarray(n, 4, 2), the 4 corner points of n word bounding boxes.
    Return:
        sizes: np.ndarray(n, 2) of ints, one (height, width) pair per box.
    """
    def side_length(start, end):
        # Euclidean distance between two corners, for all n boxes at once.
        return np.linalg.norm(quads[:, start] - quads[:, end], ord=2, axis=1)

    # Right and left sides give the height; upper and lower sides give the width.
    heights = np.stack([side_length(1, 2), side_length(3, 0)], axis=-1)
    widths = np.stack([side_length(0, 1), side_length(2, 3)], axis=-1)

    mean_sizes = np.stack([heights.mean(axis=-1), widths.mean(axis=-1)], axis=-1)
    return np.rint(mean_sizes).astype(int)

In [40]:
def rectify_poly(poly, direction, img_w, img_h):
    """Measure a word polygon and return its size relative to the image size.

    A 4-point polygon is measured directly with get_box_size. A longer polygon
    is split into adjacent quadrilaterals, each is measured with get_box_size,
    and the per-quadrilateral sizes are combined according to ``direction``.

    Args:
        poly: np.ndarray(2n+4, 2) (n >= 0), i.e. 4, 6, 8, ... points. The number
            of points must be even (checked with an assert).
        direction: reading direction of the word. 'Horizontal' and 'Vertical'
            are handled explicitly; any other value falls into the last branch.
        img_w: image width; every width is divided by it.
        img_h: image height; every height is divided by it.
    Return:
        np.ndarray of sizes divided by the image size:
        * 4-point polygon (any direction): shape (2, 1), rows are [h], [w].
        * 'Horizontal': shape (2, 1); h is the largest quadrilateral height,
          w is the sum of the quadrilateral widths.
        * 'Vertical': shape (2, 1); h is the sum of the quadrilateral heights,
          w is the largest quadrilateral width.
        * any other direction: shape (k, 2), one (h, w) row per quadrilateral.
    """

    n_pts = poly.shape[0]
    assert n_pts % 2 == 0
    if n_pts == 4:
        # Plain quadrilateral: measure it as a batch of one.
        size = get_box_size(poly[None])
        h = size[:, 0] / img_h
        w = size[:, 1] / img_w
        return np.stack((h,w))

    def unroll(indices):
        # Pair every index with its successor: [a, b, c] -> [(a, b), (b, c)].
        return list(zip(indices[:-1], indices[1:]))

    # Split the polygon into several adjacent quadrilaterals.
    indices = list(range(n_pts))
    if direction == 'Horizontal':
        upper_pts = unroll(indices[:n_pts // 2]) # e.g. 10 points: (0, 1), (1, 2), ... (3, 4)
        lower_pts = unroll(indices[n_pts // 2:])[::-1] # e.g. 10 points: (8, 9), (7, 8), ... (5, 6)

        quads = np.stack([poly[[i, j, k, l]] for (i, j), (k, l) in zip(upper_pts, lower_pts)])
    else:
        # Used for 'Vertical' and for every other non-horizontal direction.
        right_pts = unroll(indices[1:n_pts // 2 + 1]) # e.g. 10 points: (1, 2), (2, 3), ... (4, 5)
        left_pts = unroll([0] + indices[:n_pts // 2:-1]) # e.g. 10 points: (0, 9), (9, 8), ... (7, 6)

        quads = np.stack([poly[[i, j, k, l]] for (j, k), (i, l) in zip(right_pts, left_pts)])

    sizes = get_box_size(quads)
    if direction == 'Horizontal':
        # Quadrilaterals sit side by side: widths add up, height is the tallest one.
        h = sizes[:, 0].max() / img_h
        widths = sizes[:, 1]
        w = np.sum(widths) / img_w
        return np.stack((h,w)).reshape(2,-1)
    elif direction == 'Vertical':
        # Quadrilaterals are stacked: heights add up, width is the widest one.
        heights = sizes[:, 0]
        w = sizes[:, 1].max() / img_w
        h = np.sum(heights) / img_h
        return np.stack((h,w)).reshape(2,-1)
    else:
        # NOTE(review): this branch returns one (h, w) row per quadrilateral,
        # i.e. a different layout than the (2, 1) result of the branches above.
        h = sizes[:, 0] / img_h
        w = sizes[:, 1] / img_w
        return np.stack((h,w),-1)

In [41]:
# Collect per-image and per-word statistics from the ICDAR annotations.
# All module-level names of the original cell are kept (including the lists
# this cell never fills) so that other cells relying on them keep working.
df = {
    'image': [],
    'word_counts': [],     # number of legible words per image
    'image_width': [],
    'image_height': [],
    'image_tags': [],
}
img_tags = []

quads = []
polys = []
seq_length = []            # transcription length of every legible word
hor_sizes = []             # rectify_poly results of 'Horizontal' words
ver_sizes = []             # rectify_poly results of 'Vertical' words
irr_sizes = []             # rectify_poly results of all other words
languages = []
orientation = []
word_tags = []
aspect_ratio = []
ver_string = []

for image_key, image_value in ICDAR_data["images"].items():
    img_w = image_value['img_w']
    img_h = image_value['img_h']
    image_tags = image_value['tags']

    df['image'].append(image_key)
    df['image_width'].append(img_w)
    df['image_height'].append(img_h)
    # Untagged images are stored as ['None'] so the column never holds a bare None.
    # Normalising at append time avoids rebuilding the whole list on every image.
    df['image_tags'].append(['None'] if image_tags is None else image_tags)

    word_ann = image_value['words']
    legible_count = 0
    for word in word_ann.values():
        # Explicit comparison kept on purpose: only words flagged exactly as
        # not illegible are analysed; anything else counts as illegible.
        if word['illegibility'] == False:
            legible_count += 1
            orientation.append(word['orientation'])
            seq_length.append(len(word['transcription']))
            languages.append(['None'] if word['language'] is None else word['language'])

            # The ICDAR records store word-level tags under 'word_tags'; read the
            # same key that is tested (the original read word['tags'] here).
            tags = word['word_tags']
            if tags is None:
                word_tags.append('None')
            else:
                word_tags.extend(tags)

            poly = np.int32(word['points'])
            size = rectify_poly(poly, word['orientation'], img_w, img_h)
            if word['orientation'] == 'Horizontal':
                hor_sizes.append(size)
            elif word['orientation'] == 'Vertical':
                ver_sizes.append(size)
            else:
                irr_sizes.append(size)
    df['word_counts'].append(legible_count)

all_sizes = hor_sizes + ver_sizes + irr_sizes

# rectify_poly returns either a (2, 1) array ([h], [w]) or, for irregular
# polygons with more than 4 points, a (k, 2) array with one (h, w) row per
# quadrilateral. Multiply h by w in both layouts; the original multiplied
# row 0 by row 1, which is wrong for the (k, 2) layout.
quad_area = []
for box_size in all_sizes:
    if box_size.shape[1] == 1:
        quad_area.append(box_size[0] * box_size[1])
    else:
        quad_area.append(box_size[:, 0] * box_size[:, 1])
total_area = [np.sum(area) for area in quad_area]

# width / height of every horizontal and vertical word.
hor_aspect_ratio = [box_size[1] / box_size[0] for box_size in hor_sizes]
ver_aspect_ratio = [box_size[1] / box_size[0] for box_size in ver_sizes]

ICDAR_image_df = pd.DataFrame.from_dict(df)

In [42]:
# Collect per-image and per-word statistics from the Upstage annotations.
# NOTE: this re-initialises the same module-level lists the ICDAR cell fills,
# so after this cell they describe the Upstage data only.
# All module-level names of the original cell are kept (including the lists
# this cell never fills) so that other cells relying on them keep working.
df = {
    'image': [],
    'word_counts': [],     # number of legible words per image
    'image_width': [],
    'image_height': [],
    'image_tags': [],
}
img_tags = []

quads = []
polys = []
seq_length = []            # transcription length of every legible word
hor_sizes = []             # rectify_poly results of 'Horizontal' words
ver_sizes = []             # rectify_poly results of 'Vertical' words
irr_sizes = []             # rectify_poly results of all other words
languages = []
orientation = []
word_tags = []
aspect_ratio = []
ver_string = []

for image_key, image_value in UPSTAGE_data["images"].items():
    img_w = image_value['img_w']
    img_h = image_value['img_h']
    image_tags = image_value['tags']

    df['image'].append(image_key)
    df['image_width'].append(img_w)
    df['image_height'].append(img_h)
    # Untagged images are stored as ['None'] so the column never holds a bare None.
    # Normalising at append time avoids rebuilding the whole list on every image.
    df['image_tags'].append(['None'] if image_tags is None else image_tags)

    word_ann = image_value['words']
    legible_count = 0
    for word in word_ann.values():
        # Explicit comparison kept on purpose: only words flagged exactly as
        # not illegible are analysed; anything else counts as illegible.
        if word['illegibility'] == False:
            legible_count += 1
            orientation.append(word['orientation'])
            seq_length.append(len(word['transcription']))
            languages.append(['None'] if word['language'] is None else word['language'])

            tags = word['tags']
            if tags is None:
                word_tags.append('None')
            else:
                word_tags.extend(tags)

            poly = np.int32(word['points'])
            size = rectify_poly(poly, word['orientation'], img_w, img_h)
            if word['orientation'] == 'Horizontal':
                hor_sizes.append(size)
            elif word['orientation'] == 'Vertical':
                ver_sizes.append(size)
            else:
                irr_sizes.append(size)
    df['word_counts'].append(legible_count)

all_sizes = hor_sizes + ver_sizes + irr_sizes

# rectify_poly returns either a (2, 1) array ([h], [w]) or, for irregular
# polygons with more than 4 points, a (k, 2) array with one (h, w) row per
# quadrilateral. Multiply h by w in both layouts; the original multiplied
# row 0 by row 1, which is wrong for the (k, 2) layout.
quad_area = []
for box_size in all_sizes:
    if box_size.shape[1] == 1:
        quad_area.append(box_size[0] * box_size[1])
    else:
        quad_area.append(box_size[:, 0] * box_size[:, 1])
total_area = [np.sum(area) for area in quad_area]

# width / height of every horizontal and vertical word.
hor_aspect_ratio = [box_size[1] / box_size[0] for box_size in hor_sizes]
ver_aspect_ratio = [box_size[1] / box_size[0] for box_size in ver_sizes]

UPSTAGE_image_df = pd.DataFrame.from_dict(df)

In [43]:
# Stack both per-image tables into one frame. ignore_index=True gives the result
# a fresh 0..n-1 index; without it the row labels of the two inputs repeat, so a
# label-based lookup such as merge_df.loc[0] would return two rows.
merge_df = pd.concat([ICDAR_image_df, UPSTAGE_image_df], ignore_index=True)

In [44]:
# Summary statistics of the numeric columns of the merged image table.
merge_df.describe()

Unnamed: 0,word_counts,image_width,image_height
count,1510.0,1510.0,1510.0
mean,16.417881,2864.386093,2981.343709
std,28.465753,986.315138,1081.069312
min,0.0,409.0,351.0
25%,3.0,2328.0,2268.0
50%,8.0,3024.0,3024.0
75%,17.0,3120.0,4032.0
max,578.0,9248.0,9248.0


In [45]:
# Peek at the first rows of the merged image table.
merge_df.head()

Unnamed: 0,image,word_counts,image_width,image_height,image_tags
0,img_4380.jpg,3,1836,2448,[None]
1,img_4583.jpg,2,2268,2268,[None]
2,img_4234.jpg,7,2592,3456,[None]
3,img_4345.jpg,7,1836,2448,[None]
4,img_4016.jpg,1,1836,2448,[None]


In [46]:
# Per-column row count of the images with at most 5 counted words.
merge_df[merge_df['word_counts']<=5].count()

image           594
word_counts     594
image_width     594
image_height    594
image_tags      594
dtype: int64

In [47]:
# Per-column row count of the images with at most 13 counted words.
merge_df[merge_df['word_counts']<=13].count()

image           1015
word_counts     1015
image_width     1015
image_height    1015
image_tags      1015
dtype: int64

In [48]:
# Per-column row count of the images with more than 13 counted words.
merge_df[merge_df['word_counts']>13].count()

image           495
word_counts     495
image_width     495
image_height    495
image_tags      495
dtype: int64

merge_df['word_counts']<=5 : 594
5 < merge_df['word_counts'] <= 13 : 1015 - 594 = 421
merge_df['word_counts'] > 13 : 495

In [55]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.7/33.7 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 scipy-1.9.3 threadpoolctl-3.1.0
[0m

In [56]:
import sklearn

In [57]:
from sklearn.model_selection import train_test_split,StratifiedGroupKFold

In [58]:
# Bucket every image by its word count: <= 5 -> 'few', <= 13 -> 'normal',
# anything else -> 'many'. The same rule is applied to both image tables.
for image_df in (ICDAR_image_df, UPSTAGE_image_df):
    counts = image_df["word_counts"].to_numpy()
    # np.select takes the first matching condition, so the order matters.
    image_df['word_counts_amount'] = np.select(
        [counts <= 5, counts <= 13],
        ['few', 'normal'],
        default='many',
    )

In [66]:
# Look up the image name stored under row label 0.
ICDAR_image_df.loc[0,'image']

'img_4380.jpg'

In [67]:
cv = StratifiedGroupKFold(n_splits=5, shuffle = True, random_state=2022)

# (image name, word-count bucket) of every ICDAR image, in row order.
var = list(zip(ICDAR_image_df['image'], ICDAR_image_df['word_counts_amount']))

# The splitter only needs labels and groups; X is a placeholder of matching length.
X = np.ones(len(var))
y = np.array([amount for _, amount in var])
groups = np.array([image_name for image_name, _ in var])

# Print the image names and bucket labels of both sides of every fold.
for train_idx, test_idx in cv.split(X, y, groups):
    for header, fold_idx in (("TRAIN:", train_idx), ("TEST :", test_idx)):
        print(header, groups[fold_idx])
        print("      ", y[fold_idx])

TRAIN: ['img_4380.jpg' 'img_4583.jpg' 'img_4234.jpg' 'img_4345.jpg'
 'img_4016.jpg' 'img_4273.jpg' 'img_4366.jpg' 'img_4290.jpg'
 'img_4508.jpg' 'img_4322.jpg' 'img_4465.jpg' 'img_4389.jpg'
 'img_4694.jpg' 'img_4724.jpg' 'img_4728.jpg' 'img_4782.jpg'
 'img_4570.jpg' 'img_4251.jpg' 'img_4443.jpg' 'img_4705.jpg'
 'img_4084.jpg' 'img_4769.jpg' 'img_4556.jpg' 'img_4615.jpg'
 'img_4129.jpg' 'img_4279.jpg' 'img_4500.jpg' 'img_4127.jpg'
 'img_4520.jpg' 'img_4243.jpg' 'img_4104.jpg' 'img_4758.jpg'
 'img_4415.jpg' 'img_4721.jpg' 'img_4732.jpg' 'img_4155.jpg'
 'img_4607.jpg' 'img_4085.jpg' 'img_4632.jpg' 'img_4235.jpg'
 'img_4094.jpg' 'img_4617.jpg' 'img_4331.jpg' 'img_4416.jpg'
 'img_4073.jpg' 'img_4463.jpg' 'img_4267.jpg' 'img_4738.jpg'
 'img_4437.jpg' 'img_4603.jpg' 'img_4767.jpg' 'img_4128.jpg'
 'img_4482.jpg' 'img_4180.jpg' 'img_4676.jpg' 'img_4233.jpg'
 'img_4450.jpg' 'img_4543.jpg' 'img_4282.jpg' 'img_4651.jpg'
 'img_4361.jpg' 'img_4075.jpg' 'img_4781.jpg' 'img_4364.jpg'
 'img_4054.jpg' '

In [68]:
# Map every word-count bucket label (in sorted order) to an integer code.
bucket_labels = sorted(ICDAR_image_df['word_counts_amount'].unique())
wd = dict(zip(bucket_labels, range(len(bucket_labels))))
wd

{'few': 0, 'many': 1, 'normal': 2}

In [84]:
# Bucket labels in the key order of wd.
categories = list(wd)
categories

['few', 'many', 'normal']

In [82]:
amounts = ['few','normal','many']
def get_distribution(y):
    """Return the share of each label in ``amounts`` among ``y``.

    Args:
        y: iterable of labels.
    Return:
        list of strings, one percentage with two decimals (e.g. '55.78%') per
        entry of the module-level ``amounts`` list, in that order.
    """
    tally = Counter(y)
    total = sum(tally.values())

    shares = []
    for label in amounts:
        # Counter returns 0 for labels that never occur.
        shares.append(format(tally[label] / total, '.2%'))
    return shares

# Bucket shares over all labels in y, in the order of `amounts`.
get_distribution(y)


['55.78%', '33.40%', '10.82%']

In [85]:
# Compare the bucket distribution of every train/validation fold with the
# distribution of the full label set. `amounts` and `get_distribution` come from
# the cell that defines them earlier in the notebook; redefining them here only
# shadowed identical definitions.
cv = StratifiedGroupKFold(n_splits=5, shuffle = True, random_state=2022)

distrs = [get_distribution(y)]
index = ['training set']

for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # No group (image) may appear on both sides of a fold.
    assert set(train_gr).isdisjoint(val_gr)

    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))

    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

# One row per data set, one column per bucket label.
pd.DataFrame(distrs, index=index, columns=list(amounts))

Unnamed: 0,few,normal,many
training set,55.78%,33.40%,10.82%
train - fold0,55.37%,34.58%,10.05%
val - fold0,57.41%,28.70%,13.89%
train - fold1,58.51%,31.24%,10.26%
val - fold1,44.86%,42.06%,13.08%
train - fold2,54.08%,34.27%,11.66%
val - fold2,62.62%,29.91%,7.48%
train - fold3,55.24%,33.33%,11.42%
val - fold3,57.94%,33.64%,8.41%
train - fold4,55.71%,33.57%,10.72%


In [86]:
# Display the ICDAR per-image table.
ICDAR_image_df

Unnamed: 0,image,word_counts,image_width,image_height,image_tags,word_counts_amount
0,img_4380.jpg,3,1836,2448,[None],few
1,img_4583.jpg,2,2268,2268,[None],few
2,img_4234.jpg,7,2592,3456,[None],normal
3,img_4345.jpg,7,1836,2448,[None],normal
4,img_4016.jpg,1,1836,2448,[None],few
...,...,...,...,...,...,...
531,img_1048.jpg,4,3024,2268,[None],few
532,img_1071.jpg,2,2448,1836,[None],few
533,img_1122.jpg,8,2448,1836,[None],normal
534,img_1131.jpg,1,3144,2328,[None],few


In [127]:
# Print every key of the 'images' mapping of the ICDAR annotations, one per line.
for image in ICDAR_data['images']:
    print(image)

img_4380.jpg
img_4583.jpg
img_4234.jpg
img_4345.jpg
img_4016.jpg
img_4273.jpg
img_4609.jpg
img_4072.jpg
img_4366.jpg
img_4712.jpg
img_4249.jpg
img_4290.jpg
img_4508.jpg
img_4322.jpg
img_4465.jpg
img_4389.jpg
img_4694.jpg
img_4384.jpg
img_4724.jpg
img_4728.jpg
img_4782.jpg
img_4570.jpg
img_4413.jpg
img_4251.jpg
img_4443.jpg
img_4705.jpg
img_4084.jpg
img_4769.jpg
img_4556.jpg
img_4615.jpg
img_4129.jpg
img_4279.jpg
img_4500.jpg
img_4127.jpg
img_4520.jpg
img_4199.jpg
img_4243.jpg
img_4104.jpg
img_4758.jpg
img_4415.jpg
img_4721.jpg
img_4289.jpg
img_4732.jpg
img_4139.jpg
img_4155.jpg
img_4607.jpg
img_4085.jpg
img_4632.jpg
img_4235.jpg
img_4094.jpg
img_4617.jpg
img_4331.jpg
img_4416.jpg
img_4073.jpg
img_4352.jpg
img_4463.jpg
img_4267.jpg
img_4738.jpg
img_4795.jpg
img_4437.jpg
img_4603.jpg
img_4767.jpg
img_4128.jpg
img_4482.jpg
img_4180.jpg
img_4676.jpg
img_4233.jpg
img_4450.jpg
img_4258.jpg
img_4408.jpg
img_4543.jpg
img_4282.jpg
img_4651.jpg
img_4361.jpg
img_4075.jpg
img_4781.jpg
img_4364.jpg

In [149]:
# Inspect the annotation record of one image, then the whole 'images' mapping.
ICDAR_data['images']['img_4380.jpg']
ICDAR_data['images']

{'img_h': 2448,
 'img_w': 1836,
 'words': {'0': {'points': [[662.0, 747.0],
    [945.0, 759.0],
    [922.0, 1582.0],
    [673.0, 1565.0]],
   'transcription': '출입금지',
   'language': ['ko'],
   'illegibility': False,
   'orientation': 'Horizontal',
   'word_tags': None},
  '1': {'points': [[476.0, 551.0],
    [1132.0, 554.0],
    [1118.0, 747.0],
    [471.0, 716.0]],
   'transcription': '오토바이',
   'language': ['ko'],
   'illegibility': False,
   'orientation': 'Horizontal',
   'word_tags': None},
  '2': {'points': [[455.0, 293.0],
    [1144.0, 310.0],
    [1129.0, 518.0],
    [457.0, 496.0]],
   'transcription': '자전거',
   'language': ['ko'],
   'illegibility': False,
   'orientation': 'Horizontal',
   'word_tags': None}},
 'tags': None,
 'license_tag': {'usability': True,
  'public': True,
  'commercial': True,
  'type': 'CC-BY-SA',
  'holder': None}}

{'img_4380.jpg': {'img_h': 2448,
  'img_w': 1836,
  'words': {'0': {'points': [[662.0, 747.0],
     [945.0, 759.0],
     [922.0, 1582.0],
     [673.0, 1565.0]],
    'transcription': '출입금지',
    'language': ['ko'],
    'illegibility': False,
    'orientation': 'Horizontal',
    'word_tags': None},
   '1': {'points': [[476.0, 551.0],
     [1132.0, 554.0],
     [1118.0, 747.0],
     [471.0, 716.0]],
    'transcription': '오토바이',
    'language': ['ko'],
    'illegibility': False,
    'orientation': 'Horizontal',
    'word_tags': None},
   '2': {'points': [[455.0, 293.0],
     [1144.0, 310.0],
     [1129.0, 518.0],
     [457.0, 496.0]],
    'transcription': '자전거',
    'language': ['ko'],
    'illegibility': False,
    'orientation': 'Horizontal',
    'word_tags': None}},
  'tags': None,
  'license_tag': {'usability': True,
   'public': True,
   'commercial': True,
   'type': 'CC-BY-SA',
   'holder': None}},
 'img_4583.jpg': {'img_h': 2268,
  'img_w': 2268,
  'words': {'0': {'points': [[323.0

In [130]:
# Print the number of validation samples of every fold.
for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups)):
    print(len(valid_idx))

108
107
107
107
107


In [None]:
# Scratch check: for every fold, print the annotation record of the first
# training image. The inner loop breaks after its first pass, so that same
# record is printed a second time.
for idx, (train_idx, valid_idx) in enumerate(cv.split(X,y, groups)):
    #[image for image in ICDAR_data['images'] if image['id'] in set(groups[train_idx])]
    print(ICDAR_data['images'][groups[train_idx][0]])
    for i in range(len(groups[train_idx])):
        print(ICDAR_data['images'][groups[train_idx][i]])
        break
        

In [157]:
# Scratch experiment: build the {'images': {...}} layout of a fold file with a
# single placeholder key. The loop stops after its first iteration.
cv = StratifiedGroupKFold(n_splits=5, shuffle = True, random_state=2022)

for idx, (train_idx, valid_idx) in enumerate(cv.split(X,y, groups)):
    ICDAR_train_fold = dict();ICDAR_valid_fold = dict()
    ICDAR_train_fold['images']={};ICDAR_valid_fold['images']={}
    ICDAR_train_fold['images']['blblblajpg']= ICDAR_data['images']['img_4380.jpg']
    break
ICDAR_train_fold

{'images': {'blblblajpg': {'img_h': 2448,
   'img_w': 1836,
   'words': {'0': {'points': [[662.0, 747.0],
      [945.0, 759.0],
      [922.0, 1582.0],
      [673.0, 1565.0]],
     'transcription': '출입금지',
     'language': ['ko'],
     'illegibility': False,
     'orientation': 'Horizontal',
     'word_tags': None},
    '1': {'points': [[476.0, 551.0],
      [1132.0, 554.0],
      [1118.0, 747.0],
      [471.0, 716.0]],
     'transcription': '오토바이',
     'language': ['ko'],
     'illegibility': False,
     'orientation': 'Horizontal',
     'word_tags': None},
    '2': {'points': [[455.0, 293.0],
      [1144.0, 310.0],
      [1129.0, 518.0],
      [457.0, 496.0]],
     'transcription': '자전거',
     'language': ['ko'],
     'illegibility': False,
     'orientation': 'Horizontal',
     'word_tags': None}},
   'tags': None,
   'license_tag': {'usability': True,
    'public': True,
    'commercial': True,
    'type': 'CC-BY-SA',
    'holder': None}}}}

In [154]:
# Scratch check of the nested dict layout; 'train' is assigned a one-element set.
temp =dict()
#temp['images'] =[ICDAR_data['images'][groups[train_idx][i]] for i in range(len(groups[train_idx]))]
temp['images']={}
temp['images']['train']= {'hi'}
temp

{'images': {'train': {'hi'}}}

In [159]:
# Write one train and one validation annotation file per fold. Each file has the
# same {'images': {image name: record}} layout as the source annotations.
cv = StratifiedGroupKFold(n_splits=5, shuffle = True, random_state=2022)

for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups)):
    ICDAR_train_fold = {
        'images': {name: ICDAR_data['images'][name] for name in groups[train_idx]}
    }
    ICDAR_valid_fold = {
        'images': {name: ICDAR_data['images'][name] for name in groups[valid_idx]}
    }

    with open(f'ICDAR_train_fold{idx}.json', 'w') as f:
        json.dump(ICDAR_train_fold, f, indent = 4)

    with open(f'ICDAR_valid_fold{idx}.json', 'w') as f:
        json.dump(ICDAR_valid_fold, f, indent = 4)