In [1]:
import json
import os
from collections import defaultdict

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Path to the train.json annotation file of the dataset.
annotation = '/opt/ml/input/data/ICDAR17_Korean/ufo/train.json'

# Decode explicitly as UTF-8 instead of relying on the locale's default
# encoding, which may not be UTF-8 on every machine.
with open(annotation, encoding="utf-8") as f:
    data = json.load(f)

In [21]:
# os.listdir returns entries in arbitrary order. Positions in this list are
# later used as image ids (the split groups), so sort to keep the ids, and
# therefore the folds, reproducible across runs and machines.
file_list = sorted(os.listdir('/opt/ml/input/data/ICDAR17_Korean/images'))

In [26]:
# Keep only the directory entries whose extension is exactly ".jpg".
file_list = [
    name
    for name in file_list
    if os.path.splitext(name)[-1] == ".jpg"
]

In [30]:
# Show how many entries file_list currently holds.
len(file_list)

536

In [56]:
# Language codes in class-id order: a code's position in this list is its id.
get_category = ["ko", "en"]
# Reverse lookup (language code -> class id), derived from the list above.
get_id = {code: class_id for class_id, code in enumerate(get_category)}

In [51]:
# One (image index, language id) pair per word annotation. The image index is
# the position of the file in file_list; the language id is looked up from the
# first entry of the word's 'language' field.
var = [
    (image_idx, get_id[word['language'][0]])
    for image_idx, file_name in enumerate(file_list)
    for word in data["images"][file_name]["words"].values()
]

X = np.ones((len(var), 1))                             # dummy features, one row per word annotation
y = np.array([lang_id for _, lang_id in var])          # category id of each annotation
groups = np.array([image_idx for image_idx, _ in var]) # image id of each annotation


# 5-fold splitter that stratifies on y while keeping each group (image) on a
# single side of every fold. The integer seed makes the shuffle repeatable.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=41)

# Preview the first 10 image ids / category ids on each side of every fold.
for train_idx, val_idx in cv.split(X, y, groups):
    for tag, part_idx in (("TRAIN:", train_idx), (" TEST:", val_idx)):
        print(tag, groups[part_idx][:10])   # image_id
        print(" ", y[part_idx][:10])        # category_id

TRAIN: [0 0 0 0 0 0 3 3 3 3]
  [1 1 1 1 0 0 0 0 0 0]
 TEST: [1 1 1 1 1 1 2 2 6 6]
  [0 0 0 1 1 1 1 0 0 0]
TRAIN: [0 0 0 0 0 0 1 1 1 1]
  [1 1 1 1 0 0 0 0 0 1]
 TEST: [ 7  7  7 12 12 12 16 16 16 16]
  [1 1 0 0 0 0 0 0 0 0]
TRAIN: [0 0 0 0 0 0 1 1 1 1]
  [1 1 1 1 0 0 0 0 0 1]
 TEST: [5 8 8 8 8 8 8 8 8 8]
  [0 0 0 0 0 0 0 1 1 1]
TRAIN: [1 1 1 1 1 1 2 2 3 3]
  [0 0 0 1 1 1 1 0 0 0]
 TEST: [ 0  0  0  0  0  0 14 14 14 14]
  [1 1 1 1 0 0 0 0 1 0]
TRAIN: [0 0 0 0 0 0 1 1 1 1]
  [1 1 1 1 0 0 0 0 0 1]
 TEST: [3 3 3 3 3 3 3 3 3 3]
  [0 0 0 0 0 0 0 0 1 0]


In [43]:
from collections import Counter
import pandas as pd

In [52]:
def get_distribution(y, n_classes=None):
    """Return the share of each class id in ``y`` as percentage strings.

    Args:
        y: 1-D sequence or array of non-negative integer class ids.
        n_classes: Number of classes to report. When omitted it defaults to
            ``max(y) + 1``. Pass it explicitly when ``y`` is a subset that may
            not contain the highest class id, so every call returns a list of
            the same length.

    Returns:
        A list of strings formatted with ``.2%``; element ``i`` is the share of
        class ``i``. For empty ``y`` every reported class is ``'0.00%'`` (and
        the list itself is empty when ``n_classes`` is not given).
    """
    labels = np.asarray(y)
    total = labels.size

    if n_classes is None:
        # max() of an empty array raises, so treat "no labels" as "no classes".
        n_classes = int(labels.max()) + 1 if total else 0

    if total == 0:
        # Avoid dividing by zero when there is nothing to count.
        return ['0.00%'] * n_classes

    # Count plain Python ints so lookups by range() indices always match.
    counts = Counter(labels.tolist())
    return [f'{counts[i] / total:.2%}' for i in range(n_classes)]

In [53]:
# First table row: class balance over all annotations, before any fold split.
index = ['training set']
distrs = [get_distribution(y)]

In [60]:
# Rebuild the table rows from scratch so that re-running this cell does not
# append duplicate fold rows to lists created in an earlier cell.
distrs = [get_distribution(y)]
index = ['training set']

# cv uses a fixed integer random_state, so this yields the same folds as every
# other cv.split(X, y, groups) call in the notebook.
for fold_ind, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    train_y, val_y = y[train_idx], y[val_idx]
    train_gr, val_gr = groups[train_idx], groups[val_idx]

    # Sanity check: no image id may appear on both sides of a fold.
    assert len(set(train_gr) & set(val_gr)) == 0, \
        f'fold{fold_ind}: some images are in both train and val'

    distrs.append(get_distribution(train_y))
    distrs.append(get_distribution(val_y))
    index.append(f'train - fold{fold_ind}')
    index.append(f'val - fold{fold_ind}')

In [62]:
# Tabulate the per-fold class shares: one row per entry of `index`, one column
# per class id up to the largest id present in y.
pd.DataFrame(
    distrs,
    index=index,
    columns=[get_category[class_id] for class_id in range(np.max(y) + 1)],
)

Unnamed: 0,ko,en
training set,72.24%,27.76%
train - fold0,72.51%,27.49%
val - fold0,71.22%,28.78%
train - fold1,72.65%,27.35%
val - fold1,70.26%,29.74%
train - fold2,74.32%,25.68%
val - fold2,66.22%,33.78%
train - fold3,71.31%,28.69%
val - fold3,76.76%,23.24%
train - fold4,70.60%,29.40%


In [63]:
# Output path prefix ({dataset path}/K-fold); a "_{train|val}{fold number}.json"
# suffix is appended to it when the split files are written.
output_filename = "/opt/ml/input/data/ICDAR17_Korean/ufo/K-fold"

In [65]:
# Write one train/val annotation file pair per fold. cv uses a fixed integer
# random_state, so these are the same folds as every other cv.split call here.
output_files = []  # every path written, in write order (no longer reset per fold)
for idx, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
    # groups holds one image id per *word*, so the same id repeats many times.
    # dict.fromkeys de-duplicates while keeping first-appearance order, which
    # gives the same dicts as before without re-copying an image once per word.
    train_ids = dict.fromkeys(groups[train_idx].tolist())
    val_ids = dict.fromkeys(groups[val_idx].tolist())

    train_images = {file_list[i]: data["images"][file_list[i]].copy() for i in train_ids}
    val_images = {file_list[i]: data["images"][file_list[i]].copy() for i in val_ids}

    train_split = {
            "images": train_images
        }

    val_split = {
            "images": val_images
        }

    for split_type, split in zip(["train", "val"], [train_split, val_split]):
        # Folds are numbered from 1 in the file names.
        output_files.append(output_filename + f"_{split_type}{idx+1}.json")
        with open(output_files[-1], "w") as f:
            json.dump(split, f, indent=2)

print("Split Done !")


Split Done !
