In [None]:
import os
import json
import shutil

from datasets import load_dataset

In [None]:
# Load the CORD-v2 dataset from the Hugging Face hub.
dataset = load_dataset("naver-clova-ix/cord-v2")

# Pull the image column out of every split (train / validation / test).
train_img_data, val_img_data, test_img_data = (
    dataset[split]['image'] for split in ('train', 'validation', 'test')
)

# Pull the matching ground-truth column out of every split.
train_json_data, val_json_data, test_json_data = (
    dataset[split]['ground_truth'] for split in ('train', 'validation', 'test')
)

CORD 이미지 데이터셋 다운로드

In [None]:
# Make sure the output folders for the train and val images exist.
os.makedirs('../CORD_dataset/train', exist_ok=True)
os.makedirs('../CORD_dataset/val', exist_ok=True)

# Write every train image as a PNG named after its zero-padded index.
for idx, image in enumerate(train_img_data):
    image.save(f'../CORD_dataset/train/CORD_dataset_train_{str(idx).zfill(4)}.png')

# Same for the validation images.
for idx, image in enumerate(val_img_data):
    image.save(f'../CORD_dataset/val/CORD_dataset_val_{str(idx).zfill(4)}.png')

CORD JSON -> UFO format으로 저장

In [None]:
def cord_json_to_ufo(type, output_path, json_data=None):
    """Convert the ground truth of one CORD split into a UFO-format JSON file.

    Each ground-truth record becomes one entry under ``'images'`` keyed by
    ``CORD_dataset_{type}_{index:04d}.png``. Every word found in the record's
    ``valid_line`` list is stored under a 1-based, zero-padded key
    (``'0001'``, ``'0002'``, ...) with its text and four corner points, and
    the image width/height are copied from ``meta.image_size``.

    Args:
        type: Split name. ``'train'`` and ``'val'`` select the module-level
            ``train_json_data`` / ``val_json_data``; any other value selects
            ``test_json_data``. The value is also embedded in the generated
            image names. (The name shadows the builtin ``type`` but is kept
            so existing callers keep working.)
        output_path: Path of the JSON file to write.
        json_data: Optional iterable of ground-truth JSON strings. When
            omitted, the module-level data for ``type`` is used.

    Raises:
        json.JSONDecodeError: If a ground-truth record is not valid JSON.
        KeyError: If a record lacks one of the fields read here.
    """
    if json_data is None:
        if type == 'train':
            json_data = train_json_data
        elif type == 'val':
            json_data = val_json_data
        else:
            json_data = test_json_data

    images = {}
    for idx, raw_record in enumerate(json_data):
        # Parse once with a real JSON parser: eval() would execute whatever
        # the string contains and breaks on JSON true/false/null.
        record = json.loads(raw_record)

        words = {}
        word_id = 0
        for line in record['valid_line']:
            for word_info in line['words']:
                quad = word_info['quad']
                word_id += 1
                # Word keys are 1-based and zero-padded to four digits.
                words[str(word_id).zfill(4)] = {
                    'transcription': word_info['text'],
                    'points': [
                        [quad['x1'], quad['y1']],
                        [quad['x2'], quad['y2']],
                        [quad['x3'], quad['y3']],
                        [quad['x4'], quad['y4']],
                    ],
                }

        image_size = record['meta']['image_size']
        img_name = f'CORD_dataset_{type}_{str(idx).zfill(4)}.png'
        images[img_name] = {
            'words': words,
            'img_w': image_size['width'],
            'img_h': image_size['height'],
        }

    # json.dump escapes non-ASCII by default, so the output is plain ASCII;
    # the explicit encoding just makes the write platform-independent.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({'images': images}, f, indent=4)


In [None]:
# Write one UFO annotation file per split next to the exported images.
for split in ('train', 'val'):
    cord_json_to_ufo(split, f'../CORD_dataset/{split}.json')

CORD_dataset, original dataset 합치기 

In [None]:
cord_dataset_path = "../CORD_dataset"
dataset_path = "../dataset"

# Destination root for the newly merged dataset
new_dataset_path = "../Ori_CORD_dataset"
os.makedirs(new_dataset_path, exist_ok=True)

# One sub-folder per split inside the merged dataset.
for split in ("train", "val", "test"):
    os.makedirs(os.path.join(new_dataset_path, split), exist_ok=True)

def copy_images(src_folder, dest_folder):
    """Copy every regular file directly inside ``src_folder`` to ``dest_folder``.

    Files are copied with ``shutil.copy2`` and keep their names, so a file
    already present in ``dest_folder`` under the same name is overwritten.
    Sub-directories of ``src_folder`` are skipped, not copied.

    Args:
        src_folder: Directory whose files are copied.
        dest_folder: Directory receiving the copies; created if missing.
    """
    os.makedirs(dest_folder, exist_ok=True)

    for filename in os.listdir(src_folder):
        src_path = os.path.join(src_folder, filename)
        # os.listdir also returns directories (e.g. a notebook's
        # .ipynb_checkpoints); shutil.copy2 would raise on those.
        if not os.path.isfile(src_path):
            continue
        shutil.copy2(src_path, os.path.join(dest_folder, filename))

# (source root, split) pairs, copied in order into the merged dataset:
# train takes images from both sources, val only from CORD, test only
# from the original dataset.
for source_root, split in (
    (cord_dataset_path, "train"),
    (dataset_path, "train"),
    (cord_dataset_path, "val"),
    (dataset_path, "test"),
):
    copy_images(
        os.path.join(source_root, split),
        os.path.join(new_dataset_path, split),
    )

def merge_json_files(json_files):
    """Merge the ``"images"`` mappings of several annotation JSON files.

    Files are processed in the given order; when the same image name occurs
    in more than one file, the entry from the later file replaces the
    earlier one.

    Args:
        json_files: Iterable of paths to JSON files, each holding a
            top-level ``"images"`` object.

    Returns:
        A dict ``{"images": {...}}`` containing the combined entries.

    Raises:
        KeyError: If a file has no top-level ``"images"`` key.
    """
    merged_images = {}

    for json_file in json_files:
        # Read as UTF-8 explicitly: the platform default encoding can
        # mis-decode or reject non-ASCII transcriptions.
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        merged_images.update(data["images"])

    return {"images": merged_images}

# Train annotations come from both datasets and are merged into one file.
train_json_files = [
    os.path.join(root, "train.json")
    for root in (cord_dataset_path, dataset_path)
]
merged_train_json = merge_json_files(train_json_files)

# Val annotations come from CORD only, test annotations from the original
# dataset only, so those two files are copied as they are.
val_json_src = os.path.join(cord_dataset_path, "val.json")
test_json_src = os.path.join(dataset_path, "test.json")

merged_train_path = os.path.join(new_dataset_path, "train.json")
with open(merged_train_path, "w") as f:
    json.dump(merged_train_json, f)

for json_src, json_name in ((val_json_src, "val.json"), (test_json_src, "test.json")):
    shutil.copy2(json_src, os.path.join(new_dataset_path, json_name))


# Sanity check: compare the number of entries in each split folder with
# the hard-coded expected counts.
expected_counts = {'train': 1200, 'val': 100, 'test': 120}
actual_counts = {
    split: len(os.listdir(os.path.join(new_dataset_path, split)))
    for split in expected_counts
}

if actual_counts == expected_counts:
    print('Train, Val, and Test successfully created!')
else:
    # Report the mismatch instead of staying silent, so a failed merge
    # is not mistaken for a successful one.
    print(f'Unexpected file counts: expected {expected_counts}, got {actual_counts}')


데이터셋이 성공적으로 합쳐졌습니다.
