In [3]:
!pip install datasets

Collecting huggingface_hub
  Obtaining dependency information for huggingface_hub from https://files.pythonhosted.org/packages/60/bf/cea0b9720c32fa01b0c4ec4b16b9f4ae34ca106b202ebbae9f03ab98cd8f/huggingface_hub-0.26.2-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: huggingface_hub
Successfully installed huggingface_hub-0.26.2
[0m

In [4]:
from datasets import load_dataset

# Load the CORD-v2 receipt dataset; the cells below read its
# 'train', 'validation' and 'test' splits.
ds = load_dataset(path="naver-clova-ix/cord-v2")

import os
import json
import shutil
from PIL import Image

def convert_cord_to_ufo(ds_split, split_name):
    """Convert one CORD split into a UFO-style annotation dictionary.

    Args:
        ds_split: Iterable of samples. Each sample must provide a
            ``'ground_truth'`` JSON string holding ``meta.image_size.width``,
            ``meta.image_size.height`` and a ``valid_line`` list whose entries
            carry a ``words`` list of ``{"text": ..., "quad": {...}}`` items.
        split_name: Label embedded in every generated image id.

    Returns:
        dict: ``{"images": {image_id: image_entry}}``. Image ids have the form
        ``receipt_<split_name>_<idx:04d>.jpg`` where ``idx`` is the sample's
        position in ``ds_split``. Each entry's ``"words"`` maps a 1-based,
        zero-padded id (``"0001"``, ``"0002"``, ...) that runs continuously
        across all lines of the image to the word's transcription and its
        four ``[x, y]`` corner points (``x1/y1`` .. ``x4/y4`` of ``quad``).
        ``"img_path"`` is set to an empty string.

    Raises:
        KeyError: If a field read from the ground truth is missing.
        json.JSONDecodeError: If ``ground_truth`` is not valid JSON.
    """
    formatted_data = {"images": {}}

    for idx, item in enumerate(ds_split):
        # Build the image id from the split name and the sample index.
        image_id = f"receipt_{split_name}_{idx:04d}.jpg"
        gt_data = json.loads(item['ground_truth'])
        image_size = gt_data['meta']['image_size']

        # Extract word annotations from valid_line. Only each line's 'words'
        # are read; the line-level 'category' is not part of the output, so it
        # is not required to be present.
        words = {}
        for line in gt_data['valid_line']:
            for word in line['words']:
                quad = word['quad']
                word_id = f"{len(words) + 1:04d}"
                words[word_id] = {
                    "transcription": word['text'],
                    # Corners in the order x1/y1, x2/y2, x3/y3, x4/y4.
                    "points": [
                        [quad[f'x{corner}'], quad[f'y{corner}']]
                        for corner in range(1, 5)
                    ],
                    "orientation": "Horizontal",
                    "language": None,
                    "tags": [],
                    "confidence": 1.0,
                    "illegibility": False
                }

        # Per-image record; a fresh dict/list is created for every image so
        # callers can mutate one entry (e.g. append tags) without affecting others.
        formatted_data['images'][image_id] = {
            "paragraphs": {},
            "words": words,
            "chars": {},
            "img_w": image_size['width'],
            "img_h": image_size['height'],
            "num_patches": None,
            "tags": [],
            "relations": {},
            "annotation_log": {
                "worker": "worker",
                "timestamp": "2024-05-30",
                "tool_version": "",
                "source": None
            },
            "license_tag": {
                "usability": True,
                "public": False,
                "commercial": True,
                "type": None,
                "holder": "CORD"
            },
            "img_path": ""
        }

    return formatted_data

# Create the output directory layout: image folders for train/test and a
# folder for the UFO annotation files.
base_dir = '/data/ephemeral/home/code/data/other_receipt'
for subdir in ('img/train', 'img/test', 'ufo'):
    os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)

def _export_split(ds_split, split_name, img_subdir, tags=()):
    """Convert one split to UFO format and save its images under base_dir.

    Args:
        ds_split: Dataset split whose rows expose an ``'image'`` object with a
            ``save(path)`` method and the fields ``convert_cord_to_ufo`` reads.
        split_name: Label used in the image file names
            (``receipt_<split_name>_<idx:04d>.jpg``); must match the ids
            produced by ``convert_cord_to_ufo`` for the same split.
        img_subdir: Folder below ``<base_dir>/img`` that receives the images;
            also used as the prefix of each entry's ``img_path``.
        tags: Tags appended to every image entry of this split.

    Returns:
        dict: The UFO annotation dictionary with ``img_path`` (and ``tags``)
        filled in for every image.
    """
    ufo_data = convert_cord_to_ufo(ds_split, split_name)
    # Iterate row by row so only one image is handled at a time, instead of
    # pulling the whole 'image' column out of the split before saving.
    for idx, item in enumerate(ds_split):
        img_filename = f'receipt_{split_name}_{idx:04d}.jpg'
        item['image'].save(os.path.join(base_dir, f'img/{img_subdir}', img_filename))

        entry = ufo_data['images'][img_filename]
        entry['img_path'] = f'{img_subdir}/{img_filename}'
        # Add the split-level tags to this image.
        entry['tags'].extend(tags)
    return ufo_data


# Process the train split.
train_data = _export_split(ds['train'], 'train', 'train', tags=['external data'])

# Process the validation split; its images are stored in the train image
# folder alongside the train images.
val_data = _export_split(ds['validation'], 'val', 'train', tags=['external data'])

# Process the test split (no extra tags are added).
test_data = _export_split(ds['test'], 'test', 'test')

# Save each split's UFO annotations as a JSON file under <base_dir>/ufo.
for json_relpath, ufo_data in (
    ('ufo/train_random.json', train_data),
    ('ufo/val_random.json', val_data),
    ('ufo/test.json', test_data),
):
    with open(os.path.join(base_dir, json_relpath), 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII transcriptions readable in the file.
        json.dump(ufo_data, f, ensure_ascii=False, indent=2)

print("이미지와 UFO 데이터셋이 성공적으로 저장되었습니다.")

이미지와 UFO 데이터셋이 성공적으로 저장되었습니다.
