In [2]:
from PIL import Image
import os
from io import BytesIO
import json
import pandas as pd

In [28]:
# Modes Pillow's JPEG writer accepts directly; anything else is converted first.
_JPEG_SAFE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def _as_jpeg_compatible(img):
    """Return ``img`` unchanged if JPEG can store its mode, else an RGB copy."""
    if img.mode in _JPEG_SAFE_MODES:
        return img
    # e.g. RGBA / LA / P (typical for PNG sources): the alpha channel / palette
    # is dropped by the conversion, which JPEG could not represent anyway.
    return img.convert("RGB")


def make_img_json(parquet_path, img_dir, json_dir, sample_size=200,image_only=False):
    """Export the largest images of a parquet shard as JPEG files (plus labels).

    Every row's ``row['image']['bytes']`` payload is decoded and re-encoded as
    JPEG in memory to measure its encoded size. Rows are ranked by that size,
    largest first, and the first ``sample_size`` of them are written to
    ``img_dir`` as ``image_<image_id>.jpg``. Unless ``image_only`` is set, the
    row's ``ground_truth`` JSON string is parsed and written to ``json_dir``
    as ``<image_id>.json``.

    Args:
        parquet_path: Path of the parquet file to read with pandas.
        img_dir: Directory receiving the JPEG files (created if missing).
        json_dir: Directory receiving the label files (created if missing;
            unused when ``image_only`` is true).
        sample_size: Number of top-ranked rows to export.
        image_only: When true, the ``ground_truth`` column is not read, the
            DataFrame index value is used as ``image_id`` and no JSON file is
            written. When false, ``image_id`` is taken from
            ``ground_truth['meta']['image_id']``.

    Returns:
        None. Prints ``DONE!`` when finished.
    """
    df = pd.read_parquet(parquet_path)

    os.makedirs(img_dir, exist_ok=True)
    if not image_only:
        os.makedirs(json_dir, exist_ok=True)

    # Uniform record shape: (image_id, raw_bytes, gt_dict_or_None, jpeg_size).
    records = []
    for idx, row in df.iterrows():
        img_data = row['image']['bytes']

        if image_only:
            gt_dict = None
            image_id = f"{idx}"
        else:
            gt_dict = json.loads(row['ground_truth'])
            image_id = gt_dict['meta']['image_id']

        # Measure the JPEG-encoded size without touching the disk.
        with Image.open(BytesIO(img_data)) as img, BytesIO() as output:
            _as_jpeg_compatible(img).save(output, format="jpeg")
            image_size = output.tell()

        records.append((image_id, img_data, gt_dict, image_size))

    # Largest encoded images first; ties keep their original row order.
    records.sort(key=lambda rec: rec[3], reverse=True)

    for image_id, img_data, gt_dict, _ in records[:sample_size]:
        img_path = os.path.join(img_dir, f"image_{image_id}.jpg")
        with Image.open(BytesIO(img_data)) as img:
            _as_jpeg_compatible(img).save(img_path)

        # gt_dict is None in image_only mode (or if the label parsed to null).
        if gt_dict is not None:
            gt_path = os.path.join(json_dir, f"{image_id}.json")
            with open(gt_path, 'w', encoding='utf-8') as f:
                json.dump(gt_dict, f, ensure_ascii=False, indent=4)

    print('DONE!')

In [30]:
# Input parquet shard and the two output folders (images / ground-truth JSON).
parquet_path = "/data/ephemeral/home/cordx/parquet/zh.parquet"
image_dir = "/data/ephemeral/home/cordx/cord_zh_image"
json_dir = "/data/ephemeral/home/cordx/cord_zh_json"

# Make sure both output folders exist before anything is written to them.
for out_dir in (image_dir, json_dir):
    os.makedirs(out_dir, exist_ok=True)

In [31]:
# Export the top-ranked images of the shard. With image_only=True each file is
# named after the DataFrame row index and no ground-truth JSON is written.
make_img_json(parquet_path, image_dir, json_dir, image_only=True)

DONE!


In [12]:
import json

In [None]:
def make_cloba2datu(json_folder, output_path):
    """Merge per-image label JSON files into a single Datumaro-style JSON file.

    Every ``*.json`` file in ``json_folder`` (processed in sorted filename
    order) is expected to provide ``meta.image_id``, ``meta.image_size`` with
    ``width`` / ``height``, and ``valid_line[*].words[*].quad`` with the keys
    ``x1, y1, ..., x4, y4``. Each word becomes one polygon annotation with the
    single label ``text``; annotation ids are numbered consecutively across
    all files.

    Args:
        json_folder: Directory containing the per-image label files.
        output_path: Path of the combined JSON file to write.

    Returns:
        None. Prints the output path when the file has been written.

    Raises:
        KeyError: If a label file lacks one of the required keys above.
    """
    dataset = {
        "info": {},
        "categories": {
            "label": {
                "labels": [
                    {
                        "name": "text",
                        "parent": "",
                        "attributes": []
                    }
                ],
                "attributes": []
            }
        },
        "items": []
    }

    label_name_to_id = {"text": 0}

    # Running counter so annotation ids stay unique across all images.
    annotation_id = 0

    json_files = sorted(f for f in os.listdir(json_folder) if f.endswith('.json'))

    # Process each label file.
    for json_file in json_files:
        json_path = os.path.join(json_folder, json_file)
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Image metadata. The id is formatted rather than concatenated because
        # it may be stored as an integer in the JSON, and "image_" + <int>
        # would raise TypeError.
        meta = data['meta']
        image_id = f"image_{meta['image_id']}"
        image_size = meta['image_size']
        width = image_size['width']
        height = image_size['height']
        image_path = f"{image_id}.jpg"  # adjust if the images live elsewhere

        annotations = []

        # Walk every word of every valid line.
        for line in data['valid_line']:
            for word_info in line['words']:
                quad = word_info['quad']
                # Flatten the quad to [x1, y1, x2, y2, x3, y3, x4, y4] as floats.
                points = [
                    float(quad[f"{axis}{corner}"])
                    for corner in range(1, 5)
                    for axis in ("x", "y")
                ]

                annotations.append({
                    "id": annotation_id,
                    "type": "polygon",
                    "attributes": {
                        "text": word_info.get('text', ''),
                        "is_key": word_info.get('is_key', 0),
                        "row_id": word_info.get('row_id', None)
                    },
                    "group": 0,
                    "label_id": label_name_to_id["text"],
                    "points": points,
                    "z_order": 0
                })
                annotation_id += 1

        # One dataset item per image; size is stored as [height, width].
        dataset['items'].append({
            "id": str(image_id),
            "annotations": annotations,
            "image": {
                "size": [int(height), int(width)],
                "path": image_path  # matches the exported image file name
            }
        })

    # Write the combined Datumaro-format JSON file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)

    print(f"Datumaro 포맷의 JSON 파일이 생성되었습니다: {output_path}")

In [14]:
# Combine the label files found in json_dir (set in the path-configuration
# cell) into one Datumaro-format file in the current working directory.
output_path = "cloba2datu.json"
make_cloba2datu(json_folder=json_dir, output_path=output_path)

Datumaro 포맷의 JSON 파일이 생성되었습니다: cloba2datu.json
