## UFO to COCO

In [None]:
# Convert UFO-format annotations to COCO format

import json

def _polygon_bbox_and_area(points):
    """Return the COCO bounding box and the area of a polygon.

    Args:
        points: Sequence of ``[x, y]`` vertices (at least three).

    Returns:
        A ``(bbox, area)`` tuple where ``bbox`` is ``[x_min, y_min, width,
        height]`` and ``area`` is the polygon area.
    """
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    x_min, y_min = min(xs), min(ys)
    bbox = [x_min, y_min, max(xs) - x_min, max(ys) - y_min]

    # Shoelace formula: pair every vertex with its successor (wrapping around
    # to the first one); abs() makes the result independent of winding order.
    edges = zip(xs, ys, xs[1:] + xs[:1], ys[1:] + ys[:1])
    twice_area = sum(x0 * y1 - x1 * y0 for x0, y0, x1, y1 in edges)
    return bbox, abs(twice_area) / 2


def convert_to_coco(input_path, output_path):
    """Convert a UFO-format annotation file into a COCO-format JSON file.

    The input JSON is expected to have a top-level ``"images"`` mapping of
    file name -> image info. Each image info may carry ``"img_w"``,
    ``"img_h"`` and a ``"words"`` mapping whose values hold ``"points"``
    (polygon vertices) and ``"transcription"``.

    Every word polygon with at least three points becomes one annotation of
    the single ``"text"`` category, carrying its segmentation, bounding box,
    area and transcription. Words with fewer than three points are skipped.

    Args:
        input_path: Path of the UFO JSON file to read.
        output_path: Path the COCO JSON file is written to.

    Raises:
        KeyError: If the input has no ``"images"`` key or a word has no
            ``"points"`` key.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    images = []
    annotations = []
    annotation_id = 1

    # Image ids are 1-based and follow the order of the input mapping.
    for image_id, (image_filename, image_info) in enumerate(
        data["images"].items(), start=1
    ):
        images.append({
            "id": image_id,
            "file_name": image_filename,
            "width": image_info.get("img_w"),
            "height": image_info.get("img_h"),
        })

        for word_info in image_info.get("words", {}).values():
            points = word_info["points"]
            # A polygon needs at least three vertices to be meaningful.
            if not points or len(points) < 3:
                continue

            # Flatten [[x1, y1], [x2, y2], ...] into COCO's [x1, y1, x2, y2, ...].
            segmentation = [coord for point in points for coord in point]
            bbox, area = _polygon_bbox_and_area(points)

            annotations.append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": 1,  # OCR uses a single "text" category
                "segmentation": [segmentation],
                # bbox and area are part of the COCO annotation schema and
                # are read by common COCO loaders.
                "bbox": bbox,
                "area": area,
                "iscrowd": 0,
                # A missing transcription is stored as None instead of
                # aborting the whole conversion.
                "text": word_info.get("transcription"),
            })
            annotation_id += 1

    # Top-level COCO structure.
    coco_format = {
        "images": images,
        "annotations": annotations,
        "categories": [{
            "id": 1,
            "name": "text"
        }]
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(coco_format, f, ensure_ascii=False, indent=2)

# Source UFO annotation file and destination for the converted COCO file.
input_path = "/data/ephemeral/home/MCG/code/data/vietnamese_receipt/ufo/train.json"
output_path = "/data/ephemeral/home/MCG/code/data/viet_train_coco.json"

# Run the conversion.
convert_to_coco(input_path=input_path, output_path=output_path)
