# Datumaro to UFO

In [None]:
import json
import os

In [None]:
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so annotation files holding non-ASCII text load the
    same way on every machine.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
def save_json(data, path):
    """Serialize ``data`` as indented JSON and write it to ``path``.

    Non-ASCII characters are written unescaped (``ensure_ascii=False``), so
    the file is opened as UTF-8 explicitly; relying on the platform default
    encoding could raise ``UnicodeEncodeError`` or produce a file that other
    tools cannot decode.

    Args:
        data: JSON-serializable object to store.
        path: Destination file path; an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``data`` contains values ``json`` cannot serialize.
    """
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [None]:
def update_item_id_with_extension(item, image_folder, ufo=True):
    """Append the image file extension to ``item['id']`` in place.

    Nothing happens when ``ufo`` is false or when the id already ends in
    ``.jpg`` / ``.png``. Otherwise ``image_folder`` is probed for
    ``<id>.jpg`` first and ``<id>.png`` second, and the first suffix whose
    file exists is appended to the id. When neither file exists a message is
    printed and the id is left unchanged.

    Args:
        item: Mapping with an ``'id'`` string entry; mutated in place.
        image_folder: Directory searched for the image file.
        ufo: Enables the lookup; when false the function is a no-op.

    Returns:
        None.
    """
    if not ufo:
        return

    known_suffixes = ('.jpg', '.png')
    if item['id'].endswith(known_suffixes):
        return

    # Tuple order matters: a .jpg file wins over a .png with the same stem.
    for suffix in known_suffixes:
        candidate = os.path.join(image_folder, item['id'] + suffix)
        if os.path.exists(candidate):
            item['id'] = item['id'] + suffix
            return

    print(f"Image file not found for item id {item['id']}")

In [None]:
def datumaro_to_ufo(datumaro_json_path, original_json_path, img_folder, output_ufo_path, ufo=False):
    """Convert a Datumaro annotation JSON into a UFO-style JSON file.

    Image sizes are not read from the Datumaro file being converted; they are
    looked up, per image id, in the original annotation file:

    * ``ufo=True``: the original is a UFO file and sizes come from
      ``images[<id>]['img_w']`` / ``['img_h']``. Item ids are first extended
      with ``.jpg`` / ``.png`` via ``update_item_id_with_extension``.
    * ``ufo=False``: the original is a Datumaro file and sizes come from
      ``items[*]['image']['size']``, unpacked as ``(height, width)``.

    Every annotation with a non-empty flat ``points`` list becomes one entry
    in ``words``, keyed ``0001``, ``0002``, ... with the fixed placeholder
    transcription ``'1'``. Items without a usable id or without size
    information are reported with ``print`` and skipped.

    Args:
        datumaro_json_path: Path of the Datumaro JSON to convert.
        original_json_path: Path of the original JSON supplying image sizes.
        img_folder: Image directory used to resolve extensions when ``ufo``.
        output_ufo_path: Path the resulting UFO JSON is written to.
        ufo: Whether the original JSON is in UFO format.

    Returns:
        None. When the Datumaro file has no ``items`` key a message is
        printed and nothing is written.
    """
    original_data = read_json(original_json_path)
    if ufo:
        original_images = original_data.get('images', {})
    else:
        original_images = {i['id']: i for i in original_data.get('items', [])}

    datumaro_data = read_json(datumaro_json_path)
    print("Datumaro 데이터의 키:", datumaro_data.keys())

    # Validate the structure BEFORE touching 'items'; otherwise a malformed
    # file raises KeyError and this diagnostic is never reached.
    if 'items' not in datumaro_data:
        print("Datumaro JSON의 구조가 예상과 다릅니다.")
        return
    items = datumaro_data['items']

    for item in items:
        # Items without an id are handled by the 'name' fallback below, so
        # they must not be handed to the helper, which indexes item['id'].
        if item.get('id'):
            update_item_id_with_extension(item, img_folder, ufo)

    ufo_data = {'images': {}}

    for item in items:
        image_id = item.get('id') or os.path.splitext(item.get('name', ''))[0]
        if not image_id:
            print("이미지 이름을 찾을 수 없습니다.")
            continue

        img_info = original_images.get(image_id)
        if ufo:
            if not img_info:
                print(f"원본 UFO 데이터에 '{image_id}' 이미지 정보가 없습니다.")
                continue
            img_w = img_info.get('img_w')
            img_h = img_info.get('img_h')
        else:
            if not (img_info and 'image' in img_info and 'size' in img_info['image']):
                print(f"Original Datumaro data에 '{image_id}' 크기 정보가 없습니다.")
                continue
            # The size pair is unpacked height-first.
            img_h, img_w = img_info['image']['size']

        ufo_item = {
            "paragraphs": {},
            "words": {},
            "chars": {},
            "img_w": img_w,
            "img_h": img_h,
            "num_patches": None,
            "tags": [],
            "relations": {},
            "annotation_log": {},
            "license_tag": {},
        }

        # Word ids are consecutive over the annotations that are kept, so
        # skipped annotations leave no gaps in the numbering.
        word_id = 1
        for ann in item.get('annotations', []):
            coords = ann.get('points')
            if not coords:
                continue

            # Flat [x0, y0, x1, y1, ...] -> [[x0, y0], [x1, y1], ...]
            points = [[coords[i], coords[i + 1]] for i in range(0, len(coords), 2)]
            ufo_item['words'][f'{word_id:04d}'] = {
                "transcription": '1',
                "points": points,
            }
            word_id += 1

        ufo_data['images'][image_id] = ufo_item

    save_json(ufo_data, output_ufo_path)
    print(f'UFO 데이터가 {output_ufo_path}에 저장되었습니다.')

In [None]:
def file_json_rename(
    json_path,
    dir_path,
    *,
    path_prefix="/data/ephemeral/home/new_annotations/new_thai",
    key_prefix="images/train",
    output_path="./rename_new_thai.json",
):
    """Rename every file in ``dir_path`` to ``image_<idx>`` and re-key the JSON.

    Files are processed in sorted (string) order of the directory listing and
    keep their extension. For ``.jpg`` files the matching entry in
    ``data['images']`` is moved to the new key. A key is the file path with
    ``path_prefix`` replaced by ``key_prefix`` and the trailing ``.jpg``
    removed. The updated document is written to ``output_path``.

    Args:
        json_path: JSON file whose ``images`` mapping is re-keyed.
        dir_path: Directory whose files are renamed in place.
        path_prefix: Substring of the on-disk path replaced to build a key.
        key_prefix: Replacement for ``path_prefix`` in the keys.
        output_path: Where the updated JSON is saved.

    Raises:
        FileExistsError: If a rename would overwrite a different existing
            file. ``os.rename`` replaces the target silently on POSIX, which
            would destroy an image.
    """
    data = read_json(json_path)
    images = data['images']

    for idx, filename in enumerate(sorted(os.listdir(dir_path))):
        file_extension = os.path.splitext(filename)[1]
        new_filename = f"image_{idx}{file_extension}"
        if new_filename == filename:
            # Already in place. Re-keying here would assign and then delete
            # the very same key, silently dropping the annotation.
            continue

        old_path = os.path.join(dir_path, filename)
        new_path = os.path.join(dir_path, new_filename)
        if os.path.exists(new_path):
            raise FileExistsError(
                f"refusing to overwrite {new_path} while renaming {old_path}"
            )
        os.rename(old_path, new_path)

        old_ref = old_path.replace(path_prefix, key_prefix)
        if not old_ref.endswith(".jpg"):
            # Only .jpg files have a JSON key (keys carry no extension).
            continue

        # Direct lookup instead of scanning every key for each file.
        old_key = old_ref[:-4]
        if old_key in images:
            new_ref = new_path.replace(path_prefix, key_prefix)
            images[new_ref[:-4]] = images.pop(old_key)

    save_json(data, output_path)
        

In [None]:
def newdatu_ufo_id_fix(new_json, what_lang, img_dir):
    """Rename UFO image keys and image files to ``image.<lang>.<rest>``.

    Each key of ``data['images']`` is split on ``/``; the third component gets
    ``.jpg`` appended and the text between its first and second underscore
    becomes ``<rest>``. Files in ``img_dir`` whose name starts with
    ``image_`` are renamed on disk by the same underscore rule, and the JSON
    is written back to ``new_json``.

    Args:
        new_json: UFO JSON file, read and then overwritten in place.
        what_lang: Language tag inserted into the new names.
        img_dir: Directory holding the image files to rename.

    Raises:
        IndexError: If a key has fewer than three ``/``-separated parts or
            its third part contains no underscore.
    """
    ufo = read_json(new_json)

    rekeyed = {}
    for image_key, entry in ufo['images'].items():
        # Keys are expected to look like "<a>/<b>/image_<n>".
        jpg_name = image_key.split('/')[2] + ".jpg"
        tail = jpg_name.split("_")[1]
        rekeyed[f"image.{what_lang}.{tail}"] = entry
    ufo['images'] = rekeyed

    for name in os.listdir(img_dir):
        if not name.startswith('image_'):
            continue
        tail = name.split('_')[1]
        source = os.path.join(img_dir, name)
        target = os.path.join(img_dir, f"image.{what_lang}.{tail}")
        os.rename(source, target)
        print(f'rename {source} : {target}')

    save_json(ufo, new_json)

In [None]:
'''
datumaro_json_path : annotation JSON exported from CVAT
origin_json_path : JSON in the original format
img_folder : image folder
output_path : path where the JSON is saved after the datu -> ufo conversion
origin_is_ufo : True when the original JSON is in UFO format
'''
# Datumaro annotation JSON that is converted to UFO.
datumaro_json_path = "/data/ephemeral/home/new_annotations/ajson/datu_json/syzh.json"
# Original annotation JSON; datumaro_to_ufo looks up the image sizes in it.
origin_json_path = "/data/ephemeral/home/new_annotations/ajson/origin_json/syzh.json"
# Directory containing the image files.
img_folder = "/data/ephemeral/home/new_annotations/syzh_receipt/img/train"
# Destination of the converted UFO JSON.
output_path = "/data/ephemeral/home/new_annotations/ufo_tran/syzh.json"
# False: the original JSON is read as Datumaro format rather than UFO.
origin_is_ufo = False

In [None]:
# Run the conversion with the paths configured in this notebook; the format
# flag is passed by keyword so its meaning is visible at the call site.
datumaro_to_ufo(
    datumaro_json_path,
    origin_json_path,
    img_folder,
    output_path,
    ufo=origin_is_ufo,
)

# NEW Data만 사용

In [None]:
# Rewrites the converted JSON in place and renames the image files on disk.
newdatu_ufo_id_fix(new_json=output_path, what_lang='syzh', img_dir=img_folder)