In [1]:
import os
import json
from tqdm import tqdm

In [2]:
def convert_coordinates(data):
    """Expand an ``(x, y, w, h)`` box into a comma-separated corner string.

    Args:
        data: Sequence of exactly four numbers ``x, y, w, h``.

    Returns:
        A string of eight comma-separated values giving the four corners in
        the order ``(x, y)``, ``(x+w, y)``, ``(x+w, y+h)``, ``(x, y+h)``.
    """
    x0, y0, width, height = data
    x1 = x0 + width
    y1 = y0 + height
    corners = (x0, y0, x1, y0, x1, y1, x0, y1)
    return ",".join(str(value) for value in corners)

In [3]:
def make_ocr_and_caption_txt(json_file_path, output_sub_folder_path):
    """Write ``ocr.txt`` and ``caption.txt`` for one annotation JSON file.

    The JSON document must contain an ``'annotations'`` list whose entries
    each provide a ``'text'`` string and a ``'bbox'`` of ``[x, y, w, h]``.

    * ``ocr.txt`` gets one line per annotation:
      ``"<text> <8 comma-separated corner values> 0.99"`` (the trailing 0.99
      is a fixed value, presumably a placeholder confidence score).
    * ``caption.txt`` gets every annotation text joined by single spaces.

    Args:
        json_file_path: Path of the annotation JSON file to read.
        output_sub_folder_path: Existing directory to write both files into.

    Raises:
        FileNotFoundError: If the JSON file or the output directory is missing.
        KeyError: If ``'annotations'``, ``'text'`` or ``'bbox'`` is absent.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale,
    # otherwise non-ASCII annotation text fails to decode on some systems.
    with open(json_file_path, encoding='utf-8') as f:
        datas = json.load(f)

    caption = []
    ocr = []
    for data in datas['annotations']:
        word = data['text']
        caption.append(word)
        coordinate = convert_coordinates(data['bbox'])
        ocr.append(f'{word} {coordinate} 0.99\n')

    # Create ocr.txt
    with open(os.path.join(output_sub_folder_path, 'ocr.txt'), 'w', encoding='utf-8') as f:
        f.write("".join(ocr))

    # Create caption.txt
    with open(os.path.join(output_sub_folder_path, 'caption.txt'), 'w', encoding='utf-8') as f:
        f.write(' '.join(caption))

In [4]:
def make_info_json(file_name, json_file_path, output_sub_folder_path):
    """Write the ``info.json`` metadata file for one image.

    Args:
        file_name: Image name without extension; stored under ``"key"``.
        json_file_path: Currently unused; kept so existing callers keep working.
        output_sub_folder_path: Existing directory to write ``info.json`` into.

    NOTE(review): the caption and the 1600x1200 dimensions are fixed values
    that do not depend on the image -- confirm these placeholders are intended.
    """
    info = {
            "NSFW": "UNLIKELY", "similarity": 0.99, "LICENSE": "scv",
            "caption": "Day Drunk for America",
            "url": "", "key": file_name, "status": "success", "error_message": "error",
            "width": 1600, "height": 1200, "original_width": 1600, "original_height": 1200,
            "exif": "{}", "sha256": ""
            }
    with open(os.path.join(output_sub_folder_path, 'info.json'), 'w', encoding='utf-8') as f:
        # Use a real JSON serializer: replacing quotes in the dict repr emits
        # invalid JSON as soon as file_name contains a quote or a backslash.
        # ensure_ascii=False keeps non-ASCII names readable instead of \u-escaped.
        json.dump(info, f, ensure_ascii=False)

In [5]:
def main(json_folder_path, img_folder_path, output_folder_path,
         folder_num=41408, files_per_folder=300):
    """Build one output sub-folder per image, grouped into numbered folders.

    For every entry in ``img_folder_path`` a directory
    ``<output_folder_path>/<bucket>/<file_name>/`` is created and filled with
    ``info.json``, ``ocr.txt`` and ``caption.txt``. Entries whose directory
    already exists are skipped, so an interrupted run can be resumed.

    Args:
        json_folder_path: Directory holding ``<file_name>.json`` annotations.
        img_folder_path: Directory whose entries define the images to process.
        output_folder_path: Root directory for the generated folders.
        folder_num: Number used as the name of the first bucket folder.
        files_per_folder: How many images go into each bucket folder.
    """
    # os.listdir returns entries in arbitrary order; sort so that the bucket
    # assignment (and the skip-if-exists resume check) is reproducible.
    img_files_list = sorted(os.listdir(img_folder_path))
    # Wrap the list itself (not the enumerate object) so tqdm knows the total.
    for i, img_file in enumerate(tqdm(img_files_list)):
        # NOTE(review): keeps only the text before the FIRST dot, so names
        # containing extra dots are truncated -- confirm against the JSON names.
        file_name = img_file.split('.')[0]
        json_file = file_name+'.json'
        json_file_path = os.path.join(json_folder_path, json_file)
        # Split the output into bucket folders of `files_per_folder` images each.
        output_sub_folder_path = os.path.join(
            output_folder_path, str(folder_num + (i//files_per_folder)), file_name)
        if not os.path.exists(output_sub_folder_path):
            os.makedirs(output_sub_folder_path)
        else:
            continue

        # Create info.json
        make_info_json(file_name, json_file_path, output_sub_folder_path)

        # Create ocr.txt and caption.txt
        make_ocr_and_caption_txt(json_file_path, output_sub_folder_path)

In [None]:
if __name__ == "__main__":
    # Location of the original annotation JSON folder.
    json_folder_path = '/workspace/logo_gen_project/selected_data/원본_json'
    # Location of the original image folder.
    img_folder_path = '/workspace/logo_gen_project/selected_data/selected_images'
    # Location of the output folder.
    output_folder_path = '/workspace/logo_gen_project/laion_format'

    main(
        json_folder_path=json_folder_path,
        img_folder_path=img_folder_path,
        output_folder_path=output_folder_path,
    )