In [1]:
import os
import shutil
import json
from tqdm import tqdm

from PIL import Image
import numpy as np

from googletrans import Translator 

In [46]:
def convert_coordinates(data, img_size, or_w, or_h):
    """Rescale a bounding box and serialise its four corners.

    Args:
        data: Box as ``(x, y, w, h)`` expressed in the ``or_w`` x ``or_h``
            coordinate space.
        img_size: ``(width, height)`` of the target coordinate space.
        or_w: Width of the coordinate space ``data`` is expressed in.
        or_h: Height of the coordinate space ``data`` is expressed in.

    Returns:
        A comma-separated string ``"x1,y1,x2,y2,x3,y3,x4,y4"`` listing the
        rescaled corners in the order top-left, top-right, bottom-right,
        bottom-left, each rounded to an integer with ``np.round``.
    """
    left, top, box_w, box_h = data
    target_w, target_h = img_size
    scale_x = target_w / or_w
    scale_y = target_h / or_h

    def to_pixel(value, scale):
        # np.round rounds halves to the nearest even integer.
        return int(np.round(value * scale))

    x_lo = to_pixel(left, scale_x)
    y_lo = to_pixel(top, scale_y)
    x_hi = to_pixel(left + box_w, scale_x)
    y_hi = to_pixel(top + box_h, scale_y)

    corners = ((x_lo, y_lo), (x_hi, y_lo), (x_hi, y_hi), (x_lo, y_hi))
    return ",".join(str(value) for corner in corners for value in corner)

In [47]:
def make_info_json(caption, file_name, img_size, output_sub_folder_path,
                   original_width=1600, original_height=1200):
    """Write the ``info.json`` metadata file for one sample.

    Args:
        caption: Caption text stored under the ``"caption"`` key.
        file_name: Value stored under the ``"key"`` key.
        img_size: ``(width, height)`` stored as ``"width"`` / ``"height"``.
        output_sub_folder_path: Existing folder that receives ``info.json``.
        original_width: Value stored as ``"original_width"``.
        original_height: Value stored as ``"original_height"``.

    The file is serialised with ``json.dump`` so that captions or file names
    containing quotes or apostrophes still yield valid JSON (swapping quote
    characters in ``str(dict)`` does not). Non-ASCII text is written as-is
    in UTF-8.
    """
    info = {
        "NSFW": "UNLIKELY", "similarity": 0.99, "LICENSE": "scv",
        "caption": caption,
        "url": "", "key": file_name, "status": "success", "error_message": "error",
        "width": img_size[0], "height": img_size[1],
        "original_width": original_width, "original_height": original_height,
        "exif": "{}", "sha256": ""
    }
    info_path = os.path.join(output_sub_folder_path, 'info.json')
    with open(info_path, 'w', encoding='utf-8') as f:
        json.dump(info, f, ensure_ascii=False)

In [48]:
translator = Translator()

def translate(prompt, tmp):
    """Translate ``prompt`` from Korean to English.

    Args:
        prompt: Text to translate.
        tmp: Integer counter supplied by the caller (presumably the number of
            translations done so far - confirm against callers). The shared
            ``Translator`` client is recreated whenever it is a multiple of 30.

    Returns:
        The translated text.
    """
    # Without this declaration the assignment below makes ``translator`` a
    # local name, and every call that skips the assignment raises
    # UnboundLocalError instead of using the module-level client.
    global translator
    # Refresh the client every 30th call; ``tmp // 30 == 0`` was true for the
    # first 30 calls only, which does not match a periodic refresh.
    if tmp % 30 == 0:
        translator = Translator()
    translation = translator.translate(prompt, src='ko', dest='en')
    return translation.text

In [72]:

def _extract_caption(text):
    """Return the text between the first two double quotes in ``text``."""
    start_index = text.find('"') + 1
    # When no closing quote exists ``find`` returns -1 and the slice simply
    # drops the final character; this long-standing behaviour is kept as-is.
    end_index = text.find('"', start_index)
    return text[start_index:end_index]


def _merge_quads(quads):
    """Return the corner string of the smallest box enclosing all ``quads``.

    ``quads`` are strings in the ``"x1,y1,...,x4,y4"`` format produced by
    ``convert_coordinates``; they are already in target-image coordinates, so
    the result is formatted directly rather than being rescaled again.
    """
    xs, ys = [], []
    for quad in quads:
        values = [int(v) for v in quad.split(',')]
        xs.extend(values[0::2])
        ys.extend(values[1::2])
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)
    return ",".join(map(str, [x_min, y_min, x_max, y_min, x_max, y_max, x_min, y_max]))


def _build_ocr_lines(caption, annotations, img_size, original_width, original_height):
    """Pair caption text with annotation boxes and return the ``ocr.txt`` lines."""
    words = caption.split(' ')
    quads = [
        convert_coordinates(annotation['bbox'], img_size, original_width, original_height)
        for annotation in annotations
    ]
    box_count = len(quads)

    if box_count == 0:
        return []
    if box_count == 1:
        # A single box carries the whole caption.
        labels = [caption]
    elif box_count == len(words):
        # One word per box, in order.
        labels = words
    elif box_count * 2 <= len(words):
        # At least two words per box: give each box the next two words and
        # let the last box take whatever remains, so that every box is
        # emitted and no caption word is lost.
        labels = [' '.join(words[2 * k:2 * k + 2]) for k in range(box_count - 1)]
        labels.append(' '.join(words[2 * (box_count - 1):]))
    else:
        # Words and boxes cannot be aligned: emit one enclosing box that
        # carries the whole caption.
        labels = [caption]
        quads = [_merge_quads(quads)]

    return [f'{label} {quad} 0.99\n' for label, quad in zip(labels, quads)]


def main(img_folder_path, charseg_npy_path, caption_file_path, json_folder_path, output_folder_path):
    """Assemble one output folder per sample from the four input folders.

    For the i-th entry of each sorted folder listing this creates
    ``<output_folder_path>/<i>/<image stem>/`` containing ``caption.txt``,
    ``charseg.npy``, ``image.jpg``, ``info.json`` and ``ocr.txt``.

    Warning: the source image and charseg files are deleted from their input
    folders once a sample has been written; caption and JSON files are kept.

    Args:
        img_folder_path: Folder with the source images.
        charseg_npy_path: Folder with the ``charseg`` ``.npy`` files.
        caption_file_path: Folder with caption text files; the caption is the
            text between the first pair of double quotes in each file.
        json_folder_path: Folder with annotation JSON files providing
            ``images[0].width`` / ``images[0].height`` and
            ``annotations[*].bbox``.
        output_folder_path: Root folder for the generated sample folders.
    """
    img_files_list = sorted(os.listdir(img_folder_path))
    charseg_npy_list = sorted(os.listdir(charseg_npy_path))
    caption_files_list = sorted(os.listdir(caption_file_path))
    json_files_list = sorted(os.listdir(json_folder_path))
    # NOTE(review): samples are matched purely by position in the sorted
    # listings. Because only images and charseg files are deleted below, a
    # second run over partially processed folders will pair files that do not
    # belong together - confirm the folders are aligned before running.
    samples = list(zip(img_files_list, charseg_npy_list, caption_files_list, json_files_list))

    for index, (img_file, seg_file, caption_file, json_file) in enumerate(tqdm(samples)):
        name = img_file.split('.')[0]
        root_folder = os.path.join(output_folder_path, str(index), name)
        os.makedirs(root_folder, exist_ok=True)

        src_img = os.path.join(img_folder_path, img_file)
        src_seg = os.path.join(charseg_npy_path, seg_file)

        with open(os.path.join(caption_file_path, caption_file), 'r') as f:
            caption = _extract_caption(f.read())
        with open(os.path.join(root_folder, 'caption.txt'), 'w') as c:
            c.write(caption)

        shutil.copyfile(src_seg, os.path.join(root_folder, 'charseg.npy'))
        shutil.copyfile(src_img, os.path.join(root_folder, 'image.jpg'))

        # Close the image before the source file is removed below; only its
        # size is needed.
        with Image.open(src_img) as img:
            img_size = img.size
        make_info_json(caption, img_file, img_size, root_folder)

        # ocr.txt
        with open(os.path.join(json_folder_path, json_file)) as j:
            datas = json.load(j)
        original_width = datas['images'][0]['width']
        original_height = datas['images'][0]['height']
        ocr = _build_ocr_lines(caption, datas['annotations'], img_size,
                               original_width, original_height)
        with open(os.path.join(root_folder, 'ocr.txt'), 'w') as f:
            f.write("".join(ocr))

        os.remove(src_img)
        os.remove(src_seg)


In [None]:
if __name__ == "__main__":
    # Input folders: source images, charseg arrays, caption files and
    # annotation JSON files.
    img_folder_path = './dataset/img_data'
    charseg_npy_path = './dataset/charseg_npy'
    caption_file_path = './dataset/labels_en_ver'
    json_folder_path = './dataset/label_json'

    # Output folder location; created on first use.
    output_folder_path = './dataset/final/'
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    main(
        img_folder_path=img_folder_path,
        charseg_npy_path=charseg_npy_path,
        caption_file_path=caption_file_path,
        json_folder_path=json_folder_path,
        output_folder_path=output_folder_path,
    )