##### based on https://github.com/jason9075/opencv-mosaic-data-aug by jason9075

In [None]:
import os
import json
import random

import numpy as np
import cv2

In [None]:
# FIXME: set this to the name of the source dataset folder to build mosaics from.
dataset = 'medical'

# Number of mosaic images to generate (one per iteration of the main loop).
TOTAL = 150
# Size of every generated mosaic canvas, given as (Height, Width).
OUTPUT_SIZE = (4096, 4096)
# Lower/upper bound of the random fraction of the canvas at which the
# vertical and horizontal split between the four tiles is placed.
SCALE_RANGE = (0.3, 0.7)
# Words whose rescaled (x2 - x0) * (y2 - y0) product falls below this value
# are left out of the generated annotations.
FILTER_TINY = 0

In [None]:
# Source dataset: folder with the training images, and the path of its
# annotation JSON (note that LABEL_DIR is a file path, not a directory).
IMG_DIR = os.path.join('/opt/ml/input/data', dataset, 'img/train')
LABEL_DIR = os.path.join('/opt/ml/input/data/', dataset, 'ufo', 'train.json')

# Create the image and annotation folders of the generated 'mosaic' dataset
# up front; exist_ok makes re-running this cell harmless.
for _sub_dir in ('img/train', 'ufo'):
    os.makedirs(os.path.join('/opt/ml/input/data/', 'mosaic', _sub_dir), exist_ok=True)

In [None]:
# Parse the source annotation file and keep only its 'images' mapping
# (image file name -> annotation of that image).
with open(LABEL_DIR, encoding='utf-8') as f:
    ufo = json.loads(f.read())['images']

# Indexable list of image names, used to draw random source images.
ufo_image_names = list(ufo.keys())

In [None]:
# Build TOTAL mosaic images. Each mosaic is a canvas split into four tiles
# (top-left, top-right, bottom-left, bottom-right) at a random point; every
# tile is filled with a randomly drawn source image stretched to the tile
# size, and the word boxes of that image are rescaled/shifted to match.

# Canvas reused for every mosaic. The four tiles together cover it entirely,
# so it is fully overwritten on each iteration and never needs clearing.
output_img = np.zeros([OUTPUT_SIZE[0], OUTPUT_SIZE[1], 3], dtype=np.uint8)

# License metadata attached unchanged to every generated image annotation.
license_tag = dict(usability=True, public=True,
                   commercial=True, type='CC-BY-SA',
                   holder=None)
# Not read anywhere in this cell; kept in case other cells rely on it.
orientation = 'Horizontal'


def _rescale_words(words, img_width, img_height, x_off, y_off, cell_w, cell_h,
                   min_area=0):
    """Map the word annotations of one source image into a mosaic tile.

    Args:
        words: the ``'words'`` mapping of one source image annotation.
        img_width: width of the source image before resizing.
        img_height: height of the source image before resizing.
        x_off: x coordinate of the tile's left edge on the canvas.
        y_off: y coordinate of the tile's top edge on the canvas.
        cell_w: width of the tile on the canvas.
        cell_h: height of the tile on the canvas.
        min_area: words whose ``(x2 - x0) * (y2 - y0)`` product, taken
            between rescaled points 0 and 2, is below this value are dropped.

    Returns:
        A list of new word dicts (same fields as the input words) whose
        ``points`` are expressed in canvas coordinates, in input order.
    """
    rescaled = []
    for word in words.values():
        # Only the first four points of each word are used. Coordinates are
        # truncated to whole pixels but stored as floats.
        src_points = [(word['points'][k][0], word['points'][k][1])
                      for k in range(4)]
        points = [[float(int(x_off + px * cell_w / img_width)),
                   float(int(y_off + py * cell_h / img_height))]
                  for px, py in src_points]

        # NOTE(review): this is a signed product of the point-0 -> point-2
        # deltas, so it can be negative depending on point order, in which
        # case the word is dropped even when min_area is 0 - confirm intent.
        (a0, a1), (c0, c1) = points[0], points[2]
        if (c0 - a0) * (c1 - a1) < min_area:
            continue

        rescaled.append(dict(points=points,
                             transcription=word['transcription'],
                             language=word['language'],
                             illegibility=word['illegibility'],
                             orientation=word['orientation'],
                             tags=word['tags'],
                             confidence=word['confidence']))
    return rescaled


new_anno = dict()

for i in range(TOTAL):

    # scale_x and scale_y are random numbers within SCALE_RANGE; they place
    # the point at which the canvas is divided into four tiles.
    scale_x = SCALE_RANGE[0] + random.random() * (SCALE_RANGE[1] - SCALE_RANGE[0])
    scale_y = SCALE_RANGE[0] + random.random() * (SCALE_RANGE[1] - SCALE_RANGE[0])
    divid_point_x = int(scale_x * OUTPUT_SIZE[1])
    divid_point_y = int(scale_y * OUTPUT_SIZE[0])

    # Tile bounds on the canvas as (y0, y1, x0, x1), in fill order:
    # top-left, top-right, bottom-left, bottom-right.
    quadrants = (
        (0, divid_point_y, 0, divid_point_x),
        (0, divid_point_y, divid_point_x, OUTPUT_SIZE[1]),
        (divid_point_y, OUTPUT_SIZE[0], 0, divid_point_x),
        (divid_point_y, OUTPUT_SIZE[0], divid_point_x, OUTPUT_SIZE[1]),
    )

    temp_anno = dict(img_h=OUTPUT_SIZE[0], img_w=OUTPUT_SIZE[1],
                     tags=None, license_tag=license_tag)
    temp_words = dict()
    counter = 0  # running word id across all four tiles of this mosaic

    for y0, y1, x0, x1 in quadrants:

        # Draw a random source image (the same image may be drawn twice).
        index = random.randrange(len(ufo))
        key = ufo_image_names[index]
        value = ufo[key]
        img_path = os.path.join(IMG_DIR, key)
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than later on img.shape.
        if img is None:
            raise OSError(f'cv2.imread could not read image: {img_path}')

        img_height, img_width, _ = img.shape
        cell_w, cell_h = x1 - x0, y1 - y0

        # cv2.resize takes the target size as (width, height).
        output_img[y0:y1, x0:x1, :] = cv2.resize(img, (cell_w, cell_h))

        for new_word in _rescale_words(value['words'], img_width, img_height,
                                       x0, y0, cell_w, cell_h,
                                       min_area=FILTER_TINY):
            temp_words[counter] = new_word
            counter += 1

    # save image
    image_name = f'mosaic_{i}.jpg'
    image_path = os.path.join('/opt/ml/input/data/', 'mosaic', 'img/train', image_name)
    # cv2.imwrite reports failure through its return value; without this
    # check the annotation file could reference an image that was never saved.
    if not cv2.imwrite(image_path, output_img):
        raise OSError(f'cv2.imwrite could not write image: {image_path}')

    temp_anno['words'] = temp_words
    new_anno[image_name] = temp_anno

# create ufo train.json
anno = dict(images=new_anno)

with open(os.path.join('/opt/ml/input/data/', 'mosaic', 'ufo', 'train.json'), 'w', encoding='utf-8') as f:
    json.dump(anno, f, indent=4)