# Image synthesis; preprocessing, directory creation, fixing of xml files, copying of files to correct directories

In [None]:
import os, io, sys, shutil, glob
from pathlib import Path
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import xmltodict

In [None]:
def prepareDirectories(file_dir, input_objects, input_background):
    """Reset the generator's input folders and clear its previous output.

    ``data_dir/objects_dir`` and ``data_dir/backgrounds`` under ``file_dir``
    are deleted when present and re-created as copies of ``input_objects``
    and ``input_background`` (both taken relative to ``file_dir``).
    Afterwards ``output_dir`` under ``file_dir`` is removed if it exists.
    """
    data_root = os.path.join(file_dir, 'data_dir')
    # (source, destination) pairs: chosen objects first, then chosen backgrounds
    staged_copies = (
        (os.path.join(file_dir, input_objects), os.path.join(data_root, 'objects_dir')),
        (os.path.join(file_dir, input_background), os.path.join(data_root, 'backgrounds')),
    )

    # wipe both destinations before anything is copied
    for _, destination in staged_copies:
        if os.path.exists(destination):
            shutil.rmtree(destination)

    for source, destination in staged_copies:
        shutil.copytree(source, destination)

    # drop the results of any earlier synthesis run
    stale_output = os.path.join(file_dir, 'output_dir')
    if os.path.exists(stale_output):
        shutil.rmtree(stale_output)
        
def convert_copy_xml(data_dir):
    """Copy synthesized images and their annotations into a fresh ``data_dir``.

    Run this from inside the generator directory: ``output_dir/train.txt`` is
    read relative to the current working directory.  Every non-blank line of
    that file must hold an image path followed by an xml path, separated by
    whitespace (so the paths themselves may not contain spaces).

    For each entry the image is copied under its own name and the annotation
    is copied as ``<image name without extension>.xml``.  The text of the
    annotation's ``<filename>`` element is then replaced by the image name;
    note that every occurrence of the old filename string in the xml is
    replaced, not only the one inside ``<filename>``.

    Args:
        data_dir: Target directory.  It is deleted first if it already exists.
    """
    # start from an empty target directory
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    os.makedirs(data_dir)

    list_path = os.path.join('output_dir', 'train.txt')
    with open(list_path, 'r') as listing:
        entries = [line.split() for line in listing]

    for fields in entries:
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline)

        img_path, xml_path = fields[0], fields[1]
        img_name = os.path.basename(img_path)  # jpg name without directories
        # splitext strips only the final extension, so names that contain
        # extra dots keep distinct annotation files
        stem = os.path.splitext(img_name)[0]

        new_xml_file = os.path.join(data_dir, stem + '.xml')
        new_img_file = os.path.join(data_dir, img_name)

        # copy the renamed xml file and the jpg
        shutil.copyfile(xml_path, new_xml_file)
        shutil.copyfile(img_path, new_img_file)

        # point the annotation at the copied image name
        with open(new_xml_file) as xf:
            xml_data = xf.read()
        old_name = xmltodict.parse(xml_data)['annotation']['filename']
        new_text = xml_data.replace(old_name, img_name)

        with open(new_xml_file, "w") as f:
            f.write(new_text)
    

# Set all the directories and chosen target images, backgrounds, and output directory

In [None]:
# Root directory of the synthetic-dataset generator; passed to prepareDirectories below
file_dir = Path('/Users/Aga/Desktop/EPRI_generate_synth_dataset')

# Alternative pair of settings (validation data); swap with the active pair below
#IMAGE_DIR = 'all_valid'
#TARGET_DIR = 'synth_valid'

# IMAGE_DIR: object-image folder handed to prepareDirectories
# TARGET_DIR: folder that convert_copy_xml recreates and fills
IMAGE_DIR = 'all_train'
TARGET_DIR = 'synth_train'

# Background-image folder handed to prepareDirectories
BG_DIR = 'backgrounds_1024'

In [None]:
# delete the old input/output directories and copy the files from IMAGE_DIR and BG_DIR
# into the input directories of the dataset generator
prepareDirectories(file_dir, IMAGE_DIR, BG_DIR)

## Synthesize chosen images (i.e. train or valid, which backgrounds)

In [None]:
# Move into the generator directory and run dataset_generator.py, passing
# data_dir/objects_dir and output_dir on the command line
# (presumably the input objects and the destination of the synthesized data — confirm).
%cd '/Users/Aga/Desktop/EPRI_generate_synth_dataset'
!python dataset_generator.py --n_image 350 --dontocclude \
 --add_distractors --separate_box_mask data_dir/objects_dir output_dir

## Copy the images and annotation files of all blended synthetic image versions to the target directory, renaming the annotations (including the filename stored inside each xml file)

In [None]:
# convert_copy_xml reads output_dir/train.txt relative to the current working directory
# and deletes/recreates TARGET_DIR, so run this cell from the generator directory
convert_copy_xml(TARGET_DIR)

## Move the `*_none` images and annotations into a `none` subdirectory

In [None]:
%cd $TARGET_DIR
noneImg = (glob.glob('*_none.jpg'))
noneXml = (glob.glob('*_none.xml'))
os.mkdir('none')
for nn in noneImg:
    shutil.move(nn,'none')
for n in noneXml:
    shutil.move(n,'none')

## investigate bounding box sizes

In [None]:
import xml.etree.ElementTree as ET
import numpy as np
def xml_to_boxes(path, rescale_width=None, rescale_height=None, min_size=34):
    """Extracts bounding-box widths and heights from ground-truth dataset.

    Args:
    path : Directory holding the .xml annotation files for your dataset.
    rescale_width : Target image width. When both rescale arguments are given,
        each box width is multiplied by rescale_width / annotated image width.
    rescale_height : Target image height, applied to box heights the same way.
    min_size : A file is reported as bad when any of its boxes is narrower or
        shorter than this value (compared after rescaling, when rescaling).

    Returns:
    bboxes : A numpy array of shape (N, 2) with box dimensions as [width, height].
    img_size : A list with one [width, height] entry per box, taken from the
        <size> element of the file the box came from ([None, None] if absent).
    bad_files : Paths of the .xml files with at least one too-small box; every
        file is listed once, so the list can be used directly for deletion.

    Raises:
    ValueError : Rescaling was requested for a file without a <size> element.
    """
    xml_list = []
    img_size = []
    bad_files = []
    # sorted: os.listdir order is arbitrary, which made the row order irreproducible
    filenames = sorted(
        os.path.join(path, f) for f in os.listdir(path) if f.endswith('.xml')
    )
    rescale = bool(rescale_width and rescale_height)
    for xml_file in filenames:
        root = ET.parse(xml_file).getroot()
        # read the image size once per file, so values never leak from a previous file
        size = root.find('size')
        if size is None:
            ww = hh = None
        else:
            ww = int(size.find('width').text)
            hh = int(size.find('height').text)
        has_small_box = False
        for member in root.findall('object'):
            bndbox = member.find('bndbox')
            bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)
            bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)
            if rescale:
                if size is None:
                    raise ValueError('cannot rescale boxes, no <size> element in ' + str(xml_file))
                bbox_width = bbox_width * (rescale_width / ww)
                bbox_height = bbox_height * (rescale_height / hh)
            xml_list.append([bbox_width, bbox_height])
            img_size.append([ww, hh])
            if bbox_width < min_size or bbox_height < min_size:
                has_small_box = True
        if has_small_box:
            bad_files.append(xml_file)
    # reshape keeps the (N, 2) layout even when no boxes were found
    bboxes = np.array(xml_list).reshape(-1, 2)
    return bboxes, img_size, bad_files

In [None]:
# NOTE: file_dir is re-bound here to the folder whose annotations are inspected
file_dir = Path('/Users/Aga/Desktop/EPRI_generate_synth_dataset/synth_valid/')
# boxArr: [width, height] per bounding box; imgSize: image [width, height] per box;
# bf_xml: xml files flagged by xml_to_boxes for boxes smaller than 34
boxArr, imgSize, bf_xml = xml_to_boxes(file_dir)

## find x and y width of bounding boxes for anchor box tuning

In [None]:
# split the [width, height] rows of boxArr into a list of widths and a list of heights
xw = [box[0] for box in boxArr]
yw = [box[1] for box in boxArr]

In [None]:
# box heights in ascending order; display the 100 smallest
yw_s = sorted(yw)
yw_s[:100]

In [None]:
# width-to-height ratio of every bounding box, shown as a 50-bin histogram
ratio = [box[0] / box[1] for box in boxArr]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
ax.hist(ratio, bins=50)
ax.set_title('X width / Y width, synth training data')
plt.show()

## remove files with too small bounding boxes

In [None]:
# how many distinct annotation files were flagged for small bounding boxes (<34)?
# set() guards against a file being listed more than once in bf_xml
bf_unique = [name for name in bf_xml if name.endswith('.xml')]
print(len(set(bf_unique)), ' bad files in dir: ', str(file_dir))

# image path matching each flagged annotation, in the same order as bf_xml
bf_jpg = [name.replace('.xml', '.jpg') for name in bf_xml]

In [None]:
# Delete every flagged annotation together with its image.
bf_xml.sort()
bf_jpg.sort()
# set(): a file listed twice in bf_xml must only be deleted once, otherwise the
# second os.remove raises FileNotFoundError and aborts the clean-up midway
for xml_file in sorted(set(bf_xml)):
    jpg_file = xml_file.replace('.xml', '.jpg')  # same mapping that built bf_jpg
    for target in (xml_file, jpg_file):
        # skip files that are already gone (e.g. the cell was run before)
        if os.path.exists(target):
            os.remove(target)

## check if all jpg's have xml files:

In [None]:
file_dir = Path('/Users/Aga/Desktop/EPRI_generate_synth_dataset/synth_train_valid_v5/synth_train/')

# print the number of .jpg files in file_dir, then the expected annotation
# path of every image that has no .xml file next to it
filenames = [os.path.join(file_dir, f) for f in os.listdir(file_dir) if f.endswith('.jpg')]
print(len(filenames))
for jpg_file in filenames:
    # swap only the extension; str.replace would also rewrite a '.jpg'
    # occurring anywhere else in the path
    xml_file = os.path.splitext(jpg_file)[0] + '.xml'
    if not os.path.isfile(xml_file):
        print(xml_file)

In [None]:
# print the number of .xml files in file_dir, then the expected image path of
# every annotation that has no .jpg file next to it
filenames = [os.path.join(file_dir, f) for f in os.listdir(file_dir) if f.endswith('.xml')]
print(len(filenames))
for xml_file in filenames:
    # swap only the extension; str.replace would also rewrite a '.xml'
    # occurring anywhere else in the path
    jpg_file = os.path.splitext(xml_file)[0] + '.jpg'
    if not os.path.isfile(jpg_file):
        print(jpg_file)

## change names of extra synthetic images, also inside the xml file

In [None]:
def change_filename_XML(new_xml_file, img_name):
    """Rewrite ``new_xml_file`` in place so that it refers to ``img_name``.

    The current value of annotation/filename is looked up with xmltodict and
    every occurrence of that string in the file's text is replaced by
    ``img_name``.
    """
    with open(new_xml_file) as source:
        original_text = source.read()

    old_name = xmltodict.parse(original_text)['annotation']['filename']
    updated_text = original_text.replace(old_name, img_name)

    with open(new_xml_file, "w") as target:
        target.write(updated_text)

In [None]:
%cd '/Users/Aga/Desktop/EPRI_generate_synth_dataset/synth_valid'
noneImg = (glob.glob('*.jpg'))
for im in noneImg:
    newName = im.replace('.jpg','_extra.jpg')
    os.rename(im,newName)
    xml_file = im.replace('.jpg','.xml')
    new_xml_file = xml_file.replace('.xml','_extra.xml')
    change_filename_XML(xml_file, newName)
    os.rename(xml_file,new_xml_file)