Related to [issue#5](https://github.com/barisozmen/aerial-autoaug/issues/5)

Data preprocessing should proceed as follows:

1. Remove images having width or height less than 608*
2. Split images using SplitImg.py module of DOTA_devkit, where subsize=608 and gap=0.
3. Remove any image whose dimensions after splitting are not exactly 608x608
4. Convert oriented bounding boxes (OBB) to horizontal bounding boxes (HBB)

For pipeline v0.1, use only 20 images for the training set, 10 of which contain planes, and all images from the test set. The MVP only targets plane detection.

The rationale for choosing 608 as the size is that the pre-trained model I will use in v0.1 was trained on 608x608 images (https://github.com/ringringyi/DOTA_models#training).

In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
import shutil

%matplotlib inline
import matplotlib.pyplot as plt

from shutil import copytree, ignore_patterns
import sys
sys.path.insert(1, "../DOTA_devkit/")

from DOTA import DOTA
from ImgSplit import splitbase

## Set variables

In [None]:
# Raw DOTA download and its two split folders.
DOTA_DIR = "../data/raw/DOTA/"
DOTA_TRAIN_DIR = DOTA_DIR + "training/"
DOTA_VAL_DIR = DOTA_DIR + "validation/"
# Destination of the sampled MVP subset.
DOTA_MVP_DIR = "../data/raw/DOTA_MVP/"

# Edge length used by the size filters and as the splitter's `subsize`,
# and the factor handed to `splitdata`.
IMAGE_DIM, RESIZE_FACTOR = 300, 0.25

# Seed NumPy's global RNG so the random draws below start from a fixed state.
np.random.seed(123)

## 0-a. Delete necessary directories

In [None]:
# Start from a clean slate: drop whatever an earlier run of this notebook
# left behind in the raw-subset, processed and split output folders.
for stale_dir in (
    "../data/raw/DOTA_MVP/",
    "../data/processed/DOTA_MVP/",
    "../data/processed/DOTA_MVP_splitted/",
):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

## 0-b. Make MVP-DOTA directory

In [None]:
if os.path.exists(DOTA_MVP_DIR):
    print (f"{DOTA_MVP_DIR} already exist, dont make MVP-DOTA directory again")
else:

    from shutil import copyfile

    # Number of images drawn for the MVP subset.
    N_TRAIN_PLANE = 150
    N_TRAIN_NON_PLANE = 150
    N_VAL_PLANE = 2

    def copy_image_by_id(image_id, from_dir, to_dir):
        """Copy `<image_id>.png` and `<image_id>.txt` from one split folder to another.

        `from_dir` and `to_dir` are joined by plain string concatenation, so
        both must end with a path separator and contain `images/` and
        `labelTxt/` subfolders.
        """
        for subfolder, extension in (("images/", ".png"), ("labelTxt/", ".txt")):
            file_name = f"{image_id}{extension}"
            copyfile(from_dir + subfolder + file_name, to_dir + subfolder + file_name)

    def sample_image_ids(image_ids, how_many):
        """Draw up to `how_many` distinct ids with NumPy's global RNG.

        Sampling without replacement guarantees no id is drawn twice; the
        request is capped at the number of available ids so a small pool does
        not raise.
        """
        return np.random.choice(image_ids, min(how_many, len(image_ids)), replace=False)

    dota_training = DOTA(DOTA_TRAIN_DIR)
    dota_validation = DOTA(DOTA_VAL_DIR)

    # Create <MVP>/<split>/{images,labelTxt}/ (parents are created on the way).
    for split_set in ("training/", "validation/"):
        for subfolder in ("images/", "labelTxt/"):
            os.makedirs(DOTA_MVP_DIR + split_set + subfolder)

    ################################################################
    # Copy N_TRAIN_PLANE plane and N_TRAIN_NON_PLANE non-plane images
    # from the training set
    ################################################################

    all_tr_image_ids = dota_training.getImgIds()
    plane_tr_image_ids = dota_training.getImgIds(catNms=['plane'])
    # Sort the set difference: set iteration order is not stable between
    # interpreter runs, which would defeat the fixed random seed.
    non_plane_tr_image_ids = sorted(set(all_tr_image_ids).difference(plane_tr_image_ids))

    selected_plane_tr_image_ids = sample_image_ids(plane_tr_image_ids, N_TRAIN_PLANE)
    selected_nonplane_tr_image_ids = sample_image_ids(non_plane_tr_image_ids, N_TRAIN_NON_PLANE)

    for image_id in np.concatenate([selected_plane_tr_image_ids, selected_nonplane_tr_image_ids]):
        copy_image_by_id(
            image_id = image_id,
            from_dir = DOTA_DIR + "training/",
            to_dir = DOTA_MVP_DIR + "training/"
        )

    ################################################################
    # Copy N_VAL_PLANE plane images from the validation set
    ################################################################

    # Despite the name, only ids of images containing a plane are requested.
    all_val_image_ids = dota_validation.getImgIds(catNms=['plane'])
    selected_val_image_ids = sample_image_ids(all_val_image_ids, N_VAL_PLANE)

    for image_id in selected_val_image_ids:
        copy_image_by_id(
            image_id = image_id,
            from_dir = DOTA_DIR + "validation/",
            to_dir = DOTA_MVP_DIR + "validation/"
        )


## Copy whole library to /data/processed/

In [None]:
TARGET_DIR = "../data/processed/DOTA_MVP"

# Refuse to continue if a processed copy already exists.
target_already_present = os.path.exists(TARGET_DIR)
assert not target_already_present, 'data already preprocessed'

# Mirror the MVP subset into the processed area, leaving out entries that
# match the two ignore patterns.
skip_patterns = ignore_patterns('*.pyc', 'tmp*')
copytree(DOTA_MVP_DIR, TARGET_DIR, ignore=skip_patterns)

In [None]:
# Load the processed copy with the devkit's DOTA class.
# NOTE(review): `target` is not referenced again in the cells visible here — confirm it is still needed.
target = DOTA(TARGET_DIR)

## 1. Remove images having width or height less than 300. Larger images are kept; they are rescaled later, in the split step.

In [None]:
# Delete every image in the processed copy whose width or height is below
# IMAGE_DIM.
for folder_set in ["training", "validation"]:
    images_dir = TARGET_DIR + f"/{folder_set}/images/"
    for image_name in os.listdir(images_dir):
        image_path = images_dir + image_name

        # Read the size inside a `with` block so the file handle is closed
        # before the file may be deleted (an open handle leaks, and blocks
        # deletion on platforms that lock open files).
        with Image.open(image_path) as im:
            width, height = im.size

        if width < IMAGE_DIM or height < IMAGE_DIM:
            os.remove(image_path)
            print (f"image at {image_path} removed")

## 2. Split images 

In [None]:
SPLITTED_TARGET_DIR = "../data/processed/DOTA_MVP_splitted"

# Run the devkit splitter once per subset, with `subsize=IMAGE_DIM` and `gap=0`.
for folder_set in ("training", "validation"):
    source_dir = f"{TARGET_DIR}/{folder_set}"
    output_dir = f"{SPLITTED_TARGET_DIR}/{folder_set}"
    split = splitbase(
        source_dir,
        output_dir,
        gap=0,
        subsize=IMAGE_DIM,
        choosebestpoint=True,
    )
    # RESIZE_FACTOR is the only argument handed to splitdata.
    split.splitdata(RESIZE_FACTOR)

## 3. Remove images whose dimensions are not exactly 300x300

In [None]:
# File names of the split patches deleted below; the label-conversion step
# uses this list to skip the matching label files.
removed=[]

for folder_set in ["training", "validation"]:
    images_dir = SPLITTED_TARGET_DIR + f"/{folder_set}/images/"
    for image_name in os.listdir(images_dir):
        image_path = images_dir + image_name

        # Close the file handle before the file may be deleted.
        with Image.open(image_path) as im:
            width, height = im.size

        if width != IMAGE_DIM or height != IMAGE_DIM:
            os.remove(image_path)
            print (f"image at {image_path} removed")
            # Record only patches that were actually deleted. Appending
            # outside this branch would mark every patch as removed and make
            # the label conversion skip every label file.
            removed.append(image_name)

In [None]:
# Display the file names collected in `removed`.
removed

## 4. Convert labels to Udacity driving dataset labels format

As here:
https://raw.githubusercontent.com/udacity/self-driving-car/master/annotations/labels_crowdai.csv

In [None]:
# NOTE(review): within the visible notebook `label_name` is only assigned by
# the loop in the following cell, so this inspection cell raises NameError on
# a fresh top-to-bottom run — confirm it can be dropped.
label_name

In [None]:
# Column order of the CSV written for each subset.
ORDER = ["image_name", "xmin", "xmax", "ymin", "ymax", "class_id"]

# Stems of the patches recorded in `removed`; built once so each membership
# test is a set lookup instead of a freshly rebuilt list.
removed_stems = {x.replace(".png", "") for x in removed}

for folder_set in ["training", "validation"]:
    labels_path = SPLITTED_TARGET_DIR + f"/{folder_set}/labelTxt/"

    # One frame per label file; concatenated once after the loop.
    per_image_frames = []

    for label_name in os.listdir(labels_path):

        label_stem = label_name.replace(".txt","")
        # Patches listed in `removed` get no rows in the CSV.
        if label_stem in removed_stems:
            continue

        label_path = labels_path + label_name

        label_df = pd.read_csv(
            label_path,
            sep=" ", header=None,
            names = ["x1","y1","x2","y2","x3","y3","x4","y4","category","difficulty"]
        )

        # Remove all whose class is not plane
        label_df = label_df[label_df["category"]=="plane"]

        # Axis-aligned box that encloses the four corner points. Coordinates
        # are written as found in the label file; no rescaling by
        # RESIZE_FACTOR is applied.
        corner_xs = label_df[["x1","x2","x3","x4"]]
        corner_ys = label_df[["y1","y2","y3","y4"]]
        new_label_df = pd.DataFrame({
            "xmin": corner_xs.min(axis=1).astype(int),
            "xmax": corner_xs.max(axis=1).astype(int),
            "ymin": corner_ys.min(axis=1).astype(int),
            "ymax": corner_ys.max(axis=1).astype(int),
        })

        # Image name = label stem with every dot replaced by "_", plus ".png"
        # (e.g. 'P0032__0.25__300___0.txt' -> 'P0032__0_25__300___0.png').
        # Working on the stem also handles names with no inner dot, which a
        # positional split on "." would turn into '<stem>_txt.png'.
        new_label_df["image_name"] = label_stem.replace(".", "_") + ".png"

        # Only one class: plane
        new_label_df["class_id"]=1

        new_label_df = new_label_df[ORDER]
        if not new_label_df.empty:
            per_image_frames.append(new_label_df)

    # Fall back to a header-only frame when no plane rows were found.
    if per_image_frames:
        all_labels_df = pd.concat(per_image_frames)
    else:
        all_labels_df = pd.DataFrame(columns=ORDER)

    all_labels_df.to_csv(SPLITTED_TARGET_DIR + f"/{folder_set}_labels.csv", sep=",", index=False)
    
# TODO merge training and validation images somewhere

## Merge training and validation folders into "all_images" folder

In [None]:
def mergefolders(root_src_dir, root_dst_dir):
    """Recursively copy the tree under `root_src_dir` into `root_dst_dir`.

    Subfolders are recreated under the destination as needed. A destination
    file with the same name as a source file is deleted and replaced; other
    destination files are left alone.
    """
    for current_dir, _subdirs, filenames in os.walk(root_src_dir):
        # Swap the source-root prefix for the destination root.
        target_dir = current_dir.replace(root_src_dir, root_dst_dir, 1)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        for filename in filenames:
            existing_copy = os.path.join(target_dir, filename)
            if os.path.exists(existing_copy):
                os.remove(existing_copy)
            shutil.copy(os.path.join(current_dir, filename), target_dir)
            
# Pool the image folders of both subsets into a single "all_images" folder.
for images_subdir in ("/training/images/", "/validation/images/"):
    mergefolders(
        SPLITTED_TARGET_DIR + images_subdir,
        SPLITTED_TARGET_DIR + "/all_images/",
    )

## Rename all by removing dots in the middle
e.g. 'P0032__0.25__300___0.png' to 'P0032__0_25__300___0.png'

In [None]:
# NOTE(review): this cell only evaluates the "all_images" folder path; the
# renaming described in the heading is not implemented in the visible source.
SPLITTED_TARGET_DIR + f"/all_images/"