# Data Augmentation

In [67]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
ALL = True

## $I/$ Access to Dataset

In [69]:
from config import *
import os
import numpy as np

In [70]:
individuals = os.listdir(DATASET_PATH)
if not ALL:
    individuals = [np.random.choice(individuals)]

print(len(individuals), " individuals")
if not ALL:
    print(individuals)

346  individuals


In [71]:
files = []
for individual in individuals:
    files += os.listdir(DATASET_PATH + individual)

print(len(files), " files")
if not ALL:
    print(files)

44570  files


In [72]:
csvs = []
pictures = []
for file in files:
    csvs.append(file) if file.split('.')[-1] == "csv" else pictures.append(file)

if len(csvs) != len(individuals):
    print("Mismatch between number of individuals and number of csv")

print(len(csvs), " csv")
if not ALL:
    print(csvs)
print(len(pictures), "pictures")
if not ALL:
    print(pictures)

Mismatch between number of individuals and number of csv
344  csv
44226 pictures


## $II/$ Resize and Padding

In [73]:
import json
from tqdm import tqdm
import pandas as pd
import imageio
import imgaug as ia
from imgaug import augmenters as iaa 
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
from utils import orginalname_to_cropname

In [74]:
resize = iaa.Resize({"longer-side": CROP_SIZE, "shorter-side": "keep-aspect-ratio"})
padding = iaa.PadToSquare(pad_mode="constant", pad_cval=0, position="left-bottom")

trainfile = open("train.txt", 'w')

for csv in tqdm(csvs):

    individual = csv.split('_')[0]
    if individual not in individuals:
        print("Individual from csv unknown")

    path = DATASET_PATH + individual + '/'
    df = pd.read_csv(path + csv)

    for picturename in tqdm(df["filename"].unique()):

        crop_picturename = orginalname_to_cropname(picturename)

        pic = imageio.imread(path + picturename)

        bbs = []

        for idx, data in df.loc[df["filename"]==picturename].iterrows():

            shape = json.loads(data["region_shape_attributes"])
            x = shape['x']
            y = shape['y']
            width = shape["width"]
            height = shape["height"]

            bbs.append(BoundingBox(x, y, x + width, y + height))

        bbs = BoundingBoxesOnImage(bbs, shape=pic.shape)

        pic, bbs = resize(image=pic, bounding_boxes=bbs)
        pic, bbs = padding(image=pic, bounding_boxes=bbs)

        imageio.imsave(path + crop_picturename, pic)

        trainfile.write(path + crop_picturename+"\n")

        annotationtxt = open(path+'.'.join(crop_picturename.split('.')[:-1])+".txt", 'w')

        for bb in bbs:
            coords = "0 "
            coords += str(((bb.x1 + bb.x2)/2)/pic.shape[1]) + ' '
            coords += str(((bb.y1 + bb.y2)/2)/pic.shape[0]) + ' '
            coords += str((bb.x2-bb.x1)/pic.shape[1]) + ' '
            coords += str((bb.y2-bb.y1)/pic.shape[0]) + "\n"
            annotationtxt.write(coords)
        
        annotationtxt.close()
        
trainfile.close()

    

100%|██████████| 4/4 [00:03<00:00,  1.12it/s]
100%|██████████| 10/10 [00:06<00:00,  1.60it/s]
100%|██████████| 10/10 [00:06<00:00,  1.43it/s]
100%|██████████| 21/21 [00:13<00:00,  1.56it/s]
100%|██████████| 7/7 [00:05<00:00,  1.31it/s]t]
100%|██████████| 32/32 [00:19<00:00,  1.64it/s]
100%|██████████| 17/17 [00:08<00:00,  2.02it/s]t]
100%|██████████| 23/23 [00:12<00:00,  1.84it/s]  
100%|██████████| 58/58 [00:28<00:00,  2.06it/s]t]
100%|██████████| 33/33 [00:15<00:00,  2.11it/s]t]
100%|██████████| 75/75 [00:32<00:00,  2.28it/s]it]
100%|██████████| 9/9 [00:05<00:00,  1.57it/s]s/it]
100%|██████████| 7/7 [00:02<00:00,  3.08it/s]s/it]
100%|██████████| 14/14 [00:08<00:00,  1.64it/s]it]
100%|██████████| 38/38 [00:20<00:00,  1.87it/s]it]
100%|██████████| 27/27 [00:12<00:00,  2.11it/s]it]
100%|██████████| 53/53 [00:29<00:00,  1.80it/s]it]
100%|██████████| 23/23 [00:15<00:00,  1.47it/s]it]
100%|██████████| 17/17 [00:10<00:00,  1.58it/s]it]
100%|██████████| 2/2 [00:00<00:00,  3.47it/s]s/it]
100%