# Dataset Preparation
Dataset processing and augmentation tools

Created by [Artem Konevskikh](https://github.com/artem-konevskikh)

In [None]:
#@title Import libraries
#@markdown Import libraries required for image manipulation
!apt-get update > /dev/null 2>&1
!apt-get install imagemagick > /dev/null 2>&1

import glob
import os
import subprocess
from math import floor, ceil

import numpy as np
from fastai.vision import verify_images
from PIL import Image
from tqdm.notebook import tqdm



In [None]:
#@title Define functions

#@markdown Functions for image manipulations

def crop_center(img, ratio):
    """
    crop a centered square out of an image

    img: image object exposing `size` as (width, height) and `crop(box)`
    ratio: side of the square as a fraction of the image's shortest side
    returns the result of `img.crop` for the centered square box
    """
    width, height = img.size
    crop_size = ceil(ratio * min(width, height))

    # Integer offsets: halving with `/` produced *.5 coordinates whenever the
    # margin was odd, and Pillow rounds each edge of the box independently, so
    # the crop could come out one pixel larger or smaller than crop_size.
    left = (width - crop_size) // 2
    top = (height - crop_size) // 2

    # The far edges are derived from the near ones so the box is always
    # exactly crop_size x crop_size. crop() returns a new image, so no
    # defensive copy of the source is needed.
    return img.crop((left, top, left + crop_size, top + crop_size))

def save_crop(img, cnt, path, augment=False):
    """
    save an image, and optionally its flipped/rotated variants, as numbered PNGs

    img: image to save
    cnt: index used for the first file name; names are zero-padded to 8 digits
    path: output directory, with or without a trailing slash
    augment: when True also save the left-right flip, the top-bottom flip and
        the 180-degree rotation, each under the next index
    returns the next unused index
    """
    variants = [img]
    if augment:
        variants += [
            img.transpose(Image.FLIP_LEFT_RIGHT),
            img.transpose(Image.FLIP_TOP_BOTTOM),
            img.transpose(Image.ROTATE_180),
        ]

    # Every file name is built the same way. Previously the first file was
    # concatenated without a separator while the augmented ones used "/", so
    # a path without a trailing slash scattered the output.
    for variant in variants:
        variant.save(os.path.join(path, f'{cnt:08d}.png'))
        cnt += 1

    return cnt

def thumbnail(img, size=256):
    """
    resize image so smallest side will be equal to size

    img: source image; it is not modified
    size: target length in pixels of the shortest side
    returns a new image with the aspect ratio preserved; the longer side is
    rounded down to a whole pixel. Modes other than 'L' and 'RGB' are
    converted to 'RGB' first.
    """
    if img.mode not in ('L', 'RGB'):
        img = img.convert('RGB')

    width, height = img.size
    short_side = min(width, height)

    # Integer arithmetic gives the exact floor of the scaled long side (no
    # float round-off) and maps the short side to exactly `size`.
    new_size = (width * size // short_side, height * size // short_side)

    # resize() is used for every aspect ratio: Image.thumbnail() only ever
    # shrinks, so small square images used to be returned unscaled while
    # non-square ones were enlarged.
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    return img.resize(new_size, Image.LANCZOS)

def get_crop_bboxes(w, h):
    """
    calculate bounding boxes based on width and height

    A square window whose side equals the shorter image side is moved along
    the longer side in equal steps, and a final window is aligned with the
    far edge so the whole image is covered.

    w: image width in pixels
    h: image height in pixels
    returns a list of (left, top, right, bottom) tuples without duplicates
    """
    short_side, long_side = min(w, h), max(w, h)

    # The step count depends on how many squares fit along the LONG side.
    # Using w / h directly gave portrait images a single step regardless of
    # how tall they were.
    n_step = ceil(long_side / short_side)
    shift = (long_side - short_side) // n_step

    offsets = [shift * step for step in range(n_step)]
    offsets.append(long_side - short_side)  # window flush with the far edge

    boxes = []
    for offset in offsets:
        if w > h:
            box = (offset, 0, offset + short_side, short_side)
        else:
            box = (0, offset, short_side, offset + short_side)
        # Square and almost-square images produce repeated offsets; skip
        # them so the same crop is not returned (and saved) twice.
        if box not in boxes:
            boxes.append(box)
    return boxes

In [None]:
#@title Mount Google Drive
# Colab-only step: mounts the user's Google Drive at /content/drive, the
# location the default input/output directories in the cells below point to.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Cropping

In [None]:
#@title Crop center
#@markdown This cell crops the center of each image in the input directory, resizes it and saves the result to the output directory

#@markdown Input directory
input_dir = '/content/drive/MyDrive/uibk/Experiment-cropped' #@param {type:'string'}
#@markdown Output directory. Will be created if it does not exist
out_dir = '/content/drive/MyDrive/ants-exp' #@param {type:'string'}
#@markdown Resize to
resize = "512" #@param [256, 512, 1024] {allow-input: true}
resize = int(resize)
#@markdown Crop size ratio. The size of the cropped square will be equal to `ratio * min side size`
ratio = 1 #@param {type:"slider", min:0.1, max:1, step:0.05}
#@markdown Augment dataset with flips and 180-degree rotation
augment=False #@param {type:'boolean'}

# Normalise to a path that ends with a separator and make sure it exists.
out_dir = os.path.join(out_dir, '')
os.makedirs(out_dir, exist_ok=True)

img_ext = ['tif', 'tiff', 'TIF', 'TIFF', 'png', 'PNG', 'jpg', 'JPG', 'jpeg', 'JPEG', 'bmp', 'BMP']
# Extensions are matched case-insensitively (so e.g. ".Jpg" is picked up too),
# glob metacharacters in the directory name are escaped, and the list is
# sorted so the output numbering is the same on every run.
allowed_ext = {ext.lower() for ext in img_ext}
images = sorted(
    path for path in glob.glob(os.path.join(glob.escape(input_dir), '*'))
    if os.path.splitext(path)[1][1:].lower() in allowed_ext
)

cnt = 0
print(f'Cropping {len(images)} images\n')
for image in tqdm(images):
    # The context manager closes the source file once its pixels are read.
    with Image.open(image) as img:
        cropped = crop_center(img, ratio)
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = cropped.resize((resize, resize), Image.LANCZOS)
    cnt = save_crop(resized, cnt, out_dir, augment)
print(f'Number of images saved: {cnt}')

In [None]:
#@title Slice images

#@markdown This cell resizes each image in the input directory, slices it into several square images and saves the result to the output directory

#@markdown Input directory
input_dir = "/content/drive/MyDrive/scraped-images/Lichen" #@param {type:'string'}
#@markdown Output directory. Will be created if it does not exist
out_dir = '/content/drive/MyDrive/lichen-sliced' #@param {type:'string'}
#@markdown Resize to
resize = "1024" #@param [256, 512, 1024] {allow-input: true}
resize = int(resize)
#@markdown Augment dataset with flips and 180-degree rotation
augment=True #@param {type:'boolean'}

# Normalise to a path that ends with a separator and make sure it exists.
out_dir = os.path.join(out_dir, '')
os.makedirs(out_dir, exist_ok=True)

img_ext = ['tif', 'tiff', 'TIF', 'TIFF', 'png', 'PNG', 'jpg', 'JPG', 'jpeg', 'JPEG', 'bmp', 'BMP']
# Extensions are matched case-insensitively (so e.g. ".Jpg" is picked up too),
# glob metacharacters in the directory name are escaped, and the list is
# sorted so the output numbering is the same on every run.
allowed_ext = {ext.lower() for ext in img_ext}
images = sorted(
    path for path in glob.glob(os.path.join(glob.escape(input_dir), '*'))
    if os.path.splitext(path)[1][1:].lower() in allowed_ext
)

cnt = 0
print(f'Cropping {len(images)} images\n')
for image in tqdm(images):
    # The context manager closes the source file; thumbnail() returns an
    # independent image, so the slices can be taken after the file is closed.
    with Image.open(image) as src:
        img = thumbnail(src, size=resize)
    w, h = img.size
    for bbox in get_crop_bboxes(w, h):
        cnt = save_crop(img.crop(bbox), cnt, out_dir, augment)
print(f'Number of images saved: {cnt}')

## Enhancing

In [None]:
#@title Auto-enhance images

#@markdown This cell runs ImageMagick `convert` with the selected options on each image in the input directory and saves the result to the output directory

#@markdown Input directory
input_dir = '/content/drive/MyDrive/lichen-sliced' #@param {type:'string'}
#@markdown Output directory. Will be created if it does not exist
out_dir = '/content/drive/MyDrive/lichen-sliced-level-gamma' #@param {type:'string'}
#@markdown Enhancing options
auto_level = False #@param {type:"boolean"}
auto_gamma = False #@param {type:"boolean"}
invert = False #@param {type:"boolean"}

# Options are kept as a list so each one reaches `convert` as its own argument.
options = []
if auto_level:
    options.append('-auto-level')
if auto_gamma:
    options.append('-auto-gamma')
if invert:
    options.append('-negate')

# Output file-name prefix built from the enabled options, e.g.
# 'level-gamma-negate'; it is empty when no option is enabled. It does not
# depend on the image, so it is computed once.
prefix = '-'.join(opt.replace('-auto', '').lstrip('-') for opt in options)

# Normalise to a path that ends with a separator and make sure it exists.
out_dir = os.path.join(out_dir, '')
os.makedirs(out_dir, exist_ok=True)

img_ext = ['tif', 'tiff', 'TIF', 'TIFF', 'png', 'PNG', 'jpg', 'JPG', 'jpeg', 'JPEG', 'bmp', 'BMP']
# Extensions are matched case-insensitively, glob metacharacters in the
# directory name are escaped, and the list is sorted for a stable order.
allowed_ext = {ext.lower() for ext in img_ext}
images = sorted(
    path for path in glob.glob(os.path.join(glob.escape(input_dir), '*'))
    if os.path.splitext(path)[1][1:].lower() in allowed_ext
)

failed = 0
print(f'Enhancing {len(images)} images\n')
for image in tqdm(images):
    outfile = f"{out_dir}{prefix}-{os.path.basename(image)}"
    # Arguments are passed as a list, not through a shell line, so paths
    # containing spaces or shell metacharacters are handled correctly.
    result = subprocess.run(
        ['convert', image, *options, outfile],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        failed += 1
        print(f'convert failed for {image}: {result.stderr.strip()}')
print(f'Number of images enhanced: {len(images) - failed}')

## Verifying

In [None]:
#@title Verify images
#@markdown This cell verifies the images in the input directory. Readable images are converted to RGB, resized to the defined size and saved to the output directory; unreadable ones are skipped and reported

#@markdown Input directory
input_dir = '/content/drive/MyDrive/lichen-sliced' #@param {type:'string'}
#@markdown Output directory. Will be created if it does not exist
out_dir = '/content/drive/MyDrive/lichen-dataset-256' #@param {type:'string'}
#@markdown Resize to
resize = "512" #@param [256, 512, 1024] {allow-input: true}
resize = int(resize)

# Normalise to a path that ends with a separator and make sure it exists.
out_dir = os.path.join(out_dir, '')
os.makedirs(out_dir, exist_ok=True)

img_ext = ['tif', 'tiff', 'TIF', 'TIFF', 'png', 'PNG', 'jpg', 'JPG', 'jpeg', 'JPEG', 'bmp', 'BMP']
# Extensions are matched case-insensitively, glob metacharacters in the
# directory name are escaped, and the list is sorted so the output numbering
# is the same on every run.
allowed_ext = {ext.lower() for ext in img_ext}
images = sorted(
    path for path in glob.glob(os.path.join(glob.escape(input_dir), '*'))
    if os.path.splitext(path)[1][1:].lower() in allowed_ext
)

cnt = 0
skipped = []
print(f'Verifying {len(images)} images\n')
for image in tqdm(images):
    try:
        # The context manager closes the source file once it has been decoded.
        with Image.open(image) as src:
            if src.mode in ('RGBA', 'LA') or (src.mode == 'P' and 'transparency' in src.info):
                # Flatten transparency onto a white background.
                rgba = src.convert('RGBA')
                img = Image.new('RGB', rgba.size, (255, 255, 255))
                img.paste(rgba, (0, 0), rgba)
            else:
                # Let Pillow do the mode conversion: palette images get their
                # real colours instead of their palette indices, and CMYK or
                # other modes become RGB instead of being passed through.
                img = src.convert('RGB')
    except (OSError, SyntaxError, ValueError) as err:
        # A file that cannot be decoded fails verification; record it and
        # carry on with the remaining images.
        skipped.append(image)
        print(f'Skipping unreadable image {image}: {err}')
        continue

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    img = img.resize((resize, resize), Image.LANCZOS)
    img.save(f'{out_dir}{cnt:08d}.png')
    cnt += 1
print(f'Number of images saved: {cnt}')
print(f'Number of images skipped: {len(skipped)}')
    