<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_cropping/chiroptera/chiroptera_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-processing and image augmentation for object detection model training and testing datasets
---
*Last Updated 4 Oct 2022*   
An [EOL user generated cropping dataset](https://editors.eol.org/other_files/EOL_v2_files/image_crops_withEOL_pk.txt.zip) is pre-processed and transformed to formatting standards for use with YOLO via Darkflow and SSD and Faster-RCNN object detection models implemented in Tensorflow. All train and test images are also downloaded to Google Drive for use training and testing.

Before reformatting to object detection model standards, training data is augmented using the [imgaug library](https://github.com/aleju/imgaug). Image augmentation is used to increase training data sample size and diversity to reduce overfitting when training object detection models. Both images and cropping coordinates are augmented. Augmented and original training datasets are then combined before being transformed to object detection model formatting standards.

Notes:   
* Run code blocks by pressing play button in brackets on left
* Before you start: change the runtime to "GPU" with "High RAM"
* Change parameters using form fields on right (find details at corresponding lines of code by searching '#@param')

## Installs & Imports
---

In [None]:
#@title Choose where to save results & set up directory structure
# Use dropdown menu on right
save = "in Colab runtime (files deleted after each session)" #@param ["in my Google Drive", "in Colab runtime (files deleted after each session)"]
print("Saving results ", save)

# Mount google drive to export image cropping coordinate file(s)
if 'Google Drive' in save:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

# Type in the path to your working directory in form field to right
import os
basewd = "/content/drive/MyDrive/train/tf2" #@param ["/content/drive/MyDrive/train/tf2"] {allow-input: true}
# exist_ok=True makes the call a no-op when the folder is already there,
# so this cell can be re-run safely without a separate existence check
os.makedirs(basewd, exist_ok=True)

# Enter taxon of interest in form field
taxon = "Chiroptera" #@param ["Chiroptera"] {allow-input: true}

# Folder where pre-processing results will be saved
preprocessing_folder = "pre-processing" #@param ["pre-processing"] {allow-input: true}
cwd = basewd + '/' + preprocessing_folder
print("\nWorking directory set to: \n", cwd)

# Folder where train images will be saved
# (creating it also creates the pre-processing folder, its parent)
train_folder = "images" #@param ["images"] {allow-input: true}
train_wd = cwd + '/' + train_folder
os.makedirs(train_wd, exist_ok=True)
print("\nTraining images directory set to: \n", train_wd)

# Folder where test images will be saved
test_folder = "test_images" #@param ["test_images"] {allow-input: true}
test_wd = cwd + '/' + test_folder
os.makedirs(test_wd, exist_ok=True)
print("\nTesting images directory set to: \n", test_wd)

In [None]:
# Install libraries for augmenting and displaying images
!pip install imgaug
!pip install pillow
!pip install scipy==1.1.0

# For importing/exporting files, working with arrays, etc
import pathlib
import os
import imageio
import time
import csv
import numpy as np
import pandas as pd
from urllib.request import urlopen
from scipy import misc
from scipy.misc import imread
from PIL import Image
# Set number of seconds to timeout if image url taking too long to open
import socket
socket.setdefaulttimeout(10)

# For augmenting the images and bounding boxes
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

# For drawing onto and plotting the images
import matplotlib.pyplot as plt
import cv2
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

## Build train and test datasets from EOL user-generated cropping data
---
Full cropping dataset is available [here](https://editors.eol.org/other_files/EOL_v2_files/image_crops_withEOL_pk.txt.zip).

In [None]:
#@title Define functions

# Suppress pandas warning about writing over a copy of data
# (applies to the whole session, not only to the functions defined in this cell)
pd.options.mode.chained_assignment = None  # default='warn'

# To read in EOL formatted data files
def read_datafile(fpath, sep="\t", header=0, disp_head=True):
    """Read a delimited EOL data file into a pandas DataFrame.

    Args:
        fpath: Local path or URL of the file to read.
        sep: Field delimiter (tab by default).
        header: Row number that holds the column names.
        disp_head: If True, print the first rows of the loaded data.

    Returns:
        The file contents as a pandas DataFrame.

    Raises:
        FileNotFoundError: If fpath does not exist.
    """
    # Browser-like request headers, only meaningful when fetching a remote file
    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
        }
    # pandas rejects storage_options for plain local paths, so only pass the
    # headers along when fpath looks like a URL (scheme://...)
    read_kwargs = {"storage_options": hdr} if "://" in str(fpath) else {}
    try:
        df = pd.read_csv(fpath, sep=sep, header=header, **read_kwargs)
    except FileNotFoundError as e:
        raise FileNotFoundError("File not found: Enter the path to your file in form field and re-run") from e

    if disp_head:
        print("Data header: \n", df.head())

    return df

# Reformat cropping dimensions
def reformat_crops(crops, disp_head=True):
    """Split the crop_dimensions string column into separate box columns.

    The frame is modified in place (and also returned): im_height, im_width,
    xmin, ymin, xmax and ymax are added and crop_dimensions is dropped.

    Args:
        crops: DataFrame with a string column named crop_dimensions.
        disp_head: If True, print the first rows of the reformatted data.

    Returns:
        The same DataFrame, reformatted.
    """
    # Remove/replace characters in crop_dimensions string
    # Assign the result back explicitly: an inplace replace on a column reached
    # through attribute access is not guaranteed to modify the parent frame
    dims = crops["crop_dimensions"].replace(r'["{}]', '', regex=True)
    dims = dims.replace(':', ',', regex=True)
    crops["crop_dimensions"] = dims

    # Split crop_dimensions into their own columns
    # (after the replacements every odd position holds a value)
    cols = dims.str.split(",", expand=True)
    crops["im_height"] = cols[1]
    crops["im_width"] = cols[3]
    crops["xmin"] = cols[5]
    crops["ymin"] = cols[7]
    crop_size = cols[9].astype(float) # crops are square so width=height
    crops["xmax"] = cols[5].astype(float) + crop_size # add cropwidth to xmin
    crops["ymax"] = cols[7].astype(float) + crop_size # add cropheight to ymin

    # Remove crop_dimensions column
    crops.drop(columns=["crop_dimensions"], inplace=True)
    if disp_head:
        print("\n~~~Reformatted EOL crops head~~~\n", crops.head())

    return crops

# Filter by taxon of interest
# NOTE: this module-level name shadows Python's builtin filter(); it is used
# below as a default argument value and as a prefix for output file names
filter = taxon # defined in first code block
def filter_by_taxon(crops, filter=filter, disp_head=False):
    """Keep only the rows whose ancestry mentions the taxon of interest.

    Args:
        crops: DataFrame with an ancestry string column.
        filter: Taxon name (treated as a case-insensitive pattern) to look
            for in ancestry; also written to the new name column.
        disp_head: If True, print the first rows of the filtered data.

    Returns:
        New DataFrame without the ancestry column, with a name column holding
        the filter value, and with the original index kept as an index column.
    """
    # Rows with a missing ancestry never match (na=False)
    matches = crops.ancestry.str.contains(filter, case=False, na=False)
    # Build a new frame rather than editing a slice of crops in place
    taxon = crops.loc[matches].drop(columns=["ancestry"])
    taxon['name'] = filter
    taxon = taxon.reset_index()
    if disp_head:
        print("Showing dataset for only {}: {}\n".format(filter, taxon.head()))
    print("\n~~~Number of available cropping coordinates for training/testing with {}~~~: \n{}\n".format(filter, len(taxon)))

    return taxon

# Split into train and test datasets
def split_train_test(crops, outfpath, frac, disp_head=False):
    """Randomly split crops into train and test sets and save both as tsv.

    Args:
        crops: DataFrame of cropping data to split.
        outfpath: Path whose base name (extension removed) is used for the
            outputs, written as <base>_train.tsv and <base>_test.tsv.
        frac: Fraction of rows (0-1) to use for training; the rest is test.
        disp_head: If True, print the first rows of each split.

    Returns:
        Tuple (train, test) of DataFrames.
    """
    # Randomly select frac of the data for training (fixed random_state for reproducibility)
    train = crops.sample(frac=frac, random_state=2)
    if disp_head:
        print("Training data for {} (n={} crops): \n{}".format(filter, len(train), train.head()))

    # Select the remaining data for testing
    # Dropping by index label works for any index, not only a 0..n-1 range
    test = crops.drop(index=train.index)
    if disp_head:
        print("Testing data for {} (n={} crops): \n{}".format(filter, len(test), test.head()))

    # Write test and train to tsvs
    base = os.path.splitext(outfpath)[0]
    train_outfpath = base + '_train' + '.tsv'
    train.to_csv(train_outfpath, sep='\t', header=True, index=False)
    test_outfpath = base + '_test' + '.tsv'
    test.to_csv(test_outfpath, sep='\t', header=True, index=False)
    print("\n Train and test datasets sucessfully split and saved to: \n\n{}\n{}"\
          .format(train_outfpath, test_outfpath))

    return train, test

In [None]:
#@title Filter EOL cropping coordinates for taxon of interest and reformat to Pascal VOC Annotation Style

# Download EOL user generated cropping file to temporary runtime location
print("Downloading EOL user-generated cropping dataset\n")
!wget --user-agent="Mozilla" https://editors.eol.org/other_files/EOL_v2_files/image_crops_withEOL_pk.txt.zip

# Unzip cropping file to your working directory
!unzip /content/image_crops_withEOL_pk.txt.zip -d $basewd

# Change to your training directory within Google Drive
%cd $basewd
!mv image_crops_withEOL_pk.txt $preprocessing_folder
%cd $cwd 

# Read in user-generated image cropping file
fpath = cwd + '/image_crops_withEOL_pk.txt'
df = read_datafile(fpath, disp_head=False)

# Reformat cropping dimensions
# (crop_dimensions is split into im_height, im_width, xmin, ymin, xmax, ymax)
reformatted = reformat_crops(df, disp_head=True)

# Filter by taxon of interest (Chiroptera)
filtered = filter_by_taxon(reformatted, disp_head=False)

# Export Chiroptera crops as tsv
# Relative file name: it resolves against the directory set by the %cd above
outfpath = filter + '_crops.tsv'
filtered.to_csv(outfpath, sep='\t', index=False)

# Split into train (80%) and test (20%) datasets
# The two splits are saved next to outfpath with _train / _test suffixes
train, test = split_train_test(filtered, outfpath, 0.8, disp_head=False)

## Pre-process train dataset
---

In [None]:
#@title Define functions
%cd $cwd

# So URL's don't get truncated & show all cols in display
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_columns', None)

# Set seed to make augmentation reproducible across runs, otherwise will be random each time
ia.seed(1) 

# Folder where train images will be saved (defined in first code block)
# NOTE: augment_image below takes this value as its default `folder` argument,
# which is fixed when the function is defined
folder = train_folder

# Define start and stop indices of the cropping-file rows to process
def set_start_stop(run, subset_size=5, max_start=1000):
    """Choose which slice of the cropping data to process.

    Args:
        run: Run mode text from the form field. If it contains
            "tiny subset", a small random window of rows is selected;
            otherwise all rows are used.
        subset_size: Number of consecutive rows in the tiny subset.
        max_start: Exclusive upper bound for the random start position.
            NOTE: a start beyond the end of the data gives an empty slice.

    Returns:
        Tuple (start, stop) suitable for crops.iloc[start:stop]; both are
        None when all rows should be processed.
    """
    # To test with a tiny subset, use a few consecutive rows from a random position
    if "tiny subset" in run:
        start = np.random.choice(a=max_start, size=1)[0]
        stop = start + subset_size
    # To run for all images
    else:
        start = None
        stop = None

    return start, stop

# To display an image already loaded into the runtime
def display_image(image):
    """Show image on a new 20 x 15 matplotlib figure with the grid turned off."""
    plt.figure(figsize=(20, 15))
    plt.grid(False)
    plt.imshow(image)

# To draw cropping coordinates on an image
def draw_boxes(image, box, class_name):
    """Return a copy of image with one cropping box drawn on it.

    Args:
        image: Image array to annotate; it is left unmodified.
        box: Box corners as [xmin, ymin, xmax, ymax] in pixels.
        class_name: Not used for drawing; kept so existing calls still work.

    Returns:
        New image array with the box outlined.
    """
    # cv2.rectangle draws into the array it is given, so work on a copy to
    # keep the box out of the image the caller may still save to disk
    canvas = image.copy()
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    image_wboxes = cv2.rectangle(canvas, top_left, bottom_right,
                                 (255, 0, 157), 3) # change box color and thickness

    return image_wboxes

# Define image augmentation pipeline
# modified from https://github.com/aleju/imgaug
# The augmenters are applied to each image and its bounding box together
seq = iaa.Sequential([
    iaa.Crop(px=(1, 16), keep_size=False), # crop by 1-16px; NOTE(review): with keep_size=False the result presumably stays at the smaller cropped size instead of being resized to orig dims - confirm in imgaug docs
    iaa.Affine(rotate=(-25, 25)), # rotate -25 to 25 degrees
    iaa.GaussianBlur(sigma=(0, 3.0)), # blur using gaussian kernel with sigma of 0-3
    iaa.AddToHueAndSaturation((-50, 50), per_channel=True) # shift hue/saturation by -50 to 50
])

# To append one row of image/crop info to a tsv file
def _append_tsv_row(tsv_path, row):
    with open(tsv_path, 'a') as out_file:
        csv.writer(out_file, delimiter='\t').writerow(row)

# To augment an image
def augment_image(image, crops, filter=filter, folder=folder):
    """Augment one image with its cropping box and log both versions to tsv.

    Besides its arguments, this function reads notebook-level globals that the
    calling cell must set: i (label of the current row in crops), outfpath
    (tsv file rows are appended to), seq (imgaug pipeline), display_results
    and url (only used in the plot title).

    Args:
        image: Image array for row i of crops.
        crops: DataFrame with data_object_id, obj_url, xmin, ymin, xmax, ymax.
        filter: Class name written to the tsv.
        folder: Folder name used to build the image paths written to the tsv.

    Returns:
        Tuple (image_aug, fpath_aug): the augmented image (without any box
        drawn on it) and the path it should be saved to.
    """
    pathbase = folder + '/'
    class_name = filter
    obj_id = crops['data_object_id'][i]
    obj_url = crops['obj_url'][i]

    # Define image info needed for export
    im_h, im_w = image.shape[:2]
    box = [int(crops[col][i]) for col in ('xmin', 'ymin', 'xmax', 'ymax')]
    fn = str(obj_id) + '.jpg'
    fpath = pathbase + fn

    # Export unaugmented image info for future use training object detectors
    _append_tsv_row(outfpath, [obj_id, obj_url, im_h, im_w, box[0], box[1],
                               box[2], box[3], fn, fpath, class_name])

    # Load original bounding box coordinates to imgaug format
    bb = BoundingBox(x1=box[0], y1=box[1], x2=box[2], y2=box[3])
    bb = BoundingBoxesOnImage([bb], shape=image.shape)

    # Augment image using settings defined above in seq
    image_aug, bb_aug = seq.augment(image=image, bounding_boxes=bb)

    # Define augmentation results needed for export
    fn_aug = str(obj_id) + '_aug' + '.jpg'
    fpath_aug = pathbase + fn_aug
    im_h_aug, im_w_aug = image_aug.shape[:2]
    bb_out = bb_aug.bounding_boxes[0]
    # int() handles both numpy and plain Python numbers
    box_aug = [int(bb_out.x1), int(bb_out.y1), int(bb_out.x2), int(bb_out.y2)]

    # Export augmentation results for future use training object detectors
    _append_tsv_row(outfpath, [obj_id, obj_url, im_h_aug, im_w_aug, box_aug[0],
                               box_aug[1], box_aug[2], box_aug[3], fn_aug,
                               fpath_aug, class_name])

    # Draw augmented bounding box and image
    # Only use for up to 50 images
    if display_results:
        # Draw on a copy: the returned image_aug is saved as training data and
        # must not have the box drawn into it
        image_wboxes = draw_boxes(image_aug.copy(), box_aug, class_name)
        display_image(image_wboxes)
        plt.title('{}) Successfully augmented image from {}'.format(format(i+1, '.0f'), url))

    return image_aug, fpath_aug

# Remove out of bounds values
def remove_oob(crops):
    """Clamp cropping boxes to the image and keep only the training columns.

    Args:
        crops: DataFrame with xmin, ymin, xmax, ymax, filename, im_width,
            im_height and class columns. It is not modified.

    Returns:
        New DataFrame with columns xmin, ymin, xmax, ymax, filename,
        im_width, im_height, class and with out of bounds values replaced.
    """
    # Work on a copy and use whole-column operations instead of chained
    # assignment, which is not guaranteed to write back to the frame
    crops = crops.copy()

    # Set negative values to 0
    crops["xmin"] = crops["xmin"].clip(lower=0)
    crops["ymin"] = crops["ymin"].clip(lower=0)

    # Remove out of bounds cropping dimensions
    ## When the box extends below the image, make it span the full image height
    too_tall = crops["ymax"] > crops["im_height"]
    crops["ymin"] = crops["ymin"].mask(too_tall, 0)
    crops["ymax"] = crops["ymax"].mask(too_tall, crops["im_height"])
    ## When the box extends right of the image, make it span the full image width
    too_wide = crops["xmax"] > crops["im_width"]
    crops["xmin"] = crops["xmin"].mask(too_wide, 0)
    crops["xmax"] = crops["xmax"].mask(too_wide, crops["im_width"])

    # Keep the columns needed for training and annotations by Tensorflow and YOLO
    crops_oobrem = crops[['xmin', 'ymin', 'xmax', 'ymax',
                          'filename', 'im_width', 'im_height', 'class']]

    return crops_oobrem

# Start crops_train_aug.tsv with its column names before any crop rows are added
outfpath = cwd + '/' + filter + '_crops_train_aug.tsv'
# An existing file already has its header, so leave it untouched
if not os.path.isfile(outfpath):
    with open(outfpath, 'a') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow((
            "data_object_id", "obj_url", "im_height", "im_width",
            "xmin", "ymin", "xmax", "ymax",
            "filename", "path", "class",
        ))

In [None]:
#@title Augment training images & save them to Google Drive

# Read in EOL user generated cropping data
fpath = filter + "_crops_train.tsv"
crops = read_datafile(fpath, disp_head=False)

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]

# Display detection results on images
display_results = True #@param {type:"boolean"}

# Download images, augment them, and save to Google Drive
print("Downloading and augmenting training images")
start, stop = set_start_stop(run)

# NOTE: i and url are read as globals by augment_image, keep these names
for i, row in crops.iloc[start:stop].iterrows():
    # Look up the url before the try block so the error message can always report it
    url = row["obj_url"]
    try:
        # Load image from url
        # Note: Use imread instead of imageio.imread to load images from url and get consistent output type and shape
        with urlopen(url) as file:
            image = imread(file, mode='RGB')

        # Augment the image and bounding box
        image_aug, fpath_aug = augment_image(image, crops)

        # Save image to Google Drive
        imageio.imwrite(fpath_aug, image_aug)

        # Save unaugmented image to Google Drive
        # Remove only the last "_aug" (the file name suffix) so that folder
        # names containing "_aug" are left intact
        fpath = "".join(fpath_aug.rsplit("_aug", 1))
        imageio.imwrite(fpath, image)

        # Display message to track augmentation process by image
        print('{}) Successfully downloaded & augmented image from {}'.format(format(i+1, '.0f'), url))

    # Catch Exception rather than everything, so interrupting the cell still
    # stops the loop, and report the actual cause alongside the hint
    except Exception as e:
        print('{}) Error: check if web address for image from {} is valid'.format(format(i+1, '.0f'), url))
        print('    ({}: {})'.format(type(e).__name__, e))

# Remove out of bounds values
aug_crops = read_datafile(outfpath, disp_head=False)
crops_oobrem = remove_oob(aug_crops)

# Save results for use training object detectors
# Use a separate name so outfpath keeps pointing at the tsv that
# augment_image appends to if this cell is run again
fin_outfpath = os.path.splitext(outfpath)[0] + '_oob_rem_fin.csv'
crops_oobrem.to_csv(fin_outfpath, sep=',', index=False)

## Pre-process test dataset
---


In [None]:
#@title Define functions
%cd $cwd

# Folder where test images will be saved (defined in first code block)
folder = test_folder

# Get info from EOL user generated cropping file
def get_image_info(image, crops, folder=folder, filter=filter):
    """Append the crop info of the current test image to the test tsv.

    Reads the notebook-level globals i (label of the current row in crops)
    and outfpath (tsv file the row is appended to).

    Returns the path the image should be saved to (folder + '/' + file name).
    """
    # Image size and cropping box of row i
    height, width = image.shape[:2]
    coords = [crops[col][i].astype(int) for col in ('xmin', 'ymin', 'xmax', 'ymax')]

    # File name is the EOL data object id with a .jpg extension
    fn = str(crops['data_object_id'][i]) + '.jpg'
    fpath = folder + '/' + fn

    # Export to crops_test.tsv
    with open(outfpath, 'a') as out_file:
        record = [crops.data_object_id[i], crops.obj_url[i], height, width]
        record.extend(coords)
        record.extend([fn, fpath, filter])
        csv.writer(out_file, delimiter='\t').writerow(record)

    return fpath

# Start crops_test_notaug.tsv with its column names before any crop rows are added
fpath = cwd + "/" + filter + "_crops_test.tsv"
outfpath = os.path.splitext(fpath)[0] + '_notaug.tsv'
# An existing file already has its header, so leave it untouched
if not os.path.isfile(outfpath):
    with open(outfpath, 'a') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow((
            "data_object_id", "obj_url", "im_height", "im_width",
            "xmin", "ymin", "xmax", "ymax",
            "filename", "path", "class",
        ))

In [None]:
#@title Save test images to Google Drive

# Read in EOL user generated cropping data
crops = read_datafile(fpath, disp_head=False)

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]

# Display detection results on images
display_results = True #@param {type:"boolean"}

# Loop through crop test data
print("Downloading testing images")
start, stop = set_start_stop(run)

# NOTE: i is read as a global by get_image_info, keep this name
for i, row in crops.iloc[start:stop].iterrows():
    # Look up the url before the try block so the error message can always report it
    url = row["obj_url"]
    try:
        # Load image from url
        # Use imread instead of imageio.imread to load images from url and get consistent output type and shape
        with urlopen(url) as file:
            image = imread(file, mode='RGB')

        # Define variables needed in exported dataset
        # Use a separate name so fpath keeps pointing at the test tsv that is
        # read at the top of this cell if the cell is run again
        im_fpath = get_image_info(image, crops)

        # Save image to Google Drive
        imageio.imwrite(im_fpath, image)

        # Display message to track download process by image
        print('{}) Successfully downloaded image from {}'.format(format(i+1, '.0f'), url))

    # Catch Exception rather than everything, so interrupting the cell still
    # stops the loop, and report the actual cause alongside the hint
    except Exception as e:
        print('{}) Error: check if web address for image from {} is valid'.format(format(i+1, '.0f'), url))
        print('    ({}: {})'.format(type(e).__name__, e))

# Remove out of bounds values
crops = read_datafile(outfpath, disp_head=False)
crops_oobrem = remove_oob(crops)

# Save results for use training object detectors
# Use a separate name so outfpath keeps pointing at the tsv that
# get_image_info appends to if this cell is run again
fin_outfpath = os.path.splitext(outfpath)[0] + '_oob_rem_fin.csv'
crops_oobrem.to_csv(fin_outfpath, sep=',', index=False)

## Inspect pre-processed crops on images
---
If needed, adjust "iaa.Sequential" augmentation parameters and/or "remove_oob" transformations above and re-visualize until desired results are achieved.

In [None]:
#@title Define functions and specify which dataset to visualize (train or test)
%cd $cwd
import cv2
from scipy.misc import imread

# Read in cropping file for displaying results
dataset = "train" #@param ["train", "test"] {allow-input: true}
pathbase = filter + '_crops_'
# Use the image folders chosen in the first code block instead of their
# default names, so renamed folders are still found
if dataset == "test":
    dataset = dataset + "_notaug"
    im_path = test_folder
else:
    dataset = dataset + "_aug"
    im_path = train_folder
# File written at the end of the corresponding pre-processing cell
outfpath = pathbase + dataset + '_oob_rem_fin.csv'
df = read_datafile(outfpath, sep=',', disp_head=True)

# Draw cropping box on image
def draw_box_on_image(df, img):
    """Draw the class label and cropping box of row i of df onto img.

    Reads the notebook-level global i (label of the row to draw).
    Returns the annotated image and the box as [xmin, ymin, xmax, ymax].
    """
    # Box corners of the current row
    box = [df[col][i].astype(int) for col in ('xmin', 'ymin', 'xmax', 'ymax')]
    left, top, right, bottom = box

    # Label size grows with the longest image side; one color for box and text
    scale = max(df['im_height'][i], df['im_width'][i]) / 600
    color = (255, 0, 157)

    # Write the class name just inside the bottom-left corner of the box
    cv2.putText(img, df['class'][i], (left+7, bottom-12), cv2.FONT_HERSHEY_SIMPLEX, scale, color, 2, cv2.LINE_AA)

    # Outline the box
    annotated = cv2.rectangle(img, (left, bottom), (right, top), color, 5)

    return annotated, box

In [None]:
#@title Choose starting index for crops to display

# Adjust line to right to see up to 50 images displayed at a time
start = 0 #@param {type:"slider", min:0, max:5000, step:50}
stop = start+50

# Loop through images
# NOTE: i is read as a global by draw_box_on_image, keep this name
for i, row in df.iloc[start:stop].iterrows():
    # Read in image
    fn = df['filename'][i]
    fpath = im_path + '/' + fn
    img = imread(fpath, mode='RGB')

    # Draw bounding box on image
    image_wbox, box = draw_box_on_image(df, img)

    # Plot cropping box on image
    _, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(image_wbox)

    # Display image path and coordinates above image
    # (df has no url column; a url left over from an earlier download loop
    # would belong to a different image)
    plt.title('{} \n xmin: {}, ymin: {}, xmax: {}, ymax: {}'.format(fpath, box[0], box[1], box[2], box[3]))