<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_tagging/scat_footprint/scat_footprint_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Scat/Footprint Detector Training Images
---
*Last Updated 23 February 2021*   
Follow the steps below to download images from iNaturalist observation bundles to Google Drive, perform image augmentation on them, and finally move the files to the appropriate folders for use in training scat/footprint detection models.     

**Notes**
* Change filepaths or information using the form fields to the right of code blocks (also noted in code with 'TO DO')

### Connect to Google Drive
---

In [None]:
# Mount Google Drive at /content/drive to import/export files.
# force_remount=True is passed so the mount is redone even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Imports and Installs
import os
import pandas as pd

### 1) Build image bundles
---
Bundles were downloaded from iNaturalist under "Explore" for all terrestrial vertebrate taxa (no fish) with all creative commons licenses and the keywords "scat" or "footprint"

In [None]:
# Unzip iNaturalist image bundles
%cd /content/drive/My Drive/spr21/classification/scat_footprint/images
#!unzip '*.zip' -d .
#!rm ./*.zip

# Read in unzipped iNaturalist image bundles for CC0 and CC-BY exports
# For scat
num = '128689'
fn = 'observations-' + num + '.csv'
dfs = pd.read_csv(fn, sep=',', header=0, na_filter = False)
print("Total number of training images for scat: \n {}".format(len(dfs)))

# For footprint
num = '128749'
fn = 'observations-' + num + '.csv'
dff = pd.read_csv(fn, sep=',', header=0, na_filter = False)
print("Total number of training images for footprint: \n {}".format(len(dff)))

# Write combined bundles to file for each training class
#!mkdir scat
#!mkdir footprint
dfs['image_url'].to_csv('scat_imgs.txt', sep='\n', index=False, header=False)
dff['image_url'].to_csv('footprint_imgs.txt', sep='\n', index=False, header=False)

# Take 1200 images from each bundle to use for training
import os
import pandas as pd

# Bundle filenames
classes = ['scat', 'footprint']
fns = [clas + '_imgs.txt' for clas in classes] # Bulk image filenames
fns_subset = [clas + '/' + clas + '_download_subset.txt' for clas in classes] # Future n=1200 image bundle filenames
print(fns)

# Take the first 1200 images (best rated) for each class and write to csv
for num, f in enumerate(fns):
  df = pd.read_table(f, sep='\n')
  bundle = df.head(1200)
  fn = str(fns_subset[num])
  print(fn)
  print(bundle.head())
  bundle.to_csv(fn, sep='\n', index=False, header=False)

### 2) Download images to Google Drive
---
Run the cells below to install aria2 and download the scat and footprint image subsets listed in the `*_download_subset.txt` files built in step 1.

In [None]:
# Install aria2 for downloading images in parallel
# (provides the aria2c command used by the download cells below)
!apt-get install aria2

In [None]:
# Download scat images 
# The URL list scat_download_subset.txt is read from the scat/ folder and the
# images are saved into that same folder.
%cd /content/drive/My Drive/spr21/classification/scat_footprint/images
%cd scat 
# aria2c options: -x max connections per server, -s splits per download, -i input file of URLs
!aria2c -x 16 -s 1 -i "scat_download_subset.txt"
# Check how many images downloaded
# (the count is of all entries in the folder, so it includes the URL list file itself)
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# Download footprint images 
# The URL list footprint_download_subset.txt is read from the footprint/ folder and the
# images are saved into that same folder.
%cd /content/drive/My Drive/spr21/classification/scat_footprint/images
%cd footprint 
# aria2c options: -x max connections per server, -s splits per download, -i input file of URLs
!aria2c -x 16 -s 1 -i "footprint_download_subset.txt"
# Check how many images downloaded
# (the count is of all entries in the folder, so it includes the URL list file itself)
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# Move text file to image_data/bundles
# NOTE: the mkdir/mv commands are commented out, so as written this cell only changes
# the working directory. Un-comment them to move the csv/txt bundle files out of images/.
%cd /content/drive/My Drive/spr21/classification/scat_footprint
#!mkdir -p image_data/bundles
#!mv images/*.csv image_data/bundles/
#!mv images/*.txt image_data/bundles/
#!mv images/**/*.txt image_data/bundles/

### 3) Delete all downloaded non-image files
---

In [None]:
from os import listdir
from PIL import Image
Image.MAX_IMAGE_PIXELS = 95000000 # To suppress errors from Pillow about decompression bombs
import io
import os

# Scat
%cd /content/drive/My Drive/spr21/objdet/scat_footprint/images/scat
# Loop through downloaded files and delete non-images
for num, path in enumerate(listdir('./'), start=1):
  with open(path, 'rb') as f:
    try:
      img = Image.open(io.BytesIO(f.read()))
      img.verify() # verify that it is an image
      if len(str(os.path.splitext(path)[1])) < 3:
        newpath = str(num) + '.jpg' # add jpg extension to image files without exts 
      else:
        newpath = str(num) + str(os.path.splitext(path)[1]) # make sure all filenames and exts are unique 
      os.rename(path, newpath)
    except (IOError, SyntaxError) as e:
      print('Bad file:', filename)
      if '(' in filename: # rm doesn't work for files with parenthesis in name, need to manually remove
        print("Manually remove from Google Drive: {}".format(filename)) 
      else:
        !rm $filename
newstart = int(num)
print("Number of images in scat after non-image files removed: ")
!ls . | wc -l

# Footprint
%cd /content/drive/My Drive/spr21/objdet/scat_footprint/images/footprint
# Loop through downloaded files and delete non-images
for num, path in enumerate(listdir('./'), start=newstart):
  with open(path, 'rb') as f:
    try:
      img = Image.open(io.BytesIO(f.read()))
      img.verify() # verify that it is an image
      if len(str(os.path.splitext(path)[1])) < 3:
        newpath = str(num) + '.jpg' # add jpg extension to image files without exts 
      else:
        newpath = str(num) + str(os.path.splitext(path)[1]) # make sure all filenames and exts are unique 
      os.rename(path, newpath)
    except (IOError, SyntaxError) as e:
      print('Bad file:', filename)
      if '(' in filename: # rm doesn't work for files with parenthesis in name, need to manually remove
        print("Manually remove from Google Drive: {}".format(filename)) 
      else:
        !rm $filename

print("Number of images in footprint after non-image files removed: ")
!ls . | wc -l

### 4) Make number of images per class even
---

In [None]:
%cd /content/drive/My Drive/spr21/objdet/scat_footprint/images

# Randomly delete all but 600 images from scat and footprint folders
# The commands are destructive and are left commented out; un-comment to run them.
# Pipeline: list files -> shuffle (sort -R) -> skip the first 600 (tail -n +601) -> rm the rest
#!find "scat" -type f -print0 | sort -zR | tail -zn +601 | xargs -0 rm
#!find "footprint" -type f -print0 | sort -zR | tail -zn +601 | xargs -0 rm

# Report how many files remain in each class folder
print("Final number of scat images:")
!ls /content/drive/'My Drive'/spr21/objdet/scat_footprint/images/scat | wc -l
print("Final number of footprint images:")
!ls /content/drive/'My Drive'/spr21/objdet/scat_footprint/images/footprint | wc -l

### 5) Zip image folders for download to local machine and annotation with labelImg
---

In [None]:
# Recursively zip the images folder into images_fordl.zip (saved next to it in Google Drive)
!zip -r "/content/drive/My Drive/spr21/objdet/scat_footprint/images_fordl.zip" "/content/drive/My Drive/spr21/objdet/scat_footprint/images"

### 6) Upload zipped images and annotations to Google Drive and resume here
---

In [None]:
# Images
# Extract the uploaded archive images_foranns.zip into train/images
!unzip /content/drive/'My Drive'/spr21/objdet/scat_footprint/images_foranns.zip -d /content/drive/'My Drive'/train/images

In [None]:
# Delete Mac OS file from subfolders
# (removes every file matching *.DS_Store under train/images, recursively)
%cd /content/drive/'My Drive'/train/images
!find . -name "*.DS_Store" -type f -delete

In [None]:
# Annotations
# Extract the uploaded annotations.zip into the darkflow training annotations folder
!unzip /content/drive/'My Drive'/spr21/objdet/scat_footprint/annotations.zip -d /content/drive/'My Drive'/train/darkflow-master/test/training/annotations

In [None]:
# Move all xml files from subfolds to main annotations folder
# (contents of annotations/annotations/scat and annotations/annotations/footprint are
# moved up into annotations/; -v prints each move)
!mv -v /content/drive/'My Drive'/train/darkflow-master/test/training/annotations/annotations/scat/* /content/drive/'My Drive'/train/darkflow-master/test/training/annotations
!mv -v /content/drive/'My Drive'/train/darkflow-master/test/training/annotations/annotations/footprint/* /content/drive/'My Drive'/train/darkflow-master/test/training/annotations

In [None]:
# Remove empty folders from uploaded labelImg zipped files
# CAUTION: `rm -r */` deletes every sub-folder of the current directory, empty or not.
# Check that the %cd above succeeded and that the xml files were moved out first.
%cd /content/drive/'My Drive'/train/darkflow-master/test/training/annotations/
!rm -r */

In [None]:
# Delete Mac OS file from subfolders
# (removes every file matching *.DS_Store under the annotations folder, recursively)
%cd /content/drive/'My Drive'/train/darkflow-master/test/training/annotations
!find . -name "*.DS_Store" -type f -delete

In [None]:
# Count the unzipped images per class and the annotation files, to compare the totals
print("Final number of scat images:")
!ls /content/drive/'My Drive'/train/images/images/scat | wc -l
print("Final number of footprint images:")
!ls /content/drive/'My Drive'/train/images/images/footprint | wc -l

print("Final number of annotations:")
!ls /content/drive/'My Drive'/train/darkflow-master/test/training/annotations | wc -l

### 7) Prepare Train and Test image datasets
---

#### A) 80/20 split into Train/Test

In [None]:
import os
import random
import shutil

# Take subset of un-augmented images
# TO DO: Run 1x per class
base = '/content/drive/My Drive/train/images/'
imclass = "footprint/" #@param ["scat/", "footprint/"]
path = base + 'images/' + imclass
files = []
for fname in os.listdir(path):
    files.append(fname)

# Select 20% of unaugmented images to use for testing the trained model
print("Number of images in class", imclass, len(files))
subset = int(0.2*(len(files)*2))
print("20% of images in class to be used for testing:", subset)
test_imgs = random.sample(files, subset)

# Move test images to train/test_images/
test_dir = '/content/drive/My Drive/train/test_images'

mv_imgs = []
for file in test_imgs:
  name = os.path.join(path, file)
  if os.path.isfile(name):
      shutil.move(name, test_dir)
      print('successfully moved {} to {}'.format(name, test_dir))
      mv_imgs.append(name)
      #print('files already moved, un-comment out 27 to run with new images')
  else:
      print('file does not exist', name)

print("Number of images moved to test_images/:", len(mv_imgs))
print("New number of test images:")
!ls /content/drive/'My Drive'/train/test_images | wc -l

In [None]:
# Move training images to train/images/
# (whatever is left in the class sub-folders after the test split is moved up into
# train/images; -v prints each move)
!mv -v /content/drive/'My Drive'/train/images/images/footprint/* /content/drive/'My Drive'/train/images
!mv -v /content/drive/'My Drive'/train/images/images/scat/* /content/drive/'My Drive'/train/images

print("Number of train images before augmentation:")
!ls /content/drive/'My Drive'/train/images | wc -l

In [None]:
# Remove extra folders from uploaded labelImg zipped files
# CAUTION: `rm -r */` deletes every sub-folder of the current directory, empty or not.
# Check that the %cd above succeeded and that the images were moved out first.
%cd /content/drive/'My Drive'/train/images/
!rm -r */

In [None]:
# List train/images in long format, including hidden files, to inspect what is left there
!ls -a -l /content/drive/'My Drive'/train/images/

In [None]:
# Move matching 20% of test annotations to the appropriate folder
ann_dir = '/content/drive/My Drive/train/darkflow-master/test/training/annotations/'
test_dir = '/content/drive/My Drive/train/test_images/'
testann_dir = '/content/drive/My Drive/train/test_ann/'
files = os.listdir(test_dir)

# Find xml files matching test images and move to test_ann/
for file in files:
  base = os.path.splitext(os.path.basename(file))[0]
  test_xml = ann_dir + base + '.xml'
  if os.path.exists(test_xml):
    #shutil.move(test_xml, testann_dir)
    print("moved {} to test_ann".format(file))
    #print("matching xmls already moved, un-comment out line 12 to run for new anns")
  else:
    print("!!!xml missing for image {}".format(file))
    #os.remove(file)

print("Number of test images:")
!ls /content/drive/'My Drive'/train/test_images | wc -l

print("Number of test annotations:")
!ls /content/drive/'My Drive'/train/test_ann | wc -l

In [None]:
# Check that all train images have corresponding annotation
ann_dir = '/content/drive/My Drive/train/darkflow-master/test/training/annotations/'
train_dir = '/content/drive/My Drive/train/images/'
files = os.listdir(train_dir)

# Loop through train images to see if xml for each one
for file in files:
  base = os.path.splitext(os.path.basename(file))[0]
  train_xml = ann_dir + base + '.xml'
  if os.path.exists(train_xml):
    print("xml exists for {}".format(file))
  else:
    print("!!!xml missing for image {}".format(file))
    #os.remove(file)

print("Number of train images:")
!ls /content/drive/'My Drive'/train/images | wc -l

print("Number of train annotations:")
!ls /content/drive/'My Drive'/train/darkflow-master/test/training/annotations/ | wc -l

#### B) Augment images and bounding boxes
Some code modified from https://github.com/asetkn/Tutorial-Image-and-Multiple-Bounding-Boxes-Augmentation-for-Deep-Learning-in-4-Steps/blob/master/Tutorial-Image-and-Multiple-Bounding-Boxes-Augmentation-for-Deep-Learning-in-4-Steps.ipynb

In [None]:
# Change to your training directory within Google Drive
%cd /content/drive/My Drive/train

# For importing/exporting files, working with arrays, xmls, etc
import pathlib
import os
import glob
import re
import imageio
import time
import csv
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import shutil

# For augmenting the images and bounding boxes
!pip install imgaug
!pip install pillow
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

# For drawing onto and plotting the images
import matplotlib.pyplot as plt
import cv2
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

---

#### Run all steps B1-B4 below in a row 1x for Train and 1x for Test images

B1) Load in, inspect, and convert image and bounding box files to prep for augmentation and future use with Tensorflow

In [None]:
## TO DO: Run 1x for Train and Test images
imtype = "test" #@param ["train", "test"]

# Image glob pattern and annotation folder for the selected dataset
if imtype == "train":
  path, annpath = 'images/*', '/content/drive/My Drive/train/darkflow-master/test/training/annotations/'
else:
  path, annpath = 'test_images/*', '/content/drive/My Drive/train/test_ann/'

# Images
# Read every file matched by the pattern into a list of numpy arrays
images = [imageio.imread(file) for file in glob.glob(path)]

# Count total images loaded (before augmentation)
print('Total training images: {}'.format(len(images)))

# Inspect two images (the 3rd and the 8th that were loaded)
for sample_idx in (2, 7):
    ia.imshow(images[sample_idx])

# Annotations
# See 5 annotation filenames (that match image file names)
for file in glob.glob(annpath + '/*.xml')[:5]:
    print(os.path.basename(file))

# Inspect an xml file
# Notice there are multiple bounding boxes of footprints
#shutil.copy('/content/drive/My Drive/train/test_ann/638.xml', '/content/638.txt')
#ann_text = open("/content/638.txt", "r")
#print(ann_text.read())
#ann_text.close()

In [None]:
# Function that will extract column data for our CSV file
# From https://github.com/datitran/raccoon_dataset/blob/master/xml_to_csv.py
def xml_to_csv(path):
    """Collect the bounding boxes of all Pascal VOC style xml files in a folder.

    Args:
        path: folder that contains the '*.xml' annotation files.

    Returns:
        DataFrame with one row per <object> element and the columns
        ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'].
        An empty DataFrame with those columns is returned when no xml is found.

    Elements are looked up by tag name rather than by position, so files that omit or
    reorder optional tags (pose, truncated, difficult, ...) are still read correctly.
    """
    def as_int(node):
        # int(float(...)) also accepts coordinates written as e.g. '12.0'
        return int(float(node.text))

    xml_list = []
    # sorted() makes the row order reproducible; glob order is filesystem dependent
    for xml_file in sorted(glob.glob(path + '/*.xml')):
        root = ET.parse(xml_file).getroot()
        filename = root.find('filename').text
        size = root.find('size')
        width = as_int(size.find('width'))
        height = as_int(size.find('height'))
        for member in root.findall('object'):
            box = member.find('bndbox')
            xml_list.append((filename,
                             width,
                             height,
                             member.find('name').text,
                             as_int(box.find('xmin')),
                             as_int(box.find('ymin')),
                             as_int(box.find('xmax')),
                             as_int(box.find('ymax'))
                             ))
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df

# Convert xml annotations to labels.csv
labels = xml_to_csv(annpath)

if imtype == "train":
  labels.to_csv(('pre-processing/train_labels_notaug.csv'), index=None)
  #pass
else:
  labels.to_csv(('pre-processing/test_labels_notaug.csv'), index=None)
  #pass

print('Successfully converted xmls to csv')
labels

if imtype == "train":
  print("Number of train images:")
  !sudo ls /content/drive/'My Drive'/train/images | wc -l
else:
  print("Number of test images:")
  !sudo ls /content/drive/'My Drive'/train/test_images | wc -l

print("Number of files with annotations in labels csv:")
print(len(labels.groupby('filename')))

In [None]:
# Optional: Only run in case mismatch in images in folder and images in labels csv
from os import listdir
from os.path import isfile, join
path = '/content/drive/My Drive/train/test_ann'

# Split the folder entries into regular files and everything else (e.g. sub-folders)
fns_orig, non_fns = [], []
for entry in listdir(path):
    if isfile(join(path, entry)):
        fns_orig.append(entry)
    else:
        non_fns.append(entry)

# Distinct filenames recorded in the un-augmented test labels csv
df = pd.read_csv('pre-processing/test_labels_notaug.csv')
fns_csv = df.filename.unique()

# Files in folder, files in csv, then the non-file entries
for count in (len(fns_orig), len(fns_csv)):
    print(count)
print(non_fns)

#### B2) Resize images to make training faster

In [None]:
# Two resizing augmenters, each fixing one side at 600px and scaling the other side to
# keep the aspect ratio: one for images limited by height, one for images limited by width
_fit_height_spec = {"height": 600, "width": 'keep-aspect-ratio'}
_fit_width_spec = {"height": 'keep-aspect-ratio', "width": 600}

height_resize = iaa.Sequential([iaa.Resize(_fit_height_spec)])

width_resize = iaa.Sequential([iaa.Resize(_fit_width_spec)])

# Turn a BoundingBoxesOnImage object into a table of corner coordinates
def bbs_obj_to_df(bbs_object):
    """Return a DataFrame with columns ['xmin', 'ymin', 'xmax', 'ymax'], one row per box.

    The values come from `bbs_object.to_xyxy_array()`.
    """
    corner_columns = ['xmin', 'ymin', 'xmax', 'ymax']
    return pd.DataFrame(bbs_object.to_xyxy_array(), columns=corner_columns)

def resize_imgaug(df, images_path, aug_images_path, image_prefix):
    """Shrink images whose longer side exceeds 600px and rescale their bounding boxes.

    Args:
        df: labels table with the columns
            ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'].
        images_path: folder prefix the images are read from (joined as images_path + filename).
        aug_images_path: folder prefix the resized images are written to.
        image_prefix: string prepended to the filename of every resized image.

    Returns:
        DataFrame with the same columns and a fresh 0..n-1 index. Rows of resized images
        carry the new width/height, the prefixed filename and the rescaled boxes; rows of
        images that need no resizing are passed through unchanged (no file is written).
    """
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    box_columns = ['xmin', 'ymin', 'xmax', 'ymax']
    # One frame per image, concatenated once at the end (avoids re-copying inside the loop)
    frames = []

    # sort=False keeps the images in order of first appearance in df
    for filename, group_df in df.groupby('filename', sort=False):
        group_df = group_df.reset_index(drop=True)
        height = group_df['height'].iloc[0]
        width = group_df['width'].iloc[0]

        # Pick the augmenter that limits the longer side to 600px
        if height >= width and height > 600:
            resizer = height_resize
        elif width > height and width > 600:
            resizer = width_resize
        else:
            # Both sides are at most 600px: keep the annotation rows as they are
            frames.append(group_df)
            continue

        image = imageio.imread(images_path + filename)
        bbs = BoundingBoxesOnImage.from_xyxy_array(group_df[box_columns].values, shape=image.shape)
        # Resize the image and its bounding boxes together
        image_aug, bbs_aug = resizer(image=image, bounding_boxes=bbs)
        imageio.imwrite(aug_images_path + image_prefix + filename, image_aug)

        # Image-level columns with the new size and the prefixed filename
        info_df = group_df.drop(box_columns, axis=1)
        info_df['width'] = image_aug.shape[1]
        info_df['height'] = image_aug.shape[0]
        info_df['filename'] = info_df['filename'].apply(lambda x: image_prefix + x)
        frames.append(pd.concat([info_df, bbs_obj_to_df(bbs_aug)], axis=1))

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)

# Resize the images of the selected dataset in place (same folder, no filename prefix)
# and save the updated bounding boxes to file
if imtype == "train":
  img_dir, rsz_csv = 'images/', 'pre-processing/train_labels_notaug_rsz.csv'
else:
  img_dir, rsz_csv = 'test_images/', 'pre-processing/test_labels_notaug_rsz.csv'

resized_df = resize_imgaug(labels, img_dir, img_dir, '')
resized_df.to_csv(rsz_csv, index=False)

#### B3) Actual augmentation of images and bounding boxes

In [None]:
# Set up augmentation parameters and augment images
# SomeOf(2, [...]): per the imgaug API, 2 of the augmenters listed below are applied to each image
aug = iaa.SomeOf(2, [    
    iaa.Affine(scale=(0.5, 1.5)),                                        # zoom out/in, 50%-150%
    iaa.Affine(rotate=(-60, 60)),                                        # rotate by -60..60 degrees
    iaa.Affine(translate_percent={"x": (-0.3, 0.3), "y": (-0.3, 0.3)}),  # shift by up to 30% per axis
    iaa.Fliplr(1),                                                       # horizontal flip (probability 1)
    iaa.Multiply((0.5, 1.5)),                                            # multiply pixel values by 0.5-1.5
    iaa.GaussianBlur(sigma=(1.0, 3.0)),                                  # blur, sigma 1.0-3.0
    iaa.AdditiveGaussianNoise(scale=(0.03*255, 0.05*255))                # add Gaussian noise
])

def image_aug(df, images_path, aug_images_path, image_prefix, augmentor):
    """Augment every image listed in `df` together with its bounding boxes.

    Args:
        df: labels table with the columns
            ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'].
        images_path: folder prefix the images are read from (joined as images_path + filename).
        aug_images_path: folder prefix the augmented images are written to.
        image_prefix: string prepended to the filename of every augmented image.
        augmentor: imgaug augmenter, called as augmentor(image=..., bounding_boxes=...).

    Returns:
        DataFrame with the same columns, one row per bounding box that is still (at least
        partly) inside the augmented image. Images that keep no box are not written and
        contribute no rows.

    Each box carries its class as the imgaug `label`, so the class stays attached to the
    right box when other boxes of the same image are removed. (Matching the surviving
    boxes to the original rows by position would shift the classes and leave rows with
    NaN coordinates.)
    """
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    rows = []

    # sort=False keeps the images in order of first appearance in df
    for filename, group_df in df.groupby('filename', sort=False):
        image = imageio.imread(images_path + filename)
        boxes = [
            BoundingBox(x1=x1, y1=y1, x2=x2, y2=y2, label=label)
            for x1, y1, x2, y2, label in zip(group_df['xmin'], group_df['ymin'],
                                             group_df['xmax'], group_df['ymax'],
                                             group_df['class'])
        ]
        bbs = BoundingBoxesOnImage(boxes, shape=image.shape)
        # Apply the same augmentation to the image and its bounding boxes
        augmented_image, bbs_aug = augmentor(image=image, bounding_boxes=bbs)
        # Drop boxes that fell fully outside the image, clip those partially outside
        bbs_aug = bbs_aug.remove_out_of_image().clip_out_of_image()

        # Nothing left to annotate: skip the image entirely
        if not bbs_aug.bounding_boxes:
            continue

        imageio.imwrite(aug_images_path + image_prefix + filename, augmented_image)
        aug_height, aug_width = augmented_image.shape[0], augmented_image.shape[1]
        for box in bbs_aug.bounding_boxes:
            rows.append({
                'filename': image_prefix + filename,
                'width': aug_width,
                'height': aug_height,
                'class': box.label,
                'xmin': box.x1,
                'ymin': box.y1,
                'xmax': box.x2,
                'ymax': box.y2,
            })

    # columns= fixes the column order and yields an empty, correctly shaped table for no rows
    return pd.DataFrame(rows, columns=columns)

# Augment the resized images of the selected dataset. The augmented copies are written
# next to the originals with the 'aug_' prefix, and their annotations are returned.
if imtype == "train":
  img_dir, augall_csv = 'images/', 'pre-processing/train_labels_augall.csv'
else:
  img_dir, augall_csv = 'test_images/', 'pre-processing/test_labels_augall.csv'

augmented_df = image_aug(resized_df, img_dir, img_dir, 'aug_', aug)

print(augmented_df)

# Save the annotations of the resized and the augmented images together
all_labels_df = pd.concat([resized_df, augmented_df])
all_labels_df.to_csv(augall_csv, index=False)

In [None]:
# Inspect augmentation results: resized image (left) next to its augmented copy (right),
# each with its bounding boxes drawn
if imtype == "train":
  imgs = "images/"
else:
  imgs = "test_images/"

box_columns = ['xmin', 'ymin', 'xmax', 'ymax']
grouped_resized = resized_df.groupby('filename')
grouped_augmented = augmented_df.groupby('filename')

for filename in resized_df['filename'].unique()[:5]:
    aug_name = 'aug_' + filename
    # Images that lost all boxes during augmentation have no augmented copy;
    # get_group() would raise KeyError for them
    if aug_name not in grouped_augmented.groups:
        print("No augmented copy for {}, skipping".format(filename))
        continue

    group_r_df = grouped_resized.get_group(filename).reset_index(drop=True)
    bb_r_array = group_r_df[box_columns].values
    resized_img = imageio.imread(imgs+filename)
    bbs_r = BoundingBoxesOnImage.from_xyxy_array(bb_r_array, shape=resized_img.shape)

    group_a_df = grouped_augmented.get_group(aug_name).reset_index(drop=True)
    bb_a_array = group_a_df[box_columns].values.astype(float)
    # Ignore rows without coordinates
    bb_a_array = bb_a_array[~np.isnan(bb_a_array).any(axis=1)]
    augmented_img = imageio.imread(imgs+aug_name)
    bbs_a = BoundingBoxesOnImage.from_xyxy_array(bb_a_array, shape=augmented_img.shape)

    ia.imshow(np.hstack([
            bbs_r.draw_on_image(resized_img, size=2),
            bbs_a.draw_on_image(augmented_img, size=2)
            ]))

#### B4) Remove out of bounds values resulting from augmentation

In [None]:
# Remove out of bounds values
if imtype == "train":
  df = pd.read_csv("pre-processing/train_labels_augall.csv")
else:
  df = pd.read_csv("pre-processing/test_labels_augall.csv")

box_columns = ['xmin', 'ymin', 'xmax', 'ymax']

# Remove out of bounds (OOB) cropping dimensions
# Column-wise assignments are used throughout: chained assignment such as
# df.ymax[i] = ... may write to a temporary copy and leave df unchanged.

# Set positive out of bounds values (OOB +) equal to image dimensions
# When crop height > image height, set crop height equal to image height:
df['ymax'] = df['ymax'].mask(df['ymax'] > df['height'], df['height'])
# When crop width > image width, set crop width equal to image width:
df['xmax'] = df['xmax'].mask(df['xmax'] > df['width'], df['width'])

df = (
    df.dropna()                                   # drop rows with missing values
      .astype({col: int for col in box_columns})  # whole-pixel coordinates
)
# Set negative values (OOB -) equal to 1 # was getting errors when mins = 0
df['xmin'] = df['xmin'].clip(lower=1)
df['ymin'] = df['ymin'].clip(lower=1)

if imtype == "train":
  df.to_csv('pre-processing/train_labels_augall_oobrem.csv', sep=',', index=False)
else:
  df.to_csv('pre-processing/test_labels_augall_oobrem.csv', sep=',', index=False)

----

#### C) Loop through images and annotations to confirm all files are valid to avoid problems training downstream

In [None]:
# to find problematic images
# modified from https://github.com/AjayZinngg/random-scripts/blob/master/check_images.py 
# also errors https://github.com/tensorflow/models/issues/5474
# about error messages https://github.com/tensorflow/models/issues/1754
import csv
import cv2 
import os
import numpy as np

csv_files = ['pre-processing/train_labels_augall_oobrem.csv', 'pre-processing/test_labels_augall_oobrem.csv']
folders = ['images', 'test_images']

def find_box_problems(name, width, height, xmin, ymin, xmax, ymax, org_width, org_height):
    """Return one tuple of print() arguments per problem found for a single bounding box.

    width/height/xmin/... come from the csv row; org_width/org_height are the dimensions
    of the image file on disk. An empty list means the row is consistent.
    """
    checks = [
        (org_width != width, ('Width mismatch for image: ', name, width, '!=', org_width)),
        (org_height != height, ('Height mismatch for image: ', name, height, '!=', org_height)),
        (xmin > org_width, ('XMIN > org_width for file', name)),
        (xmin <= 0, ('XMIN <= 0 for file', name)),
        (xmax > org_width, ('XMAX > org_width for file', name)),
        (ymin > org_height, ('YMIN > org_height for file', name)),
        (ymin <= 0, ('YMIN <= 0 for file', name)),
        (ymax > org_height, ('YMAX > org_height for file', name)),
        (xmin >= xmax, ('xmin >= xmax for file', name)),
        (ymin >= ymax, ('ymin >= ymax for file', name)),
    ]
    return [message for failed, message in checks if failed]

for FOLDER, CSV_FILE in zip(folders, csv_files):
    print('Checking file:', CSV_FILE, 'in folder:', FOLDER)

    cnt = 0
    error_cnt = 0
    # (height, width) per filename, or None if unreadable; each image is decoded only once
    image_shapes = {}

    with open(CSV_FILE, 'r', newline='') as fid:
        reader = csv.reader(fid, delimiter=',')
        next(reader, None)  # skip the header row

        for row in reader:
            if not row:
                continue  # tolerate blank lines
            cnt += 1

            name = row[0]
            # Column 3 is the class label and is not needed here
            width, height, xmin, ymin, xmax, ymax = (int(float(row[col])) for col in (1, 2, 4, 5, 6, 7))

            if name not in image_shapes:
                img = cv2.imread(os.path.join(FOLDER, name))
                image_shapes[name] = None if img is None else img.shape[:2]

            if image_shapes[name] is None:
                error_cnt += 1
                print('Could not read image', os.path.join(FOLDER, name))
                continue

            org_height, org_width = image_shapes[name]
            problems = find_box_problems(name, width, height, xmin, ymin, xmax, ymax, org_width, org_height)

            # Count the row as soon as it is found faulty, so the last row is counted too
            if problems:
                error_cnt += 1
                for message in problems:
                    print(*message)
                print('Error for file: %s' % name)
                print()

    print('Checked %d bounding boxes and realized %d errors' % (cnt, error_cnt))

In [None]:
# Manually deleted files that the check above reported as xmin > xmax even though the
# values weren't actually greater (kept commented out; edit the filename and un-comment to use)
#df = pd.read_csv('pre-processing/train_labels_augall_oobrem.csv')
#df1 = df[df.filename != 'aug_912.jpeg']
#df1.to_csv('pre-processing/train_labels_augall_oobrem.csv', index=False)

### 8) Generate xmls (annotations) for new annotated image dataset

In [None]:
# Run this block 1x per class

# Convert train and test csvs to xmls with updated filepaths
# modified from here https://gist.github.com/calisir/568190a5e55a79e08be318c285688457

# Work from the training root: the relative "pre-processing/..." paths used
# later in this cell (label csv input, xml output) resolve against this directory.
%cd /content/drive/My Drive/train

import pandas as pd
import numpy as np
import lxml
from lxml import etree
import xml.etree.cElementTree as ET

def indent(elem, level=0):
    """Add newline/indent whitespace to an element tree in place.

    Walks `elem` and its descendants and fills in `text`/`tail` wherever they
    are empty or whitespace-only, so the serialized XML puts each element on
    its own line, indented two spaces per nesting level. Existing non-blank
    text and tails are left untouched.

    Args:
        elem: element to format (modified in place).
        level: nesting depth of `elem`; top-level callers leave this at 0.
    """
    pad = "\n" + "  " * level

    def is_blank(value):
        return not value or not value.strip()

    # Leaf element: only the tail can need fixing, and never at level 0.
    if not len(elem):
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + "  "
    if is_blank(elem.tail):
        elem.tail = pad

    last_child = None
    for child in elem:
        indent(child, level + 1)
        last_child = child

    # The last child's tail sits right before this element's closing tag, so
    # it is pulled back out to this element's own indentation.
    if is_blank(last_child.tail):
        last_child.tail = pad

imtype = "test" #@param ["train", "test"]

# Pick the image folder and label csv for the chosen split.
# Any value other than "train" falls through to the test settings.
if imtype == "train":
    folder = "images"
    labfile = "pre-processing/train_labels_augall_oobrem.csv"
else:
    folder = "test_images"
    labfile = "pre-processing/test_labels_augall_oobrem.csv"

# Prefix written into each xml's <path> element. The leading "/" makes it an
# absolute Drive path, matching the "/content/drive/My Drive/train/..." locations
# used by the rest of this notebook; without it the path would be interpreted
# relative to the current working directory.
fpath = "/content/drive/My Drive/train/" + folder + "/"

# Read in train or test image label data
# (presumably one row per bounding box -- images with several boxes repeat their filename)
df1 = pd.read_csv(labfile)

# Collapse to one row per filename; every other column becomes a list holding
# the values of all rows that shared that filename.
df = df1.groupby('filename', as_index=False).agg(lambda x: list(x))

# Change the name of the file.
# Replace any / with - to avoid errors in xmls
def nameChange(x):
    """Return `x` with every "/" replaced by "-"."""
    return x.replace("/", "-")

# Replace "/" in filenames so they are safe to use in xml fields and output paths
df['filename'] = df['filename'].apply(nameChange)

# The annotation folder depends only on the split, so resolve it once instead
# of on every iteration, and create it if needed so tree.write() cannot fail
# on a missing directory.
if imtype == "train":
    outf = "pre-processing/train_ann/"
else:
    outf = "pre-processing/test_ann/"
os.makedirs(outf, exist_ok=True)

depth = 3  # written to every <size><depth> element

# Write one annotation xml per row of df. Each row describes one image; its
# list-valued columns carry one entry per bounding box of that image.
for i in range(len(df)):
    row = df.iloc[i]
    fileName = str(row['filename'])
    # height/width are lists after the groupby (presumably the same value for
    # every box of an image); the first entry is used as the image size.
    height = row['height'][0]
    width = row['width'][0]

    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = folder
    ET.SubElement(annotation, 'filename').text = fileName
    ET.SubElement(annotation, 'path').text = fpath + fileName

    source = ET.SubElement(annotation, 'source')
    ET.SubElement(source, 'database').text = 'Unknown'

    size = ET.SubElement(annotation, 'size')
    ET.SubElement(size, 'width').text = str(width)
    ET.SubElement(size, 'height').text = str(height)
    ET.SubElement(size, 'depth').text = str(depth)

    ET.SubElement(annotation, 'segmented').text = '0'

    # To handle images with >1 annotation: one <object> per bounding box
    boxes = zip(row['class'], row['xmin'], row['ymin'], row['xmax'], row['ymax'])
    for label, xmin, ymin, xmax, ymax in boxes:
        ob = ET.SubElement(annotation, 'object')
        ET.SubElement(ob, 'name').text = str(label)
        ET.SubElement(ob, 'pose').text = 'Unspecified'
        ET.SubElement(ob, 'truncated').text = '0'
        ET.SubElement(ob, 'difficult').text = '0'

        # Coordinates are truncated to whole pixels
        bbox = ET.SubElement(ob, 'bndbox')
        ET.SubElement(bbox, 'xmin').text = str(int(xmin))
        ET.SubElement(bbox, 'ymin').text = str(int(ymin))
        ET.SubElement(bbox, 'xmax').text = str(int(xmax))
        ET.SubElement(bbox, 'ymax').text = str(int(ymax))

    # Pretty-print, then save next to the other annotations under the image's
    # base name with an .xml extension
    tree = ET.ElementTree(annotation)
    indent(annotation)
    outpath = outf + os.path.splitext(fileName)[0] + ".xml"
    tree.write(outpath, encoding='utf8', xml_declaration=False)

In [None]:
import numpy as np

# Check that all train images have corresponding annotation
ann_dir = '/content/drive/My Drive/train/pre-processing/test_ann/'
train_dir = '/content/drive/My Drive/train/test_images/'
files = os.listdir(train_dir)

# Loop through train images to see if xml for each one
for file in files:
  base = os.path.splitext(os.path.basename(file))[0]
  train_xml = ann_dir + base + '.xml'
  if os.path.exists(train_xml):
    print("xml exists for {}".format(file))
  else:
    print("!!!xml missing for image {}".format(file))
    #os.remove(file)

# Check for duplicate xmls
#import collections
#print([item for item, count in collections.Counter(files).items() if count > 1])
#!ls -l -a /content/drive/'My Drive'/train/pre-processing/test_ann/

# Check for xmls that don't have corresp img
%cd /content/drive/My Drive/train
xmls = os.listdir('pre-processing/test_ann/')
imgs = os.listdir('test_images/')
xbases = []
ibases = []
for xml in xmls:
  xbase = os.path.splitext(os.path.basename(xml))[0]
  xbases.append(xbase)
for img in imgs:
  ibase = os.path.splitext(os.path.basename(img))[0]
  ibases.append(ibase)

# yields the elements in `list_2` that are NOT in `list_1`
diffs = np.setdiff1d(xbases,ibases)
print("xml(s) that need to be deleted bc have no corresp img: {}".format(diffs))

print("Number of test images:")
!ls /content/drive/'My Drive'/train/test_images | wc -l

print("Number of test annotations:")
!ls /content/drive/'My Drive'/train/pre-processing/test_ann/ | wc -l

In [None]:
# Check that all train images have corresponding annotation
ann_dir = '/content/drive/My Drive/train/pre-processing/train_ann/'
train_dir = '/content/drive/My Drive/train/images/'
files = os.listdir(train_dir)

# Check for duplicate xmls
import collections
print([item for item, count in collections.Counter(files).items() if count > 1])
#!ls -l -a /content/drive/'My Drive'/train/pre-processing/train_ann/

# Loop through train images to see if xml for each one
for file in files:
  base = os.path.splitext(os.path.basename(file))[0]
  train_xml = ann_dir + base + '.xml'
  if os.path.exists(train_xml):
    print("xml exists for {}".format(file))
  else:
    print("!!!xml missing for image {}".format(file))
    #os.remove(file)

# Check for xmls that don't have corresp img
%cd /content/drive/My Drive/train
xmls = os.listdir('pre-processing/train_ann/')
imgs = os.listdir('images/')
xbases = []
ibases = []
for xml in xmls:
  xbase = os.path.splitext(os.path.basename(xml))[0]
  xbases.append(xbase)
for img in imgs:
  ibase = os.path.splitext(os.path.basename(img))[0]
  ibases.append(ibase)

# yields the elements in `list_2` that are NOT in `list_1`
diffs = np.setdiff1d(xbases,ibases)
print("xml(s) that need to be deleted bc have no corresp img: {}".format(diffs))

print("Number of train images:")
!ls /content/drive/'My Drive'/train/images | wc -l

print("Number of train annotations:")
!ls /content/drive/'My Drive'/train/pre-processing/train_ann/ | wc -l

In [None]:
# Delete augmented test images and annotations
# Hacky fix bc steps above do resizing and aug for test and train images

import glob
import os
import pandas as pd

# Locations of the test annotations and test images
ann_dir = '/content/drive/My Drive/train/pre-processing/test_ann/'
im_dir = '/content/drive/My Drive/train/test_images/'
# Not used further in this cell; the listing raises right away if the image
# folder is missing, before anything is deleted.
files = os.listdir(im_dir)


def _delete_augmented(directory):
    """Delete every file directly inside `directory` whose name contains "aug_".

    Prints the matched paths and their count, then removes them one by one.
    A file that cannot be removed is reported together with the reason and
    skipped, so a single failure does not abort the clean-up.

    Args:
        directory: folder path ending in "/".
    """
    fns = glob.glob(directory + "*aug_*")
    print(fns)
    print(len(fns))
    for fn in fns:
        try:
            os.remove(fn)
        # Catch filesystem errors only: a bare except would also swallow
        # KeyboardInterrupt, making the cell impossible to stop.
        except OSError as err:
            print("Error while deleting file : ", fn, "-", err)


# Delete all augmented xmls
_delete_augmented(ann_dir)

# Delete all augmented imgs
_delete_augmented(im_dir)

# Delete aug img rows from test_labels_augall_oobrem.csv
# An absolute path keeps this cell independent of whichever directory an
# earlier %cd left the kernel in.
labels_csv = '/content/drive/My Drive/train/pre-processing/test_labels_augall_oobrem.csv'
df = pd.read_csv(labels_csv)
# Literal substring match; rows with a missing filename count as "not augmented"
# instead of breaking the boolean mask.
df = df[~df.filename.str.contains("aug_", regex=False, na=False)]
df.to_csv(labels_csv, sep=',', index=False)

In [None]:
# Inspect number of images and annotations for train and test (should be 1 image/annotation in each group and test should be ~20-30% of train)
# Each command prints the number of entries in one folder.
# Train: annotation count, then image count
!ls /content/drive/'My Drive'/train/pre-processing/train_ann | wc -l
!ls /content/drive/'My Drive'/train/images | wc -l

# Test: annotation count, then image count
!ls /content/drive/'My Drive'/train/pre-processing/test_ann | wc -l
!ls /content/drive/'My Drive'/train/test_images | wc -l

In [None]:
# Copy final datasets to train and test folders for object detection
#!mv /content/drive/'My Drive'/train/pre-processing/train_ann/* /content/drive/'My Drive'/train/darkflow-master/test/training/annotations
!rm -r /content/drive/'My Drive'/train/pre-processing/train_ann

#!mv /content/drive/'My Drive'/train/pre-processing/test_ann/* /content/drive/'My Drive'/train/test_ann
!rm -r /content/drive/'My Drive'/train/pre-processing/test_ann