<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_tagging/scat_footprint/scat_footprint_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Scat/Footprint Detector Training Images
---
*Last Updated 2 June 2021*   
Follow the steps below to download images from iNaturalist observation bundles to Google Drive, then augment images to increase training dataset size, and finally move files to their appropriate folders for use in training scat/footprint detection models.     

**Notes**
* The steps in this notebook took several days to complete for a training dataset size of 1200 images per class. Preparing datasets is often the most time-consuming step of a machine learning pipeline, especially for object detection where you must manage images and their corresponding annotation files.
* After Step 5, one step needs to be completed on your local machine - image annotation using [labelImg](https://github.com/tzutalin/labelImg) - before moving onto step 6. 
* Change filepaths or information using the form fields to the right of code blocks (also noted in code with 'TO DO')

## Installs & Imports
---

In [None]:
# Mount google drive to import/export files
# force_remount=True is passed so the mount is redone on every run of this cell
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# For importing/exporting files, working with arrays, xmls, etc
import pathlib
import os
from os import listdir
import glob
import re
import imageio
import io
import time
import csv
import numpy as np
import pandas as pd
import shutil
import random

# For drawing onto and plotting images
import matplotlib.pyplot as plt
import cv2
from PIL import Image
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# For augmenting images
!pip install imgaug
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

# For handling annotations
import lxml
from lxml import etree
# xml.etree.cElementTree was removed in Python 3.9; fall back to ElementTree there
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

### 1) Build image bundles for each class
---
Bundles were downloaded from [iNaturalist](https://www.inaturalist.org/observations) under "Explore" for all terrestrial vertebrate taxa (no fish) with all creative commons licenses and the keywords "scat" or "footprint"

In [None]:
# TO DO: Type in the path to your working directory in form field to right
wd = "/content/drive/MyDrive/train/tf2/pre-processing/scat_footprint" #@param {type:"string"}
# Change into the working directory; relative paths below resolve against it
%cd $wd

# Download iNaturalist image bundles for CC0 and CC-BY images
# The gdown download is skipped when a file with the expected name already exists
# Scat
fn = 'observations-128689.csv'
if not os.path.exists(fn):
    !gdown --id 1KnRRusRVXpGEATmkSEhAVPB2_htiVfWm
# na_filter=False keeps empty cells as empty strings instead of NaN
scat = pd.read_csv(fn, sep=',', header=0, na_filter = False)
print("Total number of available images for scat: \n {}".format(len(scat)))

# Footprint
fn = 'observations-128749.csv'
if not os.path.exists(fn):
    !gdown --id 1KlfqTu_dS_hpqQJI_A2e0RYCZNllRDz9
# na_filter=False keeps empty cells as empty strings instead of NaN
footprint = pd.read_csv(fn, sep=',', header=0, na_filter = False)
print("\nTotal number of available images for footprint: \n {}".format(len(footprint)))
# Set up directories for pre-processing images
def directory_setup(df, folder):
    """Create the image folder for one class and write its URL lists.

    Does nothing if `folder` already exists. Otherwise it creates `folder`
    and writes two text files with one image URL per line:

    * ``<parent>/<class>_imgs.txt``                 - every URL in ``df['image_url']``
    * ``<folder>/<class>_download_subset.txt``      - the first ``num_imgs`` URLs

    Args:
        df: DataFrame with an ``image_url`` column.
        folder: Path of the class folder to create, e.g. ``"images/scat"``.
    """
    if os.path.exists(folder):
        return

    # Make directory for image class
    os.makedirs(folder)
    parent, class_name = os.path.split(folder)

    # Only the URL column is wanted; blank entries are skipped
    urls = [str(url) for url in df['image_url'] if str(url).strip()]

    # List all image urls
    outfpath = os.path.join(parent, class_name + '_imgs.txt')
    with open(outfpath, 'w') as url_file:
        url_file.writelines(url + '\n' for url in urls)

    # Make subset of image urls for download
    outfpath = os.path.join(folder, class_name + '_download_subset.txt')
    # TO DO: Choose how many images to download
    num_imgs = 1200 #@param {type:"integer"}
    bundle = urls[:num_imgs]
    # Plain line-per-URL output: the previous DataFrame.to_csv(sep='\n') call
    # wrote every column of the bundle, not just the URLs
    with open(outfpath, 'w') as url_file:
        url_file.writelines(url + '\n' for url in bundle)
    print("Bundle with {} {} images for download saved to {}\n".format(len(bundle), folder, outfpath))

# Set up directory and bundle for scat
# (directory_setup is a no-op when the folder already exists)
directory_setup(scat, "images/scat")

# Set up directory and bundle for footprint
directory_setup(footprint, "images/footprint")

### 2) Download images to Google Drive
---

In [None]:
# Install aria2 for downloading images in parallel
# (provides the aria2c command used by the download cells below)
!apt-get install aria2

In [None]:
# Download scat images 
%cd $wd
%cd images/scat 
# -x 16: max connections per server, -s 1: one split per download, -i: file listing the URIs
!aria2c -x 16 -s 1 -i "scat_download_subset.txt"
# Check how many images downloaded
# (the count also includes scat_download_subset.txt, which lives in this folder)
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# Download footprint images 
%cd $wd
%cd images/footprint 
# -x 16: max connections per server, -s 1: one split per download, -i: file listing the URIs
!aria2c -x 16 -s 1 -i "footprint_download_subset.txt"
# Check how many images downloaded
# (the count also includes footprint_download_subset.txt, which lives in this folder)
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# Move image data files to image_data/
# Collects the observation csvs and all URL list txt files so that only
# downloaded files remain inside images/scat and images/footprint
%cd $wd
!mkdir -p image_data
!mv ./*.csv image_data/
!mv images/*.txt image_data/
!mv images/scat/*.txt image_data/
!mv images/footprint/*.txt image_data/

### 3) Delete all downloaded non-image files
---

In [None]:
Image.MAX_IMAGE_PIXELS = 95000000 # To suppress errors from Pillow about decompression bombs

# Remove bad image files
def remove_bad_images(folder):
    %cd $wd
    %cd $folder
    # Loop through downloaded files and delete non-images
    for num, path in enumerate(listdir('./'), start=1):
        try:
            with open(path, 'rb') as f:
                try:
                    img = Image.open(io.BytesIO(f.read()))
                    img.verify() # verify that it is an image
                    if len(str(os.path.splitext(path)[1])) < 3:
                        newpath = str(num) + '.jpg' # add jpg extension to image files without exts 
                    else:
                        newpath = str(num) + str(os.path.splitext(path)[1]) # make sure all filenames and exts are unique 
                        os.rename(path, newpath)
                except (IOError, SyntaxError) as e:
                    print('Bad file:', path)
                    if '(' in path: # rm doesn't work for files with parenthesis in name, need to manually remove
                        print("Manually remove from Google Drive: {}".format(filename)) 
                    else:
                        !rm $path
        except IsADirectoryError as e:
            print("{} is a directory. \nRemoving directory...".format(path))
            os.removedirs(path)
    print("Number of images in {} after non-image files removed: ".format(folder))
    !ls . | wc -l

# Remove bad image files from scat/
remove_bad_images("images/scat")

# Remove bad image files from footprint/
remove_bad_images("images/footprint")

### 4) Make number of images per class even
---

In [None]:
%cd $wd

# Randomly delete all but N-images from scat and footprint folders
# TO DO: Choose how many images to keep for each class
to_keep = 600 #@param {type:"integer"}
!find "images/scat" -type f -print0 | sort -zR | tail -zn +{to_keep} | xargs -0 rm
!find "images/footprint" -type f -print0 | sort -zR | tail -zn +{to_keep} | xargs -0 rm

print("Final number of scat images:")
!ls images/scat | wc -l
print("Final number of footprint images:")
!ls images/footprint | wc -l

### 5) Zip image folders for download to local machine and annotation with labelImg
---

In [None]:
# Recursively zip the images/ folder (relative to the current directory) for download
!zip -r "images_fordl.zip" "images"

### 6) Upload zipped images and annotations to Google Drive and resume here
---
Upload files to tf2/ or your base wd

In [None]:
# TO DO: Type in the path to your working directory in form field to right
# NOTE: wd now points at the training base directory, not the pre-processing folder used above
wd = "/content/drive/MyDrive/train/tf2" #@param {type:"string"}
%cd $wd

# TO DO: Type in filename of zipped images folder
# NOTE(review): Step 5 wrote "images_fordl.zip"; presumably the uploaded archive was renamed - confirm
filename = "images_foranns.zip" #@param {type:"string"}

# Unzip images to tf2/images
# test_images/ is created here and filled later when the test split is made
!mkdir images
!mkdir test_images
!unzip {filename} -d images

In [None]:
# Move all images from subfolders to main annotations folder
os.makedirs('images/scat')
os.makedirs('images/footprint')
!mv -v images/images/scat/* images/scat
!mv -v images/images/footprint/* images/footprint

# Remove empty folders from uploaded labelImg zipped files
!rm -r images/images

In [None]:
# Optional: If used Mac for labelImg, delete Mac OS files from subfolders
# Removes .DS_Store files anywhere under images/ and the __MACOSX folder added by the Mac zip tool
!find images/ -name "*.DS_Store" -type f -delete
!rm -r images/__MACOSX

In [None]:
# Unzip annotations to tf2/annotations
# test_ann/ is created here and filled later when the test split is made
!mkdir annotations
!mkdir test_ann
!unzip annotations.zip -d annotations

# Move all xml files from subfolders to main annotations folder
# NOTE(review): both classes are flattened into one folder, so xmls with the same
# name in scat/ and footprint/ would overwrite each other - confirm names are unique
!mv -v annotations/annotations/scat/* annotations
!mv -v annotations/annotations/footprint/* annotations

# Remove empty folders from uploaded labelImg zipped files
!rm -r annotations/annotations

In [None]:
# Optional: If used Mac for labelImg, delete Mac OS files from subfolders
# Removes .DS_Store files anywhere under annotations/ and the __MACOSX folder added by the Mac zip tool
!find annotations/ -name "*.DS_Store" -type f -delete
!rm -r annotations/__MACOSX

In [None]:
# Sanity check: count the images per class and the combined annotation files
%cd $wd

print("\nFinal number of scat images:")
!ls images/scat | wc -l
print("Final number of footprint images:")
!ls images/footprint | wc -l

print("Final number of annotations (scat and footprint combined):")
!ls annotations | wc -l

### 7) Pre-process train and test images
---

In [None]:
# Define functions

# TO DO: Type in the path to your working directory in form field to right
wd = "/content/drive/MyDrive/train/tf2" #@param {type:"string"}
# The helpers below use paths relative to this directory
# (images/, annotations/, test_images/, test_ann/)
%cd $wd

# List all images contained in folder
def list_dir_images(dir):
    """Return the names of all entries in the class folder images/<dir>."""
    return os.listdir('images/' + dir)

# Split into train and test datasets
def split_train_test(filenames, img_class, test_frac=0.3):
    """Randomly pick the images of one class that go into the test set.

    With the default of 30%, train/test ends up at roughly 80/20 once the
    remaining training images have been doubled by augmentation.

    Args:
        filenames: List of image filenames for the class.
        img_class: Class name, only used in the printed messages.
        test_frac: Fraction of `filenames` to select (default 0.3).

    Returns:
        List of the selected filenames (sampled without replacement).
    """
    print("Number of images in {}: {}\n".format(img_class, len(filenames)))
    # Previously len(filenames) was doubled here, which selected 60% of the
    # images instead of the documented 30%
    subset = int(test_frac * len(filenames))
    test_imgs = random.sample(filenames, subset)
    print("{:.0%} of images in {} to be used for testing: {}".format(test_frac, img_class, subset))

    return test_imgs

# Move test images to test_images/
def move_test_images(test_imgs, img_class):
    """Move the selected test images from images/<img_class>/ to test_images/.

    Args:
        test_imgs: Filenames (not paths) of the images to move.
        img_class: Class sub-folder of images/ that holds the files.
    """
    test_dir = 'test_images'
    train_dir = 'images/' + img_class
    # Without an existing directory shutil.move would rename the first image
    # to a *file* called test_images
    os.makedirs(test_dir, exist_ok=True)
    for i, filename in enumerate(test_imgs, start=1):
        fpath = os.path.join(train_dir, filename)
        if os.path.isfile(fpath):
            shutil.move(fpath, test_dir)
            print('{}) Successfully moved {} to {}'.format(i, fpath, test_dir))
        else:
            print('File not found: ', fpath)

# Move test annotations to test_ann/
def move_test_anns(test_imgs):
    """Move the xml annotation of every test image from annotations/ to test_ann/.

    The xml name is the image's base name with an '.xml' extension; images
    without a matching xml are only reported.
    """
    ann_dir, testann_dir = 'annotations/', 'test_ann/'
    for test_img in test_imgs:
        stem = os.path.splitext(os.path.basename(test_img))[0]
        test_xml = ann_dir + stem + '.xml'
        if not os.path.exists(test_xml):
            print("!!!xml missing for image {}".format(test_img))
            continue
        # Matching xml found: relocate it next to the other test annotations
        shutil.move(test_xml, testann_dir)
        print("Moved {} to test_ann".format(test_xml))

# Check that each train image has an annotation
def check_train_anns(train_dir):
    """Print, for every entry in `train_dir`, whether annotations/<base>.xml exists."""
    ann_dir = 'annotations/'
    for train_img in os.listdir(train_dir):
        stem, _ext = os.path.splitext(os.path.basename(train_img))
        has_xml = os.path.exists(ann_dir + stem + '.xml')
        # Same two report lines as before, selected by whether the xml was found
        template = "xml exists for {}" if has_xml else "!!!xml missing for image {}"
        print(template.format(train_img))

#### A) Split into train and test datasets

In [None]:
# Make test dataset

# Select 30% of images for test dataset
# TO DO: Enter image classes as a list
img_classes = ['scat', 'footprint'] #@param
for img_class in img_classes:
    # Make list of test images
    img_files = list_dir_images(img_class)
    test_imgs = split_train_test(img_files, img_class)
    # Move test images to test_images/
    move_test_images(test_imgs, img_class)

# Move matching 30% of test annotations to test_ann/
move_test_anns(test_imgs)

# Summary of test image dataset
print("Number of test images:")
!ls test_images | wc -l

print("Number of test annotations:")
!ls test_ann | wc -l

In [None]:
# Make train dataset
# Everything left in the class folders after the test split becomes training data

# Move train images from class folders to images/
# NOTE(review): images with the same filename in both class folders would
# overwrite each other here - confirm names are unique across classes
!mv -v images/footprint/* images
!mv -v images/scat/* images

# Remove empty folders for image classes
!rm -r images/footprint
!rm -r images/scat

# Check that each train image has an annotation
train_dir = 'images' 
check_train_anns(train_dir)

# Summary of train image dataset
print("Number of train images:")
!ls images | wc -l

print("Number of train annotations:")
!ls annotations | wc -l

#### B) Augment images and bounding boxes  
Some code modified from [asetkn's GitHub](https://github.com/asetkn/Tutorial-Image-and-Multiple-Bounding-Boxes-Augmentation-for-Deep-Learning-in-4-Steps/blob/master/Tutorial-Image-and-Multiple-Bounding-Boxes-Augmentation-for-Deep-Learning-in-4-Steps.ipynb)

In [None]:
# Define functions

# TO DO: Type in the path to your working directory in form field to right
wd = "/content/drive/MyDrive/train/tf2" #@param {type:"string"}
# Paths used by the functions below (e.g. 'pre-processing/') are relative to this directory
%cd $wd

# Inspect N-images from directory
def inspect_images(path, num=5):
    """Display up to `num` image files from the folder `path`.

    Args:
        path: Folder containing the images (trailing slash optional).
        num: Maximum number of images to show.
    """
    # Skip sub-directories (e.g. .ipynb_checkpoints), which imread cannot open
    image_fns = [fn for fn in os.listdir(path)
                 if os.path.isfile(os.path.join(path, fn))][:num]
    for image_fn in image_fns:
        # os.path.join works whether or not `path` ends with a slash
        image_fpath = os.path.join(path, image_fn)
        print("Showing image ", image_fpath)
        ia.imshow(imageio.imread(image_fpath))

# Inspect an image annotation file
def inspect_ann(annpath):
    """Print the contents of one sample xml annotation from `annpath`.

    Args:
        annpath: Folder containing the xml annotation files.
    """
    ann_fns = sorted(fn for fn in os.listdir(annpath) if fn.lower().endswith('.xml'))
    if not ann_fns:
        # Previously an IndexError was raised when fewer than two files existed
        print("\nNo annotation files found in ", annpath)
        return
    ann_fpath = os.path.join(annpath, ann_fns[0])
    print("\nShowing sample annotation ", ann_fpath)
    # Read the xml in place; no temporary copy under /content is needed
    with open(ann_fpath, "r") as ann_text:
        print(ann_text.read())

# Extract info from annotation xmls into csv file
# Modified from https://github.com/datitran/raccoon_dataset/blob/master/xml_to_csv.py
def xml_to_csv(path, imtype):
    """Collect the bounding boxes of all xml annotations in `path` into a csv.

    One row is produced per <object> element. The result is also written to
    'pre-processing/<imtype>_labels_notaug.csv'.

    Args:
        path: Folder containing Pascal VOC style xml files (as written by labelImg).
        imtype: Dataset name used in the output filename, e.g. 'train' or 'test'.

    Returns:
        (xml_df, outfpath): the DataFrame of boxes and the path of the csv written.
    """
    def _as_int(node):
        # Coordinates may be stored as '12' or '12.0'
        return int(float(node.text))

    xml_list = []
    # sorted() makes the row order reproducible between runs
    for xml_file in sorted(glob.glob(os.path.join(path, '*.xml'))):
        root = ET.parse(xml_file).getroot()
        filename = root.find('filename').text
        size = root.find('size')
        width = _as_int(size.find('width'))
        height = _as_int(size.find('height'))
        for member in root.findall('object'):
            # Look tags up by name instead of by child position, so the order
            # of elements inside <object> does not matter
            bndbox = member.find('bndbox')
            xml_list.append((filename,
                             width,
                             height,
                             member.find('name').text,
                             _as_int(bndbox.find('xmin')),
                             _as_int(bndbox.find('ymin')),
                             _as_int(bndbox.find('xmax')),
                             _as_int(bndbox.find('ymax'))))
    column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    os.makedirs('pre-processing', exist_ok=True)
    outfpath = 'pre-processing/' + imtype + '_labels_notaug' + '.csv'
    xml_df.to_csv(outfpath, index=None)
    print('Successfully converted xmls to: ', outfpath)

    return xml_df, outfpath

##### Convert annotation xmls to csv

In [None]:
# Data Inspection & convert annotations to csv
# Runs once for the train folders and once for the test folders

imtypes = ['train', 'test'] 
for imtype in imtypes:
    # Pick the image and annotation folders that belong to this dataset
    if imtype == "train":
        path = 'images/'
        annpath = 'annotations/'
    else:
        path = 'test_images/'
        annpath = 'test_ann/'
    print("\nInspecting {} images\n".format(imtype))
    # Inspect five images
    inspect_images(path, 5)
    
    # Inspect an annotation file 
    inspect_ann(annpath)

    # Convert xml annotations to labels.csv & save results
    labels, outfpath = xml_to_csv(annpath, imtype)

    # Check that the number of images and annotations in labels.csv match
    print("Number of {} images: ".format(imtype))
    !sudo ls $path | wc -l

    # Grouping by filename counts annotated images rather than individual boxes
    print("\nNumber of files with annotations in {}: \n{}".format(outfpath, len(labels.groupby('filename'))))

In [None]:
# Optional: Only run in case mismatch in images in folder and images in train or test labels csv
from os import listdir
from os.path import isfile, join

# TO DO: Choose which training dataset to inspect
path = "test_images/" #@param ["images/", "test_images/"]
# Regular files in the folder vs. everything else (e.g. sub-directories)
fns_orig = [f for f in listdir(path) if isfile(join(path, f))]
non_fns = [f for f in listdir(path) if not isfile(join(path, f))]
# Pick the labels csv that belongs to the chosen folder
if 'test' in path:
    outfpath = 'pre-processing/test_labels_notaug.csv'
else:
    outfpath = 'pre-processing/train_labels_notaug.csv'
df = pd.read_csv(outfpath)
# Distinct image filenames that have at least one row in the csv
fns_csv = df.filename.unique()

print("Number of image files in {}: {}\n".format(path, len(fns_orig)))
print("Number of image files in {}: {}\n".format(outfpath, len(fns_csv)))
print("Invalid files found in {}: {}".format(path, non_fns))

In [None]:
# TO DO: Enter invalid filename(s) to delete and run
filename_to_del = ".ipynb_checkpoints" #@param {type:"string"}

# TO DO: Is it a file or a directory?
ftype = "d" #@param ["f", "d"]

# Uses `path` chosen in the previous cell.
# NOTE(review): `find -delete` cannot remove a directory that still has contents;
# a non-empty .ipynb_checkpoints may need `rm -r` instead - confirm
!find {path} -name {filename_to_del} -type {ftype} -delete

##### Augmentation of images and bounding boxes

In [None]:
# Define image augmentation pipeline
# modified from https://github.com/aleju/imgaug
# SomeOf(2, ...) applies two of the listed augmenters to each image
seq = iaa.SomeOf(2, [    
    iaa.Affine(scale=(0.5, 1.5)), # zoom to 50-150% of original size
    iaa.Affine(rotate=(-60, 60)), # rotate by -60 to 60 degrees
    iaa.Affine(translate_percent={"x": (-0.3, 0.3), "y": (-0.3, 0.3)}), # shift by up to 30% per axis
    iaa.Fliplr(0.8), # horizontal flip with probability 0.8
    iaa.Multiply((0.5, 1.5)), # multiply pixel values by 0.5 to 1.5
    iaa.GaussianBlur(sigma=(1.0, 3.0)), # blur using gaussian kernel with sigma of 1-3
    iaa.AdditiveGaussianNoise(scale=(0.03*255, 0.05*255)) # gaussian noise, scale 3-5% of 255
])

# Convert bounding boxes to dataframe
def bbs_obj_to_df(bbs_object):
    """Return the boxes of `bbs_object` as a DataFrame with xmin/ymin/xmax/ymax columns."""
    corner_names = ['xmin', 'ymin', 'xmax', 'ymax']
    return pd.DataFrame(data=bbs_object.to_xyxy_array(), columns=corner_names)

# Augment images from dataframe info and save results
def augment_image(df, path, seq=seq):
    """Augment every image listed in `df` together with its bounding boxes.

    For each distinct filename the image is read from `path`, passed through
    `seq` with its boxes, and the result is saved as 'aug_<filename>' in the
    same folder.

    Args:
        df: Label DataFrame with columns filename, width, height, class,
            xmin, ymin, xmax, ymax (one row per box).
        path: Folder containing the images.
        seq: imgaug augmenter to apply (defaults to the pipeline defined above).

    Returns:
        DataFrame in the same column layout describing the augmented images,
        with a fresh 0..n-1 index.
    """
    columns = ['filename', 'width', 'height', 'class',
               'xmin', 'ymin', 'xmax', 'ymax']
    box_cols = ['xmin', 'ymin', 'xmax', 'ymax']
    augmented = []

    # sort=False keeps the images in order of first appearance
    for filename, group_df in df.groupby('filename', sort=False):
        group_df = group_df.reset_index(drop=True)

        # Read in image
        image = imageio.imread(os.path.join(path, filename))
        print("Augmenting image: ", filename)

        # Load image bounding box coordinates to imgaug format
        bbs = BoundingBoxesOnImage.from_xyxy_array(group_df[box_cols].values,
                                                   shape=image.shape)

        # Augment image and boxes together so they stay aligned
        image_aug, bbs_aug = seq(image=image, bounding_boxes=bbs)

        # Write augmented image to file
        imageio.imwrite(os.path.join(path, 'aug_' + filename), image_aug)

        # Image-level info for the augmented file
        aug_df = group_df.drop(box_cols, axis=1)
        aug_df['width'] = image_aug.shape[1]
        aug_df['height'] = image_aug.shape[0]
        aug_df['filename'] = 'aug_' + aug_df['filename']

        # Both frames share a 0..k-1 index, so axis=1 lines rows up with boxes
        augmented.append(pd.concat([aug_df, bbs_obj_to_df(bbs_aug)], axis=1))

    if not augmented:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(augmented, ignore_index=True)

# Remove out of bounds values
def remove_oob(crops):
    """Clamp bounding box coordinates that fall outside their image.

    * negative xmin / ymin become 0
    * where ymax > height: ymin is set to 0 and ymax to height
    * where xmax > width:  xmin is set to 0 and xmax to width

    The input frame is not modified.

    Args:
        crops: DataFrame with columns xmin, ymin, xmax, ymax, filename,
            width, height, class. The index may contain duplicates.

    Returns:
        New DataFrame with columns ordered xmin, ymin, xmax, ymax, filename,
        width, height, class.
    """
    crops = crops.copy()

    # Positional boolean masks plus .loc: correct even when index labels
    # repeat (as after pd.concat of original and augmented labels), where the
    # previous label-into-.iloc indexing edited the wrong rows
    crops.loc[(crops['xmin'] < 0).to_numpy(), 'xmin'] = 0
    crops.loc[(crops['ymin'] < 0).to_numpy(), 'ymin'] = 0

    # Remove out of bounds cropping dimensions
    ## When crop height > image height, set crop height equal to image height
    too_tall = (crops['ymax'] > crops['height']).to_numpy()
    crops.loc[too_tall, 'ymin'] = 0
    crops.loc[too_tall, 'ymax'] = crops.loc[too_tall, 'height'].to_numpy()
    ## When crop width > image width, set crop width equal to image width
    too_wide = (crops['xmax'] > crops['width']).to_numpy()
    crops.loc[too_wide, 'xmin'] = 0
    crops.loc[too_wide, 'xmax'] = crops.loc[too_wide, 'width'].to_numpy()

    # Column order expected by the csv consumers downstream
    crops_oobrem = crops[['xmin', 'ymin', 'xmax', 'ymax', 'filename', 'width', \
                          'height', 'class']]

    return crops_oobrem

In [None]:
# Augment train images and bounding boxes
# Reads the un-augmented train labels and writes one 'aug_' image per original into images/
outfpath = 'pre-processing/train_labels_notaug.csv'
df = pd.read_csv(outfpath)
img_path = 'images/'
augmented_df = augment_image(df, img_path)
print("Dataframe with augmented images: \n", augmented_df.head())

# Combine augmented and not augmented dfs & save to file
# NOTE: concat keeps each frame's own index, so index labels repeat in all_imgs_df
all_imgs_df = pd.concat([df, augmented_df])
outfpath = 'pre-processing/train_labels_augall.csv'
all_imgs_df.to_csv(outfpath, index=False)

# Remove out of bounds values resulting from augmentation
all_oobrem = remove_oob(all_imgs_df)

# Save results for use training object detectors
# -> pre-processing/train_labels_augall_oob_rem_fin.csv
outfpath = os.path.splitext(outfpath)[0] + '_oob_rem_fin' + '.csv' 
all_oobrem.to_csv(outfpath, sep=',', index=False)

In [None]:
# Test images and bounding boxes are not augmented

# Clamp any out of bounds box coordinates in the test labels
# (same clean-up and column order as applied to the train labels)
outfpath = 'pre-processing/test_labels_notaug.csv'
df = pd.read_csv(outfpath)
all_oobrem = remove_oob(df)

# Save results for use training object detectors
# -> pre-processing/test_labels_notaug_oob_rem_fin.csv
outfpath = os.path.splitext(outfpath)[0] + '_oob_rem_fin' + '.csv' 
all_oobrem.to_csv(outfpath, sep=',', index=False)

#### C) Loop through images and annotations to confirm all files are valid to avoid problems training downstream

In [None]:
# Search for problematic images
# modified from https://github.com/AjayZinngg/random-scripts/blob/master/check_images.py 
# Notes on possible image errors during training https://github.com/tensorflow/models/issues/5474
# more notes on errors https://github.com/tensorflow/models/issues/1754

# Change to your training directory within Google Drive
%cd $wd

csv_files = ['pre-processing/train_labels_augall_oob_rem_fin.csv', 'pre-processing/test_labels_notaug_oob_rem_fin.csv']
folders = ['images', 'test_images']

for i in range(len(folders)):
    FOLDER = folders[i]
    CSV_FILE = csv_files[i]

    with open(CSV_FILE, 'r') as fid:
        
        print('Checking file:', CSV_FILE, 'in folder:', FOLDER)
        
        file = csv.reader(fid, delimiter=',')
        first = True
        
        cnt = 0
        error_cnt = 0
        error = False

        for row in file:
            if error == True:
                error_cnt += 1
                error = False
                
            if first == True:
                first = False
                continue
            
            cnt += 1
            
            xmin, ymin, xmax, ymax, name, width, height = int(float(row[0])), int(float(row[1])), int(float(row[2])), int(float(row[3])), row[4], int(float(row[5])), int(float(row[6]))
            
            path = os.path.join(FOLDER, name)
            img = cv2.imread(path)
            
            if type(img) == type(None):
                error = True
                print('Could not read image', path)
                continue
            
            org_height, org_width = img.shape[:2]
            
            if org_width != width:
                error = True
                print('Width mismatch for image: ', name, width, '!=', org_width)
            
            if org_height != height:
                error = True
                print('Height mismatch for image: ', name, height, '!=', org_height)
            
            if xmin > org_width:
                error = True
                print('XMIN > org_width for file', name)
                
            if xmin <= 0:
                error = True
                print('XMIN < 0 for file', name)
                
            if xmax > org_width:
                error = True
                print('XMAX > org_width for file', name)

            if xmax > org_height: #added because training errors for OOBs when none present
                error = True
                print('XMAX > org_height for file', name)
            
            if ymin > org_height:
                error = True
                print('YMIN > org_height for file', name)
            
            if ymin <= 0:
                error = True
                print('YMIN < 0 for file', name)
            
            if ymax > org_height:
                error = True
                print('YMAX > org_height for file', name)
            
            if xmin >= xmax:
                error = True
                print('xmin >= xmax for file', name)
                
            if ymin >= ymax:
                error = True
                print('ymin >= ymax for file', name)
            
            if error == True:
                print('Error for file: %s' % name)
                print()

        print('Checked %d bounding boxes and realized %d errors' % (cnt, error_cnt))

In [None]:
# Manually delete OOB files & img info found above
csv_fpath = "pre-processing/train_labels_augall_oob_rem_fin.csv" #@param ["pre-processing/train_labels_augall_oob_rem_fin.csv", "pre-processing/test_labels_notaug_oob_rem_fin.csv"] {allow-input: true}
df = pd.read_csv(csv_fpath)
file_to_del = "aug_85.jpeg" #@param {type:"string"}

# The csv name decides which image folder the file lives in
path_to_del = ('images/' if 'train' in csv_fpath else 'test_images/') + file_to_del
os.remove(path_to_del)

# Update results in train/test csv
# Keep every row that does not belong to the deleted image
df1 = df[df.filename != file_to_del]
df1.to_csv(csv_fpath, index=False)

### 8) Generate xmls (annotations) for new annotated image dataset

In [None]:
# Run this block 1x per class
# (i.e. once with imtype "train" and once with imtype "test")

# Convert train and test csvs to xmls with updated filepaths
# modified from here https://gist.github.com/calisir/568190a5e55a79e08be318c285688457
%cd $wd

imtype = "test" #@param ["train", "test"]

# Read in train or test image label data
if imtype == "train":
  folder = "images" 
  fpath = "images/" 
  labfile = "pre-processing/train_labels_augall_oob_rem_fin.csv"
else:
  folder = "test_images" 
  fpath = "test_images/" 
  labfile = "pre-processing/test_labels_notaug_oob_rem_fin.csv"
df1 = pd.read_csv(labfile)
# One row per image: every other column becomes a list with one entry per box
df = df1.groupby('filename', as_index=False).agg(lambda x: list(x))

# Make folders for annotations
# (mkdir reports an error if a folder already exists; the folder is left as is)
!mkdir pre-processing/train_ann
!mkdir pre-processing/test_ann

# Define functions

def indent(elem, level=0):
    """Pretty-print helper: set text/tail whitespace in place.

    Nested elements are indented by two spaces per `level`, one element per
    line. Existing non-blank text and tails are left untouched.
    """
    pad = "\n" + level*"  "

    def _blank(value):
        # True for None, '' and whitespace-only strings
        return not value or not value.strip()

    if not len(elem):
        # Leaf: only nested leaves get a tail; a childless root is left alone
        if level and _blank(elem.tail):
            elem.tail = pad
        return

    if _blank(elem.text):
        elem.text = pad + "  "
    if _blank(elem.tail):
        elem.tail = pad
    for child in elem:
        indent(child, level+1)
    # The last child's tail sits right before this element's closing tag, so
    # it is pulled back to this element's own indentation
    last_child = elem[-1]
    if _blank(last_child.tail):
        last_child.tail = pad

# Write one Pascal VOC style xml per image (one row of the grouped df)
for i in range(len(df)):
    record = df.iloc[i]
    fileName = str(record['filename'])
    # width/height are repeated for every box of an image; the first entry is used
    height = record['height'][0]
    width = record['width'][0]
    depth = 3

    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = folder
    ET.SubElement(annotation, 'filename').text = fileName
    ET.SubElement(annotation, 'path').text = fpath + fileName

    source = ET.SubElement(annotation, 'source')
    ET.SubElement(source, 'database').text = 'Unknown'

    size = ET.SubElement(annotation, 'size')
    for tag, value in (('width', width), ('height', height), ('depth', depth)):
        ET.SubElement(size, tag).text = str(value)

    ET.SubElement(annotation, 'segmented').text = '0'

    # To handle images with >1 annotation: one <object> per box
    for x in range(len(record['xmin'])):
        ob = ET.SubElement(annotation, 'object')
        ET.SubElement(ob, 'name').text = str(record['class'][x])
        ET.SubElement(ob, 'pose').text = 'Unspecified'
        ET.SubElement(ob, 'truncated').text = '0'
        ET.SubElement(ob, 'difficult').text = '0'

        bbox = ET.SubElement(ob, 'bndbox')
        for corner in ('xmin', 'ymin', 'xmax', 'ymax'):
            ET.SubElement(bbox, corner).text = str(int(record[corner][x]))

    tree = ET.ElementTree(annotation)
    indent(annotation)
    # Output folder follows the imtype selected at the top of the cell
    outf = "pre-processing/train_ann/" if imtype == "train" else "pre-processing/test_ann/"
    outpath = outf + os.path.splitext(fileName)[0] + ".xml"
    tree.write(outpath, encoding='utf8', xml_declaration=False)

In [None]:
# Check that all train images have corresponding annotation
ann_dir = 'pre-processing/train_ann/'
train_dir = 'images/'
files = os.listdir(train_dir)

# Check for duplicate names in the image folder listing
# NOTE(review): `files` lists images/, not the xml folder - confirm which was intended
import collections
print([item for item, count in collections.Counter(files).items() if count > 1])
#!ls -l -a /content/drive/'My Drive'/train/pre-processing/train_ann/

# Loop through train images to see if xml for each one
for file in files:
  base = os.path.splitext(os.path.basename(file))[0]
  train_xml = ann_dir + base + '.xml'
  if os.path.exists(train_xml):
    print("xml exists for {}".format(file))
  else:
    print("!!!xml missing for image {}".format(file))
    #os.remove(file)

# Check for xmls that don't have corresp img
# Compare base names (filename without extension) of xmls and images
xmls = os.listdir('pre-processing/train_ann/')
imgs = os.listdir('images/')
xbases = []
ibases = []
for xml in xmls:
  xbase = os.path.splitext(os.path.basename(xml))[0]
  xbases.append(xbase)
for img in imgs:
  ibase = os.path.splitext(os.path.basename(img))[0]
  ibases.append(ibase)

# np.setdiff1d returns the sorted unique values of xbases that are NOT in ibases
diffs = np.setdiff1d(xbases,ibases)
print("xml(s) that need to be deleted bc have no corresp img: {}".format(diffs))

print("Number of train images:")
!ls images | wc -l

print("Number of train annotations:")
!ls pre-processing/train_ann/ | wc -l

In [None]:
# Inspect number of images and annotations for train and test (should be 1 image/annotation in each group and test should be ~20-30% of train)
# Output order: train annotations, train images, test annotations, test images
!ls pre-processing/train_ann | wc -l
!ls images | wc -l

!ls pre-processing/test_ann | wc -l
!ls test_images | wc -l

In [None]:
# Move final datasets to train and test folders for object detection
# The regenerated xmls replace any same-named files already in annotations/ and test_ann/
!mv pre-processing/train_ann/* annotations
#!rm -r /pre-processing/train_ann

!mv pre-processing/test_ann/* test_ann
#!rm -r pre-processing/test_ann