<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/image_type/image_type_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Image Type Classifier Training Images
---
*Last Updated 28 Oct 2021*   
Follow the steps below to make training and testing datasets using map, phylogeny, illustration, and herbarium sheet image bundles. ~800 images per image type class are downloaded to Google Drive for use in training models in [image_type_train.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/image_type/image_type_train.ipynb). 

Image bundles were made from sources containing *mostly* images from the specified class, but sometimes contain other images. One step of this notebook requires that you go to Google Drive and manually curate the downloaded images. Smaller training datasets generally require more curation for models to learn well.

Notes:
* Change filepaths or information using the form fields to the right of code blocks (also noted in code with 'TO DO')
* One step of this notebook requires that you go to Google Drive and manually curate the downloaded images.

## Installs & Imports
---

In [None]:
# Mount Google Drive at /content/drive to import/export files
# force_remount=True is passed so re-running this cell requests a fresh mount
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Install libraries for augmenting, displaying, and saving images
!pip install imgaug
!pip install pillow

# For downloading images
!apt-get install aria2

# For importing/exporting files, working with arrays, etc
import pathlib
import os
import imageio
import time
import csv
import numpy as np
import pandas as pd
from urllib.request import urlopen
from scipy import misc

# For augmenting the images and bounding boxes
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage

# For drawing onto and plotting the images
import matplotlib.pyplot as plt
import cv2
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# Define functions

# To read in EOL formatted data files
def read_datafile(fpath, sep="\t", header=0, disp_head=True, lineterminator='\n', encoding='latin1'):
    """
    Read an EOL formatted data file into a pandas DataFrame.

    Defaults to tab-separated data files with header in row 0.

    Args:
        fpath: Path (or URL) of the data file, passed straight to pd.read_csv.
        sep: Column separator.
        header: Row number to use as the column names.
        disp_head: If True, print the first rows of the loaded data.
        lineterminator: Character that ends each record.
        encoding: Text encoding of the file.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        FileNotFoundError: If fpath does not exist. The message tells the user
            to fix the path in the form field; the original error is chained.
    """
    try:
        df = pd.read_csv(fpath, sep=sep, header=header, lineterminator=lineterminator, encoding=encoding)
    except FileNotFoundError as e:
        # Re-raise as the same specific type (still caught by `except Exception`)
        # and chain the original so the underlying path is visible in the traceback
        raise FileNotFoundError("File not found: Enter the path to your file in form field and re-run") from e

    if disp_head:
        print("Data header: \n", df.head())

    return df

# To display an image already loaded into the runtime
def display_image(image):
    """Plot an image that is already loaded into the runtime.

    Opens a new 20x15 matplotlib figure, switches the grid off and draws
    `image` with imshow. Nothing is returned.
    """
    plt.figure(figsize=(20, 15))
    plt.grid(False)
    plt.imshow(image)

## Download images to Google Drive from EOL, Wikimedia, and Flickr BHL image bundles
---
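Run the cells below to download images for every image type class (map, herbarium sheet, phylogeny, illustration) from its image bundle(s). Each download cell loops through all classes, so it only needs to be run once. Optionally, check `test_with_tiny_subset` in the form field to the right to test the downloads with the first few images of each bundle before running the full download.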

In [None]:
# Set up directory structure
# TO DO: Type in the path to your working directory in form field to right
wd = "/content/drive/MyDrive/train" #@param {type:"string"}
cwd = wd + '/pre-processing/images/'
image_data = wd + '/pre-processing/image_data/'
if not os.path.isdir(cwd):
    os.makedirs(cwd)
    os.makedirs(image_data)
%cd $cwd

# Image Type classes
imclasses = ["map", "herb", "phylo", "illus"]

# Image Type bundles
# Map, Herbarium Sheet, Phylogeny
bundles = ["https://editors.eol.org/other_files/bundle_images/classifier/maps.txt", 
           "https://editors.eol.org/other_files/bundle_images/classifier/Phylogeny_images.txt", 
           "https://editors.eol.org/other_files/bundle_images/classifier/herbarium_sheets_download.txt"]

# Illustration
# Combine zoology and botany into one illustration bundle
illus_bundles = ["https://editors.eol.org/other_files/bundle_images/classifier/Zoological_illustrations_download.txt", 
                 "https://editors.eol.org/other_files/bundle_images/classifier/Botanical_illustrations_download.txt"]

In [None]:
# Optional: Test downloads with a small subset first?
# TO DO: If yes, check test_with_tiny_subset box
test_with_tiny_subset = True #@param {type: "boolean"}

# Test downloads with tiny subset
if test_with_tiny_subset:
    filenames_tiny = []
    # Download images for each class
    for i, imclass in enumerate(imclasses):
        # Make folder for each class
        %cd $cwd
        impath = cwd + imclass + "/"
        if not os.path.isdir(impath):
            os.makedirs(impath)
        print("Path to images:")
        %cd $impath

        # Read in corresponding bundle 
        # For map, herbarium sheet, phylogeny
        if imclass != 'illus':
            bundle = bundles[i]
            fn = os.path.basename(bundle)
            df = pd.read_table(bundle, sep='\n', header=None)
        # For illustration
        else:
            fn = 'illustrations_download.txt'
            df = pd.concat([pd.read_table(f, sep='\n', header=None, 
                                na_filter = False) for f in illus_bundles], 
                                ignore_index=True)
        # Save tiny subset file
        df = df.head()
        fn_tiny = impath + os.path.splitext(fn)[0] + '_tinysubset.txt'
        df.to_csv(fn_tiny, sep='\n', header=False)

        # Download images
        !aria2c -x 16 -s 1 -i $fn_tiny

        # Check how many images downloaded
        print("Number of images downloaded to Google Drive for class {}:".format(imclass))
        !ls . | wc -l

        # Move text file to image_data/bundles
        %cd ../..
        !mv $fn_tiny image_data

In [None]:
# Run for all images

# Download images for each class
for i, imclass in enumerate(imclasses):
        # Make folder for each class
        %cd $cwd
        impath = cwd + imclass + "/"
        if not os.path.isdir(impath):
            os.makedirs(impath)
        print("Path to images:")
        %cd $impath

        # Read in corresponding bundle 
        # For map, herbarium sheet, phylogeny
        if imclass != 'illus':
            bundle = bundles[i]
            !wget --user-agent="Mozilla" $bundle
            fn = os.path.basename(bundle)  
        # For illustration
        else:
            
            df = pd.concat([pd.read_table(f, sep='\n', header=None, 
                                na_filter = False) for f in illus_bundles], 
                                ignore_index=True)
            fn = 'illustrations_download.txt'
            df.to_csv(fn, sep='\n', header=False, index=False)

        # Download images
        !aria2c -x 16 -s 1 -i $fn

        # Check how many images downloaded
        print("Number of images downloaded to Google Drive for class {}:".format(imclass))
        !ls . | wc -l

        # Move text file to image_data/bundles
        fpath = impath + fn
        %cd ../..
        !mv $fpath image_data

## Build "null" image class from EOL images
---   
Having a negative control will help train the classifier to recognize which images do not belong in any of the above classes.

In [None]:
# Download null.zip images folder leftover from flower_fruit classifier model
!gdown --id 1-8-5EVq21jMUSvuEJynOBryKSJojOH49

# Unzip images
print("Unzipping botanical null images")
!unzip null.zip

# Google Drive Zipped folders have preserved directory structure
# Hacky workaround to move images to null folder
if not os.path.isdir('null'):
      os.makedirs('null')
!mv content/drive/'My Drive'/summer20/classification/image_type/images/null/* images/null

# Check how many images in 'null/'
print("Number of images in 'null' class:")
!ls . | wc -l

# Delete not needed files/folders
!rm -r content
!rm -r null.zip

## Go to Google Drive and visually inspect images in each folder
---   
Delete images based on chosen exclusion criteria to get consistent classes with representative images.

## Standardize number of images per class
---

In [None]:
# Inspect the number of images in each folder
print("Number of map images:")
maps = !ls images/map | wc -l
print(maps)
print("Number of herbarium sheet images:")
herb = !ls images/herb | wc -l
print(herb)
print("Number of phylogeny images:")
phylo = !ls images/phylo | wc -l
print(phylo)
print("Number of illustration images:")
illus = !ls images/illus | wc -l
print(illus)
print("Number of null images:")
null = !ls images/null | wc -l
print(null)

# Check which folder has the smallest number of images
folders = [maps, herb, phylo, illus, null]
foldernames = ["maps", "herb", "phylo", "illus", "null"]
num_imgs = [int(x.list[0]) for x in folders]
min_imgs = (min(num_imgs))
idx = num_imgs.index(min(num_imgs))
keepfolder = foldernames[idx]
print("The minimum number of images is {} in the folder {}".format(min_imgs, foldernames[idx]))

#### Augment phylogenies to increase dataset size and diversity
Phylogeny has half the images of other folders. Use image augmentation to increase the number and diversity of phylogeny images, then make remaining image classes even.

In [None]:
# Define image augmentation pipeline
# modified from https://github.com/aleju/imgaug
# Each image passed to seq goes through crop, rotation, blur and hue/saturation shift
seq = iaa.Sequential([
    iaa.Crop(px=(1, 16), keep_size=False), # crop by 1-16px; keep_size=False leaves the image at its cropped size (not resized to orig dims)
    iaa.Affine(rotate=(-25, 25)), # rotate -25 to 25 degrees
    iaa.GaussianBlur(sigma=(0, 3.0)), # blur using gaussian kernel with sigma of 0-3
    iaa.AddToHueAndSaturation((-50, 50), per_channel=True) # shift hue and saturation by a value from -50 to 50, sampled separately for each
])

# Optional: set seed to make augmentations reproducible across runs, otherwise will be random each time
ia.seed(1) 

In [None]:
# Optional: Test downloads with a small subset first?
# TO DO: If yes, check test_with_tiny_subset box
test_with_tiny_subset = True #@param {type: "boolean"}

# Augment phylogeny images to increase dataset size and diversity
# Skip files written by an earlier run of this cell (e.g. the tiny subset test)
# so augmented images are not augmented again; sort so the subset is the same each run
filenames = sorted(f for f in os.listdir("images/phylo") if not f.endswith('_aug.jpg'))
# Only show before/after images for the tiny test subset
# Always defined so that the full run does not fail with a NameError
display_results = test_with_tiny_subset
if test_with_tiny_subset:
    filenames = filenames[:6]
# Loop through phylogeny images 
for i, fn in enumerate(filenames, start=1):
    # Read in image
    impath = "images/phylo/" + fn
    image = imageio.imread(impath, pilmode='RGB')
    
    # Augment image using settings defined above in seq
    image_aug = seq.augment(image=image)
    
    # Augmented copy is saved next to the original with an '_aug.jpg' suffix
    fn_aug = os.path.splitext(impath)[0] + '_aug.jpg'

    # Export augmented images to Google Drive
    imageio.imwrite(fn_aug, image_aug)
    
    # Display original and augmented image
    if display_results:
        display_image(image)
        display_image(image_aug)    

    # Display message to track augmentation process by image
    print('{}) Successfully augmented image from {}'.format(i, fn))

#### Delete excess images from classes so that folders have roughly the same number of images

In [None]:
# Randomly delete all but 3000 images from whichever folders have too many images
!find "images/illus" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 rm
!find "images/phylogeny" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 rm