<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/image_type/image_type_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Image Type Classifier Training Images
---
*Last Updated 7 Aug 2025*   
Follow the steps below to make training and testing datasets using map, phylogeny, illustration, and herbarium sheet image bundles. ~800 images per image type class are downloaded to Google Drive for use in training models in [image_type_train.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/image_type/image_type_train.ipynb).

Image bundles were made from sources containing *mostly* images from the specified class, but sometimes contain other images. One step of this notebook requires that you go to Google Drive and manually curate the downloaded images. Smaller training datasets generally require more curation for models to learn well.

Notes:   
* Run code blocks by pressing play button in brackets on left
* Before you start: change the runtime to "GPU" with "High RAM"
* Change parameters using form fields on right (find details at corresponding lines of code by searching '#@param')

## Installs & Imports
---

In [None]:
#@title Choose where to save results (or keep defaults) & set up environment
import os

# Use dropdown menu on right
save = "in Colab runtime (files deleted after each session)" #@param ["in my Google Drive", "in Colab runtime (files deleted after each session)"]
print("Saving results ", save)

# Mount google drive to export file(s)
if 'Google Drive' in save:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

# Type of classification pipeline
classif_type = "image_type" #@param ["image_type", "rating"] {allow-input: true}

# Type in the path to your working directory in form field to right
basewd = "/content/drive/MyDrive/train/tf2" #@param ["/content/drive/MyDrive/train/tf2"] {allow-input: true}
basewd = basewd + '/' + classif_type

# Folder where preprocessing outputs will be saved
folder = "pre-processing" # @param ["pre-processing","inspect_resul","results"] {"allow-input":true}
cwd = basewd + '/' + folder

# Folder where image metadata will be saved
data_folder = "image_data" #@param ["image_data"] {allow-input: true}
data_wd = cwd + '/' + data_folder

# Folder where train/test images will be saved
train_folder = "images" #@param ["images"] {allow-input: true}
train_wd = cwd + '/' + train_folder

# Enter image classes of interest in form field
filters = ["map", "phylo", "herb", "illus"] #@param ["[\"map\", \"phylo, \"herb\", \"illus\"]"] {type:"raw", allow-input: true}

# Download helper_funcs folder
!pip3 -q install --upgrade gdown
!gdown 1xmkrYEJKLJvei9q4zulKfqsGTgDvfvpR
!tar -xzvf helper_funcs.tar.gz -C .

# Install requirements.txt
!pip3 -q install -r requirements.txt

# Set up directory structure
from setup import setup_dirs

# Set up directory structure
setup_dirs(cwd, data_wd, train_wd)
print("\nWorking directory set to: \n", cwd)
print("\nImage metadata directory set to: \n", data_wd)
print("\nTraining images directory set to: \n", train_wd)

In [None]:
#@title Import libraries

# For augmenting, displaying, and saving images
!pip install imaug
!pip install pillow

# For downloading images
!apt-get install aria2

# For importing/exporting files, working with arrays, etc
import pathlib
import os
import imageio
import time
import csv
import numpy as np
import pandas as pd
from urllib.request import urlopen

# For augmenting the images
import imgaug as ia
import imgaug.augmenters as iaa

# For drawing onto and plotting the images
import matplotlib.pyplot as plt
import cv2
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# Define functions
from wrangle_data import *

# Image Type bundle urls
# Map, Herbarium Sheet, Phylogeny
bundles = ["https://editors.eol.org/other_files/bundle_images/classifier/maps.txt",
           "https://editors.eol.org/other_files/bundle_images/classifier/Phylogeny_images.txt",
           "https://editors.eol.org/other_files/bundle_images/classifier/herbarium_sheets_download.txt"]

# Illustration
# Pool zoology and botany into one illustration bundle
illus_bundles = ["https://editors.eol.org/other_files/bundle_images/classifier/Zoological_illustrations_download.txt",
                 "https://editors.eol.org/other_files/bundle_images/classifier/Botanical_illustrations_download.txt"]

## Download images to Google Drive from EOL, Wikimedia, and Flickr BHL image bundles
---
Run this step once: the code below loops over every class in `filters` and downloads its image bundle(s). Use the dropdown menu to the right to choose between a tiny test subset and all images.

In [None]:
#@title Download images for each class

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]
print("Run: ", run)

# Download images, augment them, and save to Google Drive
print("\nDownloading training images for each class")

# Download images for each class
for i, imclass in enumerate(filters):

        # Make folder for each class
        %cd $train_wd
        impath = train_wd + "/" + imclass + "/"
        if not os.path.isdir(impath):
            os.makedirs(impath)
        print("Path to images:")
        %cd $impath

        # Read in corresponding bundle
        # For map, herbarium sheet, phylogeny
        if imclass != 'illus':
            bundle = bundles[i]
            !wget --user-agent="Mozilla" $bundle
            fn = os.path.basename(bundle)
            df = pd.read_table(fn)

        # For illustration
        else:
            il_fns = []
            for illus_bundle in illus_bundles:
                !wget --user-agent="Mozilla" $illus_bundle
                il_fn = os.path.basename(illus_bundle)
                il_fns.append(il_fn)

            df = pd.concat([pd.read_table(il_fn) for il_fn in il_fns], ignore_index=True)
            fn = 'illustrations_download.txt'

        # Take tiny subset or all images from bundle
        start, stop = set_start_stop(run, df)
        df = df.iloc[start:stop]
        df.to_csv(fn, sep='\n', header=False, index=False)

        # Download images
        !aria2c -x 16 -s 1 -i $fn

        # Check how many images downloaded
        print("Number of images downloaded to Google Drive for class {}:".format(imclass))
        !ls . | wc -l

        # Move image metadata text file(s) to image_data/bundles
        %cd $cwd
        impath = impath + "*.txt"
        !mv $impath image_data/

## Build "null" image class from EOL images
---   
Having a negative control helps the classifier learn which images do not belong in any of the above classes.

In [None]:
# Build the 'null' class folder inside train_wd from a pre-made zip of images.
# Relies on `train_wd` (setup cell) and `run` (form field of an earlier cell) already being defined.

# Download null.zip images folder leftover from flower_fruit classifier model
%cd $train_wd
!pip3 install --upgrade gdown
!gdown 1-8-5EVq21jMUSvuEJynOBryKSJojOH49

# Unzip images
print("Unzipping botanical null images...")
!unzip null.zip

# Move unzipped null image folder content to images/null
# Google Drive Zipped folders have preserved directory structure,
# so the images unpack under content/drive/'My Drive'/... relative to the current folder
if not os.path.isdir('null'):
      os.makedirs('null')
# Demo mode: move only a random sample of 6 images (shuf -n 6)
# NOTE(review): `{}` below is the xargs placeholder, not a Python variable - confirm IPython
# passes it through unchanged before adding any {var} interpolation to this line
if "tiny subset" in run:
    !shuf -n 6 -e content/drive/'My Drive'/summer20/classification/image_type/images/null/* | xargs -i mv {} null
# Run for all images
else:
    !mv content/drive/'My Drive'/summer20/classification/image_type/images/null/* null

# Check how many images in 'null/'
print("Number of images in 'null' class:")
%cd null
!ls . | wc -l

# Delete not needed files/folders (the unpacked directory tree and the downloaded archive)
%cd ../
!rm -r content
!rm -r null.zip

## Go to Google Drive and visually inspect images in each folder
---   
Delete images based on chosen exclusion criteria to get consistent classes with representative images.

In [None]:
#@title Standardize number of images per class
%cd $cwd

# Inspect the number of images in each folder
print("Number of map images:")
maps = !ls images/map | wc -l
print(maps)
print("Number of herbarium sheet images:")
herb = !ls images/herb | wc -l
print(herb)
print("Number of phylogeny images:")
phylo = !ls images/phylo | wc -l
print(phylo)
print("Number of illustration images:")
illus = !ls images/illus | wc -l
print(illus)
print("Number of null images:")
null = !ls images/null | wc -l
print(null)

# Check which folder has the smallest number of images
folders = [maps, herb, phylo, illus, null]
foldernames = ["maps", "herb", "phylo", "illus", "null"]
num_imgs = [int(x.list[0]) for x in folders]
min_imgs = (min(num_imgs))
idx = num_imgs.index(min(num_imgs))
keepfolder = foldernames[idx]
print("\033[93mThe minimum number of images is {} in the folder {}\033[0m".format(min_imgs, foldernames[idx]))

In [None]:
#@title Augment phylogeny images to increase dataset size and diversity
# Phylogeny has half the images of other folders. Use image augmentation to increase the number and diversity of phylogeny images, then make remaining image classes even.
# Paths are relative to the working directory entered by the previous cell (%cd $cwd).

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]
print("Run: ", run)

# Augment phylogeny images and save the results next to the originals
print("\nAugmenting images for phylogeny")
# Skip files written by an earlier run of this cell (*_aug.jpg) so re-running does not
# augment already-augmented images, and sort so the tiny subset is the same on every run
filenames = sorted(fn for fn in os.listdir("images/phylo") if not fn.endswith('_aug.jpg'))
start, stop = set_start_stop(run, filenames)

# Loop through phylogeny images
for i, fn in enumerate(filenames[start:stop], start=1):
    # Read in image
    impath = "images/phylo/" + fn
    image = imageio.imread(impath, pilmode='RGB')

    # Augment image (augment_image is a helper that is not defined in this notebook)
    image_aug = augment_image(image)

    # Name the augmented copy after the original file: <name>_aug.jpg
    fn_aug = os.path.splitext(impath)[0] + '_aug.jpg'

    # Export augmented images to Google Drive
    imageio.imwrite(fn_aug, image_aug)

    # Display original and augmented image
    if 'tiny subset' in run:
        display_image(image)
        display_image(image_aug)

    # Display message to track augmentation process by image
    print('\033[92m{}) Successfully augmented image from {}\033[0m'.format(i, fn))

In [None]:
#@title Randomly delete all but N images to normalize number of images per folder
# Only run for 5 images if running in demo mode
if "tiny subset" in run:
    print("\033[92mDeleting all but 5 images from all folders...\033[0m")
    !find "images/illus" -type f -print0 | sort -zR | tail -zn +6 | xargs -0 -r rm -v
    !find "images/phylo" -type f -print0 | sort -zR | tail -zn +6 | xargs -0 -r rm -v
    !find "images/null" -type f -print0 | sort -zR | tail -zn +6 | xargs -0 -r rm -v
    !find "images/map" -type f -print0 | sort -zR | tail -zn +6 | xargs -0 -r rm -v
    !find "images/herb" -type f -print0 | sort -zR | tail -zn +6 | xargs -0 -r rm -v
# Run for all images - up to 3k per training class
else:
    print("\033[92mDeleting all but 5 images from all folders...\033[0m")
    !find "images/illus" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 -r rm -v
    !find "images/phylo" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 -r rm -v
    !find "images/null" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 -r rm -v
    !find "images/map" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 -r rm -v
    !find "images/herb" -type f -print0 | sort -zR | tail -zn +3001 | xargs -0 -r rm -v

In [None]:
#@title Inspect the final number of images for each class
import orjson

# Inspect the number of images in each folder
print("Number of map images:")
maps = !ls images/map | wc -l
print(maps)
print("Number of herbarium sheet images:")
herb = !ls images/herb | wc -l
print(herb)
print("Number of phylogeny images:")
phylo = !ls images/phylo | wc -l
print(phylo)
print("Number of illustration images:")
illus = !ls images/illus | wc -l
print(illus)
print("Number of null images:")
null = !ls images/null | wc -l
print(null)

# Make dictionary of folders and number of images they contain
folders = [maps, herb, phylo, illus, null]
foldernames = ["maps", "herb", "phylo", "illus", "null"]
num_imgs = [int(x.list[0]) for x in folders]
num_train = dict(zip(foldernames, num_imgs))
print(num_train)

# Save number of images per training class to a json file
outfpath = "image_data/num_imgs_per_class.json"
print("\nSaving data on number of images per training class to: ", outfpath)
with open(outfpath, "wb") as f:
        f.write(orjson.dumps(num_train, option= orjson.OPT_INDENT_2))