<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_cropping/lepidoptera/lepidoptera_train_tf2_ssd_rcnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Train Tensorflow Faster-RCNN and SSD models to detect butterflies & moths (Lepidoptera) from EOL images
---   
*Last Updated 29 May 2021*  
-Now runs in Python 3 with Tensorflow 2.0-     

Use EOL user generated cropping coordinates to train Faster-RCNN and SSD Object Detection Models implemented in Tensorflow to detect butterflies & moths from EOL images. Training data consists of the user-determined best square thumbnail crop of an image, so model outputs will also be a square around objects of interest.

Datasets were downloaded to Google Drive in [lepidoptera_preprocessing.ipynb](https://github.com/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_cropping/lepidoptera/lepidoptera_preprocessing.ipynb).

***Models were trained in Python 2 and TF 1 in Jan 2020: RCNN trained for 2 days to 200,000 steps and SSD for 2 days to 200,000 steps.*** 

Notes:   
* Before you start: change the runtime to "GPU" with "High RAM"
* Change filepaths/taxon names where you see 'TO DO' 
* For each 24 hour period on Google Colab, you have up to 12 hours of free GPU access. 

References:     
* [Official Tensorflow Object Detection API Instructions](https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html) 
* [Medium Blog on training using Tensorflow Object Detection API in Colab](https://medium.com/analytics-vidhya/training-an-object-detection-model-with-tensorflow-api-using-google-colab-4f9a688d5e8b)

## Installs & Imports
---

In [None]:
# Mount google drive to import/export files
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# For running inference on the TF-Hub module
import tensorflow as tf
import tensorflow_hub as hub

# For downloading and displaying images
import matplotlib
import matplotlib.pyplot as plt
import tempfile
import urllib
from urllib.request import urlretrieve
from six.moves.urllib.request import urlopen
from six import BytesIO

# For drawing onto images
from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageOps

# For measuring the inference time
import time

# For working with data
import numpy as np
import pandas as pd
import os
import csv

# Print Tensorflow version
print('Tensorflow Version: %s' % tf.__version__)

# Check available GPU devices
print('The following GPU devices are available: %s' % tf.test.gpu_device_name())

# Define functions

# Read in data file exported from "Combine output files A-D" block above
def read_datafile(fpath, sep="\t", header=0, disp_head=True):
    """
    Load a delimited text file into a pandas DataFrame.

    Args:
        fpath: Path of the file to read.
        sep: Column delimiter (tab by default).
        header: Row index holding the column names (row 0 by default).
        disp_head: When true, print the first rows of the loaded table.

    Returns:
        The DataFrame parsed from the file.

    Raises:
        Exception: If no file exists at fpath.
    """
    try:
        table = pd.read_csv(fpath, sep=sep, header=header)
    except FileNotFoundError as missing:
        message = "File not found: Enter the path to your file in form field and re-run"
        raise Exception(message).with_traceback(missing.__traceback__)

    if disp_head:
        print("Data header: \n", table.head())

    return table

# To load image in and do something with it
def load_img(path):
    """Read the file at path and decode it as a 3-channel JPEG tensor."""
    raw_bytes = tf.io.read_file(path)
    return tf.image.decode_jpeg(raw_bytes, channels=3)

# To display loaded image
def display_image(image):
    """Draw image on a new 20 x 15 matplotlib figure with the grid turned off."""
    # The figure must be created first so grid/imshow act on it
    plt.figure(figsize=(20, 15))
    plt.grid(False)
    plt.imshow(image)

# For reading in images from URL and passing through TF models for inference
def download_and_resize_image(url, new_width=256, new_height=256,
                              display=False):
    """
    Download an image from a URL, resize it and save it as a temporary JPEG.

    Args:
        url: Address of the image to download.
        new_width: Width in pixels of the saved image.
        new_height: Height in pixels of the saved image.
        display: When true, show the resized image with display_image().

    Returns:
        Tuple (filename, im_h, im_w): path of the saved temporary JPEG and the
        height and width of the image as downloaded (before resizing).
    """
    fd, filename = tempfile.mkstemp(suffix=".jpg")
    # mkstemp hands back an open descriptor; only the path is needed here
    os.close(fd)

    # Close the HTTP response as soon as the bytes have been read
    with urlopen(url) as response:
        image_data = BytesIO(response.read())

    pil_image = Image.open(image_data)
    # PIL reports size as (width, height)
    im_w, im_h = pil_image.size
    # Image.LANCZOS is the same filter as the ANTIALIAS alias removed in Pillow 10
    pil_image = ImageOps.fit(pil_image, (new_width, new_height), Image.LANCZOS)
    pil_image_rgb = pil_image.convert("RGB")
    pil_image_rgb.save(filename, format="JPEG", quality=90)
    if display:
        display_image(pil_image)
    return filename, im_h, im_w

In [None]:
# Download, compile and build the Tensorflow Object Detection API (takes 4-9 minutes)

# TO DO: Type in the path to your working directory in form field to right
basewd = "/content/drive/MyDrive/train" #@param {type:"string"}
%cd $basewd

# Set up directory for TF2 Model Garden
# TO DO: Type in the folder you would like to contain TF2
folder = "tf2" #@param {type:"string"}
# Create the folder and clone the repository only when the folder does not
# exist yet, so re-running this cell does not clone again
if not os.path.exists(folder):
    os.makedirs(folder)
    %cd $folder
    os.makedirs("tf_models")
    %cd tf_models
    # Clone the Tensorflow Model Garden
    !git clone --depth 1 https://github.com/tensorflow/models/
    # Return to the base working directory
    %cd ../..

# Build the Object Detection API
# wd is reused by later cells as the directory holding tf_models/
wd = basewd + '/' + folder
%cd $wd
# Compile the protobuf definitions, copy the TF2 setup.py next to them and
# pip-install the object_detection package
!cd tf_models/models/research/ && protoc object_detection/protos/*.proto --python_out=. && cp object_detection/packages/tf2/setup.py . && python -m pip install .

## Model preparation (only run once)
---
These blocks download and set-up files needed for training object detectors. After running once, you can train and re-train as many times as you'd like.

### Download and extract pre-trained models 

In [None]:
# Download pre-trained models from Tensorflow Object Detection Model Zoo
# https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md
# SSD and Faster-RCNN used as options below
# modified from https://github.com/RomRoc/objdet_train_tensorflow_colab/blob/master/objdet_custom_tf_colab.ipynb

import shutil
import glob
import tarfile

# CD to folder where TF models are installed (tf2)
%cd $wd

# Make folders for your training files for each model
# Faster RCNN Model
if not (os.path.exists('tf_models/train_demo')):
  !mkdir tf_models/train_demo
if not (os.path.exists('tf_models/train_demo/rcnn')):
  !mkdir tf_models/train_demo/rcnn
if not (os.path.exists('tf_models/train_demo/rcnn/pretrained_model')):
  !mkdir tf_models/train_demo/rcnn/pretrained_model
if not (os.path.exists('tf_models/train_demo/rcnn/finetuned_model')):
  !mkdir tf_models/train_demo/rcnn/finetuned_model
if not (os.path.exists('tf_models/train_demo/rcnn/trained')):
  !mkdir tf_models/train_demo/rcnn/trained
# Download the model
MODEL = 'faster_rcnn_resnet50_v1_640x640_coco17_tpu-8'
MODEL_FILE = MODEL + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/tf2/20200711/'
DEST_DIR = 'tf_models/train_demo/rcnn/pretrained_model'
if not (os.path.exists(MODEL_FILE)):
  urlretrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)

tar = tarfile.open(MODEL_FILE)
tar.extractall()
tar.close()

os.remove(MODEL_FILE)
if (os.path.exists(DEST_DIR)):
  shutil.rmtree(DEST_DIR)
os.rename(MODEL, DEST_DIR)

# SSD Model
if not (os.path.exists('tf_models/train_demo/ssd')):
  !mkdir tf_models/train_demo/ssd
if not (os.path.exists('tf_models/train_demo/ssd/pretrained_model')):
  !mkdir tf_models/train_demo/ssd/pretrained_model
if not (os.path.exists('tf_models/train_demo/ssd/finetuned_model')):
  !mkdir tf_models/train_demo/ssd/finetuned_model
if not (os.path.exists('tf_models/train_demo/ssd/trained')):
  !mkdir tf_models/train_demo/ssd/trained
# Download the model
MODEL = 'ssd_mobilenet_v2_320x320_coco17_tpu-8'
MODEL_FILE = MODEL + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/tf2/20200711/'
DEST_DIR = 'tf_models/train_demo/ssd/pretrained_model'
if not (os.path.exists(MODEL_FILE)):
  urlretrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)

tar = tarfile.open(MODEL_FILE)
tar.extractall()
tar.close()

os.remove(MODEL_FILE)
if (os.path.exists(DEST_DIR)):
  shutil.rmtree(DEST_DIR)
os.rename(MODEL, DEST_DIR)

### Convert training data to tf.record format

1) Download lepidoptera_generate_tfrecord.py using code block below

2) Open the Colab file explorer on the right and navigate to your current working directory

3) Double click on lepidoptera_generate_tfrecord.py to open it in the Colab text editor.

4) Modify the file for your train dataset: 
*   update label names to the class(es) of interest at line 31 (Lepidoptera)
        # TO-DO replace this with label map
        def class_text_to_int(row_label):
          if row_label == 'Lepidoptera':
            return 1
          else:
            None
*   update the filepath where you want your train tf.record file to save at line 85
        # TO-DO replace path with your filepath
        def main(_):
            writer = tf.python_io.TFRecordWriter('/content/drive/MyDrive/[yourfilepath]/tf.record')

5) Close Colab text editor and proceed with steps below to generate tf.record files for your test and train datasets

In [None]:
# Download lepidoptera_generate_tfrecord.py to your wd in Google Drive
# Follow directions above to modify the file for your dataset
!gdown --id 1_pRlENeAvGV-h_c-_rl2d0Y1QxMJ0TAS

In [None]:
# Convert crops_test to tf.record format for test data
# Modified from https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html

# TO DO: Update file paths in form fields
csv_input = "/content/drive/MyDrive/train/tf2/pre-processing/Lepidoptera_crops_test_notaug_oob_rem_fin.csv" #@param {type:"string"}
output_path = "/content/drive/MyDrive/train/tf2/test_images/tf.record" #@param {type:"string"}
test_image_dir = "/content/drive/MyDrive/train/tf2/test_images" #@param {type:"string"}

# Run the downloaded script with the csv, output path and image directory set above
!python lepidoptera_generate_tfrecord.py --csv_input=$csv_input  --output_path=$output_path  --image_dir=$test_image_dir

In [None]:
# Move tf.record for test images to test images directory
!mv tf.record $image_dir

In [None]:
# Convert crops_train to tf.record format for train data
# Modified from https://tensorflow-object-detection-api-tutorial.readthedocs.io/en/latest/training.html

# TO DO: Update file paths in form fields
csv_input = "/content/drive/MyDrive/train/tf2/pre-processing/Lepidoptera_crops_train_aug_oob_rem_fin.csv" #@param {type:"string"}
output_path = "/content/drive/MyDrive/train/tf2/images/tf.record" #@param {type:"string"}
train_image_dir = "/content/drive/MyDrive/train/tf2/images" #@param {type:"string"}
global image_dir

!python lepidoptera_generate_tfrecord.py --csv_input=$csv_input  --output_path=$output_path  --image_dir=$train_image_dir

In [None]:
# Move tf.record for training images to train images directory
!mv tf.record $image_dir

### Make label map for class Lepidoptera

In [None]:
%%writefile labelmap.pbtxt
item {
  id: 1
  name: 'Lepidoptera'
}

### Modify model config files for training Faster-RCNN and SSD with your dataset

If you have errors with training, check the pipeline_config_path and model_dir in the config files for the Faster-RCNN or SSD model

In [None]:
# Adjust model config file based on training/testing datasets
# Modified from https://stackoverflow.com/a/63645324
from google.protobuf import text_format
from object_detection.protos import pipeline_pb2
%cd $wd

# TO DO: Adjust parameters  ## add form fields here
# Suffix that write_config() appends to the modified config file name
# NOTE(review): this name shadows the builtin `filter`; write_config() reads it as a global
filter = "Lepidoptera" #@param {type:"string"}
# Prefix that read_config() puts in front of each entry of model_configs
config_basepath = "tf_models/train_demo/" #@param {type:"string"}
# Label map path written into both the train and eval input readers
label_map = 'labelmap.pbtxt'
train_tfrecord_path = "/content/drive/MyDrive/train/tf2/images/tf.record" #@param {type:"string"}
test_tfrecord_path = "/content/drive/MyDrive/train/tf2/test_images/tf.record" #@param {type:"string"}
# Prefix that modify_config() puts in front of the pretrained checkpoint path
ft_ckpt_basepath = "/content/drive/MyDrive/train/tf2/tf_models/train_demo/" #@param {type:"string"}
ft_ckpt_type = "detection" #@param ["detection", "classification"]
num_classes = 1 #@param
batch_size = 1 #@param ["1", "4", "8", "16", "32", "64", "128"] {type:"raw"}

# Define pipeline for modifying model config files

def read_config(model_config):
    """
    Parse a pipeline config file into a TrainEvalPipelineConfig message.

    Args:
        model_config: Config path relative to the global config_basepath. It
            must contain 'rcnn/' or 'ssd/' so the matching pretrained
            checkpoint can be selected.

    Returns:
        Tuple (pipeline, model_ckpt, config_fpath): the parsed message, the
        pretrained checkpoint path relative to the fine-tune base path, and
        the full path of the config file that was read.

    Raises:
        ValueError: If model_config names neither an rcnn nor an ssd model.
    """
    # 'rcnn' is checked before 'ssd', matching the original precedence
    for model_type in ('rcnn', 'ssd'):
        if model_type + '/' in model_config:
            model_ckpt = model_type + '/pretrained_model/checkpoint/ckpt-0'
            break
    else:
        raise ValueError(
            "Unrecognized model config '{}': expected a path containing "
            "'rcnn/' or 'ssd/'".format(model_config))

    config_fpath = os.path.join(config_basepath, model_config)
    pipeline = pipeline_pb2.TrainEvalPipelineConfig()
    with tf.io.gfile.GFile(config_fpath, "r") as f:
        proto_str = f.read()
    text_format.Merge(proto_str, pipeline)
    return pipeline, model_ckpt, config_fpath

def modify_config(pipeline, model_ckpt, ft_ckpt_basepath):
    """
    Point a parsed pipeline config at this notebook's dataset and checkpoint.

    Reads the globals num_classes, ft_ckpt_type, batch_size, label_map,
    train_tfrecord_path and test_tfrecord_path set in the form fields.

    Args:
        pipeline: TrainEvalPipelineConfig message to modify in place.
        model_ckpt: Pretrained checkpoint path relative to ft_ckpt_basepath.
        ft_ckpt_basepath: Directory prefix for model_ckpt.

    Returns:
        The same pipeline message, modified.

    Raises:
        ValueError: If the config does not define a model architecture.
    """
    finetune_checkpoint = os.path.join(ft_ckpt_basepath, model_ckpt)

    # `model` is a oneof: assigning into faster_rcnn on an SSD config would
    # replace the SSD architecture, so set num_classes on whichever
    # architecture the config actually defines
    model_arch = pipeline.model.WhichOneof('model')
    if model_arch is None:
        raise ValueError("Pipeline config does not define a model architecture")
    getattr(pipeline.model, model_arch).num_classes = num_classes

    pipeline.train_config.fine_tune_checkpoint = finetune_checkpoint
    pipeline.train_config.fine_tune_checkpoint_type = ft_ckpt_type
    pipeline.train_config.batch_size = batch_size
    pipeline.train_config.use_bfloat16 = False # True only if training on TPU

    pipeline.train_input_reader.label_map_path = label_map
    pipeline.train_input_reader.tf_record_input_reader.input_path[0] = train_tfrecord_path

    pipeline.eval_input_reader[0].label_map_path = label_map
    pipeline.eval_input_reader[0].tf_record_input_reader.input_path[0] = test_tfrecord_path

    return pipeline

def write_config(pipeline, config_fpath):
    """
    Serialize pipeline to text and save it next to the original config.

    The output name is the input path with its extension replaced by
    '_<filter>.config', where filter is the global set in the form fields.

    Returns:
        Path of the file that was written.
    """
    stem, _ext = os.path.splitext(config_fpath)
    config_outfpath = stem + '_' + filter + '.config'
    # Serialize before opening so a failure here leaves no empty file behind
    serialized = text_format.MessageToString(pipeline)
    with tf.io.gfile.GFile(config_outfpath, "wb") as out_file:
        out_file.write(serialized)

    return config_outfpath

def setup_pipeline(model_config, ft_ckpt_basepath):
    """
    Read, modify and re-save one model config file.

    Args:
        model_config: Config path passed on to read_config().
        ft_ckpt_basepath: Checkpoint directory prefix passed to modify_config().

    Returns:
        "Success!" when an output path was produced, otherwise
        "Fail: try again".
    """
    print('\n Modifying model config file for {}'.format(model_config))
    parsed, ckpt_relpath, source_path = read_config(model_config)
    updated = modify_config(parsed, ckpt_relpath, ft_ckpt_basepath)
    saved_path = write_config(updated, source_path)
    print(' Modifed model config file saved to {}'.format(saved_path))
    return "Success!" if saved_path else "Fail: try again"

# Modify model configs
# Config paths are relative to config_basepath
model_configs = ['rcnn/pretrained_model/pipeline.config', 'ssd/pretrained_model/pipeline.config']
# As the last expression in the cell, the list of status strings is the cell's output
[setup_pipeline(model_config, ft_ckpt_basepath) for model_config in model_configs]

## Train
--- 

In [None]:
# Determine how many train and eval steps to use based on dataset size

# TO DO: Only need to update path if you didn't just run "Model Preparation" block above
try: 
    # Bare name lookup: raises NameError when the variable was not set by an earlier cell
    train_image_dir
except NameError:
    train_image_dir = "/content/drive/MyDrive/train/tf2/images" #@param {type:"string"}
# Counts every entry in the directory
# NOTE(review): if tf.record is stored in this folder it is counted too - confirm
examples = len(os.listdir(train_image_dir))
print("Number of train examples: \n", examples)

# Get the number of testing examples
# TO DO: Only need to update path if you didn't just run "Model Preparation" block above
try:
    test_image_dir
except NameError:
    test_image_dir = "/content/drive/MyDrive/train/tf2/test_images" #@param {type:"string"}
test_examples = len(os.listdir(test_image_dir))
print("Number of test examples: \n", test_examples)

# Get the training batch size
# TO DO: Only need to update value if you didn't just run "Model Preparation" block above
try:
    batch_size
except NameError:
    batch_size = 1 #@param ["1", "4", "8", "16", "32", "64", "128"] {type:"raw"}
print("Batch size: \n", batch_size)

# Calculate roughly how many steps to use for training and testing
# Both values stay floats here; they are truncated with int() only for printing
steps_per_epoch = examples / batch_size
num_eval_steps = test_examples / batch_size
print("Number of steps per training epoch: \n", int(steps_per_epoch))
print("Number of evaluation steps: \n", int(num_eval_steps))

In [None]:
# TO DO: Choose how many epochs to train for
epochs = 410 #@param {type:"slider", min:10, max:1000, step:100}
# steps_per_epoch and num_eval_steps come from the step calculation cell above
num_train_steps = int(epochs * steps_per_epoch)
num_eval_steps = int(num_eval_steps)
# TO DO: Choose paths for RCNN or SSD model
pipeline_config_path = "tf_models/train_demo/rcnn/pretrained_model/pipeline_Lepidoptera.config" #@param ["tf_models/train_demo/rcnn/pretrained_model/pipeline_Lepidoptera.config", "tf_models/train_demo/ssd/pretrained_model/pipeline_Lepidoptera.config"]
model_dir = "tf_models/train_demo/rcnn/trained" #@param ["tf_models/train_demo/rcnn/trained", "tf_models/train_demo/ssd/trained"]
output_directory = "tf_models/train_demo/rcnn/finetuned_model" #@param ["tf_models/train_demo/rcnn/finetuned_model", "tf_models/train_demo/ssd/finetuned_model"]
trained_checkpoint_dir = "tf_models/train_demo/rcnn/trained" #@param ["tf_models/train_demo/rcnn/trained", "tf_models/train_demo/ssd/trained"] {allow-input: true}

# Save vars to environment for access with cmd line tools below
# Store the values (not the variable names); environment values must be strings
for env_name, env_value in {
    "trained_checkpoint_dir": trained_checkpoint_dir,
    "num_train_steps": num_train_steps,
    "num_eval_steps": num_eval_steps,
    "pipeline_config_path": pipeline_config_path,
    "model_dir": model_dir,
    "output_directory": output_directory,
}.items():
    os.environ[env_name] = str(env_value)

In [None]:
# Optional: Visualize training progress with Tensorboard

# Load the TensorBoard notebook extension
%load_ext tensorboard
# Log training progress using TensorBoard
# model_dir is the same directory passed as --model_dir to the training script
%tensorboard --logdir $model_dir

In [None]:
# Actual training
# Note: You can change the number of epochs in code block below and re-run to train longer
# Modified from https://github.com/RomRoc/objdet_train_tensorflow_colab/blob/master/objdet_custom_tf_colab.ipynb
# Select the Agg matplotlib backend before launching training
matplotlib.use('Agg')
# The script and config paths below are relative to the TF2 working directory
%cd $wd

# num_train_steps, num_eval_steps, pipeline_config_path and model_dir are set in the cells above
!python tf_models/models/research/object_detection/model_main_tf2.py \
    --alsologtostderr \
    --num_train_steps=$num_train_steps \
    --num_eval_steps=$num_eval_steps \
    --pipeline_config_path=$pipeline_config_path \
    --model_dir=$model_dir 

In [None]:
# Export trained model
# Modified from https://github.com/RomRoc/objdet_train_tensorflow_colab/blob/master/objdet_custom_tf_colab.ipynb
# The script and config paths below are relative to the TF2 working directory
%cd $wd

# Save the model
# Reads checkpoints from trained_checkpoint_dir and writes the export to output_directory
!python tf_models/models/research/object_detection/exporter_main_v2.py \
    --input_type image_tensor \
    --pipeline_config_path=$pipeline_config_path \
    --trained_checkpoint_dir=$trained_checkpoint_dir \
    --output_directory=$output_directory

In [None]:
# Evaluate trained model to get mAP and IoU stats for COCO 2017
# Change pipeline_config_path and checkpoint_dir when switching between SSD and Faster-RCNN models
matplotlib.use('Agg')

!python tf_models/models/research/object_detection/model_main_tf2.py \
    --alsologtostderr \
    --model_dir=$model_dir \
    --pipeline_config_path=$pipeline_config_path \
    --checkpoint_dir=$trained_checkpoint_dir