<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_tagging/plant_pollinator/plant_poll_generate_tags_yolov3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using YOLO v3 pre-trained on Google Open Images to add plant-pollinator co-occurrence tags for ladybugs, beetles, and insects in plant images
---
*Last Updated 9 June 2021*   
Using a YOLOv3 model (downloaded from [here](https://github.com/AlexeyAB/darknet) ) pre-trained on [Google Open Images](https://storage.googleapis.com/openimages/web/visualizer/index.html?set=train&type=detection&c=%2Fm%2F03vt0) as a method to do customized, large-scale image processing. EOL Angiosperm images will be tagged for plant-pollinator co-occurrence using the detected insects. Tags will further extend EOLv3 image search functions.

## Installs & Imports
---

In [None]:
# Mount google drive to import/export files
# The mount point is /content/drive; the working-directory form field further
# down (basewd) points to a folder inside it
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# For importing/exporting files, working with arrays, etc
import os
import glob
import pathlib
import six.moves.urllib as urllib
import sys
import tarfile
import zipfile
import numpy as np 
import csv
import matplotlib.pyplot as plt
import time
import pandas as pd

# For downloading images
!apt-get install aria2

# For drawing onto and plotting images
import matplotlib.pyplot as plt
from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageOps
import cv2
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libc-ares2
The following NEW packages will be installed:
  aria2 libc-ares2
0 upgraded, 2 newly installed, 0 to remove and 39 not upgraded.
Need to get 1,274 kB of archives.
After this operation, 4,912 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libc-ares2 amd64 1.14.0-1 [37.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 aria2 amd64 1.33.1-1 [1,236 kB]
Fetched 1,274 kB in 0s (7,577 kB/s)
Selecting previously unselected package libc-ares2:amd64.
(Reading database ... 160772 files and directories currently installed.)
Preparing to unpack .../libc-ares2_1.14.0-1_amd64.deb ...
Unpacking libc-ares2:amd64 (1.14.0-1) ...
Selecting previously unselected package aria2.
Preparing to unpack .../aria2_1.33.1-1_amd64.deb ...
Unpacking aria2 (1.33.1-1) ...
Setting up libc-ares2

## Model preparation (only run once)
---

In [None]:
# Install darknet

# TO DO: Type in the path to your working directory in form field to right
basewd = "/content/drive/MyDrive/train" #@param {type:"string"}
wd = 'darknet'
%cd $basewd

# Download darknet (the native implementation of YOLO)
if os.path.exists(wd):
    %cd $wd

elif not os.path.exists(wd):
    !git clone https://github.com/AlexeyAB/darknet
    # Compile darknet
    %cd $wd
    !python setup.py build_ext --inplace
    # Change makefile to have GPU and OPENCV enabled
    !sed -i 's/OPENCV=0/OPENCV=1/' Makefile
    !sed -i 's/GPU=0/GPU=1/' Makefile
    !sed -i 's/CUDNN=0/CUDNN=1/' Makefile
    !sed -i 's/CUDNN_HALF=0/CUDNN_HALF=1/' Makefile
    # Download pretrained YOLOv3 weights for Open Images
    !wget https://pjreddie.com/media/files/yolov3-openimages.weights

# Verify CUDA version (for using GPU)
!/usr/local/cuda/bin/nvcc --version

# Make darknet
!make

/content/drive/MyDrive/train
/content/drive/MyDrive/train/darknet2
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
/content/drive/MyDrive/train/darknet2/darknet
chmod +x *.sh


## Generate plant-pollinator co-occurrence tags for images
---
Run EOL 20k image bundles through pre-trained object detection models and save results in 4 batches (A-D). 

### Prepare object detection functions and settings

In [None]:
# Functions

# Read in data file
def read_datafile(fpath, sep="\t", header=0, disp_head=True):
    """
    Load a delimited text file into a pandas dataframe.

    fpath: path of the file to read
    sep: column separator (tab by default)
    header: row number holding the column names (row 0 by default)
    disp_head: when True, print the first rows of the loaded data

    Returns the loaded dataframe.
    Raises Exception (keeping the original traceback) if the file is missing.
    """
    try:
        data = pd.read_csv(fpath, sep=sep, header=header)
    except FileNotFoundError as err:
        msg = "File not found: Enter the path to your file in form field and re-run"
        raise Exception(msg).with_traceback(err.__traceback__)

    if disp_head:
        print("Data header: \n", data.head())

    return data

# Read in bundle images
def read_eolbundle(bundle, no_bundles):
    """
    Load every sub-bundle of an EOL image bundle into one dataframe.

    bundle: URL (or filename) of any sub-bundle; only its basename without the
        trailing "_<number>.txt" is used
    no_bundles: number of the last sub-bundle to load (ex: 31 loads
        000001 - 000031 for Angiosperms)

    Each sub-bundle is read as a tab-separated file without a header row.
    Returns the concatenated dataframe with a fresh 0..n-1 index.
    Raises ValueError if no_bundles is smaller than 1.
    """
    if no_bundles < 1:
        raise ValueError("no_bundles must be at least 1, got {}".format(no_bundles))

    # Strip the extension and the trailing "_<number>" to get the shared basename
    base = os.path.splitext(os.path.basename(bundle))[0].rsplit('_', 1)[0]
    url_root = "https://editors.eol.org/other_files/bundle_images/files/"
    # Sub-bundle numbers run from 1 to no_bundles inclusive, zero-padded to 6 digits
    all_filenames = ["{}{}_{:06d}.txt".format(url_root, base, num)
                     for num in range(1, no_bundles + 1)]
    bundles = pd.concat([pd.read_csv(f, sep='\t', header=None) for f in all_filenames],
                        ignore_index=True)
    print("EOL image bundle with {} images: \n{}".format(len(bundles), bundles.head()))

    return bundles

# Define start and stop indices in EOL bundle for running inference
def set_start_stop():
    """
    Pick the slice of the EOL bundle to run inference on.

    Reads two notebook globals: `test_with_tiny_subset` (bool) and `outfpath`
    (path of the batch text file, whose name carries the batch suffix).

    Returns (start, stop) row positions.
    Raises ValueError if not testing and `outfpath` has no "_a." - "_d." suffix.
    """
    # To test with a tiny subset, use 5 consecutive images from a random position
    if test_with_tiny_subset:
        start = np.random.choice(a=1000, size=1)[0]
        return start, start + 5

    # To run inference on 4 batches of 5k images each
    batch_ranges = {
        "_a.": (0, 5000),       # batch a is from 0-5000
        "_b.": (5000, 10000),   # batch b is from 5000-10000
        "_c.": (10000, 15000),  # batch c is from 10000-15000
        "_d.": (15000, 20000),  # batch d is from 15000-20000
    }
    for suffix, (start, stop) in batch_ranges.items():
        if suffix in outfpath:
            return start, stop

    # A custom file name without a batch suffix used to fail with an
    # UnboundLocalError on the return statement; say what is wrong instead
    raise ValueError(
        "Cannot tell which batch to run from '{}': file name must contain "
        "_a., _b., _c. or _d. (or check test_with_tiny_subset)".format(outfpath))

# To display results
def imShow(path):
    """
    Show an image file inline, enlarged 3x with bicubic interpolation.

    path: path of the image file to display
    Raises FileNotFoundError if the image cannot be read.
    """
    image = cv2.imread(path)
    # cv2.imread returns None instead of raising when the file is missing or unreadable
    if image is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    height, width = image.shape[:2]
    resized_image = cv2.resize(image, (3*width, 3*height), interpolation=cv2.INTER_CUBIC)

    fig = plt.gcf()
    fig.set_size_inches(18, 10)
    plt.axis("off")
    # OpenCV loads images as BGR; matplotlib expects RGB
    plt.imshow(cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB))
    plt.show()

# For uploading an image from url
# Modified from https://www.pyimagesearch.com/2015/03/02/convert-url-to-image-with-python-and-opencv/
def url_to_image(url):
    """
    Download an image and return it as an RGB array.

    url: address of the image to fetch
    Returns the decoded image with channels in RGB order.
    Raises ValueError if the downloaded bytes cannot be decoded as an image.
    """
    # Close the connection as soon as the bytes have been read
    with urllib.request.urlopen(url) as resp:
        raw = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    # cv2.imdecode returns None when the data is not a readable image
    if image is None:
        raise ValueError("Could not decode image from url: {}".format(url))

    # OpenCV decodes to BGR; convert to RGB for plotting
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

### Temporarily download images from EOL bundle to Google Drive (YOLO cannot directly parse URL images)

In [None]:
# Download images for 20K bundle of Angiosperm images with 31 sub-bundles
# To DO: Enter any EOL Angiosperm image bundle URL
bundle = "https://editors.eol.org/other_files/bundle_images/files/images_for_Angiosperms_20K_breakdown_download_000031.txt" #@param {type:"string"}
df = read_eolbundle(bundle, 31)

# Test with a smaller subset than 5k images?
# TO DO: If yes, check test_with_tiny_subset box
test_with_tiny_subset = True #@param {type: "boolean"}

# Take 5k subset of bundle for running inference
# TO DO: Change file name for each bundle/run abcd if doing 4 batches using dropdown form to right
subset = "plant_poll_coocc_tags_a" #@param ["plant_poll_coocc_tags_a", "plant_poll_coocc_tags_b", "plant_poll_coocc_tags_c", "plant_poll_coocc_tags_d"] {allow-input: true}
outfpath = wd + "/data/imgs/" + subset + ".txt"

# Save 5k subset to text file for image download
start, stop = set_start_stop()
df = df.iloc[start:stop]
df.to_csv(outfpath, sep='\n', index=False, header=False)

# Download images 
# Note: Takes 7-10 min per 5k imgs, aria2 downloads 16imgs at a time
img_outfpath = wd + "/data/imgs"
%cd $img_outfpath
!aria2c -x 16 -s 1 -i $subset

# Verify how many images downloaded
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# If images downloaded correctly, move text file to data/img_info/
%cd ../
!mv imgs/*.txt img_info/

In [None]:
# Make imgs.txt file to run images through YOLO for inference in batches by filename
%cd $wd
%cd data

inf_subset = img_outfpath + '/' + subset
with open(inf_subset, 'w', encoding='utf-8') as f:
    for dir, dirs, files in os.walk(path):
        files = [fn for fn in files]
        for fn in files:
            if 'txt' not in fn:
                out = "data/imgs/" + fn
                f.writelines(out + '\n')

# Inspect imgs.txt file to confirm length and content
print("\nNumber of images in {}: {}".format(inf_subset, len(df)))
print("\nImages textfile: \n")
df = read_datafile(inf_subset, header=None, sep='\n', disp_head=True)

### Run images through trained model
---

#### Test: Run individual image through by filename and display results

In [None]:
# Run inference on a single image by filename and show results
%cd $wd

#
# TO DO: Enter image filename to run inference on

# Run darknet and show bounding box coordinates
!./darknet detector test cfg/openimages.data cfg/yolov3-openimages.cfg yolov3-openimages.weights data/imgs/caterpillar_3.jpg

# Display detection results
imShow('predictions.jpg')

### Generate tags: Run inference on EOL images & save results for tagging
Use 20K EOL Angiosperm image bundles to get bounding boxes of detected pollinators. Results are saved to data/results/[subset].tsv.   
Run in 4 batches of 5K images to back up regularly in case of Colab timeouts.

In [None]:
# Run inference on 5k image subset using darknet
%cd $wd

# Create a symbolic link so that /content/gdrive/My\ Drive/ is equal to /mydrive
!ln -s /content/gdrive/My\ Drive/ /mydrive

# Filepath to image file list for inference
filepath = os.path.basename(inf_subset) 

# Run darknet with flag to not show bounding box coordinates
!./darknet detector test cfg/openimages.data cfg/yolov3-openimages.cfg yolov3-openimages.weights -dont_show -save_labels < {filepath}

## Post-process detection results
--- 
Combine output files for batches A-D. Then, convert detection boxes into plant-pollinator co-occurrence tags.

In [None]:
# Combine individual prediction files for each image to all_predictions.txt

# Delete image file list for inference
inf_subset = 'data/imgs/' + os.path.basename(inf_subset)
!rm $inf_subset

# Combine individual text files and image filenames into all_predictions.txt
fns = os.listdir('data/imgs')
with open('data/results/all_predictions.txt', 'w') as outfile:
  header = "class_id x y w h img_id"
  outfile.write(header + "\n")
  for fn in fns:
        if 'txt' in fn:
          with open('data/imgs/'+fn) as infile:
            lines = infile.readlines()
            newlines = [''.join([x.strip(), ' ' + os.path.splitext(fn)[0] + '\n']) for x in lines]
            outfile.writelines(newlines)

# Inspect saved predictions
df = pd.read_csv('data/results/all_predictions.txt')
print(df.head())

# Delete all individual prediction files
!rm -r data/imgs/*.txt

# Delete all image files now that they have been used for inference
!rm -r data/imgs/*

In [None]:
# Create final predictions dataframe with class names (instead of numbers) and image urls
# Name of the current batch (same basename as the image list used for inference)
fn = os.path.splitext(os.path.basename(inf_subset))[0]

# EOL media URL's of this batch: the list saved at download time and moved to data/img_info/
# (`bundle` is the address of a single sub-bundle, so it does not list all downloaded images)
# The list has no header row, so every line is read as data
df = pd.read_csv('data/img_info/' + fn + '.txt', sep='\t', header=None, names=['url'])
print("EOL media URL's corresponding to inference images: \n", df)

# Model predictions with number-coded classes
# img_id is kept as text so numeric-looking ids still match the ids taken from the urls
predict = pd.read_csv('data/results/all_predictions.txt', header=0, sep=" ", dtype={'img_id': str})
print("\nModel predictions by class id: \n", predict)

# Add class names to model predictions
# The names file has no header row: line k (counting from 0) names class id k.
# Reading it with header=None replaces the earlier "drop the first name, then
# subtract 1 from class_id" combination, which gave the same name for every
# class id except 0 (that one was left untranslated)
classnames = pd.read_table('data/openimages.names', header=None, names=['classname'])
#print("Verifying class names: \n", classnames)
tag_df = predict.copy()
di = classnames['classname'].to_dict()
# Ids without a matching name are kept as they are
tag_df['class_id'] = tag_df['class_id'].map(lambda c: di.get(c, c)).astype(str)
print("\nModel prediction classes translated from class id's: \n", tag_df)

# Add EOL media URL's to model predictions
map_urls = df.copy()
map_urls['img_id'] = map_urls['url'].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
mapped_tagdf = (tag_df.set_index('img_id')
                      .merge(map_urls.set_index('img_id'), left_index=True, right_index=True)
                      .reset_index(drop=False)
                      .drop_duplicates(ignore_index=True))
print("\nModel predictions with EOL media URL's: \n", mapped_tagdf.head())

# Save final tags to file
outpath = 'data/results/' + fn + '.tsv'
mapped_tagdf.to_csv(outpath, sep="\t", index=False)

#### Merge batch output files A-D

In [None]:
# Write header row of output tagging file
# TO DO: Enter any filename from 4 batches of tagging files
tags_file = "plant_poll_coocc_tags_d" #@param {type:"string"}
tags_fpath = "data/results/" + tags_file + ".tsv"

# Combine exported model predictions and confidence values for all batches
fpath =  os.path.splitext(tags_fpath)[0]
base = fpath.rsplit('_',1)[0] + '_'
exts = ['a.tsv', 'b.tsv', 'c.tsv', 'd.tsv']
all_filenames = [base + e for e in exts]
df1 = pd.concat([pd.read_csv(f, sep='\t', header=0, na_filter = False) for f in all_filenames], ignore_index=True)

# Filter for desired classes
# TO DO: Enter a list of pollinator classes to filter by
pollinator_classes = ['Butterfly', 'Insect', 'Beetle', 'Ant', 'Bat (Animal)', 'Bird', 'Bee', 'Invertebrate', 'Animal'] #@param
# Match whole class names: substring matching would also tag classes such as
# 'Beer' or 'Antelope', and the parentheses in 'Bat (Animal)' are regex syntax
# The mask is computed once from the original names, before any tag is overwritten
is_pollinator = df1['class_id'].isin(pollinator_classes)
print("No. tags matching filtered classes: \n", is_pollinator.sum())
print("\nTags matching filtered classes: \n", df1.class_id[is_pollinator])
print("\nNo. not tags matching filtered classes: \n", (~is_pollinator).sum())
print("\nTags not matching filtered classes: \n", df1[~is_pollinator])

# Replace class names by the final tag
df = df1.copy()
df.loc[is_pollinator, 'class_id'] = 'Pollinator'
df.loc[~is_pollinator, 'class_id'] = 'None'

# Write results to tsv
outfpath = base + 'finaltags.tsv'
df.to_csv(outfpath, sep='\t', index=False)
print("\n\nFinal output tagging file {}: \n{}".format(outfpath, df.head()))

## Display tagging results on images
---

In [None]:
# TO DO: Do you want to use the tagging file exported above?
use_outfpath = "no" #@param ["yes", "no"]
# If no, choose other path to use
otherpath = "data/results/plant_poll_coocc_tags_finaltags.tsv" #@param {type:"string"}
# The path exported above is kept only when "yes" is selected
if use_outfpath != "yes":
    outfpath = otherpath
df = pd.read_csv(outfpath, sep="\t", header=0)
print("File for tag export {}: \n{}".format(outfpath, df.head()))

In [None]:
# Display tags on images

# TO DO: Adjust line below to see up to 50 images displayed at a time
start = 0 #@param {type:"slider", min:0, max:5000, step:50}
stop = start+50

# Tagging files written by this notebook store the image address in 'url';
# use that column when the file has no 'eolMediaURL' column
url_col = 'eolMediaURL' if 'eolMediaURL' in df.columns else 'url'

# Loop through images
for i, row in df.iloc[start:stop].iterrows():
    # Read in image
    url = row[url_col]
    img = url_to_image(url)

    # Fetch image tag
    tag = row['class_id']

    # Plot image
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(img)

    # Display image URL and tag above image
    # Helps with fine-tuning data transforms in post-processing steps above
    ax.set_title('{}) {} \n Tag: {}'.format(i+1, url, tag))

    # Show each figure as it is made and release it, so up to 50 figures are not kept open
    plt.show()
    plt.close(fig)