<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/classify_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run images through image rating classification pipeline
--- 
Classify images as "bad" or "good" quality.  
*Last Updated 26 January 2021* 

1) Run images through trained MobileNet SSD v2 model (Attempt 18) to add tags to images for image rating (bad, good) for predictions with confidence > 1.5. (Confidence value chosen in [inspect_train_results.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/inspect_train_results.ipynb)).

2) Discard "good" predictions (model accuracy was low for these) and keep only "bad" predictions (model accuracy was high for these) above the chosen confidence threshold (1.5).\*

3) Display tagging results on images to verify behavior is as expected.

**Notes**:     
\* *We observed controversy among users assigning ratings to "good" images, and general consensus for assigning ratings to "bad" images. Model behavior matches this observation.*

Change filepaths or information using the form fields to the right of code blocks (also noted in code with 'TO DO')

### Imports
---

In [None]:
# Mount google drive to import/export files
# (later cells read the saved model and write tagging results under /content/drive/My Drive/)
from google.colab import drive
# force_remount=True is passed so that re-running this cell mounts the drive again
drive.mount('/content/drive', force_remount=True)

In [None]:
# For working with data and plotting graphs
import itertools
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.linalg import norm
from scipy import sum, average
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# For image classification and training
import tensorflow as tf

# For working with images
!pip install pillow
!pip install scipy==1.1.0
import cv2
import scipy
from scipy import misc

### 1) Classification
----

#### Define functions & variables
---
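To run classification on batches of 5k images at a time, change the tag output file name using the form field to the right. The file name ends in a batch letter (a, b, c or d for the 4 batches from a 20k bundle) and can also include the taxon name. The start/end rows (a/b) for the batch are set in code from that batch letter.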

In [None]:
# For images to read in from bundle

# Load in image from URL
# Modified from https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/saved_model.ipynb#scrollTo=JhVecdzJTsKE
def image_from_url(url, fn):
  """Download an image and return it both model-ready and display-ready.

  Args:
    url: Address of the image to download.
    fn: File name used for the cached download (e.g. '12.jpg'). A short hash
      of the URL is inserted before the extension.

  Returns:
    A tuple (x, disp_img). x is the image loaded at 224x224, converted to an
    array, given a leading batch axis and passed through the MobileNet v2
    preprocess_input function. disp_img is the same file loaded without a
    target size, for display.
  """
  import hashlib
  import os
  # get_file reuses an already-cached file that has the same name instead of
  # downloading again, so a name built only from a row number ('12.jpg') can
  # return an image fetched earlier from a different URL. Make the cached name
  # depend on the URL as well.
  root, ext = os.path.splitext(fn)
  url_digest = hashlib.sha256(str(url).encode('utf-8')).hexdigest()[:12]
  cache_fn = '{}_{}{}'.format(root, url_digest, ext)
  file = tf.keras.utils.get_file(cache_fn, url)
  disp_img = tf.keras.preprocessing.image.load_img(file)
  img = tf.keras.preprocessing.image.load_img(file, target_size=[224, 224])
  x = tf.keras.preprocessing.image.img_to_array(img)
  x = tf.keras.applications.mobilenet_v2.preprocess_input(
    x[tf.newaxis,...])
  return x, disp_img

# Read in EOL image bundle dataframe
# The bundle is read as a tab-separated file whose first line is the header row.
# Later cells read its eolMediaURL, identifier, dataObjectVersionID and (when present) ancestry columns.
# TO DO: Type in image bundle address using form field to right
bundle = "https://editors.eol.org/other_files/bundle_images/files/images_for_Chiroptera_breakdown_000001.txt" #@param ["https://editors.eol.org/other_files/bundle_images/files/images_for_Angiosperms_20K_breakdown_000031.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Anura_20K_breakdown_000001.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Chiroptera_breakdown_000001.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Squamata_breakdown_000001.txt"] {allow-input: true}
df = pd.read_csv(bundle, sep='\t', header=0)
# Preview the first rows to check the bundle loaded as expected
print(df.head())

# For exporting tagging results
import csv

# Name and location of the output tagging file (the header row is written when inference is run)
# TO DO: Change file name for each bundle/run abcd if doing 4 batches using dropdown form to right
tags_file = "tags_rating_20k_d" #@param ["tags_rating_20k_a", "tags_rating_20k_b", "tags_rating_20k_c", "tags_rating_20k_d"] {allow-input: true}
tags_fpath = "/content/drive/My Drive/summer20/classification/rating/results/" + tags_file + ".tsv"

# Run in 4 batches of 5k images each (batch a is from 0-5000, b from 5000 to 10000, etc)
# The batch letter is the text after the last underscore of the file name.
# A substring test such as `"a" in tags_file` cannot be used here: it also
# matches the "a" in "tags" and "rating", which selects rows 0-5000 for every batch.
batch_size = 5000
batch_starts = {"a": 0, "b": 5000, "c": 10000, "d": 15000}
batch_letter = tags_file.rsplit("_", 1)[-1]
if batch_letter not in batch_starts:
  raise ValueError(
      'tags_file must end in _a, _b, _c or _d to select a batch, got "{}"'.format(tags_file))
# a = first row of the batch, b = one past its last row (used as df.iloc[a:b])
a = batch_starts[batch_letter]
b = a + batch_size

#### Run images through model for image rating classification
---  
Use model selected in inspect_train_results.ipynb (MobileNet SSD v2, Train attempt 18) to classify image quality as good or bad.

In [None]:
from PIL import Image
import time

# Set number of seconds to timeout if image url taking too long to open
import socket
socket.setdefaulttimeout(10)

# Write header row of tagging file
# Only done when the file is missing or empty: results are appended to this
# file, so writing the header on every run of this cell would put a second
# header row in the middle of the results.
if not os.path.exists(tags_fpath) or os.path.getsize(tags_fpath) == 0:
  with open(tags_fpath, 'a', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(["eolMediaURL", "identifier", \
                         "dataObjectVersionID", "ancestry", \
                         "tag_rating", "confidence"])

# Load trained model from path
TRAIN_SESS_NUM = "18"
saved_model_path = '/content/drive/My Drive/summer20/classification/rating/saved_models/' + TRAIN_SESS_NUM
model = tf.keras.models.load_model(saved_model_path)
# Class names in the order of the model's output columns
label_names = ['bad', 'good']
module_selection = ("mobilenet_v2_1.0_224", 224)
handle_base, pixels = module_selection
IMAGE_SIZE = (pixels, pixels)

# Loop through EOL image bundle to classify images and generate tags
batch = df.iloc[a:b]
for n, i in enumerate(batch.index, start=1):
  # Defined before the try block so the error message below can always print it
  url = None
  try:
    # Get url from image bundle
    url = df.at[i, 'eolMediaURL']
    # Read in image from url
    fn = str(i) + '.jpg'
    img, disp_img = image_from_url(url, fn)
    # Record inference time
    start_time = time.time()
    # Classify the image and keep the highest-scoring class and its score
    predictions = model.predict(img, batch_size=1)
    label_num = np.argmax(predictions)
    conf = predictions[0][label_num]
    imclass = label_names[label_num]
    end_time = time.time()
    # Display progress message after each image
    # (n counts images within this batch, so it runs from 1 to the batch length)
    print('Inference complete for Row {} of {} images in {} sec'.format(n, len(batch), \
                                            format(end_time-start_time, '.2f')))

    # Export tagging results to tsv
    # Define variables for export
    identifier = df.at[i, 'identifier']
    dataObjectVersionID = df.at[i, 'dataObjectVersionID']
    if 'ancestry' in df.columns:
      ancestry = df.at[i, 'ancestry']
    else:
      ancestry = "NA"
    # Append one row per image so results already written survive an interrupted run
    with open(tags_fpath, 'a', newline='') as out_file:
      tsv_writer = csv.writer(out_file, delimiter='\t')
      tsv_writer.writerow([url, identifier, dataObjectVersionID, ancestry, \
                               imclass, conf])
  except Exception as err:
    # Skip this image and continue with the next one. Catching Exception
    # (not a bare except) lets a manual interrupt stop the loop.
    print('Check if URL from {} is valid'.format(url))
    print('  ({}: {})'.format(type(err).__name__, err))

### 2) Post-process classification predictions using confidence threshold values
---
MobileNet SSD v2 confidence threshold (>1.5) for all 'bad' predictions was chosen in inspect_train_results.ipynb to minimize false detections.

In [None]:
# TO DO: Input and adjust classification confidence thresholds
conf_thresh = 1.5 #@param 

# Combine exported model predictions and confidence values from above to one dataframe
# The four batch files share the name of tags_fpath up to its last underscore
fpath =  os.path.splitext(tags_fpath)[0]
base = fpath.rsplit('_',1)[0] + '_'
exts = ['a.tsv', 'b.tsv', 'c.tsv', 'd.tsv']
all_filenames = [base + e for e in exts]
df = pd.concat([pd.read_csv(f, sep='\t', header=0, na_filter = False) for f in all_filenames], ignore_index=True)
# A results file that was appended to over several runs can contain the header
# row more than once; drop those rows so the confidence column is numeric
df = df[df['confidence'].astype(str) != 'confidence'].reset_index(drop=True)
df['confidence'] = pd.to_numeric(df['confidence'])

# Adjust final tag so that only 'bad' predictions > conf_thresh keep their tag; all other rows become 'NA'
# (a single .loc assignment is used because chained assignment df[col][i] = ... may not modify df)
keep_bad = (df['tag_rating'] == 'bad') & (df['confidence'] > conf_thresh)
df.loc[~keep_bad, 'tag_rating'] = 'NA'

# Write results to tsv
print(df.head())
outfpath = base + 'finaltags.tsv'
df.to_csv(outfpath, sep='\t', index=False)

### 3) Display final classification results on images
---

In [None]:
# Set number of seconds to timeout if image url taking too long to open
import socket
socket.setdefaulttimeout(10)

# TO DO: Update file path to finaltags.tsv file
path = "/content/drive/My Drive/summer20/classification/rating/results/"
f = "tags_rating_20k_finaltags.tsv" #@param
fpath = path + f
# na_filter = False turns off missing-value detection, so 'NA' entries in the file are read as the text 'NA'
df = pd.read_csv(fpath, sep='\t', header=0, na_filter = False)

# Function to load in image from URL
# Modified from https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/saved_model.ipynb#scrollTo=JhVecdzJTsKE
def image_from_url(url, fn):
  """Download an image and return it both model-ready and display-ready.

  Args:
    url: Address of the image to download.
    fn: File name used for the cached download (e.g. '12.jpg'). A short hash
      of the URL is inserted before the extension.

  Returns:
    A tuple (x, disp_img). x is the image loaded at 224x224, converted to an
    array, given a leading batch axis and passed through the MobileNet v2
    preprocess_input function. disp_img is the same file loaded without a
    target size, for display.
  """
  import hashlib
  import os
  # get_file reuses an already-cached file that has the same name instead of
  # downloading again, so a name built only from a row number ('12.jpg') can
  # return an image fetched earlier from a different URL. Make the cached name
  # depend on the URL as well.
  root, ext = os.path.splitext(fn)
  url_digest = hashlib.sha256(str(url).encode('utf-8')).hexdigest()[:12]
  cache_fn = '{}_{}{}'.format(root, url_digest, ext)
  file = tf.keras.utils.get_file(cache_fn, url)
  disp_img = tf.keras.preprocessing.image.load_img(file)
  img = tf.keras.preprocessing.image.load_img(file, target_size=[224, 224])
  x = tf.keras.preprocessing.image.img_to_array(img)
  x = tf.keras.applications.mobilenet_v2.preprocess_input(
    x[tf.newaxis,...])
  return x, disp_img

# TO DO: Set start and end rows of the final tags file to display using form field to right
start =  0#@param {type:"raw"}
end = 50 #@param {type:"raw"}

# Loop through the final tags file to load each image and show its tag
shown = df.iloc[start:end]
for n, i in enumerate(shown.index, start=1):
  # Defined before the try block so the error message below can always print it
  url = None
  try:
    # Get url from final tags file
    url = df.at[i, 'eolMediaURL']
    # Read in image from url
    fn = str(i) + '.jpg'
    img, disp_img = image_from_url(url, fn)
    # Get the final tag for this image
    tag = df.at[i, 'tag_rating']
    # Display progress message after each image is loaded
    # (n counts images within the displayed range, so it is correct for any start row)
    print('Successfully loaded {} of {} images'.format(n, len(shown)))

    # Show classification results for images
    # Only use to view predictions on <50 images at a time
    _, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(disp_img)
    ax.axis('off')
    # The title shows the 1-based row number in the final tags file
    ax.set_title("{}) Image quality rating: {} ".format(i+1, tag))

  except Exception as err:
    # Skip this image and continue with the next one. Catching Exception
    # (not a bare except) lets a manual interrupt stop the loop.
    print('Check if URL from {} is valid'.format(url))
    print('  ({}: {})'.format(type(err).__name__, err))