<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/classify_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run images through image rating classification pipeline
---
*Last Updated 12 December 2022*  
-Runs in Python 3 with Tensorflow 2.0-   

Use trained image classification model to add tags for image quality rating (bad, good) to EOL images.

Models were trained in [rating_train.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_train.ipynb). Confidence threshold for the best trained model was selected in [inspect_train_results.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/inspect_train_results.ipynb). 

We observed controversy among users assigning ratings to "good" images, and consensus for assigning ratings to "bad" images (users were more conflicted on what they like than on what they don't like). Model behavior matched this observation. In post-processing, keep only "bad" image quality predictions (model accuracy was high for this class) when confidence > 1.5. "Good" image quality predictions are discarded (model accuracy was low for this class). 

Finally, display tagging results on images to verify behavior is as expected.

***Models were trained in Python 2 and TF 1 in December 2020: MobileNet SSD v2 (Run 18, trained on 'good' and 'bad' classes) was trained for 12 hours to 10 epochs with Batch Size=16, Lr=0.001, Dropout=0.2.***

Notes:     
* Run code blocks by pressing play button in brackets on left
* Before you start: change the runtime to "GPU" with "High RAM"
* Change parameters using form fields on right (find details at corresponding lines of code by searching '#@param')

## Installs & Imports
---

In [None]:
#@title Choose where to save results
import os

# Use dropdown menu on right
save = "in Colab runtime (files deleted after each session)" #@param ["in my Google Drive", "in Colab runtime (files deleted after each session)"]

# Google Drive is only mounted when it was picked in the dropdown above,
# so that image tagging file(s) can be exported to it
wants_drive = 'Google Drive' in save
if wants_drive:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

# Type in the path to your project wd in form field on right
basewd = "/content/drive/MyDrive/train" #@param ["/content/drive/MyDrive/train"] {allow-input: true}

# Image tags are written to a 'results' folder inside the base wd;
# create it on first run
cwd = ''.join([basewd, '/results/'])
results_dir_exists = os.path.exists(cwd)
if not results_dir_exists:
    os.makedirs(cwd)

print("Saving results {} to {}".format(save, cwd))

In [None]:
#@title Import libraries

# For downloading and displaying images
from PIL import Image
import cv2
import imageio
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# For working with data
import numpy as np
import pandas as pd
from os import path
import csv
import itertools
from scipy.linalg import norm
from scipy import sum, average
# So URL's don't get truncated in display
pd.set_option('display.max_colwidth',1000)
pd.options.display.max_columns = None

# For measuring inference time
import time

# For image classification
import tensorflow as tf
print('\nTensorflow Version: %s' % tf.__version__)

# Set number of seconds to timeout if image url taking too long to open
import socket
socket.setdefaulttimeout(10)

In [None]:
#@title Choose saved model parameters (if using EOL model, defaults are already selected)

# Use EOL pre-trained model for image rating classification? (untick to use your own trained model)
use_EOL_model = True #@param {type: "boolean"}

# Get info about trained classification model
# Returns a 4-tuple: (module_selection, dataset_labels, saved_models_dir, TRAIN_SESS_NUM)
#   module_selection - (model handle name, input image size in pixels)
#   dataset_labels   - class names, indexed by the model's predicted label number
#   saved_models_dir - folder that holds one sub-folder per training session
#   TRAIN_SESS_NUM   - name of the training-session sub-folder to load
# Side effect: when use_EOL_model is True and saved_models_dir does not exist yet,
# the pre-trained model is downloaded and unpacked there (IPython/Colab magics are used,
# so this function only runs inside a notebook)
def get_model_info(use_EOL_model):
    # Use EOL pre-trained model
    if use_EOL_model:
        # Model metadata
        module_selection =('mobilenet_v2_1.0_224', 224)
        dataset_labels = ["bad","good"]
        TRAIN_SESS_NUM = '18'
        # basewd is the project folder set in the "Choose where to save results" cell
        saved_models_dir = basewd + '/saved_models/'
        # If running for the first time, download model
        # NOTE(review): only the existence of saved_models_dir is checked, so if that folder
        # already exists without the '18' sub-folder inside it, nothing is downloaded
        if not os.path.exists(saved_models_dir):
            # Make folder for trained model
            os.makedirs(saved_models_dir)
            %cd $saved_models_dir
            # Relative path: created inside saved_models_dir because of the %cd above
            os.makedirs(TRAIN_SESS_NUM)
            # Download saved model files for Run 18 - MobileNet SSD v2
            # NOTE(review): the gdown `--id` flag has been deprecated upstream - confirm it still
            # works with the version installed by `--upgrade`
            !pip3 install --upgrade gdown
            !gdown --id 1L-WqfuoQtPgqJzU8tDKjgsZC98M-68w9 # 18.zip 404 Mb
            # The archive unpacks into a nested content/drive/... tree; move the model files
            # up into ./18 and remove the leftovers
            !unzip 18.zip -d .
            !mv -v content/drive/MyDrive/summer20/classification/rating/saved_models/18/* 18
            !rm -r content
            !rm -r 18.zip
            %cd ../
            print("\nSuccessfully downloaded pre-trained EOL model to: \n", (saved_models_dir + TRAIN_SESS_NUM))
    
    # Use your own trained model
    # (this condition is simply the complement of the `if` above)
    elif not use_EOL_model:
        # Change values to match your trained model
        module_selection = ("mobilenet_v2_1.0_224", 224) #@param ["(\"mobilenet_v2_1.0_224\", 224)", "(\"inception_v3\", 299)"] {type:"raw", allow-input: true}
        dataset_labels = ["bad", "good"] #@param ["[\"bad\", \"good\"]"] {type:"raw", allow-input: true}
        saved_models_dir = "train/saved_models/" #@param ["train/saved_models/"] {allow-input: true}
        TRAIN_SESS_NUM = "18" #@param ["18"] {allow-input: true}

    return module_selection, dataset_labels, saved_models_dir, TRAIN_SESS_NUM

# Load saved model
# NOTE: load_saved_model() is defined in the "Define functions" cell further down in this
# notebook; unless it is already defined in the session, run that cell first or the call
# below raises NameError
# pixels (model input size) and handle_base (model name) are read as globals by later cells
module_selection, dataset_labels, saved_models_dir, TRAIN_SESS_NUM = get_model_info(use_EOL_model)
model, pixels, handle_base = load_saved_model(saved_models_dir, TRAIN_SESS_NUM, module_selection)

## Generate tags: Run inference on EOL images & save results for tagging - Run 4X for batches A-D
---
Use 20K EOL image bundle to classify image quality rating as bad or good. Results are saved to [tags_file].tsv. Run this section 4 times (to make batches A-D) of 5K images each to incrementally save in case of Colab timeouts.

### Prepare classification functions and settings

In [None]:
#@title Define functions

# To read in EOL formatted data files
def read_datafile(fpath, sep="\t", header=0, disp_head=True):
    """Read a delimited EOL data file (local path or URL) into a DataFrame.

    Args:
        fpath: Path or URL of the file, passed to ``pd.read_csv``.
        sep: Field delimiter (tab by default).
        header: Row number to use as column names.
        disp_head: If True, print the first rows of the loaded data.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        Exception: If ``fpath`` cannot be found (chained to the original
            ``FileNotFoundError``).
    """
    # An optional module-level `hdr` dict is forwarded to pandas as
    # storage_options. It is looked up defensively because nothing in this
    # notebook is guaranteed to define it; None is the pandas default.
    storage_options = globals().get("hdr")
    try:
        df = pd.read_csv(fpath, sep=sep, header=header,
                         storage_options=storage_options)
    except FileNotFoundError as e:
        raise Exception("File not found: Enter the path to your file in form field and re-run") from e

    if disp_head:
        print("Data header: \n", df.head())

    return df

# Define start and stop indices in EOL bundle for running inference
def set_start_stop(df):
    """Choose the [start, stop) row range of the bundle to run inference on.

    Reads two module-level settings: ``run`` (the run-mode form field) and
    ``outfpath`` (the results file path, whose ``_a.`` .. ``_d.`` suffix
    selects the batch).

    Args:
        df: The EOL image bundle; only its length is used, and only in
            "tiny subset" mode.

    Returns:
        Tuple ``(start, stop)`` of row positions.

    Raises:
        ValueError: If not in "tiny subset" mode and ``outfpath`` carries none
            of the batch suffixes (previously this surfaced as an opaque
            UnboundLocalError).
    """
    # To test with a tiny subset, use 5 consecutive images from a random start
    if "tiny subset" in run:
        start = np.random.choice(a=len(df), size=1)[0]
        return start, start + 5

    # To run inference on 4 batches of 5k images each
    batch_bounds = {
        "_a.": (0, 5000),       # batch a is from 0-5000
        "_b.": (5000, 10000),   # batch b is from 5000-10000
        "_c.": (10000, 15000),  # batch c is from 10000-15000
        "_d.": (15000, 20000),  # batch d is from 15000-20000
    }
    # Checked in a-d order, same precedence as the original if/elif chain
    for suffix, bounds in batch_bounds.items():
        if suffix in outfpath:
            return bounds

    raise ValueError(
        "Cannot determine batch from results file '{}': name must end in "
        "_a, _b, _c or _d (or choose 'test with tiny subset')".format(outfpath))

# Load in image from URL
# Modified from https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/saved_model.ipynb#scrollTo=JhVecdzJTsKE
def image_from_url(url, fn):
    """Download an image and prepare it for classification and for display.

    Args:
        url: Address of the image to download.
        fn: Base filename for the local copy of the download.

    Returns:
        Tuple ``(image, disp_img)``: ``image`` is the picture resized to
        ``pixels`` x ``pixels`` (module-level model input size), with a
        leading batch axis and MobileNetV2 preprocessing applied; ``disp_img``
        is the image as loaded from disk without resizing, for display.
    """
    import hashlib  # local import: only needed to build the cache filename

    # get_file() reuses an existing cached file with the same name instead of
    # downloading again. Callers name files by row number ('0.jpg', ...), which
    # repeats across batches/bundles, so the URL is hashed into the cache name
    # to guarantee that each URL maps to its own file.
    url_hash = hashlib.md5(str(url).encode('utf-8')).hexdigest()
    head, tail = os.path.split(fn)
    cache_name = os.path.join(head, '{}_{}'.format(url_hash, tail))
    f = tf.keras.utils.get_file(cache_name, url)

    # For display
    disp_img = tf.keras.preprocessing.image.load_img(f)

    # Formatted for classification
    image = tf.keras.preprocessing.image.load_img(f, target_size=[pixels, pixels])
    image = tf.keras.preprocessing.image.img_to_array(image)
    image = tf.keras.applications.mobilenet_v2.preprocess_input(
        image[tf.newaxis,...])

    return image, disp_img

# Load saved model from directory
def load_saved_model(saved_models_dir, TRAIN_SESS_NUM, module_selection):
    """Load a trained Keras model and unpack its model-type metadata.

    Args:
        saved_models_dir: Folder holding the saved models (concatenated
            directly with ``TRAIN_SESS_NUM``, so it should end in '/').
        TRAIN_SESS_NUM: Name of the training-session sub-folder to load.
        module_selection: Pair ``(handle_base, pixels)`` giving the model
            type name and its input image size.

    Returns:
        Tuple ``(model, pixels, handle_base)``.
    """
    # The model lives in <saved_models_dir><TRAIN_SESS_NUM>
    model = tf.keras.models.load_model(saved_models_dir + TRAIN_SESS_NUM)

    # Name and image size for the model type
    (handle_base, pixels) = module_selection

    return model, pixels, handle_base

# Get info from predictions to display on images
def get_predict_info(predictions, url, i, stop, start):
    """Extract the winning class from model output and print progress.

    Args:
        predictions: Model output; its first element holds the per-class
            scores for one image.
        url: Image address, used only in the progress message.
        i: Zero-based position of the image in the current run.
        stop: End of the row range being processed.
        start: Beginning of the row range being processed.

    Returns:
        Tuple ``(label_num, conf, im_class)``: index of the highest score,
        that score, and the matching name from the module-level
        ``dataset_labels``.
    """
    scores = predictions[0]
    label_num = np.argmax(scores, axis=-1)
    conf = scores[label_num]
    im_class = dataset_labels[label_num]

    # Display progress message after each image
    n_done = i + 1
    n_total = format(stop - start, '.0f')
    print("Completed for {}, {} of {} files".format(url, n_done, n_total))

    return label_num, conf, im_class

# Set filename for saving classification results
def set_outpath(tags_file):
    """Build the results file path for a tagging file and announce it.

    Args:
        tags_file: Name of the tagging file, without extension.

    Returns:
        ``<basewd>/results/<tags_file>.tsv``, where ``basewd`` is the
        module-level project folder.
    """
    pieces = [basewd, '/results/', tags_file, '.tsv']
    outpath = ''.join(pieces)
    print("\nSaving results to: \n", outpath)

    return outpath

# Export results
def export_results(df, url, det_imclass, conf, idx=None):
    """Append one image's classification result as a row of the results file.

    The row is written to the module-level ``outfpath`` with columns
    url, identifier, dataObjectVersionID, ancestry, class, confidence.

    Args:
        df: EOL bundle holding 'identifier' and 'dataObjectVersionID'
            columns, and optionally 'ancestry'.
        url: Address of the classified image.
        det_imclass: Predicted class name.
        conf: Confidence value of the prediction.
        idx: Index label of the image's row in ``df``. Defaults to the
            module-level loop variable ``i`` (the original behavior), so
            existing callers keep working.
    """
    if idx is None:
        # Backward compatible: fall back to the inference loop's global index
        idx = i

    # Define variables for export
    ancestry = df['ancestry'][idx] if 'ancestry' in df.columns else "NA"
    identifier = df['identifier'][idx]
    dataObjectVersionID = df['dataObjectVersionID'][idx]

    # Write row with results for each image
    results = [url, identifier, dataObjectVersionID, ancestry,
               det_imclass, conf]
    # newline='' lets the csv module control line endings itself
    with open(outfpath, 'a', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(results)

In [None]:
#@title Enter EOL image bundle and choose inference settings. Change **tags_file** for each batch A-D
%cd $cwd

# Load in EOL image bundle
bundle = "https://editors.eol.org/other_files/bundle_images/files/images_for_Squamata_20K_breakdown_000001.txt" #@param ["https://editors.eol.org/other_files/bundle_images/files/images_for_Squamata_20K_breakdown_000001.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Coleoptera_20K_breakdown_000001.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Anura_20K_breakdown_000001.txt", "https://editors.eol.org/other_files/bundle_images/files/images_for_Carnivora_20K_breakdown_000001.txt"] {allow-input: true}
df = read_datafile(bundle, sep='\t', header=0, disp_head=False)

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]

# Take 5k subset of bundle for running inference
# Change filename for each batch
tags_file = "rating_tags_tf2_c" #@param ["rating_tags_tf2_a", "rating_tags_tf2_b", "rating_tags_tf2_c", "rating_tags_tf2_d"] {allow-input: true}
outfpath = set_outpath(tags_file)

# Write header row of tagging file
if not os.path.isfile(outfpath): 
    with open(outfpath, 'a') as out_file:
              tsv_writer = csv.writer(out_file, delimiter='\t')
              tsv_writer.writerow(["eolMediaURL", "identifier", \
                                   "dataObjectVersionID", "ancestry", \
                                   "imclass", "confidence"])

### Add tags to images

In [None]:
#@title Run inference 
start, stop = set_start_stop(df)

# Rows selected for this batch (the slice is clipped if the bundle is shorter than `stop`)
batch = df.iloc[start:stop]
n_images = len(batch)

# `n` counts images processed in this run (for progress messages).
# `i` is the row's index label in the bundle: it must be used for all lookups
# so that batches b-d and the random tiny subset read their own rows instead of
# always re-reading rows 0..N. It is deliberately named `i` at module level
# because export_results() reads that global to find the row to export.
for n, (i, row) in enumerate(batch.iterrows()):
    # Bound before the try so the error message below can always report it
    url = row['eolMediaURL']
    try:
        # Read in image from url
        fn = str(i) + '.jpg'
        img, disp_img = image_from_url(url, fn)

        # Image classification
        start_time = time.time() # Record inference time
        predictions = model.predict(img, batch_size=1)
        label_num, conf, det_imclass = get_predict_info(predictions, url, n, start + n_images, start)
        end_time = time.time()
        print("Inference time: {} sec".format(format(end_time-start_time, '.2f')))

        # Export tagging results to the tags file
        export_results(df, url, det_imclass, conf)

    # Best-effort: skip images that fail, but let KeyboardInterrupt stop the run
    except Exception as err:
        print('Check if URL from {} is valid'.format(url))
        print('  ({}: {})'.format(type(err).__name__, err))

print("\n\n~~~\nInference complete! Run these steps for remaining batches A-D before proceeding.\n~~~")

## Post-process classification results
---
MobileNet SSD v2 confidence threshold (>1.5) for all 'bad' predictions was chosen in inspect_train_results.ipynb to minimize false detections and maximize dataset coverage. All 'good' predictions and any 'bad' predictions below the confidence threshold are discarded.

In [None]:
#@title Use chosen confidence threshold (or EOL default)

# Adjust confidence threshold parameter
conf_thresh = 1.5 #@param ["1.5"] {type:"raw", allow-input: true}

# Combine tagging files for batches A-D
fpath =  os.path.splitext(tags_file)[0] # Get name of one tag file
base = cwd + fpath.rsplit('_',1)[0] + '_' # Remove lettered suffix to get basename
exts = ['a.tsv', 'b.tsv', 'c.tsv', 'd.tsv']
all_filenames = [base + e for e in exts] # List all tag filenames

# Only combine batches that have actually been generated, so this cell also
# works after a test run that produced a single batch file
found_filenames = [f for f in all_filenames if os.path.isfile(f)]
missing_filenames = [f for f in all_filenames if f not in found_filenames]
if not found_filenames:
    raise FileNotFoundError("No tagging files found. Expected one or more of: {}".format(all_filenames))
if missing_filenames:
    print("Warning: combining results without missing batch file(s): {}".format(missing_filenames))

# na_filter=False keeps literal 'NA' values (e.g. ancestry) as text instead of NaN
df = pd.concat([pd.read_csv(f, sep='\t', header=0, na_filter = False) for f in found_filenames], ignore_index=True)
df['confidence'] = pd.to_numeric(df['confidence'])

# Summarize combined results
print("Model predictions for Training Attempt {}, {}:".format(TRAIN_SESS_NUM, handle_base))
print("No. Images: {}\n{}".format(len(df), df[['eolMediaURL', 'imclass', 'confidence']].head()))

# Discard all predictions for 'good' or below confidence threshold
# (Final tag to keep -> predictions for 'bad' with confidence > conf_thresh)
idx_tokeep = df.index[(df.imclass == 'bad') & (df.confidence > conf_thresh)]
idx_todiscard = df.index.difference(idx_tokeep)
# Discarded rows stay in the file; only their tag is blanked to 'NA'
df.loc[idx_todiscard, 'imclass'] = 'NA'

# Write results to tsv
print("\nFinal tagging dataset after filtering predictions: \n", df[['eolMediaURL', 'imclass', 'confidence']].head())
outfpath = base + 'final.tsv'
print("\nSaving results to: \n", outfpath)
df.to_csv(outfpath, sep='\t', index=False)

## Display classification results on images
---

In [None]:
#@title Adjust start index and display up to 50 images with their tags

# Adjust start index using slider
start = 0 #@param {type:"slider", min:0, max:5000, step:50}
stop = min((start+50), len(df))

# Loop through the filtered tagging results and show each image with its tag
# `n` counts images within this page (for the progress message);
# `i` is the row's index label in the results table
for n, (i, row) in enumerate(df.iloc[start:stop].iterrows()):
    # Bound before the try so the error message below can always report it
    url = row['eolMediaURL']
    try:
        # Read in image from url
        fn = str(i) + '.jpg'
        img, disp_img = image_from_url(url, fn)

        # Get quality rating tag
        tag = row['imclass']

        # Display progress message after each image is loaded
        print('Successfully loaded {} of {} images'.format(n+1, (stop-start)))

        # Show classification results for images
        # Only use to view predictions on <50 images at a time
        _, ax = plt.subplots(figsize=(10, 10))
        ax.imshow(disp_img)
        plt.axis('off')
        plt.title("{}) Image quality rating: {} ".format(i+1, tag))

    # Best-effort: skip images that fail, but let KeyboardInterrupt stop the loop
    except Exception as err:
        print('Check if URL from {} is valid'.format(url))
        print('  ({}: {})'.format(type(err).__name__, err))