<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Image Rating Classifier Training Images
---
*Last Updated 4 Oct 2022*   
Follow the steps below to make training and testing datasets using the [EOL user generated image ratings file](https://editors.eol.org/other_files/EOL_v2_files/image_ratings.txt.zip). 7K images per rating class (1 - 5) are downloaded to Google Drive for use in training models in [rating_train.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_train.ipynb).     

Notes:   
* Run code blocks by pressing play button in brackets on left
* Before you start: change the runtime to "GPU" with "High RAM"
* Change parameters using form fields on right (find details at corresponding lines of code by searching '#@param')

## Installs & Imports
---

In [None]:
#@title Choose where to save results & set up directory structure
import os

# Pick the save location with the dropdown menu on the right
save = "in Colab runtime (files deleted after each session)" #@param ["in my Google Drive", "in Colab runtime (files deleted after each session)"]
print("Saving results ", save)

# Google Drive is mounted only when it was chosen as the save location
if 'Google Drive' in save:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

# Base working directory: type your own path into the form field on the right
basewd = "/content/drive/MyDrive/train/tf2" #@param ["/content/drive/MyDrive/train/tf2"] {allow-input: true}
if not os.path.exists(basewd):
    os.makedirs(basewd)

# Image classes of interest (rating values), editable in the form field
filters = ["1", "2", "3", "4", "5"] #@param ["[\"1\", \"2\", \"3\", \"4\", \"5\"]"] {type:"raw", allow-input: true}

# Sub-folder of the base directory that holds all pre-processing results
preprocessing_folder = "pre-processing" #@param ["pre-processing"] {allow-input: true}
cwd = "/".join((basewd, preprocessing_folder))
print("\nWorking directory set to: \n", cwd)

# Sub-folder for image metadata (download lists, etc.)
data_folder = "image_data" #@param ["image_data"] {allow-input: true}
data_wd = "/".join((cwd, data_folder))
if not os.path.exists(data_wd):
    os.makedirs(data_wd)
print("\nImage metadata directory set to: \n", data_wd)

# Sub-folder for the downloaded train/test images
train_folder = "images" #@param ["images"] {allow-input: true}
train_wd = "/".join((cwd, train_folder))
if not os.path.exists(train_wd):
    os.makedirs(train_wd)
print("\nTraining images directory set to: \n", train_wd)

In [None]:
# For importing/exporting files, working with arrays, etc
import os
from os import listdir
import pandas as pd
import numpy as np

# For working with images
from PIL import Image
Image.MAX_IMAGE_PIXELS = 95000000 # To suppress errors from Pillow about decompression bombs
import io

# For downloading images
!apt-get install aria2

# Set number of seconds to timeout if image url taking too long to open
import socket
socket.setdefaulttimeout(10)

# Define functions

# URL of the zipped EOL user generated image ratings file (editable in form field)
bundle = "https://editors.eol.org/other_files/EOL_v2_files/image_ratings.txt.zip" #@param ["https://editors.eol.org/other_files/EOL_v2_files/image_ratings.txt.zip"] {allow-input: true}

# Read in data file exported from "Combine output files A-D" block above
def read_datafile(fpath, sep="\t", header=0, disp_head=True):
    """Read a delimited text file (local path or http(s) URL) into a dataframe.

    Args:
        fpath: Path or URL of the file to read.
        sep: Column delimiter passed to pandas (tab by default).
        header: Row number to use as column names, passed to pandas.
        disp_head: If True, print the first rows of the loaded dataframe.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        FileNotFoundError: If fpath does not exist; the message asks the user
            to correct the path in the form field.
    """
    # Browser-like request headers used when the file is fetched over http(s)
    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
        }
    # pandas rejects storage_options for plain local paths, so the headers are
    # only handed over when fpath is actually an http(s) URL
    is_url = str(fpath).lower().startswith(("http://", "https://"))
    try:
        df = pd.read_csv(fpath, sep=sep, header=header,
                         storage_options=hdr if is_url else None)
    except FileNotFoundError as e:
        raise FileNotFoundError("File not found: Enter the path to your file in form field and re-run") from e

    if disp_head:
        print("Data header: \n", df.head())

    return df

# Suppress pandas warning about writing over a copy of data
pd.options.mode.chained_assignment = None  # default='warn'

# Optional = Increase column display width so full URLs are visible when inspecting
# dataframes (comment this line out if you do not need to view individual URLs)
pd.set_option('display.max_colwidth', 1000) # Print full urls for inspection

# Filter by rating of interest
def filter_by_rating(df, filter=filter, disp_head=False):
    """Return the image URLs whose rounded overall rating equals one class.

    Args:
        df: Dataframe with an "overall_rating" column and an "obj_url" column.
        filter: Rating class to keep; converted with int(). Always pass it
            explicitly: the default is whatever `filter` is bound to when this
            function is defined, which is not a usable rating.
        disp_head: If True, also print the first URLs of the selected class.

    Returns:
        A copy of the "obj_url" values of the rows whose rating rounds to
        the requested class.
    """
    wanted = int(filter)
    rating = df.loc[df["overall_rating"].round() == wanted, "obj_url"].copy()

    if disp_head:
        # The original format string had an unmatched '}' and raised ValueError
        print("Rating = {}:\n {}".format(filter, rating.head()))
    print("\n Number of available ratings for training/testing class {}: \n {}".format(filter, len(rating)))

    return rating

# Define start and stop indices in EOL bundle for running inference   
def set_start_stop(run):
    """Pick the slice bounds used to subset an image bundle.

    Args:
        run: Run mode text from the form field.

    Returns:
        (start, stop): a random 5-row window when run mentions "tiny subset",
        otherwise (None, None) so that slicing keeps every row.
    """
    # Full run: no bounds, every image in the bundle is used
    if "tiny subset" not in run:
        return None, None

    # Tiny test run: 5 consecutive rows starting at a random offset below 1000
    start = np.random.choice(a=1000, size=1)[0]
    return start, start + 5

## Inspect EOL User Generated Image Ratings File
---

In [None]:
#@title Filter EOL image quality ratings by class (1-bad to 5-good)

# Download EOL user generated rating file to temporary runtime location
!wget --user-agent="Mozilla" $bundle

# Unzip cropping file to your working directory
!unzip /content/image_ratings.txt.zip -d $cwd

# Read in user-generated image rating file
%cd $cwd
fpath = os.path.splitext(os.path.basename(bundle))[0]
df = pd.read_csv(fpath, sep='\t', header=0, lineterminator='\n', encoding='latin1')

# Make train and test datasets for each rating class 1-5
for filter in filters:
    # Filter by rating of interest 
    filtered = filter_by_rating(df, filter, disp_head=False)

    # Make folder for rating class
    dir = 'images/' + filter
    os.makedirs(dir)

    # Export filtered dataset as txt to folder of interest
    outfpath = dir + '/' + filter + '_download.txt'
    filtered.to_csv(outfpath, sep='\n', index=False, header=False)

## Build 7k image bundles for rating classes 1-5
---

In [None]:
# Make 7k image bundles for rating classes 1-4 
# Rating class 5 only has 1200 images, so it is built differently in next code block
%cd $train_wd

# Future image directories for training classifier
ratings = filters[:4]
all_filenames = [rating + '/' + rating + '_download.txt' for rating in ratings] # Image rating filenames
filenames_7k = [rating + '/' + rating + '_download_7k.txt' for rating in ratings] # Future 7K image bundle filenames

# Randomly pick 7,000 images from each rating class and write to csv
for num, filename in enumerate(all_filenames):
    df = pd.read_table(filename, sep='\n')
    bundle = df.sample(7000)
    fn = str(filenames_7k[num])
    print("7k image bundle filename for rating {}: {}\n{}\n".format((num+1), fn, bundle.head()))
    bundle.to_csv(fn, sep='\n', index=False, header=False)

In [None]:
# Make 7k bundle for Rating = 5 dataset
# Different because Rating=5 had only 1200 images total

# Add images to Rating = 5 dataset from EOL User Exemplar File
# Download EOL user generated exemplar file to temporary runtime location
!wget --user-agent="Mozilla" https://editors.eol.org/other_files/EOL_v2_files/images_selected_as_exemplar.txt.zip

# Unzip cropping file to your working directory
!unzip images_selected_as_exemplar.txt.zip -d ./5

# Read in user-generated image exemplars file
fpath = '5/' + 'images_selected_as_exemplar.txt'
df = pd.read_csv(fpath, sep='\t', header=0, lineterminator='\n', encoding='latin1')

# Include all duplicates from exemplar file 
# (these ones may be better or more controversial, see email from JH 28 Oct 2020)
idx = df.index[df.duplicated(['object_url'])].tolist()
dups = df.loc[idx]
dups = pd.DataFrame(dups["object_url"])

# Add 4k random images from exemplar file
unq = df.drop(idx, errors='ignore')
unq = unq.sample(4000)
unq = pd.DataFrame(unq["object_url"])

# Read in Rating = 5 images
df1 = pd.read_table('5/5_download.txt', sep='\n')
df1.columns = unq.columns

# Make combined 7k bundle from Exemplar duplicates & random images, + Rating = 5 images
comb = pd.concat([df1,unq,dups], ignore_index=True)
print("Rating = 5:\n {}".format(comb.head()))
print("\n Number of available ratings for training/testing class 5: \n {}".format(len(comb)))
comb.to_csv('5/5_download_7k.txt', sep='\n', index=False, header=False)
filenames_7k.append('5/5_download_7k.txt')

## Download images to Google Drive
---

In [None]:
#@title Download images for each class
%cd $train_wd

# Test pipeline with a smaller subset than 5k images?
run = "test with tiny subset" #@param ["test with tiny subset", "for all images"]
print("Run: ", run)

# Download images, augment them, and save to Google Drive
print("\nDownloading training images for each class")
start, stop = set_start_stop(run)

# Loop through image data files for each rating class to download images into their respective folders
for imclass, fn in enumerate(filenames_7k, start=1):

    # CWD to image class folder
    impath = train_wd + '/' + str(imclass) + '/'
    %cd $impath

    # Take tiny subset or all images from bundle
    fn = os.path.basename(fn)
    df = pd.read_csv(fn, sep='\n', header=None)
    df = df.iloc[start:stop]
    df.to_csv(fn, sep='\n', header=False, index=False)

    # Download images
    !aria2c -x 16 -s 1 -i $fn

    # Check how many images downloaded
    print("Number of images downloaded to Google Drive: ")
    !ls . | wc -l

    # Move text file to image_data/bundles
    %cd $cwd
    impath = impath + "*.txt"
    !mv $impath image_data/

## Delete all downloaded non-image files
---


In [None]:
# Loop through image rating class folders 1 - 5
for imclass in range(1,6):
    # CWD to image class folder
    impath = train_wd + '/' + str(imclass) + '/'
    %cd $impath
    # Inspect each file in folder
    for path in listdir('./'):
        with open(path, 'rb') as f:
            # Verify that file is an image 
            try:
                if '.html' not in path: # hacky fix to catch htmls
                    img = Image.open(io.BytesIO(f.read()))
                    img.verify() 
                else:
                    raise NameError
            # If file isn't an image, delete it
            except (IOError, SyntaxError, NameError) as e:
                print('Bad file:', path)
                if '(' in path: # rm doesn't work for files with parenthesis in name, need to manually remove
                    print("Manually remove from Google Drive: {}".format(path)) 
                else:
                    !rm $path 

## Aggregate classes into good (4 & 5) and bad (1 & 2) because models did not learn classes 1-5 with any hyperparameter combinations
---
*Afternote: Users were more conflicted on what makes an image "good" than what makes it "bad." Because models learn patterns from the training data, this resulted in high accuracy for predicting "bad" images (classes 1 & 2), but mixed accuracy for predicting "ok" or "good" (classes 4 & 5)*   
Models were retrained using aggregated "bad" and "good" classes with improved success.

In [None]:
# Work from the training images directory that holds the rating class folders
%cd $train_wd

# Make aggregated 'bad' images folder (combined classes 1 and 2)
!mkdir -p agg/bad
!cp 1/* agg/bad/
!cp 2/* agg/bad/
print("Number of images in new aggregated 'bad' folder: ")
!ls agg/bad | wc -l

# Make aggregated 'good' images folder (combined classes 4 and 5)
!mkdir -p agg/good 
!cp 4/* agg/good/
!cp 5/* agg/good/
print("Number of images in new aggregated 'good' folder: ")
!ls agg/good | wc -l