<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Image Rating Classifier Training Images
---
*Last Updated 20 Oct 2021*   
Follow the steps below to make training and testing datasets using the [EOL user generated image ratings file](https://editors.eol.org/other_files/EOL_v2_files/image_ratings.txt.zip). 7K images per rating class (1 - 5) are downloaded to Google Drive for use in training models in [rating_train.ipynb](https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_train.ipynb).     

## Installs & Imports
---

In [None]:
# Mount Google Drive at /content/drive to import/export files
# (force_remount=True re-mounts even if the drive is already mounted in this runtime)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Install libraries for working with datasets
import os
import pandas as pd

# For downloading images
!apt-get install aria2

# Define functions

# To read in EOL formatted data files
def read_datafile(fpath, sep="\t", header=0, disp_head=True, lineterminator='\n', encoding='latin1'):
    """
    Load a delimited text file into a pandas DataFrame.

    Defaults to tab-separated data files with header in row 0, lines ending
    in a newline and latin1 encoding.

    Parameters:
        fpath: path to the data file
        sep, header, lineterminator, encoding: passed straight to pd.read_csv
        disp_head: if True, print the first rows of the loaded data

    Returns:
        The loaded DataFrame

    Raises:
        Exception: if no file exists at fpath (carries the original traceback)
    """
    reader_options = dict(sep=sep, header=header, lineterminator=lineterminator, encoding=encoding)
    try:
        df = pd.read_csv(fpath, **reader_options)
    except FileNotFoundError as e:
        raise Exception("File not found: Enter the path to your file in form field and re-run").with_traceback(e.__traceback__)

    # Optional preview of what was loaded
    if disp_head:
        print("Data header: \n", df.head())

    return df

# Silence pandas' SettingWithCopy warning about writing to a copy of data
# (None disables the check; the pandas default is 'warn')
pd.set_option('mode.chained_assignment', None)

# Filter by rating of interest
def filter_by_rating(df, filter=None, disp_head=False):
    """
    Return the image URLs whose rounded overall rating equals the requested class.

    Parameters:
        df: DataFrame with a numeric "overall_rating" column and an "obj_url" column
        filter: rating class to keep; anything int() accepts (e.g. '3' or 3). Required.
        disp_head: if True, also print the first rows of the filtered URLs

    Returns:
        pandas Series (a copy) holding "obj_url" for rows where
        round(overall_rating) == int(filter)

    Raises:
        TypeError: if filter is not supplied
    """
    # The previous default bound whatever `filter` meant at definition time (the
    # builtin, or a leftover loop variable on re-run); require an explicit value instead
    if filter is None:
        raise TypeError("filter_by_rating() requires a rating class to filter by, e.g. filter='3'")

    wanted = int(filter)
    rating = df.loc[df["overall_rating"].round() == wanted, "obj_url"].copy()

    if disp_head:
        print("Rating = {}:\n {}".format(filter, rating.head()))
    print("\n Number of available ratings for training/testing class {}: \n {}".format(filter, len(rating)))

    return rating

### 1) Inspect EOL User Generated Image Ratings File
---

In [None]:
# Download EOL user generated rating data

# TO DO: Type in the path to your working directory in form field to right
wd = "/content/drive/MyDrive/train" #@param {type:"string"}

# Download EOL user generated rating file to temporary runtime location
!wget --user-agent="Mozilla" https://editors.eol.org/other_files/EOL_v2_files/image_ratings.txt.zip

# Unzip cropping file to your working directory
!unzip /content/image_ratings.txt.zip -d $wd

# Change to your training directory within Google Drive and move to pre-processing/
%cd $wd
!mkdir pre-processing
!mv image_ratings.txt pre-processing/

In [None]:
# Filter EOL user ratings by class (1-bad to 5-good) and export one URL list per class

# Print full urls for inspection (raises the column width so URLs are not truncated)
pd.set_option('display.max_colwidth', 1000)

# Read in user-generated image rating file (path is relative to the working directory)
fpath = 'pre-processing/' + 'image_ratings.txt'
df = read_datafile(fpath)

# Make train and test datasets for each rating class
# List of rating classes to filter by
filters = ['1', '2', '3', '4', '5']

# Loop variables are named rating_class / class_dir so the builtins filter() and dir()
# are not overwritten for the rest of the notebook
for rating_class in filters:
    # Filter by rating of interest
    filtered = filter_by_rating(df, rating_class, disp_head=False)

    # Make folder for rating class (exist_ok so the cell can be re-run)
    class_dir = 'pre-processing/images/' + rating_class
    os.makedirs(class_dir, exist_ok=True)

    # Export filtered dataset as txt to folder of interest
    outfpath = class_dir + '/' + rating_class + '_download.txt'
    filtered.to_csv(outfpath, sep='\n', index=False, header=False)

### 2) Build 7k image bundles for rating classes 1-5
---

In [None]:
# Make 7k image bundles for rating classes 1-4 
# Rating class 5 only has 1200 images, so it is built differently in next code block
%cd pre-processing/images

# Future image directories for training classifier
ratings = filters[:4]
all_filenames = [rating + '/' + rating + '_download.txt' for rating in ratings] # Image rating filenames
filenames_7k = [rating + '/' + rating + '_download_7k.txt' for rating in ratings] # Future 7K image bundle filenames

# Randomly pick 7,000 images from each rating class and write to csv
for num, filename in enumerate(all_filenames):
    df = pd.read_table(filename, sep='\n')
    bundle = df.sample(7000)
    fn = str(filenames_7k[num])
    print("7k image bundle filename for rating {}: {}\n{}\n".format((num+1), fn, bundle.head()))
    bundle.to_csv(fn, sep='\n', index=False, header=False)

In [None]:
# Make 7k bundle for Rating = 5 dataset
# Different because only 1200 images total

# Add images to Rating = 5 dataset from EOL User Exemplar File
# Read in Exemplar File
# Download EOL user generated rating file to temporary runtime location
!wget --user-agent="Mozilla" https://editors.eol.org/other_files/EOL_v2_files/images_selected_as_exemplar.txt.zip

# Unzip cropping file to your working directory
!unzip images_selected_as_exemplar.txt.zip -d ./5

# Read in user-generated image exemplars file
fpath = '5/' + 'images_selected_as_exemplar.txt'
df = read_datafile(fpath)

# Include all duplicates from exemplar file 
# (these ones may be better or more controversial, see email from JH 28 Oct 2020)
idx = df.index[df.duplicated(['object_url'])].tolist()
dups = df.loc[idx]
dups = pd.DataFrame(dups["object_url"])
# Add 4k random images from exemplar file
unq = df.drop(idx, errors='ignore')
unq = unq.sample(4000)
unq = pd.DataFrame(unq["object_url"])
# Read in Rating = 5 images
df1 = pd.read_table('5/5_download.txt', sep='\n')
df1.columns = unq.columns
# Make combined 7k bundle from Exemplar duplicates & random images, + Rating = 5 images
comb = pd.concat([df1,unq,dups], ignore_index=True)
print("Rating = 5:\n {}".format(comb.head()))
print("\n Number of available ratings for training/testing class 5: \n {}".format(len(comb)))
comb.to_csv('5/5_download_7k.txt', sep='\n', index=False, header=False)

### 3) Download images to Google Drive
---
Run all steps once per rating class 1-5. Where you see 'TO DO' (3 places), change the number to match the rating class each time you run the cells.

In [None]:
# Optional: Test downloads with a small subset first?

# TO DO: If yes, check test_with_tiny_subset box
test_with_tiny_subset = True #@param {type: "boolean"}

cwd = wd + 'pre-processing/images'
%cd $cwd

# Test downloads with tiny subset
if test_with_tiny_subset:
    filenames_tiny = []
    # Make tiny subsets with only 5 images per class
    for fn in filenames_7k:
        df = pd.read_table(fn, sep='\n')
        df1 = df.head().copy()
        fn1 = os.path.splitext(fn)[0] + '_tinysubset.txt'
        filenames_tiny.append(fn1)
        df1.to_csv(fn1, sep='\n', index=False, header=False)

    # Download images
    # Loop through image data files to download images into their respective folders
    for num, fn in enumerate(filenames_tiny, start=1):
        # Download images
        cwd = wd + '/pre-processing/images/' + str(num)
        %cd $cwd
        fn = os.path.basename(fn)
        !aria2c -x 16 -s 1 -i $fn

    # Move text file to image_data/bundles
    %cd ../..
    fpath = 'images/' + str(num) + '/*.txt'
    !mv $fpath image_data

In [None]:
# Run for all images
# Make a folder to store image data files after downloading images
cwd = wd + '/pre-processing'
%cd $cwd
!mkdir image_data

# Loop through image data files for each rating class to download images into their respective folders
for num, fn in enumerate(filenames_7k, start=1):
    # Download images (this will take ~2 hours / class)
    cwd = wd + '/pre-processing/images/' + str(num)
    %cd $cwd
    fn = os.path.basename(fn)
    !aria2c -x 16 -s 1 -i $fn
    
    # Check how many images downloaded
    print("Number of images downloaded to Google Drive: ")
    !ls . | wc -l

    # Move text file to image_data/bundles
    %cd ../..
    fpath = 'images/' + str(num) + '/*.txt'
    !mv $fpath image_data

### 4) Delete all downloaded non-image files
---


In [None]:
# Find and delete all downloaded non-image files
from os import listdir
from PIL import Image
Image.MAX_IMAGE_PIXELS = 95000000 # To suppress errors from Pillow about decompression bombs
import io

# Loop through image rating class folders 1 - 5
for num in range(1,6):
    cwd = wd + '/pre-processing/images/' + str(num)
    %cd $cwd
    # Inspect each file in folder
    for path in listdir('./'):
        with open(path, 'rb') as f:
            # Verify that file is an image 
            try:
                if '.html' not in path: # hacky fix to catch htmls
                    img = Image.open(io.BytesIO(f.read()))
                    img.verify() 
                else:
                    raise NameError
            # If file isn't an image, delete it
            except (IOError, SyntaxError, NameError) as e:
                print('Bad file:', path)
                if '(' in path: # rm doesn't work for files with parenthesis in name, need to manually remove
                    print("Manually remove from Google Drive: {}".format(path)) 
                else:
                    !rm $path 

### 5) Aggregate classes into good (4 & 5) and bad (1 & 2) because models did not learn classes 1-5 with any hyperparameter combinations
---
*Afternote: Users were more conflicted on what makes an image "good" than what makes it "bad." Because models learn patterns from the training data, this resulted in high accuracy for predicting "bad" images (classes 1 & 2), but mixed accuracy for predicting "ok" or "good" (classes 4 & 5)*   
Models were retrained using aggregated "bad" and "good" classes with improved success.

In [None]:
# Move text file to image_data/bundles
cwd = wd + '/pre-processing/images/'
%cd $cwd

# Make aggregated 'bad' images folder (combined classes 1 and 2)
!mkdir -p agg/bad
!cp 1/* agg/bad/
!cp 2/* agg/bad/
print("Number of images in new aggregated 'bad' folder: ")
!ls agg/bad | wc -l

# Make aggregated 'good' images folder (combined classes 4 and 5)
!mkdir -p agg/good 
!cp 4/* agg/good/
!cp 5/* agg/good/
print("Number of images in new aggregated 'good' folder: ")
!ls agg/good | wc -l