<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/classification_for_image_tagging/rating/rating_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-process Image Rating Classifier Training Images
---
*Last Updated 26 Dec 2020*   
Follow the steps below to download images listed in the EOL user generated image ratings file (image_ratings.txt) to Google Drive, sorted into the appropriate folder for each rating class, for use in training image rating classification models.     

**Notes**
* Change filepaths or information using the form fields to the right of code blocks (also noted in code with 'TO DO')

### Connect to Google Drive
---

In [None]:
# Connect Google Drive to the Colab runtime so files can be imported/exported
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

In [None]:
# Imports and Installs
import os
import pandas as pd
# aria2 provides the `aria2c` command-line downloader used in step 3 (Download images to Google Drive)
!apt-get install aria2

### 1) Inspect EOL User Generated Image Ratings File
---

In [None]:
# Load the tab-separated EOL user generated rating file and report how many ratings it holds
ratings_path = "/content/drive/My Drive/summer20/classification/rating/image_data/bundles/image_ratings.txt"
df = pd.read_csv(ratings_path, sep="\t", lineterminator='\n', encoding='latin1', header=0)
n_ratings = len(df)
print("Total number of EOL user generated image ratings: {}".format(n_ratings))

In [None]:
# Inspect Data

# Optional = Un-comment out if want to view individual URLs, increase output length so full URL visible
#pd.set_option('display.max_colwidth', 1000) # Print full urls for inspection

# See column names
print("Column Names:")
for col in df.columns:
    print(col)

# Split dataset by image ratings (rounded to nearest whole number).
# For each rating class 1-5: keep the columns of interest, write the image URLs
# (one per line, no header) to <class name>_download.txt, then print the class
# size and a preview of the first rows.
out_dir = "/content/drive/My Drive/summer20/classification/rating/image_data"
keep_cols = ["obj_with_overall_rating", "obj_url", "overall_rating", "ancestry"]
class_names = ["one", "two", "three", "four", "five"]
rounded_rating = round(df["overall_rating"])  # computed once, reused for every class

by_rating = {}
for rating, name in enumerate(class_names, start=1):
    subset = df.loc[rounded_rating == rating, keep_cols].copy()
    subset["obj_url"].to_csv("{}/{}_download.txt".format(out_dir, name), sep="\n", index=False, header=False)
    print(len(subset))
    print("Rating = {}:\n {}".format(rating, subset.head()))
    by_rating[name] = subset

# Keep the per-class dataframes available under their original names
one, two, three, four, five = (by_rating[name] for name in class_names)

### 2) Build 7k image bundles for rating classes 1-5
---

In [None]:
# Make 7k image bundles for rating classes 1-4 
# Rating class 5 only has 1200 images, so it is built differently in next code block

%cd drive/My Drive/summer20/classification/rating/images
import os
import pandas as pd

# Future image directories for training classifier
folders = ['1', '2', '3', '4']
nums = ['one', 'two', 'three', 'four']
all_filenames = [folder + '/' + num + '_download.txt' for folder, num in zip(folders, nums)] # Image rating filenames
filenames_7k = [folder + '/' + num + '_download_7k.txt' for folder, num in zip(folders, nums)] # Future 7K image bundle filenames
print(all_filenames)

# Randomly pick 7,000 images from each rating class and write to csv
for num, f in enumerate(all_filenames):
  df = pd.read_table(f, sep='\n')
  bundle = df.sample(7000)
  fn = str(filenames_7k[num])
  print(fn)
  print(bundle.head())
  #bundle.to_csv(fn, sep='\n')

In [None]:
# Make 7k bundle for Rating = 5 dataset
# Different because only 1200 images total

%cd drive/My Drive/summer20/classification/rating/images
import os
import pandas as pd

# Add images to Rating = 5 dataset from EOL User Exemplar File
# Read in Exemplar File
df = pd.read_csv("5/images_selected_as_exemplar.txt", sep="\t", lineterminator='\n', encoding='latin1', header=0)
# Include all duplicates from exemplar file (these ones may be better or more controversial, see email from jen 28 oct 2020)
idx = df.index[df.duplicated(['object_url'])].tolist()
dups = df.loc[idx]
dups = pd.DataFrame(dups["object_url"])
# Add 4k random images from exemplar file
unq = df.drop(idx, errors='ignore')
unq = unq.sample(4000)
unq = pd.DataFrame(unq["object_url"])
# Read in Rating = 5 images
df1 = pd.read_table('5/five_download.txt', sep='\n')
df1.columns = unq.columns
# Make 7k bundle from Exemplar duplicates & random images, and Rating = 5 images
new5 = pd.concat([df1,unq,dups], ignore_index=True)
print(new5)
print(len(new5))
#new5.to_csv('5/five_download_7k.txt', sep='\n', index=False, header=False)

### 3) Download images to Google Drive
---
Run all steps once per rating class 1-5. Where you see 'TO DO' (3 places), change number to match rating class each time you run 

In [None]:
# Download images (this will take a few hours)
%cd drive/My Drive/summer20/classification/rating/images
%cd 1 #TO DO: Change number for each rating class
!aria2c -x 16 -s 1 -i "one_download_7k.txt" #TO DO: Change number for each rating class

In [None]:
# Check how many images downloaded
print("Number of images downloaded to Google Drive: ")
!ls . | wc -l

In [None]:
# Move text file to image_data/bundles
%cd ../..
!mv images/1/*.txt image_data/bundles/ # TO DO: Change folder number for each rating class

### 4) Delete all downloaded non-image files
---
Run all steps once per rating class 1-5. Where you see 'TO DO' (1 place), change number to match rating class each time you run

In [None]:
from os import listdir
from PIL import Image
Image.MAX_IMAGE_PIXELS = 95000000 # To suppress errors from Pillow about decompression bombs
import io

#TO DO: Change each time you run to match image class
%cd /content/drive/My Drive/summer20/classification/rating/images/4

for path in listdir('./'):
  with open(path, 'rb') as f:
    try:
      img = Image.open(io.BytesIO(f.read()))
      img.verify() # verify that it is an image
    except (IOError, SyntaxError) as e:
      print('Bad file:', filename)
      if '(' in filename: # rm doesn't work for files with parenthesis in name, need to manually remove
        print("Manually remove from Google Drive: {}".format(filename)) 
      else:
        !rm $filename 

### 5) Aggregate classes into good (4 & 5) and bad (1 & 2) because models are not learning classes 1-5 with any hyperparameter combination
---

In [None]:
# Move text file to image_data/bundles
%cd drive/My Drive/summer20/classification/rating/images
# Make aggregated training images folder
#!mkdir agg

# Make aggregated 'bad' images folder (combined classes 1 and 2)
#!mkdir agg/bad
#!cp 1/* agg/bad/
#!cp 2/* agg/bad/
print("Number of images in new aggregated 'bad' folder: ")
!ls agg/bad | wc -l

# Make aggregated 'good' images folder (combined classes 4 and 5)
#!mkdir agg/good
#!cp 4/* agg/good/
#!cp 5/* agg/good/
print("Number of images in new aggregated 'good' folder: ")
!ls agg/good | wc -l