In [3]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from enum import Enum
from shutil import copyfile
from scipy.misc import imresize
from skimage.exposure import adjust_gamma
%matplotlib inline

In [4]:
# Camera models as an enum: each member name (with '_' swapped for '-') is a
# camera directory name, and each value is the lowercase label that appears in
# the flickr file list.
CameraLabel = Enum('CameraLabel', (
    ('HTC_1_M7', 'htc_m7'),
    ('LG_Nexus_5x', 'nexus_5x'),
    ('Motorola_Droid_Maxx', 'moto_maxx'),
    ('Motorola_Nexus_6', 'nexus_6'),
    ('Motorola_X', 'moto_x'),
    ('Samsung_Galaxy_Note3', 'samsung_note3'),
    ('Samsung_Galaxy_S4', 'samsung_s4'),
    ('Sony_NEX_7', 'sony_nex7'),
    ('iPhone_4s', 'iphone_4s'),
    ('iPhone_6', 'iphone_6'),
))

In [8]:
# Root of output directories
target_dir = 'data-512-with-flickr'
# Output directories for each manipulation.
# generate_data() looks entries 0-8 up by position, so the order here matters.
# The last entry is joined onto target_dir by the directory-creation cell, where
# the leading '..' makes it resolve to data/flickr-validation (same path as
# validation_dir below); generate_data() never writes to it.
output_dir = ['train_cropped',
              'train_compressed_90',
              'train_compressed_70',
              'train_resize_5',
              'train_resize_8',
              'train_resize_15',
              'train_resize_20',
              'train_gamma_8',
              'train_gamma_12', 
              '../data/flickr-validation']

# Source of images to be cropped & manipulated
train_dir = 'data/train'
flickr_train_dir = 'data/flickr_train'

# Validation Dir
validation_dir = 'data/flickr-validation'
# Raw flickr images, including ones we don't want
flickr_src = 'flickr_images'

# Sample image path, relative to a camera-directory root.
# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
test_img = 'Motorola-X/(MotoX)100.jpg'

## Create the Directory Structure
Our raw data exists under `train_dir` and will exist under `flickr_train_dir` as well. As we load and manipulate the images, we want the output directories to exist, so let's create them now if they don't already exist.

In [9]:
# Build the output tree: <target_dir>/<manipulation>/<camera>.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so that a missing
# parent (e.g. `data/` for the '../data/flickr-validation' entry) is created
# rather than raising FileNotFoundError. Paths are normalised before creation
# because os.makedirs does not handle '..' components reliably; the printed
# paths are left un-normalised so the log matches the configured names.
if not os.path.isdir(target_dir):
    print('Root target directory does not exist. Creating: ', target_dir)
    os.makedirs(os.path.normpath(target_dir), exist_ok=True)
# Generate Directories for each of the manipulations (including simple cropping)
for directory in output_dir:
    target = os.path.join(target_dir, directory)
    if not os.path.isdir(target):
        print('Creating directory: ', target)
        os.makedirs(os.path.normpath(target), exist_ok=True)
    for label in CameraLabel:
        # Substitution to match with existing data dir names
        camera_dir = os.path.join(target, label.name.replace('_', '-'))
        if not os.path.isdir(camera_dir):
            print('Creating camera dir: ', camera_dir)
            os.makedirs(os.path.normpath(camera_dir), exist_ok=True)

Creating camera dir:  data-512-with-flickr/../data/flickr-validation/HTC-1-M7
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/LG-Nexus-5x
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Motorola-Droid-Maxx
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Motorola-Nexus-6
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Motorola-X
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Samsung-Galaxy-Note3
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Samsung-Galaxy-S4
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/Sony-NEX-7
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/iPhone-4s
Creating camera dir:  data-512-with-flickr/../data/flickr-validation/iPhone-6


## Get Good Images From Flickr Data
We have a few thousand photos in the `flickr_src` directory, but only a subset of them match the requirements we have for our data (uncompressed, the right shape, etc). Lucky for us, the `flickr_src` directory has a file with paths for all of the files we want called `good_jpgs`. We'll simply read the file line by line, see if the file exists in `flickr_train_dir` and copy it over if not. 

A full copy should move **5377** files.

In [4]:
# Read one relative path per line from the `good_jpgs` listing.
# Plain line iteration replaces np.loadtxt(..., delimiter="\n") because newer
# NumPy releases reject a newline delimiter, a one-line file would come back as
# a 0-d array that cannot be iterated, and loadtxt cuts each line at '#'.
# The result is still a 1-D NumPy array of str, with blank lines skipped.
with open(os.path.join(flickr_src, 'good_jpgs')) as good_jpgs_file:
    file_paths = np.array(
        [line.strip() for line in good_jpgs_file if line.strip()],
        dtype=str)

In [48]:
# Copy every listed flickr image into flickr_train_dir/<Camera-Dir>/, skipping
# files that are already there so the cell can be re-run safely.
files_copied = 0
# np.atleast_1d guards against a 0-d array (a listing with a single line).
for file_path in np.atleast_1d(file_paths):
    # The last two components are the camera label and the file name,
    # e.g. 'iphone_6', '2556...a_0.jpg'
    camera, filename = str(file_path).split('/')[-2:]
    camera_path = CameraLabel(camera).name.replace('_', '-')
    # NOTE(review): the source is looked up under the dash-separated directory
    # name (e.g. 'iPhone-6'), not the lowercase label from good_jpgs -- confirm
    # that flickr_src is laid out that way.
    source_file = os.path.join(flickr_src, camera_path, filename)
    target_file = os.path.join(flickr_train_dir, camera_path, filename)
    if not os.path.isfile(target_file):
        # copyfile does not create directories, so make sure the camera folder exists.
        os.makedirs(os.path.dirname(target_file), exist_ok=True)
        copyfile(source_file, target_file)
        # Count only after the copy succeeded.
        files_copied += 1

print('Copied', files_copied, "file(s)")

Copied 0 file(s)


## Manipulation Time!
The process I'll use is to load a raw image, apply the manipulation, and then center-crop it to be 512x512. Finally, I'll save the manipulated files to the directories defined in `output_dir`

In [5]:
def crop(img, crop_dim=512):
    """Return the centred ``crop_dim`` x ``crop_dim`` region of ``img``.

    Args:
        img: Array whose first two axes are (height, width). Any trailing
            axes (e.g. colour channels) are kept unchanged.
        crop_dim: Side length of the square crop, in pixels.

    Returns:
        A slice of ``img`` with shape ``(crop_dim, crop_dim, ...)``. When
        ``img`` is smaller than ``crop_dim`` along an axis, the full extent of
        that axis is returned instead, so callers can detect the problem by
        checking the result's shape.
    """
    # Only the first two axes are read, so 2-D (grayscale) arrays work too.
    height, width = img.shape[:2]
    edge = crop_dim // 2
    # Clamp at 0: a negative start index would wrap around to the far edge and
    # produce an arbitrary thin strip for images smaller than the crop.
    top = max(height // 2 - edge, 0)
    left = max(width // 2 - edge, 0)
    # End at start + crop_dim (not centre + edge) so that an odd crop_dim still
    # yields exactly crop_dim pixels. For even crop_dim both forms are identical.
    return img[top:top + crop_dim, left:left + crop_dim]

def write_image(path, image, flags=None, force=False, verbose=False):
    """Write ``image`` to ``path`` with ``cv2.imwrite`` unless the file already exists.

    Args:
        path: Destination file path.
        image: Image array handed to ``cv2.imwrite``.
        flags: Optional encoder parameters forwarded to ``cv2.imwrite``.
        force: When true, write even if ``path`` already exists.
        verbose: When true, report that an existing file was left untouched.
    """
    already_there = os.path.isfile(path)
    if already_there and not force:
        # Existing file and no override requested: leave it alone.
        if verbose:
            print('File not written')
        return

    write_args = (path, image) if flags is None else (path, image, flags)
    cv2.imwrite(*write_args)

In [6]:
def generate_data(source_dir, target_dir='data-512-with-flickr', crop_dimension=512, file_format='.jpg', verbose=False):
    """Write every manipulated, centre-cropped variant of each image under ``source_dir``.

    ``source_dir`` must contain one sub-directory per camera. Each image is
    written to ``<target_dir>/<manipulation>/<camera>/<stem><file_format>`` for
    the nine manipulations below. Manipulation directory names are taken by
    position from the module-level ``output_dir`` list (entries 0-8). Existing
    target files are never overwritten, and the target directories must
    already exist.

    Manipulations: plain crop, crop + JPEG quality 90, crop + JPEG quality 70,
    bicubic resize by 0.5 / 0.8 / 1.5 / 2.0 then crop, gamma 0.8 / 1.2 then crop.

    Args:
        source_dir: Directory holding one folder of images per camera.
        target_dir: Root of the manipulation output directories.
        crop_dimension: Side length of the square centre crop, in pixels.
        file_format: Extension (with leading dot) for the written files.
        verbose: When true, print one line per source file.

    An image is skipped entirely when it cannot be read or when its plain
    centre crop does not have the requested size. A single manipulation is
    skipped when its own result does not have the requested size (e.g. a
    0.5x resize of an image narrower than twice the crop).
    """
    expected_shape = (crop_dimension, crop_dimension)

    def resized(scale):
        # NOTE(review): scipy.misc.imresize is deprecated (see the warning in this
        # notebook's output). It is kept so the resampling stays identical to data
        # that has already been generated -- plan a migration separately.
        return lambda image: imresize(image, scale, interp='bicubic')

    def gamma_adjusted(gamma):
        return lambda image: adjust_gamma(image, gamma=gamma)

    jpeg_quality = int(cv2.IMWRITE_JPEG_QUALITY)
    # (output sub-directory, transform applied to the full image before cropping,
    #  cv2.imwrite parameters). A transform of None means "use the plain crop".
    manipulations = [
        (output_dir[0], None, None),
        (output_dir[1], None, [jpeg_quality, 90]),
        (output_dir[2], None, [jpeg_quality, 70]),
        (output_dir[3], resized(0.5), None),
        (output_dir[4], resized(0.8), None),
        (output_dir[5], resized(1.5), None),
        (output_dir[6], resized(2.0), None),
        (output_dir[7], gamma_adjusted(0.8), None),
        (output_dir[8], gamma_adjusted(1.2), None),
    ]

    for camera_dir in os.listdir(source_dir):
        full_camera_path = os.path.join(source_dir, camera_dir)  # e.g. data/train/HTC-1-M7
        if not os.path.isdir(full_camera_path):
            # Stray files next to the camera folders would make os.listdir fail.
            continue
        print('Parsing files in ', full_camera_path)
        for filename in os.listdir(full_camera_path):
            # splitext instead of filename[:-4] so extensions of any length work.
            stem = os.path.splitext(filename)[0]
            if verbose:
                print('Processing file:', filename, '. Writing to: ', os.path.join(target_dir, '...', stem + file_format))  # yeah this could be 2750 or 5377 lines of output

            # Only the directories this function writes to are checked, so a fully
            # processed image is skipped without being decoded again.
            pending = []
            for subdir, transform, flags in manipulations:
                target_file = os.path.join(target_dir, subdir, camera_dir, stem + file_format)
                if not os.path.isfile(target_file):
                    pending.append((target_file, transform, flags))
            if not pending:
                continue

            source_file = os.path.join(full_camera_path, filename)
            img = cv2.imread(source_file)
            if img is None:
                # cv2.imread signals an unreadable or non-image file by returning None.
                print('Could not read image: ', source_file)
                continue

            # Compression & Crop is the only time we'll crop and then save
            cropped = crop(img, crop_dimension)
            if cropped.shape[:2] != expected_shape:
                # Either dimension being wrong makes the sample unusable.
                print('Invalid shape! Image: ', source_file)
                continue

            for target_file, transform, flags in pending:
                if transform is None:
                    result = cropped
                else:
                    result = crop(transform(img), crop_dimension)
                    if result.shape[:2] != expected_shape:
                        print('Invalid shape after manipulation! Skipping: ', target_file)
                        continue

                if flags is None:
                    written = cv2.imwrite(target_file, result)
                else:
                    written = cv2.imwrite(target_file, result, flags)
                if not written:
                    # cv2.imwrite reports failure through its return value.
                    print('Failed to write: ', target_file)


In [19]:
# Generate every manipulated variant for the original training images under
# train_dir, using the function's default output root and crop size.
generate_data(train_dir, verbose=False)

Parsing files in  data/train/LG-Nexus-5x
Parsing files in  data/train/Motorola-Droid-Maxx
Parsing files in  data/train/iPhone-4s
Parsing files in  data/train/iPhone-6
Parsing files in  data/train/Sony-NEX-7
Parsing files in  data/train/Motorola-Nexus-6
Parsing files in  data/train/HTC-1-M7
Parsing files in  data/train/Samsung-Galaxy-Note3
Parsing files in  data/train/Motorola-X
Parsing files in  data/train/Samsung-Galaxy-S4


In [7]:
# Same generation pass for the flickr images copied into flickr_train_dir;
# output goes to the same default root as the run on train_dir.
generate_data(flickr_train_dir, verbose=False)

Parsing files in  data/flickr_train/iPhone-4s
Parsing files in  data/flickr_train/Motorola-Nexus-6
Parsing files in  data/flickr_train/Samsung-Galaxy-S4
Parsing files in  data/flickr_train/Motorola-Droid-Maxx
Parsing files in  data/flickr_train/iPhone-6
Parsing files in  data/flickr_train/LG-Nexus-5x
Parsing files in  data/flickr_train/Motorola-X
Parsing files in  data/flickr_train/HTC-1-M7
Parsing files in  data/flickr_train/Samsung-Galaxy-Note3
Parsing files in  data/flickr_train/Sony-NEX-7


`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.
`imresize` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``skimage.transform.resize`` instead.


# Data Fix
Previously, I ran the above code without a check for the dimensions of the cropped image, resulting in some images with shapes like (69, 6, 3). To find these, I did the following in bash in my data directory:  
`exiftool -q -r -ext jpg -if '$ImageHeight != 512 || $ImageWidth != 512' -p '$Directory/$FileName' ./*`  
I then removed each of the listed files

# Create Validation Data
It's great that I have training data, but I'll need validation data as well for convenience. I'll generate the validation data in the following way:  
**Target Number of Samples:** 20%  
First, create a directory structure that mirrors this structure:  
- data-512-with-flickr
- - train_cropped
- - - HTC-1-M7
- - - ...
- - train_compressed_70
- - - HTC-1-M7
- - - ...
- - train_resize_5
- - ...  

Next, step through `data-512-with-flickr` and for each camera in each manipulation, do the following:
- Get a list of filenames in the directory
- Shuffle the list of filenames
- Select a subset of the list (e.g. `selected_files = filenames[:int(len(filenames) * 0.2)]`)
- For each file, copy it to the appropriate target directory


In [17]:

# Names of the entries directly under the output root.
# NOTE(review): `source_dirs` is not referenced by any later visible cell -- confirm it is still needed.
source_dirs = os.listdir(target_dir)  # I know, looks weird, but remember we defined target_dir above as 'data-512-with-flickr'

In [21]:
# Display the validation directory path as a quick check before copying files into it
validation_dir

'data/flickr-validation'

In [30]:
# Copy a random 20% of every <manipulation>/<camera> folder into the validation
# directory, prefixing each file name with its manipulation.
VALIDATION_FRACTION = 0.2  # share of each folder copied to validation
VALIDATION_SEED = 42       # fixed seed so a re-run selects the same files
validation_rng = np.random.RandomState(VALIDATION_SEED)

# NOTE(review): files are copied, not moved, so every validation image also
# remains in the training folders -- confirm the training step accounts for that.
source_dir = target_dir
# Listings are sorted because os.listdir order is arbitrary; together with the
# seeded generator this makes the selection reproducible.
for manip_dir in sorted(os.listdir(source_dir)):
    manip_path = os.path.join(source_dir, manip_dir)
    if not os.path.isdir(manip_path):
        continue  # ignore stray files in the output root
    for camera_dir in sorted(os.listdir(manip_path)):
        source_path = os.path.join(manip_path, camera_dir)
        if not os.path.isdir(source_path):
            continue
        output_path = os.path.join(validation_dir, camera_dir)  # No manipulation dir in validation
        # copyfile does not create directories.
        os.makedirs(output_path, exist_ok=True)
        filenames = sorted(os.listdir(source_path))
        validation_rng.shuffle(filenames)
        number_of_samples = int(len(filenames) * VALIDATION_FRACTION)
        print('Working on ', source_path, 'with', len(filenames), 'files. Creating', number_of_samples, 'files', 'in ', output_path)
        for filename in filenames[:number_of_samples]:
            copyfile(
                os.path.join(source_path, filename),
                os.path.join(output_path, manip_dir + '-' + filename))

Working on  data-512-with-flickr/train_compressed_70/LG-Nexus-5x with 680 files. Creating 136 files in  data/flickr-validation/LG-Nexus-5x
Working on  data-512-with-flickr/train_compressed_70/Motorola-Nexus-6 with 926 files. Creating 185 files in  data/flickr-validation/Motorola-Nexus-6
Working on  data-512-with-flickr/train_compressed_70/iPhone-6 with 823 files. Creating 164 files in  data/flickr-validation/iPhone-6
Working on  data-512-with-flickr/train_compressed_70/Motorola-Droid-Maxx with 825 files. Creating 165 files in  data/flickr-validation/Motorola-Droid-Maxx
Working on  data-512-with-flickr/train_compressed_70/HTC-1-M7 with 1023 files. Creating 204 files in  data/flickr-validation/HTC-1-M7
Working on  data-512-with-flickr/train_compressed_70/Sony-NEX-7 with 832 files. Creating 166 files in  data/flickr-validation/Sony-NEX-7
Working on  data-512-with-flickr/train_compressed_70/iPhone-4s with 774 files. Creating 154 files in  data/flickr-validation/iPhone-4s
Working on  data-5

Working on  data-512-with-flickr/train_resize_5/Sony-NEX-7 with 832 files. Creating 166 files in  data/flickr-validation/Sony-NEX-7
Working on  data-512-with-flickr/train_gamma_12/Samsung-Galaxy-S4 with 1412 files. Creating 282 files in  data/flickr-validation/Samsung-Galaxy-S4
Working on  data-512-with-flickr/train_gamma_12/Motorola-X with 275 files. Creating 55 files in  data/flickr-validation/Motorola-X
Working on  data-512-with-flickr/train_gamma_12/Samsung-Galaxy-Note3 with 549 files. Creating 109 files in  data/flickr-validation/Samsung-Galaxy-Note3
Working on  data-512-with-flickr/train_gamma_12/Motorola-Nexus-6 with 926 files. Creating 185 files in  data/flickr-validation/Motorola-Nexus-6
Working on  data-512-with-flickr/train_gamma_12/Motorola-Droid-Maxx with 825 files. Creating 165 files in  data/flickr-validation/Motorola-Droid-Maxx
Working on  data-512-with-flickr/train_gamma_12/Sony-NEX-7 with 832 files. Creating 166 files in  data/flickr-validation/Sony-NEX-7
Working on  