# Preprocessing

### Setup
- This code assumes the data is in 'train' and 'test' folders inside a 'data' directory that sits in the same directory as the notebook.
- Additionally, for this project the training data is assumed to be organised into ten subdirectories ('c0' to 'c9', one per class) when it is downloaded.

In [11]:
import os, sys
# Anchor the lesson directory on the notebook's working directory;
# the data directory is its 'data' subfolder.
LESSON_HOME_DIR = current_dir = os.getcwd()
DATA_HOME_DIR = LESSON_HOME_DIR + '/data'

In [12]:
# Add the parent of sys.path[0] to the module search path (at position 1) before the
# project imports below run. Presumably this is where utils / vgg16 live -- TODO confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

# Work from the lesson directory for the imports that follow.
%cd $LESSON_HOME_DIR
# Import modules
# NOTE(review): the star import makes it impossible to tell from this cell which names
# (possibly including `reload`) come from utils -- confirm against utils.py.
from utils import *
# Import vgg16 and immediately reload it, then pull in the Vgg16 class.
import vgg16; reload(vgg16)
from vgg16 import Vgg16
from shutil import copyfile
import numpy as np
from glob import glob

# Render matplotlib figures inline in the notebook.
%matplotlib inline

/home/ubuntu/driver


In [13]:
# Create directory structure:
%cd $DATA_HOME_DIR

%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

/home/ubuntu/driver/data


In [16]:
# Create subdirectory structure for the validation data (10 class labels):

%cd $DATA_HOME_DIR/valid
for i in range(10):
    os.mkdir(os.path.expanduser(DATA_HOME_DIR + '/valid/c' + str(i)))
    

/home/ubuntu/driver/data/valid


In [17]:
# Repeat this process so we have subdirectories (c0 .. c9) for our sample training and
# validation sets. Existing directories are skipped so the cell can be re-run; a bare
# os.mkdir would raise OSError on the second run.
for i in range(10):
    for subset in ('valid', 'train'):
        class_dir = os.path.expanduser(DATA_HOME_DIR + '/sample/' + subset + '/c' + str(i))
        if not os.path.isdir(class_dir):
            os.makedirs(class_dir)

### Create validation set

In [21]:
# Move 700 randomly chosen image files from each training data class to create validation set:

%cd $DATA_HOME_DIR/train
for i in range(10):
    class_path = DATA_HOME_DIR + '/train/c' + str(i)
    %cd $class_path
    g = glob('*.jpg')
    shuf = np.random.permutation(g)
    for j in range(700):
        os.rename(shuf[j], DATA_HOME_DIR + '/valid/' + ('c' + str(i) + '/') + shuf[j])
        #print(DATA_HOME_DIR + '/valid/' + ('c' + str(i) + '/') + shuf[j])

/home/ubuntu/driver/train
/home/ubuntu/driver/train/c0
/home/ubuntu/driver/train/c1
/home/ubuntu/driver/train/c2
/home/ubuntu/driver/train/c3
/home/ubuntu/driver/train/c4
/home/ubuntu/driver/train/c5
/home/ubuntu/driver/train/c6
/home/ubuntu/driver/train/c7
/home/ubuntu/driver/train/c8
/home/ubuntu/driver/train/c9


### Create sample data set (w/validation and test data)

In [18]:
from shutil import copyfile

In [None]:
# Create sample data. Includes copying files from both train and valid to create miniature subsets of each.

for i in range(10):
    train_class_path = DATA_HOME_DIR + '/train/c' + str(i)
    valid_class_path = DATA_HOME_DIR + '/valid/c' + str(i)
    %cd $train_class_path
    g = glob('*.jpg')
    shuf = np.random.permutation(g)
    for j in range(10):
        copyfile(shuf[j], DATA_HOME_DIR+'/sample/train/c' + str(i) + '/' + shuf[j])
        #print DATA_HOME_DIR+'/sample/train/c' + str(i) + '/' + shuf[j]
    %cd $valid_class_path
    g = glob('*.jpg')
    shuf = np.random.permutation(g)
    for j in range(3):
        copyfile(shuf[j], DATA_HOME_DIR+'/sample/valid/c' + str(i) + '/' + shuf[j])
        #print DATA_HOME_DIR+'/sample/valid/c' + str(i) + '/' + shuf[j]

In [None]:
# We need to move the test data to an 'unknown' directory so vgg can read it.

%cd $DATA_HOME_DIR/test
%mv *.jpg unknown/

%cd $DATA_HOME_DIR/sample/test
%mkdir unknown
%mv *.jpg unknown/

### Handover to model

#### Make batch generators for the training and validation data.

In [20]:
# Load vgg helper class.
%cd $LESSON_HOME_DIR

vgg = Vgg16()
batch_size = 64

/home/ubuntu/driver


#### Set up path variables

In [19]:
# Root of the data set in use. Swap to the commented-out assignment to
# test on the small sample instead.
path = DATA_HOME_DIR
#path = DATA_HOME_DIR + '/sample'

# Training and validation folders follow `path`.
train_path = path + '/train/'
valid_path = path + '/valid/'

# Test images and results always come from the full data directory.
results_path = DATA_HOME_DIR + '/results/'
test_path = DATA_HOME_DIR + '/test/'

In [5]:
# Ask the Vgg16 helper for batches from the training and validation folders;
# the validation call is given twice the training batch size.
batches = vgg.get_batches(
    train_path,
    batch_size=batch_size,
)
val_batches = vgg.get_batches(
    valid_path,
    batch_size=2 * batch_size,
)

array([[[[  43.,   40.,   41., ...,  151.,  214.,  163.],
         [  40.,   37.,   39., ...,   60.,   45.,    0.],
         [  36.,   34.,   37., ...,  218.,  218.,   85.],
         ..., 
         [  91.,   86.,   73., ...,   12.,   15.,   16.],
         [   0.,   49.,    0., ...,   11.,    9.,    9.],
         [   2.,    0.,   11., ...,    6.,   11.,   11.]],

        [[  52.,   49.,   50., ...,  182.,  255.,  221.],
         [  48.,   45.,   47., ...,   95.,   83.,   21.],
         [  44.,   42.,   45., ...,  255.,  247.,  104.],
         ..., 
         [ 135.,  126.,  101., ...,   11.,   14.,   15.],
         [  26.,   75.,    9., ...,   10.,    8.,    8.],
         [  18.,    6.,   15., ...,    5.,   10.,   10.]],

        [[  35.,   32.,   33., ...,  140.,  219.,  181.],
         [  33.,   30.,   32., ...,   55.,   46.,    0.],
         [  33.,   31.,   34., ...,  221.,  217.,   84.],
         ..., 
         [ 138.,  128.,  102., ...,    9.,   12.,   13.],
         [  28.,   76.,