In [1]:
#Data
#79726 Test Images
#22425 Train Images
#26 Drivers (in training set)
#10 Class labels

In [2]:
#Verify we are in the lesson3 directory
%pwd

u'/home/ubuntu/deep_learning_course/lesson3'

In [3]:
#Create references to important directories we will use over and over
import os, sys
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir+'/data/statefarm'
METADATA_FILENAME = 'driver_imgs_list.csv'

In [4]:
#Allow relative imports to directories above lesson1/
sys.path.insert(1, os.path.join(sys.path[0], '..'))

#import modules
from utils.utils import *
from utils.vgg16 import Vgg16

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
%matplotlib inline

Using Theano backend.


In [5]:
def make_dir_if_not_exists(directory_path):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)

In [6]:
%mkdir $DATA_HOME_DIR/valid
%mkdir $DATA_HOME_DIR/results
%mkdir -p $DATA_HOME_DIR/test/unknown
%mkdir -p $DATA_HOME_DIR/sample/train
%mkdir -p $DATA_HOME_DIR/sample/test
%mkdir -p $DATA_HOME_DIR/sample/valid
%mkdir -p $DATA_HOME_DIR/sample/results

In [7]:
from glob import glob
import numpy as np
from shutil import copyfile
from random import randrange

In [8]:
def get_lines_from_file(filename):
    fileobj = open(os.path.join(DATA_HOME_DIR,filename),'r')
    headers = fileobj.readline()
    #Read lines into file without '\n'
    lines = fileobj.read().splitlines()
    fileobj.close()   
    return lines

In [9]:
def get_shuffled_images_by_person_and_class():
    lines = get_lines_from_file(METADATA_FILENAME)
    image_dict = {}
    for line in lines:
        subject,classlabel,filename = line.split(",")
        if subject not in image_dict:
            image_dict[subject] = {}
        if classlabel not in image_dict[subject]:
            image_dict[subject][classlabel] = [] #files array
        image_filenames = image_dict[subject][classlabel]
        #Randomly insert the new filename
        image_filenames.insert(randrange(len(image_filenames)+1), filename)
    return image_dict

images = get_shuffled_images_by_person_and_class()
print images['p002'].keys()

VALIDATION_SET_DRIVERS=['p002','p012','p014']
def create_validation_set_from_out_of_sample_drivers():
    '''
    Since the real-life test set will be different drivers
    Let's create a validation set of different drivers too!
    '''
    images = get_shuffled_images_by_person_and_class()
    for driver in VALIDATION_SET_DRIVERS:
        print "DRIVER="+driver
        for classlabel in images[driver].keys():
            print "CLASS="+classlabel
            make_dir_if_not_exists(DATA_HOME_DIR+'/valid/'+classlabel)
            train_filepath = DATA_HOME_DIR+'/train/'+classlabel+'/'
            valid_filepath = DATA_HOME_DIR+'/valid/'+classlabel+'/'
            for filename in os.listdir(train_filepath):
                if filename in images[driver][classlabel]:
                    print filename
                    os.rename(train_filepath+filename, valid_filepath+filename)
                    

create_validation_set_from_out_of_sample_drivers()

['c9', 'c8', 'c3', 'c2', 'c1', 'c0', 'c7', 'c6', 'c5', 'c4']
DRIVER=p002
CLASS=c9
img_88330.jpg
img_28463.jpg
img_30987.jpg
img_37091.jpg
img_85445.jpg
img_51172.jpg
img_59020.jpg
img_34195.jpg
img_13318.jpg
img_61361.jpg
img_28988.jpg
img_42064.jpg
img_19205.jpg
img_14413.jpg
img_56129.jpg
img_50897.jpg
img_45169.jpg
img_92289.jpg
img_61066.jpg
img_40093.jpg
img_77780.jpg
img_31757.jpg
img_29490.jpg
img_40697.jpg
img_46648.jpg
img_55213.jpg
img_6045.jpg
img_50883.jpg
img_86613.jpg
img_993.jpg
img_86087.jpg
img_9877.jpg
img_6171.jpg
img_52112.jpg
img_20981.jpg
img_9194.jpg
img_58919.jpg
img_8564.jpg
img_39097.jpg
img_6666.jpg
img_39644.jpg
img_18997.jpg
img_3484.jpg
img_67621.jpg
img_37205.jpg
img_60621.jpg
img_81316.jpg
img_45690.jpg
img_70124.jpg
img_63625.jpg
img_60364.jpg
CLASS=c8
img_28231.jpg
img_69584.jpg
img_100846.jpg
img_99160.jpg
img_96126.jpg
img_7186.jpg
img_6916.jpg
img_79203.jpg
img_3967.jpg
img_46280.jpg
img_12886.jpg
img_50461.jpg
img_33383.jpg
img_3371.jpg
img_95517.j

In [101]:
#Move representative sample of training images from /train to /valid
#20183 Training images (90%)
#2242 Validation Images (10%)
#26 Drivers
#10 Class labels
#86 images per driver
#8 images per class label per driver

#Organize images into classlabels and subjects randoming shuffling them simultanously
def get_shuffled_images_by_class_and_person(input_filename):
    lines = get_lines_from_file(METADATA_FILENAME)
    image_dict = {}
    for line in lines:
        subject,classlabel,filename = line.split(",")
        if classlabel not in image_dict:
            image_dict[classlabel] = {}
        if subject not in image_dict[classlabel]:
            image_dict[classlabel][subject] = []
        image_list = image_dict[classlabel][subject]
        image_list.insert(randrange(len(image_list)+1), filename)

 
    return image_dict

def create_driver_distributed_validation_set():
    '''
    Move 8 images per category per subject from /train to /valid
    Images are pre-shuffled inside each filename array
    This technique avoids bias toward drivers who have way more images than others
    '''
    VALIDATION_IMAGES_PER_DRIVER=8
    images = get_shuffle_images_by_class_and_person(METADATA_FILENAME)
    
    for classlabel in images.keys():
        make_dir_if_not_exists(DATA_HOME_DIR+'/valid/'+classlabel)
        for subject in images[classlabel].keys():
            for i in range(VALIDATION_IMAGES_PER_DRIVER):
                img_filename = images[classlabel][subject][i]
                train_filepath = DATA_HOME_DIR+'/train/'+classlabel+'/'+img_filename
                valid_filepath = DATA_HOME_DIR+'/valid/'+classlabel+'/'+img_filename
                os.rename(train_filepath, valid_filepath)

#create_driver_distributed_validation_set()

/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_71535.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_71535.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_95108.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_95108.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_91897.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_91897.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_53661.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_53661.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_59391.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_59391.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/train/c9/img_97229.jpg
/home/ubuntu/deep_learning_course/lesson3/data/statefarm/valid/c9/img_97229.jpg
/home/ubuntu/deep_learning_course/lesson

IndexError: list index out of range

In [None]:
# Old way just randomly moving over 8 images per classlabel   
#def create_driver_distributed_validation_set():
#    print "classlabel: "+classlabel
#    make_dir_if_not_exists(DATA_HOME_DIR+'/valid/'+classlabel)
#    g = glob(DATA_HOME_DIR+'/train/'+classlabel+'/*.jpg')
#    shuffled_filepaths = np.random.permutation(g)
#    for i in range(VALIDATION_IMAGES_PER_DRIVER): 
#        print shuffled_filepaths[i]
#        filename = shuffled_filepaths[i].split("/")[-1]
#        print filename
#        new_filepath = DATA_HOME_DIR+'/valid/'+classlabel+'/'+filename
#        os.rename(shuffled_filepaths[i], new_filepath)

In [35]:
#Create class subdirectories in sample/train and sample/valid
class_directory_names = [x[1] for x in os.walk(DATA_HOME_DIR+'/train')][0]

for class_name in class_directory_names:
    make_dir_if_not_exists(DATA_HOME_DIR+'/sample/train/'+class_name)
    make_dir_if_not_exists(DATA_HOME_DIR+'/sample/valid/'+class_name)

In [32]:
#Method to copy random files from one dir to another
def copy_random_files(current_dir, target_dir, count, file_ext):
    g = glob(current_dir+'/*.'+file_ext)
    shuffled_filename_paths = np.random.permutation(g)
    for i in range(count): 
        filename = shuffled_filename_paths[i].split('/')[-1]   
        copyfile(shuffled_filename_paths[i], target_dir+'/'+filename)
    
#copy_random_files(DATA_HOME_DIR+'/train/c0',
#                  DATA_HOME_DIR+'/sample/train/c0',
#                  200,'jpg')

In [33]:
#Copy Training Images To Sample/Train
SAMPLE_TRAIN_COUNT=200 #per c0,c1,c2... directory
for class_name in class_directory_names:
    current_filepath = DATA_HOME_DIR+'/train/'+class_name
    new_filepath = DATA_HOME_DIR+'/sample/train/'+class_name
    copy_random_files(current_filepath, new_filepath, SAMPLE_TRAIN_COUNT, 'jpg')

In [36]:
#Copy Valid Images To Sample/Valid
SAMPLE_VALID_COUNT=50 #per c0,c1,c2... directory
for class_name in class_directory_names:
    current_filepath = DATA_HOME_DIR+'/valid/'+class_name
    new_filepath = DATA_HOME_DIR+'/sample/valid/'+class_name
    copy_random_files(current_filepath, new_filepath, SAMPLE_VALID_COUNT, 'jpg')

In [37]:
#Copy images from /test to /sample/test
SAMPLE_TEST_COUNT=2000
current_filepath = DATA_HOME_DIR+'/test'
new_filepath = DATA_HOME_DIR+'/sample/test'
copy_random_files(current_filepath, new_filepath, SAMPLE_TEST_COUNT, 'jpg')

In [5]:
from keras.preprocessing import image
from utils.utils import plots

#Helper function to plot images by index in the validation set
def plots_idx(idx, titles=None):
    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [None]:
#Let's view some images
from PIL import Image

#get_image_filenames_from_directory(dir_name):
#    pass

Image.open(data_path + 'c2/' + filenames[2])