In [1]:
import os
import re
sub_dir = [x for x in os.walk("data/dogscats/train/")]
labels = [re.sub(r'[^a-z0-9]+', ' ', dir_name.lower()) for dir_name in sub_dir[0][1]]
images = {}
for i, lab in enumerate(labels):
    images[lab] = {
        'dir': os.path.abspath(sub_dir[i+1][0]),
        'images': sub_dir[i+1][2]
    }

#Print first 10 images of the first label
print(images[labels[0]]['images'][1:10])

['cat.5303.jpg', 'cat.3715.jpg', 'cat.3301.jpg', 'cat.7418.jpg', 'cat.3593.jpg', 'cat.7769.jpg', 'cat.2282.jpg', 'cat.12432.jpg', 'cat.10506.jpg']


In [3]:
import os
import re
import hashlib

MAX_NUM_IMAGES_PER_CLASS = 2 ** 27 - 1

def hash_filename_to_percentage(file_name, salt=''):
    base_file = salt+os.path.basename(file_name)
    hash_name = re.sub(r'_nohash_.*$', '', base_file)
    hash_name_hashed = hashlib.sha1(hash_name.encode('utf-8')).hexdigest()
    percentage_hash = ((int(hash_name_hashed, 16) %
                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                         (1.0 / MAX_NUM_IMAGES_PER_CLASS))
    return percentage_hash

[hash_filename_to_percentage(x) for x in images[labels[0]]['images'][:10]]

[0.14222791896930276,
 0.3191536316212537,
 0.8284764426088067,
 0.3231362053985611,
 0.3535057183616289,
 0.8567337308580706,
 0.08962918884775928,
 0.6806448003697753,
 0.8907745248882064,
 0.6396378400894838]

In [4]:
import warnings
import math

def create_train_valid_test_split(filelist, train_percent=0.6, test_percent=0.2, valid_percent=0.2):
    train = []
    valid = []
    test = []
    assert(train_percent + test_percent <= 1)
    if train_percent + test_percent + valid_percent != 1.0:
        valid_percent = 1.0 - test_percent - train_percent
        warnings.warn("sum not equal to 1 - valid_percent set to {}".format(valid_percent))
        
    for file in filelist:
        percentage = hash_filename_to_percentage(file)
        if percentage < train_percent:
            train.append(file)
        elif percentage < train_percent + test_percent:
            test.append(file)
        else:
            valid.append(file)
    return train, valid, test

trn, val, tst = create_train_valid_test_split(images[labels[0]])

In [8]:
data = {}
for label, dictionary in images.items():
    trn, val, tst = create_train_valid_test_split(dictionary['images'])
    data[label] = {}
    data[label]['path'] = dictionary['dir']
    data[label]['train'] = trn
    data[label]['test'] = tst
    data[label]['valid'] = val

In [13]:

def move_data_to_train_valid_test_split(data, directory):
    
    for sub in ['','train/','test','valid']:
        if not os.path.exists(directory+sub):
            os.makedirs(directory+sub)
            
    for sub in ['train/','test','valid']:
        for label in data.keys():
            os.makedirs(directory+sub+label+"/")
    
move_data_to_train_valid_test_split(data, 'data/dogscats-split/') 


In [9]:
data.keys()

dict_keys(['cat', 'dog'])

In [11]:
extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']

In [17]:
tmp_file = os.path.basename(images[labels[0]][0])
print(tmp_file)

cat.2015.jpg


In [18]:
hash_name = re.sub(r'_nohash_.*$', '', tmp_file)
print(hash_name)

cat.2015.jpg


In [21]:
import hashlib

print(hash_name_hashed)

ec225a94b98cbc0054eca64c2e032e1cf9234864


In [29]:
percentage_hash = ((int(hash_name_hashed, 16) %
                          (MAX_NUM_IMAGES_PER_CLASS + 1)) *
                         (1.0 / MAX_NUM_IMAGES_PER_CLASS))
print(percentage_hash)

0.14222791896930276


0.14222791896930276