In [1]:
from __future__ import division, print_function
import os
from os.path import join, isfile
import random
from itertools import groupby
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from experiment import datadir
from experimenter.utilities import pop_per_key, sample_per_key, mkdir_p, write_datalist, groupby
%load_ext autoreload
%autoreload 2

In [None]:
# Prepare the output location for the generated data lists
# (mkdir_p presumably creates `datadir` if it is missing -- confirm in experimenter.utilities).
mkdir_p(datadir)
base_dir = '/home/bchu/data/sun397'
image_dir = join(base_dir, 'images')
# Seed both the stdlib and NumPy global RNGs with the same fixed value.
seed = 1035711226
random.seed(seed)
np.random.seed(seed)

# ClassName.txt holds one category per line. Blank lines are skipped so that a
# stray/trailing empty line cannot turn into an empty category name.
with open(join(base_dir, 'ClassName.txt'), 'r') as f:
    categories = [line.strip() for line in f if line.strip()]
print('Number of categories: %i' % len(categories))
def create_label2files(pairs):
    """Group ``pairs`` by the second element of each pair.

    The work is delegated to the ``groupby`` name in scope, called with the
    key function first and the iterable second. In this notebook that name is
    the helper imported from ``experimenter.utilities`` (imported after, and
    therefore shadowing, ``itertools.groupby``). Whatever that helper returns
    is returned unchanged.
    """
    def second_item(pair):
        return pair[1]

    return groupby(second_item, pairs)

In [None]:
# Build, for every category index, the list of (file path, label) pairs found in
# its folder, and collect all pairs of all categories in `train`.
label2files = {}
train = []
for i, category in enumerate(categories):
    # NOTE(review): folders are resolved against base_dir; image_dir is not used here -- confirm intended.
    folder_path = join(base_dir, category[1:]) # strip category's leading slash
    # os.listdir returns entries in arbitrary order; sort them so that the seeded
    # sampling performed on these lists later gives the same result on every run/machine.
    filenames = sorted(os.listdir(folder_path))
    files = [(join(folder_path, filename), i) for filename in filenames]
    label2files[i] = files
    # extend() copies the elements, so `train` stays independent of label2files[i].
    train.extend(files)

# Size of the smallest category.
min_images_per_class = min(len(pairs) for pairs in label2files.values())
print('Minimum number of train images per class: %i' % min_images_per_class )

In [None]:
# Draw the test and validation sets first: 25 and 5 items per key respectively.
# NOTE(review): pop_per_key presumably removes the returned items from label2files
# in place, which is what keeps the splits disjoint -- confirm in experimenter.utilities.
test = pop_per_key(label2files, 25)
val = pop_per_key(label2files, 5)

# Build nested training sets: each size re-uses the items of the previous size and
# only requests the increment (size - prev_size) per key.
sizes = [1, 10, 50, 70]
datasets = []
prev_size = 0
subset = []
for size in sizes:
    subset.extend(pop_per_key(label2files, size - prev_size))
    datasets.append(subset[:])  # snapshot; `subset` keeps growing in later iterations
    prev_size = size

# Materialize the pairs: on Python 3 a bare zip() is a one-shot iterator, so the
# print loop below would exhaust it and any later loop over `sizedatasets` would
# silently do nothing.
sizedatasets = list(zip(sizes, datasets))
print('Test set:', len(test))
print('Validation set:', len(val))
for size, dataset in sizedatasets:
    print('Train %i: %i' % (size, len(dataset)) )

In [None]:
# Build the "imagenetdelta" lists, in which every image gets the constant label 1.
#
# `train` contains every (path, label) pair that was listed, while the imagenetdelta
# test list is sampled from `test`. NOTE(review): `test` is presumably made of pairs
# that are also in `train` (pop_per_key draws them from the same lists) -- confirm.
# Removing the test pairs from the sampling pool guarantees that the imagenetdelta
# train/validate lists cannot share images with the imagenetdelta test list.
held_out = set(test)
imagenetdelta_pool = [pair for pair in train if pair not in held_out]
imagenetdelta_train, imagenetdelta_validate = train_test_split(imagenetdelta_pool, train_size=4500, test_size=500, random_state=seed)
# Replace the original category label by the constant label 1.
imagenetdelta_train = [ (path, 1)  for path, label in imagenetdelta_train ]
imagenetdelta_validate = [ (path, 1)  for path, label in imagenetdelta_validate ]
imagenetdelta_test = [ (path, 1)  for path, label in random.sample(test, 1000) ]
write_datalist(imagenetdelta_train, join(datadir, 'imagenetdelta_train.txt'))
write_datalist(imagenetdelta_validate, join(datadir, 'imagenetdelta_validate.txt'))
write_datalist(imagenetdelta_test, join(datadir, 'imagenetdelta_test.txt'))

In [None]:
# Write every split to its own list file inside `datadir`.
write_datalist(train, join(datadir, 'all.txt'))
write_datalist(test, join(datadir, 'test.txt'))
write_datalist(val, join(datadir, 'val.txt'))
# Pair sizes and datasets afresh instead of re-using `sizedatasets`: if that name
# is a zip iterator (Python 3) it has already been consumed by the earlier print
# loop, and no train%i.txt file would be written.
for size, dataset in zip(sizes, datasets):
    write_datalist(dataset, join(datadir, 'train%i.txt' % size ))

In [None]:
def overlap_count(s1, s2):
    """Return how many distinct elements occur in both ``s1`` and ``s2``.

    Both arguments are converted to sets first, so duplicates are counted once
    and all elements must be hashable.
    """
    first, second = set(s1), set(s2)
    shared = first.intersection(second)
    return len(shared)
# Sanity checks: validation and test share no item, and neither of them shares
# an item with any of the training subsets.
assert not overlap_count(val, test)
for train_subset in datasets:
    assert not overlap_count(val, train_subset)
    assert not overlap_count(test, train_subset)

In [None]:
# Collect the label (second element) of every validation pair. The previous
# `zip(*val)[1]` only works where zip() returns a list; on Python 3 zip() returns
# an iterator and indexing it raises TypeError. A tuple is kept as the result type.
vallabels = tuple(pair[1] for pair in val)
uniq = set(vallabels)
# Histogram of the validation labels.
# NOTE(review): with more than 256 distinct labels several labels share a bin;
# consider bins=len(uniq) if one bar per label is wanted.
plt.hist(vallabels, bins=256)