# Generating HDF5 data
**This notebook loads the CIFAR-10 dataset and writes it out as HDF5 files that can be read by an HDF5 data layer in Caffe.**

In [3]:
# As usual, a bit of setup

import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.cnn import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.data_utils import load_CIFAR10
from cs231n.gradient_check import eval_numerical_gradient_array, eval_numerical_gradient
from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.solver import Solver

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """Return the largest element-wise relative error between x and y.

  Each difference |x - y| is divided by |x| + |y|; that denominator is
  floored at 1e-8 so positions where both inputs are zero do not divide
  by zero. The maximum over all elements is returned.
  """
  abs_diff = np.abs(x - y)
  magnitude = np.abs(x) + np.abs(y)
  safe_magnitude = np.maximum(1e-8, magnitude)
  return np.max(abs_diff / safe_magnitude)

In [2]:
# Load the CIFAR10 arrays with load_CIFAR10. No preprocessing happens here;
# mean subtraction is applied later, in the HDF5 export cell.
rawdata = {}
(rawdata['X_train'], rawdata['y_train'],
 rawdata['X_test'], rawdata['y_test']) = load_CIFAR10('cs231n/datasets/cifar-10-batches-py')

# Move the last axis (size 3, presumably the colour channels) in front of the
# two 32-pixel axes, giving the (N, 3, 32, 32) layout shown in the output.
rawdata['X_train'] = rawdata['X_train'].transpose(0, 3, 1, 2)
rawdata['X_test'] = rawdata['X_test'].transpose(0, 3, 1, 2)

# items() plus a single pre-formatted string runs on both Python 2 and
# Python 3 and prints exactly what the old Python-2-only
# `print '%s: ' % k, v.shape` statement printed.
for k, v in rawdata.items():
    print('%s:  %s' % (k, v.shape))

X_test:  (10000, 3, 32, 32)
X_train:  (50000, 3, 32, 32)
y_train:  (50000,)
y_test:  (10000,)


# Now output the hdf5 data

In [11]:
import os
import h5py

script_dir = os.getcwd()

# Number of discretization levels; each level covers 256 / pts raw values.
pts = 128.0
bin_width = 256 / pts


def discretize(pixels, width):
    """Snap every value to the centre of its `width`-wide bin, capped at 255.0.

    Returns a new array; `pixels` itself is not modified.
    """
    centres = (pixels // width) * width + 0.5 * width
    # The centre of the top bin can exceed the largest 8-bit value.
    centres[centres > 255.0] = 255.0
    return centres


def write_hdf5(path, data, labels):
    """Write `data` and `labels` to a gzip-compressed HDF5 file at `path`.

    The datasets are named 'data' and 'label'. `data` is stored with the dtype
    it already has; `labels` is stored as uint8. An existing file at `path`
    is overwritten (mode 'w').
    NOTE(review): confirm the Caffe build in use accepts an integer 'label'
    dataset; some versions may expect floating-point data only.
    """
    with h5py.File(path, 'w') as f:
        f.create_dataset(
            'data', data=data,
            compression='gzip', compression_opts=1
        )
        f.create_dataset(
            'label', data=labels,
            compression='gzip', compression_opts=1,
            dtype='uint8',
        )


# This cell subtracts the mean from rawdata['X_train'] / ['X_test'] in place
# further down, so it is only correct on freshly loaded data. Running it a
# second time would discretize already-centred values and silently write a
# wrong file; fail loudly instead.
if rawdata['X_train'].min() < 0:
    raise RuntimeError(
        "rawdata['X_train'] contains negative values, so it looks like it was "
        "already mean-subtracted; re-run the data-loading cell before this one."
    )

# First discretize the data
rawdata['X_train_disc'] = discretize(rawdata['X_train'], bin_width)
rawdata['X_test_disc'] = discretize(rawdata['X_test'], bin_width)

# Centre every split with statistics computed from the training set only.
meandisc_image = np.mean(rawdata['X_train_disc'], axis=0)
mean_image = np.mean(rawdata['X_train'], axis=0)
rawdata['X_train'] -= mean_image
rawdata['X_test'] -= mean_image
rawdata['X_train_disc'] -= meandisc_image
rawdata['X_test_disc'] -= meandisc_image

# Arrays that go into the HDF5 files: float32 images and labels reshaped
# from (N,) to a column of shape (N, 1).
traindata = rawdata['X_train_disc'].astype('float32')
trainlabels = rawdata['y_train'][:, np.newaxis]

testdata = rawdata['X_test_disc'].astype('float32')
testlabels = rawdata['y_test'][:, np.newaxis]

# File names follow `pts`, so changing the number of levels cannot leave a
# misleading "disc128" name behind; for pts = 128 the names are unchanged.
write_hdf5(os.path.join(script_dir, 'cifar_train_disc%d_gzip.h5' % pts),
           traindata, trainlabels)
write_hdf5(os.path.join(script_dir, 'cifar_test_disc%d_gzip.h5' % pts),
           testdata, testlabels)