# How to use the HDF5DataGenerator class in DataMakerPlus.py

The purpose of the HDF5DataGenerator class is to feed a large training sample that includes images to a model in batches.
All you need are the file paths of the HDF5 files you want to use for training, validation, and testing.

Here is an example of using the generator for a CNN.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import keras
import os
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Input, Concatenate
from tensorflow.keras.optimizers import Adam

from photoz_utils import *
from DataMakerPlus import *

In [3]:
# Cap how much GPU memory TensorFlow may claim for this notebook.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # One virtual device on the first GPU, limited to 1000 (memory_limit is in MB).
    memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1000)
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Report the configuration failure instead of aborting the notebook.
        print(err)

1 Physical GPUs, 1 Logical GPUs


In [4]:
# Declare your HDF5 file paths.
# The three splits live in the same directory, so the directory is named once.
DATA_DIR = '/data/HSC/HSC_v6/step2A/127x127'
TRAIN_PATH = f'{DATA_DIR}/5x127x127_training_with_morphology_normalized.hdf5'
VAL_PATH = f'{DATA_DIR}/5x127x127_validation_with_morphology_normalized.hdf5'
TEST_PATH = f'{DATA_DIR}/5x127x127_testing_with_morphology_normalized.hdf5'

In [5]:
# Keyword arguments shared by every generator built for the image-only model.
gen_args = dict(
    image_key='image',          # column holding the images
    numerical_keys=None,        # no numerical features: images only
    y_key='specz_redshift',     # column holding the label to predict
    scaler=True,
    labels_encoding=False,
    batch_size=256,
    shuffle=False,
)

Put the name of the image column into 'image_key'. Put the column of your label (what you want to predict) into 'y_key'.
Put a list of numerical feature columns that you want to use into 'numerical_keys'. If you only wish to use the images, set
'numerical_keys' to None. I recommend setting 'shuffle' to False because it significantly speeds up training. It is also important to
set a batch size. 'scaler' divides pixel values by 255 to normalize them.

In [6]:
# Build one generator per split, each with the mode it should run in.
# NOTE(review): the validation generator uses mode='train', presumably so it
# also yields labels — confirm against HDF5DataGenerator.
train_gen, val_gen, test_gen = (
    HDF5DataGenerator(split_path, mode=split_mode, **gen_args)
    for split_path, split_mode in (
        (TRAIN_PATH, 'train'),
        (VAL_PATH, 'train'),
        (TEST_PATH, 'test'),
    )
)

In [7]:
# Make your model: a small CNN that maps a 5-channel 127x127 image to one value.
# The images are channels-first, so every conv/pool layer is told so explicitly.
input_ = Input(shape=(5,127,127))
conv1 = Conv2D(16, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(input_)
pool1 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv1)
conv2 = Conv2D(32, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(pool1)
pool2 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv2)
# Flatten the output of the last pooling layer; flattening conv2 would leave
# pool2 disconnected from the model.
flatten = Flatten()(pool2)
dense1 = Dense(200, activation='tanh')(flatten)
dense2 = Dense(64, activation='tanh')(dense1)
# Single linear unit: the model predicts one continuous value per image.
output = Dense(1)(dense2)
model = Model(inputs=[input_], outputs=[output])

In [8]:
# Keras expects `metrics` to be a list (a bare string is rejected by newer versions).
model.compile(optimizer=Adam(), loss='mse', metrics=['mse'])

In [None]:
# The generator already yields batches (see 'batch_size' in gen_args), so
# batch_size is not passed to fit() as well.
model.fit(train_gen, epochs=5, shuffle=True, verbose=1, validation_data=val_gen)

In [None]:
# Run the trained image-only model over the test generator.
pred = model.predict(x=test_gen)

The next example uses the generator for both images and photometry/morphology features.
Specify a list of numerical data columns and pass it into the args dict.

In [9]:
# Numerical feature columns fed to the model alongside the images.
num_keys = [
    'g_cmodel_mag',
    'g_isophotal_area',
    'r_cmodel_mag',
    'r_isophotal_area',
]  # and so on

# Same generator settings as before, but with the numerical columns switched on.
gen_args2 = dict(
    image_key='image',
    numerical_keys=num_keys,
    y_key='specz_redshift',
    scaler=True,
    labels_encoding=False,
    batch_size=256,
    shuffle=False,
)

In [10]:
# One generator per split for the image + numerical-feature model.
# NOTE(review): the validation generator uses mode='train', presumably so it
# also yields labels — confirm against HDF5DataGenerator.
train_gen2, val_gen2, test_gen2 = (
    HDF5DataGenerator(split_path, mode=split_mode, **gen_args2)
    for split_path, split_mode in (
        (TRAIN_PATH, 'train'),
        (VAL_PATH, 'train'),
        (TEST_PATH, 'test'),
    )
)

Make sure the numerical input layer's shape matches the length of the num_keys list.

In [None]:
# Two-branch model: a CNN over the images and a dense network over the
# numerical features, concatenated before a single regression output.

# Hyperparameters for the dense (numerical) branch. `hparams` was referenced
# below without being defined; 200 is an example value — tune it for your data.
hparams = {'num_dense_units': 200}

# Note the 2 input layers.
input_cnn = Input(shape=(5,127,127))
input_nn = Input(shape=(len(num_keys),)) # Don't forget to add the shape to correspond with all the num keys.

# CNN
# Five conv + pool stages, then two conv layers without pooling.
conv1 = Conv2D(32, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(input_cnn)
pool1 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv1)
conv2 = Conv2D(64, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(pool1)
pool2 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv2)
conv3 = Conv2D(128, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(pool2)
pool3 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv3)
conv4 = Conv2D(256, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(pool3)
pool4 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv4)
conv5 = Conv2D(256, kernel_size=(3, 3), activation='tanh', padding='same', data_format='channels_first')(pool4)
pool5 = MaxPooling2D(pool_size = (2,2), data_format='channels_first')(conv5)
conv6 = Conv2D(512, kernel_size=(3, 3),activation='relu', padding='same', data_format='channels_first')(pool5)
conv7 = Conv2D(512, kernel_size=(3, 3),activation='relu', padding='same', data_format='channels_first')(conv6)
flatten = Flatten()(conv7)
dense1 = Dense(512, activation='tanh')(flatten)
dense2 = Dense(128, activation='tanh')(dense1)
dense3 = Dense(32, activation='tanh')(dense2)

# NN
# Six equally sized ReLU layers over the numerical features.
hidden1 = Dense(hparams['num_dense_units'], activation="relu")(input_nn)
hidden2 = Dense(hparams['num_dense_units'], activation="relu")(hidden1)
hidden3 = Dense(hparams['num_dense_units'], activation="relu")(hidden2)
hidden4 = Dense(hparams['num_dense_units'], activation="relu")(hidden3)
hidden5 = Dense(hparams['num_dense_units'], activation="relu")(hidden4)
hidden6 = Dense(hparams['num_dense_units'], activation="relu")(hidden5)

# Concat & Output
concat = Concatenate()([dense3, hidden6]) # Concat the outputs of the two branches
output = Dense(1)(concat)
model2 = Model(inputs=[input_cnn, input_nn], outputs=[output]) # Specify 2 input layers here as well

In [None]:
# Keras expects `metrics` to be a list (a bare string is rejected by newer versions).
model2.compile(optimizer=Adam(), loss='mse', metrics=['mse'])

In [None]:
# The generator already yields batches (see 'batch_size' in gen_args2), so
# batch_size is not passed to fit() as well.
model2.fit(train_gen2, epochs=5, shuffle=True, verbose=1, validation_data=val_gen2)

In [None]:
# Predict with the two-input model on its own test generator; the image-only
# `model` and `test_gen` do not match this example.
pred = model2.predict(test_gen2)