# VAE - Galaxy Images

2022-08-03

Zooey Nguyen

## Setup

In [1]:
import os
import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from DataMaker import *

In [2]:
LATENT_DIM = 16
IMAGE_SHAPE = (5, 127, 127)
BATCH_SIZE = 128
EPOCHS = 20
GB_LIMIT = 10
CHECKPOINTS_TO_SAVE = 4

In [3]:
DATASET_NAME = "HSC_v6_small"
MODEL_TYPE = "VAE"
MODEL_VERSION = "v1"

model_id = '_'.join([DATASET_NAME, MODEL_TYPE, MODEL_VERSION])
dir_model = os.path.join('/models', model_id, 'model')
dir_checkpoints = os.path.join('/models', model_id, 'checkpoints')
dir_logs = os.path.join('/logs', model_id)
dir_predictions = os.path.join('/predictions', model_id)

print(dir_model)
print(dir_checkpoints)
print(dir_logs)
print(dir_predictions)

os.makedirs(dir_model, exist_ok=True)
os.makedirs(dir_checkpoints, exist_ok=True)
os.makedirs(dir_logs, exist_ok=True)
os.makedirs(dir_predictions, exist_ok=True)

/models/HSC_v6_small_VAE_v1/model
/models/HSC_v6_small_VAE_v1/checkpoints
/logs/HSC_v6_small_VAE_v1
/predictions/HSC_v6_small_VAE_v1


## Allocate GPU

Make sure to check others' current usage on Aurora in terminal using `watch nvidia-smi`. Set your `GB_LIMIT` accordingly, in gigabytes. Aurora has 50GB of memory.

In [4]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=GB_LIMIT*1000)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

1 Physical GPUs, 1 Logical GPUs


## Data

Create the shuffled and batched data generators.

In [5]:
args_gen = {'X_key': 'image',
    'y_key': 'specz_redshift',
    'scaler': False,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

start_path = "/data/HSC/HSC_v6/step2A/127x127/five_band_image127x127_with_metadata_corrected_"
end_path = "_small.hdf5"
TRAIN_PATH = start_path + "training" + end_path
VAL_PATH = start_path + "validation" + end_path
TEST_PATH = start_path + "testing" + end_path

In [6]:
with h5py.File(TRAIN_PATH) as train_hf:
    print(list(train_hf.keys()))
    print(train_hf['image'].shape)
    TRAIN_SIZE = train_hf['image'].shape[0]

['coord', 'dec', 'g_cmodel_mag', 'g_cmodel_magsigma', 'i_cmodel_mag', 'i_cmodel_magsigma', 'image', 'object_id', 'r_cmodel_mag', 'r_cmodel_magsigma', 'ra', 'skymap_id', 'specz_dec', 'specz_flag_homogeneous', 'specz_mag_i', 'specz_name', 'specz_ra', 'specz_redshift', 'specz_redshift_err', 'y_cmodel_mag', 'y_cmodel_magsigma', 'z_cmodel_mag', 'z_cmodel_magsigma']
(10000, 5, 127, 127)


In [7]:
train_gen = HDF5ImageGenerator(src=TRAIN_PATH, **args_gen)
val_gen = HDF5ImageGenerator(src=VAL_PATH, **args_gen)
test_gen = HDF5ImageGenerator(src=TEST_PATH, **args_gen)

## Choose losses and metrics

We need these for callbacks and training the model.

In [8]:
LOSS = tf.keras.losses.MeanSquaredError()
METRICS = [
    tf.keras.metrics.MeanAbsoluteError()
]

## Callbacks

What we'd like to save during training.

- Metrics logs per epoch to visualise in TensorBoard.
- Model weights per epoch.
- Predictions and corresponding metrics per prediction per epoch.

An alternative to ReduceLROnPlateau is LearningRateScheduler to decrease step size after a fixed number of epochs, but I think we'd prefer to do it by metric.

In [9]:
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau

logs_callback = TensorBoard(log_dir=dir_logs)
weights_callback = ModelCheckpoint(filepath=os.path.join(dir_checkpoints, 'weights_epoch{epoch}.hdf5'),
                                   save_freq=int(EPOCHS/CHECKPOINTS_TO_SAVE),
                                   save_weights_only=True
                                  )
LR_callback = ReduceLROnPlateau()

## CVAE Model

### Define model

In [10]:
from tensorflow.keras.layers import Layer

class Sampling(Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Conv2DTranspose, MaxPooling2D
from tensorflow.keras.layers import Input, Flatten, Dense, Reshape
from tensorflow.keras.initializers import Zeros

class VAE(Model):
    def __init__(self):
        super(VAE, self).__init__()
        
        images = Input(shape=(IMAGE_SHAPE))
        x = Conv2D(32, (3,3), activation='relu', padding='same', data_format='channels_first')(images)
        x = Conv2D(32, (3,3), activation='relu', padding='same', data_format='channels_first')(x)
        x = Flatten()(x)
        mean = Dense(LATENT_DIM)(x)
        logvar = Dense(LATENT_DIM, kernel_initializer=Zeros)(x)
        samples = Sampling()([mean, logvar])
        self.encoder = Model(images, samples)

        latents = Input(shape=(LATENT_DIM,))
        x = Dense(units=np.prod(IMAGE_SHAPE), activation='relu')(latents)
        x = Reshape(target_shape=IMAGE_SHAPE)(x)
        x = Conv2DTranspose(32, (3,3), activation='relu', padding='same', data_format='channels_first')(x)
        x = Conv2DTranspose(32, (3,3), activation='relu', padding='same', data_format='channels_first')(x)
        x = Conv2DTranspose(5, (3,3), activation='sigmoid', padding='same', data_format='channels_first')(x)
        outputs = Reshape(target_shape=IMAGE_SHAPE)(x)
        self.decoder = Model(latents, outputs)
 
    def call(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

### Create model

In [12]:
model = VAE()

In [13]:
model.compile(optimizer='adam', loss='mse')

### Train model

In [14]:
history = model.fit(
    train_gen,
    epochs=EPOCHS,
    callbacks=[logs_callback, weights_callback],
    validation_data=val_gen,
    verbose=1)

Epoch 1/20


InvalidArgumentError:  Incompatible shapes: [128,1] vs. [128,5,127,127]
	 [[node mean_squared_error/SquaredDifference (defined at <ipython-input-14-6eecdda167c0>:1) ]] [Op:__inference_train_function_1448]

Errors may have originated from an input operation.
Input Source operations connected to node mean_squared_error/SquaredDifference:
 vae/model_1/reshape_1/Reshape (defined at <ipython-input-11-95c1c38ff78b>:30)

Function call stack:
train_function


### Save final model and results

In [None]:
model.save(dir_model)

Save training results.

In [None]:
train_preds = model.predict(train_gen)

In [None]:
train_preds_path = os.path.join(dir_predictions, "train_preds.csv")

In [None]:
np.savetxt(train_preds_path, train_preds)

Save validation results.

In [None]:
val_preds = model.predict(val_gen)

In [None]:
val_preds_path = os.path.join(dir_predictions, "val_preds.csv")

In [None]:
np.savetxt(val_preds_path, val_preds)

Save test results.

In [None]:
test_preds = model.predict(test_gen)

In [None]:
test_preds_path = os.path.join(dir_predictions, "test_preds.hdf5")

In [None]:
np.savetxt(test_preds_path, test_preds)

## Results

Reload previously saved model or saved results if you'd like to.

In [None]:
savedmodel = tf.keras.models.load_model(dir_model, compile=False)
savedmodel.compile(optimizer='adam', loss='mse', metrics=[data_callback.collector])

### Example results

### Evaluate model