In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import keras
import os
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, Input, Concatenate

from photoz_utils import *
from DataMaker import *

GB_LIMIT = 5
BATCH_SIZE = 256

In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(GB_LIMIT*1000)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
available_modes = {"train", "test"}
class HDF5DataGenerator(Sequence):
    """Just a simple custom Keras HDF5 ImageDataGenerator.
    
    Custom Keras ImageDataGenerator that generates
    batches of tensor images (or data points) from HDF5 files with (optional) real-time
    data augmentation.
     
    Arguments
    ---------
    src : str
        Path of the hdf5 source file.
    label_key : str
        Key of the h5 file labels dataset.
        Default is "labels".
    classes_key : str
        Key of the h5 file dataset containing
        the raw classes.
        Default is None.
    batch_size : int
        Size of each batch, must be a power of two.
        (16, 32, 64, 128, 256, ...)
        Default is 32.
    shuffle : bool
        Shuffle images at the end of each epoch.
        Default is True.
    scaler : "std", "norm" or False
        "std" mode means standardization to range [-1, 1]
        with 0 mean and unit variance.
        "norm" mode means normalization to range [0, 1].
        Default is "std".
    num_classes : None or int
        Specifies the total number of classes
        for labels encoding.
        Default is None.
    labels_encoding : "hot", "smooth" or False
        "hot" mode means classic one hot encoding.
        "smooth" mode means smooth hot encoding.
        Default is "hot".
    smooth_factor : int or float
        smooth factor used by smooth
        labels encoding.
        Default is 0.1.
    augmenter : albumentations Compose([]) Pipeline or False
        An albumentations transformations pipeline
        to apply to each sample.
        Default is False.
    mode : str "train" or "test"
        Model generator type. "train" is used for
        fit_generator() and evaluate_generator.
        "test" is used for predict_generator().
        Default is "train".
        
    Notes
    -----
    Turn off scaler (scaler=False) if using the
    ToFloat(max_value=255) transformation from
    albumentations.
        
    Examples
    --------
    Example of usage:
    ```python
    my_augmenter = Compose([
        HorizontalFlip(p=0.5),
        RandomContrast(limit=0.2, p=0.5),
        RandomGamma(gamma_limit=(80, 120), p=0.5),
        RandomBrightness(limit=0.2, p=0.5),
        Resize(227, 227, cv2.INTER_AREA)
    ])

    # Create the generator.
    train_gen = HDF5ImageGenerator(
        'path/to/my/file.h5',
         augmenter=my_augmenter)
    ```
    """
    def __init__(
        self,
        src,
        label_key="labels",
        classes_key=None,
        batch_size=200,
        shuffle=True,
        scaler=True,
        num_classes=None,
        labels_encoding="hot",
        smooth_factor=0.1,
        augmenter=False,
        mode="train",
    ):

        if mode not in available_modes:
            raise ValueError('`mode` should be `train` '
                             '(fit_generator() and evaluate_generator()) or '
                             '`test` (predict_generator(). '
                             'Received: %s' % mode)
        self.mode = mode

        if labels_encoding not in available_labels_encoding:
            raise ValueError('`labels_encoding` should be `hot` '
                             '(classic binary matrix) or '
                             '`smooth` (smooth encoding) or '
                             'False (no labels encoding). '
                             'Received: %s' % labels_encoding)
        self.labels_encoding = labels_encoding

        if (self.labels_encoding == "smooth") and not (0 < smooth_factor <= 1):
            raise ValueError('`smooth` labels encoding '
                             'must use a `smooth_factor` '
                             '< 0 smooth_factor <= 1')

        if augmenter and not isinstance(augmenter, Compose):
            raise ValueError('`augmenter` argument '
                             'must be an instance of albumentations '
                             '`Compose` class. '
                             'Received type: %s' % type(augmenter))
        self.augmenter = augmenter

        self.src: str = src
        self.label_key: str = label_key
        self.classes_key: str = classes_key
        self.batch_size: int = batch_size
        self.shuffle: bool = shuffle
        self.scaler: bool = scaler
        self.num_classes: int = num_classes
        self.smooth_factor: float = smooth_factor

        self._indices = np.arange(self.__get_dataset_shape(self.label_key, 0))

    def __repr__(self):
        """Representation of the class."""
        return f"{self.__class__.__name__}({self.__dict__!r})"

    def __get_dataset_shape(self, dataset: str, index: int) -> Tuple[int, ...]:
        """Get an h5py dataset shape.
        
        Arguments
        ---------
        dataset : str
            The dataset key.
        index : int
            The dataset index.
         
        Returns
        -------
        tuple of ints
            A tuple of array dimensions.
        """
        with h5.File(self.src, "r") as file:
            return file[dataset].shape[index]

    def __get_dataset_items(
        self,
        indices: np.ndarray,
        dataset: Optional[str] = None
    ) -> Union[np.ndarray, Tuple[np.ndarray]]:
        """Get an HDF5 dataset items.
        
        Arguments
        ---------
        indices : ndarray, 
            The list of current batch indices.
        dataset : (optional) str
            The dataset key. If None, returns
            a batch of (image tensors, labels).
            Defaults to None.
         
        Returns
        -------
        np.ndarray or a tuple of ndarrays
            A batch of samples.
        """
        with h5.File(self.src, "r") as file:
            if dataset is not None:
                return np.asarray(file[dataset][indices]).astype('float32')
            else:
                return np.asarray(file[self.label_key][indices]).astype('float32')
    
    @property
    def num_items(self) -> int:
        """Grab the total number of examples
         from the dataset.
         
        Returns
        -------
        int
            The total number of examples.
        """
        with h5.File(self.src, "r") as file:
            return file[self.label_key].shape[0]
    
    @property 
    def classes(self) -> list:
        """Grab "human" classes from the dataset.
        
        Returns
        -------
        list
            A list of the raw classes.
        """
        if self.classes_key is None:
            raise ValueError('Canceled. parameter `classes_key` '
                             'is set to None.')
        
        with h5.File(self.src, "r") as file:
            return file[self.classes_key][:]

    def __len__(self):
        """Denotes the number of batches per epoch.
         
        Returns
        -------
        int
            The number of batches per epochs.
        """
        return int(
            np.ceil(
                self.__get_dataset_shape(self.label_key, 0) /
                float(self.batch_size)))

    @staticmethod
    def apply_labels_smoothing(batch_y: np.ndarray,
                               factor: float) -> np.ndarray:
        """Applies labels smoothing to the original
         labels binary matrix.
         
        Arguments
        ---------
        batch_y : np.ndarray
            Current batch integer labels.
        factor : float
            Smoothing factor.
        
        Returns
        -------
        np.ndarray
            A binary class matrix.
        """
        batch_y *= 1 - factor
        batch_y += factor / batch_y.shape[1]

        return batch_y

    def apply_labels_encoding(
            self,
            batch_y: np.ndarray,
            smooth_factor: Optional[float] = None) -> np.ndarray:
        """Converts a class vector (integers) to binary class matrix.
         See Keras to_categorical utils function.
         
        Arguments
        ---------
        batch_y : np.ndarray
            Current batch integer labels.
        smooth_factor : (optional) Float
            Smooth factor.
            Defaults to None.
        
        Returns
        -------
        np.ndarray
            A binary class matrix.
        """
        batch_y = to_categorical(batch_y, num_classes=self.num_classes)

        if smooth_factor is not None:
            batch_y = self.apply_labels_smoothing(batch_y,
                                                  factor=smooth_factor)

        return batch_y

    @staticmethod
    def apply_normalization(batch_X: np.ndarray) -> np.ndarray:
        """Normalize the pixel intensities. 
        
        Normalize the pixel intensities to the range [0, 1].
         
        Arguments
        ---------
        batch_X : np.ndarray
            Batch of image tensors to be normalized.
        
        Returns
        -------
        np.ndarray
            A batch of normalized image tensors.
        """
        return batch_X.astype("float32") / 4.0

    def __next_batch(self,
                     indices: np.ndarray) -> Tuple[np.ndarray]:
        """Generates a batch of train/val data for the given indices.
        
        Arguments
        ---------
        index : int
            The index for the batch.
            
        Returns
        -------
        tuple of ndarrays
            A tuple containing a batch of image tensors
            and their associated labels.
        """
        # Grab samples (tensors, labels) HDF5 source file.
        (batch_y) = np.asarray(self.__get_dataset_items(indices))


        # Shall we apply labels encoding?
        if self.labels_encoding:
            batch_y = self.apply_labels_encoding(
                batch_y,
                smooth_factor=self.smooth_factor
                if self.labels_encoding == "smooth" else None,
            )

        return (batch_y)

    def __getitem__(
            self,
            index: int) -> Union[np.ndarray, Tuple[np.ndarray]]:
        """Generates a batch of data for the given index.
        
        Arguments
        ---------
        index : int
            The index for the current batch.
            
        Returns
        -------
        tuple of ndarrays or ndarray
            A tuple containing a batch of image tensors
            and their associated labels (train) or
            a tuple of image tensors (predict).
        """
        # Indices for the current batch.
        indices = np.sort(self._indices[index * self.batch_size:(index + 1) *
                                        self.batch_size])

        if self.mode == "train":
            return self.__next_batch(indices)
        else:
            return self.__next_batch_test(indices)

    def __shuffle_indices(self):
        """If the shuffle parameter is set to True,
         dataset will be shuffled (in-place).
         (not available in test 'mode').
        """
        if (self.mode == "train") and self.shuffle:
            np.random.shuffle(self._indices)

    def on_epoch_end(self):
        """Triggered once at the very beginning as well as 
         at the end of each epoch.
        """
        self.__shuffle_indices()

In [4]:
args_images = {'label_key': 'image',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

args_specz = {'label_key': 'specz_redshift',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

TRAIN_PATH = '/data/HSC/HSC_v6/step2A/127x127/five_band_image127x127_with_metadata_corrected_training.hdf5'
VAL_PATH = '/data/HSC/HSC_v6/step2A/127x127/five_band_image127x127_with_metadata_corrected_validation.hdf5'
TEST_PATH = '/data/HSC/HSC_v6/step2A/127x127/five_band_image127x127_with_metadata_corrected_testing.hdf5'

In [5]:
args_g = {'label_key': 'g_cmodel_mag',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

args_r = {'label_key': 'r_cmodel_mag',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

args_i = {'label_key': 'i_cmodel_mag',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

args_z = {'label_key': 'z_cmodel_mag',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

args_y = {'label_key': 'y_cmodel_mag',
    'scaler': True,
    'labels_encoding': False,
    'batch_size': BATCH_SIZE,
    'mode': 'train',
    'shuffle': True}

In [6]:
train_images = HDF5DataGenerator(src=TRAIN_PATH, **args_images)
train_specz = HDF5DataGenerator(src=TRAIN_PATH, **args_specz)
train_g = HDF5DataGenerator(src=TRAIN_PATH, **args_g)
train_r = HDF5DataGenerator(src=TRAIN_PATH, **args_r)
train_i = HDF5DataGenerator(src=TRAIN_PATH, **args_i)
train_z = HDF5DataGenerator(src=TRAIN_PATH, **args_z)
train_y = HDF5DataGenerator(src=TRAIN_PATH, **args_y)

In [7]:
#train_images.__getitem__(0)

In [8]:
#train_specz.__getitem__(0)

In [9]:
class GalaxiesMLDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, image_gen, label_gen, mag_gens=[]):
        self.image_gen: HDF5DataGenerator = image_gen
        self.label_gen: HDF5DataGenerator = label_gen
        self.mag_gens: list[HDF5DataGenerator] = mag_gens

    def __len__(self):
        return len(self.label_gen)
    
    def __getitem__(self, index: int):
        print(index)
        mags = np.column_stack(self.mag_gens)[index]
        images = self.image_gen[index]
        labels = self.label_gen[index]
        return [mags, images], labels
    def on_epoch_end(self):
        self.image_gen.on_epoch_end()
        self.image_gen.on_epoch_end()
        for gen in range(len(mag_gens)):
            gen.on_epoch_end()

In [10]:
train_mags = [train_g, train_r, train_i, train_z, train_y]
test = GalaxiesMLDataGenerator(train_images, train_specz, train_mags)

In [11]:
test.__getitem__(0)[0][0]

0


  arr = array(v, copy=False, subok=True)


array([array([20.162785, 21.448307, 22.126463, 20.625122, 22.521536, 20.032892,
       22.786655, 20.899387, 22.237429, 21.116575, 21.12973 , 22.127296,
       20.314907, 22.21736 , 21.849468, 21.996683, 21.148739, 18.464205,
       20.998287, 21.048534, 22.113327, 20.225723, 21.952774, 21.787464,
       22.874868, 22.51006 , 17.967129, 18.923727, 17.454239, 22.351887,
       17.979868, 17.094007, 22.687695, 17.91275 , 22.172754, 19.440645,
       21.565865, 20.424072, 17.885138, 22.531103, 22.270655, 23.350174,
       22.029373, 19.626734, 20.254416, 20.490948, 21.80517 , 20.326132,
       19.658785, 20.720455, 18.112314, 21.72443 , 19.440845, 19.230871,
       22.222292, 22.4515  , 20.42808 , 19.95344 , 19.90621 , 21.468887,
       21.198145, 22.107922, 22.236313, 21.609041, 21.963835, 22.345888,
       21.066013, 22.922447, 21.16402 , 20.464642, 20.2089  , 20.086338,
       20.314669, 18.093761, 22.411438, 21.26249 , 21.231304, 22.137314,
       21.218239, 21.882399, 18.459158, 21.7

In [12]:
# repeat the same way Evan did

In [13]:
class JoinedMags(tf.keras.utils.Sequence):
    def __init__(self, input_gen1, input_gen2, input_gen3,input_gen4,input_gen5):
        self.gen1 = input_gen1
        self.gen2 = input_gen2
        self.gen3 = input_gen3
        self.gen4 = input_gen4        
        self.gen5 = input_gen5

        #assert len(input_gen1) == len(input_gen2) == len(target_gen)

    def __len__(self):
        return len(self.gen1)

    def __getitem__(self, i):
        x1 = self.gen1[i]
        x2 = self.gen2[i]
        x3 = self.gen3[i]
        x4 = self.gen4[i]
        x5 = self.gen5[i]

        return np.column_stack([x1, x2, x3, x4, x5])
    def on_epoch_end(self):
        self.gen1.on_epoch_end()
        self.gen2.on_epoch_end()
        self.gen3.on_epoch_end()
        self.gen4.on_epoch_end()
        self.gen5.on_epoch_end()

In [14]:
class JoinedGen(tf.keras.utils.Sequence):
    def __init__(self, input_gen1, input_gen2, target_gen):
        self.gen1 = input_gen1
        self.gen2 = input_gen2
        self.gen3 = target_gen

        #assert len(input_gen1) == len(input_gen2) == len(target_gen)

    def __len__(self):
        return len(self.gen1)

    def __getitem__(self, i):
        x1 = self.gen1[i]
        x2 = self.gen2[i]
        y = self.gen3[i]

        return [x1, x2], y

    def on_epoch_end(self):
        self.gen1.on_epoch_end()
        self.gen2.on_epoch_end()
        self.gen3.on_epoch_end()

In [15]:
train_mags2 = JoinedMags(train_g, train_r, train_i, train_z, train_y)
joint_gen = JoinedGen(train_images, train_mags2, train_specz)

In [18]:
joint_gen.__getitem__(0)[0][1]

array([[20.162785, 20.30324 , 20.003777, 19.59278 , 19.839014],
       [21.448307, 21.5829  , 21.402744, 20.637022, 20.885056],
       [22.126463, 20.349356, 19.516052, 19.154455, 18.874588],
       ...,
       [20.402996, 18.878391, 18.246067, 17.909672, 17.740322],
       [22.63111 , 22.008833, 21.642885, 21.354769, 21.28284 ],
       [19.446146, 17.942152, 17.334478, 17.024094, 16.81932 ]],
      dtype=float32)

In [None]:
# the mags are different in each gen. in test, there are 5 arrays with each having 256 mags of the same band.
#in joint_gen, there are 256 arrays with each having 5 mags of different bands