In [1]:
from itertools import product
from tensorflow.keras.callbacks import ModelCheckpoint
from model import unet
from os.path import join

import data
import numpy as np
import tensorflow as tf

# Query the visible GPUs once, then report both the count and the device list.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print(f'Num GPUs Available: {len(gpu_devices)}\n')
print(gpu_devices)
# Uncomment to log the device each op is placed on:
#tf.debugging.set_log_device_placement(True)

Using TensorFlow backend.


Num GPUs Available: 8

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:4', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:5', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:6', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:7', device_type='GPU')]


In [2]:
# Root folder of the dataset; this is a machine-specific absolute path.
base_folder = '/home/alex/data/larson_2019/data'

# Mirror the model across GPUs 0 through 5.
gpu_names = [f'/gpu:{idx}' for idx in range(6)]
mirrored_strategy = tf.distribute.MirroredStrategy(devices=gpu_names)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5')


In [None]:
with mirrored_strategy.scope():  # run on every device mirrored_strategy was created with
    # Hyperparameters:
    batch_size = 128  # original: 23, (256x256). Can put a larger size because the system shares it
    target_size = (256, 256)
    epochs = 100
    steps_per_epoch = 60000 // batch_size  # original: 600 x 10^2 = 60000
    validation_steps = 25000 // batch_size  # original: 250 x 10^2 = 25000

    # Random augmentation for the training images.
    # NOTE(review): if train_generator forwards this dict to Keras'
    # ImageDataGenerator, rotation_range is expressed in degrees, so 0.2 is
    # an almost imperceptible rotation -- confirm this is the intended value.
    data_gen_args = dict(rotation_range=0.2,
                         width_shift_range=0.05,
                         height_shift_range=0.05,
                         shear_range=0.05,
                         zoom_range=0.05,
                         horizontal_flip=True,
                         fill_mode='nearest')

    # Validation images are left unaugmented so that val_loss (which drives
    # the checkpoint below) is measured on the same data every epoch.
    # Assumes train_generator accepts an empty aug_dict -- TODO confirm.
    valid_gen_args = dict()

    train_gene = data.train_generator(batch_size=batch_size,
                                      train_path=join(base_folder,
                                                      'train'),
                                      image_folder='image',
                                      label_folder='label',
                                      aug_dict=data_gen_args,
                                      target_size=target_size,
                                      save_to_dir=None)

    valid_gene = data.train_generator(batch_size=batch_size,
                                      train_path=join(base_folder,
                                                      'validate'),
                                      image_folder='image',
                                      label_folder='label',
                                      aug_dict=valid_gen_args,
                                      target_size=target_size,
                                      save_to_dir=None)

    # Single-channel input whose spatial size follows target_size, so the
    # generators and the network cannot drift apart.
    model = unet(input_size=target_size + (1,))

    # Keep only the weights with the lowest validation loss seen so far.
    model_checkpoint = ModelCheckpoint('unet_larson.hdf5',
                                       monitor='val_loss',
                                       verbose=1,
                                       save_best_only=True)

    history = model.fit(train_gene,
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        validation_data=valid_gene,
                        validation_steps=validation_steps,
                        verbose=1,
                        callbacks=[model_checkpoint])

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Found 60000 images belonging to 1 classes.
Found 60000 images belonging to 1 classes.
  ...
    to  
  ['...']
Found 25000 images belonging to 1 classes.
Found 25000 images belonging to 1 classes.
  ...
    to  
  ['...']
Train for 468 steps, validate for 195 steps
Epoch 1/100
INFO:tensorflow:batch_all_reduce: 48 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tenso

In [None]:
import matplotlib.pyplot as plt

# Plot the training and validation loss stored in history.history.
fig, ax = plt.subplots()
# Label each line directly so the legend cannot get out of sync with the
# order of the plot calls.
ax.plot(history.history['loss'], label='loss')
ax.plot(history.history['val_loss'], label='val_loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()