In [1]:
import os
import glob
import pickle

import tensorflow as tf

# Custom imports
from werdich_cfr.tfutils.ModeltrainerInc1 import VideoTrainer

%load_ext autoreload
%autoreload 2

In [5]:
# GPU CONFIGURATION
# Physical GPU ids (PCI bus order) to expose to TensorFlow. After
# CUDA_VISIBLE_DEVICES is applied, the visible devices are re-indexed from 0,
# which is why the training device list is built from positions 0..n-1.
use_device_string = '2,3'
use_device_idx = list(range(len(use_device_string.split(','))))

# Must be set before TensorFlow enumerates the GPUs below.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = use_device_string

physical_devices = tf.config.list_physical_devices('GPU')

# Fail with an explicit message instead of an opaque IndexError when fewer
# GPUs are visible than were requested.
if len(physical_devices) < len(use_device_idx):
    raise RuntimeError(
        'Requested {} GPU(s) via CUDA_VISIBLE_DEVICES={!r}, but only {} visible.'.format(
            len(use_device_idx), use_device_string, len(physical_devices)))

# '/physical_device:GPU:0' -> '/GPU:0'
device_list = [physical_devices[idx].name.replace('physical_device:', '') for idx in use_device_idx]

print('AVAILABLE GPUs:')
print(*physical_devices, sep='\n')
print('TRAIN DEVICE LIST:')
print(*device_list, sep='\n')

# Best effort: enable on-demand GPU memory allocation. Only the documented
# failure modes are tolerated, and they are reported rather than hidden.
try:
    for dev in physical_devices:
        tf.config.experimental.set_memory_growth(dev, True)
except (ValueError, RuntimeError) as err:
    # ValueError: invalid device. RuntimeError: runtime already initialized
    # (e.g. this cell was re-run after the GPUs were first used).
    print('WARNING: could not enable GPU memory growth: {}'.format(err))

AVAILABLE GPUs:
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')
TRAIN DEVICE LIST:
/GPU:0
/GPU:1


In [14]:
# Model name
# The metadata date stamp is shared by the model name, the TFRecord directory
# name and the TFRecord file names.
cfr_meta_date = '200304'
model_name = 'cfr{}gpu4'.format(cfr_meta_date)

# Data root; logs are written to <cfr_dir>/log/<model_name>.
cfr_dir = os.path.normpath('/mnt/obi0/andreas/data/cfr')
log_dir = os.path.join(cfr_dir, 'log', model_name)
tfr_data_dir = os.path.join(cfr_dir, 'tfr_{}'.format(cfr_meta_date))

# Sorted lists of the train and eval TFRecord files found in tfr_data_dir.
train_files, eval_files = (
    sorted(glob.glob(os.path.join(
        tfr_data_dir,
        'cfr_resized75_a4c_{}_{}_*.tfrecords'.format(split, cfr_meta_date))))
    for split in ('train', 'eval')
)

In [16]:
# Display the evaluation TFRecord paths to confirm the glob pattern matched files.
eval_files

['/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_0.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_1.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_2.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_3.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_4.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_5.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_6.tfrecords',
 '/mnt/obi0/andreas/data/cfr/tfr_200304/cfr_resized75_a4c_eval_200304_7.tfrecords']

In [15]:
# Model parameters
# NOTE(review): the meaning of each key is defined by VideoTrainer, which is
# not visible here; im_size is presumably (height, width, channels) -- confirm.
model_dict = dict(
    name=model_name,
    im_size=(299, 299, 1),
    im_scale_factor=1.177,
    n_frames=40,
    filters=64,
    fc_nodes=128,
    kernel_init=tf.keras.initializers.GlorotNormal(),
    bias_init=tf.keras.initializers.Zeros(),
)

# Training parameters
# Devices and TFRecord file lists come from the GPU-configuration and path
# cells; the remaining values are fixed hyperparameters for this run.
train_dict = dict(
    train_device_list=device_list,
    learning_rate=0.0001,
    train_batch_size=40,
    eval_batch_size=40,
    validation_batches=None,
    validation_freq=1,
    n_epochs=100,
    verbose=1,
    buffer_n_batches_train=16,
    train_file_list=train_files,
    eval_file_list=eval_files,
)

In [16]:
# Display the device names that will be passed to the trainer.
train_dict['train_device_list']

['/GPU:0', '/GPU:1']

In [17]:
# Compile the model
# Build the trainer from the log directory and the two parameter dicts,
# then compile the model and print its layer summary.
VT = VideoTrainer(
    log_dir=log_dir,
    model_dict=model_dict,
    train_dict=train_dict,
)
model = VT.compile_inc1model()
model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensor