# import

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
import glob
import datetime as dt
import tqdm
import json
import matplotlib.pyplot as plt
import numpy as np
import sys, os

sys.path.insert(0, os.path.abspath('../src/'))
from model.vgg19 import VGG19
from model.ViT import VisionTransformer
from model.augmentation import aug_process
from dataloader import dataloader

# mixed precision

In [2]:
is_mixed = True

In [3]:
if is_mixed:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision
    policy = mixed_precision.Policy('mixed_float16')
    mixed_precision.set_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0
Instructions for updating:
Use tf.keras.mixed_precision.LossScaleOptimizer instead. LossScaleOptimizer now has all the functionality of DynamicLossScale


# model type

In [4]:
model_type = 'EfficientNet'

# file path

In [5]:
json_open = open('../config.json', 'r')
config = json.load(json_open)

In [6]:
dataset_path = f'{config["filepath"]["dataset"]}'
partitions_path = f'{config["filepath"]["partitions"]}'
output_path = f'{config["filepath"]["output"]}/01'
file_name = f"{model_type}_{config['data']['img_size']}"
log_dir = f"{output_path}/logs/{file_name}"

os.makedirs(log_dir, exist_ok=True)

In [7]:
test_txt_path_list = glob.glob(f"{partitions_path}/Test*")
train_txt_path_list = glob.glob(f"{partitions_path}/Train*")
class_name_path = f"{partitions_path}/ClassName.txt"

# label to index

In [8]:
f = open(class_name_path, 'r')
label_name_list = f.readlines()
label_name_list = list(map(lambda tmp_path: tmp_path[:-1].split('/', 2)[2], label_name_list))
f.close()
label_to_index = dict((name, index) for index, name in enumerate(label_name_list))

# text to path list

In [9]:
def txt_to_path(txt_path_list):
    path_list = []
    for path in txt_path_list:
        f = open(path, 'r')
        path_list+=f.readlines()
        f.close()
    
    path_list = list(map(lambda tmp_path: dataset_path+tmp_path[:-1], path_list))#.remove(config["data"]["exclude_list"])
    path_list = sorted(list(set(path_list)-set(config["data"]["exclude_list"])))
    label_list = list(map(lambda tmp_path: label_to_index[tmp_path.split('/', 6)[6].rsplit('/', 1)[0]], path_list))
    return path_list, label_list

In [10]:
img_path_list, label_list = txt_to_path(train_txt_path_list)

In [11]:
train_img_path_list, val_img_path_list,\
train_label_list, val_label_list = train_test_split(img_path_list, label_list,
                                                    test_size=0.2, random_state=0)
test_img_path_list, test_label_list = txt_to_path(test_txt_path_list)

In [12]:
tmp_dataloader = dataloader(config['data']['batch_size'], config['data']['img_size'],)
train_ds = tmp_dataloader(train_img_path_list, train_label_list, shuffle_buffer=100)
val_ds = tmp_dataloader(val_img_path_list, val_label_list, train=False, shuffle=False)

# def model

In [13]:
strategy = tf.distribute.MirroredStrategy()
num_classes = len(label_to_index)+1
with strategy.scope():
    if model_type=='vgg19':
        model = VGG19(num_classes, img_size=config['data']['img_size'],)
    if model_type=='ViT':
        model = VisionTransformer(num_classes=num_classes, img_size=config['data']['img_size'])
    if model_type=='EfficientNet':
        model = tf.keras.applications.EfficientNetB7(classes=num_classes, weights=None,
                                                     input_shape=(config['data']['img_size'], config['data']['img_size'], 3))
    aug_model = aug_process(config['data']['img_size'])
    
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    def calc_loss(target_y, predicted_y):
        return tf.math.reduce_mean(loss_obj(target_y, predicted_y))
    
    acc_func = tf.keras.metrics.SparseCategoricalAccuracy()
    def calc_acc(target_y, predicted_y):
        return tf.math.reduce_mean(acc_func(target_y, predicted_y))
    
optimizer = tfa.optimizers.RectifiedAdam(lr=1e-3, clipnorm=0.01)
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
#model.compile(optimizer=opt,
#              loss='sparse_categorical_crossentropy', metrics=['acc'],)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensor

# train

In [14]:
#pytorch like learning rate scheduler
class ReduceLROnPlateau():
    def __init__(self, optimizer, patience, factor):
        self.optimizer = optimizer
        self.patience = patience
        self.factor = factor
        self.best_loss = None
        self.count = 0
    def step(self, loss):
        if self.best_loss is None:
            self.best_loss = loss
        elif self.best_loss>loss:
            self.count = 0
            self.best_loss = loss
        else:
            self.count+=1
            if self.count==self.patience:
                self.optimizer.learning_rate = self.optimizer.learning_rate*self.factor
                self.count=0

@tf.function
def train_step(input_img, label, optimizer):
    aug_img = aug_model(input_img)
    with tf.GradientTape() as GT:
        prediction = model(aug_img, training=True)
        loss = calc_loss(label, prediction)
        acc = calc_acc(label, prediction)
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_grad = GT.gradient(scaled_loss, model.trainable_variables)
    grad = optimizer.get_unscaled_gradients(scaled_grad)
    optimizer.apply_gradients(zip(grad, model.trainable_variables))
    
    return loss, acc

@tf.function
def distributed_train_step(input_img, label, optimizer):
    per_replica_losses, per_replica_acc = strategy.run(train_step, args=(input_img, label, optimizer))
    loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_losses, axis=None)
    acc = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_acc, axis=None)
    return loss, acc

@tf.function
def val_step(input_img, label):
    prediction = model(input_img)
    loss = calc_loss(label, prediction)
    acc = calc_acc(label, prediction)
    return loss, acc

@tf.function
def distributed_val_step(input_img, label):
    per_replica_losses, per_replica_acc = strategy.run(val_step, args=(input_img, label))
    loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_losses, axis=None)
    acc = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_acc, axis=None)
    return loss, acc

In [15]:
train_writer = tf.summary.create_file_writer(f"{log_dir}/train")
val_writer = tf.summary.create_file_writer(f"{log_dir}/val")

In [16]:
train_loss_list = []
train_acc_list = []
val_loss_list = []
val_acc_list = []
lr_scheduler = ReduceLROnPlateau(optimizer, patience=4, factor=0.95)

for epoch in range(config['data']['epochs']):
    
    
    #train
    st = dt.datetime.now()
    tmp_loss_list = []
    tmp_acc_list = []
    for inputs, outputs in train_ds:
        tmp_loss, tmp_acc = distributed_train_step(inputs, outputs, optimizer)
        tmp_loss_list.append(tmp_loss)
        tmp_acc_list.append(tmp_acc)
        
    train_loss = tf.math.reduce_mean(tmp_loss_list).numpy()
    train_loss_list.append(train_loss)
    
    train_acc = tf.math.reduce_mean(tmp_acc_list).numpy()
    train_acc_list.append(train_acc)
    
    tmp_time = dt.datetime.now()-st
    
    
    #validation
    tmp_loss_list = []
    tmp_acc_list = []
    for inputs, outputs in val_ds:
        tmp_loss, tmp_acc = distributed_val_step(inputs, outputs)
        tmp_loss_list.append(tmp_loss)
        tmp_acc_list.append(tmp_acc)
        
    val_loss = tf.math.reduce_mean(tmp_loss_list).numpy()
    val_loss_list.append(val_loss)
    
    val_acc = tf.math.reduce_mean(tmp_acc_list).numpy()
    val_acc_list.append(val_acc)
    
    str_time = str(tmp_time).split('.')[0]
    learning_rate = optimizer.learning_rate.numpy()
    
    for writer, loss, acc in zip([train_writer, val_writer], 
                                 [train_loss, val_loss],
                                 [train_acc, val_acc]):
        with writer.as_default():
            tf.summary.scalar("loss", loss, step=epoch)
            tf.summary.scalar("acc", acc, step=epoch)
            writer.flush()
    
    print(f'epoch:{epoch+1} - train_loss:{train_loss:.7f} - val_loss:{val_loss:.7f} - time:{str_time} - leaning_rate:{learning_rate:.8f}')
    
    lr_scheduler.step(val_loss)

INFO:tensorflow:batch_all_reduce: 711 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 711 all-reduces with algorithm = nccl, num_packs = 1


ResourceExhaustedError: 3 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[512,32,128,128] and type half on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
	 [[{{node StatefulPartitionedCall/replica_1/efficientnetb7/block1b_bn/FusedBatchNormV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[StatefulPartitionedCall/div_no_nan/ReadVariableOp_1/_48]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[512,32,128,128] and type half on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
	 [[{{node StatefulPartitionedCall/replica_1/efficientnetb7/block1b_bn/FusedBatchNormV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (2) Resource exhausted:  OOM when allocating tensor with shape[512,32,128,128] and type half on /job:localhost/replica:0/task:0/device:GPU:1 by allocator GPU_1_bfc
	 [[{{node StatefulPartitionedCall/replica_1/efficientnetb7/block1b_bn/FusedBatchNormV3}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[truediv_1/_152]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_train_step_406679]

Function call stack:
distributed_train_step -> distributed_train_step -> distributed_train_step


In [None]:
model.save_weights(f"{output_path}/{model_type}.h5")