# **Mixed Precision Training**

Micikevicius, P., Narang, S., Alben, J., Diamos, G., Elsen, E., Garcia, D., ... & Wu, H. (2017). Mixed precision training. arXiv preprint arXiv:1710.03740.

All experimental results are uploaded to TensorBoard.dev, and you can access them below:

*https://tensorboard.dev/experiment/lYlje1KYQ1KULHjd2Qj9jw/*

## **Default Setting**

In [1]:
!pip3 install -q tensorflow-datasets

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds

import math
import os

print(f"tf.__version__: {tf.__version__}")
print(f"tfds.__version__: {tfds.__version__}")

tf.__version__: 2.4.1
tfds.__version__: 4.0.1


In [3]:
## Using NVIDIA Tesla V100.
!nvidia-smi

Tue Apr 13 07:07:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
class HParams(object):
    """Container for the hyperparameters shared by the cells of this notebook."""

    def __init__(self):
        ## Forwarded to `model.compile(steps_per_execution = ...)` by the training cells.
        self.steps_per_execution = 16

        ## Base batch size; `get_dataset` doubles it for mixed precision training.
        self.global_batch_sz = 128
        ## Shuffle buffer size of the training pipeline.
        self.buffer_sz = 20_000
        self.auto = tf.data.experimental.AUTOTUNE

        ## Target (height, width) used by `resize_and_rescale`.
        self.image_sz = [224, 224]

        self.learning_rate = 1e-3
        self.epochs = 20
        ## Both step counts are filled in by `get_dataset` once the batch size is known.
        self.steps_per_epoch = None
        self.validation_steps = None

HPARAMS = HParams()

## **Prepare Dataset**

In [5]:
@tf.function
def resize_and_rescale(image, label):
    """Resize images to `HPARAMS.image_sz` and scale pixel values by 1/255.

    Used as the `map` function of the pipelines built in `get_dataset`,
    where it is applied after batching.

    Args:
        image: Image tensor to resize.
        label: Label tensor; it is cast to float32.

    Returns:
        A tuple `(image, label)`, both float32.
    """
    resized = tf.image.resize(image, HPARAMS.image_sz)
    rescaled = tf.cast(resized, tf.float32) / 255.
    return rescaled, tf.cast(label, tf.float32)


def get_shapes(element_spec):
    """Collect the `.shape` of every entry of a (possibly nested) spec structure.

    Entries that are tuples are processed recursively and become nested
    lists; every other entry contributes its `shape` attribute.
    """
    shapes = []
    for spec in element_spec:
        if isinstance(spec, tuple):
            shapes.append(get_shapes(spec))
        else:
            shapes.append(spec.shape)
    return shapes

In [6]:
def get_dataset(is_mixed_precision_training = True):
    """Build the CIFAR-100 train / validation / test input pipelines.

    The first 10,000 examples of the deterministically shuffled training
    split are held out for validation and the remaining ones are used for
    training. As a side effect, `HPARAMS.steps_per_epoch` and
    `HPARAMS.validation_steps` are updated to match the batch size in use.

    Args:
        is_mixed_precision_training: If True, the batch size is twice
            `HPARAMS.global_batch_sz`; otherwise it is `HPARAMS.global_batch_sz`.

    Returns:
        A tuple `(tr_ds, vl_ds, ts_ds)`. `tr_ds` and `vl_ds` repeat
        indefinitely; `ts_ds` is a single pass over the test split.
    """
    ## Split sizes; the step counts below assume that 40,000 training
    ## examples remain after 10,000 are held out for validation.
    num_train_examples = 40_000
    num_valid_examples = 10_000

    ## Computed once so that every pipeline and step count agrees on it.
    batch_sz = HPARAMS.global_batch_sz * (2 if is_mixed_precision_training else 1)

    ## Load dataset from tfds.
    tr_ds = tfds.load(
        "cifar100",
        split = "train",
        as_supervised = True,
    )
    ## Shuffle once with a fixed seed. `reshuffle_each_iteration = False` keeps
    ## the order identical on every pass over the data, so `take` / `skip` below
    ## always select the same examples (also after a runtime restart) and the
    ## train and validation splits cannot overlap.
    tr_ds = tr_ds.shuffle(100_000, seed = 42, reshuffle_each_iteration = False)

    ts_ds = tfds.load(
        "cifar100",
        split = "test",
        as_supervised = True,
    )

    ## Building.
    vl_ds = (
        tr_ds.take(num_valid_examples)
        .cache()
        .repeat()
        .batch(batch_sz)
        .map(resize_and_rescale, num_parallel_calls = HPARAMS.auto)
        .prefetch(HPARAMS.auto)
    )

    ## Only the training pipeline is re-shuffled while iterating.
    tr_ds = (
        tr_ds.skip(num_valid_examples)
        .cache()
        .repeat()
        .shuffle(HPARAMS.buffer_sz, reshuffle_each_iteration = True)
        .batch(batch_sz)
        .map(resize_and_rescale, num_parallel_calls = HPARAMS.auto)
        .prefetch(HPARAMS.auto)
    )

    ts_ds = (
        ts_ds.cache()
        .batch(batch_sz)
        .map(resize_and_rescale, num_parallel_calls = HPARAMS.auto)
        .prefetch(HPARAMS.auto)
    )

    ## Round up so that a final partial batch still counts as a step, without
    ## adding a spurious extra step when the split size is an exact multiple.
    steps_per_epoch = math.ceil(num_train_examples / batch_sz)
    validation_steps = math.ceil(num_valid_examples / batch_sz)

    HPARAMS.steps_per_epoch = steps_per_epoch
    HPARAMS.validation_steps = validation_steps

    print(f"Global batch size: {batch_sz}")
    print(f"Steps per epoch: {steps_per_epoch} (total {steps_per_epoch * HPARAMS.epochs} batches)")
    print(f"Validation steps: {validation_steps} (total {validation_steps * HPARAMS.epochs} batches)")

    print(f"\ntr_ds.element_spec: {get_shapes(tr_ds.element_spec)}")
    print(f"ts_ds.element_spec: {get_shapes(ts_ds.element_spec)}\n")

    return tr_ds, vl_ds, ts_ds

## **Modeling**

In [7]:
def SE_Block(
    x,
    reduction_rate = 24,
    apply_type = "transformed",
):
    """Apply a squeeze-and-excitation gate to the feature map `x`.

    The input is globally average-pooled, passed through a two-layer
    bottleneck (ReLU6, then sigmoid) and the result is multiplied
    channel-wise with the original input.

    Args:
        x: Input feature map; its channel count must be divisible by
            `reduction_rate`.
        reduction_rate: Divisor applied to the channel count to obtain the
            bottleneck width.
        apply_type: "textbook" builds the bottleneck from Dense layers (per the
            original author's note, the form used in the original paper);
            "transformed" uses 1x1 Conv2D layers instead. Case-insensitive.

    Returns:
        The gated feature map.
    """
    assert not (x.shape[-1] % reduction_rate), f"x.shape {x.shape} must be divided by reduction_rate {reduction_rate}"
    mode = apply_type.lower()
    assert mode in ["textbook", "transformed"]

    block_input = x

    ## Squeeze: one value per channel, kept 4-D so that it broadcasts later.
    gate = tf.keras.layers.GlobalAveragePooling2D()(block_input)
    gate = tf.keras.layers.Reshape((1, 1, -1))(gate)

    bottleneck_channels = gate.shape[-1] // reduction_rate
    restored_channels = block_input.shape[-1]

    ## Both variants share the same structure and differ only in the
    ## projection layer type.
    if mode == "textbook":
        make_projection = lambda units: tf.keras.layers.Dense(units)
    else:
        ## Original author's note: it is not certain whether the activation
        ## function is applied in this variant, but it is assumed to be.
        make_projection = lambda units: tf.keras.layers.Conv2D(units, 1, padding = "same")

    ## Excitation: reduce, ReLU6, restore, sigmoid.
    gate = make_projection(bottleneck_channels)(gate)
    gate = tf.keras.layers.Activation(tf.nn.relu6)(gate)
    gate = make_projection(restored_channels)(gate)
    gate = tf.keras.layers.Activation(tf.nn.sigmoid)(gate)

    ## Channel-wise multiplication with the untouched input.
    return tf.keras.layers.Multiply()([gate, block_input])

In [8]:
def round_filters(filters, width_coefficient, depth_divisor = 8):
    """Scale a filter count by the width multiplier and snap it to a multiple of `depth_divisor`.

    Args:
        filters: Base number of filters.
        width_coefficient: Width multiplier applied to `filters`.
        depth_divisor: The result is a multiple of this value and at least this value.

    Returns:
        The rounded number of filters as an int.
    """
    scaled = filters * width_coefficient

    ## Round to the nearest multiple of depth_divisor, but never below one multiple.
    snapped = (int(scaled + depth_divisor / 2) // depth_divisor) * depth_divisor
    if snapped < depth_divisor:
        snapped = depth_divisor

    ## Make sure that rounding down does not lose more than 10%.
    if snapped < 0.9 * scaled:
        return int(snapped + depth_divisor)
    return int(snapped)


def round_repeats(repeats, depth_coefficient):
    """Scale a block-repeat count by the depth multiplier, rounding up to a whole number."""
    scaled_repeats = depth_coefficient * repeats
    return int(math.ceil(scaled_repeats))


def ConvBNReLU(
    x,
    layer_type,
    output_channels = None,
    kernel_size = 3,
    strides = 1,
    activation_fn = tf.nn.relu6,
    expansion_factor = 6,
    reduction_rate = 24,
):
    """Apply one convolution + batch-normalization stage of the given type.

    Args:
        x: Input feature map.
        layer_type: One of "expansion", "depthwise", "pointwise" or "naive"
            (case-insensitive):
              * "expansion": 1x1 convolution to `channels * expansion_factor`,
                followed by BN and the activation.
              * "depthwise": depthwise convolution, BN, activation, then an
                `SE_Block`.
              * "pointwise": 1x1 convolution to `output_channels` and BN, with
                no activation (linear).
              * "naive": regular convolution to `output_channels`, BN and the
                activation.
        output_channels: Number of output channels; required for "pointwise"
            and "naive", ignored otherwise.
        kernel_size: Kernel size of the "depthwise" and "naive" convolutions.
        strides: Strides of the "depthwise" and "naive" convolutions.
        activation_fn: Activation applied after batch normalization.
        expansion_factor: Channel multiplier of the "expansion" stage.
        reduction_rate: Preferred reduction rate of the SE block in the
            "depthwise" stage.

    Returns:
        The transformed feature map.
    """
    kind = layer_type.lower()
    assert kind in ["expansion", "depthwise", "pointwise", "naive"]

    if kind == "expansion":
        ## Conv 1x1
        x = tf.keras.layers.Conv2D(x.shape[-1] * expansion_factor, 1, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

    elif kind == "depthwise":
        ## Dwise 3x3
        x = tf.keras.layers.DepthwiseConv2D(kernel_size, strides = strides, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

        ## SE_Block is only attached behind a depthwise convolution. Fall back
        ## to a reduction rate of 4 when the channel count is not divisible by
        ## the requested rate.
        scaled_reduction_rate = 4 if x.shape[-1] % reduction_rate else reduction_rate

        x = SE_Block(x, scaled_reduction_rate)

    elif kind == "pointwise":
        ## Conv 1x1
        assert output_channels is not None, "'output_channels' is required for a pointwise layer."
        x = tf.keras.layers.Conv2D(output_channels, 1, padding = "same")(x) ## no activation, i.e. use linear.
        x = tf.keras.layers.BatchNormalization()(x)

    else: ## naive
        assert output_channels is not None, "'output_channels' is required for a naive layer."
        x = tf.keras.layers.Conv2D(output_channels, kernel_size, strides = strides, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

    return x


def InvertResidualBlock(
    x, 
    output_channels,
    kernel_size = 3, 
    strides = 1,
    expansion_factor = 6,
):
    """Chain the expansion, depthwise and pointwise `ConvBNReLU` stages.

    A skip connection is added around the three stages when `strides` is 1
    and the number of output channels equals the number of input channels.

    Args:
        x: Input feature map.
        output_channels: Number of channels produced by the pointwise stage.
        kernel_size: Kernel size forwarded to every stage.
        strides: Strides of the depthwise stage; must be 1 or 2.
        expansion_factor: Expansion factor forwarded to every stage.

    Returns:
        The output feature map.
    """
    assert strides in [1, 2], f"Argument 'strides' must be 1 or 2, not {strides}."
    block_input = x

    ## Arguments that every stage receives unchanged.
    shared_kwargs = dict(kernel_size = kernel_size, expansion_factor = expansion_factor)

    x = ConvBNReLU(x, "expansion", **shared_kwargs)
    x = ConvBNReLU(x, "depthwise", strides = strides, **shared_kwargs)
    x = ConvBNReLU(x, "pointwise", output_channels = output_channels, **shared_kwargs)

    ## The shortcut is only valid when the channel count is unchanged and no
    ## striding was applied.
    can_add_shortcut = strides == 1 and x.shape[-1] == block_input.shape[-1]
    if not can_add_shortcut:
        return x

    return tf.keras.layers.Add()([x, block_input])

In [9]:
def EfficientNet(
    compound_coefficient = 0,
    prefix = None,
) -> tf.keras.Model:
    """Build an EfficientNet-B<phi> model with the Keras functional API.

    Args:
        compound_coefficient: Compound scaling coefficient phi, an integer in
            [0, 7]. It selects the (depth, width, input resolution) triple.
        prefix: Optional tag appended to the model name, giving
            `EfficientNet-B<phi>-<prefix>`. When None, no tag is appended.

    Returns:
        An uncompiled `tf.keras.Model`.
    """
    assert compound_coefficient in range(8), \
        f"Compound scaling coefficient phi must be in range [0, 7], not {compound_coefficient}"

    def EfficientNet_Baseline(
        depth_coefficient,
        width_coefficient,
        image_size,
        model_name,
        reduction_rate = 24,
        embedding_dims = 100,
        apply_classifier = True,
    ) -> tf.keras.Model:
        """Assemble the network for one (depth, width, resolution) setting.

        Args:
            depth_coefficient: Multiplier for the number of block repeats.
            width_coefficient: Multiplier for the number of filters.
            image_size: Side length of the square RGB input.
            model_name: Name given to the resulting Keras model.
            reduction_rate: NOTE(review): currently not forwarded to the
                blocks, which use the default of `ConvBNReLU` instead.
            embedding_dims: Number of units of the final Dense layer.
            apply_classifier: If True, a float32 softmax is applied to the
                output of the final Dense layer.
        """
        ## Readjust resolution from gamma.
        x = model_input = tf.keras.layers.Input(shape = (image_size, image_size, 3))

        ## Entry flow (stem).
        x = ConvBNReLU(x, "naive", kernel_size = 3, strides = 2, output_channels = 32)
        x = InvertResidualBlock(x, 16, expansion_factor = 1)

        ## Middle flow.
        ## Each entry is (filters, kernel size, repeats, stride).
        args = [
            (24,  3, 2, 2),
            (40,  5, 2, 2),
            (80,  3, 3, 2),
            (112, 5, 3, 1),
            (192, 5, 4, 2),
            (320, 3, 1, 1)]

        for (filters, kernel_size, repeats, strides) in args:
            ## Newly scaled parameters are delivered while retaining the existing arguments.
            scaled_filters = round_filters(filters, width_coefficient)
            scaled_repeats = round_repeats(repeats, depth_coefficient)

            ## The first layer of each sequence has a stride s and all others use stride 1.
            x = InvertResidualBlock(x, scaled_filters, kernel_size = kernel_size, strides = strides)
            for _ in range(1, scaled_repeats):
                x = InvertResidualBlock(x, scaled_filters, kernel_size = kernel_size, strides = 1)

        ## Exit flow.
        x = ConvBNReLU(x, "naive", kernel_size = 1, output_channels = 1_280)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)

        model_output = x = tf.keras.layers.Dense(embedding_dims)(x) ## fixed
        if apply_classifier:
            ## The softmax is explicitly computed in float32.
            model_output = tf.keras.layers.Softmax(dtype = tf.float32)(model_output)

        return tf.keras.Model(
            inputs = model_input,
            outputs = model_output,
            name = model_name)

    ## The textbook coefficients are as follows,
    ## but slightly adjusted values are actually applied:
    ##
    ##     depth_coefficient = 1.2
    ##     width_coefficient = 1.1
    ##     resol_coefficient = 1.15
    ##
    ##     scaled_depth_coefficient = depth_coefficient ** compound_coefficient
    ##     scaled_width_coefficient = width_coefficient ** (compound_coefficient * 0.5)
    ##     scaled_resol_coefficient = resol_coefficient ** (compound_coefficient * 0.5)

    ## Passed positionally to EfficientNet_Baseline, i.e. each entry is
    ## phi: (depth_coefficient, width_coefficient, image_size).
    coefficient_args = {
        0: (1.0, 1.0, 224),
        1: (1.0, 1.1, 240),
        2: (1.1, 1.2, 260),
        3: (1.2, 1.4, 300),
        4: (1.4, 1.8, 380),
        5: (1.6, 2.2, 456),
        6: (1.8, 2.6, 528),
        7: (2.0, 3.1, 600)}

    ## Only append the tag when one was given, to avoid names ending in "-None".
    model_name = f"EfficientNet-B{compound_coefficient}"
    if prefix is not None:
        model_name = f"{model_name}-{prefix}"

    return EfficientNet_Baseline(
        *coefficient_args[compound_coefficient],
        model_name = model_name
    )

## **Callbacks**

In [10]:
def get_callbacks(model_name):
    """Return the list of Keras callbacks used for training.

    Args:
        model_name: Used as the last component of the log directory,
            `logs/fit/<model_name>`.

    Returns:
        A list holding a single TensorBoard callback.
    """
    log_dir = f"logs/fit/{model_name}"

    ## Make sure the parent directory of the run-specific log directory exists.
    parent_dir = os.path.dirname(log_dir)
    os.makedirs(parent_dir, exist_ok = True)

    return [tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq = 1)]

## **Baseline with FP32**

The batch size is set to the largest value that fits in the available VRAM. With a batch size of 256, a ResourceExhaustedError (out-of-memory; OOM) occurs, so 128, half that size, is used as the default batch size.

In [12]:
## Build the input pipelines with the base batch size; this also sets
## HPARAMS.steps_per_epoch and HPARAMS.validation_steps.
tr_ds, vl_ds, ts_ds = get_dataset(is_mixed_precision_training = False)

model = EfficientNet(prefix = "baseline")
model.compile(
    ## Use the configured learning rate instead of the implicit default of "adam".
    optimizer = tf.keras.optimizers.Adam(learning_rate = HPARAMS.learning_rate),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = ["acc"],
    steps_per_execution = HPARAMS.steps_per_execution,
)

## Both datasets repeat indefinitely, so the step counts delimit an epoch.
model.fit(
    tr_ds,
    validation_data = vl_ds,
    steps_per_epoch = HPARAMS.steps_per_epoch,
    validation_steps = HPARAMS.validation_steps,
    epochs = HPARAMS.epochs,
    verbose = 2,
    callbacks = get_callbacks(model.name),
)

Global batch size: 128
Steps per epoch: 313 (total 6260 batches)
Validation steps: 79 (total 1580 batches)

tr_ds.element_spec: [TensorShape([None, 224, 224, 3]), TensorShape([None])]
ts_ds.element_spec: [TensorShape([None, 224, 224, 3]), TensorShape([None])]

Epoch 1/20
313/313 - 186s - loss: 3.6864 - acc: 0.1325 - val_loss: 5.5154 - val_acc: 0.0097
Epoch 2/20
313/313 - 158s - loss: 2.8268 - acc: 0.2764 - val_loss: 5.0246 - val_acc: 0.1029
Epoch 3/20
313/313 - 158s - loss: 2.2828 - acc: 0.3902 - val_loss: 3.7533 - val_acc: 0.2573
Epoch 4/20
313/313 - 160s - loss: 1.9197 - acc: 0.4672 - val_loss: 3.6222 - val_acc: 0.2817
Epoch 5/20
313/313 - 159s - loss: 1.6460 - acc: 0.5317 - val_loss: 2.5104 - val_acc: 0.4002
Epoch 6/20
313/313 - 159s - loss: 1.4266 - acc: 0.5900 - val_loss: 2.0562 - val_acc: 0.4785
Epoch 7/20
313/313 - 160s - loss: 1.2413 - acc: 0.6359 - val_loss: 1.6746 - val_acc: 0.5484
Epoch 8/20
313/313 - 160s - loss: 1.0774 - acc: 0.6781 - val_loss: 2.3834 - val_acc: 0.4481
Epo

<tensorflow.python.keras.callbacks.History at 0x7f8df75341d0>

In [13]:
## Evaluate on the CIFAR-100 test split; returns the loss followed by the "acc" metric.
model.evaluate(ts_ds, verbose = 2)

79/79 - 6s - loss: 2.8551 - acc: 0.5242


[2.8550562858581543, 0.5242000222206116]

## **Mixed Precision with FP16**

There is no difference from the baseline except that the batch size is doubled and the number of steps per epoch is reduced accordingly. However, to apply the mixed precision policy, you must restart the runtime first. (ctrl + m, dot)

In [11]:
## Set the global Keras dtype policy to "mixed_float16" before the model is built in the next cell.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: Tesla V100-SXM2-16GB, compute capability 7.0


In [12]:
## Build the input pipelines with the doubled batch size; this also sets
## HPARAMS.steps_per_epoch and HPARAMS.validation_steps.
tr_ds, vl_ds, ts_ds = get_dataset(is_mixed_precision_training = True)

model = EfficientNet(prefix = "mixed-precision")
model.compile(
    ## Use the configured learning rate instead of the implicit default of "adam".
    optimizer = tf.keras.optimizers.Adam(learning_rate = HPARAMS.learning_rate),
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = ["acc"],
    steps_per_execution = HPARAMS.steps_per_execution,
)

## Both datasets repeat indefinitely, so the step counts delimit an epoch.
model.fit(
    tr_ds,
    validation_data = vl_ds,
    steps_per_epoch = HPARAMS.steps_per_epoch,
    validation_steps = HPARAMS.validation_steps,
    epochs = HPARAMS.epochs,
    verbose = 2,
    callbacks = get_callbacks(model.name),
)

Global batch size: 256
Steps per epoch: 157 (total 3140 batches)
Validation steps: 40 (total 800 batches)

tr_ds.element_spec: [TensorShape([None, 224, 224, 3]), TensorShape([None])]
ts_ds.element_spec: [TensorShape([None, 224, 224, 3]), TensorShape([None])]

Epoch 1/20
157/157 - 109s - loss: 3.7674 - acc: 0.1204 - val_loss: 5.0847 - val_acc: 0.0087
Epoch 2/20
157/157 - 67s - loss: 2.9313 - acc: 0.2561 - val_loss: 5.5176 - val_acc: 0.0087
Epoch 3/20
157/157 - 67s - loss: 2.4018 - acc: 0.3663 - val_loss: 7.6578 - val_acc: 0.0104
Epoch 4/20
157/157 - 67s - loss: 2.0121 - acc: 0.4535 - val_loss: 3.1316 - val_acc: 0.2826
Epoch 5/20
157/157 - 67s - loss: 1.7234 - acc: 0.5163 - val_loss: 2.7835 - val_acc: 0.3612
Epoch 6/20
157/157 - 67s - loss: 1.4798 - acc: 0.5759 - val_loss: 2.8795 - val_acc: 0.3631
Epoch 7/20
157/157 - 67s - loss: 1.2959 - acc: 0.6212 - val_loss: 1.8614 - val_acc: 0.4996
Epoch 8/20
157/157 - 67s - loss: 1.1056 - acc: 0.6725 - val_loss: 1.8476 - val_acc: 0.5216
Epoch 9/20


<tensorflow.python.keras.callbacks.History at 0x7f4ac802b550>

In [13]:
## Evaluate on the CIFAR-100 test split; returns the loss followed by the "acc" metric.
model.evaluate(ts_ds, verbose = 2)

40/40 - 6s - loss: 2.8439 - acc: 0.5059


[2.8438925743103027, 0.5059000253677368]

## **Commit to Tensorboard Dev.**

In [None]:
!tensorboard dev upload --logdir ./logs \
    --name "Experiment of 'Mixed Precision Training'" \
    --description "Implemented training results from the paper 'https://arxiv.org/abs/1710.03740'" \
    --one_shot

In [15]:
from IPython import display

## Embed the uploaded TensorBoard.dev experiment page in the notebook output.
display.IFrame(
    src = "https://tensorboard.dev/experiment/lYlje1KYQ1KULHjd2Qj9jw/",
    width = "100%",
    height = "1000px"
)