# **EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks**

Tan, M., & Le, Q. (2019, May). EfficientNet: Rethinking model scaling for convolutional neural networks. In International Conference on Machine Learning (pp. 6105-6114). PMLR.

Ref.

*https://github.com/qubvel/efficientnet/tree/8984e988ecccd9c3a15be2e793991845619a8a26*

In [11]:
import tensorflow as tf
import math

tf.__version__

'2.4.1'

In [2]:
def SE_Block(
    x,
    reduction_rate = 24,
    apply_type = "transformed",
):
    """Apply a squeeze-and-excitation (SE) gate to a feature map.

    The input is globally average-pooled ("squeeze"), passed through a
    two-layer bottleneck ("excitation") and the resulting per-channel gate
    is multiplied back onto the input.

    Args:
        x: Keras tensor accepted by ``GlobalAveragePooling2D``; its last axis
            is treated as the channel axis (assumes channels-last layout).
        reduction_rate: Divisor applied to the channel count to obtain the
            bottleneck width. Must divide the channel count exactly.
        apply_type: ``"textbook"`` builds the bottleneck from ``Dense``
            layers, ``"transformed"`` from 1x1 ``Conv2D`` layers.
            Case-insensitive.

    Returns:
        Keras tensor with the same shape as ``x``, rescaled channel-wise.

    Raises:
        AssertionError: If the channel count is not divisible by
            ``reduction_rate`` or ``apply_type`` is not a supported value.
    """
    channels = x.shape[-1]
    mode = apply_type.lower()

    assert not (channels % reduction_rate), f"x.shape {x.shape} must be divided by reduction_rate {reduction_rate}"
    assert mode in ["textbook", "transformed"]

    residual = x
    ## Bottleneck width is derived from the input's channel count so it does
    ## not depend on how Reshape resolves its -1 dimension.
    squeezed_channels = channels // reduction_rate

    ## Squeeze: one value per channel, kept as a (1, 1, C) map so that it
    ## broadcasts against the input in the final multiplication.
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Reshape((1, 1, -1))(x)

    if mode == "textbook":
        ## Fully-connected formulation of the excitation bottleneck.
        x = tf.keras.layers.Dense(squeezed_channels)(x)
        x = tf.keras.layers.Activation(tf.nn.relu6)(x)

        x = tf.keras.layers.Dense(channels)(x)

    else:
        ## Equivalent formulation with 1x1 convolutions on the (1, 1, C) map.
        x = tf.keras.layers.Conv2D(squeezed_channels, 1, padding = "same")(x)
        x = tf.keras.layers.Activation(tf.nn.relu6)(x)

        x = tf.keras.layers.Conv2D(channels, 1, padding = "same")(x)

    ## The gate must be an independent value in (0, 1) per channel, hence
    ## sigmoid. A softmax would force the gates to sum to 1 across channels
    ## and shrink every activation by roughly 1 / channels.
    x = tf.keras.layers.Activation(tf.nn.sigmoid)(x)

    x = tf.keras.layers.Multiply()([x, residual]) ## channel-wise multiplication

    return x

In [8]:
def round_filters(filters, width_coefficient, depth_divisor = 8):
    """Scale a filter count by the width multiplier and snap it to the divisor.

    The scaled count is rounded to the nearest multiple of ``depth_divisor``
    (never below one multiple). If rounding loses more than 10% of the scaled
    value, one extra multiple is added.
    """

    target = filters * width_coefficient
    half_step = depth_divisor / 2

    ## Nearest multiple of depth_divisor, with a floor of one multiple.
    snapped = (int(target + half_step) // depth_divisor) * depth_divisor
    if snapped < depth_divisor:
        snapped = depth_divisor

    ## Rounding down must not cost more than 10% of the requested width.
    if snapped < 0.9 * target:
        snapped = snapped + depth_divisor

    return int(snapped)


def round_repeats(repeats, depth_coefficient):
    """Scale a block repeat count by the depth multiplier, rounding up."""

    scaled_repeats = depth_coefficient * repeats
    return int(math.ceil(scaled_repeats))


def ConvBNReLU(
    x, 
    layer_type, 
    output_channels = None,
    kernel_size = 3,
    strides = 1, 
    activation_fn = tf.nn.relu6, 
    expansion_factor = 6, 
    reduction_rate = 24,
):
    """Build one convolution + batch-norm (+ activation) unit.

    Args:
        x: Input Keras tensor; its last axis is treated as the channel axis.
        layer_type: Which unit to build (case-insensitive):
            ``"expansion"`` - 1x1 conv to ``channels * expansion_factor``,
            batch-norm, activation.
            ``"depthwise"`` - depthwise conv, batch-norm, activation, then
            an ``SE_Block``.
            ``"pointwise"`` - 1x1 conv to ``output_channels`` and batch-norm,
            with no activation (linear projection).
            ``"naive"`` - regular conv to ``output_channels``, batch-norm,
            activation.
        output_channels: Number of output filters; required for
            ``"pointwise"`` and ``"naive"``, ignored otherwise.
        kernel_size: Kernel size for ``"depthwise"`` and ``"naive"``.
        strides: Stride for ``"depthwise"`` and ``"naive"``.
        activation_fn: Activation applied after batch-norm where used.
        expansion_factor: Channel multiplier for ``"expansion"``.
        reduction_rate: Preferred SE reduction rate for ``"depthwise"``.

    Returns:
        The output Keras tensor of the unit.

    Raises:
        AssertionError: If ``layer_type`` is unknown, or ``output_channels``
            is missing for a layer type that needs it.
    """
    kind = layer_type.lower()
    assert kind in ["expansion", "depthwise", "pointwise", "naive"]

    if kind == "expansion":
        ## Conv 1x1
        x = tf.keras.layers.Conv2D(x.shape[-1] * expansion_factor, 1, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

    elif kind == "depthwise":
        ## Dwise conv (kernel_size x kernel_size)
        x = tf.keras.layers.DepthwiseConv2D(kernel_size, strides = strides, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

        ## The SE block is attached only after the depthwise convolution.
        ## Fall back to a reduction rate of 4 when the channel count is not
        ## a multiple of the requested rate; SE_Block still asserts that the
        ## rate finally chosen divides the channel count.
        scaled_reduction_rate = 4 if x.shape[-1] % reduction_rate else reduction_rate
        
        x = SE_Block(x, scaled_reduction_rate)
    
    elif kind == "pointwise":
        ## Conv 1x1
        assert output_channels is not None, "output_channels is required for a 'pointwise' layer"
        x = tf.keras.layers.Conv2D(output_channels, 1, padding = "same")(x) ## no activation, i.e. use linear.
        x = tf.keras.layers.BatchNormalization()(x)

    else: ## naive
        assert output_channels is not None, "output_channels is required for a 'naive' layer"
        x = tf.keras.layers.Conv2D(output_channels, kernel_size, strides = strides, padding = "same")(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(activation_fn)(x)

    return x


def InvertResidualBlock(
    x, 
    output_channels,
    kernel_size = 3, 
    strides = 1,
    expansion_factor = 6,
):
    """Build one inverted-residual block: expansion, depthwise, pointwise.

    A skip connection from the block input is added to the output only when
    the block keeps the resolution (``strides == 1``) and the output channel
    count equals the input channel count.

    Args:
        x: Input Keras tensor; its last axis is treated as the channel axis.
        output_channels: Number of filters produced by the pointwise stage.
        kernel_size: Kernel size forwarded to every stage.
        strides: Stride of the depthwise stage; must be 1 or 2.
        expansion_factor: Channel multiplier forwarded to every stage.

    Returns:
        The output Keras tensor of the block.

    Raises:
        AssertionError: If ``strides`` is neither 1 nor 2.
    """
    assert strides in [1, 2], f"Argument 'strides' must be 1 or 2, not {strides}."

    shortcut = x
    ## Keyword arguments that every stage receives unchanged.
    shared = dict(kernel_size = kernel_size, expansion_factor = expansion_factor)

    out = ConvBNReLU(shortcut, "expansion", **shared)
    out = ConvBNReLU(out, "depthwise", strides = strides, **shared)
    out = ConvBNReLU(out, "pointwise", output_channels = output_channels, **shared)

    ## No skip connection when the block changes resolution or channel count.
    if strides != 1 or out.shape[-1] != shortcut.shape[-1]:
        return out

    return tf.keras.layers.Add()([out, shortcut])

In [9]:
## NOTE(review): this constant is not referenced by any code visible in this
## notebook; confirm whether it is still needed.
IMAGE_SIZE = [224, 224]

def EfficientNet(
    compound_coefficient = 0, 
) -> tf.keras.Model:
    """Build an EfficientNet-B{compound_coefficient} Keras functional model.

    Args:
        compound_coefficient: Variant index (phi), an integer in [0, 7]. It
            selects the width multiplier, depth multiplier and input
            resolution from a fixed per-variant table.

    Returns:
        A ``tf.keras.Model`` named ``"EfficientNet-B<phi>"`` that maps a
        square 3-channel image to a 1000-way softmax output.

    Raises:
        AssertionError: If ``compound_coefficient`` is not in [0, 7].
    """

    assert compound_coefficient in range(8), \
        f"Compound scaling coefficient phi must be in range [0, 7], not {compound_coefficient}"

    def EfficientNet_Baseline(
        depth_coefficient, 
        width_coefficient, 
        image_size, 
        model_name, 
        reduction_rate = 24,
        embedding_dims = 1000, 
        apply_classifier = True
    ) -> tf.keras.Model:
        """Assemble the network for one (depth, width, resolution) setting.

        ``reduction_rate`` is accepted but currently not forwarded to the
        blocks. ``embedding_dims`` is the width of the final Dense layer and
        ``apply_classifier`` toggles the trailing softmax.
        """
        ## Input resolution comes from the per-variant table.
        x = model_input = tf.keras.layers.Input(shape = (image_size, image_size, 3))

        ## Entry flow (stem). Its width follows the width multiplier too.
        x = ConvBNReLU(
            x, "naive", kernel_size = 3, strides = 2,
            output_channels = round_filters(32, width_coefficient))

        ## Middle flow.
        ## It means (filters, kernel size, repeats, stride, expansion factor).
        ## The first stage (16 filters, expansion factor 1) is part of the
        ## table so that it is scaled like every other stage.
        args = [
            (16,  3, 1, 1, 1),
            (24,  3, 2, 2, 6),
            (40,  5, 2, 2, 6),
            (80,  3, 3, 2, 6),
            (112, 5, 3, 1, 6),
            (192, 5, 4, 2, 6),
            (320, 3, 1, 1, 6)]
        
        for (filters, kernel_size, repeats, strides, expansion_factor) in args:
            ## Newly scaled parameters are delivered while retaining the existing arguments.
            scaled_filters = round_filters(filters, width_coefficient)
            scaled_repeats = round_repeats(repeats, depth_coefficient)

            ## The first layer of each sequence has a stride s and all others use stride 1.
            x = InvertResidualBlock(
                x, scaled_filters, kernel_size = kernel_size,
                strides = strides, expansion_factor = expansion_factor)
            for _ in range(1, scaled_repeats):
                x = InvertResidualBlock(
                    x, scaled_filters, kernel_size = kernel_size,
                    strides = 1, expansion_factor = expansion_factor)

        ## Exit flow. The head width follows the width multiplier as well.
        x = ConvBNReLU(
            x, "naive", kernel_size = 1,
            output_channels = round_filters(1_280, width_coefficient))
        x = tf.keras.layers.GlobalAveragePooling2D()(x)

        model_output = x = tf.keras.layers.Dense(embedding_dims)(x) ## fixed
        if apply_classifier:
            model_output = tf.keras.layers.Softmax()(model_output)
            
        return tf.keras.Model(
            inputs = model_input,
            outputs = model_output,
            name = model_name)

    ## The textbook compound-scaling rule is
    ##     depth      = 1.2  ** phi
    ##     width      = 1.1  ** phi
    ##     resolution = 1.15 ** phi
    ## but the released variants use the slightly adjusted values below.

    ## phi: (width_coefficient, depth_coefficient, resolution)
    coefficient_args = {
        0: (1.0, 1.0, 224),
        1: (1.0, 1.1, 240),
        2: (1.1, 1.2, 260),
        3: (1.2, 1.4, 300),
        4: (1.4, 1.8, 380),
        5: (1.6, 2.2, 456),
        6: (1.8, 2.6, 528),
        7: (2.0, 3.1, 600)}

    ## Unpack by name: the table is ordered (width, depth, resolution) while
    ## the builder's positional order is (depth, width, image_size), so a
    ## positional splat would swap the two multipliers.
    width_coefficient, depth_coefficient, resolution = coefficient_args[compound_coefficient]
        
    return EfficientNet_Baseline(
        depth_coefficient = depth_coefficient,
        width_coefficient = width_coefficient,
        image_size = resolution,
        model_name = f"EfficientNet-B{compound_coefficient}")

In [15]:
## Smoke test: build the B0 variant and print its layer summary.
tmp = EfficientNet(compound_coefficient = 0)
tmp.summary()

Model: "EfficientNet-B0"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv2d_263 (Conv2D)             (None, 112, 112, 32) 896         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_199 (BatchN (None, 112, 112, 32) 128         conv2d_263[0][0]                 
__________________________________________________________________________________________________
activation_263 (Activation)     (None, 112, 112, 32) 0           batch_normalization_199[0][0]    
____________________________________________________________________________________

In [16]:
## Drop the reference to the smoke-test model built above.
del tmp