# This is a simple implementation of EfficientNet

In [81]:
import os
import tensorflow as tf
import utils
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.regularizers import l2
import myUtils
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout,GlobalAveragePooling2D

from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical


# **MBConv Block: Mobile Inverted Bottleneck Convolution**

The **MBConv block** is a fundamental building block in lightweight neural networks like **EfficientNet** and **MobileNetV2**. It is designed to achieve high accuracy with minimal computational cost, leveraging techniques such as inverted bottlenecks, depthwise separable convolutions, and optional squeeze-and-excitation (SE) blocks.

---

## **Key Components of MBConv Block**

### **1. Expansion Phase**
- **Purpose:** Expands the input tensor's channel dimensions to a higher-dimensional space, enabling the model to capture richer features.
- **Operation:** A `1 × 1` pointwise convolution is applied to increase the channel size.


---

### **2. Depthwise Convolution**
- **Purpose:** Performs spatial filtering independently on each channel.
- **Operation:** A `k × k` depthwise convolution is applied, significantly reducing the computational cost compared to traditional convolutions.


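For an `H × W` feature map, the approximate multiply–add cost is:

$$
\text{standard convolution: } H \cdot W \cdot k^2 \cdot C \cdot C' \qquad\qquad \text{depthwise convolution: } H \cdot W \cdot k^2 \cdot C
$$

Where: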
- `C`: Number of input channels.
- `C'`: Number of output channels.
- `k`: Kernel size.

---

### **3. Squeeze-and-Excitation (SE) Block**
- **Purpose:** Introduces channel-wise attention by recalibrating the importance of each channel.
- **Steps:**
  1. **Squeeze:** Apply global average pooling to compute channel-wise statistics.
  2. **Excite:** Use fully connected layers to compute channel weights, followed by a sigmoid activation.
  3. **Scale:** Multiply the original feature map by the learned weights.

---

### **4. Projection Phase**
- **Purpose:** Reduces the expanded feature map back to the desired number of output channels.
- **Operation:** A `1 × 1` pointwise convolution compresses the channel dimensions.

---

### **5. Residual Connection**
- **Purpose:** Adds the block's input to its output, but only when both of the following hold:
  1. The stride is `1`.
  2. The number of input channels equals the number of output channels.
- **Benefit:** Improves gradient flow and stability during training.


---

## **Diagram of MBConv Block**

```
Input
  |
[ Expand (1x1 Conv) ]                    -- Expansion phase
  |
[ Depthwise Conv (k x k) ]               -- Spatial convolution
  |
[ SE Block (optional) ]                  -- Channel attention
  |
[ Projection (1x1 Conv) ]                -- Reduce channels
  |
[ Residual Connection (if applicable) ]
  |
Output
```


In [82]:
class DropConnectLayer(tf.keras.layers.Layer):
    """Randomly zeroes elements of the input during training and rescales the survivors.

    The keep/drop mask is sampled independently for every element of `inputs`
    (its shape is `tf.shape(inputs)`). Kept values are divided by the keep
    probability so the expected activation is unchanged. Outside of training,
    or when the rate is not positive, the layer is the identity.

    Parameters:
    - drop_connect_rate: Probability of dropping an element; must be in [0, 1).
    - **kwargs: Forwarded to `tf.keras.layers.Layer`.

    Raises:
    - ValueError: If `drop_connect_rate` is outside [0, 1). A rate of 1 would
      make the keep probability zero and turn the rescaling into a division by zero.
    """

    def __init__(self, drop_connect_rate=0.0, **kwargs):
        super().__init__(**kwargs)
        if not 0.0 <= drop_connect_rate < 1.0:
            raise ValueError(
                f"drop_connect_rate must be in [0, 1), got {drop_connect_rate}"
            )
        self.drop_connect_rate = drop_connect_rate

    def call(self, inputs, training=False):
        # Identity at inference time or when dropping is disabled.
        if not training or self.drop_connect_rate <= 0.0:
            return inputs

        keep_prob = 1.0 - self.drop_connect_rate
        # keep_prob + U[0, 1) is >= 1 with probability keep_prob, so its floor
        # is a {0, 1} mask that keeps each element with probability keep_prob.
        random_tensor = keep_prob + tf.random.uniform(tf.shape(inputs), dtype=inputs.dtype)
        binary_tensor = tf.floor(random_tensor)
        # Rescale the kept elements so the expected value matches the input.
        return inputs * binary_tensor / keep_prob

    def get_config(self):
        """Return the layer config including `drop_connect_rate` so the layer can be re-created."""
        config = super().get_config()
        config.update({"drop_connect_rate": self.drop_connect_rate})
        return config


In [107]:
def mbConv_block(input, input_channels, output_channel, t, s, kernel_size=3, drop_rate=0.2, block_name="Block", block_num=0, survival_prob=0.8, output_resolution=None):
    """
    Constructs an MBConv block (expand -> depthwise -> SE -> project -> optional
    residual) using the functional API.

    Parameters:
    - input: Input tensor. Its channel count must equal `input_channels`,
      because the SE reshape is sized from `input_channels * t`.
    - input_channels: Number of input channels.
    - output_channel: Number of output channels.
    - t: Expansion factor (>= 1). With t == 1 the expansion convolution is skipped.
    - s: Stride for depthwise convolution.
    - kernel_size: Kernel size for depthwise convolution.
    - drop_rate: Rate of the Dropout applied to the block output before the
      residual add. It uses noise_shape (None, 1, 1, 1), i.e. one keep/drop
      decision per sample. Only used when the residual connection is active.
    - block_name: Prefix used for every layer name created by this block.
    - block_num: Block number (useful for debugging); not used by the computation.
    - survival_prob: DropConnect survival probability; currently not used by this function.
    - output_resolution: Forwarded to `myUtils.calculate_padding` when s == 2 and t > 1.

    Returns:
    - Output tensor from the MBConv block.

    Raises:
    - ValueError: If `t` is smaller than 1.
    """
    if t < 1:
        # Previously this fell through both branches below and failed later
        # with an unbound local variable; fail early with a clear message.
        raise ValueError(f"Expansion factor t must be >= 1, got {t}")

    bn_axis = 3
    # The SE bottleneck is a quarter of the block's *input* channels. Deriving
    # it from input_channels keeps the ratio right for every expansion factor
    # (the former hard-coded 1/24 of the expanded width was only correct for t == 6).
    se_ratio = 0.25

    # Expansion
    expanded_filters = input_channels * t
    padding = 'same'

    if t > 1:
        # Expansion Convolution
        x = Conv2D(
            expanded_filters,
            1,
            padding='same',
            use_bias=False,
            kernel_initializer=myUtils.CONV_KERNEL_INITIALIZER,
            name=f'{block_name}_Expansion_Conv'
        )(input)
        x = tf.keras.layers.BatchNormalization(name=f'{block_name}_Expansion_BN', axis=bn_axis)(x)
        x = tf.keras.activations.swish(x)  # Swish activation, no name needed
        if s == 2:
            # Explicit zero padding followed by a 'valid' depthwise convolution.
            # NOTE(review): the padding amount comes from myUtils.calculate_padding,
            # which is defined outside this notebook - confirm its contract there.
            pad = myUtils.calculate_padding(input_dim=input.shape[1], kernel_size=kernel_size, stride=s, output_dim=output_resolution)
            x = tf.keras.layers.ZeroPadding2D(padding=(pad, pad), name=f'{block_name}_Zero_Padding')(x)
            padding = 'valid'
    else:
        # t == 1: no expansion, the depthwise convolution works on the input directly.
        x = input

    # Depthwise Convolution
    x = tf.keras.layers.DepthwiseConv2D(
        kernel_size=kernel_size,
        padding=padding,
        strides=s,
        use_bias=False,
        depthwise_initializer=myUtils.CONV_KERNEL_INITIALIZER,
        name=f'{block_name}_Depthwise_Conv'
    )(x)
    x = tf.keras.layers.BatchNormalization(name=f'{block_name}_Depthwise_BN', axis=bn_axis)(x)
    x = tf.keras.activations.swish(x)  # Swish activation, no name needed

    # Squeeze-and-Excitation (SE)
    se = GlobalAveragePooling2D(name=f'{block_name}_SE_Global_Avg_Pool')(x)
    se = tf.keras.layers.Reshape((1, 1, expanded_filters), name=f'{block_name}_SE_Reshape')(se)
    se_filters = max(1, int(input_channels * se_ratio))
    se = Conv2D(
        se_filters,
        kernel_size=1,
        activation="swish",
        kernel_initializer=myUtils.CONV_KERNEL_INITIALIZER,
        name=f'{block_name}_SE_Conv1'
    )(se)
    se = Conv2D(
        expanded_filters,
        kernel_size=1,
        activation="sigmoid",
        kernel_initializer=myUtils.CONV_KERNEL_INITIALIZER,
        name=f'{block_name}_SE_Conv2'
    )(se)
    x = tf.keras.layers.Multiply(name=f'{block_name}_SE_Multiply')([x, se])  # Multiply the input with the SE output

    # Projection Convolution (no activation after the batch norm)
    x = Conv2D(
        output_channel,
        kernel_size=1,
        padding='same',
        use_bias=False,
        kernel_initializer=myUtils.CONV_KERNEL_INITIALIZER,
        name=f'{block_name}_Projection_Conv'
    )(x)
    x = tf.keras.layers.BatchNormalization(name=f'{block_name}_Projection_BN', axis=bn_axis)(x)

    # Skip connection (Residual connection): only possible when the block
    # preserves both the spatial size and the channel count.
    if s == 1 and input_channels == output_channel:
        x = Dropout(drop_rate,
                    noise_shape=(None, 1, 1, 1),
                    name=f'{block_name}_Dropout')(x)
        x = tf.keras.layers.Add(name=f'{block_name}_Skip_Connection')([input, x])

    return x


# EfficientNet Architecture

EfficientNet is a family of convolutional neural networks (CNNs) that achieve state-of-the-art accuracy while being computationally efficient. The key innovation behind EfficientNet is the use of a compound scaling method that uniformly scales all dimensions of depth, width, and resolution using a set of fixed scaling coefficients.

## Key Components of EfficientNet

### 1. Compound Scaling
- **Purpose:** Efficiently scales the network to achieve better performance.
- **Method:** Uses a compound coefficient to uniformly scale network depth, width, and resolution.
- **Formula:**

    $$
    \text{depth} = \alpha^{\phi}, \quad \text{width} = \beta^{\phi}, \quad \text{resolution} = \gamma^{\phi}
    $$

    where $\alpha, \beta, \gamma$ are constants determined through a grid search, and $\phi$ is the compound coefficient (the `phi` argument used in the code).

### 2. MBConv Blocks
- **Purpose:** Serve as the building blocks of EfficientNet, designed for efficiency and performance.
- **Components:**
    - Expansion phase
    - Depthwise convolution
    - Squeeze-and-Excitation (SE) block (optional)
    - Projection phase
    - Residual connection (if applicable)

### 3. Squeeze-and-Excitation (SE) Blocks
- **Purpose:** Introduce channel-wise attention to recalibrate feature maps.
- **Steps:**
    1. **Squeeze:** Global average pooling to compute channel-wise statistics.
    2. **Excite:** Fully connected layers to compute channel weights, followed by a sigmoid activation.
    3. **Scale:** Multiply the original feature map by the learned weights.

### 4. Swish Activation Function
- **Purpose:** Improves model performance by providing a smooth, non-monotonic activation function.
- **Formula:**

    $$
    \text{swish}(x) = x \cdot \text{sigmoid}(x)
    $$

## EfficientNet Variants
EfficientNet comes in several variants, each identified by a different scaling coefficient:
- **EfficientNet-B0:** Baseline model.
- **EfficientNet-B1 to B7:** Scaled versions of the baseline model, with increasing depth, width, and resolution.

## Advantages of EfficientNet
- **High Accuracy:** Achieves state-of-the-art performance on various benchmarks.
- **Computational Efficiency:** Requires fewer parameters and FLOPs compared to other models with similar accuracy.
- **Scalability:** The compound scaling method allows for easy scaling of the model to meet different resource constraints.

## Diagram of EfficientNet Architecture

```
Input
    |
[ Stem (Conv3x3) ]
    |
[ MBConv Block 1 ]
    |
[ MBConv Block 2 ]
    |
[ MBConv Block 3 ]
    |
[ MBConv Block 4 ]
    |
[ MBConv Block 5 ]
    |
[ MBConv Block 6 ]
    |
[ MBConv Block 7 ]
    |
[ Head (Conv1x1) ]
    |
[ Fully Connected Layer ]
    |
Output
```

In [108]:
def scaledResolution(phi):
    """
    Look up the square input resolution (in pixels) for EfficientNet-B{phi}.

    Parameters:
    - phi: Variant index, 0 through 7.

    Returns:
    - The input side length for that variant (224 for B0 up to 600 for B7).

    Raises:
    - KeyError: If `phi` is not one of 0..7.
    """
    # Position in the tuple is the variant index: B0, B1, ..., B7.
    side_lengths = (224, 240, 260, 300, 380, 456, 528, 600)
    resolution_by_phi = dict(enumerate(side_lengths))
    return resolution_by_phi[phi]

In [109]:
def round_filters(filters, width_coefficient, divisor=8):
    """
    Scale a filter count by the width coefficient and round it to a multiple of `divisor`.

    The scaled value is rounded to the nearest multiple of `divisor` (never
    below `divisor` itself). If that rounding would shrink the scaled width by
    more than 10%, the result is bumped up by one `divisor`, so rounding never
    removes a significant share of the requested capacity.

    Parameters:
    - filters: Base number of filters.
    - width_coefficient: Multiplicative width scaling factor.
    - divisor: Granularity of the result (default 8, the previous fixed value).

    Returns:
    - The rounded filter count as an int.
    """
    filters *= width_coefficient
    new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
    # Guard against rounding down by more than 10% of the scaled width.
    if new_filters < 0.9 * filters:
        new_filters += divisor
    return int(new_filters)


In [110]:
def dropOut_rate(phi):
    """
    Look up the classifier-head dropout rate for EfficientNet-B{phi}.

    Parameters:
    - phi: Variant index, 0 through 7.

    Returns:
    - The dropout rate for that variant (0.2 for B0/B1 up to 0.5 for B6/B7).

    Raises:
    - KeyError: If `phi` is not one of 0..7.
    """
    # One entry per variant, ordered B0..B7; the rate steps up every two variants.
    rates_in_order = (0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5)
    rate_by_phi = dict(zip(range(len(rates_in_order)), rates_in_order))
    return rate_by_phi[phi]

In [111]:
# `math` is needed by efficientNet() below, which uses it to round the per-stage repeat counts.
import math
# Scratch check of the rounding behaviour: floor(1.2) evaluates to 1.
math.floor(1.2)
    

1

In [114]:
def efficientNet(input_shape=(224,224,3), num_classes=1000, phi=1):
    """
    Build an EfficientNet-style classifier with the Keras functional API.

    Parameters:
    - input_shape: Only the channel count (`input_shape[2]`) is used; the
      spatial size is taken from `scaledResolution(phi)`.
    - num_classes: Size of the softmax output layer.
    - phi: Compound coefficient (0..7). It selects the input resolution and the
      head dropout rate, scales the per-stage repeat counts by `1.2 ** phi`
      and the stem width by `1.1 ** phi`.

    Returns:
    - An uncompiled `tf.keras.Model` named 'EfficientNet_Model'.

    Notes:
    - Repeat counts are rounded up with `ceil(base_repeats * depth)`. The
      former `floor(...)` combined with `range(repeats + 1)` gave the same
      counts for non-integer products but one extra block per stage whenever
      the product was an integer (e.g. phi=0).
    - NOTE(review): only the stem is width-scaled; the stage output widths and
      the 1280-filter head stay at their base values. The first stage now reads
      its input channel count from the stem, so a wider stem (phi >= 2) no
      longer produces a channel mismatch.
    """
    alpha = 1.2  # base of the depth multiplier
    beta = 1.1   # base of the width multiplier
    bn_axis = 3

    depth = alpha ** phi
    width = beta ** phi
    scaled_resolution = scaledResolution(phi)
    scaled_input_shape = (scaled_resolution, scaled_resolution, input_shape[2])

    inputs = tf.keras.layers.Input(shape=scaled_input_shape, name='Input_Layer')
    # Preprocessing Layers
    x = tf.keras.layers.Normalization(name='Normalization')(inputs)
    x = tf.keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1, name='Rescaling_1')(x)
    x = tf.keras.layers.ZeroPadding2D(padding=((0, 1), (0, 1)), name='ZeroPadding')(x)

    # Stem
    stem_filters = round_filters(32, width)
    x = Conv2D(stem_filters, 3, strides=2, padding='valid', use_bias=False, name='Stem_Conv')(x)
    x = tf.keras.layers.BatchNormalization(name='Stem_BN', axis=bn_axis)(x)
    x = tf.keras.activations.swish(x)

    # Stage configuration: (output_channels, expansion t, stride s, base repeats, kernel size)
    stage_specs = [
        (16, 1, 1, 1, 3),   # Stage 1
        (24, 6, 2, 2, 3),   # Stage 2
        (40, 6, 2, 2, 5),   # Stage 3
        (80, 6, 2, 3, 3),   # Stage 4
        (112, 6, 1, 3, 5),  # Stage 5
        (192, 6, 2, 4, 5),  # Stage 6
        (320, 6, 1, 1, 3),  # Stage 7
    ]

    # MBConv Blocks
    # Channel count of the tensor entering the next block; starts at the stem
    # width so the first stage always matches the tensor it actually receives.
    current_channels = stem_filters
    for stage_idx, (output_channels, t, s, base_repeats, kernel) in enumerate(stage_specs):
        repeats = math.ceil(base_repeats * depth)
        # Computed once per stage from the resolution entering the stage; it is
        # only consumed by the first (possibly strided) block.
        output_shape = myUtils.compute_single_layer_output(initial_resolution=x.shape[1], kernel_size=kernel, stride=s)
        for block_idx in range(repeats):
            x = mbConv_block(
                x,
                input_channels=current_channels,
                output_channel=output_channels,
                t=t,
                # Only the first block of a stage applies the stage stride.
                s=s if block_idx == 0 else 1,
                kernel_size=kernel,
                block_name=f'Block{stage_idx+1}{chr(97+block_idx)}',
                block_num=block_idx,
                output_resolution=output_shape
            )
            current_channels = output_channels

    # Head
    # The head layer names carry the number of stages (7), as before.
    num_stages = len(stage_specs)
    x = Conv2D(1280, 1, padding='same', use_bias=False, name=f'Head_Conv_{num_stages}', kernel_initializer=myUtils.CONV_KERNEL_INITIALIZER)(x)
    x = tf.keras.layers.BatchNormalization(name=f'Head_BN_{num_stages}', axis=bn_axis)(x)
    x = tf.keras.activations.swish(x)

    x = tf.keras.layers.GlobalAveragePooling2D(name='Global_Avg_Pool')(x)
    drop_rate = dropOut_rate(phi)
    x = Dropout(drop_rate, name='Dropout')(x)
    x = tf.keras.layers.Dense(num_classes, activation='softmax', name='Output_Dense', kernel_initializer=myUtils.DENSE_KERNEL_INITIALIZER)(x)

    model = tf.keras.Model(inputs=inputs, outputs=x, name='EfficientNet_Model')

    return model


In [115]:
# Build the custom network for 10 output classes; phi=1 selects the 240x240 input resolution.
model = efficientNet(num_classes=10,phi=1)

In [99]:
# Print the layer-by-layer summary of the custom model built above.
model.summary()


In [116]:
def plot_history(history):
    """
    Plot training vs. validation curves side by side: accuracy on the left, loss on the right.

    Parameters:
    - history: Object whose `.history` mapping holds the per-epoch series
      'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    plt.figure(figsize=(12, 4))

    # (metric key in history.history, panel title / y-axis label)
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for position, (metric, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'train {metric}')
        plt.plot(history.history[f'val_{metric}'], label=f'validation {metric}')
        plt.title(heading)
        plt.xlabel('Epoch')
        plt.ylabel(heading)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [117]:
def evaluate_model(model, test_dataset):
    """
    Evaluate `model` on `test_dataset` and print the resulting accuracy and loss.

    Parameters:
    - model: Object whose `evaluate(dataset)` returns a `(loss, accuracy)` pair.
    - test_dataset: Dataset passed straight through to `model.evaluate`.
    """
    test_loss, test_accuracy = model.evaluate(test_dataset)
    # Accuracy is reported first, then loss.
    for label, value in (("accuracy", test_accuracy), ("loss", test_loss)):
        print(f"Test {label}: {value}")


In [118]:
def plot_predictions(model, dataset):
    """
    Show up to nine images from the first batch of `dataset` in a 3x3 grid,
    each titled with its actual and predicted CIFAR-10 class.

    Parameters:
    - model: Model whose `predict(images)` returns per-class scores for a batch.
    - dataset: Dataset yielding `(images, labels)` batches with one-hot labels
      (the label index is recovered with `argmax`).

    Notes:
    - Batches with fewer than nine images are handled by plotting only the
      images that exist (previously this raised an index error).
    - Floating-point images with values above 1 are divided by 255 for display
      only, because `imshow` clips float RGB data to [0, 1].
      NOTE(review): this assumes such images are on a 0-255 scale - confirm
      for datasets other than the CIFAR-10 pipeline in this notebook.
    """
    plt.figure(figsize=(12, 12))
    classes = [
        "airplane", "automobile", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck"
    ]
    max_panels = 9  # 3 x 3 grid
    for images, labels in dataset.take(1):
        pred = model.predict(images)
        for j in range(min(max_panels, images.shape[0])):
            plt.subplot(3, 3, j + 1)
            image = np.asarray(images[j])
            if np.issubdtype(image.dtype, np.floating) and image.max() > 1.0:
                image = image / 255.0
            plt.imshow(image)
            plt.title(f"Actual: {classes[labels[j].numpy().argmax()]}\nPredicted: {classes[pred[j].argmax()]}")
            plt.axis("off")

    plt.tight_layout()
    plt.show()

In [119]:
# Locations of the TinyImageNet train/val folders, relative to the notebook's working directory.
tiny_imagenet_path = "tiny-imagenet-200"
train_dir = os.path.join(tiny_imagenet_path, "train")
val_dir = os.path.join(tiny_imagenet_path, "val")

In [None]:
# Parameters
# NOTE(review): img_size and batch_size are consumed by the TinyImageNet generators in the
# next cell. num_classes is not referenced by any later cell visible in this notebook (the
# model above is built with 10 classes), and batch_size is re-assigned by the CIFAR-10 cell.
img_size = (240, 240)  # Resize TinyImageNet images to match EfficientNetB1 input
batch_size = 32
num_classes = 200  # TinyImageNet has 200 classes

In [None]:
# Image data generators
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    horizontal_flip=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
)

val_datagen = ImageDataGenerator(rescale=1.0/255)

# Load training and validation data
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode="categorical"
)

val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode="categorical"
)

In [120]:
# Adam with categorical cross-entropy; the labels are one-hot encoded by preprocess() below.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [121]:
def preprocess(data , labels, num_classes=10, target_shape=(240, 240)):
    """
    Resize an image (or batch of images) and one-hot encode its label(s).

    Written with TensorFlow ops only so it can run inside `tf.data.Dataset.map`
    (the NumPy-oriented `to_categorical` used previously is not guaranteed to
    accept the symbolic tensors a dataset map passes in).

    Parameters:
    - data: Image tensor accepted by `tf.image.resize`.
    - labels: Integer class label(s); size-1 dimensions are squeezed away.
    - num_classes: Depth of the one-hot encoding.
    - target_shape: (height, width) to resize to.

    Returns:
    - (resized float images, float32 one-hot labels)
    """
    data = tf.image.resize(data, target_shape)
    # Drop size-1 dimensions: when mapped per example (before batching) a
    # label of shape (1,) becomes a scalar; a batched (N, 1) becomes (N,).
    labels = tf.squeeze(labels)
    # Cast so any integer-valued label dtype is accepted by tf.one_hot.
    labels = tf.one_hot(tf.cast(labels, tf.int32), num_classes)
    return data, labels

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
# Hold out the last 10,000 training images as the validation split.
(x_val, y_val) = x_train[40000:], y_train[40000:]

x_train = x_train[:40000]
y_train = y_train[:40000]


def make_dataset(images, labels, batch_size=32, shuffle=False):
    """
    Build a batched, prefetched tf.data pipeline that applies `preprocess` to every example.

    Parameters:
    - images, labels: Arrays sliced along their first axis into examples.
    - batch_size: Number of examples per batch.
    - shuffle: When True, examples are reshuffled on every pass over the
      dataset. Shuffling happens before `preprocess`, so the buffer holds the
      small raw images rather than the resized ones.

    Returns:
    - A `tf.data.Dataset` yielding `(images, one_hot_labels)` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(images))
    return (
        dataset
        .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )


# Create datasets and preprocess them
batch_size = 32
# Only the training split is shuffled; evaluation order does not matter.
train_dataset = make_dataset(x_train, y_train, batch_size, shuffle=True)
test_dataset = make_dataset(x_test, y_test, batch_size)
val_dataset = make_dataset(x_val, y_val, batch_size)

# Example: Iterate over the dataset
for batch_images, batch_labels in train_dataset.take(1):  # Process only one batch for demonstration
    print(f"Batch image shape: {batch_images.shape}")  # Expected: (batch_size, 240, 240, 3)
    print(f"Batch label shape: {batch_labels.shape}")  # Expected: (batch_size, 10)


Batch image shape: (32, 240, 240, 3)
Batch label shape: (32, 10)


In [122]:
# Train the custom model for 20 epochs, evaluating on the validation split after each epoch.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=20)

Epoch 1/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 321s 200ms/step - accuracy: 0.1718 - loss: 2.6452 - val_accuracy: 0.3702 - val_loss: 1.6643
Epoch 2/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 244s 195ms/step - accuracy: 0.4038 - loss: 1.6309 - val_accuracy: 0.5627 - val_loss: 1.2247
Epoch 3/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 248s 198ms/step - accuracy: 0.5735 - loss: 1.1998 - val_accuracy: 0.6918 - val_loss: 0.8912
Epoch 4/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 249s 199ms/step - accuracy: 0.6823 - loss: 0.9024 - val_accuracy: 0.7346 - val_loss: 0.7805
Epoch 5/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 248s 199ms/step - accuracy: 0.7536 - loss: 0.7024 - val_accuracy: 0.7812 - val_loss: 0.6333
Epoch 6/20
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 247s 198ms/step - accuracy: 0.8071 - loss: 0.5590 - val_accuracy: 0.7846 - val_loss:

In [None]:
# Persist the trained custom model to a single .keras file in the working directory.
model.save('efficientNetB1.keras')

In [None]:
def compute_single_layer_output(initial_resolution, kernel_size, stride):
    """
    Computes the output resolution of a single strided layer as ceil(input / stride).

    This is the size produced by 'same'-style padding, where the output depends
    only on the input size and the stride. `kernel_size` does not enter the
    formula; it is accepted so the signature stays unchanged for callers.

    Parameters:
    - initial_resolution: Tuple of (height, width) representing the input resolution.
    - kernel_size: Size of the kernel (int). Unused by the computation.
    - stride: Stride of the convolution (int).

    Returns:
    - Tuple representing the output resolution (height, width).
    """
    height, width = initial_resolution

    # Ceiling division done with floor division, so integer inputs stay exact
    # (true division goes through floats and loses precision for large values).
    out_height = int((height + stride - 1) // stride)
    out_width = int((width + stride - 1) // stride)

    return (out_height, out_width)

# Example usage:
initial_resolution = (15, 15)
kernel_size = 3
stride = 2

output_resolution = compute_single_layer_output(initial_resolution, kernel_size, stride)
print(f"Output resolution: {output_resolution}")


In [None]:
# Keras' built-in EfficientNetB1 with randomly initialised weights (weights=None) and a
# 10-class head, used as a baseline for the custom model above.
from tensorflow.keras.applications import EfficientNetB1
model_efficientnet = EfficientNetB1(weights=None,classes=10)



In [None]:
# Print the baseline's layer summary for comparison with the custom model's summary.
model_efficientnet.summary()

In [None]:
# Same optimizer, loss and metric as the custom model.
model_efficientnet.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
    # Train the Keras baseline on the same datasets and for the same number of epochs as the custom model.
    # NOTE(review): this cell is indented; IPython is believed to strip a uniform leading
    # indent from a cell, but the line would be an IndentationError in a plain .py script - confirm and dedent.
    history_efficientnet = model_efficientnet.fit(train_dataset, validation_data=val_dataset, epochs=20)