# Multiclass semantic segmentation using DeepLabV3+

**Author:** [Soumik Rakshit](http://github.com/soumik12345)<br>
**Date created:** 2021/08/31<br>
**Last modified:** 2023/01/06<br>
**Description:** Implement DeepLabV3+ architecture for Multi-class Semantic Segmentation.

## Introduction

Semantic segmentation, whose goal is to assign a semantic label to every pixel in an image,
is an essential computer vision task. In this example, we implement
the **DeepLabV3+** model for multi-class semantic segmentation. DeepLabV3+ is a fully-convolutional
architecture that performs well on semantic segmentation benchmarks.

### References:

- [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/pdf/1802.02611.pdf)
- [Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587)
- [DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs](https://arxiv.org/abs/1606.00915)

## Downloading the data

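The original version of this example uses the [Crowd Instance-level Human Parsing Dataset](https://arxiv.org/abs/1811.12596)
for training the model. The Crowd Instance-level Human Parsing (CIHP) dataset has 38,280 diverse human images.
Each image in CIHP is labeled with pixel-wise annotations for 20 categories, as well as instance-level identification.
This dataset can be used for the "human part segmentation" task.

**Note:** the code cells in this notebook do not download CIHP. They load image/mask pairs from a
`CRACK500-Resized` folder on Google Drive (`Train/` and `val/` splits, each with `images/` and `masks/`)
and train a single-output-channel model on them.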

In [1]:
import os
import cv2
import numpy as np
from glob import glob
from scipy.io import loadmat
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Make Google Drive available under /content/drive; the dataset paths used
# later in the notebook live below that mount point.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
# Root of the dataset on the mounted Drive. Each split ("Train", "val")
# holds an "images" folder and a "masks" folder.
path = "/content/drive/MyDrive/CRACK500-Resized"
train_images_path = os.path.join(path, "Train", "images")
train_mask_path = os.path.join(path, "Train", "masks")
val_images_path = os.path.join(path, "val", "images")
val_mask_path = os.path.join(path, "val", "masks")

In [4]:
import os
import cv2
import numpy as np

def read_data(images_path, masks_path, ext):
    """Load image/mask pairs from disk into normalized NumPy arrays.

    Walks ``images_path`` (sub-folders included). For every image file, the
    mask is looked up in the mirrored location under ``masks_path`` under
    the name ``<image stem><ext>``. Image and mask are resized to 256x256,
    the mask is converted to a single grayscale channel, and both are
    divided by 255.

    Files whose extension is not a common image extension (for example
    ``.DS_Store`` or ``Thumbs.db``) are skipped instead of being handed to
    OpenCV.

    Args:
        images_path: Directory containing the input images.
        masks_path: Directory containing the masks.
        ext: Suffix that replaces the image extension to form the mask file
            name, e.g. ``".png"`` or ``"_mask.png"``.

    Returns:
        A tuple ``(images, masks)`` of NumPy arrays. For ``N`` loaded pairs
        the masks have shape ``(N, 256, 256, 1)``; images keep the channels
        returned by ``cv2.imread``. Both arrays are empty when no image
        file is found.

    Raises:
        FileNotFoundError: If an image or its mask cannot be read.
    """
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")
    target_size = (256, 256)
    images = []
    masks = []
    count = 0

    for subdir, dirs, files in os.walk(images_path):
        # Sorting in place makes the traversal (and therefore the sample
        # order) reproducible between runs.
        dirs.sort()
        # Masks are expected in the same relative sub-folder as the image.
        relative_dir = os.path.relpath(subdir, images_path)

        for file in sorted(files):
            stem, suffix = os.path.splitext(file)
            if suffix.lower() not in image_extensions:
                continue

            image_file = os.path.join(subdir, file)
            mask_file = os.path.normpath(
                os.path.join(masks_path, relative_dir, stem + ext)
            )

            # cv2.imread returns None instead of raising when a file is
            # missing or unreadable, so check explicitly.
            image = cv2.imread(image_file)
            if image is None:
                raise FileNotFoundError(f"Could not read image: {image_file}")
            mask = cv2.imread(mask_file)
            if mask is None:
                raise FileNotFoundError(
                    f"Could not read mask for {image_file}: {mask_file}"
                )

            # Resize the image and mask
            image = cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)
            mask = cv2.resize(mask, target_size, interpolation=cv2.INTER_AREA)

            # Convert the mask to grayscale and restore an explicit channel
            # axis so masks have shape (256, 256, 1).
            mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
            mask = np.expand_dims(mask, axis=-1)

            # Scale pixel values from [0, 255] to [0, 1].
            images.append(image / 255.0)
            masks.append(mask / 255.0)

            # Lightweight progress indicator: one line every 20 samples.
            if count % 20 == 0:
                print(count)
            count += 1

    return np.array(images), np.array(masks)


In [5]:
# Load the training split; the mask suffix passed here is ".png".
train_images,train_masks=read_data(train_images_path,train_mask_path,".png")

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880


In [6]:
# Load the validation split; the mask suffix passed here is "_mask.png".
val_images,val_masks=read_data(val_images_path,val_mask_path,"_mask.png")

0
20
40


In [7]:
# Sanity check: display the array shapes (train/val images, then train/val masks).
train_images.shape,val_images.shape,train_masks.shape, val_masks.shape

((1896, 256, 256, 3),
 (50, 256, 256, 3),
 (1896, 256, 256, 1),
 (50, 256, 256, 1))

In [8]:
# from tensorflow.keras import layers


# def get_model(img_size, num_classes):
#     inputs = keras.Input(shape=img_size + (3,))

#     ### [First half of the network: downsampling inputs] ###

#     # Entry block
#     x = layers.Conv2D(32, 3, strides=2, padding="same")(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.Activation("relu")(x)

#     previous_block_activation = x  # Set aside residual

#     # Blocks 1, 2, 3 are identical apart from the feature depth.
#     for filters in [64, 128, 256]:
#         x = layers.Activation("relu")(x)
#         x = layers.SeparableConv2D(filters, 3, padding="same")(x)
#         x = layers.BatchNormalization()(x)

#         x = layers.Activation("relu")(x)
#         x = layers.SeparableConv2D(filters, 3, padding="same")(x)
#         x = layers.BatchNormalization()(x)

#         x = layers.MaxPooling2D(3, strides=2, padding="same")(x)

#         # Project residual
#         residual = layers.Conv2D(filters, 1, strides=2, padding="same")(
#             previous_block_activation
#         )
#         x = layers.add([x, residual])  # Add back residual
#         previous_block_activation = x  # Set aside next residual

#     ### [Second half of the network: upsampling inputs] ###

#     for filters in [256, 128, 64, 32]:
#         x = layers.Activation("relu")(x)
#         x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
#         x = layers.BatchNormalization()(x)

#         x = layers.Activation("relu")(x)
#         x = layers.Conv2DTranspose(filters, 3, padding="same")(x)
#         x = layers.BatchNormalization()(x)

#         x = layers.UpSampling2D(2)(x)

#         # Project residual
#         residual = layers.UpSampling2D(2)(previous_block_activation)
#         residual = layers.Conv2D(filters, 1, padding="same")(residual)
#         x = layers.add([x, residual])  # Add back residual
#         previous_block_activation = x  # Set aside next residual

#     # Add a per-pixel classification layer
#     outputs = layers.Conv2D(num_classes, 3, activation="softmax", padding="same")(x)

#     # Define the model
#     model = keras.Model(inputs, outputs)
#     return model


# # Free up RAM in case the model definition cells were run multiple times
# keras.backend.clear_session()

# # Build model
# model = get_model((256,256), 1)
# model.summary()

In [9]:
# loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# import tensorflow as tf

# # Define custom IoU metric function
# def iou_score(y_true, y_pred):
#     y_true = tf.cast(y_true > 0.5, dtype=tf.float32)
#     y_pred = tf.cast(y_pred > 0.5, dtype=tf.float32)
#     intersection = tf.reduce_sum(y_true * y_pred)
#     union = tf.reduce_sum(y_true) + tf.reduce_sum(y_pred) - intersection
#     iou = intersection / union
#     return iou
# model.compile(
#     optimizer=keras.optimizers.Adam(learning_rate=0.001),
#     loss='binary_crossentropy',
#     metrics=["accuracy",iou_score],
# )

# history = model.fit(train_images,train_masks,batch_size=2,validation_data=(val_images,val_masks), epochs=25)

In [10]:
# Side length, in pixels, of the square input the model is built for.
IMAGE_SIZE = 256
# Number of output channels of the segmentation head.
NUM_CLASSES = 1

## Creating a TensorFlow Dataset

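Training on the entire CIHP dataset with 38,280 images takes a lot of time, so the original example
uses a smaller subset of 200 images for training the model.

**Note:** this notebook does not build a `tf.data.Dataset`. The cells above load the images and masks
into in-memory NumPy arrays (1,896 training and 50 validation samples of size 256x256), which are
passed directly to `model.fit`.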

## Building the DeepLabV3+ model

DeepLabv3+ extends DeepLabv3 by adding an encoder-decoder structure. The encoder module
processes multiscale contextual information by applying dilated convolution at multiple
scales, while the decoder module refines the segmentation results along object boundaries.

![](https://github.com/lattice-ai/DeepLabV3-Plus/raw/master/assets/deeplabv3_plus_diagram.png)

**Dilated convolution:** With dilated convolution, as we go deeper in the network, we can keep the
stride constant but with larger field-of-view without increasing the number of parameters
or the amount of computation. Besides, it enables larger output feature maps, which is
useful for semantic segmentation.

The reason for using **Dilated Spatial Pyramid Pooling** is that it was shown that as the
sampling rate becomes larger, the number of valid filter weights (i.e., weights that
are applied to the valid feature region, instead of padded zeros) becomes smaller.

In [11]:

def convolution_block(
    block_input,
    num_filters=256,
    kernel_size=3,
    dilation_rate=1,
    padding="same",
    use_bias=False,
):
    """Apply Conv2D -> BatchNormalization -> ReLU to ``block_input``.

    Args:
        block_input: Symbolic Keras tensor to transform.
        num_filters: Number of convolution filters (output channels).
        kernel_size: Convolution kernel size.
        dilation_rate: Dilation (atrous) rate of the convolution.
        padding: Padding mode forwarded to ``Conv2D``.
        use_bias: Whether the convolution has a bias term.

    Returns:
        The activated output tensor.
    """
    x = layers.Conv2D(
        num_filters,
        kernel_size=kernel_size,
        dilation_rate=dilation_rate,
        # Honour the caller's choice; this was previously hard-coded to
        # "same", which silently ignored the `padding` argument.
        padding=padding,
        use_bias=use_bias,
        kernel_initializer=keras.initializers.HeNormal(),
    )(block_input)
    x = layers.BatchNormalization()(x)
    # Use a Keras layer rather than a raw `tf.nn` op so the block keeps
    # working on symbolic tensors across Keras versions.
    return layers.ReLU()(x)


def DilatedSpatialPyramidPooling(dspp_input):
    """Build the pyramid-pooling head on top of ``dspp_input``.

    Five parallel branches are computed from the same input and concatenated
    along the channel axis: an average-pooled branch (pooled over the full
    spatial extent, convolved, then bilinearly upsampled back), a 1x1
    convolution, and three 3x3 convolutions with dilation rates 6, 12 and
    18. A final 1x1 convolution block fuses the concatenated features.

    Args:
        dspp_input: Symbolic feature tensor with static height and width.

    Returns:
        The fused feature tensor.
    """
    height, width = dspp_input.shape[-3], dspp_input.shape[-2]

    # Pooled branch: one pooling window covers the whole feature map.
    pooled = layers.AveragePooling2D(pool_size=(height, width))(dspp_input)
    pooled = convolution_block(pooled, kernel_size=1, use_bias=True)
    upsample_factor = (height // pooled.shape[1], width // pooled.shape[2])
    branches = [
        layers.UpSampling2D(size=upsample_factor, interpolation="bilinear")(pooled)
    ]

    # Convolutional branches, from the plain 1x1 up to the widest dilation.
    branches.append(convolution_block(dspp_input, kernel_size=1, dilation_rate=1))
    for rate in (6, 12, 18):
        branches.append(
            convolution_block(dspp_input, kernel_size=3, dilation_rate=rate)
        )

    merged = layers.Concatenate(axis=-1)(branches)
    return convolution_block(merged, kernel_size=1)


The encoder features are first bilinearly upsampled by a factor 4, and then
concatenated with the corresponding low-level features from the network backbone that
have the same spatial resolution. For this example, we
use a ResNet50 pretrained on ImageNet as the backbone model. The encoder features are taken
from the `conv4_block6_2_relu` block of the backbone, and the low-level features from the
`conv2_block3_2_relu` block.

In [12]:

def DeeplabV3Plus(image_size, num_classes, activation=None):
    """Build a DeepLabV3+ model on an ImageNet-pretrained ResNet50 backbone.

    Args:
        image_size: Side length of the square RGB input.
        num_classes: Number of channels produced by the final 1x1 convolution.
        activation: Optional activation for the final convolution. The
            default ``None`` keeps the raw (logit) output; pass ``"sigmoid"``
            for a single-channel model trained with a probability-based
            loss such as ``"binary_crossentropy"``.

    Returns:
        A ``keras.Model`` mapping ``(image_size, image_size, 3)`` inputs to
        ``(image_size, image_size, num_classes)`` outputs.
    """
    model_input = keras.Input(shape=(image_size, image_size, 3))
    resnet50 = keras.applications.ResNet50(
        weights="imagenet", include_top=False, input_tensor=model_input
    )

    # Encoder: deep backbone features refined by the pyramid-pooling head.
    x = resnet50.get_layer("conv4_block6_2_relu").output
    x = DilatedSpatialPyramidPooling(x)

    # Decoder, step 1: bring the encoder features up to 1/4 of the input size.
    input_a = layers.UpSampling2D(
        size=(image_size // 4 // x.shape[1], image_size // 4 // x.shape[2]),
        interpolation="bilinear",
    )(x)
    # Decoder, step 2: low-level backbone features, reduced to 48 channels.
    input_b = resnet50.get_layer("conv2_block3_2_relu").output
    input_b = convolution_block(input_b, num_filters=48, kernel_size=1)

    x = layers.Concatenate(axis=-1)([input_a, input_b])
    x = convolution_block(x)
    x = convolution_block(x)
    # Decoder, step 3: upsample back to the full input resolution.
    x = layers.UpSampling2D(
        size=(image_size // x.shape[1], image_size // x.shape[2]),
        interpolation="bilinear",
    )(x)
    model_output = layers.Conv2D(
        num_classes, kernel_size=(1, 1), padding="same", activation=activation
    )(x)
    return keras.Model(inputs=model_input, outputs=model_output)


# The notebook trains with "binary_crossentropy", thresholds predictions at
# 0.5 and visualises them as [0, 1] maps, so the head must emit probabilities.
model = DeeplabV3Plus(
    image_size=IMAGE_SIZE, num_classes=NUM_CLASSES, activation="sigmoid"
)
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, 262, 262, 3)  0           ['input_1[0][0]']                
                                                                                                  
 conv1_conv (Conv2D)            (None, 128, 128, 64  9472        ['conv1_pad[0][0]']              
                                )                 

In [13]:
import tensorflow as tf

# Define custom IoU metric function
def iou_score(y_true, y_pred):
    """Intersection-over-union of the binarized target and prediction.

    Both tensors are thresholded at 0.5, and the IoU is computed over all
    elements of the batch at once.

    Args:
        y_true: Ground-truth mask tensor.
        y_pred: Predicted mask tensor, compared against the 0.5 threshold.

    Returns:
        A scalar float tensor. When both binarized masks are empty the
        result is 1.0 rather than NaN.
    """
    y_true = tf.cast(y_true > 0.5, dtype=tf.float32)
    y_pred = tf.cast(y_pred > 0.5, dtype=tf.float32)
    intersection = tf.reduce_sum(y_true * y_pred)
    union = tf.reduce_sum(y_true) + tf.reduce_sum(y_pred) - intersection
    # A tiny smoothing term avoids 0/0 (NaN) when neither mask has any
    # positive pixel; a single NaN batch would otherwise poison the metric's
    # running mean.
    smooth = 1e-7
    return (intersection + smooth) / (union + smooth)

In [None]:
# NOTE: "binary_crossentropy" uses from_logits=False by default, i.e. it
# expects the model to output probabilities in [0, 1] (a sigmoid on the last
# layer). The previously constructed SparseCategoricalCrossentropy object was
# never passed to compile() and has been dropped to avoid confusion.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy", iou_score],
)

# Train on the in-memory arrays, evaluating on the validation split after
# every epoch.
history = model.fit(
    train_images,
    train_masks,
    batch_size=2,
    validation_data=(val_images, val_masks),
    epochs=25,
)


Epoch 1/25
 36/948 [>.............................] - ETA: 1:11:30 - loss: 1.1219 - accuracy: 0.8589 - iou_score: 0.0655

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Predict masks for the whole validation split using the trained model
predicted_masks = model.predict(val_images)

num_samples = len(val_images)
# How many prediction/ground-truth pairs to display.
num_to_show = 1

for i in range(min(num_to_show, num_samples)):
    # Take the single mask channel so imshow receives a 2-D array, clip to
    # [0, 1] so the uint8 cast cannot wrap around, then rescale to [0, 255].
    predicted_mask = np.clip(predicted_masks[i][..., 0], 0.0, 1.0) * 255.0
    actual_mask = np.clip(val_masks[i][..., 0], 0.0, 1.0) * 255.0

    # Convert the masks to uint8 for visualization
    predicted_mask = predicted_mask.astype(np.uint8)
    actual_mask = actual_mask.astype(np.uint8)

    # Display the predicted and actual masks side by side
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    axes[0].imshow(predicted_mask, cmap='gray')
    axes[0].set_title('Predicted Mask')
    axes[0].axis('off')
    axes[1].imshow(actual_mask, cmap='gray')
    axes[1].set_title('Actual Mask')
    axes[1].axis('off')

    plt.tight_layout()
    plt.show()


In [None]:
# NOTE(review): `ggggg` is not defined anywhere in the visible notebook, so
# executing this cell raises NameError. Presumably a scratch cell, or a
# deliberate stop for "Run all" — confirm and remove if unintended.
ggggg

## Training

We train the model using binary crossentropy as the loss function (the model has a single
output channel) and Adam as the optimizer, tracking accuracy and a custom IoU score as metrics.

In [None]:

# One figure per tracked quantity: the history key doubles as the y-axis label.
for history_key, plot_title in (
    ("loss", "Training Loss"),
    ("accuracy", "Training Accuracy"),
    ("val_loss", "Validation Loss"),
    ("val_accuracy", "Validation Accuracy"),
):
    plt.plot(history.history[history_key])
    plt.title(plot_title)
    plt.ylabel(history_key)
    plt.xlabel("epoch")
    plt.show()

## Inference using Colormap Overlay

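The raw predictions from the model are a tensor of shape `(N, IMAGE_SIZE, IMAGE_SIZE, NUM_CLASSES)`,
which is `(N, 512, 512, 20)` in the original CIHP setup and `(N, 256, 256, 1)` with the constants
defined in this notebook. Each channel holds the per-pixel score for one predicted label.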
In order to visualize the results, we plot them as RGB segmentation masks where each pixel
is represented by a unique color corresponding to the particular label predicted. We can easily
find the color corresponding to each label from the `human_colormap.mat` file provided as part
of the dataset. We would also plot an overlay of the RGB segmentation mask on the input image as
this further helps us to identify the different categories present in the image more intuitively.

In [None]:
# Loading the Colormap: read the "colormap" entry of the .mat file, scale it
# by 100 and store it as uint8.
# NOTE(review): the path points into the CIHP download, which the visible
# cells of this notebook never fetch — confirm the file exists before running.
colormap = (
    loadmat(
        "./instance-level_human_parsing/instance-level_human_parsing/human_colormap.mat"
    )["colormap"]
    * 100
).astype(np.uint8)


def infer(model, image_tensor, threshold=0.5):
    """Predict a per-pixel label mask for a single image.

    Args:
        model: Object exposing ``predict(batch)`` that returns an array whose
            first axis is the batch axis.
        image_tensor: A single image, without batch axis.
        threshold: Cut-off applied when the model has a single output
            channel; values above it are labelled 1, the rest 0.

    Returns:
        An integer array of shape ``(H, W)``: the arg-max over the channel
        axis for multi-channel predictions, or the thresholded map for
        single-channel predictions.
    """
    batch = np.expand_dims(image_tensor, axis=0)
    # Drop only the batch axis; a blanket squeeze would also remove a
    # single-channel axis and make the arg-max below fail.
    predictions = np.asarray(model.predict(batch))[0]

    if predictions.ndim == 2:
        return (predictions > threshold).astype(np.int64)
    if predictions.shape[-1] == 1:
        # Single-channel output: arg-max would always be 0, so threshold.
        return (predictions[..., 0] > threshold).astype(np.int64)
    return np.argmax(predictions, axis=-1)


def decode_segmentation_masks(mask, colormap, n_classes):
    """Turn a label mask into an RGB image using ``colormap``.

    Args:
        mask: Array of integer class labels.
        colormap: Array indexed as ``colormap[label, channel]`` giving the
            red, green and blue value of every label.
        n_classes: Labels ``0 .. n_classes - 1`` are coloured; any other
            value in ``mask`` stays black.

    Returns:
        A ``uint8`` array with the three colour planes stacked on axis 2.
    """
    # One zero-initialised plane per colour channel (R, G, B).
    planes = [np.zeros_like(mask).astype(np.uint8) for _ in range(3)]

    for label in range(n_classes):
        selected = mask == label
        for channel, plane in enumerate(planes):
            plane[selected] = colormap[label, channel]

    return np.stack(planes, axis=2)


def get_overlay(image, colored_mask):
    """Blend an image with its coloured segmentation mask.

    Args:
        image: Image array accepted by ``tf.keras.utils.array_to_img``.
        colored_mask: RGB mask to blend on top of the image.

    Returns:
        The blended image: 35% input image plus 65% mask.
    """
    # Round-trip through a PIL image to obtain a uint8 pixel array.
    image_uint8 = np.array(tf.keras.utils.array_to_img(image)).astype(np.uint8)
    return cv2.addWeighted(image_uint8, 0.35, colored_mask, 0.65, 0)


def plot_samples_matplotlib(display_list, figsize=(5, 3)):
    """Show the given arrays side by side in a single row.

    Args:
        display_list: Arrays to display. Items whose last axis has length 3
            are converted with ``tf.keras.utils.array_to_img`` first;
            anything else is passed to ``imshow`` unchanged.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False always yields a 2-D grid of axes, so a single-item list
    # works too (by default plt.subplots would return a bare Axes object).
    _, axes = plt.subplots(
        nrows=1, ncols=len(display_list), figsize=figsize, squeeze=False
    )
    for axis, item in zip(axes[0], display_list):
        if item.shape[-1] == 3:
            axis.imshow(tf.keras.utils.array_to_img(item))
        else:
            axis.imshow(item)
    plt.show()


def plot_predictions(images_list, colormap, model, n_classes=20):
    """Plot input, overlay and coloured prediction for every given image.

    Args:
        images_list: Iterable of images. Items may be image arrays (as the
            cells in this notebook pass them) or file paths; paths are
            loaded through ``read_image``.
        colormap: Colour table passed to ``decode_segmentation_masks``.
        model: Model passed to ``infer``.
        n_classes: Number of labels to colour; defaults to the previously
            hard-coded value of 20.
    """
    for item in images_list:
        if isinstance(item, (str, bytes, os.PathLike)):
            # NOTE(review): `read_image` is not defined in the visible part
            # of this notebook — it must be provided before passing paths.
            image_tensor = read_image(item)
        else:
            # Already an in-memory image array.
            image_tensor = item
        prediction_mask = infer(image_tensor=image_tensor, model=model)
        prediction_colormap = decode_segmentation_masks(
            prediction_mask, colormap, n_classes
        )
        overlay = get_overlay(image_tensor, prediction_colormap)
        plot_samples_matplotlib(
            [image_tensor, overlay, prediction_colormap], figsize=(18, 14)
        )


### Inference on Train Images

In [None]:
# Visualise predictions for the first four training images.
plot_predictions(train_images[:4], colormap, model=model)

### Inference on Validation Images

You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/deeplabv3p-resnet50) and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Human-Part-Segmentation).

In [None]:
# Visualise predictions for the first four validation images.
plot_predictions(val_images[:4], colormap, model=model)