**Portrait Segmentation Using Mobile-Unet**

Set up the GPU runtime

In [0]:
 # Check GPU: print the NVIDIA driver / device status to confirm a GPU runtime is attached
!nvidia-smi

In [0]:
# Mount G-drive at /content/drive (Colab-only API); the dataset and checkpoint
# paths used later in this notebook live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

**Imports**

In [0]:
# Import libraries
import os
import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import Dense, Input,Flatten, concatenate,Reshape, Conv2D, MaxPooling2D, Lambda,Activation,Conv2DTranspose
from keras.layers import UpSampling2D, Conv2DTranspose, BatchNormalization, Dropout, DepthwiseConv2D, Add
from keras.callbacks import TensorBoard, ModelCheckpoint, Callback, ReduceLROnPlateau
from keras.regularizers import l1
from keras.optimizers import SGD, Adam
import keras.backend as K
from keras.utils import plot_model
from keras.callbacks import TensorBoard, ModelCheckpoint, Callback
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage.filters import gaussian_filter
from random import randint
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import matplotlib.pyplot as plt
from random import randint
%matplotlib inline

In [0]:
# Keras optimization library (kito); supplies reduce_keras_model, which the
# "Optimize" section uses to fold batch-norm layers.
# NOTE(review): the install is unpinned, so the resolved version can differ between runs.
!pip install kito
from kito import reduce_keras_model

**Load dataset**

Load the dataset for training the model.

Ensure the images are in **RGB** format and masks (**ALPHA**) have pixel values **0 or 255**.

In [0]:
# Load the dataset from Google Drive (needs the drive mounted at /content/drive).
# x_train holds the source images and y_train the matching masks; np.load reads
# each .npy file fully into memory.
x_train=np.load("/content/drive/My Drive/finsegds/img_uint8.npy")
y_train=np.load("/content/drive/My Drive/finsegds/msk_uint8.npy")

In [0]:
# Inspect the masks: distinct pixel values first, then the array dimensions
mask_values = np.unique(y_train)
print(mask_values)
print(y_train.shape)

# Sample count is the length of the leading axis of the image array
num_images = len(x_train)

Copy pretrained model to local runtime disk. Save the checkpoints to your google drive (safe).

In [0]:
# Configure save paths and batch size
# PRETRAINED: full-model file on the local runtime disk, read by mnv3_unet(pretrained=True)
PRETRAINED='/content/pretrained_model.hdf5'
# CHECKPOINT: filename template on Google Drive; {epoch} and {val_loss} are
# filled in by the ModelCheckpoint callback when it saves
CHECKPOINT="/content/drive/My Drive/finsegds/munet_mnv3_wm10-{epoch:02d}-{val_loss:.2f}.hdf5"
# LOGS: TensorBoard log directory (relative to the working directory)
LOGS='./logs'
# BATCH_SIZE: samples per batch used by the data generators
BATCH_SIZE=64

**Preprocessing**

Normalize the source images at runtime, but do not modify the masks.

In [0]:
# Preprocessing functions (runtime)

# Per-channel constants shared by normalize_batch and denormalize_batch
# (presumably the dataset's RGB mean and standard deviation on a 0-1 scale).
_CHANNEL_MEAN = np.array([0.50693673, 0.47721124, 0.44640532])
_CHANNEL_STD = np.array([0.28926975, 0.27801928, 0.28596011])


def normalize_batch(imgs):
    """Standardise multi-channel arrays; round single-channel arrays.

    Args:
        imgs: numpy array whose last axis is the channel axis.

    Returns:
        ``(imgs - mean) / std`` per channel when the last axis has more than
        one entry (the constants broadcast over a 3-channel last axis);
        otherwise ``imgs`` rounded to the nearest integer.
    """
    single_channel = not (imgs.shape[-1] > 1)
    if single_channel:
        return imgs.round()
    return (imgs - _CHANNEL_MEAN) / _CHANNEL_STD


def denormalize_batch(imgs,should_clip=True):
    """Invert the multi-channel branch of normalize_batch.

    Args:
        imgs: numpy array with a 3-channel last axis, as produced by
            normalize_batch.
        should_clip: when true, clamp the restored values to [0, 1].

    Returns:
        ``imgs * std + mean``, optionally clipped.
    """
    restored = imgs * _CHANNEL_STD + _CHANNEL_MEAN
    return np.clip(restored, 0, 1) if should_clip else restored

**Data Generator**

Create a data generator to load images and masks together at runtime. 
Use the same seed for run-time augmentation of images and masks. Here we use an 80/20 train-val split.

**Note:** Currently the data generator loads the entire augmented dataset (npy) into memory (RAM), so there is a good chance that the application will crash if there is not enough memory. You may alternatively use a different data generator that loads images from directories, after ensuring a proper directory structure.

In [0]:
# Data generator for training and validation

# Augmentation settings shared by the image and mask generators, so that
# (together with the shared seed below) both receive the same transforms.
# NOTE(review): Keras applies `preprocessing_function` BEFORE `rescale`, so
# normalize_batch sees 0-255 values here and the 1/255 scaling happens
# afterwards. Confirm this is the intended normalisation before changing it;
# checkpoints trained with this notebook presumably depend on it.
data_gen_args = dict(rescale=1./255,
                     width_shift_range=0.1,
                     height_shift_range=0.1,
                     zoom_range=0.2,
                     horizontal_flip=True,
                     validation_split=0.2
                    )

image_datagen = ImageDataGenerator(**data_gen_args, preprocessing_function=normalize_batch)
mask_datagen = ImageDataGenerator(**data_gen_args,  preprocessing_function=normalize_batch)

# Images and masks are paired by position; a length mismatch would silently
# pair every image with the wrong mask.
if len(x_train) != len(y_train):
    raise ValueError(
        "Image/mask count mismatch: %d images vs %d masks" % (len(x_train), len(y_train)))

# Provide the same seed and keyword arguments to every flow() call
seed = 1
batch_sz = BATCH_SIZE
flow_args = dict(batch_size=batch_sz, shuffle=True, seed=seed)

train_image_generator = image_datagen.flow(x_train, subset='training', **flow_args)
train_mask_generator = mask_datagen.flow(y_train, subset='training', **flow_args)

val_image_generator = image_datagen.flow(x_train, subset='validation', **flow_args)
val_mask_generator = mask_datagen.flow(y_train, subset='validation', **flow_args)

# Train-val split (80-20): read the sample counts from the iterators so they
# always agree with the split Keras actually performed (int(n*0.8) and
# int(n*0.2) do not necessarily add up to n).
num_train = train_image_generator.n
num_val = val_image_generator.n

# Combine generators into one which yields (image batch, mask batch) pairs
train_generator = zip(train_image_generator, train_mask_generator)
val_generator = zip(val_image_generator, val_mask_generator)

# Drop the notebook's references to the raw arrays to free memory; later cells
# that need the raw data must reload it from disk.
del x_train, y_train


**Model Architecture**

Here we use the minimalistic version of **MobileNet v3** with **width multiplier 1.0 or 0.5** as the encoder (feature extractor).  

For the **decoder part**, we can use an upsampling block with **Transpose Convolution** and a **stride of 2**. Ensure proper **skip connections** between encoder and decoder parts for better results.

In [0]:
# MobilenetV3 Blocks
 
def deconv_block(tensor, nfilters, size=3, padding='same', kernel_initializer = 'he_normal'):
    """Double the spatial size of `tensor` with a stride-2 transposed convolution.

    The transposed convolution is followed, in this order, by batch
    normalisation, dropout (rate 0.5) and a ReLU activation.

    Args:
        tensor: input Keras tensor.
        nfilters: number of output filters of the transposed convolution.
        size: kernel size of the transposed convolution.
        padding: padding mode passed to Conv2DTranspose.
        kernel_initializer: initializer passed to Conv2DTranspose.

    Returns:
        The output tensor of the final activation.
    """
    upsampled = Conv2DTranspose(
        filters=nfilters,
        kernel_size=size,
        strides=2,
        padding=padding,
        kernel_initializer=kernel_initializer,
    )(tensor)
    normalized = BatchNormalization()(upsampled)
    regularized = Dropout(0.5)(normalized)
    return Activation("relu")(regularized)

def conv_block(tensor, filters, strides, size=3):
    """Apply Conv2D ('same' padding) -> BatchNormalization -> ReLU.

    Args:
        tensor: input Keras tensor.
        filters: number of convolution filters.
        strides: convolution stride.
        size: convolution kernel size.

    Returns:
        The output tensor of the ReLU activation.
    """
    features = Conv2D(filters=filters, kernel_size=size, strides=strides, padding='same')(tensor)
    normalized = BatchNormalization()(features)
    return Activation("relu")(normalized)

def bottleneck(tensor, nfilters, exp_ch, strides=1, alpha=1,residual=False):
    """Build a bottleneck: 1x1 expansion, 3x3 depthwise conv, 1x1 projection.

    Args:
        tensor: input Keras tensor.
        nfilters: base number of output channels (scaled by `alpha`).
        exp_ch: channels of the 1x1 expansion convolution (not scaled by `alpha`).
        strides: stride of the depthwise convolution.
        alpha: width multiplier applied to `nfilters`.
        residual: when true, add `tensor` to the projected output; the caller
            must make sure both have the same shape.

    Returns:
        The projected tensor (no activation after the last batch norm), with
        the input added when `residual` is set.
    """
    # 1x1 expansion with batch norm + ReLU
    expanded = conv_block(tensor, filters=exp_ch, size=1, strides=1)

    # 3x3 depthwise convolution carries the (optional) spatial stride
    depthwise = DepthwiseConv2D(kernel_size=3, strides=strides, depth_multiplier=1, padding='same')(expanded)
    depthwise = BatchNormalization()(depthwise)
    depthwise = Activation("relu")(depthwise)

    # 1x1 projection down to the width-scaled channel count
    out_channels = int(nfilters*alpha)
    projected = Conv2D(filters=out_channels, kernel_size=1, strides=1, padding='same')(depthwise)
    projected = BatchNormalization()(projected)

    if not residual:
        return projected
    return Add()([projected, tensor])


# MobilenetV3 Base
def get_mobilenetv3(pretrained=False, alpha=1, input_shape=(224, 224, 3)):
    """Build the MobileNetV3-style encoder used as the U-Net feature extractor.

    Args:
        pretrained: accepted for interface compatibility; not used by this
            function.
        alpha: width multiplier applied to the projection channel counts.
        input_shape: shape of one input image, (height, width, channels).
            Defaults to 3 channels: the notebook trains on RGB images,
            normalises them with 3-element vectors and predicts on
            (1, 224, 224, 3) inputs, so the previous hard-coded 4-channel
            input could not accept the data.

    Returns:
        A Keras Model mapping the input image to the final 576-channel feature
        map. mnv3_unet picks skip connections from this model by layer
        position (3, 11, 28, 71), so the order in which layers are created
        here must not change.
    """
    inputs = Input(shape=input_shape)

    # Stem: stride-2 convolution, then a stride-2 depthwise-separable block
    x = conv_block(inputs, filters=16, size=3, strides=2)

    x = DepthwiseConv2D(kernel_size=3, strides=2, depth_multiplier=1, padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)

    x = Conv2D(filters=int(16*alpha), kernel_size=1, strides=1, padding='same')(x)
    x = BatchNormalization()(x)

    # One row per bottleneck, in creation order:
    # (nfilters, exp_ch, strides, residual)
    bottleneck_specs = [
        (24, 72, 2, False),
        (24, 88, 1, True),
        (40, 96, 2, False),
        (40, 240, 1, True),
        (40, 240, 1, True),
        (48, 120, 1, False),
        (48, 144, 1, True),
        (96, 288, 2, False),
        (96, 576, 1, True),
        (96, 576, 1, True),
    ]
    for nfilters, exp_ch, strides, residual in bottleneck_specs:
        x = bottleneck(x, nfilters=nfilters, exp_ch=exp_ch, strides=strides,
                       alpha=alpha, residual=residual)

    # Final 1x1 convolution to 576 channels
    x = conv_block(x, 576, strides=1, size=1)

    return Model(inputs=inputs, outputs=x)


In [0]:
#  Model architecture: mnv3_unet

def mnv3_unet(finetuene=False, pretrained=False, alpha=1):
    """Build (or load) the MobileNetV3-encoder U-Net for portrait segmentation.

    Args:
        finetuene: accepted for interface compatibility; not used by this
            function.
        pretrained: when true, load and return the full model stored at
            PRETRAINED instead of building a new one.
        alpha: width multiplier for the encoder and decoder channel counts.

    Returns:
        The loaded model when `pretrained` is set. Otherwise a new model,
        compiled with binary cross-entropy, Adam (lr 1e-3) and the accuracy
        metric, whose last layer is a sigmoid activation named "op".
    """
    # Load pretrained model (if any) and skip construction entirely
    if pretrained:
        model = load_model(PRETRAINED)
        print("Loaded pretrained model ...\n")
        return model

    # Encoder
    encoder = get_mobilenetv3(pretrained=False, alpha=alpha)

    # Decoder: each stage upsamples by 2 and concatenates an encoder feature
    # map, selected by its position in encoder.layers.
    # (decoder filters before alpha scaling, encoder layer index)
    skip_plan = ((256, 71), (128, 28), (64, 11), (32, 3))

    x = encoder.output
    for base_filters, skip_index in skip_plan:
        x = deconv_block(x, int(base_filters*alpha))
        x = concatenate([x, encoder.layers[skip_index].output], axis=3)

    # Last upsampling stage: no dropout and no skip connection
    x = Conv2DTranspose(filters=int(16*alpha), kernel_size=3, strides=2,
                        padding='same', kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)

    # Single-channel output with sigmoid activation
    x = Conv2DTranspose(1, (1,1), padding='same')(x)
    x = Activation('sigmoid', name="op")(x)

    model = Model(inputs=encoder.input, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
    return model

# Build a fresh model at width multiplier 1 (nothing is loaded from disk here)
model = mnv3_unet(alpha=1, pretrained=False, finetuene=False)

# Model summary
model.summary()

# Layer specifications: position, output tensor name and output shape
for layer_index, layer in enumerate(model.layers):
    layer_output = layer.output
    print(layer_index, layer_output.name, layer_output.shape)

# Plot model architecture
plot_model(model, to_file='portrait_mnv3.png')

# Save checkpoints: keep only the full model with the lowest validation loss
checkpoint = ModelCheckpoint(
    CHECKPOINT,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# Callbacks: halve the learning rate on a 15-epoch plateau, log to TensorBoard
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=15, min_lr=0.000001, verbose=1)
tensorboard = TensorBoard(
    log_dir=LOGS,
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

callbacks_list = [checkpoint, tensorboard, reduce_lr]


**Train**

Train the model for **300 epochs** with our custom data generator. Use Keras callbacks for **TensorBoard** visualization and **learning rate decay** as shown below. You can resume your training from a previous session by loading the entire **pretrained model** (weights & optimizer state) from an hdf5 file.

In [0]:
# Load pretrained model (if any).
# Resuming is optional: the checkpoint is only loaded when the file exists, so
# a fresh run keeps the model that was built above instead of crashing.
RESUME_CHECKPOINT = '/content/drive/My Drive/finsegds/munet_mnv3_wm10-81-0.07.hdf5'

if os.path.exists(RESUME_CHECKPOINT):
    # load_model restores the whole saved model, replacing the one built above
    model = load_model(RESUME_CHECKPOINT)
    print("Resuming from checkpoint:", RESUME_CHECKPOINT)
else:
    print("No checkpoint found at", RESUME_CHECKPOINT, "- training the freshly built model.")

In [0]:
# Train the model
# Step counts must be whole numbers; rounding up makes one epoch cover every
# sample once, including the final partial batch.
train_steps = int(np.ceil(num_train / batch_sz))
val_steps = int(np.ceil(num_val / batch_sz))

# train_generator / val_generator are plain zip() iterators, not
# keras.utils.Sequence objects. Keras warns that such generators may yield
# duplicated data with multiprocessing workers, and the image and mask
# iterators must be advanced in lock-step to stay paired, so the data is
# consumed from a single worker without multiprocessing.
model.fit_generator(
    train_generator,
    epochs=300,
    steps_per_epoch=train_steps,
    validation_data=val_generator,
    validation_steps=val_steps,
    use_multiprocessing=False,
    workers=1,
    callbacks=callbacks_list)

**Evaluate**

Evaluate the performance of the model on a test dataset.

In [0]:
# Load a trained model checkpoint
model=load_model('/content/munet_mnv3_wm10-81-0.07.hdf5')

# Load a test dataset.
# x_train / y_train were deleted after the data generators were built, so the
# arrays are re-opened from disk here. mmap_mode='r' avoids reading the whole
# file; only the requested tail is copied into memory.
# NOTE(review): these rows come from the same .npy files used for training -
# confirm that rows from TEST_START onwards are really held out.
TEST_START = 14958
new_xtest = np.array(
    np.load("/content/drive/My Drive/finsegds/img_uint8.npy", mmap_mode='r')[TEST_START:, ...])
new_ytest = np.array(
    np.load("/content/drive/My Drive/finsegds/msk_uint8.npy", mmap_mode='r')[TEST_START:, ...])

In [0]:
# Evaluate model: divide images and masks by 255 and cast to float32 first
test_inputs = (new_xtest/255.0).astype(np.float32)
test_targets = (new_ytest/255.0).astype(np.float32)
score = model.evaluate(test_inputs, test_targets, verbose=0)

# Print loss and accuracy (in the order returned by model.evaluate)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)


**Optimize**

Using the kito library, you can optimize the model by folding the batch norms. This does not change the model behaviour or accuracy; but helps us to reduce the number of layers.

In [0]:
# Optimize model by folding batch-norms (kito's reduce_keras_model returns a
# new model; `model` itself is left as it was)
model_reduced = reduce_keras_model(model)
# Print the reduced architecture to check the layer count
model_reduced.summary()
# Save the reduced model to the current working directory
model_reduced.save('munet_mnv3_wm10_bnoptimized.h5')

**Test**

Test the model on a new portrait image and plot the results.

In [0]:
# Load a test image from the local runtime disk as a PIL image
# (resized and converted to an array in the next cell)
im=Image.open('/content/sf99.png')

In [0]:
# Inference
# Convert to RGB first so grayscale / palette / RGBA files all yield exactly
# three channels. Image.LANCZOS is the same filter as the old Image.ANTIALIAS
# alias, which recent Pillow releases no longer provide.
im = im.convert('RGB').resize((224, 224), Image.LANCZOS)
img = np.float32(np.array(im)/255.0)
plt.imshow(img)

# Add the batch axis, predict, and threshold the output at 0.5
out = model_reduced.predict(img.reshape(1, 224, 224, 3))
out = np.float32((out > 0.5))

In [0]:
# Output mask: view the prediction as a single 224x224 image and display it
mask_2d = out.reshape((224, 224))
plt.imshow(mask_2d)

**Export Model**

Export the model to **tflite** format for **real-time** inference on a **smart-phone**.

In [0]:
# Flatten output and save model
# The per-image output is flattened to one vector of 224*224 = 50176 values.
output = model_reduced.output
newout = Reshape((224 * 224,))(output)
new_model = Model(model_reduced.input, newout)

# Save to the same absolute path the converters read from, so the export does
# not depend on the current working directory.
KERAS_EXPORT_PATH = '/content/munet_mnv3_wm10.h5'
new_model.save(KERAS_EXPORT_PATH)

# For Float32 Model
converter = tf.lite.TFLiteConverter.from_keras_model_file(KERAS_EXPORT_PATH)
tflite_model = converter.convert()

# Context manager guarantees the file is flushed and closed
with open("munet_mnv3_wm10.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)

**Post-training Quantization**

We can **reduce the model size and latency** by performing post-training quantization. Fixed-precision conversion (**UINT8**) allows us to reduce the model size significantly by quantizing the model weights. We can run this model on the mobile **CPU**. The **FP16** (experimental) conversion allows us to reduce the model size by half, and the corresponding model can be run directly on the mobile **GPU**.

In [0]:
# For UINT8 Quantization
# NOTE(review): no representative dataset is supplied, so this presumably
# quantizes the weights only - confirm against the TFLite converter docs.

converter = tf.lite.TFLiteConverter.from_keras_model_file('/content/munet_mnv3_wm10.h5')
converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]
tflite_model = converter.convert()

# Context manager guarantees the file is flushed and closed
with open("munet_mnv3_wm10_uint8.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)


In [0]:
# For Float16 Quantization (Experimental)

import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_keras_model_file('/content/munet_mnv3_wm10.h5')
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Restrict the supported types to float16
converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
tflite_model = converter.convert()

# Context manager guarantees the file is flushed and closed
with open("munet_mnv3_wm10_fp16.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)

**Plot sample output**

Load the test data as a batch using a numpy array. 

Crop the image using the output mask and plot the result.

In [0]:
# Load the exported (flattened-output) model without compiling it; it is only
# used for prediction here
model = load_model('/content/munet_mnv3_wm10.h5', compile=False)

# Load the test images and divide by 255, stored as float32
test_imgs = np.load('/content/timg_uint8.npy')
test_imgs = (test_imgs / 255.0).astype(np.float32)

In [0]:
# Perform batch prediction
out = model.predict(test_imgs)

# Threshold the predictions at 0.5 to get a binary mask
out = np.float32((out > 0.5))

# Restore the image layout; -1 lets numpy infer the batch size instead of
# assuming exactly four test images
out = out.reshape((-1, 224, 224, 1))

In [0]:
# Plot the output using matplotlib
# One figure with two rows: source images on top, and the same images
# multiplied by the predicted mask (i.e. cropped) below.
columns = min(4, len(test_imgs))
rows = 2

fig, axes = plt.subplots(rows, columns, figsize=(16, 8), squeeze=False)

for col in range(columns):
    source = test_imgs[col].squeeze()

    axes[0, col].imshow(source)
    axes[0, col].set_title("Input %d" % (col + 1))
    axes[0, col].axis('off')

    # The mask has one channel, so it broadcasts across the image channels
    axes[1, col].imshow(out[col] * test_imgs[col])
    axes[1, col].set_title("Masked %d" % (col + 1))
    axes[1, col].axis('off')

plt.show()