# Problem 2. Semantic Segmentation

First: Switch to GPU.

In this problem you will train a CNN to perform Semantic Segmentation of selfies. The goal is to produce a binary mask where 0 is 'background' and 1 is 'person'.

This could be useful if you are interested in writing your own Instagram Stories filter :)

We will use a dataset originating from this repository: https://github.com/clovaai/ext_portrait_segmentation

Most of the code provided comes from the CamVid example, with a few minor changes to adapt it to a new dataset.

1. Use the Colab file explorer to get a sense of how the data is organized, then complete the data manipulation code.
2. Train a U-Net with EfficientNet-B0 as the backbone
- Using decoder_block_type='upsampling'
- Using `sm.losses.CategoricalFocalLoss()`
- Adam default LR 
- 10 epochs 
- Show accuracy
- Use model checkpointing to save and recover the best weights (w.r.t. validation accuracy)
- You can get inspiration from the notebook `segmentation_camvid.ipynb`
- You should get a validation accuracy ~96% or more
- 10 epochs might take around 4 minutes

3. Repeat 2. with decoder_block_type='transpose'

4. Compare and comment.

## Imports and data

In [None]:
import tensorflow as tf
import pathlib
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
import PIL
from functools import partial
from tqdm import tqdm_notebook as tqdm
!pip install -U --quiet git+https://github.com/qubvel/segmentation_models
!pip install -U --quiet git+https://github.com/albumentations-team/albumentations
%env SM_FRAMEWORK=tf.keras
import segmentation_models as sm
import albumentations as A

  Building wheel for segmentation-models (setup.py) ... [?25l[?25hdone
  Building wheel for albumentations (setup.py) ... [?25l[?25hdone
env: SM_FRAMEWORK=tf.keras


In [None]:
# Fetch the zipped dataset from Google Drive with gdown, extract it quietly
# into the current working directory, and delete the archive afterwards.
url ="https://drive.google.com/uc?id=1-1xrQ2OzXZpnpxnQYJZVIUWs0Hixbr15"
!gdown {url}
!unzip -q data_PB2.zip
!rm data_PB2.zip

Downloading...
From: https://drive.google.com/uc?id=1-1xrQ2OzXZpnpxnQYJZVIUWs0Hixbr15
To: /content/data_PB2.zip
107MB [00:00, 124MB/s] 


In [None]:
# Build a reproducible train/validation split of the image paths.
data_dir = '/content/images_data_crop/'
data_dir = pathlib.Path(data_dir)
# glob() yields files in an arbitrary, filesystem-dependent order. Sorting first
# guarantees that the seeded shuffle below always yields the same permutation.
images_list = sorted(str(x) for x in data_dir.glob('*.jpg'))
# A dedicated seeded generator makes the split identical on every run (so the
# two decoder variants are evaluated on the same validation images) and leaves
# the global NumPy random state untouched.
np.random.RandomState(0).shuffle(images_list)
# val.txt holds integer positions into the shuffled list; atleast_1d keeps a
# file containing a single index from collapsing into a 0-d array.
valid_images_idx = np.atleast_1d(np.loadtxt('val.txt',dtype=np.int64))
valid_images_list = [images_list[i] for i in valid_images_idx]
# Everything that was not selected for validation is used for training; the
# list comprehension (unlike a set difference) keeps a deterministic order.
_valid_images_set = set(valid_images_list)
train_images_list = [p for p in images_list if p not in _valid_images_set]
print(len(train_images_list), 'Training images')
print(len(valid_images_list), 'validation images')

1330 Training images
300 validation images


In [None]:
def get_mask_path(path):
  """Return the path of the segmentation mask that belongs to image `path`.

  Exercise stub: as written it returns None, so every caller receives None
  instead of a mask path until this function is implemented.

  NOTE(review): the image-to-mask naming rule is not visible in this
  notebook; inspect the extracted data in the file explorer to derive it.
  """
  # YOUR CODE HERE
  return 

# Sanity check: draw one training image and overlay its mask on top of it.
idx = 10
img = PIL.Image.open(train_images_list[idx])
plt.imshow(img)
# `img` is re-bound to the mask, which is drawn semi-transparently (alpha=0.4)
# over the photo using the 'jet' colormap.
img = PIL.Image.open(get_mask_path(train_images_list[idx]))
plt.imshow(img, cmap='jet', alpha=0.4)
plt.show()

In [None]:
# Number of segmentation classes: 0 = background, 1 = person.
NCLASSES = 2

In [None]:
# Mask paths, index-aligned with the corresponding image path lists.
train_masks_list = [get_mask_path(x) for x in train_images_list]
# NOTE: the name is misspelled ("masls"); the cell that builds data_val refers
# to it with the same spelling, so both must be renamed together if at all.
valid_masls_list = [get_mask_path(x) for x in valid_images_list]

In [None]:
# Datasets of (image_path, mask_path) string pairs; decoding and augmentation
# are attached later through Dataset.map.
data_train = tf.data.Dataset.from_tensor_slices((train_images_list, train_masks_list))
data_val = tf.data.Dataset.from_tensor_slices((valid_images_list, valid_masls_list))

In [None]:
# Number of samples per batch.
BATCH_SIZE = 8
# Target size as (height, width); both pipelines below resize to it.
IMG_SIZE = (224, 192)
# Training augmentation: resize, then a sequence of randomly applied geometric
# and photometric transforms (`p` is each transform's probability).
transforms_train = A.Compose([
            A.Resize(IMG_SIZE[0], IMG_SIZE[1], p=1),
            A.HorizontalFlip(p=0.5),              
            # Crop height is drawn between half and full image height, and the
            # crop is resized back to IMG_SIZE.
            A.RandomSizedCrop(min_max_height=(IMG_SIZE[0]//2, IMG_SIZE[0]), height=IMG_SIZE[0], width=IMG_SIZE[1],
                             w2h_ratio=720/960, p=0.5),
            A.GaussNoise(p=0.1),
            A.MotionBlur(blur_limit=3, p=0.4),
            # Exactly one of the two colour transforms is picked when applied.
            A.OneOf([A.RandomBrightnessContrast(), A.HueSaturationValue()], p=0.7),
            A.CLAHE(p=0.5)
        ])

# Validation: deterministic resize only, no random augmentation.
transforms_val = A.Compose([
            A.Resize(IMG_SIZE[0], IMG_SIZE[1], p=1)
        ])

# Encoder backbone name and the input preprocessing function that
# segmentation_models associates with it.
BACKBONE = 'efficientnetb0'
preprocess_input = sm.get_preprocessing(BACKBONE)

def aug_fn(image, mask, train):
    """Augment one image/mask pair and preprocess the image for the backbone.

    Args:
        image: Image array handed to the Albumentations pipeline.
        mask: Mask array, transformed jointly with `image`.
        train: Truthy selects `transforms_train`, falsy `transforms_val`.

    Returns:
        Tuple `(preprocessed_image, augmented_mask)`; only the image goes
        through `preprocess_input`.
    """
    pipeline = transforms_train if train else transforms_val
    augmented = pipeline(image=image, mask=mask)
    return preprocess_input(augmented["image"]), augmented["mask"]

def parse(image_path, mask_path):
    """Read and decode one image/mask pair from disk.

    Args:
        image_path: Path of the image file.
        mask_path: Path of the mask file.

    Returns:
        Tuple `(image, mask)`: the image decoded with 3 channels and the mask
        decoded with 1 channel, its values clipped to the range [0, 1].
    """
    raw_image = tf.io.read_file(image_path)
    raw_mask = tf.io.read_file(mask_path)
    decoded_image = tf.io.decode_png(raw_image, channels=3)
    decoded_mask = tf.io.decode_png(raw_mask, channels=1)
    # Only two classes exist (0 and 1), so any larger mask value becomes 1.
    binary_mask = tf.clip_by_value(decoded_mask, clip_value_min=0, clip_value_max=1)
    return decoded_image, binary_mask

def _augment_and_encode(image, mask, train):
    """Run `aug_fn` inside the tf.data graph and one-hot encode the mask.

    Shared implementation of `process_data_train` and `process_data_val`,
    which differ only in the `train` flag.

    Args:
        image: Decoded image tensor.
        mask: Decoded mask tensor.
        train: Python bool forwarded to `aug_fn` to select the pipeline.

    Returns:
        Tuple `(image, mask)`: a float32 image with static shape
        `IMG_SIZE + (3,)` and a one-hot mask with static shape
        `IMG_SIZE + (NCLASSES,)`.
    """
    aug_img, aug_mask = tf.numpy_function(func=aug_fn, inp=[image, mask, train],
                                          Tout=[tf.float32, tf.uint8])
    # Outputs of numpy_function carry no static shape; declare it explicitly
    # so downstream batching and the model see fully defined shapes.
    aug_img.set_shape(IMG_SIZE+(3,))
    # Drop size-1 dimensions (the mask's channel axis) before one-hot encoding.
    aug_mask = tf.squeeze(aug_mask)
    aug_mask = tf.one_hot(aug_mask,depth=NCLASSES)
    aug_mask.set_shape(IMG_SIZE+(NCLASSES,))
    return aug_img, aug_mask


def process_data_train(image, mask):
    """Map function for the training set: random augmentation + one-hot mask."""
    return _augment_and_encode(image, mask, True)


def process_data_val(image, mask):
    """Map function for the validation set: resize only + one-hot mask."""
    return _augment_and_encode(image, mask, False)

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training pipeline: decode once and cache, then shuffle, augment, batch.
# shuffle() has to come AFTER cache(): cache() replays exactly the elements, in
# the order, seen on the first pass, so a shuffle placed before it would only
# randomise the first epoch and every later epoch would repeat that order.
# The buffer spans the whole training set so each epoch is a full reshuffle.
# The random augmentation stays after the cache so it is re-drawn every epoch.
data_train = (
    data_train
    .map(parse, num_parallel_calls=AUTOTUNE)
    .cache()
    .shuffle(buffer_size=len(train_images_list))
    .map(process_data_train, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# Validation pipeline: same stages without shuffling or random augmentation.
data_val = (
    data_val
    .map(parse, num_parallel_calls=AUTOTUNE)
    .cache()
    .map(process_data_val, num_parallel_calls=AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

In [None]:
def view_image_batch(ds, model=None):
    """Plot up to 8 images of the first batch of `ds` with a mask overlay.

    Args:
        ds: Dataset yielding `(image, one_hot_mask)` batches.
        model: Optional model. When given, the overlay is the argmax of
            `model(image)`; otherwise it is the argmax of the batch's own
            mask.
    """
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    image, mask = next(iter(ds)) # extract 1 batch from the dataset
    if model is None:
        mask = tf.argmax(mask, axis=-1)
    else:
        mask = tf.argmax(model(image), axis=-1)
    # Undo the mean/std normalisation applied by preprocess_input for display.
    image = np.clip((image.numpy()*std)+mean, 0, 1)
    # The grid has 8 cells, but a batch may hold fewer samples (smaller
    # BATCH_SIZE or the last partial batch); never index past the batch end.
    n_shown = min(8, image.shape[0])
    fig = plt.figure(figsize=(22, 10))
    for i in range(n_shown):
        ax = fig.add_subplot(2, 4, i+1, xticks=[], yticks=[])
        ax.imshow(image[i])
        ax.imshow(mask[i], cmap='jet', alpha=0.4)

view_image_batch(data_train)

## U-Net

In [None]:
# Model input shape: IMG_SIZE (height, width) followed by 3 colour channels.
IMG_SHAPE = IMG_SIZE + (3,)
# YOUR CODE HERE
# model = 
# Fit and checkpoint 
# Load best checkpoint after training

In [None]:
# Compare model predictions with the ground truth on the first validation
# batch; `model` must have been defined in the U-Net cell before this runs.
print('PREDICTED')
view_image_batch(data_val, model)
plt.show()
print('GROUND TRUTH')
view_image_batch(data_val)