# TorchVision 0.3 Object Detection finetuning tutorial

For this tutorial, we will be finetuning a pre-trained [Mask R-CNN](https://arxiv.org/abs/1703.06870) model on the [*Penn-Fudan Database for Pedestrian Detection and Segmentation*](https://www.cis.upenn.edu/~jshi/ped_html/). It contains 170 images with 345 instances of pedestrians, and we will use it to illustrate how to use the new features in torchvision to train an instance segmentation model on a custom dataset.

First, we need to install `pycocotools`. This library will be used for computing the evaluation metrics following the COCO metric for intersection over union.

In [None]:
import boto3, sagemaker

# Region configured for the default boto3 session.
my_session = boto3.session.Session()
my_region = my_session.region_name
# Look up the AWS account ID of the calling identity through STS.
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]
# Name used to build the container image URI later in the notebook.
algorithm_name = "pytorch-maskrcnn-tl"
# Default bucket of the SageMaker session; the dataset is uploaded under it
# and it is also used for the training channel.
session = sagemaker.session.Session()
bucket = session.default_bucket()

In [None]:
# Remove any previous copy of the dataset from the bucket, then upload the
# local PennFudanPed directory to s3://<bucket>/data/PennFudanPed.
!aws s3 rm --recursive s3://$bucket/data/PennFudanPed
!aws s3 cp --recursive PennFudanPed s3://$bucket/data/PennFudanPed

Let's have a look at the dataset and how it is laid out.

The data is structured as follows
```
PennFudanPed/
  PedMasks/
    FudanPed00001_mask.png
    FudanPed00002_mask.png
    FudanPed00003_mask.png
    FudanPed00004_mask.png
    ...
  PNGImages/
    FudanPed00001.png
    FudanPed00002.png
    FudanPed00003.png
    FudanPed00004.png
```

Here is one example of an image in the dataset, with its corresponding instance segmentation mask

In [None]:
# Run the project's build_and_push.sh script.
# NOTE(review): presumably this builds the training container and pushes it to
# ECR under `algorithm_name` -- confirm against the script, which is not shown
# in this notebook.
!./build_and_push.sh 

In [None]:
# ECR URI of the ":latest" image named `algorithm_name` in this account and region.
image_uri=f"{account_id}.dkr.ecr.{my_region}.amazonaws.com/{algorithm_name}:latest"

In [None]:
from sagemaker.pytorch import PyTorch


import sagemaker

# Session, execution role and container image used to launch the training job.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
docker_image = image_uri

pytorch_estimator = PyTorch(
    entry_point="tv-training-code-sagemaker.py",
    source_dir="src",
    image_uri=docker_image,
    role=role,
    sagemaker_session=sagemaker_session,
    base_job_name="pytorch-maskrcnn-fudan",
    framework_version="1.8.0",
    py_version="py36",
    # Single node by default; raise this count (for example to 2) for
    # multi-node distributed training.
    instance_count=1,
    # For p3dn instances use ml.p3dn.24xlarge; for p4d use ml.p4d.24xlarge.
    instance_type="ml.p3.16xlarge",
    # To train with the SMDataParallel distributed training framework,
    # uncomment the following argument:
    # distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    debugger_hook_config=False,
)

# The "train" channel points at the data/ prefix of the default bucket.
pytorch_estimator.fit({"train": f"s3://{bucket}/data/"})

So each image has a corresponding segmentation mask, where each color corresponds to a different instance. Let's write a `torch.utils.data.Dataset` class for this dataset.

In [None]:
# S3 location of the model artifact reported by the estimator; the bare name
# on the last line displays the value in the notebook.
model_s3_path = pytorch_estimator.model_data
model_s3_path

In [None]:
# Start from a clean state, download the model archive from S3 into the
# current directory, unpack it, and move the extracted model.pth into a fresh
# model/ directory.
!rm -rf model 
!aws s3 cp $model_s3_path . 
!tar -xvf model.tar.gz
!mkdir -p model 
!mv model.pth model/

In [None]:
import torchvision

def get_instance_segmentation_model(num_classes, hidden_layer=256):
    """Build a Mask R-CNN whose box and mask heads predict ``num_classes`` classes.

    The model starts from torchvision's ``maskrcnn_resnet50_fpn`` with
    pre-trained weights; its box predictor and mask predictor are then
    replaced by freshly initialised heads sized for ``num_classes``.

    Args:
        num_classes: number of classes both new heads should output.
        hidden_layer: number of hidden channels in the new mask predictor.
            Defaults to 256, the value this function previously hard-coded.

    Returns:
        The model with both prediction heads replaced.
    """
    # Imported here so the function works regardless of whether the notebook
    # cell that imports these predictor classes has already been executed.
    from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
    from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # The new box head must accept the same number of input features as the
    # classifier it replaces.
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # Likewise, the new mask head takes the input channel count of the
    # existing mask classifier.
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask, hidden_layer, num_classes
    )

    return model

In [None]:
import torch 
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# Rebuild the architecture with 2 output classes and restore the trained weights.
model = get_instance_segmentation_model(2)
# Load the checkpoint onto the CPU: the freshly built model still lives on the
# CPU at this point, and this also works on hosts without a GPU (a hard-coded
# 'cuda:0' target fails there). The model is moved to its device later on.
checkpoint = torch.load("model/model.pth", map_location="cpu")
model.load_state_dict(checkpoint)

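### For the distributed version

If the model was trained with the `distribution` option enabled, use the commented-out cell below instead: it drops the first 7 characters of every checkpoint key (presumably the `module.` prefix added by the distributed wrapper) before loading the weights.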

In [None]:

# import torch 
# from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
# from collections import OrderedDict

# model = get_instance_segmentation_model(2)
# checkpoint = torch.load("model/model.pth", map_location='cuda:0')
# checkpoint_modified = OrderedDict()
# for key in checkpoint.keys():
#     new_key = key[7:]
#     checkpoint_modified[new_key] = checkpoint[key]

# model.load_state_dict(checkpoint_modified)

In [None]:
# Install (or upgrade) the COCO Python API from the PythonAPI subdirectory of
# the cocoapi GitHub repository.
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

In [None]:
import os
import numpy as np
import torch
import torch.utils.data
from PIL import Image


class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan pedestrian images paired with per-instance segmentation targets.

    ``root`` must contain a ``PNGImages/`` and a ``PedMasks/`` directory.
    Both directory listings are sorted so that the image at position ``i`` is
    paired with the mask at position ``i``.

    Args:
        root: path of the dataset directory.
        transforms: optional callable invoked as ``transforms(img, target)``
            that returns the transformed ``(img, target)`` pair.
    """

    def __init__(self, root, transforms=None):
        self.root = root
        self.transforms = transforms
        # Sorting both listings keeps every image aligned with its mask.
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Return the ``(img, target)`` pair for sample ``idx``.

        ``target`` is a dict with the keys ``boxes`` (float32, one
        ``[xmin, ymin, xmax, ymax]`` row per instance), ``labels`` (int64,
        all ones), ``masks`` (uint8, one binary mask per instance),
        ``image_id``, ``area`` and ``iscrowd`` (int64, all zeros).
        """
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        # The context managers close the underlying files as soon as the
        # pixel data has been copied out.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # The mask is deliberately not converted to RGB: each value encodes
        # a different instance, with 0 being background.
        with Image.open(mask_path) as raw_mask:
            mask = np.array(raw_mask)

        # Instances are encoded as distinct values. Drop the background id
        # explicitly instead of assuming it is always present as the first
        # unique value.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # split the id-encoded mask into a stack of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            rows, cols = np.nonzero(instance_mask)
            boxes.append([cols.min(), rows.min(), cols.max(), rows.max()])

        # The explicit reshape keeps a (0, 4) tensor for an image without any
        # instance, so the column indexing below stays valid.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(num_objs, 4)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        """Return the number of images found in ``PNGImages/``."""
        return len(self.imgs)

In [None]:
from engine import train_one_epoch, evaluate
import utils
import transforms as T


def get_transform(train):
    """Compose the transform pipeline applied to each (image, target) pair.

    The pipeline always starts by turning the PIL image into a PyTorch
    tensor. When ``train`` is truthy, a random horizontal flip with
    probability 0.5 is appended as data augmentation, applied to both the
    image and its ground truth.
    """
    steps = [T.ToTensor()]
    if train:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The dataset is only used for inference below, so build it with the
# evaluation transform: the random horizontal flip is a training-time
# augmentation and would make the displayed example non-deterministic.
dataset = PennFudanDataset('PennFudanPed', get_transform(train=False))
model.to(device)

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()
# Take the first sample; its target is not needed for prediction.
img, _ = dataset[0]
# Gradients are not needed for a forward pass that only produces predictions.
with torch.no_grad():
    prediction = model([img.to(device)])

In [None]:
# Display the input image: scale by 255, move the first axis last, and convert
# to bytes before handing the array to PIL.
# NOTE(review): assumes img is a channels-first tensor with values in [0, 1]
# -- confirm against T.ToTensor.
Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())

In [None]:
# Display the first predicted mask (index 0) of the single input image, scaled
# by 255, converted to bytes and copied to the CPU.
Image.fromarray(prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy())

In [None]:
# Display the second predicted mask (index 1), prepared the same way; this
# requires the model to have returned at least two masks for the image.
Image.fromarray(prediction[0]['masks'][1, 0].mul(255).byte().cpu().numpy())