## Import libraries

In [None]:
import numpy as np
import torch
from tqdm import trange
import os, sys, zipfile
import shutil
import urllib.request
import requests
import skimage.io as io
import matplotlib.pyplot as plt
from torchvision import transforms
from PIL import Image, ImageFile
from pycocotools.coco import COCO
%matplotlib inline
# Base and crop sizes, both 513; presumably the image resize/crop dimensions in
# pixels used by preprocessing -- not referenced in the visible cells, TODO confirm.
base_size = crop_size = 513


## Record package versions for reproducibility

In [None]:
# Report the interpreter and the versions of the numerical packages this
# notebook relies on, so a run can be reproduced later. torch (and the CUDA
# toolkit it was built against) is included because training depends on it.
print("os: {}".format(os.name))
print("sys: {}".format(sys.version))
print("numpy: {}, {}".format(np.__version__, np.__file__))
print("torch: {}, {}".format(torch.__version__, torch.__file__))
# torch.version.cuda is None for CPU-only builds.
print("cuda (torch build): {}".format(torch.version.cuda))

## Setup data paths

In [None]:
# Where the COCO annotation archive is downloaded to and unpacked.
dataDir = 'dataloaders/datasets/coco_dataset'
dataType = 'val2017'
annDir = f'{dataDir}/annotations'
# 'annotations_train' + 'val2017' spells out the archive name
# 'annotations_trainval2017.zip', shared by the local copy and the URL.
annZipName = f'annotations_train{dataType}.zip'
annZipFile = f'{dataDir}/{annZipName}'
annFile = f'{annDir}/instances_{dataType}.json'
annURL = f'http://images.cocodataset.org/annotations/{annZipName}'
# Echo the resolved locations so the cell output documents what will be used.
for label, value in (
    ('annDir', annDir),
    ('annFile', annFile),
    ('annZipFile', annZipFile),
    ('annURL', annURL),
):
    print(f'{label}: {value}')

## Download data if not available locally

In [None]:
# Make sure the annotation JSON is available locally: download the zipped
# annotations if needed, then unpack them under dataDir.
os.makedirs(annDir, exist_ok=True)
if not os.path.exists(annFile):
    if not os.path.exists(annZipFile):
        print("Downloading zipped annotations to " + annZipFile + " ...")
        # Download to a temporary name and rename only once the transfer has
        # completed; otherwise an interrupted download would leave a truncated
        # archive that later runs mistake for a finished one.
        annZipPart = annZipFile + '.part'
        with urllib.request.urlopen(annURL) as resp, open(annZipPart, 'wb') as out:
            shutil.copyfileobj(resp, out)
        os.replace(annZipPart, annZipFile)
        print("... done downloading")
    print("Unzipping " + annZipFile)
    with zipfile.ZipFile(annZipFile, "r") as zip_ref:
        zip_ref.extractall(dataDir)
    print("... done unzipping")
print("will use annotations in " + annFile)

## Initialize COCO API for instance annotations

In [None]:
# Load the instance annotations through the COCO API.
# NOTE: the later `from dataloaders.datasets import coco` cell rebinds the name
# `coco` to a module, so the exploration cells must run before that import.
coco = COCO(annotation_file=annFile)

## Display COCO categories and supercategories

In [None]:
# Fetch every category record, then print the category names followed by the
# distinct supercategory names.
cats = coco.loadCats(coco.getCatIds())
category_line = ' '.join(cat['name'] for cat in cats)
print('COCO categories: \n{}\n'.format(category_line))
# A set removes duplicates; its iteration order (and so the printed order) is
# not guaranteed to be stable between kernel sessions.
nms = {cat['supercategory'] for cat in cats}
supercategory_line = ' '.join(nms)
print('COCO supercategories: \n{}'.format(supercategory_line))

In [None]:
# Dump the COCO object's `cats` attribute in full
# (presumably a dict of category records keyed by category id -- confirm against pycocotools).
print(coco.cats)

## Example: get all images containing the given categories

In [None]:
# Look up the ids of the three example categories and the images returned for
# them, then keep a single image (the second hit, index 1) with its annotations.
catIds = coco.getCatIds(catNms=['person', 'dog', 'skateboard'])
imgIds = coco.getImgIds(catIds=catIds)
example_img_id = imgIds[1]
imgs = coco.loadImgs(example_img_id)
annIds = coco.getAnnIds(imgIds=example_img_id)
anns = coco.loadAnns(annIds)

In [None]:
# Show each selected image twice side by side, then overlay the annotations of
# the example categories.
for img in imgs:
    I = io.imread(img['coco_url'])  # fetched over the network from the COCO URL
    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(15, 15))
    for ax in (ax1, ax2):
        ax.axis('off')
        ax.imshow(I)
    # Restrict the overlay to annotations of the categories chosen above.
    annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
    anns = coco.loadAnns(annIds)
    # showAnns is not given an axes object; presumably it draws on matplotlib's
    # current axes (the right-hand panel here) -- confirm against pycocotools.
    coco.showAnns(anns)
plt.show()

## Download 2017 validation images for simplicity
The training image set is too large for data exploration, so only the validation images are downloaded.

In [None]:
# Download every image listed in the annotation file into imgDir.
#
# The cell can be re-run safely: images already on disk are skipped, so an
# interrupted download resumes where it stopped instead of being skipped
# entirely because the directory already exists.
imgIds = coco.getImgIds()
imgs = coco.loadImgs(imgIds)
imgType = 'val2017'
imgDir = 'dataloaders/datasets/coco_dataset/{}/'.format(imgType)
os.makedirs(imgDir, exist_ok=True)
# One session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    for im in imgs:
        imgPath = imgDir + im['file_name']
        if os.path.exists(imgPath):
            continue
        resp = session.get(im['coco_url'], timeout=60)
        # Fail loudly rather than saving an HTTP error page as an image file.
        resp.raise_for_status()
        # Write under a temporary name and rename when complete, so a partially
        # written file is never mistaken for a finished download.
        imgPartPath = imgPath + '.part'
        with open(imgPartPath, 'wb') as handler:
            handler.write(resp.content)
        os.replace(imgPartPath, imgPath)

## Import the COCO dataset module, models and training utilities

In [1]:
from dataloaders.datasets import coco
from modeling.deeplab import *
from modeling.unet import *
from utils.loss import SegmentationLosses
from utils.lr_scheduler import LR_Scheduler
from modeling.sync_batchnorm.replicate import patch_replication_callback
from dataloaders import make_data_loader
from tqdm import tqdm
import numpy as np
import config as args
from utils.saver import Saver
from utils.summaries import TensorboardSummary
from utils.metrics import Evaluator

In [None]:
# Number of CUDA devices PyTorch reports; left as a bare expression so the
# notebook displays the value as the cell output.
torch.cuda.device_count()

## Class for training and validation

In [2]:
class Trainer:
    """Train and validate a segmentation network (UNet) from a config object.

    Construction wires together the experiment saver, the TensorBoard writer,
    the data loaders, the model, an SGD optimizer, the loss, the evaluator and
    the learning-rate scheduler.

    Args:
        args: Configuration object. Attributes read here: ``workers``,
            ``useCUDA``, ``gpu_ids``, ``freeze_bn``, ``lr``, ``momentum``,
            ``weight_decay``, ``nesterov``, ``loss_type``, ``lr_scheduler``,
            ``epochs`` and ``dataset``. ``args.gpu_ids`` (parsed into a list of
            ints when CUDA is used) and ``args.sync_bn`` are written back onto
            ``args``.

    Attributes:
        best_pred: Best validation mIoU seen so far (starts at 0.0).
    """

    def __init__(self, args):
        self.args = args

        # Experiment bookkeeping: saver (config dump, checkpoints) and TensorBoard writer.
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        self.cuda = args.useCUDA and torch.cuda.is_available()
        # `args` may be a shared object (this notebook passes the `config`
        # module), so gpu_ids can already be a parsed list when a second
        # Trainer is built in the same kernel; only parse the string form.
        if self.cuda and isinstance(args.gpu_ids, str):
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        # Synchronized BatchNorm is only requested for multi-GPU runs.
        args.sync_bn = bool(self.cuda and len(args.gpu_ids) > 1)

        # Define network.
        # DeepLab version, kept for reference:
        #   model = DeepLab(backbone=args.backbone, output_stride=args.out_stride,
        #                   num_classes=self.nclass, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn)
        #   # different learning rates for the backbone and for the ASPP / decoder parts
        #   train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                   {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        #   optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                               weight_decay=args.weight_decay, nesterov=args.nesterov)
        # UNet version
        model = UNet(num_classes=self.nclass, sync_bn=args.sync_bn, freeze_bn=args.freeze_bn)
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)

        weight = None  # no per-class loss weighting
        # The concrete loss is selected by args.loss_type.
        self.criterion = SegmentationLosses(weight=weight, cuda=self.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        self.evaluator = Evaluator(self.nclass)
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs, len(self.train_loader))

        # Using cuda
        if self.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=args.gpu_ids)
            # Presumably hooks DataParallel replication for the synchronized
            # BatchNorm layers -- see modeling.sync_batchnorm to confirm.
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        self.best_pred = 0.0

    def training(self, epoch):
        """Run one training epoch over ``self.train_loader``.

        Logs the per-iteration loss and the summed epoch loss to TensorBoard,
        and writes image visualizations roughly ten times per epoch.

        Args:
            epoch: Zero-based epoch index.
        """
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # Guard against ZeroDivisionError when an epoch has fewer than 10 batches.
        vis_interval = max(1, num_img_tr // 10)
        num_images = 0

        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.cuda:
                image, target = image.cuda(), target.cuda()  # send mini-batches to GPU
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            num_images += image.shape[0]
            tbar.set_description('Train loss: {0:.3f}'.format(train_loss / (i + 1)))  # running average loss
            global_step = i + num_img_tr * epoch
            self.writer.add_scalar('train/total_loss_iter', loss.item(), global_step)
            # Show inference results about 10 times each epoch
            if i % vis_interval == 0:
                self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step)

        # The epoch scalar and the printed loss are the SUM of the batch losses.
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: {0:d}, numImages: {1:5d}]'.format(epoch + 1, num_images))
        print('Loss: {0:.3f}'.format(train_loss))

    def validation(self, epoch):
        """Evaluate on ``self.val_loader`` and checkpoint on mIoU improvement.

        Computes pixel accuracy, per-class accuracy, mIoU and frequency-weighted
        IoU via ``self.evaluator``, logs them to TensorBoard, and saves a
        checkpoint when mIoU exceeds ``self.best_pred``.

        Args:
            epoch: Zero-based epoch index.
        """
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        val_loss = 0.0
        num_images = 0

        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.cuda:
                image, target = image.cuda(), target.cuda()
            # Inference and metric accumulation must run on CPU as well, so
            # they are not nested under the CUDA branch.
            with torch.no_grad():  # no backpropagation for model evaluation
                output = self.model(image)
                loss = self.criterion(output, target)
            val_loss += loss.item()
            num_images += image.shape[0]
            tbar.set_description('Validation loss: {0:.3f}'.format(val_loss / (i + 1)))
            pred = output.detach().cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)  # argmax over axis 1 of the model output
            # Add batch sample into evaluator
            # this will generate confusion matrix row: ground_truth, col: predicted label
            self.evaluator.add_batch(target, pred)

        # Fast evaluation during training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()

        # Write current evaluation metrics to tensorboard
        self.writer.add_scalar('val/total_loss_epoch', val_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: {0:d}, numImages: {1:5d}]'.format(epoch + 1, num_images))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: {0:.3f}'.format(val_loss))

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            # The model is wrapped in DataParallel only on the CUDA path;
            # unwrap it there and use the bare model otherwise.
            if isinstance(self.model, torch.nn.DataParallel):
                bare_model = self.model.module
            else:
                bare_model = self.model
            self.saver.save_checkpoint({'epoch': epoch + 1, 'state_dict': bare_model.state_dict(),
                                        'optimizer': self.optimizer.state_dict(), 'best_pred': self.best_pred}, is_best)

In [3]:
# Seed PyTorch's random number generator before building the trainer.
SEED = 42
torch.manual_seed(SEED)
trainer = Trainer(args)
print('Total Epoches:', trainer.args.epochs)

loading annotations into memory...
Done (t=0.57s)
creating index...
index created!
loading annotations into memory...
Done (t=0.72s)
creating index...
index created!
Using poly LR Scheduler!
Total Epoches: 3


In [4]:
# Train for the configured number of epochs. Validation runs on the last epoch
# of every eval_interval-sized window, unless it is disabled via no_val.
for epoch in range(trainer.args.epochs):
    trainer.training(epoch)
    if trainer.args.no_val:
        continue
    if epoch % args.eval_interval == (args.eval_interval - 1):
        trainer.validation(epoch)
trainer.writer.close()

  0%|          | 0/1950 [00:00<?, ?it/s]


=>Epoches 0, learning rate = 0.1000,                 previous best = 0.0000


Train loss: 0.577: 100%|██████████| 1950/1950 [27:56<00:00,  1.16it/s]
  0%|          | 0/1950 [00:00<?, ?it/s]

[Epoch: 1, numImages:  3899]
Loss: 1125.921

=>Epoches 1, learning rate = 0.0694,                 previous best = 0.0000


Train loss: 0.555: 100%|██████████| 1950/1950 [27:52<00:00,  1.17it/s]
:   0%|          | 0/1950 [00:00<?, ?it/s]

[Epoch: 2, numImages:  3899]
Loss: 1083.023


Validation loss: 0.543: 100%|██████████| 1950/1950 [13:32<00:00,  2.40it/s]


Validation:
[Epoch: 2, numImages:  3899]
Acc:0.724710745393259, Acc_class:0.04800431726249123, mIoU:0.0349509779197328, fwIoU: 0.5261700152457511
Loss: 1059.651


  0%|          | 0/1950 [00:00<?, ?it/s]


=>Epoches 2, learning rate = 0.0372,                 previous best = 0.0350


Train loss: 0.540: 100%|██████████| 1950/1950 [27:53<00:00,  1.17it/s]

[Epoch: 3, numImages:  3899]
Loss: 1052.712



