In [1]:
import numpy as np
import random

from PIL import Image
from PIL import Image, ImageFont, ImageDraw, ImageEnhance
import colorsys

import torch
import torch.utils.data as data
import torch.nn as nn
import torchvision.models as models
import torch.nn.functional as F
import torch.optim as optim


import os
import os.path
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from pycocotools.coco import COCO
from pycocotools import mask as maskUtils

# Shorthand float constants: positive infinity and not-a-number.
inf, nan = float('inf'), float('nan')

In [2]:
# Root of the COCO 2017 download.  Override it with the COCO_DATA_DIR
# environment variable; the default keeps the original author's layout.
DATA_DIR = os.environ.get("COCO_DATA_DIR", "/home/aravind/dataset")

# Image directories.
train_dir = os.path.join(DATA_DIR, "train2017")
val_dir = os.path.join(DATA_DIR, "val2017")

# Instance-segmentation annotation files.
train_ann = os.path.join(DATA_DIR, "annotations", "instances_train2017.json")
val_ann = os.path.join(DATA_DIR, "annotations", "instances_val2017.json")

In [3]:
# config to train
# TODO: check Config is correct
class ProposalConfig:
    """Static settings for the impulse/response segmentation experiment.

    Class attributes hold the tunable constants; ``__init__`` adds the
    values derived from them (WIDTH, HEIGHT, BATCH_SIZE, IMAGE_SHAPE).
    """

    NAME = "InSegm"

    # Loader / schedule settings (online training: one image per GPU).
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    NUM_WORKERS = 1
    PIN_MEMORY = True
    STEPS_PER_EPOCH = 100
    VALIDATION_STEPS = 20

    # Number of classes, counting the background class.
    NUM_CLASSES = 81

    # Per-channel normalisation constants, shaped (1, 1, 3) so they
    # broadcast over a channels-last image.
    MEAN_PIXEL = np.array([0.485, 0.456, 0.406],
                          dtype=np.float32).reshape((1, 1, 3))
    STD_PIXEL = np.array([0.229, 0.224, 0.225],
                         dtype=np.float32).reshape((1, 1, 3))

    # Class names; index 0 is the background.
    CLASS_NAMES = [
        'BG',
        'person', 'bicycle', 'car', 'motorcycle', 'airplane',
        'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
        'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
        'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
        'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
        'wine glass', 'cup', 'fork', 'knife', 'spoon',
        'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
        'cake', 'chair', 'couch', 'potted plant', 'bed',
        'dining table', 'toilet', 'tv', 'laptop', 'mouse',
        'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
        'toaster', 'sink', 'refrigerator', 'book', 'clock',
        'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
    ]

    # Impulse grid: GRID_SHAPE x GRID_SHAPE impulses of IMPULSE_SHAPE pixels.
    GRID_SHAPE = 11
    IMPULSE_SHAPE = (48, 48)

    # Thresholds used when selecting annotations and matching impulses.
    MIN_PIXELS = 256
    MIN_INTERSECTION = 128

    def __init__(self):
        # The network input is a square of 32 pixels per grid cell.
        side = 32 * self.GRID_SHAPE
        self.WIDTH = side
        self.HEIGHT = side
        self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
        self.IMAGE_SHAPE = (self.WIDTH, self.HEIGHT, 3)

    def display(self):
        """Display Configuration values."""
        print("\nConfigurations:")
        for attr_name in dir(self):
            if attr_name.startswith("__"):
                continue
            attr_value = getattr(self, attr_name)
            if callable(attr_value):
                continue
            print("{:30} {}".format(attr_name, attr_value))
        print("\n")

In [4]:
class CocoDetection(data.Dataset):
    """COCO instance-segmentation dataset producing impulse/response targets.

    Every item pairs a normalised image with a fixed grid of rectangular
    "impulse" masks.  Each impulse is matched to the instance mask (or to a
    synthetic background mask) with which it has the highest IoU; that mask
    and its class id are the expected response for the impulse.

    Args:
        root: directory that contains the image files.
        annFile: path to the COCO annotation JSON.
        config: settings object.  The attributes read by this class are
            CLASS_NAMES, WIDTH, HEIGHT, GRID_SHAPE, IMPULSE_SHAPE,
            MIN_PIXELS, MIN_INTERSECTION, MEAN_PIXEL and STD_PIXEL.
    """

    def __init__(self, root, annFile, config):
        self.root = root
        self.coco = COCO(annFile)
        self.ids = list(self.coco.imgs.keys())
        self.config = config
        self.catMap = self.build_class_map()

    def build_class_map(self):
        """Map COCO category ids to contiguous class indices.

        Index 0 is reserved for the background; the remaining indices follow
        the order of the ids returned by ``coco.getCatIds`` for
        ``config.CLASS_NAMES[1:]``.

        Returns:
            dict mapping COCO category id -> contiguous class index.

        Raises:
            ValueError: if the annotation file does not provide one category
                id per non-background class name.
        """
        # Use the config this dataset was built with, not a notebook global.
        class_names = self.config.CLASS_NAMES
        coco_cat_ids = [0] + list(self.coco.getCatIds(class_names[1:]))
        if len(coco_cat_ids) != len(class_names):
            raise ValueError(
                "expected {} category ids (including background), got {}".format(
                    len(class_names), len(coco_cat_ids)))
        return {cat_id: index for index, cat_id in enumerate(coco_cat_ids)}

    def __getitem__(self, index):
        """Return the training tuple for the image at position ``index``."""
        # IO: read the image and decode its instance masks as numpy arrays.
        img, instance_masks, class_ids = self.load_data(index)

        # Data augmentation: skipped for now.

        # Target generation.
        return self.generate_targets(img, instance_masks, class_ids)

    def generate_targets(self, img, instance_masks, class_ids):
        """Build the network inputs and targets for one image.

        Args:
            img: image array, channels last.
            instance_masks: binary masks, one per instance, stacked on axis 0
                (may be empty).
            class_ids: class index of each mask.

        Returns:
            Tuple of float32 tensors
            ``(img, mask_response, class_response, base_impulse,
            freq_normalization)``; ``img`` is channels first.
        """
        config = self.config
        # Add a background mask/class so that every impulse gets a response.
        instance_masks, class_ids = self._add_background(
            instance_masks, class_ids, img.shape[:2])

        # Fit the image and the masks to (config.WIDTH, config.HEIGHT).
        w, h = config.WIDTH, config.HEIGHT
        img = self.resize_image(img, (w, h), "RGB")
        instance_masks = np.array(
            [self.resize_image(m, (w, h), "L") for m in instance_masks])

        base_impulse = self.make_base_impulse()
        # freq_normalization is 1 / (number of impulses sharing the same
        # mask); it is meant to normalise the loss.
        mask_response, class_response, freq_normalization = \
            self.map_impulse_response(base_impulse, instance_masks, class_ids)

        # Scale to [0, 1], normalise per channel, then go channels first.
        img = (img / 255 - config.MEAN_PIXEL) / config.STD_PIXEL
        img = np.moveaxis(img, 2, 0)

        outputs = [img, mask_response, class_response, base_impulse,
                   freq_normalization]
        return tuple(
            torch.from_numpy(np.array(item).astype(np.float32))
            for item in outputs)

    def _add_background(self, instance_masks, class_ids, image_hw):
        """Prepend the background mask (pixels no instance covers) and class 0.

        An image without annotations yields an all-ones background mask
        instead of failing on an empty, one-dimensional mask array.
        """
        masks = np.asarray(instance_masks)
        if masks.size == 0:
            masks = np.zeros((0,) + tuple(image_hw), dtype=np.uint8)
        bg_mask = np.where(np.sum(masks, 0, keepdims=True) == 0, 1, 0)
        masks = np.concatenate([bg_mask, masks], 0)
        ids = np.concatenate([[0], np.asarray(class_ids).reshape(-1)], 0)
        return masks, ids.astype(np.int64)

    def all_pairs_iou(self, a, b):
        """IoU between every mask in ``a`` and every mask in ``b``.

        Not a fast implementation: it loops over ``a``.  Intersections that
        do not exceed ``config.MIN_INTERSECTION`` pixels are treated as zero.

        Args:
            a: binary masks, shape [n, w, h].
            b: binary masks, shape [m, w, h].

        Returns:
            Array of shape [n, m].
        """
        min_intersection = self.config.MIN_INTERSECTION
        iou = np.zeros((a.shape[0], b.shape[0]))
        for i in range(a.shape[0]):
            m = np.expand_dims(a[i], 0)
            intersection = np.sum(m * b, (1, 2))
            intersection = (intersection > min_intersection) * intersection
            union = np.sum(b + m, (1, 2)) - intersection
            # Two empty masks have a zero union; report IoU 0, not NaN.
            iou[i, :] = np.divide(
                intersection, union,
                out=np.zeros(b.shape[0]), where=union > 0)
        return iou

    def map_impulse_response(self, base_impulse, instance_masks, class_ids):
        """Assign to each impulse the mask/class with the highest IoU.

        Returns:
            ``(mask_response, class_response, freq_normalization)`` with one
            entry per impulse; ``freq_normalization[i]`` is the reciprocal of
            the number of impulses mapped to the same mask as impulse ``i``.
        """
        scores = self.all_pairs_iou(base_impulse, instance_masks)
        ids = np.argmax(scores, -1)
        mask_response = instance_masks[ids]
        class_response = class_ids[ids]

        counts = np.bincount(ids, minlength=instance_masks.shape[0])
        freq_normalization = 1.0 / counts[ids]

        return mask_response, class_response, freq_normalization

    def make_base_impulse(self):
        """Create the grid of impulse masks, evenly spread over the image.

        The image is divided into a GRID_SHAPE x GRID_SHAPE grid (a
        heuristic).  With ``d = w // (2 * g)`` the impulse centres run from
        ``d`` to ``(2g - 1) * d`` in steps of ``2 * d``; each impulse is a
        rectangle of IMPULSE_SHAPE pixels clipped to the image.

        Returns:
            Array of shape [g * g, w, h] holding ones inside each impulse.
        """
        config = self.config
        g = config.GRID_SHAPE
        w, h = config.WIDTH, config.HEIGHT
        dw, dh = w // (2 * g), h // (2 * g)

        base_impulse = np.zeros((g * g, w, h))
        dx, dy = config.IMPULSE_SHAPE[0] // 2, config.IMPULSE_SHAPE[1] // 2

        for i in range(g):
            for j in range(g):
                x, y = dw * (2 * i + 1), dh * (2 * j + 1)
                lx, ly = max(0, x - dx), max(0, y - dy)
                rx, ry = min(x + dx, w), min(y + dy, h)
                base_impulse[g * i + j][lx:rx, ly:ry] = 1
        return base_impulse

    def resize_image(self, img, size, mode):
        """Fit an image/mask into ``size`` keeping its aspect ratio.

        The result is centred on a black canvas of exactly ``size``.  Images
        use bicubic and masks nearest-neighbour interpolation.

        NOTE(review): this relies on ``Image.thumbnail``, which per the
        Pillow documentation never enlarges; smaller inputs would only be
        padded -- confirm this is intended.
        """
        interpolation = {"RGB": Image.BICUBIC, "L": Image.NEAREST}[mode]
        img_obj = Image.fromarray(img.astype(np.uint8), mode)
        img_obj.thumbnail(size, interpolation)

        (w, h) = img_obj.size
        padded_img = Image.new(mode, size, "black")
        padded_img.paste(img_obj, ((size[0] - w) // 2, (size[1] - h) // 2))

        return np.array(padded_img)

    def load_data(self, index):
        """Read one image and decode its instance masks to numpy arrays.

        Only non-crowd annotations with an area of at least
        ``config.MIN_PIXELS`` are requested.

        Returns:
            ``(img, instance_masks, class_ids)``: the image is channels
            last, the masks are stacked on axis 0 with shape (0, h, w) when
            the image has no usable annotation.
        """
        coco = self.coco
        config = self.config

        img_id = self.ids[index]
        ann_ids = coco.getAnnIds(
            imgIds=img_id,
            areaRng=[config.MIN_PIXELS, float("inf")],
            iscrowd=False)
        anns = coco.loadAnns(ann_ids)
        path = coco.loadImgs(img_id)[0]['file_name']

        img = Image.open(os.path.join(self.root, path)).convert('RGB')
        w, h = img.size
        instance_masks = [self.annToMask(ann, h, w) for ann in anns]
        class_ids = [self.catMap[ann['category_id']] for ann in anns]

        if instance_masks:
            masks = np.array(instance_masks)
        else:
            # Keep a well-formed (0, h, w) stack for images with no instances.
            masks = np.zeros((0, h, w), dtype=np.uint8)
        return np.array(img), masks, np.array(class_ids, dtype=np.int64)

    def __len__(self):
        return len(self.ids)

    def __repr__(self):
        """Summarise the dataset; absent transforms are reported as None."""
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        fmt_str += '    Root Location: {}\n'.format(self.root)
        # This class defines no transforms, so read them defensively instead
        # of raising AttributeError from repr().
        tmp = '    Transforms (if any): '
        transform = getattr(self, 'transform', None)
        fmt_str += '{0}{1}\n'.format(
            tmp, transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        tmp = '    Target Transforms (if any): '
        target_transform = getattr(self, 'target_transform', None)
        fmt_str += '{0}{1}'.format(
            tmp,
            target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
        return fmt_str

    def annToRLE(self, ann, h, w):
        """
        Convert annotation which can be polygons, uncompressed RLE to RLE.
        :return: RLE encoding of the annotation's segmentation
        """
        segm = ann['segmentation']
        if isinstance(segm, list):
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif isinstance(segm['counts'], list):
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # already an RLE
            rle = ann['segmentation']
        return rle

    def annToMask(self, ann, h, w):
        """
        Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann, h, w)
        return maskUtils.decode(rle)

In [5]:
# Build the configuration, the validation dataset and its batch loader.
config = ProposalConfig()
val_dataset = CocoDetection(root=val_dir, annFile=val_ann, config=config)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=config.NUM_WORKERS,
    pin_memory=config.PIN_MEMORY,
)
# train_dataset = CocoDetection(train_dir,train_ann)

loading annotations into memory...
Done (t=0.59s)
creating index...
index created!


In [6]:
# imgs = next(q)[0]
# print(imgs.shape)
# Image.fromarray(np.array(imgs[0]).astype(np.uint8),"RGB").show()

In [7]:
# Makeshift modification of VGG-16.
# More exotic variations are postponed.


class split_conv(nn.Module):
    """Two parallel 3x3 convolutions whose outputs are stacked on channels.

    ``ignore_filters`` produces ``cur_out`` channels and ``copy_filters``
    produces ``d_out`` channels from the same input; the output has
    ``cur_out + d_out`` channels with the ``ignore_filters`` part first.
    """

    def __init__(self, in_features, cur_out, d_out):
        super(split_conv, self).__init__()
        # Same-padding 3x3 kernels, so the spatial size is preserved.
        self.ignore_filters = nn.Conv2d(
            in_features, cur_out, kernel_size=3, padding=1)
        self.copy_filters = nn.Conv2d(
            in_features, d_out, kernel_size=3, padding=1)

    def forward(self, x):
        branches = (self.ignore_filters(x), self.copy_filters(x))
        return torch.cat(branches, dim=1)


class split_vgg16_features(nn.Module):
    """VGG-16 convolutional stack widened by ``d_in`` extra channels.

    Every convolution is a ``split_conv`` that takes the regular feature
    channels plus ``d_in`` extra channels and emits the regular VGG-16
    channel count plus ``d_in`` channels again.  The five stages have
    2, 2, 3, 3 and 3 convolutions, each stage followed by a 2x2 max-pool.

    Args:
        pre_trained_weights: when truthy, initialise from torchvision's
            pre-trained VGG-16 (see ``init_weights``).
        d_in: number of extra channels carried next to the image features.
    """

    def __init__(self, pre_trained_weights, d_in):
        super(split_vgg16_features, self).__init__()
        self.d_in = d_in
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.layer1 = self._make_stage(3, 64, 2)
        self.layer2 = self._make_stage(64, 128, 2)
        self.layer3 = self._make_stage(128, 256, 3)
        self.layer4 = self._make_stage(256, 512, 3)
        self.layer5 = self._make_stage(512, 512, 3)

        # initialize with vgg weights
        if pre_trained_weights:
            self.init_weights()

    def _make_stage(self, in_channels, out_channels, num_convs):
        """Build one stage: ``num_convs`` x (split_conv, shared ReLU)."""
        layers = []
        for _ in range(num_convs):
            layers.append(
                split_conv(in_channels + self.d_in, out_channels, self.d_in))
            layers.append(self.relu)
            in_channels = out_channels
        return nn.Sequential(*layers)

    def _stages(self):
        """The five stages, shallowest first."""
        return (self.layer1, self.layer2, self.layer3, self.layer4,
                self.layer5)

    def forward(self, x):
        """Run all stages.

        Returns:
            ``(x, outs)``: ``x`` is the output after the final pooling and
            ``outs`` lists each stage's output before it is pooled
            (layer1 first).
        """
        outs = []
        for stage in self._stages():
            x = stage(x)
            outs.append(x)
            x = self.pool(x)
        return x, outs

    def init_weights(self):
        """Initialise every ``split_conv`` from pre-trained VGG-16.

        For the i-th convolution:

        * ``ignore_filters`` receives the VGG weights for the regular input
          channels and zeros for the ``d_in`` extra channels, plus the VGG
          bias.
        * ``copy_filters`` receives zeros for the regular input channels and
          an identity over the extra channels spread uniformly over the
          kernel window (divided by the window size), further divided by a
          random factor drawn from ``[1, 1.1]``; its bias is zero.

        Values are copied in place under ``torch.no_grad()`` so the existing
        Parameters are kept, nothing aliases the temporary VGG model, and a
        shape mismatch raises instead of silently replacing a Parameter.

        Raises:
            ValueError: if the number of VGG convolutions differs from the
                number of ``split_conv`` layers.
        """
        vgg = models.vgg16(pretrained=True)
        vgg_convs = [
            m for m in vgg.features.children() if isinstance(m, nn.Conv2d)
        ]
        own_convs = [
            m for stage in self._stages() for m in stage.children()
            if isinstance(m, split_conv)
        ]
        if len(vgg_convs) != len(own_convs):
            raise ValueError(
                "VGG provides {} convolutions but the model has {}".format(
                    len(vgg_convs), len(own_convs)))

        d_in = self.d_in
        decay = 1.1

        with torch.no_grad():
            for src, dst in zip(vgg_convs, own_convs):
                cur_out, cur_in, k_h, k_w = src.weight.shape
                fan_in = k_h * k_w

                # ignore_filters: [cur_out, cur_in + d_in, k_h, k_w]
                extra_in = torch.zeros(cur_out, d_in, k_h, k_w)
                dst.ignore_filters.weight.copy_(
                    torch.cat([src.weight, extra_in], 1))
                dst.ignore_filters.bias.copy_(src.bias)

                # copy_filters: [d_in, cur_in + d_in, k_h, k_w]
                image_in = torch.zeros(d_in, cur_in, k_h, k_w)
                identity = torch.eye(d_in).unsqueeze(-1).unsqueeze(-1)
                identity = identity.repeat([1, 1, k_h, k_w])
                identity = identity / fan_in / random.uniform(1, decay)
                dst.copy_filters.weight.copy_(
                    torch.cat([image_in, identity], 1))
                dst.copy_filters.bias.zero_()

In [25]:
class MaskProp(nn.Module):
    """Decoder that merges the encoder output with its five skip tensors.

    Each step upsamples the running tensor by 2, concatenates the matching
    skip tensor on the channel axis and applies conv 3x3 -> batch norm
    (-> ReLU, except for the last step).  The output has ``d_in`` channels.

    Args:
        init_weights: when truthy, Xavier-initialise the convolution weights.
        d_in: number of extra channels carried next to the VGG features.
    """

    def __init__(self, init_weights, d_in):
        super(MaskProp, self).__init__()
        self.relu = nn.ReLU(inplace=True)
        self.upsample = nn.Upsample(scale_factor=2)
        self.layer5 = self._fusion_stage(
            512 + d_in + 512 + d_in, 512 + d_in)
        self.layer4 = self._fusion_stage(
            512 + d_in + 512 + d_in, 512 + d_in)
        self.layer3 = self._fusion_stage(
            512 + d_in + 256 + d_in, 256 + d_in)
        self.layer2 = self._fusion_stage(
            256 + d_in + 128 + d_in, 128 + d_in)
        # The last stage has no ReLU.
        self.layer1 = self._fusion_stage(
            128 + d_in + 64 + d_in, d_in, activation=False)

        if init_weights:
            for name, child in self.named_children():
                # The previous test, `name[:-1] == 'layer' or 'mask_layer'`,
                # was always true because the bare string is truthy.
                if name[:-1] in ('layer', 'mask_layer'):
                    for gc in child.children():
                        if isinstance(gc, nn.Conv2d):
                            nn.init.xavier_uniform_(gc.weight)

    def _fusion_stage(self, in_channels, out_channels, activation=True):
        """conv 3x3 (same padding) -> batch norm [-> shared ReLU]."""
        layers = [
            nn.Conv2d(in_channels, out_channels, (3, 3), padding=(1, 1)),
            nn.BatchNorm2d(out_channels),
        ]
        if activation:
            layers.append(self.relu)
        return nn.Sequential(*layers)

    def forward(self, x):
        """Decode ``x = (c, m)``.

        Args:
            x: pair ``(c, m)`` where ``c`` is the deepest feature map and
                ``m`` holds the five skip tensors ordered ``l1 .. l5``
                (shallowest first).

        Returns:
            Tensor with ``d_in`` channels at the resolution of ``l1``.
        """
        c, m = x
        l1, l2, l3, l4, l5 = m

        # nn.Upsample replaces the deprecated F.upsample; both use the same
        # default (nearest) interpolation.
        y = self.upsample(c)
        y = self.layer5(torch.cat([y, l5], 1))

        for stage, skip in ((self.layer4, l4), (self.layer3, l3),
                            (self.layer2, l2), (self.layer1, l1)):
            y = self.upsample(y)
            y = stage(torch.cat([y, skip], 1))

        return y


# classifier takes a single level features and classifies


class Classifier(nn.Module):
    """Single 3x3 convolution + ReLU applied to one level of features.

    Maps ``512 + d_in`` input channels to 512 output channels and keeps the
    spatial size.

    Args:
        init_weights: when truthy, Xavier-initialise the convolution weight.
        d_in: number of extra channels appended to the 512 feature channels.
    """

    def __init__(self, init_weights, d_in):
        super(Classifier, self).__init__()
        in_channels = 512 + d_in
        self.conv1 = nn.Conv2d(in_channels, 512, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        if init_weights:
            nn.init.xavier_uniform_(self.conv1.weight)

    def forward(self, x):
        return self.relu(self.conv1(x))


class SingleHGModel(nn.Module):
    """Encoder (split VGG-16) with a mask decoder and a class head.

    Args:
        d_in: number of impulse channels concatenated to the RGB image.
    """

    def __init__(self, d_in):
        super(SingleHGModel, self).__init__()
        self.vgg0 = split_vgg16_features(pre_trained_weights=True, d_in=d_in)
        self.mp0 = MaskProp(init_weights=True, d_in=d_in)
        self.class_predictor = Classifier(init_weights=True, d_in=d_in)

    def forward(self, x):
        """Run the model on ``x = (image, impulse)``.

        Returns:
            ``(c, m0)``: the class-head output and the mask-decoder output.
        """
        image, impulse = x

        # Stack the image and the impulses on the channel axis.
        stacked = torch.cat((image, impulse), dim=1)
        class_features, mask_features = self.vgg0(stacked)

        class_out = self.class_predictor(class_features)
        mask_out = self.mp0([class_features, mask_features])
        return class_out, mask_out

In [None]:
# Smoke test: build the model and push one validation batch through it.
# The dataset emits one impulse channel per grid cell, so d_in is derived
# from the config (GRID_SHAPE ** 2 == 121 for the default 11 x 11 grid)
# rather than hard-coded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = SingleHGModel(d_in=config.GRID_SHAPE ** 2)
net = net.to(device)
img, masks, class_ids, base_impulse, freq_normalization = [
    v.to(device) for v in next(iter(val_loader))
]
res = net([img, base_impulse])