In [None]:
import os
import sys
import collections
import itertools
from tqdm import tqdm
import numpy as np
import pandas as pd
from PIL import Image

In [None]:
!pip install -U torchvision
!pip install pycocotools

In [None]:
!cp -r ../input/vision-references-detection/* ./

In [None]:
import torch
import torchvision
from engine import train_one_epoch
import transforms as T
import utils
from torchvision import transforms
from torchvision.transforms import ToTensor
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

In [None]:
def rle_decode(mask_rle, shape):
    """Decode a run-length-encoded string into a binary 2-D mask.

    Args:
        mask_rle: Space-separated "start length start length ..." string.
            Each start is decremented by one before use, i.e. it is read as a
            1-based position in the column-major flattened mask.
        shape: (height, width) of the mask to produce.

    Returns:
        np.uint8 array of shape (height, width) holding 1 inside the encoded
        runs and 0 everywhere else.
    """
    n_rows, n_cols = shape[0], shape[1]
    tokens = mask_rle.split()
    # Even tokens are run starts, odd tokens are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    flat = np.zeros(n_cols * n_rows, dtype=np.uint8)
    for begin, length in zip(run_starts, run_lengths):
        flat[begin:begin + length] = 1
    # Runs are laid out column by column: fill a (width, height) grid, then transpose.
    return flat.reshape((n_cols, n_rows)).T

In [None]:
# Convert a flat binary mask to run-length encoding
def to_rle(bits):
    """Run-length encode a flat sequence of mask values.

    Args:
        bits: Iterable of pixel values; any truthy value counts as "set".

    Returns:
        Flat list ``[start, length, start, length, ...]`` of plain ints with one
        pair per run of set pixels. Starts are 1-based, which is the convention
        ``rle_decode`` in this notebook reads (it subtracts one from each start).
    """
    rle = []
    pos = 0
    # Group on truthiness so a run is defined by "set / not set", not by the raw value.
    for is_set, group in itertools.groupby(bits, key=bool):
        # Count elements rather than summing them: the length stays an int even
        # for float masks and is correct for masks that use values other than 1.
        run_length = sum(1 for _ in group)
        if is_set:
            rle.extend([pos + 1, run_length])
        pos += run_length
    return rle

In [None]:
def refine_masks(masks, labels):
    """Remove same-label overlaps between masks and run-length encode them.

    Masks are visited from the smallest area to the largest; each one loses the
    pixels already claimed by earlier masks carrying the same label. ``masks``
    is modified in place.

    Args:
        masks: Array indexed as ``masks[:, :, m]``, one binary mask per slice.
        labels: Sequence holding the label of each mask, aligned with the last
            axis of ``masks``.

    Returns:
        List with one ``[mask, rle, label - 1]`` entry per mask, in the original
        mask order, where ``rle`` encodes the column-major flattened mask.
    """
    n_masks = masks.shape[-1]
    # Pixel count of every mask
    areas = masks.reshape(-1, n_masks).sum(axis=0)

    # Per-label record of the pixels that are already taken
    claimed = {}
    for cls in np.unique(labels):
        claimed[cls] = np.zeros(masks.shape[:-1], dtype=bool)

    # Smallest first, so small masks keep their pixels and larger ones give way
    for k in np.argsort(areas):
        cls = labels[k]
        masks[:, :, k] = np.logical_and(masks[:, :, k], ~claimed[cls])
        claimed[cls] = np.logical_or(masks[:, :, k], claimed[cls])

    # Emit the results in the original mask order
    return [
        [masks[:, :, k], to_rle(masks[:, :, k].ravel(order='F')), labels[k] - 1]
        for k in range(n_masks)
    ]

Train data

In [None]:
# Read the annotation table, keep the rows flagged as 'train' with only the
# columns used downstream, then cap the row count at 20% of the length of the
# full (unfiltered) CSV.
train = pd.read_csv("../input/preprocessing/train_df_truncated.csv")
train = (
    train.loc[train['dataset'] == 'train',
              ['ImageId', 'EncodedPixels', 'Height', 'Width', 'Category']]
    .iloc[:int(len(train) * 0.2)]
)

Dataset class

In [None]:
class FashionDataset(object):
    """Instance-segmentation dataset built from a per-annotation DataFrame.

    Rows of ``df`` are grouped by ``ImageId``; every item yields the resized RGB
    image together with a target dict (``boxes``, ``labels``, ``masks``,
    ``image_id``, ``area``, ``iscrowd``).

    Args:
        image_dir: Directory that contains the image files named by ``ImageId``.
        df: DataFrame with the columns ``ImageId``, ``EncodedPixels``,
            ``Height``, ``Width`` and ``Category``.
        height: Height every image and mask is resized to.
        width: Width every image and mask is resized to.
        transforms: Optional callable ``(img, target) -> (img, target)``.
    """

    def __init__(self, image_dir, df, height, width, transforms=None):
        self.image_dir = image_dir
        self.df = df
        self.transforms = transforms
        self.height = height
        self.width = width
        # aggregated images info, keyed by a running integer index
        self.image_info = collections.defaultdict(dict)

        # Select groupby columns with a list: a bare tuple of column names is
        # rejected by recent pandas releases.
        temp_df = self.df.groupby('ImageId')[['EncodedPixels', 'Category']].agg(list).reset_index()
        size_df = self.df.groupby('ImageId')[['Height', 'Width']].mean().reset_index()
        temp_df = temp_df.merge(size_df, on='ImageId', how='left')

        for index, row in temp_df.iterrows():
            image_id = row['ImageId']
            self.image_info[index] = {
                "image_id": image_id,
                "image_path": os.path.join(self.image_dir, image_id),
                "labels": row["Category"],
                "width": self.width,
                "height": self.height,
                # .mean() yields floats; store ints so the values can be used
                # as array dimensions when the masks are decoded.
                "orig_height": int(row["Height"]),
                "orig_width": int(row["Width"]),
                "annotations": row["EncodedPixels"],
            }

    def __getitem__(self, idx):
        """Return ``(img, target)`` for the image stored under index ``idx``.

        Raises:
            IndexError: If ``idx`` is not a known image index.
        """
        # image_info is a defaultdict: test membership first so an unknown index
        # neither inserts an empty entry nor surfaces as a KeyError.
        if idx not in self.image_info:
            raise IndexError("dataset index {} is out of range".format(idx))
        info = self.image_info[idx]

        img = Image.open(info["image_path"]).convert("RGB")
        img = img.resize((self.width, self.height), resample=Image.BILINEAR)

        # Decode every annotation to a (height, width) uint8 mask at the target size.
        decoded = []
        for annotation, label in zip(info['annotations'], info['labels']):
            sub_mask = rle_decode(annotation, (info['orig_height'], info['orig_width']))
            sub_mask = Image.fromarray(sub_mask)
            sub_mask = sub_mask.resize((self.width, self.height), resample=Image.BILINEAR)
            # Labels are shifted up by one so that 0 stays free for the fallback below.
            decoded.append((np.asarray(sub_mask, dtype=np.uint8), int(label) + 1))

        boxes = []
        new_labels = []
        new_masks = []
        for sub_mask, label in decoded:
            rows, cols = np.nonzero(sub_mask)
            if rows.size == 0:
                # Nothing left of this mask after resizing.
                continue
            xmin, xmax = cols.min(), cols.max()
            ymin, ymax = rows.min(), rows.max()
            # Keep only objects whose box is at least 20 pixels in both directions.
            if xmax - xmin >= 20 and ymax - ymin >= 20:
                boxes.append([xmin, ymin, xmax, ymax])
                new_labels.append(label)
                new_masks.append(sub_mask)

        if len(new_labels) == 0:
            # No usable object: emit one dummy 20x20 box with label 0.
            boxes.append([0, 0, 20, 20])
            new_labels.append(0)
            if decoded:
                new_masks.append(decoded[0][0])
            else:
                new_masks.append(np.zeros((self.height, self.width), dtype=np.uint8))

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(new_labels, dtype=torch.int64)
        masks = torch.as_tensor(np.stack(new_masks), dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # One flag per *kept* object so every target field has the same length.
        iscrowd = torch.zeros((len(new_labels),), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Number of distinct images in the dataset."""
        return len(self.image_info)

Loading model

In [None]:
def get_model_instance_segmentation(num_classes):
    """Build a Mask R-CNN (ResNet-50 FPN) whose heads predict ``num_classes`` classes.

    The model is created with ``pretrained=True``; its box predictor and mask
    predictor are then replaced by new ones sized for ``num_classes``.

    Args:
        num_classes: Number of output classes for both replaced heads.

    Returns:
        The model with both predictors replaced.
    """
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    heads = model.roi_heads

    # Box head: reuse the input width of the existing classifier layer.
    box_in_features = heads.box_predictor.cls_score.in_features
    heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Mask head: same idea, with a 256-channel hidden layer.
    mask_in_channels = heads.mask_predictor.conv5_mask.in_channels
    heads.mask_predictor = MaskRCNNPredictor(mask_in_channels, 256, num_classes)

    return model

In [None]:
def get_transform(train):
    """Compose the image/target transforms.

    Args:
        train: When truthy, a random horizontal flip (probability 0.5) is
            appended after the tensor conversion.

    Returns:
        A ``T.Compose`` over the selected transforms.
    """
    steps = [T.ToTensor()]
    if train:
        steps.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(steps)


Training
(resnet50 backbone)

In [None]:
# 46 dataset categories plus one extra index (the dataset class shifts every
# label up by one, which leaves label 0 unused by real objects)
num_classes = 1 + 46
# run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Sanity check: the cell output shows whether a CUDA device is usable in this session.
torch.cuda.is_available()

Define dataset and dataloader

In [None]:
# Training dataset: every image and mask is resized to 512x512, with the
# training-time transforms enabled.
dataset_train = FashionDataset(
    image_dir="../input/imaterialist-fashion-2019-FGVC6/train/",
    df=train,
    height=512,
    width=512,
    transforms=get_transform(train=True),
)

# Shuffled batches of two images; utils.collate_fn assembles each batch.
data_loader = torch.utils.data.DataLoader(
    dataset_train,
    collate_fn=utils.collate_fn,
    batch_size=2,
    shuffle=True,
    num_workers=8,
)

In [None]:
# Build the model and move it to the selected device
model_ft = get_model_instance_segmentation(num_classes)
model_ft.to(device)

# Optimise only the parameters that require gradients
params = [weight for weight in model_ft.parameters() if weight.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0005)

# Multiply the learning rate by 0.1 every 5 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
num_epochs = 1

for epoch in range(num_epochs):
    train_one_epoch(model_ft, optimizer, data_loader, device, epoch, print_freq=10)
    # advance the schedule once per finished epoch
    lr_scheduler.step()

In [None]:
# Persist only the weights (state dict) of the fine-tuned model to the working directory.
torch.save(model_ft.state_dict(), "model_resnet.bin")

Evaluation

In [None]:
# Same annotation table as for training: keep the rows flagged as 'test',
# renumber them (the previous index is kept as an 'index' column), then cap the
# row count at 10% of the length of the full (unfiltered) CSV.
test = pd.read_csv("../input/preprocessing/train_df_truncated.csv")
test = (
    test.loc[test['dataset'] == 'test',
             ['ImageId', 'EncodedPixels', 'Height', 'Width', 'Category']]
    .reset_index()
    .iloc[:int(len(test) * 0.1)]
)

In [None]:
# Directory the evaluation images are read from
img_path = "../input/imaterialist-fashion-2019-FGVC6/train/"

# Freeze every weight: no gradients are needed from here on
for weight in model_ft.parameters():
    weight.requires_grad = False

# Switch the model to evaluation mode
model_ft.eval()

In [None]:
# Collect one [ImageId, run-length string, class] row per predicted mask.
sub_list = []
missing_count = 0  # images for which no detection passed the score threshold
submission = []
ctr = 0

tt = ToTensor()

# `test` is sliced from the same per-annotation CSV as the training data, so an
# ImageId can occur on several rows; run the model once per distinct image.
image_ids = test['ImageId'].drop_duplicates().tolist()

for image_id in tqdm(image_ids, total=len(image_ids)):
    # Build the file path in its own variable. Reassigning `img_path` here would
    # append each file name to the previous iteration's file path.
    image_file = os.path.join(img_path, image_id)
    img = Image.open(image_file).convert("RGB")
    img = img.resize((512, 512), resample=Image.BILINEAR)
    img = tt(img)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        result = model_ft([img.to(device)])[0]

    lbls = result['labels'].cpu().numpy()
    scores = result['scores'].cpu().numpy()

    # Number of detections scoring above 0.8. Like the slicing below, this
    # assumes detections come back ordered by descending score.
    best_idx = int((scores > 0.8).sum())

    if best_idx == 0:
        # Nothing confident enough: emit a placeholder row (one-pixel run, class 23).
        sub_list.append([image_id, '1 1', 23])
        missing_count += 1
        continue

    # result["masks"] holds one soft mask per detection at the resolution of the
    # input image, so threshold it directly instead of going through PIL.
    soft_masks = result["masks"][:best_idx, 0].cpu().numpy()
    masks = np.transpose(soft_masks >= 0.5, (1, 2, 0)).astype(np.uint8)

    refined = refine_masks(masks, lbls[:best_idx])
    for _, rle, label in refined:
        sub_list.append([image_id, ' '.join(map(str, rle)), label])

In [None]:
# Tabulate the collected rows (columns: image id, run-length string, class).
submission_df = pd.DataFrame(sub_list)
# Show a preview only: echoing all of sub_list would print every run-length
# string into the cell output.
submission_df.head()