In [1]:
import os

import torch
# Print the installed PyTorch version so the environment this notebook ran in is recorded.
print(torch.__version__)

1.5.0


In [2]:
import numpy as np
from PIL import Image

class PennFudanDataset(object):
    """Map-style dataset over a directory of images and instance masks.

    ``root`` must contain two sub-directories, ``PNGImages`` and ``PedMasks``.
    Both listings are sorted and paired by position, so the i-th image is
    assumed to belong to the i-th mask (NOTE(review): nothing here verifies
    that the two directories have matching contents).

    Args:
        root: Path of the dataset directory.
        transforms: Callable ``(img, target) -> (img, target)`` applied to
            every sample, or ``None`` to return the sample unchanged.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # Sort both listings so index i refers to the same sample in each list.
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    def __getitem__(self, idx):
        """Load sample ``idx`` and build its detection/segmentation target.

        Returns:
            ``(img, target)`` where ``img`` is the RGB image (after
            ``transforms``, if any) and ``target`` is a dict with the keys
            ``boxes`` (float32, ``[N, 4]`` as ``xmin, ymin, xmax, ymax``),
            ``labels`` (int64, ``[N]``, all ones), ``masks`` (uint8,
            ``[N, H, W]``), ``image_id``, ``area`` and ``iscrowd``.
            ``N`` is the number of distinct non-zero values in the mask and
            may be 0.
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # The mask is deliberately NOT converted to RGB: each distinct pixel
        # value identifies one object instance.
        mask = np.array(Image.open(mask_path))

        # Every distinct value is one instance id; 0 is the background.
        # Filtering by value (instead of dropping the first sorted id) keeps
        # every instance even when a mask contains no background pixel.
        obj_ids = np.unique(mask)
        obj_ids = obj_ids[obj_ids != 0]

        # One boolean mask per instance, shape [N, H, W].
        masks = mask == obj_ids[:, None, None]

        # Tight bounding box around each instance mask (inclusive pixel
        # indices, same convention as before).
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            rows, cols = np.where(masks[i])
            boxes.append([int(cols.min()), int(rows.min()),
                          int(cols.max()), int(rows.max())])

        # reshape(-1, 4) keeps the tensor 2-D when there are no instances, so
        # the column indexing used for `area` below still works.
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        # Only one foreground class exists in this example.
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # All instances are assumed to satisfy iscrowd == False.
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of entries found in ``PNGImages``."""
        return len(self.imgs)

In [3]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes, pretrained=True, hidden_layer=256):
    """Build a Mask R-CNN (ResNet-50 FPN) whose heads predict ``num_classes``.

    Args:
        num_classes: Number of output classes for the new box and mask heads.
        pretrained: Forwarded to ``maskrcnn_resnet50_fpn``; ``True`` (the
            default, as before) requests the pre-trained weights. Pass
            ``False`` when the weights are about to be replaced by a
            checkpoint anyway.
        hidden_layer: Second argument passed to ``MaskRCNNPredictor`` for the
            new mask head (previously fixed at 256).

    Returns:
        The model with ``roi_heads.box_predictor`` and
        ``roi_heads.mask_predictor`` replaced by freshly created heads.
    """
    # Load an instance segmentation model (pre-trained unless disabled).
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=pretrained)

    # Replace the box head with a new one sized for `num_classes`, keeping the
    # input width the existing classifier expects.
    box_in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(box_in_features, num_classes)

    # Same for the mask head: reuse its input channel count.
    mask_in_channels = model.roi_heads.mask_predictor.conv5_mask.in_channels
    model.roi_heads.mask_predictor = MaskRCNNPredictor(mask_in_channels,
                                                       hidden_layer,
                                                       num_classes)

    return model

In [4]:
from references.detection import transforms as T

def get_transform(train):
    """Return the composed sample transform.

    ``ToTensor`` is always applied; when ``train`` is truthy a
    ``RandomHorizontalFlip(0.5)`` is appended after it.
    """
    # NOTE(review): the original author recorded a
    # `TypeError: img should be PIL Image. Got <class 'torch.Tensor'>` tied to
    # the relative order of ToTensor and RandomHorizontalFlip. The order used
    # here is ToTensor first; confirm against `references.detection.transforms`
    # before reordering.
    steps = [T.ToTensor()]
    if train:
        steps += [T.RandomHorizontalFlip(0.5)]
    return T.Compose(steps)

In [5]:
from references.detection import utils

# Inference runs on the CPU.
device = torch.device('cpu')
# The dataset emits a single foreground label (1); presumably index 0 is the
# background class, giving two classes in total.
num_classes = 2
dataset_test = PennFudanDataset('data/PennFudanPed', get_transform(train=False))
data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, shuffle=False, num_workers=0,
        collate_fn=utils.collate_fn)
model = get_model_instance_segmentation(num_classes)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU run still loads on a CPU-only machine.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load('demo_e20.pth', map_location=device)
model.load_state_dict(state_dict)
model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d()
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d()
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d()
          (relu): ReLU(inplace=True)
          (downsample): Sequentia

In [6]:
# Take the first batch from the test loader; the targets are not needed here.
images, _ = next(iter(data_loader_test))
images = list(images)
# Inference only: skip autograd bookkeeping so no graph is kept in memory.
with torch.no_grad():
    predictions = model(images)

In [7]:
from torchvision.utils import save_image

# Make sure the output directory exists before writing into it.
os.makedirs('data/demo', exist_ok=True)
save_image(images, 'data/demo/demo_image_1.jpg')
masks = predictions[0]['masks']
# Merge the per-instance masks into one image by summing over the instance
# axis (assumes masks is [N, 1, H, W] as documented for torchvision's
# Mask R-CNN — TODO confirm). This replaces `reduce(torch.add, masks)`, which
# raised NameError here because `reduce` had not been imported.
mask = masks.sum(dim=0)
save_image(mask, 'demo_masks.jpg')

In [19]:
from PIL import Image
# Read the previously saved demo image back from disk. The context manager
# closes the file handle once the pixels have been copied into the array.
with Image.open('data/demo/demo_image_1.jpg') as pil_image:
    # Scale 8-bit pixel values into [0, 1] as float32.
    image = np.array(pil_image, dtype=np.float32) / 255
# Move the last axis to the front (HWC -> CHW, assuming an RGB image) and wrap
# it in a list, which is the input form the model is called with.
images = [torch.from_numpy(image).permute((2, 0, 1))]

In [20]:
# Inference only: skip autograd bookkeeping so no graph is kept in memory.
with torch.no_grad():
    predictions = model(images)

In [21]:
from functools import reduce
masks = predictions[0]['masks']
# Sum over the instance axis to merge all predicted masks into one image.
# Unlike `reduce(torch.add, masks)`, this also works when the model returns
# zero detections (reduce raises TypeError on an empty sequence).
mask = masks.sum(dim=0)
save_image(mask, 'data/demo/demo_masks_1.jpg')