In [None]:
import os
from glob import glob

import numpy as np
import torch
import torchvision
from pycocotools.coco import COCO

from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F


class M18KDataset(torch.utils.data.Dataset):
    """Instance-segmentation dataset backed by a COCO-format annotation file.

    Every sample is an ``(image, target)`` pair. ``target`` is a dict with the
    keys ``boxes``, ``masks``, ``labels``, ``image_id``, ``area`` and
    ``iscrowd``, all derived from the annotations of that image.

    Args:
        root: Directory containing the image files and
            ``_annotations.coco.json``.
        transforms: Optional callable invoked as ``transforms(img, target)``
            and expected to return the transformed pair. ``None`` disables it.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        self.annotations = COCO(os.path.join(root, "_annotations.coco.json"))
        # ``COCO.imgs`` is a dict keyed by image id, and ids are not guaranteed
        # to be 0..N-1. Keep an explicit position -> image id table so that
        # ``ds[i]`` is valid for every ``0 <= i < len(ds)``.
        self.image_ids = sorted(self.annotations.imgs.keys())

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair at position ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``range(-len(self), len(self))``.
        """
        # List indexing gives the standard IndexError for out-of-range access.
        image_id = self.image_ids[idx]
        image_object = self.annotations.imgs[image_id]

        img = read_image(os.path.join(self.root, image_object["file_name"]))
        anns = self.annotations.loadAnns(
            self.annotations.getAnnIds(imgIds=[image_object["id"]])
        )

        # Wrap the sample into a torchvision tv_tensor.
        img = tv_tensors.Image(img)

        if anns:
            # One binary mask per annotation, stacked to [#objects, h, w].
            # Uses the dataset's own COCO index, not a notebook-level global.
            binary_masks = torch.tensor(
                np.stack([self.annotations.annToMask(ann) for ann in anns]),
                dtype=torch.uint8,
            )
            # Tight bounding box around each mask, in XYXY order.
            boxes = masks_to_boxes(binary_masks)
        else:
            # Image without annotations: emit correctly shaped empty tensors
            # instead of failing while stacking an empty list.
            height, width = img.shape[-2:]
            binary_masks = torch.zeros((0, height, width), dtype=torch.uint8)
            boxes = torch.zeros((0, 4), dtype=torch.float32)

        # Per-instance metadata is read straight from the annotations.
        labels = torch.tensor([ann["category_id"] for ann in anns], dtype=torch.int64)
        area = torch.tensor([ann["area"] for ann in anns], dtype=torch.float32)
        iscrowd = torch.tensor([ann["iscrowd"] for ann in anns], dtype=torch.int64)

        target = {
            "boxes": tv_tensors.BoundingBoxes(
                boxes, format="XYXY", canvas_size=F.get_size(img)
            ),
            "masks": tv_tensors.Mask(binary_masks),
            "labels": labels,
            # The COCO image id; identical to ``idx`` when ids are 0..N-1.
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        """Return the number of images listed in the annotation file."""
        return len(self.image_ids)

# Smoke test: build the training split without transforms and fetch the
# first sample so it is displayed as the cell output.
ds = M18KDataset(root="M18K/Dataset/train", transforms=None)
ds[0]

In [None]:
# Stand-alone COCO index over the training annotations, used by the
# exploratory cells that follow.
annotations = COCO(
    os.path.join("M18K/Dataset/train", "_annotations.coco.json")
)

In [None]:
# Read the file of the image stored under key 0 of ``annotations.imgs``
# (requires the ``annotations`` index built in the previous cell) and
# display the resulting tensor.
img = read_image(
    os.path.join("M18K/Dataset/train", annotations.imgs[0]["file_name"])
)
img


In [None]:
import numpy as np
# All annotation records attached to the image stored under key 0.
masks = annotations.loadAnns(
    annotations.getAnnIds([annotations.imgs[0]["id"]])
)
# Rasterise each annotation, stack the masks along a trailing axis, then
# move that axis to the front to obtain [#objects, h, w].
binary_masks = torch.tensor(
    np.dstack([annotations.annToMask(ann) for ann in masks]),
    dtype=torch.uint8,
).permute([2, 0, 1])
binary_masks.shape

In [3]:
from M18K.Data.Dataset import M18KDataset
from M18K.Data.DataModule import M18KDataModule
from M18K.Models.TorchVision import MaskRCNN_ResNet50
from lightning import LightningModule, Trainer
from torchvision import transforms
from lightning.pytorch import loggers as pl_loggers
import argparse
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

In [None]:
from M18K.Data.Dataset import M18KDataset
from M18K.Data.DataModule import M18KDataModule
from M18K.Models.TorchVision import MaskRCNN_ResNet50
from lightning import LightningModule, Trainer
from torchvision import transforms
from lightning.pytorch import loggers as pl_loggers
import argparse
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

def main(model_name="resnet_18", max_epochs=100, batch_size=2):
    """Train ``MaskRCNN_ResNet50`` on the M18K data module.

    Checkpoints and TensorBoard logs are both written under
    ``runs/{model_name}/``.

    Args:
        model_name: Label used for the output directory and the checkpoint
            file names only. It does not select the architecture; the model
            is always ``MaskRCNN_ResNet50``.
        max_epochs: Upper bound on the number of training epochs.
        batch_size: Batch size handed to ``M18KDataModule``.
    """
    run_dir = f"runs/{model_name}/"

    # Data and model.
    dm = M18KDataModule(batch_size=batch_size)
    model = MaskRCNN_ResNet50()

    # Keep up to 100 checkpoints ranked by the lowest monitored ``val_loss``.
    # NOTE(review): assumes the model logs ``val_loss`` (and ``val_accuracy``,
    # which is only used in the file name) -- confirm in MaskRCNN_ResNet50.
    checkpoint_callback = ModelCheckpoint(
        save_top_k=100,
        monitor="val_loss",
        mode="min",
        dirpath=run_dir,
        filename=model_name + "-{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}",
    )

    # No EarlyStopping callback is registered, so ``max_epochs`` is the only
    # stopping criterion configured here.
    tb_logger = pl_loggers.TensorBoardLogger(save_dir=run_dir)
    trainer = Trainer(
        max_epochs=max_epochs,
        devices=1,
        log_every_n_steps=1,
        logger=tb_logger,
        callbacks=[checkpoint_callback],
    )

    # Train the model.
    trainer.fit(model, dm)


if __name__ == "__main__":
    # Command-line parsing (an argparse ``model`` argument) is disabled, so
    # training always starts with main()'s default arguments.
    main()

In [1]:
import torchvision
# torchvision's Mask R-CNN (ResNet-50 FPN) built with its "DEFAULT" weights.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(
    weights="DEFAULT",
)

In [4]:
# Training split without transforms.
# NOTE(review): earlier cells read from "M18K/Dataset/train" while this one
# uses "M18K/Data/train" -- confirm which root is intended.
ds = M18KDataset(
    root="M18K/Data/train",
    transforms=None,
)


loading annotations into memory...
Done (t=1.44s)
creating index...
index created!


In [15]:
# Hand-built mini-batch made of the first two dataset samples.
batch = [ds[index] for index in (0, 1)]

In [16]:
# Transpose the list of (image, target) samples into (images, targets).
# NOTE: this rebinds ``batch`` from its own previous value, so the cell is
# not idempotent -- running it a second time transposes the data back.
batch = tuple(zip(*batch))

In [18]:
# Split the transposed batch into its two components; this only works once
# ``batch`` has been transposed into exactly two entries.
images,targets = batch

In [19]:
# Display the tuple of image tensors for visual inspection.
images

(tensor([[[0.2118, 0.2118, 0.2078,  ..., 0.4941, 0.4706, 0.5412],
          [0.1961, 0.2000, 0.2000,  ..., 0.5373, 0.5098, 0.5490],
          [0.2000, 0.1961, 0.1882,  ..., 0.5451, 0.5098, 0.5216],
          ...,
          [0.2353, 0.2510, 0.2353,  ..., 0.5255, 0.3725, 0.3333],
          [0.2314, 0.2314, 0.2235,  ..., 0.5098, 0.3608, 0.3216],
          [0.2314, 0.2196, 0.2118,  ..., 0.3255, 0.2863, 0.3608]],
 
         [[0.2353, 0.2353, 0.2196,  ..., 0.5490, 0.5529, 0.6275],
          [0.2196, 0.2235, 0.2118,  ..., 0.5922, 0.5804, 0.6353],
          [0.2196, 0.2157, 0.2000,  ..., 0.5961, 0.5765, 0.6039],
          ...,
          [0.2118, 0.2392, 0.2157,  ..., 0.4667, 0.3020, 0.2510],
          [0.2157, 0.2275, 0.2118,  ..., 0.4471, 0.2863, 0.2392],
          [0.2275, 0.2157, 0.2078,  ..., 0.2627, 0.2118, 0.2784]],
 
         [[0.2157, 0.2157, 0.2118,  ..., 0.5451, 0.5451, 0.6196],
          [0.2000, 0.2039, 0.2039,  ..., 0.5882, 0.5765, 0.6275],
          [0.2118, 0.2078, 0.1922,  ...,

In [20]:
# Run the model on the hand-built batch. The recorded output is a dict of
# loss tensors (classifier, box_reg, mask, objectness, rpn_box_reg) --
# presumably because the model is still in training mode; no ``.eval()``
# call appears in the visible cells.
model(images,targets)

{'loss_classifier': tensor(3.2122, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.6233, grad_fn=<DivBackward0>),
 'loss_mask': tensor(0.3126, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_objectness': tensor(2.0911, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0579, grad_fn=<DivBackward0>)}