In [None]:
%%shell
# --- Python dependencies -------------------------------------------------
pip install pycocotools
pip3 install --upgrade albumentations
pip3 install opencv-python-headless==4.1.2.30
# Reinstall urllib3 from scratch; -y answers the confirmation prompt.
pip3 uninstall -y urllib3
pip3 install urllib3
#pip install deeplake
pip install pytorch_lightning
pip install wandb
pip install torchmetrics
pip install torchinfo

# --- COCO 2017 images ----------------------------------------------------
# -p keeps the cell re-runnable: no error if coco/ is left from an earlier run.
mkdir -p coco
cd coco

# Only the val2017 images are fetched; the other archives stay disabled.
#wget http://images.cocodataset.org/zips/train2017.zip
wget http://images.cocodataset.org/zips/val2017.zip
#wget http://images.cocodataset.org/zips/test2017.zip
#wget http://images.cocodataset.org/zips/unlabeled2017.zip

# -q: do not print one line per extracted file.
# -o: overwrite existing files instead of prompting (a prompt would hang a re-run).
#unzip train2017.zip
unzip -q -o val2017.zip
#unzip test2017.zip
#unzip unlabeled2017.zip

# Archives are deleted once extracted.
#rm train2017.zip
rm val2017.zip
#rm test2017.zip
#rm unlabeled2017.zip

# --- COCO 2017 annotations -----------------------------------------------
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
wget http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
#wget http://images.cocodataset.org/annotations/image_info_test2017.zip
#wget http://images.cocodataset.org/annotations/image_info_unlabeled2017.zip

unzip -q -o annotations_trainval2017.zip
unzip -q -o stuff_annotations_trainval2017.zip
#unzip image_info_test2017.zip
#unzip image_info_unlabeled2017.zip

rm annotations_trainval2017.zip
rm stuff_annotations_trainval2017.zip
#rm image_info_test2017.zip
#rm image_info_unlabeled2017.zip

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from coco import COCODataset
import torch

def collate_fn(batch):
    """Transpose a batch of samples into one tuple per sample field.

    Args:
        batch: iterable of equally structured samples (each itself iterable).

    Returns:
        A tuple whose i-th element is a tuple holding the i-th field of
        every sample, in batch order. An empty batch yields an empty tuple.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Notebook-level datasets/loaders used by the inspection and smoke-test cells
# below. Both are built from the 'val2017' split and differ only in `mode`.
# NOTE(review): the meaning of `train=True` / `mode` is defined by the
# project-local COCODataset and is not visible here — confirm.
trainset = COCODataset(data_dir='coco/', split='val2017', train=True, mode='train')
train_dataloader = torch.utils.data.DataLoader(trainset, shuffle=True, batch_size=256, num_workers=0, collate_fn=collate_fn)

# Validation loader: fixed order, two samples per batch.
valset = COCODataset(data_dir='coco/', split='val2017', train=True, mode='val')
val_dataloader  = torch.utils.data.DataLoader(valset, shuffle=False, batch_size=2, num_workers=0, collate_fn=collate_fn)

loading annotations into memory...
Done (t=1.65s)
creating index...
index created!
loading annotations into memory...
Done (t=1.92s)
creating index...
index created!


In [None]:
# Pull a single batch from the training loader for visual inspection.
# With collate_fn, `batch` is a tuple of per-field tuples (batch[1] holds the targets).
batch = next(iter(train_dataloader))

In [None]:
# Shallow-copy every target dict of the batch, then display the first
# instance mask of the first sample.
targets = [{k:v for k, v in t.items()} for t in batch[1]]
plt.imshow(targets[0]['masks'][0])
plt.show()

### Model

In [1]:
import models.vit_dino as vit_dino
import torch

# Build a ViT-tiny with an 80-class setting and write its state dict to
# 'vit_tiny.pth', the file ModifyViT loads when pretrained=True.
# NOTE(review): no weights are loaded into `model` in this cell before saving,
# and num_classes (80) differs from the values used later (79 / 90) — confirm
# the checkpoint is the one intended for the backbone.
model = vit_dino.__dict__['vit_tiny'](num_classes=80)
torch.save(model.state_dict(), 'vit_tiny.pth')

In [2]:
## Attaching ViT and MaskRCNN
from torchvision.models.detection import FasterRCNN, MaskRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.ops import MultiScaleRoIAlign
import torch
import torchvision
import torch.nn as nn
from torchinfo import summary


class ModifyViT(nn.Module):
    """ViT-tiny wrapper that exposes patch tokens as a 4-D feature map.

    The wrapped model comes from ``vit_dino`` and is built with the class
    token and positional embeddings enabled. ``forward`` returns the
    ``'z_patches'`` output reshaped to ``(batch, -1, 4, 4)``.

    Args:
        num_classes: forwarded to the ``vit_tiny`` constructor.
        pretrained: when true, load weights from ``'vit_tiny.pth'``.

    NOTE(review): because the channel axis is inferred with ``-1``, the
    number of output channels depends on how many values the ViT emits per
    image; the notebook shows 2352 channels for a 224x224 input. Other input
    sizes would presumably give a different channel count — confirm.
    """

    def __init__(self, num_classes=79, pretrained=False):
        super().__init__()
        build_vit = vit_dino.__dict__['vit_tiny']
        self.model = build_vit(
            num_classes=num_classes,
            use_clf_token=True,
            use_positional_embeddings=True,
        )
        # Spatial side of the reshaped feature map (16 // 4 == 4).
        self.patch_size = 16 // 4

        if pretrained:
            state_dict = torch.load('vit_tiny.pth')
            self.model.load_state_dict(state_dict)

    def forward(self, x):
        """Run the ViT and fold its patch output into (batch, C, 4, 4)."""
        patches = self.model(x)['z_patches']
        batch_size = patches.shape[0]
        side = self.patch_size
        return patches.reshape(batch_size, -1, side, side)

In [3]:
# Sanity check: output shape of the backbone for one random 224x224 RGB image.
model = ModifyViT()
model(torch.rand(1,3,224,224)).shape

torch.Size([1, 2352, 4, 4])

In [4]:
def create_model(num_classes = 79, pretrained = False, min_size = 200, max_size = 1300):
    """Assemble a torchvision Mask R-CNN on top of the ModifyViT backbone.

    Args:
        num_classes: passed to both the ViT backbone and MaskRCNN.
        pretrained: accepted for API compatibility (see note below).
        min_size: forwarded to MaskRCNN's ``min_size``.
        max_size: forwarded to MaskRCNN's ``max_size``.

    Returns:
        The constructed ``MaskRCNN`` module.
    """
    # NOTE(review): `pretrained` is not forwarded — the backbone is always
    # built with pretrained=False, so callers passing pretrained=True still
    # get randomly initialised weights. Confirm whether this is deliberate
    # before wiring it through (the saved checkpoint was created with a
    # different num_classes).
    vit_backbone = ModifyViT(num_classes=num_classes, pretrained=False)
    # MaskRCNN reads `out_channels` from the backbone; 2352 is the channel
    # count ModifyViT produced for a 224x224 input in this notebook.
    vit_backbone.out_channels = 2352

    # Alternative backbone kept for reference:
    #   torchvision.models.mobilenet_v2(weights="DEFAULT").features  (out_channels = 1280)

    # One feature level, four anchor sizes x three aspect ratios.
    anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256),),
        aspect_ratios=((0.5, 1.0, 2.0),),
    )

    # Box and mask heads pool from the single feature map named '0'.
    roi_align_kwargs = dict(featmap_names=['0'], output_size=4, sampling_ratio=2)
    box_pooler = MultiScaleRoIAlign(**roi_align_kwargs)
    mask_pooler = MultiScaleRoIAlign(**roi_align_kwargs)

    return MaskRCNN(
        backbone=vit_backbone,
        min_size=min_size,
        max_size=max_size,
        num_classes=num_classes,
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=box_pooler,
        mask_roi_pool=mask_pooler,
    )

# Notebook-level detector instance (90 classes) used by the smoke-test cells.
model = create_model(num_classes = 90)

In [None]:
# Rebuild the 90-class detector and print its torchinfo summary.
model = create_model(num_classes = 90)
summary(model)

In [None]:
# Eval
# Smoke test: in inference mode the detector returns one prediction dict per
# image ('boxes', 'labels', 'scores', 'masks').
# NOTE(review): the call is not wrapped in torch.no_grad(), so the returned
# tensors carry grad_fn (visible in the recorded output).
x = torch.rand(1, 3, 224, 224)
model.train(False)
x = model(x)
print('Eval Results:', x)

Eval Results: [{'boxes': tensor([], size=(0, 4), grad_fn=<StackBackward0>), 'labels': tensor([], dtype=torch.int64), 'scores': tensor([], grad_fn=<IndexBackward0>), 'masks': tensor([], size=(0, 1, 224, 224))}]


In [None]:
# Train
# Smoke test: in training mode the detector takes (images, targets) and
# returns a dict of losses instead of predictions.
x = next(iter(val_dataloader))
images  = list(image for image in x[0])
targets = [{k:v for k, v in t.items()} for t in x[1]]

model.train(True)
x = model(images, targets)
print('Train Results:',x)

Train Results: {'loss_classifier': tensor(4.4909, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(0.0746, grad_fn=<DivBackward0>), 'loss_mask': tensor(0.8502, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_objectness': tensor(0.6677, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(0.0728, grad_fn=<DivBackward0>)}


### Training

In [5]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor

from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchmetrics import Accuracy
from coco import COCODataset

In [6]:
class MaskRCNNDetector(pl.LightningModule):
    """LightningModule that trains and validates the ViT-backed Mask R-CNN.

    Training logs the summed detection loss plus its classifier, box
    regression and mask components. Validation accumulates box and mask mAP
    with torchmetrics and logs them at the end of each validation epoch.

    Args:
        num_classes: forwarded to ``create_model``.
        lr: learning rate for SGD.
        weight_decay: weight decay for SGD.
    """

    # Mask R-CNN returns soft masks in [0, 1]; they are binarised at this
    # threshold before being handed to the segmentation mAP metric.
    MASK_THRESHOLD = 0.5

    def __init__(self, num_classes, lr=0.0001, weight_decay=0.0005):
        super().__init__()
        self.model = create_model(num_classes=num_classes, pretrained=True)
        self.lr = lr
        self.weight_decay = weight_decay
        self.map_box = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")
        self.map_mask = MeanAveragePrecision(iou_type="segm")

    def forward(self, x):
        """Run the wrapped detector on ``x``."""
        # Call the module (not .forward) so registered hooks are honoured.
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the detection losses for one batch.

        ``batch`` is the (images, targets) pair produced by ``collate_fn``.
        Returns the sum of all losses reported by the detector.
        """
        # Train/eval mode is left to Lightning; the previous code toggled the
        # notebook-level global `model`, which is a different object.
        images = list(batch[0])
        targets = [dict(t) for t in batch[1]]

        loss_dict = self.model(images, targets)
        losses = sum(loss_dict.values())

        self.log('train_loss', losses, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_loss_cls', loss_dict['loss_classifier'], on_step=True, on_epoch=True)
        self.log('train_loss_box_reg', loss_dict['loss_box_reg'], on_step=True, on_epoch=True)
        self.log('train_loss_mask', loss_dict['loss_mask'], on_step=True, on_epoch=True)
        return losses

    def validation_step(self, batch, batch_idx):
        """Run inference on one batch and update the box/mask mAP metrics."""
        images = list(batch[0])
        targets = [dict(t) for t in batch[1]]

        preds = self.model(images)

        self.map_box.update(preds=preds, target=targets)

        # The detector emits float masks shaped [N, 1, H, W]; the "segm"
        # metric expects binary masks shaped [N, H, W].
        # NOTE(review): target masks are passed through unchanged — assumes
        # COCODataset already yields binary [N, H, W] masks; confirm.
        mask_preds = [
            {**p, 'masks': (p['masks'] >= self.MASK_THRESHOLD).squeeze(1)}
            for p in preds
        ]
        self.map_mask.update(preds=mask_preds, target=targets)

    def validation_epoch_end(self, validation_step_outputs):
        """Compute, print and log the accumulated mAP metrics, then reset them."""
        # NOTE(review): this hook name is specific to the Lightning version the
        # notebook was run with; newer releases use `on_validation_epoch_end`
        # — confirm against the installed version.
        mAPs_box = {"val_box_" + k: v for k, v in self.map_box.compute().items()}
        mAPs_mask = {"val_mask_" + k: v for k, v in self.map_mask.compute().items()}

        self.print(mAPs_box)
        self.print(mAPs_mask)

        self.log_dict(mAPs_box, sync_dist=True)
        self.log_dict(mAPs_mask, sync_dist=True)

        self.map_box.reset()
        self.map_mask.reset()

    def configure_optimizers(self):
        """SGD with momentum 0.9 and a StepLR schedule (halve every 10 steps)."""
        # Only hand trainable parameters to the optimizer.
        params = [p for p in self.model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(params, lr=self.lr, momentum=0.9, weight_decay=self.weight_decay)
        sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

        return {"optimizer": optimizer, "lr_scheduler": sch}

    @staticmethod
    def collate_fn(batch):
        """Transpose a list of samples into one tuple per sample field."""
        return tuple(zip(*batch))

    def train_dataloader(self):
        """Shuffled loader over COCODataset(split='val2017', mode='train')."""
        trainset = COCODataset(data_dir='coco/', split='val2017', train=True, mode='train')
        return torch.utils.data.DataLoader(
            trainset, shuffle=True, batch_size=128, num_workers=2,
            collate_fn=self.collate_fn,  # use the class's own collate, not a notebook global
        )

    def val_dataloader(self):
        """Ordered loader over COCODataset(split='val2017', mode='val')."""
        valset = COCODataset(data_dir='coco/', split='val2017', train=True, mode='val')
        return torch.utils.data.DataLoader(
            valset, shuffle=False, batch_size=2, num_workers=2,
            collate_fn=self.collate_fn,
        )

In [None]:
# Build the Lightning detector and train it for 10 epochs, logging to
# Weights & Biases (project 'Mask-RCNN', local files under 'Prova_MaskRCNN').
NUM_CLASSES=90
detector = MaskRCNNDetector(num_classes=NUM_CLASSES, lr=0.005, weight_decay=0.0002)

wandb_logger = WandbLogger(project='Mask-RCNN', save_dir='Prova_MaskRCNN', log_model="all")

trainer = pl.Trainer(accelerator="auto",\
                     max_epochs=10 , logger=wandb_logger)

# NOTE(review): `train_dataloader` / `val_dataloader` here are the
# notebook-level loaders created in the data-loading cell (batch_size=256 / 2,
# num_workers=0), not the detector's own methods of the same name. This cell
# therefore depends on that earlier cell having been run — confirm which
# loaders are intended.
trainer.fit(detector, train_dataloader, val_dataloader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type                 | Params
--------------------------------------------------
0 | model    | MaskRCNN             | 44.5 M
1 | map_box  | MeanAveragePrecision | 0     
2 | map_mask | MeanAveragePrecision | 0     
--------------------------------------------------
44.5 M    Trainable params
0         Non-trainable params
44.5 M    Total params
178.145   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

{'val_box_map': tensor(0.), 'val_box_map_50': tensor(0.), 'val_box_map_75': tensor(0.), 'val_box_map_small': tensor(0.), 'val_box_map_medium': tensor(0.), 'val_box_map_large': tensor(0.), 'val_box_mar_1': tensor(0.), 'val_box_mar_10': tensor(0.), 'val_box_mar_100': tensor(0.), 'val_box_mar_small': tensor(0.), 'val_box_mar_medium': tensor(0.), 'val_box_mar_large': tensor(0.), 'val_box_map_per_class': tensor(-1.), 'val_box_mar_100_per_class': tensor(-1.)}
{'val_mask_map': tensor(0.), 'val_mask_map_50': tensor(0.), 'val_mask_map_75': tensor(0.), 'val_mask_map_small': tensor(0.), 'val_mask_map_medium': tensor(0.), 'val_mask_map_large': tensor(0.), 'val_mask_mar_1': tensor(0.), 'val_mask_mar_10': tensor(0.), 'val_mask_mar_100': tensor(0.), 'val_mask_mar_small': tensor(0.), 'val_mask_mar_medium': tensor(0.), 'val_mask_mar_large': tensor(0.), 'val_mask_map_per_class': tensor(-1.), 'val_mask_mar_100_per_class': tensor(-1.)}


Training: 0it [00:00, ?it/s]