In [1]:
import torch
import pandas as pd
import numpy as np

from models.setup import ModelSetup
from models.build import create_multimodal_rcnn_model
from data.load import get_datasets, get_dataloaders

from utils.init import reproducibility, clean_memory_get_device
from data.constants import DEFAULT_REFLACX_LABEL_COLS, XAMI_MIMIC_PATH

import warnings

# Silence pandas' chained-assignment warning (the option defaults to 'warn').
pd.set_option("mode.chained_assignment", None)

# Ignore every UserWarning raised from this point on.
warnings.simplefilter("ignore", category=UserWarning)

%matplotlib inline

In [2]:
# Obtain the compute device and apply the project's reproducibility settings.
# NOTE(review): according to the original author's note, the first helper also
# clears GPU memory and the second tells PyTorch to use deterministic
# algorithms; both live in utils.init, so confirm the details there.
device = clean_memory_get_device()
reproducibility()

This notebook will running on device: [CPU]


# Parameters setup

In [3]:
# --- Evaluation settings ----------------------------------------------------
use_iobb = True
if use_iobb:
    io_type_str = "IoBB"
else:
    io_type_str = "IoU"
labels_cols = DEFAULT_REFLACX_LABEL_COLS
iou_thrs = np.array([0.5])

# --- Backbone choice --------------------------------------------------------
mobilenet_args = {
    "backbone": "mobilenet_v3",
    "using_fpn": False,
}

# --- Layer widths for the small model variant -------------------------------
small_model_args = {
    "mask_hidden_layers": 64,
    "fuse_conv_channels": 64,
    "representation_size": 64,
    "backbone_out_channels": 64,
}

# --- Training / data options shared across configurations -------------------
common_args = {
    "save_early_stop_model": True,
    "optimiser": "sgd",
    "lr": 1e-3,
    "weight_decay": 1e-5,
    "image_backbone_pretrained": True,
    "fixation_backbone_pretrained": True,
    "record_training_performance": True,
    "dataset_mode": "normal",
    "image_size": 512,
    "batch_size": 4,
    "warmup_epochs": 0,
    "lr_scheduler": "ReduceLROnPlateau",
    "reduceLROnPlateau_factor": 0.1,
    "reduceLROnPlateau_patience": 999,
    "reduceLROnPlateau_full_stop": True,
    "multiStepLR_milestones": 100,
    "multiStepLR_gamma": 0.1,
    "use_mask": True,
    "gt_in_train_till": 999,
    "box_head_dropout_rate": 0,
    "measure_test": True,
}

# --- How image and fixation features are fused ------------------------------
fusion_add_args = {
    "fuse_depth": 0,
    "fusion_residule": False,
    "fusion_strategy": "add",
}

# Merge the groups in the same order they were originally unpacked into the
# ModelSetup call; no key appears in more than one group.
model_setup_kwargs = {
    **mobilenet_args,
    **small_model_args,
    **common_args,
    **fusion_add_args,
}

# [TODO]: clean the model setup for fixation map.
model_setup = ModelSetup(
    name="forward_testing_model",
    use_fixations=True,
    **model_setup_kwargs,
)


# Initiate datasets and dataloaders
The batch size is also defined in this section. For testing purposes, we set it to only 2.

In [4]:
# Samples per batch for this forward-pass test. Deliberately small: the
# notebook only needs one cheap batch to push through the model.
TEST_BATCH_SIZE = 2

# Settings handed to get_datasets. The dataset mode and the mask flag are read
# from model_setup so the data matches the model configuration.
dataset_params_dict = {
    "XAMI_MIMIC_PATH": XAMI_MIMIC_PATH,
    "dataset_mode": model_setup.dataset_mode,
    "bbox_to_mask": model_setup.use_mask,
    # Reuse the label list the model is built from (labels_cols) instead of
    # repeating the constant, so dataset and model labels cannot drift apart.
    "labels_cols": labels_cols,
}

detect_eval_dataset, train_dataset, val_dataset, test_dataset = get_datasets(
    dataset_params_dict=dataset_params_dict
)

train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
    train_dataset, val_dataset, test_dataset, batch_size=TEST_BATCH_SIZE
)

## Example instance from dataset:
Inside each instance we have:

- Images
- Fixation heatmaps
- Targets (Dictionary)

Inside each target dictionary, there are:

- boxes (bounding boxes of abnormality)
- labels (disease indices; note that class **0** is the background)
- image_id (idx to get that image)
- area (the area covered by each bounding box)
- iscrowd (whether a region contains multiple bounding boxes; we assume none of the bounding boxes are crowds)

In [5]:
# Quick look at the structure of the first training instance:
#   1. how many elements one instance holds, and
#   2. the length of its first element (the image) along the leading dimension.
print(
    f'Size of a training instance - images, fixations, targets = {len(train_dataset[0])}',
    f'Each image has {len(train_dataset[0][0])} channels ',
    sep="\n",
)

Size of a training instance - images, fixations, targets = 3
Each image has 3 channels 


In [6]:
# Show the third element (the target dictionary) of the first training instance.
train_dataset[0][2]

{'boxes': tensor([[ 568., 1533., 1309., 2199.],
         [ 523., 1723.,  675., 2211.]], dtype=torch.float64),
 'labels': tensor([1, 2]),
 'image_id': tensor([0]),
 'area': tensor([493506.,  74176.], dtype=torch.float64),
 'iscrowd': tensor([0, 0]),
 'dicom_id': 'd1bdf0b5-cf2d6aee-0685203b-23334088-a5ed5401',
 'image_path': 'D:\\XAMI-MIMIC\\patient_17799242\\CXR-JPG\\s53582712\\d1bdf0b5-cf2d6aee-0685203b-23334088-a5ed5401.jpg',
 'fixations_path': 'D:\\XAMI-MIMIC\\patient_17799242\\CXR-JPG\\s53582712\\time_heatmap_P202R293587.jpg',
 'masks': tensor([[[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],
 
         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 

# Define model

In [10]:
# Proposal / detection thresholds forwarded unchanged to the model factory.
detection_kwargs = {
    "rpn_nms_thresh": 0.3,
    "box_detections_per_img": 10,
    "box_nms_thresh": 0.2,
    "rpn_score_thresh": 0.0,
    "box_score_thresh": 0.05,
}

model = create_multimodal_rcnn_model(labels_cols, model_setup, **detection_kwargs)

# Move the model to the selected device. As the last expression of the cell,
# the returned module is also displayed.
model.to(device)


Using pretrained backbone. mobilenet_v3
Using pretrained backbone. mobilenet_v3
forward_testing_model will use mask, [64] layers.


MultimodalMaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): Sequential(
    (0): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
            (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))


# Prepare data to feed
We prepare three main inputs to test the model:

- CXR image
- Fixation heatmaps
- Target

For each input, we adjust the format to what the model expects.

In [7]:
# Take the first batch the training loader yields ...
raw_batch = next(iter(train_dataloader))
# ... and let the dataset turn it into the structure the model consumes.
# NOTE(review): presumably this also moves tensors to `device`; confirm in the
# dataset implementation.
data = train_dataset.prepare_input_from_data(raw_batch, device)

In [8]:
# Display everything in `data` except its last element; the last element is
# what gets passed to the model as `targets` in the training forward pass.
data[:-1]

([tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.0000, 0.0000, 0.0039,  ..., 0.0157, 0.0157, 0.0353],
           [0.0000, 0.0000, 0.0000,  ..., 0.0157, 0.0118, 0.0275],
           [0.0039, 0.0000, 0.0000,  ..., 0.0353, 0.0235, 0.0353]],
  
          [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [0.0000, 0.0000, 0.0039,  ..., 0.0157, 0.0157, 0.0353],
           [0.0000, 0.0000, 0.0000,  ..., 0.0157, 0.0118, 0.0275],
           [0.0039, 0.0000, 0.0000,  ..., 0.0353, 0.0235, 0.0353]],
  
          [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.

# Test Feedforward (Training)

In [11]:
# Training mode: the forward pass returns the losses together with the outputs.
model.train()

# Everything but the last element of `data` is a positional model input; the
# last element is supplied as the targets.
*model_inputs, model_targets = data
loss_dict, outputs = model(*model_inputs, targets=model_targets)

In [12]:
# Unpack the prepared batch into its three parts so the model's internal steps
# can be run one at a time.
images, fixations, targets = data

In [13]:
# Always start from the prepared batch in `data` rather than from `images` /
# `targets`: those two names are rebound to the transformed objects below, so
# reading them here would make the cell fail when it is run a second time.
raw_images, raw_targets = data[0], data[-1]

# Record each image's (height, width) before the model transform is applied.
original_image_sizes = []
for img in raw_images:
    spatial_size = img.shape[-2:]
    assert len(spatial_size) == 2, (
        f"expected an image with at least 2 dimensions, got shape {tuple(img.shape)}"
    )
    original_image_sizes.append((spatial_size[0], spatial_size[1]))

# Apply the model's own transform module to the raw batch. `images` and
# `targets` keep their original meaning for later cells: the transformed batch.
images, targets = model.transform(raw_images, raw_targets)

# Run only the image backbone to inspect the feature map it produces.
img_features = model.backbone(images.tensors)

print(img_features.shape)

torch.Size([2, 64, 16, 16])


# Results
Five different losses are given in the output (classifier, box regression, mask, objectness, and RPN box regression).

We will use these losses to optimise the network while training



## Test Feedforward

### Detection.

A detection contains *boxes*, *labels*, and *scores*.

- *boxes*: All the bounding boxes for this image. 
- *labels*: Labels corresponding to the bounding boxes.
- *scores*: Score (confidence) for each bounding box.

In [14]:
# Display the losses and the outputs returned by the training-mode forward pass.
loss_dict, outputs

({'loss_classifier': tensor(1.0743, grad_fn=<NllLossBackward0>),
  'loss_box_reg': tensor(0.0014, grad_fn=<DivBackward0>),
  'loss_mask': tensor(0.7265, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
  'loss_objectness': tensor(0.6926, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
  'loss_rpn_box_reg': tensor(0.0034, dtype=torch.float64, grad_fn=<DivBackward0>)},
 [{'boxes': tensor([[   0.0000,  383.0743,  633.2679, 1151.3854],
           [ 422.0579,  612.9487,  527.7842,  897.1188],
           [ 270.7764, 1585.4551,  707.8502, 1861.1404],
           [ 264.2876,  607.8674,  367.8270,  891.1305],
           [ 106.5293, 1949.2998,  548.1766, 2221.8372],
           [ 724.0617,  719.5476, 1161.2401, 1778.1035],
           [ 899.0967, 1188.8501, 1006.4523, 1470.1182],
           [1147.2544, 1081.5037, 1387.4926, 1213.4926],
           [ 528.9288, 1844.8662,  757.8915, 1966.3760],
           [1000.9653, 2227.5544, 1225.6404, 2352.8313]],
          grad_fn=<StackBackward0>),
   'labels

# Toy example

In [64]:
# Random stand-ins for a single image and a single fixation map, each wrapped
# in a one-element list and moved to the selected device.
toy_shape = (3, 512, 512)
example_image_input = [torch.randn(toy_shape).to(device)]
example_fixations_input = [torch.randn(toy_shape).to(device)]

In [None]:
# Switch the model to evaluation mode for the inference check.
model.eval()

# Gradients are not needed for a pure inference check, so skip autograd
# tracking to avoid building a graph and holding extra memory.
with torch.no_grad():
    toy_outputs = model(example_image_input, fixations=example_fixations_input)

# Last expression of the cell: display what the model returned.
toy_outputs