###  Imports

In [1]:
import torch
from model import create_model
from utils import FeatureExtractor

import config

import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F

from dataset import FiftyOneTorchDataset, get_transforms
from model import create_model
from utils import add_detections

from engine import train_model
import config

# Seed PyTorch's global random number generator with a fixed value so that
# torch-level randomness in later cells is repeatable across kernel restarts.
torch.manual_seed(1)

<torch._C.Generator at 0x1bfd64d2850>

In [2]:
# Show the names of the FiftyOne datasets that currently exist, so it is
# visible whether a previous run left a dataset behind.
print(fo.list_datasets())

[]


In [3]:
# Delete a previously created copy of the COCO validation dataset so that the
# next cell builds it from scratch.
# The check is for this specific dataset name: testing only for "any dataset
# exists" would make load_dataset() fail whenever unrelated datasets are
# present but this one is not.
if "coco-2017-validation" in fo.list_datasets():
    dataset = fo.load_dataset("coco-2017-validation")
    dataset.delete()

In [11]:
# Load the COCO-2017 validation split from the FiftyOne Dataset Zoo
fo_dataset = foz.load_zoo_dataset("coco-2017", "validation")

# needed to calculate image height and width
fo_dataset.compute_metadata()

session = fo.launch_app(fo_dataset)

# True  -> work on the few classes listed below
# False -> work on every ground-truth class in the dataset
subset = True

train_transforms, test_transforms = get_transforms()

if subset:
    # Keep only the ground-truth detections whose label is in item_list.
    # A larger candidate list: ["car", "dog", "bus", 'fork', 'tie', 'person']
    item_list = ['bus', 'dog']
    item_view = fo_dataset.filter_labels("ground_truth",
            F("label").is_in(item_list))
else:
    # Later cells assign `session.view = item_view`, so the name has to exist
    # in this branch as well; use a view over the whole dataset.
    item_view = fo_dataset.view()
    item_list = fo_dataset.distinct("ground_truth.detections.label")

# Split the view into train and test sets.
# take() expects a sample count, so the fractional product is converted to an
# int. The fixed seed keeps the split identical across runs in both branches.
train_size = round(len(item_view) * config.TRAIN_TEST_SPLIT)
train_view = item_view.take(train_size, seed=51)
test_view = item_view.exclude([s.id for s in train_view])

print(f'Traning on {len(train_view)} samples')
print(f'Testing on {len(test_view)} samples')

# use our dataset and defined transformations
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=item_list)
evaluation_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=item_list)

# this is needed for later use, but not for creating the dataset
# NOTE(review): item_list is mutated after it was handed to
# FiftyOneTorchDataset; if that class keeps a reference to the list it will
# also see 'background' -- confirm this is intended.
if item_list[0] != 'background':
    item_list.insert(0, 'background')

Downloading split 'validation' to 'C:\Users\blain\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\blain\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading existing dataset 'coco-2017-validation'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


Traning on 292 samples
Testing on 73 samples


# Load the base Faster R-CNN (CLIP backbone)

In [12]:
# Build the CLIP-backbone Faster R-CNN and restore its trained weights.
MODEL_TYPE = 'CLIP-Backbone-FRCNN'
CHECKPOINT_NAME = f'{MODEL_TYPE}_epoch_28.pth'

# The class list passed to the model must start with 'background'; the guard
# makes this cell safe to re-run without inserting the entry twice.
if item_list[0] != 'background':
    item_list.insert(0, 'background')

# Build the model once (the second, identical create_model call only threw
# away the first instance).
frcnn_model = create_model(MODEL_TYPE, classes=item_list)

# Load the checkpoint onto the CPU first so that it can be read even on a
# machine without the device it was saved from; load_state_dict then copies
# the values into the model's own parameters.
checkpoint = torch.load(CHECKPOINT_NAME, map_location='cpu')
frcnn_model.load_state_dict(checkpoint)

# Switch to inference mode
frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): FeatureExtractor(
    (model): ModifiedResNet(
      (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
      (relu): ReLU(inplace=True)
      (layer1): Sequential(
        (0): Bottleneck(
          (con

# Check Faster RCNN performance

In [13]:
# Run the base detector over the held-out samples. add_detections (from
# utils) receives the model, the torch dataset, the FiftyOne dataset and the
# name of the field that the evaluation below reads predictions from.
add_detections(
    frcnn_model,
    evaluation_dataset,
    fo_dataset,
    field_name="frcnn_predictions",
)

# Compare the predictions with the ground truth on the test view, including
# the IoU sweep required for mAP.
results = fo.evaluate_detections(
    test_view, "frcnn_predictions",
    classes=item_list, eval_key="eval", compute_mAP=True,
)

# NOTE(review): the App is pointed at item_view (train + test samples) while
# only test_view was evaluated -- confirm this is the intended view.
session.view = item_view

print(f'mAP: {results.mAP()}')
results.print_report()

Using device cuda
 100% |███████████████████| 73/73 [9.7s elapsed, 0s remaining, 7.7 samples/s]       
Evaluating detections...
 100% |███████████████████| 73/73 [696.0ms elapsed, 0s remaining, 104.9 samples/s]     
Performing IoU sweep...
 100% |███████████████████| 73/73 [640.0ms elapsed, 0s remaining, 114.1 samples/s]      


mAP: 0.19828206967331277
              precision    recall  f1-score   support

  background       0.00      0.00      0.00         0
         bus       0.15      0.79      0.25        91
         dog       0.16      0.85      0.27        39

   micro avg       0.15      0.81      0.26       130
   macro avg       0.10      0.55      0.17       130
weighted avg       0.15      0.81      0.26       130



# Load the CLIP FRCNN

In [7]:
# Load the trained CLIP-FRCNN, whose classes are given as CLIP text tokens
# rather than as plain label strings.
MODEL_TYPE = 'CLIP-FRCNN'
CHECKPOINT_NAME = f'{MODEL_TYPE}_epoch_88.pth'


# tokenize item list for CLIP
import clip
# Only the preprocessing transform is kept; the CLIP model returned by
# clip.load is discarded here.
_, preprocess = clip.load("RN50", device=config.DEVICE)

# Guard keeps the cell re-runnable without inserting 'background' twice
if item_list[0] != 'background':
    item_list.insert(0, 'background')

# One prompt per class ("This is <label>"), moved to the same configured
# device that clip.load uses above instead of a hard-coded .cuda().
text_tokens = clip.tokenize(["This is " + desc for desc in item_list]).to(config.DEVICE)


# Build the model once (the second, identical create_model call only threw
# away the first instance).
clip_frcnn_model = create_model(MODEL_TYPE, classes=text_tokens)

# Load onto the CPU first so the checkpoint is readable regardless of the
# device it was saved from; load_state_dict copies into the model's own
# parameters.
checkpoint = torch.load(CHECKPOINT_NAME, map_location='cpu')
clip_frcnn_model.load_state_dict(checkpoint)

# Switch to inference mode
clip_frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): FeatureExtractor(
    (model): ModifiedResNet(
      (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
      (relu): ReLU(inplace=True)
      (layer1): Sequential(
        (0): Bottleneck(
          (con

# Check CLIP FRCNN performance

In [8]:
# Run the CLIP-FRCNN over the held-out samples; the evaluation below reads
# its predictions from the "clip_frcnn_predictions" field.
add_detections(clip_frcnn_model, evaluation_dataset, fo_dataset, field_name="clip_frcnn_predictions")

# Compare the predictions with the ground truth on the test view, including
# the IoU sweep required for mAP.
results = fo.evaluate_detections(
    test_view,
    "clip_frcnn_predictions",
    classes=item_list,
    eval_key="clip_eval",
    compute_mAP=True
)

# Show the evaluated samples in the App and report the metrics once (the
# mAP line and the report were previously printed twice).
session.view = test_view
print(f'mAP: {results.mAP()}')
results.print_report()

Using device cuda
   0% ||--------------|    0/1000 [113.1ms elapsed, ? remaining, ? samples/s] 

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


 100% |███████████████| 1000/1000 [5.1m elapsed, 0s remaining, 6.8 samples/s]       
Evaluating detections...
 100% |███████████████| 1000/1000 [41.5s elapsed, 0s remaining, 28.8 samples/s]      
Performing IoU sweep...
 100% |███████████████| 1000/1000 [1.1m elapsed, 0s remaining, 16.3 samples/s]      


mAP: 0.006440438639077175
                precision    recall  f1-score   support

    background       0.00      0.00      0.00         0
      airplane       0.00      0.93      0.01        15
         apple       0.07      0.28      0.11        94
      backpack       0.02      0.16      0.04        73
        banana       0.16      0.47      0.24       108
  baseball bat       0.07      0.46      0.12        57
baseball glove       0.02      0.29      0.03        38
          bear       0.03      0.77      0.06        13
           bed       0.02      0.72      0.03        39
         bench       0.01      0.23      0.01        82
       bicycle       0.03      0.36      0.05        39
          bird       0.07      0.63      0.12       103
          boat       0.02      0.64      0.04       184
          book       0.31      0.63      0.42       536
        bottle       0.14      0.47      0.21       315
          bowl       0.01      0.14      0.03       107
      broccoli       

# Create a CLIP-enabled model



In [9]:
# Display the current `score_thresh` value on the detector's RoI heads.
# `frcnn_model` is created in the "Load the base FRCNN" cell, which must have
# been run in this kernel first; otherwise this raises NameError.
frcnn_model.roi_heads.score_thresh

NameError: name 'frcnn_model' is not defined

In [None]:
# Display the current `detections_per_img` value on the detector's RoI heads
# (requires `frcnn_model` from the "Load the base FRCNN" cell).
frcnn_model.roi_heads.detections_per_img

In [None]:
# test out the trained CLIP-FRCNN (epoch-68 checkpoint)
# NOTE(review): this rebinds `clip_frcnn_model` and passes the plain label
# list as `classes`, whereas the epoch-88 cell passes CLIP text tokens --
# confirm which form this checkpoint expects.
MODEL_TYPE = 'CLIP-FRCNN'
CHECKPOINT_NAME = f'{MODEL_TYPE}_epoch_68.pth'

# Guard keeps the cell re-runnable without inserting 'background' twice
if item_list[0] != 'background':
    item_list.insert(0, 'background')

# Build the model once (the second, identical create_model call only threw
# away the first instance).
clip_frcnn_model = create_model(MODEL_TYPE, classes=item_list)

# Load onto the CPU first so the checkpoint is readable regardless of the
# device it was saved from; load_state_dict copies into the model's own
# parameters.
checkpoint = torch.load(CHECKPOINT_NAME, map_location='cpu')
clip_frcnn_model.load_state_dict(checkpoint)

# Switch to inference mode
clip_frcnn_model.eval()

In [None]:
# Experiment: keep the trained Faster R-CNN but replace its box head and box
# predictor with the CLIP-based modules.
# Requires `frcnn_model` and `text_tokens` from the earlier loading cells.

from model import CLIPHead, CLIPRCNNPredictor
from torch.cuda.amp import autocast

roi_heads = frcnn_model.roi_heads
roi_heads.box_head = CLIPHead()
# CLIP embeds into 1024 dimensions for the RN50 implementation
roi_heads.box_predictor = CLIPRCNNPredictor(1024, text_tokens)

frcnn_model.eval()

In [None]:
# Experiment: raise the score threshold to 0.99 so that only very confident
# results are kept. Note that this attribute is set on `roi_heads`, not on the
# RPN; presumably it filters the final detections by classification score --
# verify against the torchvision version in use.
frcnn_model.roi_heads.score_thresh = .99

In [None]:
# Run the head-swapped detector under mixed precision; the evaluation below
# reads its predictions from the "clip_predictions" field.
with autocast():
    add_detections(frcnn_model, evaluation_dataset, fo_dataset, field_name="clip_predictions")

# Use an evaluation key of its own: "clip_eval" is already used for the
# "clip_frcnn_predictions" evaluation, and reusing it here would replace that
# run and its per-sample results.
results = fo.evaluate_detections(
    test_view,
    "clip_predictions",
    classes=item_list,
    eval_key="clip_head_eval",
    compute_mAP=True
)

In [None]:
# Point the App session at item_view, then print the mAP and the per-class
# report held by `results` (assigned by the preceding evaluate_detections
# cell, which therefore has to be run first).
session.view = item_view
print(f'mAP: {results.mAP()}')
results.print_report()

In [None]:
# Smoke test: push 100 random 3x224x224 images through the modified detector
# under mixed precision and print the shape of the predicted label tensor.
# This is inference only, so autograd is switched off to avoid building a
# graph on every forward pass.
for i in range(100):
    test_image = [torch.rand(3, 224, 224).cuda()]
    with torch.no_grad(), autocast():
        out = frcnn_model(test_image)
    print(out[0]['labels'].shape)