###  Imports

In [1]:
import torch
from model import create_model
from utils import FeatureExtractor

import config

import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F

from dataset import FiftyOneTorchDataset, get_transforms
from model import create_model
from utils import add_detections

from engine import train_model
import config

torch.manual_seed(1)


<torch._C.Generator at 0x1f9f2e2c830>

In [2]:
print(fo.list_datasets())

[]


In [3]:
# dataset = fo.load_dataset("coco-2017-validation")
# dataset.delete()

In [4]:
#Load in the dataset from the FiftyOne model Zoo
fo_dataset = foz.load_zoo_dataset("coco-2017", "validation")

#needed to calculate image height and width
fo_dataset.compute_metadata()

session = fo.launch_app(fo_dataset)


# to filter certain items from the dataset we can
item_list = ["car", "dog", "bus", 'fork', 'tie', 'person']
item_view = fo_dataset.filter_labels("ground_truth",
        F("label").is_in(item_list))

session.view = item_view

# split the dataset in train and test set
train_view = item_view.take((len(item_view) * config.TRAIN_TEST_SPLIT), seed=51)
test_view = item_view.exclude([s.id for s in train_view])

print(f'Traning on {len(train_view)} samples')
print(f'Testing on {len(test_view)} samples')


train_transforms, test_transforms = get_transforms()

# use our dataset and defined transformations
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=item_list)
evaluation_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=item_list)

Downloading split 'validation' to 'C:\Users\blain\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\blain\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading 'coco-2017' split 'validation'
 100% |███████████████| 5000/5000 [16.6s elapsed, 0s remaining, 309.9 samples/s]      
Dataset 'coco-2017-validation' created


Traning on 2458 samples
Testing on 615 samples


# Load the base FRCNN 

In [5]:
import clip
CLIP_model, preprocess = clip.load("RN50")
image_embedder = list(CLIP_model.visual.children())[-1].cuda().eval()#take the last layer manually load in the latest checkpoint

MODEL_TYPE = 'CLIP-FRCNN'
CHECKPOINT_NAME = f'{MODEL_TYPE}_epoch_22.pth'
num_classes = 6  #number of classes in the FRCNN not counting the background

frcnn_model = create_model(MODEL_TYPE, num_classes=(num_classes+1))
checkpoint = torch.load(CHECKPOINT_NAME)
frcnn_model = create_model(MODEL_TYPE, num_classes=(num_classes+1))

frcnn_model.load_state_dict(checkpoint)
frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): FeatureExtractor(
    (model): ModifiedResNet(
      (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
      (relu): ReLU(inplace=True)
      (layer1): Sequential(
        (0): Bottleneck(
          (con

# Set up for CLIP integration

In [6]:
add_detections(frcnn_model, evaluation_dataset, fo_dataset, field_name="frcnn_predictions")

results = fo.evaluate_detections(
    test_view,
    "frcnn_predictions",
    classes=item_list,
    eval_key="eval",
    compute_mAP=True
)

In [7]:
session.view = item_view
print(f'mAP: {results.mAP()}')
results.print_report()

# Create a CLIP enabled model



In [9]:
frcnn_model.roi_heads.score_thresh

0.05

In [10]:
frcnn_model.roi_heads.detections_per_img

100

In [11]:
#TEST out replacing the box head and box predictors
if item_list[0] != 'background':
     item_list.insert(0,'background')

text_tokens = clip.tokenize(["This is " + desc for desc in item_list]).cuda()

clip_frcnn_model = frcnn_model

from torch.cuda.amp import autocast
from model import CLIPHead, CLIPRCNNPredictor
with autocast():

    clip_frcnn_model.roi_heads.box_head = CLIPHead()
    clip_frcnn_model.roi_heads.box_predictor = CLIPRCNNPredictor(1024, text_tokens)  #CLIP embeds into 1024 dimensions for the RN50 implementation

clip_frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): FeatureExtractor(
    (model): ModifiedResNet(
      (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
      (relu): ReLU(inplace=True)
      (layer1): Sequential(
        (0): Bottleneck(
          (con

In [12]:
with autocast():
    add_detections(clip_frcnn_model, evaluation_dataset, fo_dataset, field_name="clip_predictions")

results = fo.evaluate_detections(
    test_view,
    "clip_predictions",
    classes=item_list,
    eval_key="eval",
    compute_mAP=True
)

Using device cuda
   0% |/----------------|   1/615 [350.3ms elapsed, 3.6m remaining, 2.9 samples/s] 

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


 100% |█████████████████| 615/615 [47.2s elapsed, 0s remaining, 12.9 samples/s]      
Evaluating detections...
 100% |█████████████████| 615/615 [29.1s elapsed, 0s remaining, 20.9 samples/s]      
Performing IoU sweep...
 100% |█████████████████| 615/615 [47.2s elapsed, 0s remaining, 14.2 samples/s]      


In [14]:
session.view = item_view
print(f'mAP: {results.mAP()}')
results.print_report()

mAP: 0.0037813750360531913
              precision    recall  f1-score   support

  background       0.00      0.00      0.00         0
         car       0.03      0.45      0.05       495
         dog       0.00      0.83      0.01        41
         bus       0.00      0.70      0.01        83
        fork       0.00      0.32      0.00        44
         tie       0.00      0.21      0.01        29
      person       0.09      0.46      0.15      2568

   micro avg       0.02      0.47      0.05      3260
   macro avg       0.02      0.42      0.03      3260
weighted avg       0.07      0.47      0.12      3260



In [13]:
# for i in range(1):
#     test_image = [torch.rand(3, 224, 224).cuda()]
#     with autocast():
#         out = frcnn_model(test_image)
#     print(out[0]['labels'].shape)

torch.Size([100])
torch.Size([95])
torch.Size([100])
torch.Size([86])
torch.Size([93])
torch.Size([100])
torch.Size([99])
torch.Size([74])
torch.Size([99])
torch.Size([100])
torch.Size([91])
torch.Size([92])
torch.Size([86])
torch.Size([100])
torch.Size([100])
torch.Size([97])
torch.Size([89])
torch.Size([100])
torch.Size([98])
torch.Size([93])
torch.Size([93])
torch.Size([98])
torch.Size([100])
torch.Size([84])
torch.Size([89])
torch.Size([94])
torch.Size([100])
torch.Size([85])
torch.Size([100])
torch.Size([100])
torch.Size([100])
torch.Size([90])
torch.Size([100])
torch.Size([100])
torch.Size([99])
torch.Size([93])
torch.Size([87])
torch.Size([99])
torch.Size([78])
torch.Size([92])
torch.Size([100])
torch.Size([87])
torch.Size([100])
torch.Size([100])
torch.Size([100])
torch.Size([100])
torch.Size([99])
torch.Size([98])
torch.Size([100])
torch.Size([100])
torch.Size([100])
torch.Size([92])
torch.Size([96])
torch.Size([96])
torch.Size([100])
torch.Size([92])
torch.Size([99])
torch.Si