###  Imports

In [1]:
import torch
from model import create_model
from utils import FeatureExtractor

import config

import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F

from dataset import FiftyOneTorchDataset, get_transforms
from model import create_model
from utils import add_detections

from engine import train_model
import config

torch.manual_seed(1)

torch.manual_seed(1)

<torch._C.Generator at 0x21623e79830>

In [2]:
#Load in the dataset from the FiftyOne model Zoo
fo_dataset = foz.load_zoo_dataset("coco-2017", "validation")

#needed to calculate image height and width
fo_dataset.compute_metadata()

session = fo.launch_app(fo_dataset)


# to filter certain items from the dataset we can
item_list = ["car", "dog", "bus", 'fork', 'tie', 'person']
item_view = fo_dataset.filter_labels("ground_truth",
        F("label").is_in(item_list))

#session.view = item_view

# split the dataset in train and test set
train_view = item_view.take((len(item_view) * config.TRAIN_TEST_SPLIT), seed=51)
test_view = item_view.exclude([s.id for s in train_view])

print(f'Traning on {len(train_view)} samples')
print(f'Testing on {len(test_view)} samples')


train_transforms, test_transforms = get_transforms()

# use our dataset and defined transformations
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=item_list)
evaluation_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=item_list)

Downloading split 'validation' to 'C:\Users\blain\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\blain\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading existing dataset 'coco-2017-validation'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use
Connected to FiftyOne on port 5151 at 127.0.0.1.
If you are not connecting to a remote session, you may need to start a new session and specify a port


Traning on 2458 samples
Testing on 615 samples


# Load the base FRCNN 

In [3]:
import clip
CLIP_model, preprocess = clip.load("RN50")
image_embedder = list(CLIP_model.visual.children())[-1].cuda().eval()#take the last layer manually load in the latest checkpoint

MODEL_TYPE = 'CLIP-FRCNN'
CHECKPOINT_NAME = f'{MODEL_TYPE}_epoch_22.pth'
num_classes = 6  #number of classes in the FRCNN not counting the background

frcnn_model = create_model(MODEL_TYPE, num_classes=(num_classes+1))
checkpoint = torch.load(CHECKPOINT_NAME)
frcnn_model = create_model(MODEL_TYPE, num_classes=(num_classes+1))

frcnn_model.load_state_dict(checkpoint)
frcnn_model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): FeatureExtractor(
    (model): ModifiedResNet(
      (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
      (relu): ReLU(inplace=True)
      (layer1): Sequential(
        (0): Bottleneck(
          (con

# Set up for CLIP integration

In [4]:
add_detections(frcnn_model, evaluation_dataset, fo_dataset, field_name="frcnn_predictions")

results = fo.evaluate_detections(
    test_view,
    "frcnn_predictions",
    classes=item_list,
    eval_key="eval",
    compute_mAP=True
)

Using device cuda
   0% ||----------------|   0/615 [30.9ms elapsed, ? remaining, ? samples/s] 

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


 100% |█████████████████| 615/615 [1.2m elapsed, 0s remaining, 9.4 samples/s]      
Evaluating detections...
 100% |█████████████████| 615/615 [12.6s elapsed, 0s remaining, 54.0 samples/s]      
Performing IoU sweep...
 100% |█████████████████| 615/615 [16.5s elapsed, 0s remaining, 42.6 samples/s]      


In [6]:
results.mAP()

0.12508040681130445

In [7]:
results.print_report()

              precision    recall  f1-score   support

         car       0.10      0.59      0.17       508
         dog       0.15      0.80      0.25        41
         bus       0.13      0.70      0.22        90
        fork       0.09      0.18      0.12        44
         tie       0.11      0.10      0.11        29
      person       0.31      0.86      0.45      4347

   micro avg       0.26      0.82      0.39      5059
   macro avg       0.15      0.54      0.22      5059
weighted avg       0.28      0.82      0.41      5059



# Create a CLIP enabled model

In [5]:
#TEST out replacing the box head and box predictors
from torch.cuda.amp import autocast
from model import CLIPHead, CLIPRCNNPredictor
with autocast():

    model.roi_heads.box_head = CLIPHead()
    model.roi_heads.box_predictor = CLIPRCNNPredictor(1024, text_tokens)  #CLIP embeds into 1024 dimensions for the RN50 implementation

model.eval().to(config.DEVICE)
test_image = [torch.rand(3, 224, 224).cuda()]
with autocast():
    out = model(test_image)
print(out[0]['labels'].shape)
print(out)

model.eval().cuda()
with autocast():
    outs = model([transformed_image.cuda()])


NameError: name 'model' is not defined