# Code for using FiftyOne to train a Faster RCNN on COCO data

###  Imports

In [1]:
import torch
import fiftyone as fo
import fiftyone.zoo as foz
import torchvision.models.detection.roi_heads
from fiftyone import ViewField as F

from dataset import FiftyOneTorchDataset, get_transforms
from model import create_model
from utils import add_detections

from engine import train_model
import config

# Seed PyTorch's random number generator with a fixed value so repeated runs start from the same state
torch.manual_seed(1)

<torch._C.Generator at 0x20ea831d8d0>

### Load full dataset from model zoo

In [2]:
# Load the COCO-2017 validation split from the FiftyOne dataset zoo
fo_dataset = foz.load_zoo_dataset("coco-2017", split="validation")

# Populate per-sample metadata; needed to calculate image height and width
fo_dataset.compute_metadata()

# Open the FiftyOne App on the full dataset
session = fo.launch_app(fo_dataset)

Downloading split 'validation' to 'C:\Users\blain\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\blain\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading 'coco-2017' split 'validation'
 100% |███████████████| 5000/5000 [19.2s elapsed, 0s remaining, 311.9 samples/s]      
Dataset 'coco-2017-validation' created


FiftyOne views let us focus on specific slices of the dataset. For example, cluttered images make it difficult for models to localize objects, so we can use FiftyOne to create a view containing only samples with more than, say, 10 objects. You can perform the same operations on views as on datasets, so we can create an instance of our PyTorch dataset directly from this view (the cell below is left commented out; uncomment it to try this):

In [3]:
# Optional: to look only at cluttered images (more than 10 ground-truth detections),
# uncomment the lines below to build the view, wrap it in a PyTorch dataset and show it in the App
# busy_view = fo_dataset.match(F("ground_truth.detections").length() > 10)
# busy_torch_dataset = FiftyOneTorchDataset(busy_view)
# session.view = busy_view

### Create training and testing views (and corresponding PyTorch datasets) that only contain some items from the full dataset

In [4]:
# Keep only the ground-truth labels of the classes we want to detect.
# A larger class set that can be used instead:
#   ["car", "dog", "bus", 'fork', 'tie', 'person']
item_list = ['bus', 'dog']
item_view = fo_dataset.filter_labels("ground_truth",
        F("label").is_in(item_list))


#session.view = item_view

# split the dataset in train and test set
# take() expects a sample count, but len(...) * split is a float (and is
# non-integral for most split values), so round it to an int explicitly.
# Assumes config.TRAIN_TEST_SPLIT is the training fraction in [0, 1] -- TODO confirm.
num_train = int(round(len(item_view) * config.TRAIN_TEST_SPLIT))
train_view = item_view.take(num_train, seed=51)
test_view = item_view.exclude([s.id for s in train_view])

print(f'Traning on {len(train_view)} samples')
print(f'Testing on {len(test_view)} samples')


train_transforms, test_transforms = get_transforms()

# use our dataset and defined transformations
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=item_list)
evaluation_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=item_list)

# Later cells expect 'background' as the first entry of item_list.
# NOTE(review): this mutates the same list object that was passed to the
# datasets above -- confirm FiftyOneTorchDataset does not rely on it staying unchanged.
if item_list[0] != 'background':
    item_list.insert(0, 'background')

Traning on 292 samples
Testing on 73 samples


In [5]:
# Collapse the individual vehicle categories into a single "vehicle" label.
# NOTE(review): the same mapping cell appears again later in the notebook,
# immediately before the vehicle model is trained.
vehicle_list = ['car', 'bus', 'truck']
vehicles_map = dict.fromkeys(vehicle_list, "vehicle")

# Apply the relabeling to both splits
train_map_view = train_view.map_labels("ground_truth", vehicles_map)
test_map_view = test_view.map_labels("ground_truth", vehicles_map)

# PyTorch datasets over the remapped views, reusing the existing transforms
torch_map_dataset = FiftyOneTorchDataset(train_map_view, train_transforms)
torch_map_dataset_test = FiftyOneTorchDataset(test_map_view, test_transforms)

### Training and Evaluation

In [6]:
# To change the loss function, define a replacement (e.g. cliprcnn_loss) and
# assign it over torchvision's fastrcnn_loss as shown below:
# import torchvision
# torchvision.models.detection.roi_heads.fastrcnn_loss = cliprcnn_loss

In [7]:
MODEL_TYPE = 'CLIP-FRCNN'
# Model variants, as described by the notebook author:
#   CLIP-Backbone-FRCNN  - FRCNN using CLIP features as the model backbone
#   CLIP-FRCNN           - FRCNN using CLIP features as the model backbone, and
#                          embeds the rois using CLIP's embedding
#   Fully custom vanilla - pre-trained resnet50 backbone, with a new anchor
#                          generator and roi pooling
#   Custom-Vanilla       - pre-trained FRCNN from pytorch with only the roi heads replaced
import clip

# One text prompt per entry of item_list, tokenized by CLIP and moved to the GPU
text_tokens = clip.tokenize([f"This is {desc}" for desc in item_list]).cuda()

model = create_model(MODEL_TYPE, text_tokens)

# Set to True to fit on the evaluation split instead of the training split
test = False

# Optional forward-pass smoke test (disabled):
# for i in range(1):
#     test_image = [torch.rand(3, 224, 224).cuda()]
#     model.eval()
#     out = model(test_image)
#     print(out[0]['labels'].shape)
#torch.autograd.set_detect_anomaly(True)

# Both modes evaluate on evaluation_dataset; only the data being fit differs
fit_dataset = evaluation_dataset if test else train_dataset
train_model(model, fit_dataset, evaluation_dataset, num_epochs=config.NUM_EPOCHS, MODEL_TYPE=MODEL_TYPE, batch_size=1)

Using device cuda


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Training Epoch: [0]  [ 0/73]  eta: 0:01:32  lr: 0.000074  loss: 6.3923 (6.3923)  loss_classifier: 5.2820 (5.2820)  loss_box_reg: 0.0220 (0.0220)  loss_objectness: 0.7118 (0.7118)  loss_rpn_box_reg: 0.3764 (0.3764)  time: 1.2713  data: 0.2090  max mem: 1814
Training Epoch: [0]  [10/73]  eta: 0:00:20  lr: 0.000768  loss: 3.7688 (4.2241)  loss_classifier: 2.7926 (3.2866)  loss_box_reg: 0.0220 (0.0493)  loss_objectness: 0.7058 (0.6886)  loss_rpn_box_reg: 0.0908 (0.1996)  time: 0.3261  data: 0.0359  max mem: 2106
Training Epoch: [0]  [20/73]  eta: 0:00:14  lr: 0.001462  loss: 3.5031 (3.9075)  loss_classifier: 2.7926 (3.1695)  loss_box_reg: 0.0209 (0.0398)  loss_objectness: 0.5857 (0.5712)  loss_rpn_box_reg: 0.0365 (0.1271)  time: 0.2304  data: 0.0175  max mem: 2106
Training Epoch: [0]  [30/73]  eta: 0:00:11  lr: 0.002156  loss: 3.9531 (3.9553)  loss_classifier: 3.3956 (3.3073)  loss_box_reg: 0.0249 (0.0408)  loss_objectness: 0.2631 (0.4482)  loss_rpn_box_reg: 0.0306 (0.1591)  time: 0.2285  

KeyboardInterrupt: 

In [None]:
MODEL_TYPE = 'CLIP-Backbone-FRCNN'

model = create_model(MODEL_TYPE, classes=item_list)

# Set to True to fit on the evaluation split instead of the training split
test = False

# Both modes evaluate on evaluation_dataset; only the data being fit differs
fit_dataset = evaluation_dataset if test else train_dataset
train_model(model, fit_dataset, evaluation_dataset, num_epochs=config.NUM_EPOCHS, MODEL_TYPE=MODEL_TYPE, batch_size=2)

In [None]:
# Baseline: train a custom vanilla model so the CLIP FRCNN can be compared against it.
# Fully-Custom-Vanilla is most appropriate as it generates the model in a similar fashion.
MODEL_TYPE = 'Fully-Custom-Vanilla'

vanilla_model = create_model(MODEL_TYPE, classes=item_list)
train_model(
    vanilla_model,
    train_dataset,
    evaluation_dataset,
    num_epochs=10,
    MODEL_TYPE=MODEL_TYPE,
)

### Evaluate the model

In [None]:
# Store the model's detections for the evaluation split under the "predictions" field
# (presumably written onto the matching samples of fo_dataset -- helper lives in utils; TODO confirm)
add_detections(model, evaluation_dataset, fo_dataset, field_name="predictions")

# Score "predictions" on the held-out view; results are recorded under the "eval" key
results = fo.evaluate_detections(
    test_view, "predictions", classes=item_list, eval_key="eval", compute_mAP=True
)

In [None]:
# Display the mAP of the "predictions" evaluation on test_view
results.mAP()

In [None]:
# Print the report for the "predictions" evaluation on test_view
results.print_report()

By default, objects are only matched with other objects of the same class. In order to get an interesting confusion matrix, we need to match interclass objects by setting `classwise=False`.

In [None]:
# Re-evaluate with classwise=False so predictions can be matched to ground truth
# of a different class. No eval_key is passed for this run.
results_interclass = fo.evaluate_detections(
    test_view, "predictions", classes=item_list, compute_mAP=True, classwise=False
)

In [None]:
# Plot precision-recall curves for the classes in item_list from the classwise evaluation
plot = results.plot_pr_curves(classes=item_list)
plot.show()

In [None]:
# Confusion matrix from the interclass evaluation, restricted to item_list
# with the "other" and "missing" entries switched off
results_interclass.plot_confusion_matrix(classes=item_list, include_other=False, include_missing=False)

The [detection evaluation](https://voxel51.com/docs/fiftyone/user_guide/evaluation.html#detections) also added the fields `eval_tp`, `eval_fp`, and `eval_fn` to every sample, holding the number of true positive, false positive, and false negative detections in that sample (each individual detection is additionally marked as a true positive, false positive, or false negative in its own `eval` field). 
Let's create a view that surfaces the worst samples by sorting on `eval_fp` in descending order, and use the [FiftyOne App](https://voxel51.com/docs/fiftyone/user_guide/app.html) to visualize the results. 

In [None]:
# Show the test samples in the App ordered by eval_fp, largest first
session.view = test_view.sort_by("eval_fp", reverse=True)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; running it simply re-applies the same view
session.view = test_view.sort_by("eval_fp", reverse=True)

It would be best to get this [data reannotated to fix these mistakes](https://towardsdatascience.com/managing-annotation-mistakes-with-fiftyone-and-labelbox-fc6e87b51102), but in the meantime, we can easily remedy this by simply creating a new view that remaps the labels `car`, `truck`, and `bus` all to `vehicle` and then retraining the model with that. This is only possible because we are backing our data in FiftyOne and loading views into PyTorch as needed. Without FiftyOne, the PyTorch dataset class or the underlying data would need to be changed to remap these classes.

In [None]:
# Merge the listed vehicle categories into one "vehicle" label
vehicle_list = ['car', 'bus', 'truck']
vehicles_map = dict.fromkeys(vehicle_list, "vehicle")

# Relabel the ground truth of both splits with the same mapping
train_map_view, test_map_view = (
    split_view.map_labels("ground_truth", vehicles_map)
    for split_view in (train_view, test_view)
)

# Wrap the remapped views as PyTorch datasets with the existing transforms
torch_map_dataset = FiftyOneTorchDataset(train_map_view, train_transforms)
torch_map_dataset_test = FiftyOneTorchDataset(test_map_view, test_transforms)

In [None]:
# The model needs one output class per distinct label left in the mapped
# training view, plus one for background. len(vehicles_map) is the number of
# *source* labels that were merged (3), not the number of classes after merging,
# so the class count is taken from the view itself.
# Assumes FiftyOneTorchDataset builds its class list from the labels present in
# the view when no `classes` argument is given -- TODO confirm.
MODEL_TYPE = 'Vanilla-FRCNN'
vehicle_classes = train_map_view.distinct("ground_truth.detections.label")
vehicle_model = create_model(MODEL_TYPE, num_classes=(len(vehicle_classes)+1))
train_model(vehicle_model, torch_map_dataset, torch_map_dataset_test, num_epochs=2, MODEL_TYPE=MODEL_TYPE)

In [None]:
# Store the vehicle model's detections for the remapped test split under "vehicle_predictions"
add_detections(vehicle_model, torch_map_dataset_test, test_map_view, field_name="vehicle_predictions")

In [None]:
# Score "vehicle_predictions" on the remapped test view for the merged class only;
# results are recorded under the "vehicle_eval" key
vehicle_results = fo.evaluate_detections(
    test_map_view,
    "vehicle_predictions",
    classes=["vehicle"],
    eval_key="vehicle_eval",
    compute_mAP=True,
)

In [None]:
# Display the mAP of the "vehicle_predictions" evaluation
vehicle_results.mAP()

In [None]:
# Print the report for the "vehicle_predictions" evaluation
vehicle_results.print_report()

Due to our ability to easily visualize and manage our dataset with FiftyOne, we were able to spot and take action on a dataset issue that would otherwise have gone unnoticed if we only concerned ourselves with dataset-wide evaluation metrics and fixed dataset representations. Through these efforts, we managed to increase the mAP of the model to 43%.

Even though this example workflow may not work in all situations, this kind of class-merging strategy can be effective in cases where more fine-grained discrimination is not called for.