# Code for using FiftyOne to train a Faster RCNN on COCO data

###  Imports

In [1]:
import torch
import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F

from dataset import FiftyOneTorchDataset, get_transforms
from model import create_model
from utils import add_detections

from engine import train_model
import config

torch.manual_seed(1)

<torch._C.Generator at 0x13f84710850>

### Load full dataset from model zoo

In [2]:
#Lodad in the dataset from the FiftyOne model Zoo
fo_dataset = foz.load_zoo_dataset("coco-2017", "validation")

#needed to calculate image height and width
fo_dataset.compute_metadata()

session = fo.launch_app(fo_dataset)

Downloading split 'validation' to 'C:\Users\blain\fiftyone\coco-2017\validation' if necessary
Found annotations at 'C:\Users\blain\fiftyone\coco-2017\raw\instances_val2017.json'
Images already downloaded
Existing download of split 'validation' is sufficient
Loading existing dataset 'coco-2017-validation'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


For example, cluttered images make it difficult for models to localize objects. We can use FiftyOne to create a view containing only samples with more than, say, 10 objects. You can perform the same operations on views as datasets, so we can create an instance of our PyTorch dataset from this view:

In [3]:
#if we want to see images with more than 10 items, we can
# busy_view = fo_dataset.match(F("ground_truth.detections").length() > 10)
# busy_torch_dataset = FiftyOneTorchDataset(busy_view)
# session.view = busy_view

### Create training and testing views (and corresponding PyTorch datasets) that only contain some items from the full dataset

In [4]:
# to filter certain items from the dataset we can
item_list = ["car", "dog", "bus", 'fork', 'tie', 'person']
item_view = fo_dataset.filter_labels("ground_truth",
        F("label").is_in(item_list))

#session.view = item_view

# split the dataset in train and test set
train_view = item_view.take((len(item_view) * config.TRAIN_TEST_SPLIT), seed=51)
test_view = item_view.exclude([s.id for s in train_view])

print(f'Traning on {len(train_view)} samples')
print(f'Testing on {len(test_view)} samples')


train_transforms, test_transforms = get_transforms()

# use our dataset and defined transformations
train_dataset = FiftyOneTorchDataset(train_view, train_transforms,
        classes=item_list)
evaluation_dataset = FiftyOneTorchDataset(test_view, test_transforms,
        classes=item_list)

Traning on 2458 samples
Testing on 615 samples


### Training and Evaluation

In [5]:
MODEL_TYPE='CLIP-FRCNN'

# CLIP-FRCNN creates a FRCNN using CLIP features as the model backbone
# Fully custom vanilla uses a pre-trained resnet50 backbone, and generates new anchor generator and roi pooling
# Custom-Vanilla uses the pre-trained FRCNN from pytorch and replaces the roi heads only

model = create_model(MODEL_TYPE, num_classes=(len(item_list)+1))
train_model(model, train_dataset, evaluation_dataset, num_epochs=config.NUM_EPOCHS, MODEL_TYPE=MODEL_TYPE)

Using device cuda


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Training Epoch: [0]  [   0/1229]  eta: 0:34:29  lr: 0.000010  loss: 2.7686 (2.7686)  loss_classifier: 1.9219 (1.9219)  loss_box_reg: 0.0503 (0.0503)  loss_objectness: 0.7342 (0.7342)  loss_rpn_box_reg: 0.0617 (0.0617)  time: 1.6837  data: 0.3540  max mem: 1310
Training Epoch: [0]  [  10/1229]  eta: 0:08:34  lr: 0.000060  loss: 2.7686 (2.7384)  loss_classifier: 1.8906 (1.8610)  loss_box_reg: 0.0497 (0.0578)  loss_objectness: 0.7307 (0.7304)  loss_rpn_box_reg: 0.0755 (0.0892)  time: 0.4224  data: 0.1803  max mem: 1530
Training Epoch: [0]  [  20/1229]  eta: 0:07:12  lr: 0.000110  loss: 2.5292 (2.4910)  loss_classifier: 1.6924 (1.5967)  loss_box_reg: 0.0497 (0.0742)  loss_objectness: 0.7189 (0.7180)  loss_rpn_box_reg: 0.0823 (0.1021)  time: 0.2916  data: 0.1571  max mem: 1532
Training Epoch: [0]  [  30/1229]  eta: 0:06:42  lr: 0.000160  loss: 1.5667 (2.0816)  loss_classifier: 0.7021 (1.2137)  loss_box_reg: 0.0578 (0.0771)  loss_objectness: 0.6871 (0.6981)  loss_rpn_box_reg: 0.0689 (0.0926)

RuntimeError: [enforce fail at ..\caffe2\serialize\inline_container.cc:300] . unexpected pos 39309184 vs 39309080

In [6]:
#train a custom vanilla model so that we can compare and make sure the CLIP FRCNN is comparable
# Fully-Custom-Vanilla is most appropriate as it generates the model in a similar fashion
MODEL_TYPE = 'Fully-Custom-Vanilla'

vanilla_model = create_model(MODEL_TYPE, num_classes=(len(item_list)+1))
train_model(vanilla_model, train_dataset, evaluation_dataset, num_epochs=10, MODEL_TYPE=MODEL_TYPE)

Using device cuda
Training Epoch: [0]  [   0/1229]  eta: 0:09:46  lr: 0.000010  loss: 2.7297 (2.7297)  loss_classifier: 1.8907 (1.8907)  loss_box_reg: 0.0239 (0.0239)  loss_objectness: 0.7061 (0.7061)  loss_rpn_box_reg: 0.1090 (0.1090)  time: 0.4770  data: 0.1310  max mem: 3239
Training Epoch: [0]  [  10/1229]  eta: 0:08:21  lr: 0.000060  loss: 2.6521 (2.6988)  loss_classifier: 1.8757 (1.8517)  loss_box_reg: 0.0388 (0.0390)  loss_objectness: 0.6960 (0.6970)  loss_rpn_box_reg: 0.0922 (0.1111)  time: 0.4113  data: 0.1485  max mem: 3713
Training Epoch: [0]  [  20/1229]  eta: 0:07:25  lr: 0.000110  loss: 2.5596 (2.5217)  loss_classifier: 1.7424 (1.6794)  loss_box_reg: 0.0388 (0.0368)  loss_objectness: 0.6896 (0.6891)  loss_rpn_box_reg: 0.0475 (0.1164)  time: 0.3627  data: 0.1444  max mem: 3713
Training Epoch: [0]  [  30/1229]  eta: 0:07:02  lr: 0.000160  loss: 1.9389 (2.2141)  loss_classifier: 1.0558 (1.3762)  loss_box_reg: 0.0410 (0.0451)  loss_objectness: 0.6697 (0.6796)  loss_rpn_box_re

### Evaluate the model

In [7]:
add_detections(model, evaluation_dataset, fo_dataset, field_name="predictions")

results = fo.evaluate_detections(
    test_view,
    "predictions",
    classes=item_list,
    eval_key="eval",
    compute_mAP=True
)

Using device cuda
 100% |█████████████████| 615/615 [45.2s elapsed, 0s remaining, 13.6 samples/s]      
Evaluating detections...
 100% |█████████████████| 615/615 [11.0s elapsed, 0s remaining, 59.3 samples/s]      
Performing IoU sweep...
 100% |█████████████████| 615/615 [14.2s elapsed, 0s remaining, 47.1 samples/s]      


In [8]:
results.mAP()

0.12274161085786321

In [9]:
results.print_report()

              precision    recall  f1-score   support

         car       0.10      0.59      0.17       508
         dog       0.15      0.80      0.25        41
         bus       0.13      0.70      0.22        90
        fork       0.09      0.18      0.12        44
         tie       0.11      0.10      0.11        29
      person       0.31      0.86      0.45      4349

   micro avg       0.26      0.82      0.39      5061
   macro avg       0.15      0.54      0.22      5061
weighted avg       0.28      0.82      0.41      5061



By default, objects are only matched with other objects of the same class. In order to get an interesting confusion matrix, we need to match interclass objects by setting `classwise=False`.

In [None]:
results_interclass = fo.evaluate_detections(
    test_view, 
    "predictions", 
    classes=item_list,
    compute_mAP=True, 
    classwise=False
)

In [None]:
plot = results.plot_pr_curves(classes=item_list)
plot.show()

In [None]:
results_interclass.plot_confusion_matrix(classes=item_list, include_other=False, include_missing=False)

The [detection evaluation](https://voxel51.com/docs/fiftyone/user_guide/evaluation.html#detections) also added the attributes `eval_fp`, `eval_tp`, and `eval_fn` to every predicted detection indicating if it is a false positive, true positive, or false negative. 
Let's create a view to find the worst samples by sorting by `eval_fp` using the [FiftyOne App](https://voxel51.com/docs/fiftyone/user_guide/app.html) to visualize the results. 

In [None]:
session.view = test_view.sort_by("eval_fp", reverse=True)

In [None]:
session.view = test_view.sort_by("eval_fp", reverse=True)

It would be best to get this [data reannotated to fix these mistakes](https://towardsdatascience.com/managing-annotation-mistakes-with-fiftyone-and-labelbox-fc6e87b51102), but in the meantime, we can easily remedy this by simply creating a new view that remaps the labels `car`, `truck`, and `bus` all to `vehicle` and then retraining the model with that. This is only possible because we are backing our data in FiftyOne and loading views into PyTorch as needed. Without FiftyOne, the PyTorch dataset class or the underlying data would need to be changed to remap these classes.

In [None]:
# map labels to single vehicle class
vehicle_list = ['car', 'bus', 'truck']
vehicles_map = {c: "vehicle" for c in vehicle_list}

train_map_view = train_view.map_labels("ground_truth", vehicles_map)
test_map_view = test_view.map_labels("ground_truth", vehicles_map)

# use our dataset and defined transformations
torch_map_dataset = FiftyOneTorchDataset(train_map_view, train_transforms)
torch_map_dataset_test = FiftyOneTorchDataset(test_map_view, test_transforms)

In [16]:
# Only 2 classes (background and vehicle)
MODEL_TYPE = 'Vanilla-FRCNN'
vehicle_model = create_model(MODEL_TYPE, num_classes=(len(vehicles_map)+1))
train_model(vehicle_model, torch_map_dataset, torch_map_dataset_test, num_epochs=2, MODEL_TYPE=MODEL_TYPE)

UnboundLocalError: local variable 'model' referenced before assignment

In [None]:
add_detections(vehicle_model, torch_map_dataset_test, test_map_view, field_name="vehicle_predictions")

In [None]:
vehicle_results = fo.evaluate_detections(
    test_map_view, 
    "vehicle_predictions", 
    classes=["vehicle"], 
    eval_key="vehicle_eval", 
    compute_mAP=True
)

In [None]:
vehicle_results.mAP()

In [None]:
vehicle_results.print_report()

Due to our ability to easily visualize and manage our dataset with FiftyOne, we were able to spot and take action on a dataset issue that would otherwise have gone unnoticed if we only concerned ourselves with dataset-wide evaluation metrics and fixed dataset representations. Through these efforts, we managed to increase the mAP of the model to 43%.

Even though this example workflow may not work in all situations, this kind of class-merging strategy can be effective in cases where more fine-grained discrimination is not called for.