IndexError: tensors used as indices must be long, byte or bool tensors (Retinanet only) #547

Closed
robmarkcole opened this issue Nov 15, 2020 · 16 comments

robmarkcole commented Nov 15, 2020

🐛 Bug

Describe the bug
On icevision 0.4.0, running retinanet on a custom dataset (fire detection) I get the following:

/usr/local/lib/python3.6/dist-packages/torch/nn/_reduction.py:44: UserWarning: size_average and reduce args will be deprecated, please use reduction='sum' instead.
  warnings.warn(warning.format(ret))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-28-c232684d32d4> in <module>()
      1 learn.freeze()
----> 2 learn.lr_find()

16 frames
/usr/local/lib/python3.6/dist-packages/fastai/callback/schedule.py in lr_find(self, start_lr, end_lr, num_it, stop_div, show_plot, suggestions)
    222     n_epoch = num_it//len(self.dls.train) + 1
    223     cb=LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
--> 224     with self.no_logging(): self.fit(n_epoch, cbs=cb)
    225     if show_plot: self.recorder.plot_lr_find()
    226     if suggestions:

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    203             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    204             self.n_epoch = n_epoch
--> 205             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    206 
    207     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_fit(self)
    194         for epoch in range(self.n_epoch):
    195             self.epoch=epoch
--> 196             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    197 
    198     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch(self)
    188 
    189     def _do_epoch(self):
--> 190         self._do_epoch_train()
    191         self._do_epoch_validate()
    192 

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_epoch_train(self)
    180     def _do_epoch_train(self):
    181         self.dl = self.dls.train
--> 182         self._with_events(self.all_batches, 'train', CancelTrainException)
    183 
    184     def _do_epoch_validate(self, ds_idx=1, dl=None):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in all_batches(self)
    158     def all_batches(self):
    159         self.n_iter = len(self.dl)
--> 160         for o in enumerate(self.dl): self.one_batch(*o)
    161 
    162     def _do_one_batch(self):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in one_batch(self, i, b)
    176         self.iter = i
    177         self._split(b)
--> 178         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    179 
    180     def _do_epoch_train(self):

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    152 
    153     def _with_events(self, f, event_type, ex, final=noop):
--> 154         try:       self(f'before_{event_type}')       ;f()
    155         except ex: self(f'after_cancel_{event_type}')
    156         finally:   self(f'after_{event_type}')        ;final()

/usr/local/lib/python3.6/dist-packages/fastai/learner.py in _do_one_batch(self)
    161 
    162     def _do_one_batch(self):
--> 163         self.pred = self.model(*self.xb)
    164         self('after_pred')
    165         if len(self.yb): self.loss = self.loss_func(self.pred, *self.yb)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/retinanet.py in forward(self, images, targets)
    556 
    557             # compute the losses
--> 558             losses = self.compute_loss(targets, head_outputs, anchors)
    559         else:
    560             # compute the detections

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/retinanet.py in compute_loss(self, targets, head_outputs, anchors)
    406             matched_idxs.append(self.proposal_matcher(match_quality_matrix))
    407 
--> 408         return self.head.compute_loss(targets, head_outputs, anchors, matched_idxs)
    409 
    410     def postprocess_detections(self, head_outputs, anchors, image_shapes):

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/retinanet.py in compute_loss(self, targets, head_outputs, anchors, matched_idxs)
     49         # type: (List[Dict[str, Tensor]], Dict[str, Tensor], List[Tensor], List[Tensor]) -> Dict[str, Tensor]
     50         return {
---> 51             'classification': self.classification_head.compute_loss(targets, head_outputs, matched_idxs),
     52             'bbox_regression': self.regression_head.compute_loss(targets, head_outputs, anchors, matched_idxs),
     53         }

/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/retinanet.py in compute_loss(self, targets, head_outputs, matched_idxs)
    118                     foreground_idxs_per_image,
    119                     targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]]
--> 120                 ] = 1.0
    121 
    122                 # find indices for which anchors should be ignored

IndexError: tensors used as indices must be long, byte or bool tensors
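For context, a minimal sketch of what this IndexError means, independent of icevision or retinanet: on the PyTorch release used in this traceback, integer tensors used as indices must be int64 (long), so indexing with an int32 tensor raises exactly this error, while casting the index tensor to long works. The variable names below are illustrative only, not taken from the library, and newer PyTorch releases accept int32 indices, so the snippet may not raise there.

import torch

# Illustrative only: reproduces the error class, not the icevision pipeline.
gt_classes_target = torch.zeros(10, 4)
idxs = torch.tensor([1, 3], dtype=torch.int32)   # int32 rather than int64

try:
    gt_classes_target[idxs] = 1.0   # raises on older PyTorch releases
except IndexError as e:
    print(e)   # tensors used as indices must be long, byte or bool tensors

gt_classes_target[idxs.long()] = 1.0   # casting the index tensor to long works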

To Reproduce
Steps to reproduce the behavior:
The error has happened during learn.lr_find(), but on another occasion I got past this point and hit the error during learn.fine_tune(50, 3e-3, freeze_epochs=1). I placed the notebook at https://github.com/robmarkcole/fire-detection-from-images/blob/master/pytorch/icevision/icevision_firenet_retinanet.ipynb

Expected behavior
No error

Screenshots
NA

Desktop (please complete the following information):

  • Mac Catalina 10.15.5

Additional context
None

robmarkcole added the bug label Nov 15, 2020
robmarkcole commented Nov 15, 2020

Strangely, on rerunning learn.lr_find() the error does not occur on the second attempt; however, it did then occur during training. Given the seemingly random nature of its occurrence, could it be arising from a bad annotation?

lgvaz added this to Needs triage in Bugs via automation Nov 15, 2020
robmarkcole commented

Still seen on 0.4.0.post1

lgvaz commented Nov 19, 2020

This is the first time I've seen this error; I have to admit I have no clue what it might be. I'll run the notebook you shared and investigate further.

You used the same code to train faster_rcnn and efficientdet without problems, right?

lgvaz moved this from Needs triage to High priority in Bugs Nov 19, 2020
robmarkcole commented

> You used the same code to train faster_rcnn and efficientdet without problems, right?

Correct :-)

robmarkcole commented Dec 4, 2020

I have rerun my notebook on icevision-0.4.0.post1 (nose-1.3.7, resnest-0.0.6b20201204) and the error is now:

ValueError: All bounding boxes should have positive height and width. Found invalid box [183.92996215820312, 231.0, 383.9397277832031, 113.37773895263672] for target at index 11.
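For anyone hitting the same ValueError: the reported box has ymax (113.38) smaller than ymin (231.0), i.e. a negative height. Below is a small sketch, assuming the torchvision-style targets format (a list of dicts with a 'boxes' tensor in xmin, ymin, xmax, ymax order), that flags such degenerate boxes; the helper name is made up for illustration.

import torch

def find_degenerate_boxes(targets):
    # Flag boxes with non-positive width or height in xyxy format.
    bad = []
    for i, t in enumerate(targets):
        boxes = t["boxes"]
        if boxes.numel() == 0:
            continue
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        mask = (widths <= 0) | (heights <= 0)
        if mask.any():
            bad.append((i, boxes[mask]))
    return bad

# The box from the error message fails because ymax < ymin.
targets = [{"boxes": torch.tensor([[183.93, 231.0, 383.94, 113.38]]),
            "labels": torch.tensor([1])}]
print(find_degenerate_boxes(targets))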

UPDATE: this was a separate issue that was introduced and then resolved separately

lgvaz commented Dec 4, 2020

For future reference, we're addressing this in: https://discord.com/channels/735877944085446747/780951889025564692/784409448017952808

robmarkcole commented

OK, I installed from master and am now on 0.5.1. I might have some insight now. I initially ran with learning rate 3e-3 and thought this issue must be resolved, as training proceeded without error. I then switched to lr 1e-3 and immediately got the tensors used as indices must be long, byte or bool tensors error again. Looking at the learning rate plot below, it has all these strange jumps in it, with a large jump occurring around 1e-3. Could this be related? Reminder: I only see this error with this dataset, and only with retinanet.

[screenshot: learning rate plot with irregular jumps]

robmarkcole changed the title from "IndexError: tensors used as indices must be long, byte or bool tensors" to "IndexError: tensors used as indices must be long, byte or bool tensors (Retinanet only)" Dec 12, 2020
lgvaz commented Dec 12, 2020

If you're running this in a notebook, when you get the error can you try invoking the debugger with %debug and checking the values of the following variables?

  • targets_per_image
  • matched_idxs_per_image
  • foreground_idxs_per_image

robmarkcole commented

I get:

> /usr/local/lib/python3.6/dist-packages/torchvision/models/detection/retinanet.py(120)compute_loss()
    118                     foreground_idxs_per_image,
    119                     targets_per_image['labels'][matched_idxs_per_image[foreground_idxs_per_image]]
--> 120                 ] = 1.0
    121 
    122                 # find indices for which anchors should be ignored

ipdb> 
ipdb> print(targets_per_image)
{'labels': tensor([], device='cuda:0', dtype=torch.int64), 'boxes': tensor([], device='cuda:0', size=(0, 4))}
ipdb> print(matched_idxs_per_image)
tensor([], dtype=torch.int32)
ipdb> print(foreground_idxs_per_image)
tensor([], dtype=torch.bool)

The training does not look correct at all

[screenshot: training loss plot]
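The debug output above suggests the trigger is an image with no ground-truth boxes: its targets are empty and matched_idxs_per_image comes back as an empty int32 tensor, which the PyTorch version in use refuses as an index. The sketch below reconstructs that state and shows the failing index expression; the has_annotations helper at the end is only an illustration of one possible workaround (filtering out records with no annotations before batching), not the icevision fix.

import torch

# State captured in the ipdb session above (CPU tensors for simplicity).
targets_per_image = {
    "labels": torch.tensor([], dtype=torch.int64),
    "boxes": torch.zeros((0, 4)),
}
matched_idxs_per_image = torch.tensor([], dtype=torch.int32)    # int32, not int64
foreground_idxs_per_image = torch.tensor([], dtype=torch.bool)

try:
    # Mirrors the failing index in torchvision's retinanet compute_loss.
    # Older PyTorch raises IndexError here; newer releases accept int32
    # indices and simply return an empty tensor.
    targets_per_image["labels"][matched_idxs_per_image[foreground_idxs_per_image]]
except IndexError as e:
    print(e)

# Possible workaround (illustrative only): skip images with no annotations.
def has_annotations(target):
    return target["boxes"].numel() > 0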

lgvaz commented Dec 13, 2020

So the loss is exploding here, which might be related. What happens if you run with a much smaller learning rate, like 5e-5 or 1e-4?
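For example, reusing the learner already built in the notebook (the epoch count here is a placeholder, not a recommendation):

# Assumes `learn` is the fastai Learner from the notebook.
learn.lr_find()                             # re-check the loss curve first
learn.fine_tune(10, 5e-5, freeze_epochs=1)  # retry with a much smaller lr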

robmarkcole commented Dec 16, 2020

Alright, I'm on 0.5.1 now, and indeed the loss is under control with a low lr and no error!

[screenshot: training results]

UPDATE: on a rerun, the error appears again!

[screenshot]

lgvaz commented Dec 16, 2020

And now the loss isn't exploding either =x

Any new insights on what might be happening?

robmarkcole commented

The error persists in 0.7.0. Losses all look fine. No new insights, I'm afraid.

robmarkcole commented

Alright, on 0.7.1a1 I am no longer hitting this issue. Any ideas whether a recent change could have resolved this, or am I just striking lucky somehow?

rsomani95 commented

@robmarkcole it may have to do with the fastai version update.

robmarkcole commented

Thanks @rsomani95, will close this then.

Bugs automation moved this from High priority to Closed May 12, 2021