You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Hi, I'm new to iBOT and mmcv, so sorry to bother you. I'm trying to reproduce the object detection task in the evaluation phase. I set the job name to "first_try", and my command is shown below:
2022-11-10 17:52:14,699 - mmseg - INFO - workflow: [('train', 1)], max: 160000 iters
Traceback (most recent call last):
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 176, in <module>
Traceback (most recent call last):
main()
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 176, in <module>
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 172, in main
Traceback (most recent call last):
meta=meta)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 176, in <module>
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/train_api.py", line 187, in train_segmentor
main()
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 172, in main
runner.run(data_loaders, cfg.workflow)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
meta=meta)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/train_api.py", line 187, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
main()
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 172, in main
meta=meta)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/train_api.py", line 187, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
iter_runner(iter_loaders[i], **kwargs)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
iter_runner(iter_loaders[i], **kwargs)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
self.call_hook('after_train_iter')
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
self.call_hook('after_train_iter')
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py", line 37, in after_train_iter
getattr(hook, fn_name)(self)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py", line 37, in after_train_iter
scaled_loss.backward()
getattr(hook, fn_name)(self)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py", line 37, in after_train_iter
scaled_loss.backward()
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
scaled_loss.backward()
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/function.py", line 89, in apply
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/function.py", line 89, in apply
return self._forward_cls.backward(self, *args) # type: ignore
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 99, in backward
allow_unreachable=True) # allow_unreachable flag
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/function.py", line 89, in apply
return self._forward_cls.backward(self, *args) # type: ignore
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 99, in backward
torch.autograd.backward(outputs, args)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
return self._forward_cls.backward(self, *args) # type: ignore
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 99, in backward
torch.autograd.backward(outputs, args)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
torch.autograd.backward(outputs, args)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases yet.
allow_unreachable=True) # allow_unreachable flag
allow_unreachable=True) # allow_unreachable flag
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases yet.
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases yet.
Traceback (most recent call last):
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 176, in <module>
main()
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py", line 172, in main
meta=meta)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/train_api.py", line 187, in train_segmentor
runner.run(data_loaders, cfg.workflow)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 133, in run
iter_runner(iter_loaders[i], **kwargs)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
self.call_hook('after_train_iter')
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/mmcv_custom/apex_runner/optimizer.py", line 37, in after_train_iter
scaled_loss.backward()
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/function.py", line 89, in apply
return self._forward_cls.backward(self, *args) # type: ignore
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/utils/checkpoint.py", line 99, in backward
torch.autograd.backward(outputs, args)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases yet.
Traceback (most recent call last):
File "/home/username/anaconda3/envs/py37/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/username/anaconda3/envs/py37/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/distributed/launch.py", line 260, in <module>
main()
File "/home/username/anaconda3/envs/py37/lib/python3.7/site-packages/torch/distributed/launch.py", line 256, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/home/username/anaconda3/envs/py37/bin/python3', '-u', '/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/train.py', '--local_rank=3', '/data/data0/username/spaceevo_segmentation/ibot/evaluation/semantic_segmentation/configs/upernet/vit_small_512_ade20k_160k.py', '--launcher', 'pytorch', '--work-dir', '/data/data0/username/spaceevo_segmentation/ibot/work_dirs/first_try/seg', '--deterministic', '--options', 'model.backbone.use_checkpoint=True', 'model.pretrained=/data/data0/username/spaceevo_segmentation/ibot/work_dirs/first_try/checkpoint_teacher.pth', 'data.samples_per_gpu=4', 'model.backbone.out_with_norm=true', 'optimizer.lr=3e-5']' returned non-zero exit status 1.
I also tried the linear head for segmentation, and that did not produce this error. Have you ever encountered this problem? Thanks a lot!
The text was updated successfully, but these errors were encountered:
Hi, I'm new to iBOT and mmcv, so sorry to bother you. I'm trying to reproduce the object detection task in the evaluation phase. I set the job name to "first_try", and my command is shown below:
and an error occurred before training:
I also tried the linear head for segmentation, and that did not produce this error. Have you ever encountered this problem? Thanks a lot!
The text was updated successfully, but these errors were encountered: