You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I've been having many times this error, which appears randomly while finetuning:
[W python_anomaly_mode.cpp:104] Warning: Error detected in PowBackward0. Traceback of forward call that caused the error:
File "train_mvs_nerf_finetuning_pl.py", line 324, in <module>
trainer.fit(system)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
self._run(model)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
self.dispatch()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
self.accelerator.start_training(self)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self._results = trainer.run_stage()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
return self.run_train()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
self.train_loop.run_training_epoch()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 499, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 738, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 434, in optimizer_step
model_ref.optimizer_step(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1403, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 329, in optimizer_step
self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 336, in run_optimizer_step
self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 193, in optimizer_step
optimizer.step(closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/adam.py", line 66, in step
loss = closure()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 732, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 823, in training_step_and_backward
result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 290, in training_step
training_step_output = self.trainer.accelerator.training_step(args)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 204, in training_step
return self.training_type_plugin.training_step(*args)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 155, in training_step
return self.lightning_module.training_step(*args, **kwargs)
File "train_mvs_nerf_finetuning_pl.py", line 177, in training_step
img_loss = img2mse(rgbs, rgbs_target)
File "/host/home/ubuntu/mvsnerf/utils.py", line 12, in <lambda>
img2mse = lambda x, y : torch.mean((x - y) ** 2)
(function _print_stack)
Traceback (most recent call last):
File "train_mvs_nerf_finetuning_pl.py", line 324, in <module>
trainer.fit(system)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
self._run(model)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
self.dispatch()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
self.accelerator.start_training(self)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
self.training_type_plugin.start_training(trainer)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
self._results = trainer.run_stage()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
return self.run_train()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
self.train_loop.run_training_epoch()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 499, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 738, in run_training_batch
self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 434, in optimizer_step
model_ref.optimizer_step(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1403, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 329, in optimizer_step
self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 336, in run_optimizer_step
self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 193, in optimizer_step
optimizer.step(closure=lambda_closure, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/optim/adam.py", line 66, in step
loss = closure()
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 732, in train_step_and_backward_closure
result = self.training_step_and_backward(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 836, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 869, in backward
result.closure_loss = self.trainer.accelerator.backward(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 308, in backward
output = self.precision_plugin.backward(
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 79, in backward
model.backward(closure_loss, optimizer, opt_idx)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1275, in backward
loss.backward(*args, **kwargs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/tensor.py", line 245, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/root/miniconda3/envs/mvsnerf2/lib/python3.8/site-packages/torch/autograd/__init__.py", line 145, in backward
Variable._execution_engine.run_backward(
RuntimeError: Function 'PowBackward0' returned nan values in its 0th output.
I've been debugging, and so far what I've found is that this is due to this raw output to be NaN in renderer.py:156:
raw = network_query_fn(rays_ndc, angle, input_feat, network_fn)
I'm attaching some scope variables in a pickle to reproduce.
I've been having many times this error, which appears randomly while finetuning:
I've been debugging, and so far what I've found is that this is due to this raw output to be
NaN
in renderer.py:156:I'm attaching some scope variables in a pickle to reproduce.
https://www.dropbox.com/s/ew0uetjpedn2e9n/bug_dump.pkl?dl=0
how to better avoid, and mitigate these raw outputs?
The text was updated successfully, but these errors were encountered: