Output becomes zero after optimizer.step() yolo-v1 model #60

Open · guruprasaad123 opened this issue Jun 20, 2021 · 0 comments

guruprasaad123 commented Jun 20, 2021

I encountered this error while trying to train the model on my local GPU.

Location: Machine-Learning-Collection/ML/Pytorch/object_detection/YOLO/

This is the test script I used to exercise the yolo-v1 model:

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Yolov1, YoloLoss, VOCDataset, and transform are assumed to come from the
# repo's model.py, loss.py, dataset.py, and train.py respectively.
from model import Yolov1
from loss import YoloLoss
from dataset import VOCDataset
from train import transform


if __name__ == '__main__':

    csv_file_path = 'PascalVOC_YOLO/100examples.csv'
    img_dir = 'PascalVOC_YOLO/images'
    label_path = 'PascalVOC_YOLO/labels'

    learning_rate = 1e-10
    num_workers = 2
    batch_size = 2
    weight_decay = 1e-4

    sample_dataset = VOCDataset(csv_file_path, img_dir, label_path, transform=transform)

    sample_loader = DataLoader(
        dataset=sample_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # The whole model is cast to fp16 (see the note below for why).
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(device).half()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    loss_func = YoloLoss().to(device)

    for i in range(2):
        print('iter : ', i, '\n')

        # Variable() is deprecated since PyTorch 0.4; plain tensors carry autograd.
        x, y = next(iter(sample_loader))
        x, y = x.to(device).half(), y.to(device).half()

        out = model(x)
        print('out : ', out, '\n')

        loss = loss_func(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('loss : ', loss, '\n')
        print('loss : data ', loss.data, '\n')
        print(' loss : grad ', loss.grad, '\n')  # loss is not a leaf, so this prints None

        # Per-parameter gradient health check: all finite? max magnitude?
        for name, param in model.named_parameters():
            print(name, torch.isfinite(param.grad).all(), torch.max(torch.abs(param.grad)))

        print('\n')

Note: I am using half() because otherwise I hit a CUDA error: RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED.
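
For reference, the usual way to get fp16 speed and memory savings without casting the whole model is automatic mixed precision. A minimal sketch using torch.cuda.amp (PyTorch >= 1.6), reusing the names from the script above; note the model stays in float32 here (no .half()):

scaler = torch.cuda.amp.GradScaler()

for x, y in sample_loader:
    x, y = x.to(device), y.to(device)  # inputs stay float32
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():  # forward ops run in fp16 where safe
        out = model(x)
        loss = loss_func(out, y)
    scaler.scale(loss).backward()  # scales the loss so fp16 grads don't underflow
    scaler.step(optimizer)         # unscales grads and skips the step on inf/nan
    scaler.update()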

While running the script I got the output below:

iter :  0

out :  tensor([[-0.1432,  0.0819,  0.0342,  ..., -0.0377, -0.0745,  0.1312],
        [ 0.1110, -0.0650,  0.2410,  ..., -0.0765,  0.3328,  0.1908]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddmmBackward>)

loss :  tensor(1., device='cuda:0', dtype=torch.float16, grad_fn=<ClampBackward>)

loss : data  tensor(1., device='cuda:0', dtype=torch.float16)

test.py:94: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the gradient for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.
  print(' loss : grad ',loss.grad , '\n')
 loss : grad  None

darknet.0.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.0.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.0.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.1.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.1.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.4.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.4.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)


iter :  1

out :  tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0',
       dtype=torch.float16, grad_fn=<AddmmBackward>)

[W python_anomaly_mode.cpp:104] Warning: Error detected in MseLossBackward. Traceback of forward call that caused the error:
  File "test.py", line 86, in <module>
    loss = loss_func(out,y)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/mnt/e/workspace/@training/@datasets/cnns/yolo/yolo-v1-pytorch/loss.py", line 120, in forward
    torch.flatten(exists_box * target[..., :20], end_dim=-2,),
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 528, in forward
    return F.mse_loss(input, target, reduction=self.reduction)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/functional.py", line 2929, in mse_loss
    return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
 (function _print_stack)
Traceback (most recent call last):
  File "test.py", line 89, in <module>
    loss.backward()
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/tensor.py", line 245, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/autograd/__init__.py", line 147, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: Function 'MseLossBackward' returned nan values in its 0th output.
(dev) buckaroo@hansolo:/mnt/e/workspace/@training/@datasets/cnns/yolo/yolo-v1-pytorch$ python3 test.py
iter :  0

out :  tensor([[-0.1044, -0.3135, -0.4897,  ..., -0.1079, -0.0055, -0.0380],
        [ 0.1190, -0.3154, -0.0910,  ..., -0.0995, -0.1595, -0.0576]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddmmBackward>)

loss :  tensor(1.0010, device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

loss : data  tensor(1.0010, device='cuda:0', dtype=torch.float16)

test.py:94: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the gradient for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.
  print(' loss : grad ',loss.grad , '\n')
 loss : grad  None

darknet.0.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.0.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.0.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.2.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.4.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.5.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.6.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.7.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.9.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.10.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.11.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.12.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.13.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.14.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.15.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.16.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.17.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.18.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.20.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.21.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.22.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.23.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.24.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.25.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.26.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.conv.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.batchnorm.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
darknet.27.batchnorm.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.1.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.1.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.4.weight tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)
fcs.4.bias tensor(True, device='cuda:0') tensor(0., device='cuda:0', dtype=torch.float16)

iter :  1

out :  tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], device='cuda:0',
       dtype=torch.float16, grad_fn=<AddmmBackward>)

Observations

  • For iteration 0 (i.e., before optimizer.step()), the model output is a valid tensor with finite values.
  • Once iteration 1 begins (i.e., after optimizer.step()), the output becomes all NaN (a small check for this is sketched below).
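
Since the gradients printed above are all finite (with max magnitude 0) and yet the very next forward pass is NaN, it looks like the weights themselves turn NaN during the update. A minimal diagnostic sketch, reusing the model/optimizer names from the script above:

# Hypothetical check: report the first parameter that becomes NaN
# immediately after the optimizer update.
optimizer.step()
for name, param in model.named_parameters():
    if torch.isnan(param).any():
        print(name, 'is NaN after optimizer.step()')
        break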

Debug method: 0

  • After setting torch.autograd.set_detect_anomaly(True) globally, I got the result below:
[W python_anomaly_mode.cpp:104] Warning: Error detected in MseLossBackward. Traceback of forward call that caused the error:
  File "test.py", line 86, in <module>
    loss = loss_func(out,y)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/mnt/e/workspace/@training/@datasets/cnns/yolo/yolo-v1-pytorch/loss.py", line 120, in forward
    torch.flatten(exists_box * target[..., :20], end_dim=-2,),
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/modules/loss.py", line 528, in forward
    return F.mse_loss(input, target, reduction=self.reduction)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/nn/functional.py", line 2929, in mse_loss
    return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
 (function _print_stack)
Traceback (most recent call last):
  File "test.py", line 89, in <module>
    loss.backward()
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/tensor.py", line 245, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/buckaroo/miniconda3/envs/dev/lib/python3.7/site-packages/torch/autograd/__init__.py", line 147, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: Function 'MseLossBackward' returned nan values in its 0th output.

So I have tried:

  • clamping the loss tensors with torch.clamp(value, min=0.0, max=1.0) in loss.py
  • adding an epsilon (1e-6) inside torch.sqrt(), i.e. torch.sqrt(val + epsilon), in loss.py

But this did not fix my issue.
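
One caveat about the sqrt attempt: torch.sqrt(val + epsilon) still returns NaN whenever val < -epsilon, and the raw width/height predictions can go negative. A common pattern for this term takes the sqrt of the magnitude and restores the sign afterwards (a sketch only; box_predictions is my assumed name for the raw (w, h) slice in loss.py):

# Sqrt of the magnitude with the sign restored, so negative
# predictions cannot produce NaN.
box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
    torch.abs(box_predictions[..., 2:4] + 1e-6)
)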

References

  • Getting NaN values in backward pass
  • Output of Model is nan every time
  • Nan Loss coming after some time
  • Getting Nan after first iteration with custom loss
  • Weights become NaN values after first batch step
  • Why nan after backward pass?
  • NaN values popping up during loss.backward()
  • Debugging neural networks

So kindly help me debug this issue. Thanks in advance!
