In [1]:
import torch, glob, os, pprint

files = sorted(glob.glob("../debug/nan_grad_epoch*.pth"))
if not files:
    print("No nan_grad dump found in debug/ — list debug dir:", os.listdir("debug"))
else:
    path = files[-1]
    print("Loading:", path)
    d = torch.load(path)
    # print summary
    pprint.pprint({k: (type(v).__name__ if not isinstance(v, torch.Tensor) else {"shape": tuple(v.shape),
                                                                               "min": float(torch.min(v)) if v.numel() else None,
                                                                               "max": float(torch.max(v)) if v.numel() else None}) for k,v in d.items()})
    # show grad sample values (if present)
    if "grad_sample" in d and hasattr(d["grad_sample"], "tolist"):
        print("grad_sample (first elements):", d["grad_sample"].view(-1)[:50].tolist())
    if "inputs" in d and isinstance(d["inputs"], torch.Tensor):
        print("inputs shape:", tuple(d["inputs"].shape))
    if "targets" in d and isinstance(d["targets"], torch.Tensor):
        print("targets unique/counts:", torch.unique(d["targets"], return_counts=True))


Loading: ../debug/nan_grad_epoch1_step46_model.layer1.0.conv1.weight.pth
{'current_scale': 'float',
 'epoch': 'int',
 'grad_sample': {'max': 0.0, 'min': 0.0, 'shape': (100,)},
 'inputs': {'max': 0.9921568632125854, 'min': 0.0, 'shape': (64, 3, 320, 320)},
 'model_state': 'dict',
 'opt_state_keys': 'list',
 'param_name': 'str',
 'step_in_epoch': 'int',
 'targets': {'max': 6.0, 'min': 0.0, 'shape': (64,)},
 'where': 'str'}
grad_sample (first elements): [-0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
inputs shape: (64, 3, 320, 320)
targets unique/counts: (tensor([0, 1, 2, 3, 4, 5, 6]), tensor([ 6, 13,  5, 17,  4, 18,  1]))


In [2]:
import torch, glob, os
path = sorted(glob.glob("../debug/nan_grad_epoch*.pth"))[-1]
d = torch.load(path)
print("Dump path:", path)
for k,v in d.items():
    print(k, "->", type(v))
    if isinstance(v, torch.Tensor):
        print("  shape", tuple(v.shape), "dtype", v.dtype)
        print("  any NaN?", torch.isnan(v).any().item(), "any Inf?", torch.isinf(v).any().item())
        # show a small slice
        print("  sample:", v.view(-1)[:50])
    else:
        # could be list/dict/str
        print("  value preview:", str(v)[:400])


Dump path: ../debug/nan_grad_epoch1_step46_model.layer1.0.conv1.weight.pth
where -> <class 'str'>
  value preview: nan_grad_after_unscale
epoch -> <class 'int'>
  value preview: 1
step_in_epoch -> <class 'int'>
  value preview: 46
param_name -> <class 'str'>
  value preview: model.layer1.0.conv1.weight
current_scale -> <class 'float'>
  value preview: 65536.0
grad_sample -> <class 'torch.Tensor'>
  shape (100,) dtype torch.float32
  any NaN? False any Inf? False
  sample: tensor([-0., -0., 0., -0., 0., 0., 0., 0., 0., -0., -0., -0., -0., -0., -0., -0., -0., -0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])
inputs -> <class 'torch.Tensor'>
  shape (64, 3, 320, 320) dtype torch.float32
  any NaN? False any Inf? False
  sample: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [5]:
# debug_inspect.py
import torch, glob, pprint, os

files = sorted(glob.glob("../debug/*.pth"))
print("Found debug files:", files)
for p in files:
    print("\n---", p)
    d = torch.load(p, map_location="cpu")
    summary = {}
    for k,v in d.items():
        if isinstance(v, torch.Tensor):
            summary[k] = {
                "type": "tensor",
                "shape": tuple(v.shape),
                "dtype": str(v.dtype),
                "any_nan": bool(torch.isnan(v).any().item()),
                "any_inf": bool(torch.isinf(v).any().item()),
                "min": float(torch.min(v)) if v.numel() else None,
                "max": float(torch.max(v)) if v.numel() else None,
                "sample": v.view(-1)[:10].tolist() if v.numel() else []
            }
        else:
            # show a compact preview for non-tensors
            try:
                summary[k] = {"type": type(v).__name__, "preview": str(v)[:400]}
            except Exception:
                summary[k] = {"type": type(v).__name__, "preview": "<unprintable>"}
    pprint.pprint(summary)


Found debug files: ['../debug/crash_final.pth', '../debug/crash_nan_grad.pth', '../debug/nan_grad_epoch1_step46_model.layer1.0.conv1.weight.pth']

--- ../debug/crash_final.pth
{'exc': {'preview': 'Non-finite grad detected in model.layer1.0.conv1.weight; '
                    'saved debug',
         'type': 'str'},
 'inputs': {'any_inf': False,
            'any_nan': False,
            'dtype': 'torch.float32',
            'max': 0.9921568632125854,
            'min': 0.0,
            'sample': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            'shape': (64, 3, 320, 320),
            'type': 'tensor'},
 'model_state': {'preview': "OrderedDict([('linear.weight', tensor([[-0.0302,  "
                            '0.0208, -0.0642,  0.0693,  0.1149, -0.0922,  '
                            '0.1105,  0.0253,\n'
                            '          0.0959,  0.0190,  0.0632, -0.0165,  '
                            '0.1003,  0.0225, -0.0606,  0.0309,\n'
                            

In [1]:

import sys
sys.path.append("..")

# repro_offline.py — offline reproduce with detect_anomaly()
import torch, glob, traceback
from losses.reslt_loss import ResLTLoss  # adjust import if needed
from models import REGISTRY as MODEL_REG   # if your project exposes a model registry

# === ADAPT THESE to your project/config ===
MODEL_KEY = "resltresnet32"   # model registry key used in train.py
MODEL_ARGS = {"num_classes": 7, "scale": 1}
LOSS_ARGS = {"num_classes": 7, "head_classes": [...], "medium_classes": [...], "tail_classes": [...], "beta": 0.96}
CHECKPOINT_PATH = None  # if you have a checkpoint file, set path here (optional)
# ==========================================

# pick the latest debug file containing batch (we used nan_grad_... earlier)
dbg_files = sorted(glob.glob("../debug/nan_grad_epoch*.pth") + glob.glob("../debug/crash_nan_grad.pth") + glob.glob("../debug/crash_final.pth"))
if not dbg_files:
    raise SystemExit("No debug files found in debug/")

dbg = torch.load(dbg_files[-1], map_location="cpu")
print("Using debug file:", dbg_files[-1])
# Expect inputs/targets in dbg
inputs = dbg.get("inputs")
targets = dbg.get("targets")
if inputs is None or targets is None:
    raise SystemExit("Debug file does not contain inputs/targets. Use in-process reproduction.")

# build model and load checkpoint if available (or load model_state from crash_final if present)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = MODEL_REG[MODEL_KEY](**MODEL_ARGS).to(device)

# If crash_final contains full state, use that (preferred)
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"])
        print("Loaded model_state from debug file (crash_final preview).")
    except Exception as e:
        print("Failed loading model_state from debug file preview:", e)

# If you have an external checkpoint file, you can also load it now:
if CHECKPOINT_PATH:
    ck = torch.load(CHECKPOINT_PATH, map_location=device)
    model.load_state_dict(ck["model"])
    print("Loaded checkpoint", CHECKPOINT_PATH)

model.train()
inputs = inputs.to(device)
targets = targets.to(device)

# Recreate your loss exactly as in training
loss_fn = ResLTLoss(**{k:v for k,v in LOSS_ARGS.items() if k in ["num_classes","head_classes","medium_classes","tail_classes","beta"]})

torch.autograd.set_detect_anomaly(True)
try:
    with torch.autograd.detect_anomaly():
        with torch.cuda.amp.autocast(enabled=False):  # set enabled=True if you want AMP reproduction
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
        loss.backward()
except Exception as e:
    print("=== ANOMALY TRACE ===")
    traceback.print_exc()
    raise
else:
    print("No anomaly raised for that batch (unexpected).")


Using debug file: ../debug/nan_grad_epoch1_step46_model.layer1.0.conv1.weight.pth
Failed loading model_state from debug file preview: Error(s) in loading state_dict for ResLTResNet32:
	Missing key(s) in state_dict: "model.layer1.2.bn2.bias", "model.layer1.2.bn2.running_mean", "model.layer1.2.bn2.running_var", "model.layer1.3.conv1.weight", "model.layer1.3.bn1.weight", "model.layer1.3.bn1.bias", "model.layer1.3.bn1.running_mean", "model.layer1.3.bn1.running_var", "model.layer1.3.conv2.weight", "model.layer1.3.bn2.weight", "model.layer1.3.bn2.bias", "model.layer1.3.bn2.running_mean", "model.layer1.3.bn2.running_var", "model.layer1.4.conv1.weight", "model.layer1.4.bn1.weight", "model.layer1.4.bn1.bias", "model.layer1.4.bn1.running_mean", "model.layer1.4.bn1.running_var", "model.layer1.4.conv2.weight", "model.layer1.4.bn2.weight", "model.layer1.4.bn2.bias", "model.layer1.4.bn2.running_mean", "model.layer1.4.bn2.running_var", "model.layer2.0.conv1.weight", "model.layer2.0.bn1.weight", "mode

  with torch.autograd.detect_anomaly():
  with torch.cuda.amp.autocast(enabled=False):  # set enabled=True if you want AMP reproduction


=== ANOMALY TRACE ===


Traceback (most recent call last):
  File "/tmp/ipykernel_396928/4038075077.py", line 59, in <module>
    loss = loss_fn(outputs, targets)
  File "/home/user/abin_ref_papers/environments/snn_stdp_poisson/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/user/abin_ref_papers/environments/snn_stdp_poisson/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/user/abin_ref_papers/project_structure_demo/dnn_template/notebooks/../losses/reslt_loss.py", line 97, in forward
    labelH = target_onehot[:, self.head_classes].sum(dim=1).bool()   # boolean mask
RuntimeError: Could not infer dtype of ellipsis


RuntimeError: Could not infer dtype of ellipsis

In [None]:
# repro_anomaly.py
import torch, glob, traceback, sys, os
from losses.reslt_loss import ResLTLoss
from models import REGISTRY as MODEL_REG  # uses your project's model registry

DEBUG_DIR = "debug"
# pick the most informative debug file (prefer crash_final if it contains model_state)
cands = sorted(glob.glob(os.path.join(DEBUG_DIR, "crash_final.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "nan_grad_epoch*.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "crash_nan_grad.pth")))
if not cands:
    print("No debug files found in debug/. Exiting.")
    sys.exit(1)
dbg_path = cands[0] if os.path.basename(cands[0]) == "crash_final.pth" else cands[-1]
print("Using debug file:", dbg_path)
dbg = torch.load(dbg_path, map_location="cpu")

# Extract inputs and targets from debug
inputs = dbg.get("inputs", None)
targets = dbg.get("targets", None)
if inputs is None or targets is None:
    print("Debug file does not contain inputs/targets. Exiting.")
    sys.exit(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# === Construct model (match your train config) ===
MODEL_KEY = "resltresnet32"   # change if different
MODEL_ARGS = {"num_classes": 7, "scale": 1}

model = MODEL_REG[MODEL_KEY](**MODEL_ARGS).to(device)

# If crash_final saved full model_state, prefer that
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"])
        print("Loaded model_state from debug file.")
    except Exception as e:
        print("Failed to load model_state from debug file:", e)

# Move inputs/targets to device
inputs = inputs.to(device)
targets = targets.to(device)

# === Recreate ResLTLoss with the same args you used ===
loss_fn = ResLTLoss(
    num_classes=7,
    head_classes=["Very_Low", "Non-burnable"],    # replace names with indices if your loss expects indices
    medium_classes=["Low", "Moderate"],
    tail_classes=["High", "Very_High", "Water"],
    beta=0.96
)
# If your ResLTLoss expects indices instead of names, adapt above to use index lists.

loss_fn = loss_fn.to(device) if hasattr(loss_fn, "to") else loss_fn

# Run under anomaly detection. Try first WITH AMP (since crash happened with AMP enabled),
# then WITHOUT AMP if needed.
print("Running forward/backward under torch.autograd.detect_anomaly() with AMP autocast enabled.")
torch.autograd.set_detect_anomaly(True)
try:
    # Use AMP autocast same as training (device_type auto)
    with torch.cuda.amp.autocast(enabled=True):
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
    # backward outside autocast; detect_anomaly will catch op producing NaN/Inf
    loss.backward()
    print("Backward completed without anomaly (unexpected).")
except Exception:
    print("=== ANOMALY TRACE (AMP run) ===")
    traceback.print_exc()

# If AMP run didn't reveal it, try without AMP
print("\nNow trying without AMP (plain fp32) under detect_anomaly().")
try:
    model.zero_grad(set_to_none=True)
    with torch.autograd.detect_anomaly():
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
    print("Backward (no AMP) completed without anomaly (unexpected).")
except Exception:
    print("=== ANOMALY TRACE (no AMP run) ===")
    traceback.print_exc()


In [5]:
import sys
sys.path.append("..")


# repro_anomaly_fixed.py
import os, glob, torch, traceback, sys

# --- project imports (assumes repo root is on PYTHONPATH like in train.py) ---
from models import REGISTRY as MODEL_REG
from losses.reslt_loss import ResLTLoss
from data import REGISTRY as DATA_REG   # to get class_to_idx mapping (same as train.py)

DEBUG_DIR = "../debug"
# pick prefer crash_final, else the last nan_grad file
cands = sorted(glob.glob(os.path.join(DEBUG_DIR, "crash_final.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "nan_grad_epoch*.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "crash_nan_grad.pth")))
if not cands:
    print("No debug files found in debug/. Exiting.")
    sys.exit(1)
dbg_path = cands[0] if os.path.basename(cands[0]) == "crash_final.pth" else cands[-1]
print("Using debug file:", dbg_path)
dbg = torch.load(dbg_path, map_location="cpu")

# Extract saved batch
inputs = dbg.get("inputs", None)
targets = dbg.get("targets", None)
if inputs is None or targets is None:
    print("Debug file missing inputs/targets. Exiting.")
    sys.exit(1)

# Reduce batch size to avoid OOM during repro; 8 is usually safe
inputs = inputs[:8]
targets = targets[:8]

# --- Build dataset to map names -> indices (only if your config used names) ---
# adapt these to the dataset key / args used in your training config
DATA_KEY = "firerisk"   # same as cfg["dataset"]
DATA_DIR = "../data/FireRisk"
# Use same arguments as training if needed (image_size, to_rgb, imagenet_norm)
train_set, _ = DATA_REG[DATA_KEY](DATA_DIR, image_size=320, to_rgb=False, imagenet_norm=False)
class_to_idx = getattr(train_set, "class_to_idx", None)
if class_to_idx is None:
    print("Warning: dataset has no class_to_idx. If your config used names you must provide indices.")
    class_to_idx = {}

# The names you used in your YAML config:
head_names   = ["Very_Low", "Non-burnable"]
medium_names = ["Low", "Moderate"]
tail_names   = ["High", "Very_High", "Water"]

# Convert to indices (if names missing, try using them as indices already)
def names_to_indices(lst):
    if not lst:
        return []
    if isinstance(lst[0], str):
        if not class_to_idx:
            raise RuntimeError("class_to_idx not available; cannot map names to indices.")
        return [class_to_idx[n] for n in lst]
    return list(lst)

head_idx = names_to_indices(head_names)
medium_idx = names_to_indices(medium_names)
tail_idx = names_to_indices(tail_names)
print("head_idx, medium_idx, tail_idx:", head_idx, medium_idx, tail_idx)

# --- Construct model (match your training model key and args) ---
MODEL_KEY = "resltresnet32"   # change if different
MODEL_ARGS = {"num_classes": 7, "scale": 1}
device = "cpu"   # run on CPU to avoid GPU OOM; change to "cuda" only if you want GPU and have memory
model = MODEL_REG[MODEL_KEY](**MODEL_ARGS).to(device)

# If crash_final.pth contained full model_state, try to load it
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"])
        print("Loaded model_state from debug file.")
    except Exception as e:
        print("Warning: failed to load model_state from debug file:", e)

# Move inputs/targets to device
inputs = inputs.to(device)
targets = targets.to(device)

# --- Build the ResLTLoss with integer indices (safe) ---
loss_fn = ResLTLoss(
    num_classes=MODEL_ARGS["num_classes"],
    head_classes=head_idx,
    medium_classes=medium_idx,
    tail_classes=tail_idx,
    class_to_idx=None,
    beta=0.96
)
# If ResLTLoss has a .to(), move it
try:
    loss_fn = loss_fn.to(device)
except Exception:
    pass

# Run with anomaly detection (first try plain fp32 on CPU to avoid AMP/GPU memory complexity)
torch.autograd.set_detect_anomaly(True)
print("Running detect_anomaly() on CPU with a small sub-batch (no AMP).")
try:
    with torch.autograd.detect_anomaly():
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
    print("Backward completed without anomaly (unexpected).")
except Exception:
    print("=== ANOMALY TRACE (cpu, no AMP) ===")
    traceback.print_exc()
    raise


Using debug file: ../debug/crash_final.pth
head_idx, medium_idx, tail_idx: [5, 3] [1, 2] [0, 4, 6]
Loaded model_state from debug file.
Running detect_anomaly() on CPU with a small sub-batch (no AMP).


  with torch.autograd.detect_anomaly():


Backward completed without anomaly (unexpected).


In [4]:
train_set

Dataset ImageFolder
    Number of datapoints: 70331
    Root location: ../data/FireRisk/train
    StandardTransform
Transform: Compose(
               Resize(size=320, interpolation=bilinear, max_size=None, antialias=True)
               RandomCrop(size=(320, 320), padding=4)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
           )

In [7]:
# inspect_nan_debug.py
import torch, glob, os, pprint
p = "../debug/nan_grad_epoch6_step511_model.conv1.weight.pth"
if not os.path.exists(p):
    # fallback: pick most recent nan_grad file
    files = sorted(glob.glob("../debug/nan_grad_epoch*.pth") + glob.glob("../debug/crash_nan_grad.pth") + glob.glob("../debug/crash_final.pth"))
    if not files:
        raise SystemExit("No debug files found in debug/")
    p = files[-1]
print("Loading:", p)
d = torch.load(p, map_location="cpu")
summary = {}
for k,v in d.items():
    if isinstance(v, torch.Tensor):
        summary[k] = {
            "type": "tensor",
            "shape": tuple(v.shape),
            "dtype": str(v.dtype),
            "any_nan": bool(torch.isnan(v).any().item()),
            "any_inf": bool(torch.isinf(v).any().item()),
            "min": float(torch.min(v)) if v.numel() else None,
            "max": float(torch.max(v)) if v.numel() else None,
            "sample_flat": v.view(-1)[:50].tolist() if v.numel() else []
        }
    else:
        summary[k] = {"type": type(v).__name__, "preview": repr(v)[:400]}
pprint.pprint(summary)


Loading: ../debug/nan_grad_epoch6_step511_model.conv1.weight.pth
{'current_scale': {'preview': '32768.0', 'type': 'float'},
 'epoch': {'preview': '6', 'type': 'int'},
 'grad_sample': {'any_inf': False,
                 'any_nan': False,
                 'dtype': 'torch.float32',
                 'max': -0.0,
                 'min': -0.0,
                 'sample_flat': [-0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.0,
                                 -0.

In [10]:
# repro_nan_cpu.py
import torch, glob, os, traceback, sys
from models import REGISTRY as MODEL_REG
from losses.reslt_loss import ResLTLoss

# pick best debug file
cands = sorted(glob.glob("../debug/crash_final.pth") + glob.glob("../debug/nan_grad_epoch*.pth") + glob.glob("../debug/crash_nan_grad.pth"))
if not cands:
    raise SystemExit("No debug files in debug/")
dbg_path = cands[-1]
print("Using debug file:", dbg_path)
dbg = torch.load(dbg_path, map_location="cpu")

inputs = dbg.get("inputs")
targets = dbg.get("targets")
if inputs is None or targets is None:
    print("Debug file lacks inputs/targets. Exiting.")
    sys.exit(1)

# reduce batch size to avoid OOM
sub = min(8, inputs.shape[0])
inputs = inputs[:sub]
targets = targets[:sub]

device = "cpu"
# adapt model key/args to match your config
MODEL_KEY = "resltresnet32"
MODEL_ARGS = {"num_classes": 7, "scale": 1}
model = MODEL_REG[MODEL_KEY](**MODEL_ARGS).to(device)

# try to load full model state from crash_final if it's present
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"])
        print("Loaded model_state from debug file.")
    except Exception as e:
        print("Warning: failed to load model_state from debug file:", e)

# prepare loss with integer indices (map names->indices if needed)
# replace with your dataset mapping if required
loss_fn = ResLTLoss(num_classes=7,
                    head_classes=[5,3], medium_classes=[1,2], tail_classes=[0,4,6],
                    class_to_idx=None, beta=0.96)
try:
    loss_fn = loss_fn.to(device)
except Exception:
    pass

inputs = inputs.to(device)
targets = targets.to(device)

torch.autograd.set_detect_anomaly(True)
try:
    with torch.autograd.detect_anomaly():
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
except Exception:
    print("=== ANOMALY TRACE ===")
    traceback.print_exc()
    raise
else:
    print("Backward completed without raising (on CPU). If this reproduces nothing, try AMP run on GPU with small batch and smaller init_scale.")


Using debug file: ../debug/nan_grad_epoch6_step511_model.conv1.weight.pth
	Missing key(s) in state_dict: "model.layer1.2.bn2.bias", "model.layer1.2.bn2.running_mean", "model.layer1.2.bn2.running_var", "model.layer1.3.conv1.weight", "model.layer1.3.bn1.weight", "model.layer1.3.bn1.bias", "model.layer1.3.bn1.running_mean", "model.layer1.3.bn1.running_var", "model.layer1.3.conv2.weight", "model.layer1.3.bn2.weight", "model.layer1.3.bn2.bias", "model.layer1.3.bn2.running_mean", "model.layer1.3.bn2.running_var", "model.layer1.4.conv1.weight", "model.layer1.4.bn1.weight", "model.layer1.4.bn1.bias", "model.layer1.4.bn1.running_mean", "model.layer1.4.bn1.running_var", "model.layer1.4.conv2.weight", "model.layer1.4.bn2.weight", "model.layer1.4.bn2.bias", "model.layer1.4.bn2.running_mean", "model.layer1.4.bn2.running_var", "model.layer2.0.conv1.weight", "model.layer2.0.bn1.weight", "model.layer2.0.bn1.bias", "model.layer2.0.bn1.running_mean", "model.layer2.0.bn1.running_var", "model.layer2.0.con

  with torch.autograd.detect_anomaly():


Backward completed without raising (on CPU). If this reproduces nothing, try AMP run on GPU with small batch and smaller init_scale.


In [11]:
# repro_amp_gpu.py
import os, glob, sys, traceback, torch
from models import REGISTRY as MODEL_REG
from losses.reslt_loss import ResLTLoss

DEBUG_DIR = "../debug"
# pick the most recent useful debug file
cands = sorted(glob.glob(os.path.join(DEBUG_DIR, "crash_final.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "nan_grad_epoch*.pth")) +
               glob.glob(os.path.join(DEBUG_DIR, "crash_nan_grad.pth")))
if not cands:
    print("No debug files found in debug/. Exiting.")
    sys.exit(1)
dbg_path = cands[-1]
print("Using debug file:", dbg_path)
dbg = torch.load(dbg_path, map_location="cpu")

inputs = dbg.get("inputs")
targets = dbg.get("targets")
if inputs is None or targets is None:
    print("Debug file missing inputs/targets. Exiting.")
    sys.exit(1)

# reduce sub-batch to avoid OOM
sub = min(4, inputs.shape[0])
inputs = inputs[:sub].cuda()
targets = targets[:sub].cuda()

# model config (match your train config)
MODEL_KEY = "resltresnet32"
MODEL_ARGS = {"num_classes": 7, "scale": 1}
model = MODEL_REG[MODEL_KEY](**MODEL_ARGS).cuda()

# try to load model_state if the crash_final contained it
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"])
        print("Loaded model_state from debug file (partial/full).")
    except Exception as e:
        print("Warning: failed to load model_state from debug file:", e)

# Build loss with integer indices (use mapping you used in train)
loss_fn = ResLTLoss(num_classes=7, head_classes=[5,3], medium_classes=[1,2], tail_classes=[0,4,6], class_to_idx=None, beta=0.96)
try:
    loss_fn = loss_fn.cuda()
except Exception:
    pass

# Use a conservative init_scale to avoid spurious overflow while still using AMP
scaler = torch.cuda.amp.GradScaler(enabled=True, init_scale=2**8)

torch.autograd.set_detect_anomaly(True)
print("Running AMP/DetectAnomaly repro on GPU with small init_scale ...")
try:
    # detect_anomaly should wrap the backward region
    with torch.autograd.detect_anomaly():
        with torch.cuda.amp.autocast(enabled=True):
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
        # scaled backward (like training)
        scaler.scale(loss).backward()
    print("Backward completed without anomaly (unexpected).")
except Exception:
    print("=== ANOMALY TRACE (AMP GPU run) ===")
    traceback.print_exc()
    raise


Using debug file: ../debug/nan_grad_epoch6_step511_model.conv1.weight.pth
	Missing key(s) in state_dict: "model.layer1.2.bn2.bias", "model.layer1.2.bn2.running_mean", "model.layer1.2.bn2.running_var", "model.layer1.3.conv1.weight", "model.layer1.3.bn1.weight", "model.layer1.3.bn1.bias", "model.layer1.3.bn1.running_mean", "model.layer1.3.bn1.running_var", "model.layer1.3.conv2.weight", "model.layer1.3.bn2.weight", "model.layer1.3.bn2.bias", "model.layer1.3.bn2.running_mean", "model.layer1.3.bn2.running_var", "model.layer1.4.conv1.weight", "model.layer1.4.bn1.weight", "model.layer1.4.bn1.bias", "model.layer1.4.bn1.running_mean", "model.layer1.4.bn1.running_var", "model.layer1.4.conv2.weight", "model.layer1.4.bn2.weight", "model.layer1.4.bn2.bias", "model.layer1.4.bn2.running_mean", "model.layer1.4.bn2.running_var", "model.layer2.0.conv1.weight", "model.layer2.0.bn1.weight", "model.layer2.0.bn1.bias", "model.layer2.0.bn1.running_mean", "model.layer2.0.bn1.running_var", "model.layer2.0.con

  scaler = torch.cuda.amp.GradScaler(enabled=True, init_scale=2**8)
  with torch.autograd.detect_anomaly():
  with torch.cuda.amp.autocast(enabled=True):


Backward completed without anomaly (unexpected).


In [12]:
# inspect_bn_stats.py
import torch, glob, os, sys
from models import REGISTRY as MODEL_REG

DBG = sorted(glob.glob("../debug/crash_final.pth") + glob.glob("../debug/nan_grad_epoch*.pth"))
if not DBG:
    print("No debug files")
    sys.exit(1)
dbg = torch.load(DBG[-1], map_location="cpu")

MODEL_KEY = "resltresnet32"
MODEL_ARGS = {"num_classes": 7, "scale": 1}
model = MODEL_REG[MODEL_KEY](**MODEL_ARGS)

# try to load model_state if present
if "model_state" in dbg and isinstance(dbg["model_state"], dict):
    try:
        model.load_state_dict(dbg["model_state"], strict=False)
        print("Loaded model_state (partial) into model with strict=False")
    except Exception as e:
        print("Failed to load model_state:", e)

bn_info = []
for name, m in model.named_modules():
    if hasattr(m, "running_mean") and hasattr(m, "running_var"):
        rm = m.running_mean.detach().cpu().numpy() if m.running_mean is not None else None
        rv = m.running_var.detach().cpu().numpy() if m.running_var is not None else None
        bn_info.append((name, rm.shape if rm is not None else None, float(rv.mean()) if rv is not None else None,
                        float(rv.min()) if rv is not None else None,
                        float(rv.max()) if rv is not None else None))
for row in bn_info:
    print("BN:", row)


Loaded model_state (partial) into model with strict=False
BN: ('model.bn1', (16,), 0.022550377994775772, 0.0035867849364876747, 0.10274787247180939)
BN: ('model.layer1.0.bn1', (16,), 1.3074198961257935, 0.48375412821769714, 3.54730224609375)
BN: ('model.layer1.0.bn2', (16,), 1.0431103706359863, 0.4706116318702698, 1.965854525566101)
BN: ('model.layer1.1.bn1', (16,), 2.874728202819824, 1.4460272789001465, 5.051095008850098)
BN: ('model.layer1.1.bn2', (16,), 0.9074885845184326, 0.3552698791027069, 2.0965664386749268)
BN: ('model.layer1.2.bn1', (16,), 3.8629841804504395, 2.138880968093872, 6.481569766998291)
BN: ('model.layer1.2.bn2', (16,), 1.0, 1.0, 1.0)
BN: ('model.layer1.3.bn1', (16,), 1.0, 1.0, 1.0)
BN: ('model.layer1.3.bn2', (16,), 1.0, 1.0, 1.0)
BN: ('model.layer1.4.bn1', (16,), 1.0, 1.0, 1.0)
BN: ('model.layer1.4.bn2', (16,), 1.0, 1.0, 1.0)
BN: ('model.layer2.0.bn1', (32,), 1.0, 1.0, 1.0)
BN: ('model.layer2.0.bn2', (32,), 1.0, 1.0, 1.0)
BN: ('model.layer2.1.bn1', (32,), 1.0, 1.0, 