In [1]:
import os, gc, cv2, torch, shutil
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation

# ================== 1) Model loading (ZoeDepth, metric depth) ==================
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_name = "Intel/zoedepth-nyu-kitti"
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the weights, move them to the chosen device, then switch to eval mode.
model = ZoeDepthForDepthEstimation.from_pretrained(model_name)
model = model.to(device)
model = model.eval()

# ================== 2) Directories & helpers ==================
VAL_ROOT = "/kaggle/input/diode-val/val"   # input tree that is walked for PNG images
OUT_ROOT = "/kaggle/working/download"      # root directory for every exported file
os.makedirs(OUT_ROOT, exist_ok=True)

def mm_png_save(path, depth_m: np.ndarray):
    """Save a depth map given in meters as a 16-bit PNG in millimeters.

    The values are multiplied by 1000, clipped to ``[0, 65535]`` and
    truncated to ``uint16``, so any depth beyond 65.535 m saturates at the
    maximum value. NaN entries are written as 0 and +/-inf as 65535 / 0.

    Args:
        path: Destination file path. Missing parent directories are created.
        depth_m: Array of depth values in meters.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    depth_mm = np.asarray(depth_m) * 1000.0
    # Casting NaN to an unsigned integer is undefined, so map it to 0 first.
    depth_mm = np.nan_to_num(depth_mm, nan=0.0, posinf=65535.0, neginf=0.0)
    depth_mm = np.clip(depth_mm, 0, 65535).astype(np.uint16)

    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # cv2.imwrite signals failure through its boolean return value.
    if not cv2.imwrite(path, depth_mm):
        raise OSError(f"cv2.imwrite failed to write {path!r}")

def npy_save(path, arr: np.ndarray):
    """Save ``arr`` to ``path`` as a float32 ``.npy`` file.

    Args:
        path: Destination file path. Missing parent directories are created;
            a bare filename (no directory component) is written to the
            current working directory.
        arr: Array (or array-like) to store; it is converted to float32.
    """
    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    np.save(path, np.asarray(arr, dtype=np.float32))

def scale_match(pred: np.ndarray, gt: np.ndarray, mask: np.ndarray) -> float:
    """Return the per-image scale-only alignment factor between pred and gt.

    The factor is the least-squares solution ``s = sum(gt * pred) /
    sum(pred ** 2)`` computed over the valid pixels, i.e. pixels where the
    mask is set, both arrays are finite and ``gt > 0``.

    Args:
        pred: Predicted depth map.
        gt: Ground-truth depth map, same shape as ``pred``.
        mask: Validity mask, same shape as ``gt``. Any dtype is accepted;
            non-zero entries count as valid.

    Returns:
        The scale factor, or ``1.0`` when there are no valid pixels or the
        denominator is not positive.
    """
    # Coerce to bool: an integer mask would survive `&` as an int array and
    # then be used for integer fancy indexing instead of boolean selection.
    valid = np.asarray(mask, dtype=bool) & np.isfinite(gt) & np.isfinite(pred) & (gt > 0)
    if not valid.any():
        return 1.0

    # Accumulate in float64 so large float32 images do not lose precision.
    p = pred[valid].astype(np.float64)
    g = gt[valid].astype(np.float64)
    den = float(np.dot(p, p))
    return float(np.dot(g, p) / den) if den > 0 else 1.0

# ================== 3) Walk the tree recursively; save predictions + GT ==================
# count_total counts every PNG that is scanned; count_saved counts only the
# images for which predictions and ground truth were actually written.
count_total, count_saved = 0, 0

for root, _, files in os.walk(VAL_ROOT):
    pngs = [f for f in sorted(files) if f.endswith(".png")]
    if not pngs:
        continue

    # Path of this directory relative to VAL_ROOT; mirrored under OUT_ROOT.
    rel_dir = os.path.relpath(root, VAL_ROOT)

    # Output sub-directories (one set per mirrored input directory).
    out_pred_raw_npy   = os.path.join(OUT_ROOT, rel_dir, "pred_raw_npy")      # raw metric prediction, .npy
    out_pred_aligned   = os.path.join(OUT_ROOT, rel_dir, "pred_aligned_npy")  # optional: scale-only aligned, .npy
    out_pred_raw_png16 = os.path.join(OUT_ROOT, rel_dir, "pred_raw_png16")    # raw prediction as 16-bit PNG (mm)
    out_gt_npy         = os.path.join(OUT_ROOT, rel_dir, "gt_npy")            # GT .npy (meters)
    out_gt_png16       = os.path.join(OUT_ROOT, rel_dir, "gt_png16")          # GT as 16-bit PNG (mm)

    for img_name in pngs:
        count_total += 1
        rgb_path = os.path.join(root, img_name)

        # Derive the companion paths from the file stem only, so that a ".png"
        # substring elsewhere in the path (e.g. a directory name) is untouched.
        stem = os.path.splitext(img_name)[0]
        gt_path   = os.path.join(root, f"{stem}_depth.npy")
        mask_path = os.path.join(root, f"{stem}_depth_mask.npy")

        # Each RGB image is expected to come with *_depth.npy and
        # *_depth_mask.npy; skip the image when either one is missing.
        if not (os.path.exists(gt_path) and os.path.exists(mask_path)):
            continue

        # Load the image; cv2.imread returns None instead of raising when the
        # file cannot be decoded, so check before converting colors.
        rgb_bgr = cv2.imread(rgb_path)
        if rgb_bgr is None:
            print(f"[ZoeDepth SAVE] skipped unreadable image: {rgb_path}")
            continue
        rgb = cv2.cvtColor(rgb_bgr, cv2.COLOR_BGR2RGB)

        gt = np.load(gt_path).astype(np.float32)
        if gt.ndim == 3:
            gt = gt[..., 0]
        mask = np.load(mask_path).astype(bool)
        # Treat the mask like the GT so both end up 2-D with matching shapes.
        if mask.ndim == 3:
            mask = mask[..., 0]

        # ------ Inference (metric depth, in meters) ------
        with torch.no_grad():
            inputs = processor(images=Image.fromarray(rgb), return_tensors="pt").to(device)
            outputs = model(**inputs)
            # Post-processing resizes the prediction back to the RGB size.
            processed = processor.post_process_depth_estimation(
                outputs, source_sizes=[(rgb.shape[0], rgb.shape[1])]
            )
            pred_metric = processed[0]["predicted_depth"].squeeze().detach().cpu().numpy().astype(np.float32)

        # Resize to the GT resolution if the two still differ.
        if pred_metric.shape != gt.shape:
            pred_metric = cv2.resize(pred_metric, (gt.shape[1], gt.shape[0]), interpolation=cv2.INTER_LINEAR)

        # (Optional) prediction after per-image scale-only alignment to the GT.
        s = scale_match(pred_metric, gt, mask)
        pred_aligned = pred_metric * s

        # Save the raw metric prediction (files share the RGB image's stem).
        # NOTE: the 16-bit PNGs saturate at 65535 mm; the .npy files do not.
        npy_save(os.path.join(out_pred_raw_npy,   f"{stem}.npy"), pred_metric)
        mm_png_save(os.path.join(out_pred_raw_png16, f"{stem}.png"), pred_metric)

        # Save the scale-aligned prediction (.npy only).
        npy_save(os.path.join(out_pred_aligned,   f"{stem}.npy"), pred_aligned)

        # Save the ground truth (meters).
        npy_save(os.path.join(out_gt_npy,     f"{stem}.npy"), gt)
        mm_png_save(os.path.join(out_gt_png16, f"{stem}.png"), gt)

        count_saved += 1

        # Release per-image buffers before the next iteration.
        del rgb_bgr, rgb, gt, mask, pred_metric, pred_aligned, inputs, outputs, processed
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"[ZoeDepth SAVE] scanned PNGs: {count_total}, saved pairs: {count_saved}")

# ================== 4) (Optional) one-shot packaging for download ==================
zip_base = "/kaggle/working/zoedepth_diode_preds_and_gt"
# Zip everything under OUT_ROOT; make_archive appends the ".zip" suffix itself.
shutil.make_archive(base_name=zip_base, format="zip", root_dir=OUT_ROOT)
print(f"打包完成: {zip_base}.zip")


2025-08-14 16:46:43.982441: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755190004.174947      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755190004.236196      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

[ZoeDepth SAVE] scanned PNGs: 771, saved pairs: 771
打包完成: /kaggle/working/zoedepth_diode_preds_and_gt.zip
