In [8]:
import torch
from PIL import Image
from tqdm import tqdm
import pandas as pd
import os
import json
from pathlib import Path


from engine.core import YAMLConfig
from engine.misc import dist_utils
from engine.data.transforms import Compose, Normalize, Resize, ConvertPILImage
from engine.solver import TASKS

import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor
from torchvision.transforms.functional import to_tensor

from ensemble_boxes import weighted_boxes_fusion

In [None]:
def load_dfine_solver(config_path, checkpoint_path, device="cuda"):
    """Build a solver from a YAML config and load checkpoint weights for inference.

    Args:
        config_path: Path to the YAML config passed to ``YAMLConfig``.
        checkpoint_path: Path to the ``.pth`` checkpoint. It is passed to the
            config as ``tuning`` and is also loaded explicitly below.
        device: Device string the model is moved to (default ``"cuda"``).

    Returns:
        The solver instance, with ``solver.model`` on ``device`` and in eval mode.

    Raises:
        KeyError: If the checkpoint contains neither a ``"model"`` entry nor an
            ``"ema"`` entry.
    """
    overrides = {
        "resume": None,
        "device": device,
        "seed": 42,
        "tuning": checkpoint_path,
        "use_amp": False,
        "use_ema": True,
    }

    cfg = YAMLConfig(config_path, **overrides)

    # Disable the backbone's "pretrained" flag when the config has an HGNetv2
    # section; the weights come from the checkpoint loaded below.
    if "HGNetv2" in cfg.yaml_cfg:
        cfg.yaml_cfg["HGNetv2"]["pretrained"] = False

    # Create the solver registered for the task named in the config.
    solver_cls = TASKS[cfg.yaml_cfg["task"]]
    solver = solver_cls(cfg)
    solver._setup()

    # Load the checkpoint the same way train.py does.
    # SECURITY NOTE: torch.load unpickles the file; only load trusted checkpoints.
    ckpt = torch.load(checkpoint_path, map_location="cpu")

    # NOTE(review): raw "model" weights are preferred over the EMA weights even
    # though use_ema=True is requested above. Order kept as-is so results do not
    # change; confirm that this preference is intended.
    if "model" in ckpt:
        state = ckpt["model"]
    elif "ema" in ckpt:
        state = ckpt["ema"]["module"]
    else:
        raise KeyError(
            f"Checkpoint {checkpoint_path!r} has neither a 'model' nor an 'ema' entry"
        )

    # strict=False tolerates key mismatches, so report them instead of silently
    # running inference with partially loaded weights.
    incompatible = solver.model.load_state_dict(state, strict=False)
    if incompatible.missing_keys or incompatible.unexpected_keys:
        print(
            f"Warning: checkpoint/model key mismatch - "
            f"{len(incompatible.missing_keys)} missing, "
            f"{len(incompatible.unexpected_keys)} unexpected"
        )

    solver.model.to(device)
    solver.model.eval()
    return solver

In [None]:
def dfine_test_inference_tta(
    config,
    checkpoint,
    image_dir,
    output_csv="output.csv",
    threshold=0.01,
    base_size=1024,           # side length (pixels) the image is resized to before inference
    device="cuda",
    iou_thr=0.6,
    skip_box_thr=0.1,
    id_prefix="test/",
):
    """Run horizontal-flip TTA inference over a directory and write a submission CSV.

    Each image is run through the model twice (original and horizontally
    flipped). The flipped predictions are mirrored back, both prediction sets
    are merged with Weighted Boxes Fusion, and the surviving boxes are
    serialized as ``"label score x1 y1 x2 y2 ..."`` in pixel coordinates.

    Args:
        config: Path to the YAML config (forwarded to ``load_dfine_solver``).
        checkpoint: Path to the checkpoint (forwarded to ``load_dfine_solver``).
        image_dir: Directory searched (non-recursively) for ``*.jpg`` and ``*.png``.
        output_csv: Path the resulting CSV is written to.
        threshold: Fused boxes with a score below this value are dropped.
        base_size: Images are resized to ``(base_size, base_size)`` before inference.
        device: Device string used for the model and input tensors.
        iou_thr: IoU threshold passed to ``weighted_boxes_fusion``.
        skip_box_thr: ``skip_box_thr`` passed to ``weighted_boxes_fusion``.
        id_prefix: String prepended to each file name in the ``image_id`` column.

    Returns:
        ``pandas.DataFrame`` with columns ``PredictionString`` and ``image_id``
        (the same frame that is written to ``output_csv``).
    """
    print("Loading model with Flip TTA enabled (Normalized)...")
    solver = load_dfine_solver(config, checkpoint, device)
    model = solver.model
    postprocessor = solver.postprocessor
    model.eval()

    # All .jpg files (sorted) followed by all .png files (sorted).
    image_root = Path(image_dir)
    image_paths = sorted(image_root.glob("*.jpg")) + sorted(image_root.glob("*.png"))

    predictions = []
    filenames = []

    print("Starting Inference with Flip TTA (Original + Horizontal Flip)...")

    # Preprocessing: resize -> float32 tensor -> normalize with the given mean/std.
    tfs = Compose([
        Resize((base_size, base_size)),
        lambda x: to_tensor(x).type(torch.float32),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    for img_path in tqdm(image_paths):
        # Load the image; the context manager releases the file handle promptly.
        with Image.open(img_path) as raw_img:
            img_pil = raw_img.convert("RGB")
        w0, h0 = img_pil.size

        img_tensor = tfs(img_pil)  # [C, H, W]

        # FIX: sizes are passed as (width, height). The boxes are xyxy, so the
        # x coordinates must be scaled by the width and y by the height.
        # NOTE(review): this assumes the postprocessor multiplies xyxy boxes by
        # orig_target_sizes repeated twice, as the upstream DEIM/D-FINE
        # postprocessor does - confirm against the engine in use. For square
        # images the result is identical to the previous (h, w) ordering.
        orig_size = torch.tensor([[w0, h0]], device=device)

        # TTA variants: (batched input tensor, whether it was horizontally flipped).
        img_variants = [
            (img_tensor.unsqueeze(0).to(device), False),
            (torch.flip(img_tensor, [2]).unsqueeze(0).to(device), True),
        ]

        box_list = []
        score_list = []
        label_list = []

        for img_input, is_flipped in img_variants:
            with torch.no_grad():
                outputs = model(img_input)

            pred = postprocessor(outputs, orig_target_sizes=orig_size)[0]

            boxes = pred["boxes"].cpu().numpy()  # [x1, y1, x2, y2] in original-image pixels
            scores = pred["scores"].cpu().numpy()
            labels = pred["labels"].cpu().numpy()

            # Undo the horizontal flip: the new x1 is w0 - old x2, and vice versa.
            # The fancy-indexed right-hand side is a copy, so this is alias-safe.
            if is_flipped:
                boxes[:, [0, 2]] = w0 - boxes[:, [2, 0]]

            # WBF expects coordinates normalized to [0, 1].
            boxes[:, [0, 2]] = boxes[:, [0, 2]] / w0
            boxes[:, [1, 3]] = boxes[:, [1, 3]] / h0
            boxes = np.clip(boxes, 0, 1)

            box_list.append(boxes.tolist())
            score_list.append(scores.tolist())
            label_list.append(labels.tolist())

        # Ensemble both variants with Weighted Boxes Fusion, equal weights.
        # box_list always holds exactly two entries here, so no emptiness check
        # is needed before the call.
        final_boxes, final_scores, final_labels = weighted_boxes_fusion(
            box_list,
            score_list,
            label_list,
            weights=[1, 1],
            iou_thr=iou_thr,
            skip_box_thr=skip_box_thr,
        )

        # Convert the normalized fused boxes back to pixel coordinates.
        final_boxes[:, [0, 2]] *= w0
        final_boxes[:, [1, 3]] *= h0

        # Build the prediction string, dropping boxes whose score is below threshold.
        entries = [
            f"{int(label)} {score:.4f} {x1:.2f} {y1:.2f} {x2:.2f} {y2:.2f}"
            for (x1, y1, x2, y2), score, label in zip(final_boxes, final_scores, final_labels)
            if not score < threshold
        ]

        predictions.append(" ".join(entries))
        filenames.append(f"{id_prefix}{img_path.name}")

    # Save the results as CSV.
    df = pd.DataFrame({
        "PredictionString": predictions,
        "image_id": filenames,
    })

    df.to_csv(output_csv, index=False)
    print(f"üìÑ Normalized Flip TTA Complete! Saved to ‚Üí {output_csv}")
    return df

In [None]:
# Run flip-TTA inference on the test images and write the submission CSV.
df = dfine_test_inference_tta(
    "./configs/deimv2/deimv2_dinov3_x_coco.yml",          # config
    "./outputs/deimv2_dinov3_x_coco/checkpoint0019.pth",  # checkpoint
    "../dataset/test",                                    # image_dir
    threshold=0.01,
    output_csv="deimv2_submission.csv",
)

# Display the first rows of the submission as the cell output.
df.head()

üîÑ Loading model with Flip TTA enabled (Normalized)...
Training DINOv3 from scratch...
Using Lite Spatial Prior Module with inplanes=64
     --- Use Gateway@True ---
     --- Use Share Bbox Head@False ---
     --- Use Share Score Head@False ---
     --- Wide Layer@1 ---
Tuning checkpoint from ./outputs/deimv2_dinov3_x_coco/checkpoint0019.pth
Load model.state_dict, {'missed': [], 'unmatched': []}
Using the new matching cost with iou_order_alpha = 4.0 at epoch 45
üöÄ Starting Inference with Flip TTA (Original + Horizontal Flip)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4871/4871 [25:59<00:00,  3.12it/s]


üìÑ Normalized Flip TTA Complete! Saved to ‚Üí deimv2_submission.csv


Unnamed: 0,PredictionString,image_id
0,7 0.9810 604.24 517.68 956.28 1024.00 7 0.9763...,test/0000.jpg
1,4 0.2884 344.70 250.02 751.98 694.22 3 0.2363 ...,test/0001.jpg
2,1 0.9492 775.87 406.84 1023.78 1023.88 4 0.748...,test/0002.jpg
3,9 0.6638 145.99 262.45 911.19 823.58 9 0.0364 ...,test/0003.jpg
4,0 0.3310 425.33 408.11 656.75 577.95 1 0.2268 ...,test/0004.jpg
