# Entrenamiento modelos RF-DETR

In [1]:
from dataclasses import asdict
from pathlib import Path
from shutil import copy2
import json

import albumentations as A
import numpy as np
import pandas as pd
import torch
from PIL import Image
from albumentations.pytorch import ToTensorV2
from omegaconf import OmegaConf
from tqdm import tqdm

from rfdetr import RFDETRLarge

from utils.common.bbox import convert_bbox_csv_to_points
from utils.herdnet import evaluate_points_from_csv
from utils.rf_detr import (
    DEFAULT_CATEGORIES,
    Detection,
    DetectionSample,
    HerdNetMetricsCallback,
    PatchSummary,
    SimpleStitcher,
    generate_patch_dataset,
    write_coco_predictions,
)


INFO:albumentations.check_version:A new version of Albumentations is available: 2.0.8 (you have 1.4.8). Upgrade using: pip install --upgrade albumentations


## Fase 0 ‚Äî Generaci√≥n de Parches

Dividimos las im√°genes 24MP en parches cuadr√°ticos y ajustamos las anotaciones COCO para obtener el dataset de entrenamiento.

### Configuraci√≥n Inicial
Constantes a utilizar durante el entrenamiento y carpetas de destino para los parches generados

In [2]:
PATCH_SIZE = 560
PATCH_OVERLAP = 160
IMG_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
IMG_NORMALIZE_STD = [0.229, 0.224, 0.225]
TRAIN_EPOCHS = 50
BATCH_SIZE = 16
GRAD_ACCUM_STEPS = 4
CONF_THRESHOLD_STAGE1 = 0.5
CONF_THRESHOLD_STAGE2 = 0.5
MATCH_RADIUS = 20.0

In [None]:
patch_jobs = [
    {
        'split': 'train',
        'images_dir': 'data-delplanque/train',
        'json_file': 'data-delplanque/groundtruth/json/train.json',
        'output_dir': 'data-large-detr/train',
        'patch_width': PATCH_SIZE,
        'patch_height': PATCH_SIZE,
        'overlap': PATCH_OVERLAP,
        'min_visibility': 0.8,
    },
    {
        'split': 'valid',
        'images_dir': 'data-delplanque/val',
        'json_file': 'data-delplanque/groundtruth/json/val.json',
        'output_dir': 'data-large-detr/valid',
        'patch_width': PATCH_SIZE,
        'patch_height': PATCH_SIZE,
        'overlap': PATCH_OVERLAP,
        'min_visibility': 0.8,
    },
]

Generaci√≥n de parches y archivo con los groundtruth ajustados para la escala de parches. Los parches corresponden a √∫nicamente aquellos que cuentan con animales

In [None]:
patch_summaries = []
for job in patch_jobs:
    summary = generate_patch_dataset(
        images_dir=job['images_dir'],
        json_file=job['json_file'],
        output_dir=job['output_dir'],
        patch_width=job['patch_width'],
        patch_height=job['patch_height'],
        overlap=job['overlap'],
        min_visibility=job['min_visibility'],
    )
    entry = {'split': job['split']}
    entry.update(asdict(summary))
    patch_summaries.append(entry)

pd.DataFrame(patch_summaries)

## Fase 1 ‚Äî Entrenamiento Inicial RF-DETR

Entrenamos el flavor de RF-DETR sobre los parches generados (con animales) y registramos m√©tricas de HerdNet (callback) durante el entrenamiento.

### Inicializar modelo y callback

In [None]:
model = RFDETRLarge()

herdnet_callback = HerdNetMetricsCallback(
    model=model,
    val_dataset_path='data-large-detr/valid/_annotations.coco.json',
    val_images_dir='data-large-detr/valid',
    threshold_px=20,
    confidence_threshold=0.5,
    wandb_log=True,
    eval_every_n_epochs=5,
)

model.callbacks['on_fit_epoch_end'].append(herdnet_callback.update)


### Entrenamiento

In [None]:
model.train(
    dataset_dir='data-large-detr',
    dataset_file='roboflow',
    img_size=PATCH_SIZE,
    epochs=TRAIN_EPOCHS,
    batch_size=BATCH_SIZE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
    output_dir='outputs/rfdetr_large_stage1',
    wandb=True,
    project='rf-detr-large',
)


### Evaluaci√≥n

#### Configuraci√≥n de inferencia y m√©tricas

In [None]:
eval_cfg = OmegaConf.create({
    'data': {
        'images_root': 'data-delplanque/test',
        'gt_points_csv': 'data-delplanque/test.csv',
    },
    'inference': {
        'device': 'cuda',
        'checkpoint_path': './outputs/rfdetr_large_2/checkpoint_best_total.pth',
        'threshold': CONF_THRESHOLD_STAGE1,
        'batch_size': 16,
        'output_path': './results/rfdetr_large',
        'detections_csv': 'rfdetr_stage1_detections.csv',
    },
    'metrics': {
        'radius': MATCH_RADIUS,
        'class_map': None,
    },
})


#### Cargar checkpoint y preparar stitcher
Se carga el checkpoint del modelo con el mejor resultado sobre el conjunto de validaci√≥n y se carga el stitcher, el cual se encargar√° de hacer las predicciones en cada parche del tama√±o definido en ventana deslizante, asegurandos√© de escalar las detecciones a las coordenas en la imagen original de 24MP.

In [None]:
checkpoint = torch.load(eval_cfg.inference.checkpoint_path, weights_only=False)
state_dict = checkpoint.get('model', checkpoint.get('ema_model'))
num_classes = state_dict['class_embed.weight'].shape[0]

model_eval = RFDETRLarge()
model_eval.model.reinitialize_detection_head(num_classes)
model_eval.model.model.load_state_dict(state_dict, strict=True)
model_eval.model.model.to(eval_cfg.inference.device).eval()

stitcher = SimpleStitcher(
    model=model_eval.model.model,
    patch_size=PATCH_SIZE,
    overlap=0,
    batch_size=eval_cfg.inference.batch_size,
    confidence_threshold=eval_cfg.inference.threshold,
    device=eval_cfg.inference.device,
    label_offset=0,
)

Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Using patch size 16 instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Loading pretrain weights


#### Ejecutar inferencia sobre im√°genes completas

In [10]:
from pathlib import Path


images_root = Path(eval_cfg.data.images_root)
image_files = sorted(
    list(images_root.glob('*.jpg'))
    + list(images_root.glob('*.JPG'))
    + list(images_root.glob('*.png'))
    + list[Path](images_root.glob('*.PNG'))
)

transform = A.Compose([
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

output_dir = Path(eval_cfg.inference.output_path)
output_dir.mkdir(parents=True, exist_ok=True)
all_detections = []

for img_path in tqdm(image_files, desc='Inference'):
    image = Image.open(img_path).convert('RGB')
    image_tensor = transform(image=np.array(image))['image']
    detections = stitcher(image_tensor)

    for i in range(len(detections['scores'])):
        all_detections.append({
            'images': img_path.name,
            'x': float(detections['boxes'][i, 0]),
            'y': float(detections['boxes'][i, 1]),
            'x_max': float(detections['boxes'][i, 2]),
            'y_max': float(detections['boxes'][i, 3]),
            'labels': int(detections['labels'][i]),
            'scores': float(detections['scores'][i]),
        })

pd.DataFrame(all_detections).to_csv(output_dir / eval_cfg.inference.detections_csv, index=False)
print('Saved', len(all_detections), 'detections to', output_dir / eval_cfg.inference.detections_csv)


Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 258/258 [03:19<00:00,  1.30it/s]

Saved 4292 detections to results/rfdetr_nano/rfdetr_stage1_detections.csv





#### Convertir detecciones a puntos

Debido a que los modelos RF-DETR gener√°n bounding boxes para las detecciones, se calcul√°n su centroides para calcular las m√©tricas y poder comparar con los resultados de HerdNet.

In [None]:
points_path = output_dir / 'rfdetr_stage1_detections_points.csv'
points_df = convert_bbox_csv_to_points(
    output_dir / eval_cfg.inference.detections_csv,
    points_path,
)
print('Converted', len(points_df), 'detections to points ->', points_path)
points_df.head()

Converted 4292 detections to points -> results/rfdetr_nano/rfdetr_stage1_detections_points.csv


Unnamed: 0,images,x,y,labels,scores
0,01802f75da35434ab373569fffc1fd65a3417aef.JPG,1472.237793,262.071701,6,0.513108
1,01802f75da35434ab373569fffc1fd65a3417aef.JPG,4673.474609,251.195457,6,0.667662
2,01802f75da35434ab373569fffc1fd65a3417aef.JPG,2985.883301,906.45874,6,0.581643
3,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5190.343262,1105.431396,6,0.693571
4,01802f75da35434ab373569fffc1fd65a3417aef.JPG,1168.424866,1482.091553,6,0.745156


#### Calcular m√©tricas HerdNet

In [None]:
metrics_summary = evaluate_points_from_csv(
    gt_csv=eval_cfg.data.gt_points_csv,
    detections_csv=points_path,
    class_map_path=eval_cfg.metrics.class_map,
    radius=eval_cfg.metrics.radius,
)
metrics_summary['overall']

## Fase 2 ‚Äî Hard Negatives y Stage 2

Usamos el modelo de la fase 1 para recolectar falsos positivos, generar parches negativos y reentrenar RF-DETR con el dataset ampliado (dataset de la fase 1 + parches negativos).

### Inferencia sobre el split de entrenamiento
Se realiza la inferencia sobre las im√°genes originales (24MP) del conjunto de entrenamiento 

In [22]:
hn_cfg = OmegaConf.create({
    'data': {
        'train_root': 'data-delplanque/train',
    },
    'inference': {
        'device': 'cuda',
        'checkpoint_path': './outputs/rfdetr_large/checkpoint_best_total.pth',
        'threshold': 0.5,
        'batch_size': 16,
        'output_path': './results/rfdetr_large',
        'detections_csv': 'rfdetr_large_stage1_train_detections.csv',
        'detections_json': 'rfdetr_large_stage1_train_detections.json',
    },
    'patches': {
        'patch_width': PATCH_SIZE,
        'patch_height': PATCH_SIZE,
        'overlap': 160,
        'min_visibility': 0.8,
        'output_dir': 'data-large-detr/hnp',
    },
})


In [25]:
hn_checkpoint = torch.load(hn_cfg.inference.checkpoint_path, weights_only=False)
hn_state = hn_checkpoint.get('model', hn_checkpoint.get('ema_model'))
hn_num_classes = hn_state['class_embed.weight'].shape[0]

hn_model = RFDETRLarge()
hn_model.model.reinitialize_detection_head(hn_num_classes)
hn_model.model.model.load_state_dict(hn_state, strict=True)
hn_model.model.model.to(hn_cfg.inference.device).eval()

hn_stitcher = SimpleStitcher(
    model=hn_model.model.model,
    patch_size=hn_cfg.patches.patch_width,
    overlap=0,
    batch_size=hn_cfg.inference.batch_size,
    confidence_threshold=hn_cfg.inference.threshold,
    device=hn_cfg.inference.device,
    label_offset=0,
)

Loading pretrain weights


In [None]:
train_root = Path(hn_cfg.data.train_root)
train_images = sorted(
    list(train_root.glob('*.jpg')) +
    list(train_root.glob('*.JPG')) +
    list(train_root.glob('*.png')) +
    list(train_root.glob('*.PNG'))
)

transform = A.Compose([
    A.Normalize(mean=IMG_NORMALIZE_MEAN, std=IMG_NORMALIZE_STD),
    ToTensorV2(),
])

hn_output_dir = Path(hn_cfg.inference.output_path)
hn_output_dir.mkdir(parents=True, exist_ok=True)
hn_records = []
hn_samples = []

for img_path in tqdm(train_images, desc='HN inference'):
    image = Image.open(img_path).convert('RGB')
    width, height = image.size
    tensor = transform(image=np.array(image))['image']
    detections = hn_stitcher(tensor)

    det_list = []
    for i in range(len(detections['scores'])):
        x1, y1, x2, y2 = detections['boxes'][i].tolist()
        label = int(detections['labels'][i])
        score = float(detections['scores'][i])
        hn_records.append({
            'images': img_path.name,
            'x': x1,
            'y': y1,
            'x_max': x2,
            'y_max': y2,
            'labels': label,
            'scores': score,
        })
        det_list.append(Detection(bbox=[x1, y1, x2, y2], label=label, score=score))

    hn_samples.append(DetectionSample(file_name=img_path.name, width=width, height=height, detections=det_list))

hn_df = pd.DataFrame(hn_records)
hn_csv_path = hn_output_dir / hn_cfg.inference.detections_csv
hn_df.to_csv(hn_csv_path, index=False)
write_coco_predictions(hn_samples, hn_output_dir / hn_cfg.inference.detections_json)
hn_csv_path


HN inference:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 395/928 [06:30<09:41,  1.09s/it]

### Generar parches negativos

Utilizamos el JSON de detecciones para identificar los parches que contienen falsos positivos y se estructura el dataset para que en la carpeta *train/* se tenga el conjunto de datos de train de la fase 1 y los falsos positivos generados.

dataset/
‚îú‚îÄ‚îÄ train/
‚îÇ   ‚îú‚îÄ‚îÄ _annotations.coco.json
‚îÇ   ‚îú‚îÄ‚îÄ image1.jpg
‚îÇ   ‚îú‚îÄ‚îÄ image2.jpg
‚îÇ   ‚îî‚îÄ‚îÄ ... (other image files)
‚îú‚îÄ‚îÄ valid/
‚îÇ   ‚îú‚îÄ‚îÄ _annotations.coco.json
‚îÇ   ‚îú‚îÄ‚îÄ image1.jpg
‚îÇ   ‚îú‚îÄ‚îÄ image2.jpg
‚îÇ   ‚îî‚îÄ‚îÄ ... (other image files)
‚îî‚îÄ‚îÄ test/
    ‚îú‚îÄ‚îÄ _annotations.coco.json
    ‚îú‚îÄ‚îÄ image1.jpg
    ‚îú‚îÄ‚îÄ image2.jpg
    ‚îî‚îÄ‚îÄ ... (other image files)

In [None]:
# Preparar directorios para Stage 2
stage2_train_dir = Path("data-large-detr-stage2/train")
stage2_train_dir.mkdir(parents=True, exist_ok=True)

# Copiar im√°genes originales de Stage 1
print("Copying original Stage 1 patches...")
original_patches_dir = Path("data-large-detr/train")
for pattern in ("*.jpg", "*.JPG"):
    for src in original_patches_dir.glob(pattern):
        dst = stage2_train_dir / src.name
        if not dst.exists():
            copy2(src, dst)

# Cargar el JSON original de Stage 1
original_json = Path("data-large-detr/train/_annotations.coco.json")
original_coco = json.loads(original_json.read_text())
original_filenames = {img["file_name"] for img in original_coco["images"]}

print(f"\nStage 1 dataset: {len(original_coco['images'])} images, {len(original_coco['annotations'])} annotations")

# Generar patches con el patcher (incluye TPs y FPs)
print("\nGenerating patches from model predictions...")

hnp_summary = generate_patch_dataset(
    images_dir=hn_cfg.data.train_root,
    json_file=hn_output_dir / hn_cfg.inference.detections_json,
    output_dir=stage2_train_dir,
    patch_width=hn_cfg.patches.patch_width,
    patch_height=hn_cfg.patches.patch_height,
    overlap=hn_cfg.patches.overlap,
    min_visibility=hn_cfg.patches.min_visibility,
)

# Leer el JSON generado por el patcher
hnp_json_path = stage2_train_dir / "_annotations.coco.json"
hnp_coco = json.loads(hnp_json_path.read_text())

# Filtrar: Solo patches NUEVOS (que no est√°n en Stage 1) son HNP
print("\nFiltering Hard Negative Patches (FPs)...")
new_hnp_images = [img for img in hnp_coco["images"] if img["file_name"] not in original_filenames]

print(f"  Total patches generated: {len(hnp_coco['images'])}")
print(f"  Patches matching Stage 1 (TPs, skipped): {len(hnp_coco['images']) - len(new_hnp_images)}")
print(f"  New Hard Negative Patches (FPs): {len(new_hnp_images)}")

# Combinar: Stage 1 (con anotaciones) + HNP (sin anotaciones)
print("\nMerging Stage 1 + HNP...")
next_image_id = max((img["id"] for img in original_coco["images"]), default=0) + 1

stage2_coco = {
    "info": original_coco.get("info", {}),
    "licenses": original_coco.get("licenses", []),
    "categories": original_coco.get("categories", []),
    "images": list(original_coco["images"]),
    "annotations": list(original_coco["annotations"]),  # Solo anotaciones de Stage 1
}

# Agregar im√°genes HNP (sin anotaciones - son fondos puros)
for hnp_img in new_hnp_images:
    stage2_coco["images"].append({
        **hnp_img,
        "id": next_image_id,
    })
    next_image_id += 1

# Guardar JSON final de Stage 2
hnp_json_path.write_text(json.dumps(stage2_coco, indent=2))

print(f"\n‚úì Stage 2 dataset ready:")
print(f"  - Images: {len(stage2_coco['images'])} (Stage 1: {len(original_coco['images'])}, HNP: {len(new_hnp_images)})")
print(f"  - Annotations: {len(stage2_coco['annotations'])} (only from Stage 1, HNP have no annotations)")
print(f"  - Location: {stage2_train_dir}")

pd.DataFrame([{
    "stage1_images": len(original_coco['images']),
    "hnp_images": len(new_hnp_images),
    "total_images": len(stage2_coco['images']),
    "stage1_anns": len(original_coco['annotations']),
    "hnp_anns": 0,
    "total_anns": len(stage2_coco['annotations']),
}])

Patching images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 928/928 [12:29<00:00,  1.24it/s]



Filtering Hard Negative Patches (FPs)...
  Total patches generated: 12991
  Patches matching Stage 1 (TPs, skipped): 6803
  New Hard Negative Patches (FPs): 6188

Merging Stage 1 + HNP...

‚úì Stage 2 dataset ready:
  - Images: 14241 (Stage 1: 8053, HNP: 6188)
  - Annotations: 15893 (only from Stage 1, HNP have no annotations)
  - Location: data-large-detr-stage2/train


Unnamed: 0,stage1_images,hnp_images,total_images,stage1_anns,hnp_anns,total_anns
0,8053,6188,14241,15893,0,15893


El dataset de validaci√≥n es el mismo de la fase 1

In [30]:
# Preparar validaci√≥n para Stage 2 (reutilizar la misma de Stage 1)
stage2_valid_dir = Path("data-large-detr-stage2/valid")
if not stage2_valid_dir.exists():
    print("Creating validation dataset for Stage 2...")
    stage2_valid_dir.mkdir(parents=True, exist_ok=True)
    
    # Copiar im√°genes de validaci√≥n
    stage1_valid = Path("data-large-detr/valid")
    for pattern in ("*.jpg", "*.JPG"):
        for src in stage1_valid.glob(pattern):
            copy2(src, stage2_valid_dir / src.name)
    
    # Copiar JSON de validaci√≥n
    copy2(stage1_valid / "_annotations.coco.json", stage2_valid_dir / "_annotations.coco.json")
    print(f"‚úì Validation dataset ready: {stage2_valid_dir}")
else:
    print(f"‚úì Validation dataset already exists: {stage2_valid_dir}")


Creating validation dataset for Stage 2...
‚úì Validation dataset ready: data-large-detr-stage2/valid


### Entrenamiento

In [34]:
checkpoint_stage1 = torch.load(hn_cfg.inference.checkpoint_path, weights_only=False)
state_dict_stage1 = checkpoint_stage1.get('model', checkpoint_stage1.get('ema_model'))
num_classes_stage1 = state_dict_stage1['class_embed.weight'].shape[0]

print(f"üîÑ Loading Stage 1 checkpoint: num_classes={num_classes_stage1}")

# Crear modelo y cargar pesos MANUALMENTE
model_stage2 = RFDETRLarge()
model_stage2.model.reinitialize_detection_head(num_classes_stage1)
model_stage2.model.model.load_state_dict(state_dict_stage1, strict=True)

print(f"‚úÖ Stage 1 weights loaded successfully!")

herdnet_callback_stage2 = HerdNetMetricsCallback(
    model=model_stage2,
    val_dataset_path='data-large-detr-stage2/valid/_annotations.coco.json',
    val_images_dir='data-large-detr-stage2/valid',
    threshold_px=20,
    confidence_threshold=0.5,
    wandb_log=True,
    eval_every_n_epochs=5,
)

model_stage2.callbacks['on_fit_epoch_end'].append(herdnet_callback_stage2.update)

üîÑ Loading Stage 1 checkpoint: num_classes=7
Loading pretrain weights
‚úÖ Stage 1 weights loaded successfully!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [None]:
model_stage2.train(
    dataset_dir='data-large-detr-stage2',
    dataset_file='roboflow',
    img_size=PATCH_SIZE,
    epochs=50,
    batch_size=2, 
    grad_accum_steps=8,
    lr=1e-5,
    lr_encoder=1e-5, 
    ema_decay=0.993,
    weight_decay=1e-4,
    multi_scale=True,
    output_dir='outputs/rfdetr_large_stage2',
    wandb=True,
    project='rf-detr-large',
)

TensorBoard logging initialized. To monitor logs, use 'tensorboard --logdir outputs/rfdetr_large_stage2' and open http://localhost:6006/ in browser.


W&B logging initialized. To monitor logs, open https://wandb.ai/luis-manrique-car-camera-traps/rf-detr-large/runs/k9rjh4n7.
Not using distributed mode
git:
  sha: N/A, status: clean, branch: N/A

Namespace(num_classes=7, grad_accum_steps=8, amp=True, lr=1e-05, lr_encoder=1e-05, batch_size=2, weight_decay=0.0001, epochs=50, lr_drop=100, clip_max_norm=0.1, lr_vit_layer_decay=0.8, lr_component_decay=0.7, do_benchmark=False, dropout=0, drop_path=0.0, drop_mode='standard', drop_schedule='constant', cutoff_epoch=0, pretrained_encoder=None, pretrain_weights='rf-detr-large.pth', pretrain_exclude_keys=None, pretrain_keys_modify_to_load=None, pretrained_distiller=None, encoder='dinov2_windowed_base', vit_encoder_num_layers=12, window_block_indexes=None, position_embedding='sine', out_feature_indexes=[2, 5, 8, 11], freeze_encoder=False, layer_norm=True, rms_norm=False, backbone_lora=False, force_no_pretrain=False, dec_layers=3, dim_feedforward=2048, hidden_dim=384, sa_nheads=12, ca_nheads=24, num

fatal: not a git repository (or any of the parent directories): .git


Done (t=0.03s)
creating index...
index created!
Get benchmark
Start training
Grad accum steps:  8
Total batch size:  16
LENGTH OF DATA LOADER: 890
Epoch: [0]  [  0/890]  eta: 0:30:28  lr: 0.000010  class_error: 0.00  loss: 4.2570 (4.2570)  loss_ce: 0.4956 (0.4956)  loss_bbox: 0.1606 (0.1606)  loss_giou: 0.3419 (0.3419)  loss_ce_0: 0.5061 (0.5061)  loss_bbox_0: 0.1533 (0.1533)  loss_giou_0: 0.3346 (0.3346)  loss_ce_1: 0.5136 (0.5136)  loss_bbox_1: 0.1595 (0.1595)  loss_giou_1: 0.3429 (0.3429)  loss_ce_enc: 0.6106 (0.6106)  loss_bbox_enc: 0.2000 (0.2000)  loss_giou_enc: 0.4383 (0.4383)  loss_ce_unscaled: 0.4956 (0.4956)  class_error_unscaled: 0.0000 (0.0000)  loss_bbox_unscaled: 0.0321 (0.0321)  loss_giou_unscaled: 0.1710 (0.1710)  cardinality_error_unscaled: 1949.5000 (1949.5000)  loss_ce_0_unscaled: 0.5061 (0.5061)  loss_bbox_0_unscaled: 0.0307 (0.0307)  loss_giou_0_unscaled: 0.1673 (0.1673)  cardinality_error_0_unscaled: 1955.5000 (1955.5000)  loss_ce_1_unscaled: 0.5136 (0.5136)  loss




[HerdNet Metrics] Evaluating at epoch 0...
  Evaluating on 1137 images
  ‚úì Processed 1137 images (0 skipped)
  ‚úì Logged HerdNet metrics to WandB (misty-glade-9)

HerdNet Metrics @ Epoch 1 (threshold=20px)

Overall (Binary Detection):
  F1:        0.9090
  Precision: 0.9859
  Recall:    0.8433
  MAE:       0.3289
  RMSE:      0.7467

Per-Class Metrics:
  Class                 F1  Precision   Recall
  --------------- -------- ---------- --------
  Hartebeest        0.9088     0.9254   0.8927
  Buffalo           0.9184     0.9752   0.8678
  Kob               0.8895     0.9379   0.8459
  Warthog           0.6846     0.9273   0.5426
  Waterbuck         0.5641     0.9706   0.3976
  Elephant          0.8425     0.9622   0.7493

Grad accum steps:  8
Total batch size:  16
LENGTH OF DATA LOADER: 890
Epoch: [1]  [  0/890]  eta: 0:30:14  lr: 0.000010  class_error: 0.00  loss: 2.3559 (2.3559)  loss_ce: 0.3406 (0.3406)  loss_bbox: 0.0465 (0.0465)  loss_giou: 0.1726 (0.1726)  loss_ce_0: 0.3823 (

### Evaluaci√≥n

In [3]:
eval_cfg = OmegaConf.create({
    'data': {
        'images_root': 'data-delplanque/test',
        'gt_points_csv': 'data-delplanque/test.csv',
    },
    'inference': {
        'device': 'cuda',
        'checkpoint_path': './outputs/rfdetr_large_stage2/checkpoint_best_total.pth',
        'threshold': CONF_THRESHOLD_STAGE2,
        'batch_size': 16,
        'output_path': './results/rfdetr_large',
        'detections_csv': 'rfdetr_stage2_detections.csv',
    },
    'metrics': {
        'radius': MATCH_RADIUS,
        'class_map': None,
    },
})


In [4]:
checkpoint = torch.load(eval_cfg.inference.checkpoint_path, weights_only=False)
state_dict = checkpoint.get('model', checkpoint.get('ema_model'))
num_classes = state_dict['class_embed.weight'].shape[0]

model_eval = RFDETRLarge()
model_eval.model.reinitialize_detection_head(num_classes)
model_eval.model.model.load_state_dict(state_dict, strict=True)
model_eval.model.model.to(eval_cfg.inference.device).eval()

stitcher = SimpleStitcher(
    model=model_eval.model.model,
    patch_size=PATCH_SIZE,
    overlap=0,
    batch_size=eval_cfg.inference.batch_size,
    confidence_threshold=eval_cfg.inference.threshold,
    device=eval_cfg.inference.device,
    label_offset=0,
)

Loading pretrain weights


In [5]:
images_root = Path(eval_cfg.data.images_root)
image_files = sorted(
    list(images_root.glob('*.jpg'))
    + list(images_root.glob('*.JPG'))
    + list(images_root.glob('*.png'))
    + list[Path](images_root.glob('*.PNG'))
)

transform = A.Compose([
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

output_dir = Path(eval_cfg.inference.output_path)
output_dir.mkdir(parents=True, exist_ok=True)
all_detections = []

for img_path in tqdm(image_files, desc='Inference'):
    image = Image.open(img_path).convert('RGB')
    image_tensor = transform(image=np.array(image))['image']
    detections = stitcher(image_tensor)

    for i in range(len(detections['scores'])):
        all_detections.append({
            'images': img_path.name,
            'x': float(detections['boxes'][i, 0]),
            'y': float(detections['boxes'][i, 1]),
            'x_max': float(detections['boxes'][i, 2]),
            'y_max': float(detections['boxes'][i, 3]),
            'labels': int(detections['labels'][i]),
            'scores': float(detections['scores'][i]),
        })

pd.DataFrame(all_detections).to_csv(output_dir / eval_cfg.inference.detections_csv, index=False)
print('Saved', len(all_detections), 'detections to', output_dir / eval_cfg.inference.detections_csv)


Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 258/258 [04:35<00:00,  1.07s/it]

Saved 2285 detections to results/rfdetr_large/rfdetr_stage2_detections.csv





In [6]:
points_path = output_dir / 'rfdetr_stage2_detections_points.csv'
points_df = convert_bbox_csv_to_points(
    output_dir / eval_cfg.inference.detections_csv,
    points_path,
)
print('Converted', len(points_df), 'detections to points ->', points_path)
points_df.head()

Converted 2285 detections to points -> results/rfdetr_large/rfdetr_stage2_detections_points.csv


Unnamed: 0,images,x,y,labels,scores
0,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5246.357422,3558.076172,6,0.777562
1,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5399.695557,3471.771118,6,0.838995
2,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5281.511963,3608.711914,6,0.781437
3,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5362.650635,3534.472168,6,0.842959
4,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5394.881836,3561.77832,6,0.537771


In [7]:
stage2_metrics = evaluate_points_from_csv(
    gt_csv=eval_cfg.data.gt_points_csv,
    detections_csv=output_dir / 'rfdetr_stage2_detections_points.csv',
    class_map_path=eval_cfg.metrics.class_map,
    radius=eval_cfg.metrics.radius,
)
stage2_metrics['overall']


{'precision': 0.8892778993435448,
 'recall': 0.8838625489343193,
 'f1_score': 0.8865619546247818,
 'mae': 1.2248062015503876,
 'rmse': 3.096634832656525,
 'mse': 9.589147286821705,
 'accuracy': np.float64(0.937992125984252)}