# Fase 0 — Generación de Parches

Dividimos las imágenes de 24 MP en parches cuadrados y ajustamos las anotaciones COCO para obtener el dataset de entrenamiento.

In [1]:
from dataclasses import asdict
from pathlib import Path

import albumentations as A
import numpy as np
import pandas as pd
import torch
from PIL import Image
from albumentations.pytorch import ToTensorV2
from omegaconf import OmegaConf
from tqdm import tqdm

from rfdetr import RFDETRNano

from utils.common.bbox import convert_bbox_csv_to_points
from utils.herdnet import evaluate_points_from_csv
from utils.rf_detr import (
    DEFAULT_CATEGORIES,
    Detection,
    DetectionSample,
    HerdNetMetricsCallback,
    PatchSummary,
    SimpleStitcher,
    generate_patch_dataset,
    write_coco_predictions,
)


INFO:albumentations.check_version:A new version of Albumentations is available: 2.0.8 (you have 1.4.8). Upgrade using: pip install --upgrade albumentations


## Configuración de splits

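Ajusta las rutas a tus directorios de origen (imágenes completas + JSON COCO) y el destino donde quedarán los parches. La celda de generación de parches espera una lista `patch_jobs` con un diccionario por split que contenga las claves `split`, `images_dir`, `json_file`, `output_dir`, `patch_width`, `patch_height`, `overlap` y `min_visibility`; defínela antes de ejecutarla.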

In [13]:
# Patch edge length; passed as img_size to training and as patch_size to the
# evaluation stitcher (units presumably pixels — confirm).
PATCH_SIZE = 384
# Presumably the overlap between neighbouring patches, in pixels — confirm
# against generate_patch_dataset.
PATCH_OVERLAP = 160
# Per-channel normalisation statistics (the widely used ImageNet RGB mean/std values).
IMG_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
IMG_NORMALIZE_STD = [0.229, 0.224, 0.225]
# Training schedule shared by the Stage 1 and Stage 2 runs.
TRAIN_EPOCHS = 50
BATCH_SIZE = 16
GRAD_ACCUM_STEPS = 4
# Detection confidence cut-offs, one per stage.
CONF_THRESHOLD_STAGE1 = 0.5
CONF_THRESHOLD_STAGE2 = 0.5
# Matching radius forwarded to evaluate_points_from_csv through
# eval_cfg.metrics.radius (units presumably pixels — confirm).
MATCH_RADIUS = 20.0

In [None]:
# Generate the patch dataset for every job and collect one summary row each.
# `patch_jobs` must already exist (it is not defined in this cell): every job
# is a mapping that provides 'split' plus the keys listed in _PATCH_JOB_ARGS.
_PATCH_JOB_ARGS = (
    'images_dir',
    'json_file',
    'output_dir',
    'patch_width',
    'patch_height',
    'overlap',
    'min_visibility',
)

patch_summaries = []
for job in patch_jobs:
    summary = generate_patch_dataset(**{arg: job[arg] for arg in _PATCH_JOB_ARGS})
    # The split name leads the row; the summary's dataclass fields follow.
    patch_summaries.append({'split': job['split'], **asdict(summary)})

pd.DataFrame(patch_summaries)

# Fase 1 — Entrenamiento Inicial RF-DETR

Entrenamos RF-DETR Nano sobre los parches generados y registramos métricas estilo HerdNet durante el entrenamiento.

## Inicializar modelo y callback

In [None]:
# Model instance that Stage 1 training will fit.
model = RFDETRNano()

# Callback that evaluates HerdNet-style metrics on the validation patches.
# The confidence cut-off comes from the config cell so that training-time
# metrics and the Stage 1 evaluation stay in sync when the threshold is tuned.
herdnet_callback = HerdNetMetricsCallback(
    model=model,
    val_dataset_path='data-nano-detr/valid/_annotations.coco.json',
    val_images_dir='data-nano-detr/valid',
    # NOTE(review): same value as MATCH_RADIUS (20.0); kept as an int literal
    # because the type the callback accepts is not visible here.
    threshold_px=20,
    confidence_threshold=CONF_THRESHOLD_STAGE1,
    wandb_log=True,
    eval_every_n_epochs=5,
)

# Register the callback on the end-of-epoch hook.
model.callbacks['on_fit_epoch_end'].append(herdnet_callback.update)


## Entrenar (Stage 1)

In [None]:
# Stage 1 training run. Image size, epochs, batch size and gradient
# accumulation all come from the config cell.
# NOTE(review): dataset_file='roboflow' presumably selects a Roboflow-style
# COCO layout under dataset_dir — confirm against the rfdetr documentation.
model.train(
    dataset_dir='data-nano-detr',
    dataset_file='roboflow',
    img_size=PATCH_SIZE,
    epochs=TRAIN_EPOCHS,
    batch_size=BATCH_SIZE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
    output_dir='outputs/rfdetr_nano_stage1',
    wandb=True,
    project='rf-detr-nano',
)


# Fase 1 — Evaluación

## Configuración de inferencia y métricas

In [15]:
# Configuration consumed by the Stage 1 evaluation cells below.
eval_cfg = OmegaConf.create({
    # Full-size test images and the CSV holding their ground-truth points.
    'data': {
        'images_root': 'data-delplanque/test',
        'gt_points_csv': 'data-delplanque/test.csv',
    },
    'inference': {
        'device': 'cuda',
        # NOTE(review): Stage 1 training writes to 'outputs/rfdetr_nano_stage1',
        # but this path points at a different run directory — confirm it is
        # the checkpoint you intend to evaluate.
        'checkpoint_path': './outputs/rfdetr_nano_2/checkpoint_best_total.pth',
        'threshold': CONF_THRESHOLD_STAGE1,
        'batch_size': 16,
        # Directory and file name for the detections CSV written by inference.
        'output_path': './results/rfdetr_nano',
        'detections_csv': 'rfdetr_stage1_detections.csv',
    },
    'metrics': {
        'radius': MATCH_RADIUS,
        # Forwarded as class_map_path to evaluate_points_from_csv.
        'class_map': None,
    },
})


## Cargar checkpoint y preparar stitcher

In [9]:
# Load the checkpoint onto the CPU so this cell does not depend on the device
# the checkpoint was saved from; the network is moved to the configured
# device further down.
# NOTE: weights_only=False unpickles arbitrary Python objects. Only load
# checkpoints that come from a trusted source.
checkpoint = torch.load(
    eval_cfg.inference.checkpoint_path,
    map_location='cpu',
    weights_only=False,
)

# Use the 'model' entry when present, otherwise fall back to 'ema_model'.
state_dict = checkpoint.get('model', checkpoint.get('ema_model'))
if state_dict is None:
    raise KeyError(
        "Checkpoint has neither a 'model' nor an 'ema_model' entry: "
        f"{eval_cfg.inference.checkpoint_path}"
    )

# Size the detection head from the checkpoint itself so the strict load below
# cannot fail on the classification layer's shape.
num_classes = state_dict['class_embed.weight'].shape[0]

model_eval = RFDETRNano()
model_eval.model.reinitialize_detection_head(num_classes)
model_eval.model.model.load_state_dict(state_dict, strict=True)
model_eval.model.model.to(eval_cfg.inference.device).eval()

# Stitcher that runs the network over a full-size image.
stitcher = SimpleStitcher(
    model=model_eval.model.model,
    patch_size=PATCH_SIZE,
    overlap=0,
    batch_size=eval_cfg.inference.batch_size,
    confidence_threshold=eval_cfg.inference.threshold,
    device=eval_cfg.inference.device,
    label_offset=0,
)

Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Using patch size 16 instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Loading pretrain weights


## Ejecutar inferencia sobre imágenes completas

In [10]:
# Extensions treated as images (compared case-insensitively) and the column
# layout of the detections CSV.
IMAGE_SUFFIXES = {'.jpg', '.png'}
DETECTION_COLUMNS = ['images', 'x', 'y', 'x_max', 'y_max', 'labels', 'scores']

# Collect every JPEG/PNG directly under the image root. Filtering on the
# lower-cased suffix yields each file exactly once, whereas separate
# '*.jpg' / '*.JPG' globs can return the same file twice on platforms where
# glob matching is case-insensitive.
images_root = Path(eval_cfg.data.images_root)
image_files = sorted(
    path
    for path in images_root.iterdir()
    if path.is_file() and path.suffix.lower() in IMAGE_SUFFIXES
)

# Normalise with the statistics from the config cell, then convert to a tensor.
transform = A.Compose([
    A.Normalize(mean=IMG_NORMALIZE_MEAN, std=IMG_NORMALIZE_STD),
    ToTensorV2(),
])

output_dir = Path(eval_cfg.inference.output_path)
output_dir.mkdir(parents=True, exist_ok=True)
detections_csv_path = output_dir / eval_cfg.inference.detections_csv
all_detections = []

for img_path in tqdm(image_files, desc='Inference'):
    # The context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert('RGB')
    image_tensor = transform(image=np.array(image))['image']
    detections = stitcher(image_tensor)

    # One CSV row per detection; box columns 0-3 are stored as x, y, x_max, y_max.
    for i in range(len(detections['scores'])):
        all_detections.append({
            'images': img_path.name,
            'x': float(detections['boxes'][i, 0]),
            'y': float(detections['boxes'][i, 1]),
            'x_max': float(detections['boxes'][i, 2]),
            'y_max': float(detections['boxes'][i, 3]),
            'labels': int(detections['labels'][i]),
            'scores': float(detections['scores'][i]),
        })

# Passing the columns explicitly keeps the CSV header intact even when no
# detection was produced, so downstream readers still find the expected schema.
pd.DataFrame(all_detections, columns=DETECTION_COLUMNS).to_csv(detections_csv_path, index=False)
print('Saved', len(all_detections), 'detections to', detections_csv_path)


Inference: 100%|██████████| 258/258 [03:19<00:00,  1.30it/s]

Saved 4292 detections to results/rfdetr_nano/rfdetr_stage1_detections.csv





## Convertir detecciones a puntos

In [11]:
# Convert the Stage 1 bounding-box CSV into a points CSV next to it and
# preview the first rows of the result.
bbox_csv_path = output_dir / eval_cfg.inference.detections_csv
points_path = output_dir / 'rfdetr_stage1_detections_points.csv'

points_df = convert_bbox_csv_to_points(bbox_csv_path, points_path)

print('Converted', len(points_df), 'detections to points ->', points_path)
points_df.head()

Converted 4292 detections to points -> results/rfdetr_nano/rfdetr_stage1_detections_points.csv


Unnamed: 0,images,x,y,labels,scores
0,01802f75da35434ab373569fffc1fd65a3417aef.JPG,1472.237793,262.071701,6,0.513108
1,01802f75da35434ab373569fffc1fd65a3417aef.JPG,4673.474609,251.195457,6,0.667662
2,01802f75da35434ab373569fffc1fd65a3417aef.JPG,2985.883301,906.45874,6,0.581643
3,01802f75da35434ab373569fffc1fd65a3417aef.JPG,5190.343262,1105.431396,6,0.693571
4,01802f75da35434ab373569fffc1fd65a3417aef.JPG,1168.424866,1482.091553,6,0.745156


## Calcular métricas HerdNet

In [None]:
# Score the Stage 1 point detections against the ground-truth points CSV using
# the matching radius from eval_cfg; the result's 'overall' entry is displayed
# as the cell output.
metrics_summary = evaluate_points_from_csv(
    gt_csv=eval_cfg.data.gt_points_csv,
    detections_csv=points_path,
    class_map_path=eval_cfg.metrics.class_map,
    radius=eval_cfg.metrics.radius,
)
metrics_summary['overall']

# Fase 2 — Hard Negatives y Stage 2

Usamos el modelo de la fase 1 para recolectar falsos positivos, generar parches negativos y reentrenar RF-DETR con el dataset ampliado.

## Inferencia sobre el split de entrenamiento

In [None]:
# Configuration for the hard-negative mining pass over the training images.
hn_cfg = OmegaConf.create({
    # Full-size training images to run the Stage 1 model on.
    'data': {
        'train_root': 'data/herdnet/raw/train',
    },
    'inference': {
        'device': 'cuda',
        # NOTE(review): confirm this file name matches what Stage 1 training
        # actually writes into 'outputs/rfdetr_nano_stage1'.
        'checkpoint_path': './outputs/rfdetr_nano_stage1/checkpoint_phase_1.pth',
        # Much lower cut-off than the evaluation threshold.
        'threshold': 0.1,
        'batch_size': 2,
        'output_path': './results/hnp_stage1',
        'detections_csv': 'rfdetr_stage1_train_detections.csv',
        'detections_json': 'rfdetr_stage1_train_detections.json',
    },
    # Patch geometry is taken from the config cell so the hard-negative
    # patches cannot drift from the size the model is trained with.
    'patches': {
        'patch_width': PATCH_SIZE,
        'patch_height': PATCH_SIZE,
        'overlap': PATCH_OVERLAP,
        'min_visibility': 0.5,
        'output_dir': 'data-nano-detr/hnp',
    },
})


In [None]:
# Load the Stage 1 checkpoint onto the CPU so this cell does not depend on
# the device the checkpoint was saved from; the network is moved to the
# configured device further down.
# NOTE: weights_only=False unpickles arbitrary Python objects. Only load
# checkpoints that come from a trusted source.
hn_checkpoint = torch.load(
    hn_cfg.inference.checkpoint_path,
    map_location='cpu',
    weights_only=False,
)

# Use the 'model' entry when present, otherwise fall back to 'ema_model'.
hn_state = hn_checkpoint.get('model', hn_checkpoint.get('ema_model'))
if hn_state is None:
    raise KeyError(
        "Checkpoint has neither a 'model' nor an 'ema_model' entry: "
        f"{hn_cfg.inference.checkpoint_path}"
    )

# The head must have exactly as many outputs as the checkpoint's
# classification layer: subtracting one here would make the strict load below
# fail with a size mismatch. This mirrors the evaluation loading cell.
hn_num_classes = hn_state['class_embed.weight'].shape[0]

hn_model = RFDETRNano()
hn_model.model.reinitialize_detection_head(hn_num_classes)
hn_model.model.model.load_state_dict(hn_state, strict=True)
hn_model.model.model.to(hn_cfg.inference.device).eval()

# Stitcher that runs the network over a full-size training image.
hn_stitcher = SimpleStitcher(
    model=hn_model.model.model,
    patch_size=hn_cfg.patches.patch_width,
    overlap=0,
    batch_size=hn_cfg.inference.batch_size,
    confidence_threshold=hn_cfg.inference.threshold,
    device=hn_cfg.inference.device,
    # NOTE(review): the evaluation stitcher is built with label_offset=0;
    # confirm that an offset of 1 is intended for the hard-negative pass.
    label_offset=1,
)


In [None]:
# Extensions treated as images (compared case-insensitively) and the column
# layout of the detections CSV.
IMAGE_SUFFIXES = {'.jpg', '.png'}
DETECTION_COLUMNS = ['images', 'x', 'y', 'x_max', 'y_max', 'labels', 'scores']

# Collect every JPEG/PNG directly under the training root. Filtering on the
# lower-cased suffix yields each file exactly once, whereas separate
# '*.jpg' / '*.JPG' globs can return the same file twice on platforms where
# glob matching is case-insensitive.
train_root = Path(hn_cfg.data.train_root)
train_images = sorted(
    path
    for path in train_root.iterdir()
    if path.is_file() and path.suffix.lower() in IMAGE_SUFFIXES
)

# Normalise with the statistics from the config cell, then convert to a tensor.
transform = A.Compose([
    A.Normalize(mean=IMG_NORMALIZE_MEAN, std=IMG_NORMALIZE_STD),
    ToTensorV2(),
])

hn_output_dir = Path(hn_cfg.inference.output_path)
hn_output_dir.mkdir(parents=True, exist_ok=True)
hn_records = []   # flat rows for the detections CSV
hn_samples = []   # per-image DetectionSample objects for the COCO JSON

for img_path in tqdm(train_images, desc='HN inference'):
    # The context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as raw_image:
        image = raw_image.convert('RGB')
    width, height = image.size
    tensor = transform(image=np.array(image))['image']
    detections = hn_stitcher(tensor)

    det_list = []
    for i in range(len(detections['scores'])):
        x1, y1, x2, y2 = detections['boxes'][i].tolist()
        label = int(detections['labels'][i])
        score = float(detections['scores'][i])
        hn_records.append({
            'images': img_path.name,
            'x': x1,
            'y': y1,
            'x_max': x2,
            'y_max': y2,
            'labels': label,
            'scores': score,
        })
        det_list.append(Detection(bbox=[x1, y1, x2, y2], label=label, score=score))

    # A sample is recorded for every image, including those without detections.
    hn_samples.append(DetectionSample(file_name=img_path.name, width=width, height=height, detections=det_list))

# Passing the columns explicitly keeps the CSV header intact even when no
# detection was produced.
hn_df = pd.DataFrame(hn_records, columns=DETECTION_COLUMNS)
hn_csv_path = hn_output_dir / hn_cfg.inference.detections_csv
hn_df.to_csv(hn_csv_path, index=False)
write_coco_predictions(hn_samples, hn_output_dir / hn_cfg.inference.detections_json)
hn_csv_path


## Generar parches negativos

Utilizamos el JSON de detecciones para recortar parches alrededor de los falsos positivos.

In [None]:
# Cut patches from the training images, using the detections JSON written by
# the hard-negative inference cell as the annotation file. The returned
# summary dataclass is shown as a one-row table.
# NOTE(review): the detections are not filtered against the ground truth
# here, so true positives are presumably included as well — confirm.
hnp_summary = generate_patch_dataset(
    images_dir=hn_cfg.data.train_root,
    json_file=hn_output_dir / hn_cfg.inference.detections_json,
    output_dir=hn_cfg.patches.output_dir,
    patch_width=hn_cfg.patches.patch_width,
    patch_height=hn_cfg.patches.patch_height,
    overlap=hn_cfg.patches.overlap,
    min_visibility=hn_cfg.patches.min_visibility,
)
pd.DataFrame([asdict(hnp_summary)])


> Nota: para obtener verdaderos hard negatives conviene filtrar `hn_df` para eliminar detecciones que coinciden con el ground truth (true positives). Se puede cruzar con las anotaciones originales o con métricas de puntos antes de llamar a `generate_patch_dataset`.

## Combinar parches originales + HNP

In [None]:
from shutil import copy2

# Destination folder for the combined Stage 2 training images.
stage2_root = Path('data-nano-detr-stage2/train')
stage2_root.mkdir(parents=True, exist_ok=True)

original_patches = Path('data-nano-detr/train')
hnp_patches = Path(hn_cfg.patches.output_dir)

# Original patches are always copied, replacing any same-named file already present.
for src in original_patches.glob('*.jpg'):
    copy2(src, stage2_root / src.name)

# Hard-negative patches only fill in names that are not taken yet.
for src in hnp_patches.glob('*.jpg'):
    dst = stage2_root / src.name
    if dst.exists():
        continue
    copy2(src, dst)


Genera un nuevo `_annotations.coco.json` para Stage 2 fusionando el JSON original con los parches HNP. Puedes reutilizar `generate_patch_dataset` o construirlo manualmente con `write_coco_predictions` dependiendo de si mantienes etiquetas explícitas para los negativos.

## Entrenar (Stage 2)

In [None]:
# Model instance for the Stage 2 run on originals + hard negatives.
model_stage2 = RFDETRNano()

# Same validation callback as Stage 1, but with the Stage 2 confidence
# cut-off from the config cell instead of a repeated literal.
herdnet_callback_stage2 = HerdNetMetricsCallback(
    model=model_stage2,
    val_dataset_path='data-nano-detr/valid/_annotations.coco.json',
    val_images_dir='data-nano-detr/valid',
    # NOTE(review): same value as MATCH_RADIUS (20.0); kept as an int literal
    # because the type the callback accepts is not visible here.
    threshold_px=20,
    confidence_threshold=CONF_THRESHOLD_STAGE2,
    wandb_log=True,
    eval_every_n_epochs=5,
)

# Register the callback on the end-of-epoch hook.
model_stage2.callbacks['on_fit_epoch_end'].append(herdnet_callback_stage2.update)

# NOTE(review): the combine cell only copies images into
# 'data-nano-detr-stage2/train'. The merged annotations file, and any other
# split the dataset loader expects, must exist before this call — confirm.
model_stage2.train(
    dataset_dir='data-nano-detr-stage2',
    dataset_file='roboflow',
    img_size=PATCH_SIZE,
    epochs=TRAIN_EPOCHS,
    batch_size=BATCH_SIZE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
    output_dir='outputs/rfdetr_nano_stage2',
    wandb=True,
    project='rf-detr-nano-stage2',
)


Antes de calcular métricas, repite la celda de inferencia completa cambiando `eval_cfg.inference.checkpoint_path`, `output_path` y nombres de archivos para que apunten al checkpoint de Stage 2 (`outputs/rfdetr_nano_stage2`). Luego vuelve a ejecutar la conversión a puntos (`convert_bbox_csv_to_points`) para generar `./results/stage2/rfdetr_stage2_detections_points.csv`.


## Evaluar Stage 2

In [None]:
# Score the Stage 2 point detections with the same ground truth, class map and
# radius as Stage 1. The detections path is hard-coded: the points CSV must
# have been produced beforehand by re-running inference and the points
# conversion for the Stage 2 checkpoint.
stage2_metrics = evaluate_points_from_csv(
    gt_csv=eval_cfg.data.gt_points_csv,
    detections_csv=Path('./results/stage2') / 'rfdetr_stage2_detections_points.csv',
    class_map_path=eval_cfg.metrics.class_map,
    radius=eval_cfg.metrics.radius,
)
stage2_metrics['overall']
