In [10]:
import torch
import numpy as np
from transformers import DetrForObjectDetection, DetrImageProcessor
import cv2


def export_detr_to_onnx(
    output_path: str,
    model_name: str = "facebook/detr-resnet-50",
    opset: int = 12
):
    """
    Trace a pretrained DETR checkpoint and write it to disk as an ONNX graph.

    Args:
        output_path: file path the ONNX model is written to
        model_name: checkpoint identifier passed to ``from_pretrained``
        opset: ONNX opset version handed to ``torch.onnx.export``
    """
    # Fetch the network and its matching image processor; eval mode for export.
    detr = DetrForObjectDetection.from_pretrained(model_name)
    image_processor = DetrImageProcessor.from_pretrained(model_name)
    detr.eval()

    # Square all-zero example input (1, 3, S, S); S comes from the processor's
    # "shortest_edge" setting and falls back to 800 when that key is absent.
    side = image_processor.size.get("shortest_edge", 800)
    example_input = torch.zeros(1, 3, side, side)

    # Only the batch dimension is declared dynamic; spatial dims stay fixed.
    batch_axes = {
        tensor_name: {0: "batch_size"}
        for tensor_name in ("pixel_values", "logits", "boxes")
    }

    torch.onnx.export(
        detr,
        (example_input,),
        output_path,
        input_names=["pixel_values"],
        output_names=["logits", "boxes"],
        dynamic_axes=batch_axes,
        opset_version=opset,
    )
    print(f"[EXPORT] DETR ONNX model saved to {output_path}")


# -----------------------------------------------------------------------------
# Inference on-device with ONNX Runtime
# -----------------------------------------------------------------------------
import onnxruntime as ort


class DetrOnDevice:
    """
    On-device DETR inference wrapper using ONNX Runtime.

    Attributes:
        session: the ONNX Runtime inference session for the exported model
        processor: Hugging Face image processor used for pre-processing
        input_hw: (height, width) frames are resized to before pre-processing,
            read from the ONNX graph's first input when it is static
    """

    # Side length used when the graph does not declare static spatial dims.
    _DEFAULT_INPUT_SIZE = 800

    def __init__(
        self,
        onnx_model_path: str,
        model_name: str = "facebook/detr-resnet-50",
        provider: str = "CPUExecutionProvider"
    ):
        """
        Args:
            onnx_model_path: path of the exported ONNX model
            model_name: checkpoint identifier used to load the image processor
            provider: ONNX Runtime execution provider name
        """
        # Load ONNX session
        self.session = ort.InferenceSession(
            onnx_model_path,
            providers=[provider]
        )
        # Processor for pre-processing
        self.processor = DetrImageProcessor.from_pretrained(model_name)
        # Resize target follows the exported graph instead of a magic number.
        self.input_hw = self._static_input_hw(self.session.get_inputs()[0].shape)

    @classmethod
    def _static_input_hw(cls, shape):
        """
        Extract (height, width) from an NCHW input shape.

        Falls back to the default square size when the trailing two dims are
        missing or symbolic (e.g. strings for dynamic axes).
        """
        dims = list(shape)[-2:] if shape is not None else []
        if len(dims) == 2 and all(isinstance(d, int) and d > 0 for d in dims):
            return dims[0], dims[1]
        return cls._DEFAULT_INPUT_SIZE, cls._DEFAULT_INPUT_SIZE

    def infer(self, frame: np.ndarray, threshold: float = 0.5):
        """
        Run DETR on a single BGR frame and return filtered detections.

        Args:
            frame: BGR image as numpy array
            threshold: confidence threshold
        Returns:
            List of dicts with keys: 'score', 'label', 'box' (x0,y0,x1,y1)
        """
        # Resize to match the ONNX input; cv2.resize takes (width, height).
        in_h, in_w = self.input_hw
        resized = cv2.resize(frame, (in_w, in_h))
        # BGR -> RGB; copy so the processor gets a contiguous array rather
        # than a negative-stride view.
        rgb = np.ascontiguousarray(resized[:, :, ::-1])
        inputs = self.processor(images=rgb, return_tensors="np")

        # Run ONNX model; only the first two outputs (logits, boxes) are used.
        outputs = self.session.run(None, {"pixel_values": inputs["pixel_values"]})
        logits, boxes = outputs[:2]

        scores, labels = self._scores_and_labels(logits[0])

        # Boxes are normalized, so they can be scaled straight to the
        # original frame size regardless of the resize above.
        h, w = frame.shape[:2]
        bboxes = self._rescale_boxes(boxes[0], (h, w))

        keep = scores >= threshold
        detections = [
            {"score": float(score), "label": int(label), "box": box.tolist()}
            for score, label, box in zip(scores[keep], labels[keep], bboxes[keep])
        ]
        return self.apply_nms(detections, iou_threshold=0.5)

    @staticmethod
    def _scores_and_labels(logits: np.ndarray):
        """
        Turn per-query class logits into (scores, labels).

        The softmax is taken over ALL classes, including the trailing
        'no-object' class, and only then is that class dropped. Dropping it
        before the softmax would renormalize background queries into
        confident detections.

        Args:
            logits: array of shape (num_queries, num_classes + 1)
        Returns:
            Tuple of (scores, labels), each of shape (num_queries,)
        """
        shifted = logits - np.max(logits, axis=-1, keepdims=True)  # stable softmax
        exp_logits = np.exp(shifted)
        probs = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)
        object_probs = probs[..., :-1]  # remove 'no-object' class
        return np.max(object_probs, axis=-1), np.argmax(object_probs, axis=-1)

    @staticmethod
    def _rescale_boxes(boxes: np.ndarray, image_size: tuple):
        """
        Convert bounding boxes from [cx,cy,w,h] normalized format to [x0,y0,x1,y1] absolute coords.

        Args:
            boxes: array of shape (N, 4) in normalized [cx, cy, w, h]
            image_size: (height, width) of the target image
        Returns:
            Array of shape (N, 4) in absolute [x0, y0, x1, y1]
        """
        h, w = image_size
        # Work on a copy so the caller's array is not modified.
        boxes_abs = boxes.copy()
        boxes_abs[:, 0] *= w  # cx
        boxes_abs[:, 1] *= h  # cy
        boxes_abs[:, 2] *= w  # width
        boxes_abs[:, 3] *= h  # height

        # Convert to corner coords
        centers = boxes_abs[:, :2]
        half_sizes = boxes_abs[:, 2:] / 2
        return np.hstack((centers - half_sizes, centers + half_sizes))

    @staticmethod
    def apply_nms(detections, iou_threshold=0.5, score_threshold=0.0):
        """
        Apply Non-Maximum Suppression to filter overlapping boxes.

        Suppression is class-agnostic: boxes are compared regardless of label.

        Args:
            detections: list of dicts with 'score', 'label', 'box'
            iou_threshold: IoU threshold for suppression
            score_threshold: minimum score forwarded to cv2.dnn.NMSBoxes;
                defaults to 0.0 so detections the caller already filtered
                are not dropped a second time by a hidden cutoff
        Returns:
            List of filtered detections after NMS
        """
        if not detections:
            return []

        scores = [d['score'] for d in detections]

        # Convert [x0, y0, x1, y1] to [x, y, w, h] for OpenCV
        boxes_xywh = [
            [x0, y0, x1 - x0, y1 - y0]
            for x0, y0, x1, y1 in (d['box'] for d in detections)
        ]

        indices = cv2.dnn.NMSBoxes(
            boxes_xywh,
            scores,
            score_threshold=score_threshold,
            nms_threshold=iou_threshold,
        )
        # NMSBoxes may return an (N, 1) array, an (N,) array, or an empty
        # tuple depending on the OpenCV version; normalize to a flat array.
        kept = np.asarray(indices, dtype=int).reshape(-1)

        return [detections[i] for i in kept]
    


if __name__ == "__main__":
    # Example usage: export the model, then run live detection on the
    # default camera until 'q' is pressed or the stream ends.
    onnx_out = "detr-resnet50.onnx"
    export_detr_to_onnx(onnx_out)

    cap = cv2.VideoCapture(0)
    try:
        detector = DetrOnDevice(onnx_out)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            dets = detector.infer(frame)
            # Draw boxes
            for det in dets:
                x0, y0, x1, y1 = map(int, det['box'])
                cv2.rectangle(frame, (x0, y0), (x1, y1), (0,255,0), 2)
                cv2.putText(frame, f"{det['label']}:{det['score']:.2f}", (x0,y0-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
            cv2.imshow("DETR On-Device", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and close windows even if the loop is
        # interrupted (e.g. KeyboardInterrupt) or inference raises.
        cap.release()
        cv2.destroyAllWindows()


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[EXPORT] DETR ONNX model saved to detr-resnet50.onnx


KeyboardInterrupt: 