In [None]:
import torch
import requests
from PIL import Image
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

# Sample image used as the single test input for the cells below.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'

# Fail fast on network problems: without a timeout the request can hang
# indefinitely, and without the status check an HTTP error body would be
# handed to PIL and surface as a confusing image-decoding error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# The image processor and the detector are loaded from the same checkpoint,
# so the identifier is kept in one place.
checkpoint = "PekingU/rtdetr_r18vd"

image_processor = RTDetrImageProcessor.from_pretrained(checkpoint)

model = RTDetrForObjectDetection.from_pretrained(checkpoint)
model.eval();  # evaluation mode; the trailing ';' suppresses the cell output

In [40]:
%%timeit
inputs = image_processor(images=image, return_tensors="pt")

with torch.inference_mode() and torch.no_grad():
    outputs = model(**inputs)
    results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)

291 ms ± 3.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Inspect the shape of the preprocessed tensor that is passed to the model.
inputs["pixel_values"].shape

torch.Size([1, 3, 640, 640])

In [32]:
# Print one line per detection: "<class name>: <score> <box>", with the score
# shown to two decimals and every box coordinate rounded to two decimals.
for detections in results:
    # Convert each tensor to plain Python values once, up front.
    confidences = detections["scores"].tolist()
    class_ids = detections["labels"].tolist()
    all_coords = detections["boxes"].tolist()

    for confidence, class_id, coords in zip(confidences, class_ids, all_coords):
        rounded_box = [round(value, 2) for value in coords]
        print(f"{model.config.id2label[class_id]}: {confidence:.2f} {rounded_box}")

sofa: 0.97 [0.14, 0.38, 640.13, 476.21]
cat: 0.96 [343.38, 24.28, 640.14, 371.5]
cat: 0.96 [13.23, 54.18, 318.98, 472.22]
remote: 0.95 [40.11, 73.44, 175.96, 118.48]
remote: 0.92 [333.73, 76.58, 369.97, 186.99]


## ONNX

In [29]:
# Export the model to ONNX using a random 1x3x640x640 tensor as the example
# input. Only axis 0 of the named input and output is declared dynamic.
dummy_input = torch.randn(1, 3, 640, 640)
onnx_path = "rt_detr_r18vd_model.onnx"
dynamic_batch_axes = {
    "input": {0: "batch_size"},
    "output": {0: "batch_size"},
}

torch.onnx.export(
    model=model,
    args=(dummy_input,),
    f=onnx_path,
    export_params=True,
    opset_version=16,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes=dynamic_batch_axes,
)

In [34]:
import onnxruntime as ort
# NumPy copy of the preprocessed batch, the format ONNX Runtime consumes.
input_tensor = inputs["pixel_values"].numpy()

# Pin the execution provider explicitly: without `providers`, which backend is
# used depends on the installed onnxruntime build/version. The PyTorch model
# in this notebook is never moved off the CPU, so CPU keeps the timing
# comparison like-for-like.
ort_session = ort.InferenceSession(
    "rt_detr_r18vd_model.onnx",
    providers=["CPUExecutionProvider"],
)

# Names of the first graph input and the first graph output.
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

In [39]:
%%timeit
inputs = image_processor(images=image, return_tensors="pt")
input_tensor = inputs["pixel_values"].numpy()
ort_inputs = {input_name: input_tensor}
ort_outputs = ort_session.run([output_name], ort_inputs)

183 ms ± 10.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
