In [23]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image

In [24]:
# Both the image processor and the detection model are loaded from the same
# Hub checkpoint and revision, so the identifiers are named once here.
# NOTE(review): "no_timm" presumably selects weights that do not need the
# `timm` package -- confirm against the model card.
DETR_CHECKPOINT = "facebook/detr-resnet-50"
DETR_REVISION = "no_timm"

# Pre-processing (image -> model inputs) and post-processing helper object.
processor = DetrImageProcessor.from_pretrained(
    DETR_CHECKPOINT,
    revision=DETR_REVISION,
)
# The object-detection network itself.
model = DetrForObjectDetection.from_pretrained(
    DETR_CHECKPOINT,
    revision=DETR_REVISION,
)

In [25]:

def load_rgb_image(path):
    """Open the image file at ``path`` and return it as an RGB PIL image.

    ``Image.open`` is lazy: it only reads the header and keeps the file open
    until the pixel data is needed. Converting inside the ``with`` block forces
    the pixels to be read and hands back an independent image, so the file
    handle is released deterministically when the block exits.

    The conversion also normalises the channel layout: files stored as RGBA,
    palette or greyscale come back as 3-channel RGB. Images that are already
    RGB keep the same pixel data and size.

    Args:
        path: Filesystem path of the image to load.

    Returns:
        A ``PIL.Image.Image`` in mode ``"RGB"``.
    """
    with Image.open(path) as source_image:
        return source_image.convert("RGB")


# The three sample pictures that the rest of the notebook runs detection on.
image1 = load_rgb_image("./ipr2bf01815-fig-0004-m.jpg")
image2 = load_rgb_image("./cute-animals-other-objects_1308-104277.jpg")
image3 = load_rgb_image("./1aOqEspE6kOwzW49V0vYpCQ.webp")

In [26]:
# Run the detector on each image.
#
# This notebook only performs inference, so the forward passes are wrapped in
# torch.no_grad(): without it autograd records a graph and keeps intermediate
# activations alive for every call, which costs memory and is never used.
# The predicted values themselves are unchanged.
with torch.no_grad():
    # Pre-process one image into PyTorch tensors ("pt"), then feed the
    # resulting keyword arguments straight into the model.
    inputs1 = processor(images=image1, return_tensors="pt")
    outputs1 = model(**inputs1)

    inputs2 = processor(images=image2, return_tensors="pt")
    outputs2 = model(**inputs2)

    inputs3 = processor(images=image3, return_tensors="pt")
    outputs3 = model(**inputs3)

In [27]:
# Turn the raw model outputs for image1 into thresholded detections and print them.

# PIL reports size as (width, height); reversing it gives the order that is
# passed on as the target size for this image.
target_sizes = torch.tensor([image1.size[::-1]])
per_image_results = processor.post_process_object_detection(
    outputs1,
    target_sizes=target_sizes,
    threshold=0.5,  # detections scoring below this are not returned
)
results = per_image_results[0]  # only one image was passed, so take its entry

# Convert the tensors to plain Python values once, up front.
confidences = results["scores"].tolist()
class_ids = results["labels"].tolist()
box_coords = results["boxes"].tolist()

for confidence, class_id, coords in zip(confidences, class_ids, box_coords):
    class_name = model.config.id2label[class_id]
    rounded_box = [round(value, 2) for value in coords]
    rounded_confidence = round(confidence, 3)
    print(
        f"Detected {class_name} with confidence "
        f"{rounded_confidence} at location {rounded_box}"
    )

Detected bird with confidence 0.993 at location [634.56, 77.75, 867.22, 238.67]
Detected person with confidence 0.991 at location [1560.48, 638.52, 1731.96, 866.79]
Detected bird with confidence 0.53 at location [95.23, 76.4, 277.73, 249.04]
Detected bird with confidence 0.985 at location [1281.45, 990.79, 1455.43, 1088.19]
Detected bird with confidence 0.997 at location [1549.96, 71.59, 1751.94, 258.5]
Detected bird with confidence 0.996 at location [943.77, 81.87, 1152.68, 232.77]
Detected apple with confidence 0.994 at location [656.39, 665.85, 852.52, 870.37]
Detected bird with confidence 0.998 at location [1246.86, 70.21, 1452.7, 257.44]
Detected motorcycle with confidence 0.58 at location [650.04, 984.51, 860.3, 1096.39]
Detected bird with confidence 0.549 at location [406.06, 78.91, 570.56, 249.65]
Detected potted plant with confidence 0.509 at location [47.45, 335.95, 286.25, 588.95]
Detected bird with confidence 0.975 at location [1551.72, 988.48, 1743.85, 1091.68]
Detected ap

In [28]:
# Turn the raw model outputs for image2 into thresholded detections and print them.

# PIL reports size as (width, height); reversing it gives the order that is
# passed on as the target size for this image.
target_sizes = torch.tensor([image2.size[::-1]])
per_image_results = processor.post_process_object_detection(
    outputs2,
    target_sizes=target_sizes,
    threshold=0.5,  # detections scoring below this are not returned
)
results = per_image_results[0]  # only one image was passed, so take its entry

# Convert the tensors to plain Python values once, up front.
confidences = results["scores"].tolist()
class_ids = results["labels"].tolist()
box_coords = results["boxes"].tolist()

for confidence, class_id, coords in zip(confidences, class_ids, box_coords):
    class_name = model.config.id2label[class_id]
    rounded_box = [round(value, 2) for value in coords]
    rounded_confidence = round(confidence, 3)
    print(
        f"Detected {class_name} with confidence "
        f"{rounded_confidence} at location {rounded_box}"
    )

Detected kite with confidence 0.542 at location [143.79, 241.55, 248.42, 345.71]
Detected cake with confidence 0.941 at location [10.86, 4.81, 626.09, 349.18]
Detected kite with confidence 0.745 at location [517.17, 239.9, 614.51, 345.44]
Detected kite with confidence 0.922 at location [355.5, 239.85, 494.96, 343.5]


In [30]:
# Turn the raw model outputs for image3 into thresholded detections and print them.

# PIL reports size as (width, height); reversing it gives the order that is
# passed on as the target size for this image.
target_sizes = torch.tensor([image3.size[::-1]])
per_image_results = processor.post_process_object_detection(
    outputs3,
    target_sizes=target_sizes,
    threshold=0.5,  # detections scoring below this are not returned
)
results = per_image_results[0]  # only one image was passed, so take its entry

# Convert the tensors to plain Python values once, up front.
confidences = results["scores"].tolist()
class_ids = results["labels"].tolist()
box_coords = results["boxes"].tolist()

for confidence, class_id, coords in zip(confidences, class_ids, box_coords):
    class_name = model.config.id2label[class_id]
    rounded_box = [round(value, 2) for value in coords]
    rounded_confidence = round(confidence, 3)
    print(
        f"Detected {class_name} with confidence "
        f"{rounded_confidence} at location {rounded_box}"
    )

Detected traffic light with confidence 0.763 at location [87.54, 1710.41, 141.97, 1769.29]
Detected traffic light with confidence 0.619 at location [1397.48, 756.04, 1446.39, 810.65]
Detected traffic light with confidence 0.681 at location [1341.54, 1908.36, 1389.83, 1956.99]
Detected person with confidence 0.913 at location [1683.33, 1967.26, 1712.61, 2046.6]
Detected person with confidence 0.836 at location [1822.92, 1970.76, 1855.79, 2046.79]
Detected car with confidence 0.809 at location [1306.84, 1973.05, 1389.62, 2032.97]
Detected car with confidence 0.562 at location [180.72, 702.11, 477.67, 936.4]
Detected person with confidence 0.509 at location [1619.63, 816.31, 1656.78, 873.18]
Detected person with confidence 0.931 at location [1762.05, 1969.6, 1794.89, 2048.25]
Detected person with confidence 0.875 at location [1775.03, 1970.96, 1805.77, 2049.95]
Detected person with confidence 0.928 at location [1734.5, 1970.5, 1766.79, 2049.56]
Detected fire hydrant with confidence 0.865 