In [12]:
from transformers import DetrImageProcessor, DetrForObjectDetection, DetrFeatureExtractor, DetrConfig
import torch
from PIL import Image

In [2]:
# Load one image to test
image = Image.open('RebarDSC/images/rebar_0_20MM.jpg')


In [9]:
# Load pre-trained model and image processor
pretrained_model = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(pretrained_model, revision="no_timm")
model = DetrForObjectDetection.from_pretrained(pretrained_model, revision="no_timm")


In [4]:
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

In [6]:
# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.5
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.5)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )

Detected teddy bear with confidence 0.839 at location [108.81, 113.71, 2645.52, 3104.0]


In [13]:
# The model needs finetunning to detect rebar
num_labels = 1
feature_extractor = DetrFeatureExtractor.from_pretrained(
    pretrained_model, format="coco_panoptic"
)
config = DetrConfig.from_pretrained(
    pretrained_model,
    num_labels=num_labels
)

model = DetrForObjectDetection.from_pretrained(pretrained_model)
model.detr.class_labels_classifier = Linear(
    in_features=model.config.hidden_size,
    out_features=num_labels + 1,  # +1 for "no object" class
)


config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

In [14]:
config

DetrConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "DetrForObjectDetection"
  ],
  "attention_dropout": 0.0,
  "auxiliary_loss": false,
  "backbone": "resnet50",
  "backbone_config": null,
  "bbox_cost": 5,
  "bbox_loss_coefficient": 5,
  "class_cost": 1,
  "classifier_dropout": 0.0,
  "d_model": 256,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "dice_loss_coefficient": 1,
  "dilation": false,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_coefficient": 0.1,
  "giou_cost": 2,
  "giou_loss_coefficient": 2,
  "init_std": 0.02,
  "init_xavier_std": 1.0,
  "is_encoder_decoder": true,
  "mask_loss_coefficient": 1,
  "max_position_embeddings": 1024,
  "model_type": "detr",
  "num_channels": 3,
  "num_hidden_layers": 6,
  "num_queries": 100,
  "position_embedding_type": "sine",
  "scale_embed