In [13]:
import os
import torch
import torchvision
from groundingdino.util.inference import load_model, load_image, annotate, predict as _predict
import cv2
import tqdm

In [14]:
# Build the detector once; `predict` below reads this module-level `model`.
# Both paths are relative, so they resolve against the kernel's working directory.
model = load_model(
    "GroundingDINO_SwinT_OGC.py",   # presumably the model config file -- confirm
    "groundingdino_swint_ogc.pth",  # presumably the pretrained checkpoint -- confirm
)

final text_encoder_type: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
def predict(image, caption, box_threshold, text_threshold, save=False):
    """Run text-prompted detection on one image and return one record per box.

    Uses the module-level ``model`` together with the GroundingDINO helpers
    ``load_image``, ``_predict`` and ``annotate``. The boxes returned by
    ``_predict`` are treated as normalized ``cxcywh`` values: they are scaled
    by the source image's width/height and converted to ``xyxy``.

    Args:
        image: Path of the image file; passed to ``load_image``.
        caption: Text prompt forwarded to ``_predict``.
        box_threshold: Forwarded to ``_predict`` as ``box_threshold``.
        text_threshold: Forwarded to ``_predict`` as ``text_threshold``.
        save: If true, write an annotated copy of the image to
            ``output/<file name>`` (the directory is created when missing).

    Returns:
        A list of dicts, one per detected box, with keys ``img_name``,
        ``class``, ``confidence``, ``ymin``, ``xmin``, ``ymax`` and ``xmax``.
        The list is empty when no box is returned.
    """
    # basename() also copes with "/" separators on Windows, unlike split(os.sep).
    fname = os.path.basename(image)

    # Keep the path parameter intact; the loaded tensor gets its own name.
    image_source, image_tensor = load_image(image)
    boxes, confidence, phrases = _predict(
        model=model,
        image=image_tensor,
        caption=caption,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )

    if save:
        os.makedirs('output', exist_ok=True)
        annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=confidence, phrases=phrases)
        cv2.imwrite(os.path.join('output', fname), annotated_frame)

    h, w, _ = image_source.shape
    # Undo normalization; new_tensor() keeps the scale on the boxes' dtype/device.
    scale = boxes.new_tensor([w, h, w, h])
    boxes_xyxy = torchvision.ops.box_convert(
        boxes=boxes * scale, in_fmt="cxcywh", out_fmt="xyxy"
    ).tolist()
    scores = confidence.tolist()

    return [{
        'img_name': fname,
        'class': 0,  # TODO: replace with actual ReID model results, but for now can just submit bbox and see map, 1: suspect, 0: not suspect
        'confidence': score,
        'ymin': ymin,
        'xmin': xmin,
        'ymax': ymax,
        'xmax': xmax
    } for (xmin, ymin, xmax, ymax), score in zip(boxes_xyxy, scores)]

In [28]:
# Run configuration: image directory, text prompt and the two thresholds
# that are forwarded to `predict` for every image.
dataset_root = '../RT-DETR/dataset/test/images'
TEXT_PROMPT = "plushie"
BOX_TRESHOLD = 0.7
TEXT_TRESHOLD = 0.7
results = []  # reset on every run so re-executing the cell does not duplicate rows

# os.listdir() returns entries in arbitrary order; sort so the result rows are
# reproducible. Skip anything that is not a regular file (for example a
# `.ipynb_checkpoints` directory), which could not be loaded as an image.
image_names = [
    name for name in sorted(os.listdir(dataset_root))
    if os.path.isfile(os.path.join(dataset_root, name))
]

for fname in tqdm.tqdm(image_names):
    results.extend(predict(os.path.join(dataset_root, fname), TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD, save=False))

100%|██████████| 1/1 [00:00<00:00,  2.26it/s]


In [30]:
import pandas as pd

# Pin the columns explicitly: with no detections `results` is empty, and a
# DataFrame built from an empty list has no columns, so the CSV would be
# written without a header row. The order matches the record dicts in `results`.
SUBMISSION_COLUMNS = ['img_name', 'class', 'confidence', 'ymin', 'xmin', 'ymax', 'xmax']
df = pd.DataFrame(results, columns=SUBMISSION_COLUMNS)
df.to_csv('submission.csv', index=False)