In [2]:
import torch

def extract_confidences(pred_phrases):
    """Parse the trailing ``(score)`` of every phrase into a 1-D tensor.

    Each phrase is expected to end with a parenthesised number, for example
    ``'cells(0.58)'``. The text after the last ``'('`` is converted with
    ``float`` after trailing ``')'`` characters are stripped.
    """
    scores = []
    for phrase in pred_phrases:
        tail = phrase.split('(')[-1]
        scores.append(float(tail.rstrip(')')))
    return torch.tensor(scores)

def xywh_to_xyxy(boxes):
    """Convert centre-format boxes to corner-format boxes.

    ``boxes`` is a 2-D tensor whose first four columns are ``[cx, cy, w, h]``;
    the result is an ``(N, 4)`` tensor of ``[x1, y1, x2, y2]``.
    """
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2
    corners = (
        boxes[:, 0] - half_w,  # x1
        boxes[:, 1] - half_h,  # y1
        boxes[:, 0] + half_w,  # x2
        boxes[:, 1] + half_h,  # y2
    )
    return torch.stack(corners, dim=1)

def compute_iou(box1, boxes2):
    """Compute the IoU between one box and every box of a batch.

    Args:
        box1: 1-D tensor ``[x1, y1, x2, y2]`` (corner format).
        boxes2: 2-D tensor with one ``[x1, y1, x2, y2]`` row per box.

    Returns:
        1-D tensor holding one IoU value per row of ``boxes2``. Pairs whose
        union area is not positive (e.g. two zero-area boxes) yield ``0``
        instead of the ``NaN`` that ``0 / 0`` would produce.
    """
    # Corners of the intersection rectangle of box1 with each row of boxes2.
    inter_x1 = torch.max(box1[0], boxes2[:, 0])
    inter_y1 = torch.max(box1[1], boxes2[:, 1])
    inter_x2 = torch.min(box1[2], boxes2[:, 2])
    inter_y2 = torch.min(box1[3], boxes2[:, 3])

    # Negative extents mean the boxes do not overlap; clamp them to zero area.
    inter_area = (inter_x2 - inter_x1).clamp(min=0) * (inter_y2 - inter_y1).clamp(min=0)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    union_area = area1 + area2 - inter_area
    iou = inter_area / union_area
    # Replace the undefined ratio of degenerate pairs by 0 so callers never see NaN.
    return torch.where(union_area > 0, iou, torch.zeros_like(iou))

def filter_by_iou_and_confidence(boxes_filt, pred_phrases, iou_threshold=0.5, large_box_area_thresh=0.9):
    """Drop oversized boxes, then run greedy confidence-ordered NMS.

    Args:
        boxes_filt: ``(N, 4)`` tensor of ``[cx, cy, w, h]`` boxes.
        pred_phrases: list of ``N`` labels ending in ``(confidence)``, in the
            format parsed by ``extract_confidences``.
        iou_threshold: a box is suppressed when its IoU with an already kept,
            more confident box is strictly greater than this value.
        large_box_area_thresh: despite the name, this is compared against the
            width and the height separately; a box is discarded when either
            one is greater than or equal to the threshold.

    Returns:
        ``(boxes, phrases)`` for the surviving detections, in their original
        relative order. Empty input is returned unchanged.
    """
    if boxes_filt.size(0) == 0:
        return boxes_filt, pred_phrases

    confidences = extract_confidences(pred_phrases)
    boxes_xyxy = xywh_to_xyxy(boxes_filt)

    # Optional: filter out extremely large boxes
    not_large = (boxes_filt[:, 2] < large_box_area_thresh) & (boxes_filt[:, 3] < large_box_area_thresh)
    boxes_filt = boxes_filt[not_large]
    boxes_xyxy = boxes_xyxy[not_large]
    confidences = confidences[not_large]
    pred_phrases = [pred_phrases[i] for i in torch.where(not_large)[0].tolist()]

    num_boxes = len(boxes_filt)
    keep = torch.ones(num_boxes, dtype=torch.bool)

    # Visit boxes from most to least confident. sorted() is stable, so boxes
    # with equal confidence keep their input order and the earlier one wins.
    order = sorted(range(num_boxes), key=lambda idx: -confidences[idx].item())
    for i in order:
        # Only boxes that are still kept may suppress others; a suppressed box
        # must not knock out further boxes.
        if not keep[i]:
            continue
        overlaps = compute_iou(boxes_xyxy[i], boxes_xyxy) > iou_threshold
        # Exclude the box itself by index (not by `iou < 1.0`), so that exact
        # duplicates of the box are suppressed as well.
        overlaps[i] = False
        keep[overlaps] = False

    boxes_final = boxes_filt[keep]
    phrases_final = [pred_phrases[i] for i in torch.where(keep)[0].tolist()]

    return boxes_final, phrases_final

# Sample detections, one [cx, cy, w, h] row per box.
boxes = torch.tensor(
    [
        [0.1462, 0.6839, 0.2889, 0.2769],
        [0.7806, 0.4126, 0.0584, 0.0899],
        [0.1423, 0.6661, 0.2017, 0.1897],
        [0.5000, 0.4993, 0.9953, 0.9976],  # width and height both exceed the default 0.9 cut-off
        [0.4954, 0.1343, 0.0377, 0.0950],
        [0.4384, 0.4964, 0.0300, 0.0708],
    ]
)
# One label per box; the number in parentheses is what extract_confidences parses.
phrases = [
    'cells(0.58)',
    'cells(0.32)',
    'cells(0.30)',
    'cells(0.32)',
    'cells(0.23)',
    'cells(0.22)',
]

# Filter with the default IoU and size thresholds.
filtered_boxes, filtered_phrases = filter_by_iou_and_confidence(boxes_filt=boxes, pred_phrases=phrases)
# Bare tuple as the last expression so the notebook renders the result.
filtered_boxes, filtered_phrases

(tensor([[0.1462, 0.6839, 0.2889, 0.2769],
         [0.7806, 0.4126, 0.0584, 0.0899],
         [0.1423, 0.6661, 0.2017, 0.1897],
         [0.4954, 0.1343, 0.0377, 0.0950],
         [0.4384, 0.4964, 0.0300, 0.0708]]),
 ['cells(0.58)', 'cells(0.32)', 'cells(0.30)', 'cells(0.23)', 'cells(0.22)'])

In [4]:
import torch

# Sample detections, one [cx, cy, w, h] row per box.
boxes = torch.tensor(
    [
        [0.1462, 0.6839, 0.2889, 0.2769],
        [0.7806, 0.4126, 0.0584, 0.0899],
        [0.1423, 0.6661, 0.2017, 0.1897],
        [0.5000, 0.4993, 0.9953, 0.9976],
        [0.4954, 0.1343, 0.0377, 0.0950],
        [0.4384, 0.4964, 0.0300, 0.0708],
    ]
)
# One label per box; the parenthesised number is the detection confidence
# that extract_confidences parses.
phrases = [
    'cells(0.58)',
    'cells(0.32)',
    'cells(0.30)',
    'cells(0.32)',
    'cells(0.23)',
    'cells(0.22)',
]

def extract_confidences(pred_phrases):
    """Return a 1-D tensor with the confidence embedded in each phrase.

    The confidence is the number between the last ``'('`` of the phrase and
    its trailing ``')'``, e.g. ``0.58`` for ``'cells(0.58)'``.
    """
    def _score(phrase):
        return float(phrase.split('(')[-1].rstrip(')'))

    return torch.tensor(list(map(_score, pred_phrases)))

def xywh_to_xyxy(boxes):
    """Convert ``[cx, cy, w, h]`` rows into ``[x1, y1, x2, y2]`` rows.

    Only the first four columns of ``boxes`` are read; the result always has
    four columns.
    """
    center_x, center_y = boxes[:, 0], boxes[:, 1]
    half_width, half_height = boxes[:, 2] / 2, boxes[:, 3] / 2
    left, top = center_x - half_width, center_y - half_height
    right, bottom = center_x + half_width, center_y + half_height
    return torch.stack((left, top, right, bottom), dim=1)

def compute_iou(box1, box2):
    """Return the IoU of two ``[x1, y1, x2, y2]`` boxes.

    Works on anything indexable by position (lists, tuples, 1-D tensors).
    When the union area is not positive the result is the integer ``0``.
    """
    # Edges of the intersection rectangle.
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # A negative extent means no overlap along that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter_area = overlap_w * overlap_h

    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter_area

    if union > 0:
        return inter_area / union
    return 0

def nms_by_confidence(boxes, phrases, iou_threshold=0.3):
    """Greedy non-maximum suppression ordered by the confidence in each phrase.

    Boxes are visited from highest to lowest confidence. Every visited box
    that has not been suppressed is kept, and it suppresses each remaining box
    whose IoU with it is greater than ``iou_threshold``.

    Returns:
        ``(boxes, phrases)`` for the kept detections, ordered by descending
        confidence (not by their original position).
    """
    scores = extract_confidences(phrases)
    corners = xywh_to_xyxy(boxes)
    ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)

    survivors = []
    suppressed = set()

    for current in ranked:
        if current in suppressed:
            continue
        survivors.append(current)
        for other in ranked:
            if other == current or other in suppressed:
                continue
            if compute_iou(corners[current], corners[other]) > iou_threshold:
                suppressed.add(other)

    return boxes[survivors], [phrases[idx] for idx in survivors]

# Run the confidence-ordered NMS on the sample detections with its default
# IoU threshold, then print what survives.
filtered_boxes, filtered_phrases = nms_by_confidence(boxes=boxes, phrases=phrases)

print("Filtered boxes:\n", filtered_boxes)
print("Filtered phrases:\n", filtered_phrases)

Filtered boxes:
 tensor([[0.1462, 0.6839, 0.2889, 0.2769],
        [0.7806, 0.4126, 0.0584, 0.0899],
        [0.5000, 0.4993, 0.9953, 0.9976],
        [0.4954, 0.1343, 0.0377, 0.0950],
        [0.4384, 0.4964, 0.0300, 0.0708]])
Filtered phrases:
 ['cells(0.58)', 'cells(0.32)', 'cells(0.32)', 'cells(0.23)', 'cells(0.22)']


In [2]:
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

# Inputs for this run. The identifier spellings are left untouched because
# they are notebook-level names that other cells may read.
IMAGE_PATH = ".asset/cat_dog.jpeg"
TEXT_PROMPT = "chair . person . dog ."
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25

# Model config and checkpoint are given relative to the notebook's working directory.
model = load_model(
    "groundingdino/config/GroundingDINO_SwinT_OGC.py",
    "../04-06-segment-anything/weights/groundingdino_swint_ogc.pth",
)

# load_image returns two values; the first is passed to annotate, the second to predict.
image_source, image = load_image(IMAGE_PATH)

boxes, logits, phrases = predict(model=model, image=image, caption=TEXT_PROMPT, box_threshold=BOX_TRESHOLD, text_threshold=TEXT_TRESHOLD)

annotated_frame = annotate(
    image_source=image_source,
    boxes=boxes,
    logits=logits,
    phrases=phrases,
)
# Kept as the last expression: imwrite's return value is what the cell displays.
cv2.imwrite("annotated_image.jpg", annotated_frame)

final text_encoder_type: bert-base-uncased


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



True

In [None]:
import os
import cv2
import torch
import numpy as np
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from glob import glob
from time import time
import pytesseract
import random  
import re
from PIL import ImageOps

import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
from groundingdino.util.slconfig import SLConfig

class GroundingDINOApp:
    """Wrapper around a Grounding DINO model for text-prompted detection and OCR.

    Attributes set by ``__init__``:
        cpu_only: stored as given; no method of this class consults it.
        device: device the model and the input tensors are moved to.
        model: the network returned by ``load_model``.
    """

    def __init__(self, config_path, checkpoint_path, device, cpu_only=False):
        """Store the device settings and load the model from the two paths."""
        self.cpu_only = cpu_only
        self.device = device
        self.model = self.load_model(config_path, checkpoint_path)

    def load_model(self, config_path, checkpoint_path):
        """Build the model from ``config_path`` and load weights from ``checkpoint_path``.

        Returns the model in eval mode, moved to ``self.device``.
        """
        args = SLConfig.fromfile(config_path)
        args.device = self.device
        model = build_model(args)
        # NOTE(review): torch.load unpickles the file, which can execute code
        # embedded in it -- only load checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        # strict=False: key mismatches between checkpoint and model do not raise.
        model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
        model.eval()
        return model.to(self.device)

    def preprocess_image(self, image_pil):
        """Apply the resize / to-tensor / normalize pipeline to a PIL image.

        Returns ``(image_pil, tensor)``: the untouched input image and the
        transformed tensor moved to ``self.device``.
        """
        transform = T.Compose([
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ])
        # The transforms take (image, target); no target is needed here.
        image, _ = transform(image_pil, None)
        return image_pil, image.to(self.device)

    def plot_boxes(self, image_pil, boxes, labels):
        """Draw every box and its label onto ``image_pil`` and return it.

        ``image_pil`` is modified in place, so pass a copy to keep the
        original. ``boxes`` holds ``[cx, cy, w, h]`` tensors expressed as
        fractions of the image size; ``labels`` holds one string per box.
        """
        W, H = image_pil.size
        draw = ImageDraw.Draw(image_pil)
        font = ImageFont.load_default()

        for box, label in zip(boxes, labels):
            # Scale fractions to pixels; build the factor on the box's own
            # device so boxes that still live on the GPU do not raise.
            box = box * torch.tensor([W, H, W, H], device=box.device)
            # [cx, cy, w, h] -> [x0, y0, x1, y1]
            box[:2] -= box[2:] / 2
            box[2:] += box[:2]
            x0, y0, x1, y1 = box.int().tolist()
            color = tuple(np.random.randint(0, 255, size=3).tolist())
            draw.rectangle([x0, y0, x1, y1], outline=color, width=4)
            draw.text((x0, y0), label, fill="white", font=font)
        return image_pil

    def get_grounding_output(self, image_tensor, caption, box_thresh, text_thresh):
        """Run the model on one image and return the boxes above ``box_thresh``.

        The caption is stripped, lower-cased and terminated with ``'.'``
        before being passed to the model.

        Returns:
            ``(boxes_filt, pred_phrases)``: the rows of ``pred_boxes`` whose
            highest sigmoid logit exceeds ``box_thresh``, and one string per
            row of the form ``"<phrase> (<max logit, two decimals>)"``.
        """
        image_tensor = image_tensor.to(self.device)
        caption = caption.strip().lower()
        if not caption.endswith("."):
            caption += "."

        with torch.no_grad():
            # image_tensor[None] adds the batch dimension for a single image.
            outputs = self.model(image_tensor[None], captions=[caption])

        logits = outputs["pred_logits"].sigmoid()[0]
        boxes = outputs["pred_boxes"][0]

        # Keep a prediction when its best-scoring token passes the box threshold.
        filt_mask = logits.max(dim=1)[0] > box_thresh
        logits_filt = logits[filt_mask]
        boxes_filt = boxes[filt_mask]

        tokenized = self.model.tokenizer(caption)
        pred_phrases = [
            get_phrases_from_posmap(logit > text_thresh, tokenized, self.model.tokenizer) +
            f" ({logit.max().item():.2f})"
            for logit in logits_filt
        ]
        return boxes_filt, pred_phrases

    def crop_and_ocr(self, original_image, box):
        """Crop ``box`` out of ``original_image`` and run Tesseract on the crop.

        Args:
            original_image: array indexed as ``[row, column, ...]``; only its
                first two dimensions are used for sizing.
            box: ``[cx, cy, w, h]`` as fractions of the image size; anything
                with a ``tolist()`` method.

        Returns:
            ``(cropped, text)``. When the box does not cover any pixel of the
            image, the crop is empty and ``text`` is ``""``; Tesseract is not
            invoked in that case.
        """
        H, W = original_image.shape[:2]
        cx, cy, bw, bh = box.tolist()

        x1 = int((cx - bw / 2) * W)
        y1 = int((cy - bh / 2) * H)
        x2 = int((cx + bw / 2) * W)
        y2 = int((cy + bh / 2) * H)

        # Clamp every edge into the image. A negative right/bottom edge would
        # otherwise be read by the slice as an offset from the end.
        x1, x2 = (min(max(0, edge), W) for edge in (x1, x2))
        y1, y2 = (min(max(0, edge), H) for edge in (y1, y2))

        cropped = original_image[y1:y2, x1:x2]
        if cropped.size == 0:
            # Nothing to recognise; avoid handing an empty image to the OCR engine.
            return cropped, ""

        # --psm 7 asks Tesseract to treat the crop as a single line of text.
        text = pytesseract.image_to_string(cropped, config='--psm 7')

        return cropped, text.strip()
    
# Language selection
st.set_page_config(page_title="Grounding DINO Streamlit Demo", layout="centered")
lang = st.sidebar.selectbox("🌐 Select Language / 언어 선택", ["Korean", "English"])


def _t(english, korean):
    """Return the variant of a UI string that matches the selected language."""
    return english if lang == "English" else korean


# Language-specific text
# NOTE(review): several default paths below are absolute paths on one
# developer machine; they are kept as defaults but will not exist elsewhere.
if lang == "Korean":
    st.title("🔍 Grounding DINO 데모")
    st.write("이미지를 업로드하고 텍스트 프롬프트에 따라 객체를 탐지해보세요.")

    with st.sidebar:
        st.header("설정")
        config_path = st.text_input("설정 파일 경로", "groundingdino/config/GroundingDINO_SwinT_OGC.py")
        checkpoint_path = st.text_input("체크포인트 파일 경로", "/home/bekhzod/Desktop/localization_models_performance/weights/groundingdino_swint_ogc.pth")
        cpu_only = st.checkbox("CPU만 사용", value=False)
        box_thresh = st.slider("박스 임계값", 0.0, 1.0, 0.3, 0.05)
        text_thresh = st.slider("텍스트 임계값", 0.0, 1.0, 0.3, 0.05)

    st.title("🧠 차량 번호판 인식 앱")
    text_prompt = st.text_input("텍스트 프롬프트", "번호판")
    # The untouched Korean default is sent to the model as its English equivalent.
    if text_prompt == "번호판": text_prompt = "license plate"

    uploaded_image = st.file_uploader("또는 이미지 업로드", type=["png", "jpg", "jpeg"])
    image_dir = st.text_input("이미지 폴더 경로 (선택 사항)", "/home/bekhzod/Desktop/localization_models_performance/lp_images/")

else:  # English interface
    st.title("🔍 Grounding DINO Demo")
    st.write("Upload an image and detect objects based on your text prompt.")

    with st.sidebar:
        st.header("Settings")
        config_path = st.text_input("Configuration File Path", "groundingdino/config/GroundingDINO_SwinT_OGC.py")
        checkpoint_path = st.text_input("Checkpoint File Path", "weights/groundingdino_swint_ogc.pth")
        cpu_only = st.checkbox("Use CPU only", value=False)
        box_thresh = st.slider("Box Threshold", 0.0, 1.0, 0.3, 0.05)
        text_thresh = st.slider("Text Threshold", 0.0, 1.0, 0.3, 0.05)

    st.title("🧠 Vehicle License Plate Recognition App")
    text_prompt = st.text_input("Text Prompt", "license plate")

    uploaded_image = st.file_uploader("Or upload an image", type=["png", "jpg", "jpeg"])
    image_dir = st.text_input("Image Folder Path (Optional)", "/home/bekhzod/Desktop/localization_models_performance/lp_images/")

# Initialize model
# Fall back to the CPU when CUDA is requested but not available, instead of
# failing while moving the model to a missing device.
cuda_available = torch.cuda.is_available()
if not cpu_only and not cuda_available:
    st.sidebar.warning(_t("CUDA is not available; running on CPU.", "CUDA를 사용할 수 없어 CPU로 실행합니다."))
device = "cuda" if (cuda_available and not cpu_only) else "cpu"


@st.cache_resource
def _load_grounding_dino(cfg_path, ckpt_path, target_device, cpu_flag):
    """Build the detector once per (paths, device) combination.

    Streamlit re-executes this whole script on every widget interaction;
    caching keeps the model from being rebuilt and reloaded each time.
    """
    return GroundingDINOApp(config_path=cfg_path, checkpoint_path=ckpt_path, cpu_only=cpu_flag, device=target_device)


if os.path.exists(config_path) and os.path.exists(checkpoint_path):
    g_dino = _load_grounding_dino(config_path, checkpoint_path, device, device == "cpu")
else:
    st.error(_t("Please provide valid configuration and checkpoint file paths.", "유효한 설정 파일 및 체크포인트 경로를 입력해주세요."))
    st.stop()

# Image preview and selection
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")  # same formats the uploader accepts
NUM_PREVIEWS = 10
PREVIEWS_PER_ROW = 5

detection_triggered = False
detection_image = None
original_cv2 = None
result_image = None
cropped_img = None
ocr_text = ""

selected_images = []
if os.path.isdir(image_dir):
    # Draw the random sample once per folder and keep it in the session.
    # Re-shuffling on every rerun could remove the clicked image (and its
    # button) from the page before the click is processed.
    if st.session_state.get("preview_dir") != image_dir or not st.session_state.get("preview_paths"):
        image_paths = [
            path for path in glob(os.path.join(image_dir, "*"))
            if path.lower().endswith(IMAGE_EXTENSIONS)
        ]
        random.shuffle(image_paths)
        st.session_state["preview_dir"] = image_dir
        st.session_state["preview_paths"] = image_paths[:NUM_PREVIEWS]
    # Skip files that disappeared since the sample was drawn.
    selected_images = [path for path in st.session_state["preview_paths"] if os.path.isfile(path)]

if selected_images:
    st.markdown(_t("### 🖼️ Random Image Preview", "### 🖼️ 랜덤 이미지 미리보기"))
    for start in range(0, len(selected_images), PREVIEWS_PER_ROW):
        row = selected_images[start:start + PREVIEWS_PER_ROW]
        cols = st.columns(PREVIEWS_PER_ROW)
        for col, img_path in zip(cols, row):
            with col:
                pil_img = Image.open(img_path).convert("RGB")
                pil_img = ImageOps.fit(pil_img, (200, 200))
                st.image(pil_img, caption=os.path.basename(img_path), use_container_width=False)
                # The path doubles as the widget key, so every button is unique.
                if st.button(_t("Detect from Image", "위 이미지 탐지하기"), key=img_path):
                    detection_triggered = True
                    detection_image = Image.open(img_path).convert("RGB")
                    original_cv2 = np.array(detection_image)
elif image_dir.strip():
    # Covers both a path that is not a directory and a folder without images.
    st.warning(_t("Invalid image folder path or folder is empty.", "입력한 이미지 폴더 경로가 잘못되었거나 폴더가 비어있습니다."))

# An uploaded file is used only when no preview button was clicked in this run.
if uploaded_image and not detection_triggered:
    detection_image = Image.open(uploaded_image).convert("RGB")
    original_cv2 = np.array(detection_image)
    detection_triggered = True

if detection_triggered and detection_image is not None:
    st.markdown("---")
    st.markdown(_t("### 🔍 Detection Results", "### 🔍 탐지 결과"))

    st.image(detection_image, caption=_t("Original Image", "원본 이미지"), use_container_width=True)

    with st.spinner(_t("Detecting...", "탐지 중...")):
        _, image_tensor = g_dino.preprocess_image(detection_image)
        boxes, phrases = g_dino.get_grounding_output(image_tensor, text_prompt, box_thresh, text_thresh)
        boxes = boxes.to("cpu")

        # Draw on a copy so the original image shown above stays clean.
        result_image = g_dino.plot_boxes(detection_image.copy(), boxes, phrases)
        st.image(result_image, caption=_t("Detected Results", "탐지된 결과"), use_container_width=True)

        if len(boxes) > 0:
            # Only the first returned box is cropped and read.
            cropped_img, ocr_text = g_dino.crop_and_ocr(original_cv2, boxes[0])
            st.subheader(_t("📝 OCR Result", "📝 OCR 결과"))
            if cropped_img.size == 0:
                st.warning(_t("The detected region is empty.", "탐지된 영역이 비어 있습니다."))
            else:
                st.image(cropped_img, caption=_t("Cropped Region", "잘라낸 영역"), use_container_width=True)

                # Keep only characters that can appear on a plate: letters, digits, '-' and space.
                cleaned_text = re.sub(r'[^A-Za-z0-9\- ]', '', ocr_text)
                st.success(_t(f"OCR Result: {cleaned_text}", f"OCR 인식 결과: {cleaned_text}"))
        else:
            st.warning(_t("No object detected.", "탐지된 객체가 없습니다."))
