In [1]:
pip install pycocotools tqdm




In [2]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from PIL import Image
import requests, torch

from google.colab import drive
drive.mount('/content/drive')

# Single source of truth for the Drive folder that all paths below live under.
drive_root = '/content/drive/MyDrive'

# Path to your dataset
test_dataset_path = f'{drive_root}/cocoann'

dataset_path = f'{drive_root}/TestDataset'
image1 = f'{dataset_path}/images/000000039769.jpg'

# Keep the path and the opened image under separate names so `image2` is
# never a string at one point and a PIL image at another.
image2_path = f'{drive_root}/graf.jpg'
image2 = Image.open(image2_path)

video_path = f'{drive_root}/clipped_2.mp4'

Mounted at /content/drive


In [None]:
import os

# Fail fast if the clip is missing. Raise the same FileNotFoundError (with the
# offending path in the message) that the benchmark cell uses, rather than a
# bare assert whose message does not say which file was looked for.
if not os.path.exists(video_path):
    raise FileNotFoundError(f"Video file not found: {video_path}")


In [5]:
import cv2
import torch
import time
import os
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# -------------------------------
# Setup
# -------------------------------

# Stop before loading the model if the input clip is not on disk.
video_missing = not os.path.exists(video_path)
if video_missing:
    raise FileNotFoundError(f"Video file not found: {video_path}")

# Text prompts handed to the processor for every frame (one inner list per image).
text_labels = [["Graffiti", "graffiti letters", "painted graffiti drawings"]]

# Detection thresholds; defined here but not referenced by the timing loop.
box_threshold = 0.4
text_threshold = 0.3

# Checkpoint identifier passed to both from_pretrained() calls.
model_id = "IDEA-Research/grounding-dino-tiny"

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Loading model...")
processor = AutoProcessor.from_pretrained(model_id)

# Load the weights, move them to the selected device, then switch to eval mode.
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)
model = model.eval()

# -------------------------------
# Video loading
# -------------------------------
# Reads the clip frame by frame, runs the model on each frame, and reports the
# per-frame and average forward-pass time. Only the `model(**inputs)` call is
# inside the timed region; frame decoding and preprocessing are excluded.
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
    raise IOError("Cannot open video file")

# CUDA work is queued asynchronously, so the host clock must be read only after
# the device has finished; otherwise the measured interval can end before the
# forward pass has actually completed.
sync_cuda = device.type == "cuda"

frame_idx = 0
total_time = 0.0

print("Running inference on video...")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read failure): stop the benchmark.
            break

        frame_idx += 1
        # Convert the frame from BGR to RGB before wrapping it as a PIL image.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)

        # Drain any pending GPU work (e.g. the host-to-device copy above) so it
        # is not attributed to the forward pass.
        if sync_cuda:
            torch.cuda.synchronize(device)
        start = time.perf_counter()  # monotonic, high-resolution interval clock
        with torch.no_grad():
            outputs = model(**inputs)
        # Wait for the forward pass to finish on the GPU before stopping the clock.
        if sync_cuda:
            torch.cuda.synchronize(device)
        end = time.perf_counter()

        duration = end - start
        total_time += duration
        print(f"Frame {frame_idx}: {duration:.4f} s")
finally:
    # Always free the capture handle, even if a frame raises mid-run.
    cap.release()

# Guard against division by zero when no frame could be read.
avg_time = total_time / frame_idx if frame_idx else 0
fps = 1 / avg_time if avg_time else 0

print(f"\nProcessed {frame_idx} frames")
print(f"Avg inference time: {avg_time:.4f} s")
print(f"Effective FPS: {fps:.2f}")


Loading model...
Running inference on video...
Frame 1: 0.2851 s
Frame 2: 0.2754 s
Frame 3: 0.2756 s
Frame 4: 0.2760 s
Frame 5: 0.2790 s
Frame 6: 0.2780 s
Frame 7: 0.2760 s
Frame 8: 0.2771 s
Frame 9: 0.2785 s
Frame 10: 0.2771 s
Frame 11: 0.2769 s
Frame 12: 0.2780 s
Frame 13: 0.2784 s
Frame 14: 0.2769 s
Frame 15: 0.2796 s
Frame 16: 0.2721 s
Frame 17: 0.2768 s
Frame 18: 0.2762 s
Frame 19: 0.2781 s
Frame 20: 0.2789 s
Frame 21: 0.2854 s
Frame 22: 0.2796 s
Frame 23: 0.2763 s
Frame 24: 0.2850 s
Frame 25: 0.2817 s
Frame 26: 0.2897 s
Frame 27: 0.2854 s
Frame 28: 0.2835 s
Frame 29: 0.2826 s
Frame 30: 0.2810 s
Frame 31: 0.2766 s
Frame 32: 0.2788 s
Frame 33: 0.2760 s
Frame 34: 0.2799 s
Frame 35: 0.2783 s
Frame 36: 0.2764 s
Frame 37: 0.2772 s
Frame 38: 0.2775 s
Frame 39: 0.2771 s
Frame 40: 0.2776 s
Frame 41: 0.2771 s
Frame 42: 0.2782 s
Frame 43: 0.2792 s
Frame 44: 0.2774 s
Frame 45: 0.2784 s
Frame 46: 0.2770 s
Frame 47: 0.2810 s
Frame 48: 0.2784 s
Frame 49: 0.2784 s
Frame 50: 0.2795 s
Frame 51: 0.

In [7]:
pip install matplotlib scikit-learn


