<a href="https://colab.research.google.com/github/adams-x0/cv_project/blob/main/owlvit_demo_week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch torchvision opencv-python
!pip install matplotlib


In [None]:
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from PIL import Image
import requests

# The processor and the detection model are loaded from the same pretrained
# OWL-ViT checkpoint, so the identifier is spelled out once and reused.
checkpoint = "google/owlvit-base-patch32"

# Processor: prepares the text queries and the image for the model.
processor = OwlViTProcessor.from_pretrained(checkpoint)

# Detector: the OWL-ViT object-detection model itself.
model = OwlViTForObjectDetection.from_pretrained(checkpoint)


In [None]:
from io import BytesIO

url = "https://images.unsplash.com/photo-1593642634315-48f5414c3ad9"  # example image

# Download the whole image up front. The timeout keeps the cell from hanging
# on a stalled connection, and raise_for_status() turns an HTTP error into a
# clear exception instead of letting PIL fail on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from memory and normalise to 3-channel RGB so that grayscale,
# palette or RGBA sources are handed on in the same format as a plain JPEG.
image = Image.open(BytesIO(response.content)).convert("RGB")

# A bare trailing expression renders the image inline in the notebook;
# image.show() depends on which external viewer Pillow picks on the host.
image


In [None]:
# Objects you want to detect, written as free-text labels.
candidate_labels = ["laptop", "person", "dog"]

# The labels are wrapped in an outer list; later cells read them back
# as texts[0][label].
texts = [candidate_labels]


In [None]:
import torch

# Turn the text queries and the image into the tensors the model expects.
inputs = processor(text=texts, images=image, return_tensors="pt")

# Inference only: switch autograd off so the forward pass does not build a
# gradient graph (less memory, and the printed tensors carry no grad_fn).
with torch.no_grad():
    outputs = model(**inputs)

# Post-process boxes
# PIL reports image.size as (width, height); reversing it yields the
# (height, width) order used for target_sizes, so the boxes come back in
# pixel coordinates of the original image.
target_sizes = torch.tensor([image.size[::-1]])  # height, width
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)[0]

# Show detected objects
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    # Only report detections scoring above the 0.3 confidence cut-off.
    if score > 0.3:
        # `label` indexes into the list of text queries given for this image.
        print(f"Detected {texts[0][label]} with score {score:.2f} at {box}")


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Draw the image on an explicit figure/axes pair, then overlay the detections.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(image)

detections = zip(results["scores"], results["labels"], results["boxes"])
for score, label, box in detections:
    # Skip anything that does not clear the 0.3 confidence cut-off.
    if not score > 0.3:
        continue

    # Box corners: (x0, y0) is the top-left, (x1, y1) the bottom-right.
    left, top, right, bottom = box.detach().cpu().numpy()

    outline = patches.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        linewidth=2,
        edgecolor='red',
        facecolor='none',
    )
    ax.add_patch(outline)

    # Caption sits 5 px above the top-left corner of the box.
    caption = f"{texts[0][label]} {score:.2f}"
    ax.text(left, top - 5, caption, color='red', fontsize=12)

plt.show()
