# Test Run

In [5]:
!pip install torch


Collecting torch
  Downloading torch-2.6.0-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Downloading MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading torch-2.6.0-cp310-cp310-win_amd64.whl (204.2 MB)
   ---------------------------------------- 0.0/204.2 MB ? eta -:--:--
   ---------------------------------------- 0.5/204.2 MB 2.8 MB/s eta 0:01:14
   ---------------------------------------- 1.0/204.2 MB 2.6 MB/s eta 0:01:17
   ---------------------------------------- 1.3/204.2 MB 2.3 MB/s eta 0:01:28
   ------------

In [2]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("image-text-to-text", model="unsloth/llava-1.5-7b-hf-bnb-4bit")
# messages = [
#     {
#       "role": "user",
#       "content": [
#           {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
#           {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
#         ],
#     },
# ]

# out = pipe(text=messages, max_new_tokens=20)
# print(out)

In [12]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# conditional image captioning
text = "is there a dog on this picture?"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a photography of a woman and her dog

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


is there a dog on this picture?
a woman sitting on the beach with her dog


In [13]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

model_id = "IDEA-Research/grounding-dino-tiny"
device = "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Check for cats and remote controls
text = "a cat. a remote control."

inputs = processor(images=image, text=text, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[image.size[::-1]]
)
print(results)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[{'scores': tensor([0.4785, 0.4379, 0.4760]), 'labels': ['a cat', 'a cat', 'a remote control'], 'boxes': tensor([[344.6974,  23.1085, 637.1821, 374.2747],
        [ 12.2688,  51.9106, 316.8565, 472.4336],
        [ 38.5868,  70.0092, 176.7758, 118.1749]])}]


In [16]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence-transformers)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence-transformers)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.1 MB 2.6 MB/s eta 0:00:05
   ---- -------------

In [23]:
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
from PIL import Image
import requests

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP model from SentenceTransformer
clip_model = SentenceTransformer('clip-ViT-B-32')

# Load GPT-2 model and tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Download and open the image
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# Encode the image using CLIP
img_emb = clip_model.encode(image)  # Typically a 512-dim vector for 'clip-ViT-B-32'
img_emb_tensor = torch.tensor(img_emb, dtype=torch.float32).unsqueeze(0).to(device)  # Shape: (1, 512)

# Create a projection layer to map CLIP embeddings (512) to GPT-2 embeddings (768)
proj = nn.Linear(512, 768).to(device)

# Project the CLIP embedding into GPT-2's embedding space
prefix_emb = proj(img_emb_tensor)  # Shape: (1, 768)

# If you want a prefix sequence longer than 1 token, you can expand this vector
# For simplicity, we'll use a single token prefix; add a sequence dimension:
prefix_emb = prefix_emb.unsqueeze(1)  # Shape: (1, 1, 768)

# Now, generate text conditioned on this prefix using GPT-2.
# Note: GPT-2 was not trained on this type of input, so the result may be unrefined.
generated_outputs = gpt2_model.generate(
    inputs_embeds=prefix_emb,  # Using our projected image embedding as a prefix
    max_length=50,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_p=0.95,
    temperature=0.7,
)

# Decode the generated tokens to text
caption = gpt2_tokenizer.decode(generated_outputs[0], skip_special_tokens=True)
print("Generated Caption:", caption)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Caption: 

The following are the current state of the European Union's banking sector, which are expected to see a rise in the next year.
 "This year, the EU banking system has seen a fall of almost 10% in GDP (in the


In [30]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# conditional image captioning
text = "a dog is"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))


a dog is sitting on the beach with a woman
a woman sitting on the beach with her dog


In [29]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2]
target_sizes = torch.Tensor([image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to COCO API
results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Print detected objects and rescaled box coordinates
for box, score, label in zip(boxes, scores, labels):
    box = [round(i, 2) for i in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 