In [2]:
from transformers import Dinov2Model, AutoImageProcessor
from PIL import Image
import torch

# Step 1: Load the pre-trained DINOv2 model and processor
model = Dinov2Model.from_pretrained("facebook/dinov2-base")
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")

# Step 2: Load an input image
image = Image.open("linkedin_feed_sample.png")  # Replace with your image path

# Step 3: Preprocess the image
inputs = processor(images=image, return_tensors="pt")

# Step 4: Extract features using DINOv2
with torch.no_grad():
    outputs = model(**inputs)

# The extracted visual features
features = outputs.last_hidden_state
print(f"Shape of the feature vector: {features.shape}")
print(f"features {features}")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Shape of the feature vector: torch.Size([1, 257, 768])
features tensor([[[-0.7701,  1.2928,  0.0685,  ..., -0.6475,  0.9421,  1.2417],
         [-0.0187, -1.2202, -0.7044,  ..., -2.2026, -2.4774,  0.2517],
         [ 0.2693,  2.4182, -0.4973,  ...,  0.3492, -3.7005,  0.3649],
         ...,
         [ 0.2586,  0.6654, -0.8376,  ...,  0.9047, -1.7900, -0.1635],
         [ 0.1109, -0.2485, -0.1484,  ...,  1.0049, -0.5792,  0.9624],
         [ 1.1552,  0.1653, -1.0976,  ...,  0.7416, -0.5684,  0.6595]]])


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Assuming you have already extracted the features using DINOv2

# Step 1: Convert features to text (a simple method to just convert tensor values to text)
# Here we're converting the first 5 feature vectors into a string (you can customize this)
feature_text = " ".join([f"Feature_{i}: {features[0, i, :5].tolist()}" for i in range(5)])
print("Generated feature text:", feature_text)

# Step 2: Load a small LLM (GPT2 or a smaller variant like distilgpt2)
model_name = "distilgpt2"  # This is a lightweight model suitable for local machines
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 3: Tokenize the text and pass it to the model
inputs = tokenizer(feature_text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=500, num_return_sequences=1)

# Step 4: Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", generated_text)


Generated feature text: Feature_0: [-0.7701189517974854, 1.2928109169006348, 0.06849299371242523, -0.26569458842277527, -1.7128863334655762] Feature_1: [-0.018731415271759033, -1.2202171087265015, -0.7043544054031372, -1.6775083541870117, -1.361791968345642] Feature_2: [0.2693025469779968, 2.41815185546875, -0.49726587533950806, -2.247089147567749, 0.6631515622138977] Feature_3: [-0.24857012927532196, 2.2584683895111084, -1.543166995048523, -2.7039871215820312, 1.346882939338684] Feature_4: [0.4978777766227722, 2.021416187286377, 0.002517223358154297, -1.9776861667633057, -0.059578314423561096]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: Feature_0: [-0.7701189517974854, 1.2928109169006348, 0.06849299371242523, -0.26569458842277527, -1.7128863334655762] Feature_1: [-0.018731415271759033, -1.2202171087265015, -0.7043544054031372, -1.6775083541870117, -1.361791968345642] Feature_2: [0.2693025469779968, 2.41815185546875, -0.49726587533950806, -2.247089147567749, 0.6631515622138977] Feature_3: [-0.24857012927532196, 2.2584683895111084, -1.543166995048523, -2.7039871215820312, 1.346882939338684] Feature_4: [0.4978777766227722, 2.021416187286377, 0.002517223358154297, -1.9776861667633057, -0.059578314423561096] Feature_5: [0.4978777766


This did not work because distilgp2 was not able to generate any meaningful response from the input. 