In [2]:
from transformers import Dinov2Model, AutoImageProcessor
from PIL import Image
import torch

# Step 1: Load the pre-trained DINOv2 model and processor
model = Dinov2Model.from_pretrained("facebook/dinov2-base")
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")

# Step 2: Load an input image
image = Image.open("linkedin_feed_sample.png")  # Replace with your image path

# Step 3: Preprocess the image
inputs = processor(images=image, return_tensors="pt")

print(f"inputs {inputs}")
# Step 4: Extract features using DINOv2
with torch.no_grad():
    outputs = model(**inputs)

# The extracted visual features
features = outputs.last_hidden_state
print(f"Shape of the feature vector: {features.shape}")
print(f"features {features}")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


inputs {'pixel_values': tensor([[[[-1.4329, -1.4329, -1.4329,  ..., -1.4329, -1.4329, -1.4329],
          [-1.4329, -1.4329, -1.4329,  ..., -1.4329, -1.4329, -1.4329],
          [-1.4329, -1.4329, -1.4329,  ..., -1.4329, -1.4329, -1.4329],
          ...,
          [ 2.0605,  2.0605,  2.0605,  ...,  2.0605,  2.0605,  2.0605],
          [ 2.0605,  2.0605,  2.0605,  ...,  2.0605,  2.0605,  2.0605],
          [ 2.0605,  2.0605,  2.0605,  ...,  2.0605,  2.0605,  2.0605]],

         [[-1.3354, -1.3354, -1.3354,  ..., -1.3354, -1.3354, -1.3354],
          [-1.3354, -1.3354, -1.3354,  ..., -1.3354, -1.3354, -1.3354],
          [-1.3354, -1.3354, -1.3354,  ..., -1.3354, -1.3354, -1.3354],
          ...,
          [ 2.2010,  2.2010,  2.2010,  ...,  2.2010,  2.2010,  2.2010],
          [ 2.2010,  2.2010,  2.2010,  ...,  2.2010,  2.2010,  2.2010],
          [ 2.2010,  2.2010,  2.2010,  ...,  2.2010,  2.2010,  2.2010]],

         [[-1.1073, -1.1073, -1.1073,  ..., -1.1073, -1.1073, -1.1073],
      

In [3]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, AutoModelForSeq2SeqLM

# Step 1: Define a simple MLP to map DINOv2 features to token IDs
class FeatureToTokenMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeatureToTokenMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Step 2: Load your pre-trained DINOv2 features
# Assuming features is the tensor extracted from DINOv2 (shape: [batch_size, tokens, feature_dim])
# Example dimensions: (1, 197, 768) -> Taking 768 as feature_dim

# Step 3: Initialize the MLP
input_dim = features.shape[-1]  # 768 (feature dimension of DINOv2)
hidden_dim = 512  # Hidden layer size
output_dim = 128  # Mapping to a smaller dimension that represents token IDs
mlp = FeatureToTokenMLP(input_dim, hidden_dim, output_dim)

# Step 4: Pass the DINOv2 features through the MLP
# Flatten features and process them
flattened_features = features.view(-1, input_dim)  # Shape: [batch_size * tokens, input_dim]
mlp_output = mlp(flattened_features)

# Step 5: Convert MLP output to token-like representations (Simulating token IDs)
# For simplicity, we're mapping to a token range for GPT-2 (50257 vocab size)
vocab_size = 50257  # GPT-2 tokenizer vocabulary size
mlp_token_ids = torch.argmax(mlp_output, dim=-1) % vocab_size  # Shape: [batch_size * tokens]

# Step 6: Token IDs are now generated, and we can pass them into GPT-2 for text generation

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# model_name = "MiniGPT-4"  # You can look for a smaller version if available
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Reshape token IDs back to (batch_size, tokens)
mlp_token_ids = mlp_token_ids.view(1, -1)  # Batch of token sequences

attention_mask = torch.ones(mlp_token_ids.shape, dtype=torch.long)

# Step 7: Generate text using GPT-2 from token-like IDs
# Note: Simulating input token text based on MLP token output
generated_output = model.generate(input_ids=mlp_token_ids, attention_mask=attention_mask, max_length=300)

# Step 8: Decode the generated output to get human-readable text
generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
print("Generated response:", generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: )*������y*+y�����yW�D�Dyyr)rr�wDD)qr)yrDr�+rrLy++)r*+yr+Dr+rrry+yyrqyrry+r+++r�+yrk+ryy+rr+rrr�++rr��+++y$+++q�++yrr++00qq+qyr++yrrr0r0Z0rr0rry+��rr000+00+00r)rwrrr0|0000+0r))Cwryr000+0++00r)CC�Wrr0r0rrrrrrwCy�Lr�ryryrrrrr))y�Lyyr��r:y:r�D�)|Lr��y?|r+rr��D|r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r+r


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Assuming you have already extracted the features using DINOv2

# Step 1: Convert features to text (a simple method to just convert tensor values to text)
# Here we're converting the first 5 feature vectors into a string (you can customize this)
feature_text = " ".join([f"Feature_{i}: {features[0, i, :5].tolist()}" for i in range(5)])
print("Generated feature text:", feature_text)

# Step 2: Load a small LLM (GPT2 or a smaller variant like distilgpt2)
model_name = "distilgpt2"  # This is a lightweight model suitable for local machines
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 3: Tokenize the text and pass it to the model
inputs = tokenizer(feature_text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=500, num_return_sequences=1)

# Step 4: Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated response:", generated_text)


Generated feature text: Feature_0: [-0.7701189517974854, 1.2928109169006348, 0.06849299371242523, -0.26569458842277527, -1.7128863334655762] Feature_1: [-0.018731415271759033, -1.2202171087265015, -0.7043544054031372, -1.6775083541870117, -1.361791968345642] Feature_2: [0.2693025469779968, 2.41815185546875, -0.49726587533950806, -2.247089147567749, 0.6631515622138977] Feature_3: [-0.24857012927532196, 2.2584683895111084, -1.543166995048523, -2.7039871215820312, 1.346882939338684] Feature_4: [0.4978777766227722, 2.021416187286377, 0.002517223358154297, -1.9776861667633057, -0.059578314423561096]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated response: Feature_0: [-0.7701189517974854, 1.2928109169006348, 0.06849299371242523, -0.26569458842277527, -1.7128863334655762] Feature_1: [-0.018731415271759033, -1.2202171087265015, -0.7043544054031372, -1.6775083541870117, -1.361791968345642] Feature_2: [0.2693025469779968, 2.41815185546875, -0.49726587533950806, -2.247089147567749, 0.6631515622138977] Feature_3: [-0.24857012927532196, 2.2584683895111084, -1.543166995048523, -2.7039871215820312, 1.346882939338684] Feature_4: [0.4978777766227722, 2.021416187286377, 0.002517223358154297, -1.9776861667633057, -0.059578314423561096] Feature_5: [0.4978777766


This did not work because distilgp2 was not able to generate any meaningful response from the input. 