In [1]:
# Notebook-level placeholders, created as two independent empty lists.
# `targets` is the name the training cell later hands to the loss function;
# `image_array` is not referenced again in the visible cells (the loader
# populates `image_arrays`, plural) -- NOTE(review): confirm it is still needed.
image_array, targets = [], []

In [2]:
# get the images
import numpy as np
from PIL import Image
import os, json, torch

# Pick data locations based on the runtime. Only the *import* is used as the
# Colab probe: a bare `except:` here would also swallow Drive mount failures
# (and KeyboardInterrupt) and silently fall back to the local paths.
try:
    from google.colab import drive
except ImportError:
    # `google.colab` is not importable -> treat as a local run.
    IN_COLAB = False
    print("Not running in Google Colab")
    imgs_directory_path = './paintings'
    pretraining_metadata = './wikiart_metadata_with_pretraining_groundtruth.json'
else:
    IN_COLAB = True
    print("Running in Google Colab")
    mount_path = '/content/drive'
    # Mount only if not already mounted; errors raised by the mount now propagate.
    if not os.path.exists(mount_path):
        drive.mount(mount_path)
    imgs_directory_path = '/content/drive/MyDrive/ArtEmbed'
    pretraining_metadata = '/content/drive/MyDrive/ArtEmbed/wikiart_metadata_with_pretraining_groundtruth.json'



def load_image_from_drive():
    """Load every JPEG/PNG file in ``imgs_directory_path`` as an RGB PIL image.

    Directory entries are visited in sorted filename order, and only names
    ending in .jpg/.jpeg/.png (case-insensitive) are opened. The number of
    images and the list of their filenames are printed.

    Returns:
        list: the RGB-converted images, in sorted filename order. The
        filenames themselves are printed but not returned.
    """
    image_arrays = []
    image_names = []

    for file_name in sorted(os.listdir(imgs_directory_path)):
        if not file_name.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        path = os.path.join(imgs_directory_path, file_name)
        # Image.open() is lazy and holds the file handle; convert() returns a
        # separate in-memory copy, so the handle can be released right away.
        with Image.open(path) as img:
            image_arrays.append(img.convert("RGB"))
        image_names.append(file_name)

    print(f"Found {len(image_arrays)} images.")
    print(image_names)
    return image_arrays

def load_pretraining_metadata():
    """Parse the JSON file located at the module-level ``pretraining_metadata`` path.

    Prints the top-level keys of the parsed object and how many entries it
    holds, then returns the parsed object.
    """
    with open(pretraining_metadata, 'r', encoding="utf-8") as metadata_file:
        parsed = json.loads(metadata_file.read())

    print(parsed.keys())
    painting_count = len(parsed)
    print(f"Found metadata for {painting_count} paintings.")
    return parsed


# Load the RGB images from the configured image directory.
image_arrays = load_image_from_drive()
# NOTE: this rebinds `pretraining_metadata` from the JSON file *path* (assigned
# earlier in this cell) to the *parsed JSON object*. load_pretraining_metadata()
# reads that same global as the path, so calling it again without first
# re-running the path assignment would hand the parsed object to open().
pretraining_metadata = load_pretraining_metadata()




Not running in Google Colab
Found 5 images.
['000001_paul-bril—a-wooded-landscape-with-a-bridge-and-sportsmen-at-the-edge-of-the-river-1590.jpg', '000002_paul-bril—jesus-walking-on-the-sea-of-galilee-1590.jpg', '000003_paul-bril—landscape-with-st-jerome-and-rocky-crag.jpg', '000004_paul-bril—landscape-with-a-marsh-1595.jpg', '000005_paul-bril—landscape-with-stag-hunt-1595.jpg']
dict_keys(['000001', '000002', '000003', '000004', '000005', '000006', '000007', '000008', '000009', '000010', '000011', '000012', '000013', '000014', '000015', '000016', '000017', '000018', '000019', '000020', '000021', '000022', '000023', '000024', '000025', '000026', '000027', '000028', '000029', '000030', '000031', '000032', '000033', '000034', '000035', '000036', '000037', '000038', '000039', '000040', '000041', '000042', '000043', '000044', '000045', '000046', '000047', '000048', '000049', '000050', '000051', '000052', '000053', '000054', '000055', '000056', '000057', '000058', '000059', '000060', '000061'

In [3]:
# Install dependencies
# !pip install transformers accelerate sentencepiece

# --- Import libraries ---
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import requests
import torch

# --- Load BLIP-2 model and processor ---
# `device` is only computed here; nothing in this cell moves the model or the
# tensors onto it.
model_name = "Salesforce/blip2-flan-t5-xl"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Blip2Processor.from_pretrained(model_name)
blip2 = Blip2ForConditionalGeneration.from_pretrained(model_name)

# Preprocess every image on its own, then join the per-image batches along
# the batch axis.
pixel_values_list = [
    processor(images=picture, return_tensors="pt").pixel_values
    for picture in image_arrays
]
pixel_values = torch.cat(pixel_values_list)  # dim 0 -> shape: [N, 3, H, W]

# Freeze vision encoder to save memory; we are not training the vision encoder
blip2.vision_model.requires_grad_(False)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:  37%|###7      | 2.15G/5.81G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:  21%|##        | 2.06G/9.96G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
from torch import nn
class BLIP2Regression(nn.Module):
    """MLP regression head on top of a BLIP-2 vision encoder and Q-Former.

    The wrapped model's vision encoder runs under ``torch.no_grad()``; its
    output is fed to the Q-Former together with the model's learned query
    tokens. The Q-Former's last hidden state is flattened and passed through
    a small MLP, and the result is squashed into (0, 1) with a sigmoid.

    Args:
        blip2_model: object exposing ``config.num_query_tokens``,
            ``config.qformer_config.hidden_size``, ``vision_model``,
            ``qformer`` and ``query_tokens`` (as the BLIP-2 model in this
            notebook does).
        output_dim: width of the regression output vector.
        train_qformer: accepted for interface compatibility but currently
            ignored -- the loop that would apply it is commented out below,
            so the Q-Former's ``requires_grad`` flags are left as they are.
        verbose: when True (default, the historical behaviour) shape/value
            debug lines are printed on construction and on every forward
            pass; pass False to silence them.
    """

    def __init__(self, blip2_model, output_dim=16, train_qformer=False, verbose=True):
        super().__init__()
        self.blip2 = blip2_model
        self.verbose = verbose
        # # Optionally train Q-Former
        # for param in self.blip2.qformer.parameters():
        #     param.requires_grad = train_qformer

        # Read the sizes from the model that was passed in. Reading them from
        # a notebook-level `blip2` global breaks (or silently uses the wrong
        # sizes) whenever a different model object is supplied.
        config = self.blip2.config
        num_query_tokens = config.num_query_tokens
        hidden_size = config.qformer_config.hidden_size
        self._log("Num query tokens: {}", num_query_tokens)
        self._log("Hidden size: {}", hidden_size)

        # Input is the flattened Q-Former output: one hidden vector per query
        # token (32 * 768 = 24576 for the checkpoint used in this notebook).
        self.regressor = nn.Sequential(
            nn.Linear(num_query_tokens * hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim)
        )
        # Author's notes (presumably ideas for a larger head -- TODO confirm):
        # start 1024
        # three hidden layers

    def _log(self, template, *values):
        """Print a debug line when ``verbose`` is set.

        Formatting is deferred so quiet runs skip tensor-to-string work.
        """
        if self.verbose:
            print(template.format(*values))

    def forward(self, images):
        """Map a batch of preprocessed images to sigmoid-normalised outputs.

        Args:
            images: pixel tensor whose first dimension is the batch size
                (observed as [B, 3, 224, 224] in this notebook).

        Returns:
            Tensor of shape [B, output_dim] with values in (0, 1).
        """
        self._log("\n=== Forward Pass Debug ===")
        self._log("Input images shape: {}", images.shape)

        # Get vision features; no_grad keeps the vision encoder out of the
        # autograd graph.
        with torch.no_grad():
            vision_outputs = self.blip2.vision_model(pixel_values=images)
            image_embeds = vision_outputs.last_hidden_state
            # Image embeds shape: torch.Size([1, 257, 1408])

        self._log("Image embeds shape: {}", image_embeds.shape)

        device = images.device
        # Broadcast the learned query tokens across the batch dimension.
        query_tokens = self.blip2.query_tokens.expand(images.shape[0], -1, -1)
        query_tokens = query_tokens.to(device)
        # Query tokens shape: torch.Size([1, 32, 768])

        self._log("Query tokens shape: {}", query_tokens.shape)

        # All image positions are attended to; build the mask directly on the
        # target device instead of creating it on CPU and copying.
        image_attention_mask = torch.ones(
            image_embeds.shape[:-1], dtype=torch.long, device=device
        )

        self._log("Image attention mask shape: {}", image_attention_mask.shape)
        # Image attention mask shape: torch.Size([1, 257])

        query_outputs = self.blip2.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )

        self._log("Query outputs keys: {}", query_outputs.keys())
        # Query outputs keys: odict_keys(['last_hidden_state', 'pooler_output'])

        query_hidden_states = query_outputs.last_hidden_state
        self._log("Query hidden states shape: {}", query_hidden_states.shape)
        # Query hidden states shape: torch.Size([1, 32, 768])

        # Collapse (query tokens, hidden) into one feature vector per image.
        flattened = query_hidden_states.flatten(start_dim=1)
        self._log("Flattened shape: {}", flattened.shape)
        # Flattened shape: torch.Size([1, 24576])

        output = self.regressor(flattened)
        self._log("Final output shape: {}", output.shape)
        self._log("Output values: {}", output)
        # Final output shape: torch.Size([1, 2])

        # Squash the raw regression outputs into (0, 1).
        normalized = torch.sigmoid(output)
        self._log("Normalized output values: {}", normalized)

        return normalized

# Width of the regression output vector used for the rest of the notebook.
output_dim = 2
# Wrap the already-loaded BLIP-2 model with the regression head.
# NOTE: `train_qformer=False` is passed, but the class's code that would act on
# it is commented out, so it has no effect on which parameters are trainable.
regression_model = BLIP2Regression(blip2, output_dim=output_dim, train_qformer=False)


# Smoke test on the first image; the 0:1 slice keeps the batch dimension.
# NOTE(review): no .eval() / torch.no_grad() is used here, so (assuming the
# module's default training mode) Dropout is active and the output carries a
# grad_fn -- fine for a shape check, not for a reported prediction.
single_output = regression_model(pixel_values[0:1])
print("Predicted vector for first image:", single_output)


Num query tokens: 32
Hidden size: 768

=== Forward Pass Debug ===
Input images shape: torch.Size([1, 3, 224, 224])
Image embeds shape: torch.Size([1, 257, 1408])
Query tokens shape: torch.Size([1, 32, 768])
Image attention mask shape: torch.Size([1, 257])
Query outputs keys: odict_keys(['last_hidden_state', 'pooler_output'])
Query hidden states shape: torch.Size([1, 32, 768])
Flattened shape: torch.Size([1, 24576])
Final output shape: torch.Size([1, 2])
Output values: tensor([[ 0.1375, -0.1580]], grad_fn=<AddmmBackward0>)
Normalized output values: tensor([[0.5343, 0.4606]], grad_fn=<SigmoidBackward0>)
Predicted vector for first image: tensor([[0.5343, 0.4606]], grad_fn=<SigmoidBackward0>)


In [None]:
def train_step(model, pixel_values, targets, optimizer, criterion):
    """Run one optimisation step on a single batch.

    Puts ``model`` in training mode, clears stale gradients, computes
    ``criterion(model(pixel_values), targets)``, back-propagates and applies
    one ``optimizer`` update. The scalar loss is printed.

    Returns:
        tuple: ``(loss as a Python float, predictions tensor)``.
    """
    model.train()
    optimizer.zero_grad()

    batch_predictions = model(pixel_values)
    batch_loss = criterion(batch_predictions, targets)
    batch_loss.backward()
    optimizer.step()

    # Read the scalar once and reuse it for both the log line and the return.
    loss_value = batch_loss.item()
    print(f"Loss: {loss_value}")
    return loss_value, batch_predictions

def test_step(model, pixel_values, targets, criterion):
    model.eval()
    with torch.no_grad():
        predictions = model(pixel_values)
        loss = criterion(predictions, targets)
    return loss.item(), predictions


In [None]:
from torch import nn, optim
# Start from a freshly initialised regression head each time this cell runs.
regression_model = BLIP2Regression(blip2, output_dim=output_dim, train_qformer=False)
criterion = nn.MSELoss()
optimizer = optim.Adam(regression_model.parameters(), lr=1e-4)

# MSELoss needs a float tensor shaped exactly like the predictions. `targets`
# starts life as an empty list in the first cell, so convert and validate here
# to fail with a clear message instead of deep inside the loss computation.
# A separate name is used so the notebook-level `targets` is not rebound.
target_tensor = torch.as_tensor(targets, dtype=torch.float32, device=pixel_values.device)
expected_shape = (pixel_values.shape[0], output_dim)
if tuple(target_tensor.shape) != expected_shape:
    raise ValueError(
        f"targets must have shape {expected_shape} (one row per image), "
        f"got {tuple(target_tensor.shape)}"
    )

num_epochs = 5
regression_model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = regression_model(pixel_values)
    loss = criterion(outputs, target_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}")


# Report the prediction in eval mode (dropout off) and without building an
# autograd graph, so the printed vector is deterministic for fixed weights.
regression_model.eval()
with torch.no_grad():
    single_output = regression_model(pixel_values[0:1])
print("Predicted vector for first image:", single_output)

Num query tokens: 32
Hidden size: 768

=== Forward Pass Debug ===
Input images shape: torch.Size([2, 3, 224, 224])
Image embeds shape: torch.Size([2, 257, 1408])
Query tokens shape: torch.Size([2, 32, 768])
Image attention mask shape: torch.Size([2, 257])
Query outputs keys: odict_keys(['last_hidden_state', 'pooler_output'])
Query hidden states shape: torch.Size([2, 32, 768])
Flattened shape: torch.Size([2, 24576])
Final output shape: torch.Size([2, 2])
Output values: tensor([[ 0.2158, -0.2147],
        [-0.0221,  0.4256]], grad_fn=<AddmmBackward0>)
Normalized output values: tensor([[0.5538, 0.4465],
        [0.4945, 0.6048]], grad_fn=<SigmoidBackward0>)
Epoch 1/5 - Loss: 0.1998

=== Forward Pass Debug ===
Input images shape: torch.Size([2, 3, 224, 224])
Image embeds shape: torch.Size([2, 257, 1408])
Query tokens shape: torch.Size([2, 32, 768])
Image attention mask shape: torch.Size([2, 257])
Query outputs keys: odict_keys(['last_hidden_state', 'pooler_output'])
Query hidden states sha