In [1]:
import os
import tasti
import jsonlines
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from collections import defaultdict

from datasets import Dataset, load_dataset
from PIL import Image
import requests
from transformers import AutoProcessor

from torch.utils.data import DataLoader

from transformers import BlipProcessor, BlipPreTrainedModel, BlipConfig, BlipVisionModel, BlipTextModel
from transformers.models.blip.modeling_blip import BlipImageTextMatchingModelOutput, BlipTextVisionModelOutput

import torch
import torch.nn as nn
from torch.nn.functional import normalize

from typing import Optional, Tuple, Union

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load "./cache/embeddings.npy"
def load_embeddings():
    embeddings = np.load("./cache/embeddings.npy")
    return embeddings

# load "./y_true.npy"
def load_y_true():
    y_true = np.load("./y_true.npy")
    return y_true

embeddings = load_embeddings()
y_true = load_y_true()

In [3]:
print(embeddings.shape)
print(y_true.shape)

(2500, 256)
(2500,)


In [4]:
y_true.sum()

7.0

In [5]:

class BlipForImageTextRetrieval(BlipPreTrainedModel):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig):
        super().__init__(config)

        self.vision_model = BlipVisionModel(config.vision_config)

        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        # vision projection layer
        self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size)

        # text projection layer
        self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size)

        # image text matching head
        self.itm_head = nn.Linear(config.text_config.hidden_size, 2)

        self.decoder_pad_token_id = (
            config.text_config.pad_token_id
            if not hasattr(config, "decoder_pad_token_id")
            else config.decoder_pad_token_id
        )
        self.decoder_start_token_id = (
            config.text_config.bos_token_id
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        use_itm_head: Optional[bool] = True,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BlipTextVisionModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForImageTextRetrieval

        >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "an image of a cat"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[0]
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)

        if use_itm_head:
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=return_dict,
            )
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

            output = self.itm_head(question_embeds[:, 0, :])
        else:
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=return_dict,
            )
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

            image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1)

            output = image_feat @ text_feat.t()

        if not return_dict:
            outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
            return tuple(output for output in outputs if output is not None)

        return BlipImageTextMatchingModelOutput(
            itm_score=output,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
            question_embeds=question_embeds,
        )


In [6]:
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

In [7]:
attributes_dataset = load_dataset("visual_genome", "attributes_v1.2.0", split='train')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [8]:
image = attributes_dataset[1732]['image']
# image = Image.open(path)""
text = "The image is taken in an office"

inputs = processor(images=image, text=text, return_tensors="pt").to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
inputs

{'pixel_values': tensor([[[[-0.1864, -0.0696, -0.0550,  ..., -0.8142, -0.4346, -0.4492],
          [-0.0113,  0.1055,  0.1201,  ..., -0.6244, -0.2302, -0.2448],
          [-0.0550,  0.0763,  0.0909,  ..., -0.6536, -0.2594, -0.2740],
          ...,
          [-0.6244, -0.5222, -0.5076,  ...,  0.3245,  0.3537,  0.3829],
          [-0.6682, -0.6536, -0.7558,  ...,  0.2807,  0.3537,  0.2953],
          [-1.0185, -1.0331, -1.0915,  ...,  0.0179,  0.2515,  0.2807]],

         [[-0.4614, -0.3414, -0.3414,  ..., -0.4614,  0.0038,  0.0038],
          [-0.2813, -0.1613, -0.1613,  ..., -0.2663,  0.1989,  0.1989],
          [-0.3264, -0.2063, -0.2213,  ..., -0.2963,  0.1689,  0.1689],
          ...,
          [-0.8066, -0.6715, -0.6415,  ..., -0.2213, -0.2063, -0.2213],
          [-0.8366, -0.8216, -0.8666,  ..., -0.2813, -0.2213, -0.3114],
          [-1.1968, -1.1968, -1.2268,  ..., -0.5665, -0.3114, -0.3114]],

         [[-0.3995, -0.2857, -0.2857,  ..., -0.0298,  0.5675,  0.7381],
          [-0

In [19]:
question_embeds = model.text_encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], return_dict=True)[0]
question_embeds.shape

torch.Size([1, 9, 768])

In [20]:
text_feat = normalize(model.text_proj(question_embeds[:, 0, :]), dim=-1)
text_feat.shape

torch.Size([1, 256])

In [22]:
embeddings_torch = torch.from_numpy(embeddings).to(device)
image_feat = normalize(model.vision_proj(embeddings_torch), dim=-1)
image_feat.shape

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)