 # Visual Question Answering with BLIP-2
 Ashok Kumar Pant

## Model setup

In [None]:
from urllib.request import urlopen

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Single source of truth for the checkpoint so the processor and the model cannot drift apart.
MODEL_ID = "Salesforce/blip2-flan-t5-xl"

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the weights in half precision on GPU: it halves memory use and agrees with
# the float16 inputs the generation cells feed the model there. On CPU stay in
# float32, where half-precision kernels are slow or missing.
model_dtype = torch.float16 if device == "cuda" else torch.float32

blip_processor = Blip2Processor.from_pretrained(MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=model_dtype)
model = model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/128k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.81G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/repos/.../model-00002-of-00002.safetensors?<signed query parameters redacted>

model-00001-of-00002.safetensors:  28%|##7       | 2.77G/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:  51%|#####     | 2.94G/5.81G [00:00<?, ?B/s]

## Load and preprocess an image

In [None]:
car_path = "https://www.detailingdevils.com/uploads/blogs/Lamborghini-Revuelto.webp"  # Replace with another image URL or a local file path

# Image.open() is lazy, so convert("RGB") (which forces a full decode) has to run
# while the underlying stream is still open; the context managers then release it.
if car_path.startswith(("http://", "https://")):
    with urlopen(car_path, timeout=30) as response:
        image = Image.open(response).convert("RGB")
else:
    # Plain filesystem path: urlopen() would reject it, so open it directly.
    with Image.open(car_path) as local_image:
        image = local_image.convert("RGB")

## Visual Question Answering (VQA)

In [None]:
prompt = "Question: Write down what you see in this picture. Answer:"

# Cast the floating-point inputs (pixel values) to whatever precision the model
# was actually loaded in; a hard-coded float16 fails against float32 weights.
inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)
generated_ids = model.generate(**inputs, max_new_tokens=30)
generated_text = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(generated_text)

## Chat-like follow-up prompting

In [None]:
# The previous question/answer pair is replayed in the prompt so the model can
# resolve "that car" in the follow-up question.
prompt = (
    "Question: Write down what you see in this picture. Answer: A sports car driving on the road at sunset. "
    "Question: What would it cost me to drive that car? Answer:"
)

# Cast the floating-point inputs (pixel values) to whatever precision the model
# was actually loaded in; a hard-coded float16 fails against float32 weights.
inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)
generated_ids = model.generate(**inputs, max_new_tokens=30)
generated_text = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(generated_text)


## Interactive Chatbot using Gradio (for Jupyter notebooks)


In [1]:
import gradio as gr
import torch

# Conversation so far as (question, answer) tuples; module-level, so it is shared
# by every call to qa_with_memory for the lifetime of the kernel.
memory = []


def _render_history(turns):
    """Render (question, answer) turns as an HTML transcript with both sides escaped."""
    import html  # local import keeps this block self-contained

    return "".join(
        f"<b>USER:</b> {html.escape(q)}<br><b>BLIP-2:</b> {html.escape(a)}<br><br>"
        for q, a in turns
    )


def qa_with_memory(user_input):
    """Answer a question about the global ``image``, conditioning on earlier turns.

    Args:
        user_input: Text typed by the user. Surrounding whitespace is ignored.

    Returns:
        A ``(history_html, textbox_value)`` tuple: the full transcript rendered as
        HTML, and an empty string used to clear the input box.

    Blank input leaves ``memory`` untouched and returns the existing transcript,
    so an accidental empty submit does not wipe the displayed conversation.
    """
    question = user_input.strip()
    if not question:
        return _render_history(memory), ""

    # Replay earlier turns ahead of the new question; joining a single list avoids
    # the stray leading space the prompt would otherwise get on the first turn.
    past_turns = [f"Question: {q} Answer: {a}." for q, a in memory]
    prompt = " ".join(past_turns + [f"Question: {question} Answer:"])

    # Cast floating-point inputs to the model's own dtype rather than a fixed
    # float16, which fails when the weights are float32.
    inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    decoded = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    # Keep only the text before any "Question" the model starts writing itself.
    answer = decoded.split("Question")[0].strip()

    memory.append((question, answer))

    return _render_history(memory), ""


# Minimal chat UI: an HTML pane for the transcript with a single-line input box below it.
with gr.Blocks() as demo:
    chatbot = gr.HTML()
    user_input = gr.Textbox(placeholder="Ask something about the image...")

    # On submit, the textbox value is passed to qa_with_memory; its two return
    # values are written to the transcript pane and back into the textbox.
    user_input.submit(
        fn=qa_with_memory,
        inputs=user_input,
        outputs=[chatbot, user_input],
    )

demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/gradio/blocks.py", line 2218, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/gradio/blocks.py", line 1729, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packag