 # Visual Question Answering with BLIP-2
 Ashok Kumar Pant

## Model setup

In [1]:
from urllib.request import urlopen

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
model = model.to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors: 100%|#########9| 9.92G/9.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

## Load and preprocess an image

In [5]:
# image_path = "https://www.detailingdevils.com/uploads/blogs/Lamborghini-Revuelto.webp"
# image = Image.open(urlopen(image_path)).convert("RGB")

image_path = "/Users/ashokpant/Projects/treeleaf/smartid-processor/data/np_nationalid_data/Driver Licence/1DL.png"
image = Image.open(image_path).convert("RGB")

## Visual Question Answering (VQA)

In [8]:
prompt = "Question: Write down what you see in this picture. Answer:"
inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=30)
generated_text = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(generated_text)

yes


## Chat-like follow-up prompting

In [9]:
prompt = (
    "Question: Write down what you see in this picture. Answer: A sports car driving on the road at sunset. "
    "Question: What would it cost me to drive that car? Answer:"
)
inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=30)
generated_text = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

print(generated_text)


it would cost me a lot of money


## Interactive Chatbot using gradio (for Jupyter notebooks)


In [10]:
import gradio as gr
import torch

memory = []


def qa_with_memory(user_input):
    if user_input.strip() == "":
        return "", ""

    # Create the prompt from memory
    prompt = " ".join(
        [f"Question: {q} Answer: {a}." for q, a in memory]
    ) + f" Question: {user_input} Answer:"

    # Process inputs
    inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=100)
    answer = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip().split("Question")[
        0].strip()

    # Store to memory
    memory.append((user_input, answer))

    # Build output history
    history_html = ""
    for q, a in memory:
        history_html += f"<b>USER:</b> {q}<br><b>BLIP-2:</b> {a}<br><br>"

    return history_html, ""


with gr.Blocks() as demo:
    chatbot = gr.HTML()
    user_input = gr.Textbox(placeholder="Ask something about the image...")

    user_input.submit(qa_with_memory, inputs=user_input, outputs=[chatbot, user_input])

demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


