# Visual Question Answering

In [None]:
import requests
from PIL import Image
import matplotlib.pyplot as plt
from transformers import BlipProcessor, BlipForQuestionAnswering

# End-to-end BLIP visual question answering demo: download one image, ask one
# question about it, then display the image with the model's answer.

# -----------------------------
# 1) Config
# -----------------------------
img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"

# You can change this question as needed
question = "What is the woman doing?"

# BLIP VQA model (use base or large; base is usually faster)
model_name = "Salesforce/blip-vqa-base"
# model_name = "Salesforce/blip-vqa-capfilt-large"  # optional alternative if you want a larger variant

# Seconds to wait on the image server; without a timeout, requests blocks
# forever when the server stalls.
request_timeout = 30

# -----------------------------
# 2) Load image from URL
# -----------------------------
# The context manager releases the streamed connection even if decoding fails.
with requests.get(img_url, stream=True, timeout=request_timeout) as response:
    # Fail with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    # The raw urllib3 stream skips Content-Encoding (gzip/deflate) handling
    # unless explicitly asked to decode.
    response.raw.decode_content = True
    # convert() forces the image to be fully decoded while the stream is open.
    raw_image = Image.open(response.raw).convert("RGB")

# -----------------------------
# 3) Load processor + VQA model
# -----------------------------
# Processor and model are loaded from the same checkpoint name so that the
# preprocessing matches the weights.
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name)

# -----------------------------
# 4) Run VQA
# -----------------------------
# Pack the image and the question into PyTorch tensors for the model.
inputs = processor(raw_image, question, return_tensors="pt")
# max_new_tokens bounds the length of the generated answer.
output_ids = model.generate(**inputs, max_new_tokens=20)
# Only one image/question pair was submitted, so decode the first sequence.
answer = processor.decode(output_ids[0], skip_special_tokens=True)

# -----------------------------
# 5) Show image + result (inline)
# -----------------------------
plt.figure()
plt.imshow(raw_image)
plt.axis("off")
plt.title(f"Q: {question}\nA: {answer}")
plt.show()

# Also emit the result as plain text so it is available without the figure.
print("Question:", question)
print("Answer:", answer)
