# 🚘 BLIP-2 Vision-Language VQA Demo for Self-Driving Cars
This notebook demonstrates how to use BLIP-2 to answer natural-language questions about road-scene images. It is intended as a starting point for developing interpretable vision-language agents for autonomous vehicles.

In [None]:
# 📦 Install Dependencies
!pip install torch torchvision transformers
!pip install git+https://github.com/salesforce/BLIP.git

In [None]:
# 🧠 Load BLIP-2 Model and Processor
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor prepares image + text inputs and decodes generated token ids.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load the weights in half precision on GPU and in full precision on CPU,
# then move the model onto the selected device.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    torch_dtype=torch.float32 if device == "cpu" else torch.float16,
)
model = model.to(device)

In [None]:
# 🖼️ Load Image
# Replace 'road_scene.jpg' with your own image
# The context manager closes the underlying file as soon as convert() has produced
# an in-memory RGB copy, instead of leaving the handle open until garbage collection.
with Image.open("road_scene.jpg") as image_file:
    image = image_file.convert("RGB")

In [None]:
# ❓ Ask Driving-Relevant Questions
questions = [
    "Is there a stop sign?",
    "Are any pedestrians crossing the street?",
    "What color is the traffic light?",
    "What is the vehicle in front doing?",
    "Is it safe to turn left?"
]

# Loop-invariant: floating-point inputs use half precision on GPU, full precision on CPU.
input_dtype = torch.float16 if device == "cuda" else torch.float32

for question in questions:
    # The Hugging Face BLIP-2 documentation prompts the OPT checkpoints for VQA
    # with the "Question: ... Answer:" template rather than the bare question.
    prompt = f"Question: {question} Answer:"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(
        device, input_dtype
    )
    output = model.generate(**inputs, max_new_tokens=50)
    # Decode the first returned sequence and trim surrounding whitespace.
    answer = processor.decode(output[0], skip_special_tokens=True).strip()
    print(f"Q: {question}\nA: {answer}\n")

In [None]:
# 🧠 Bonus: Describe the Scene
# Choose the input precision: half on GPU, full on CPU.
if device == "cuda":
    inputs_dtype = torch.float16
else:
    inputs_dtype = torch.float32

# Encode the image with a free-form prompt and move the tensors to the target device.
encoded = processor(images=image, text="Describe the scene.", return_tensors="pt")
inputs = encoded.to(device, inputs_dtype)

# Generate up to 50 new tokens, then decode the first returned sequence to text.
output = model.generate(**inputs, max_new_tokens=50)
description = processor.decode(output[0], skip_special_tokens=True)
print("Scene Description:", description)