## Multimodal LLMs: LLaVA-1.5
Ashok Kumar Pant


In [1]:
import torch
# Install dependencies (uncomment if running on a fresh notebook)
# !pip install transformers accelerate bitsandbytes torch torchvision --quiet
# !pip install git+https://github.com/haotian-liu/LLaVA.git --quiet
# !pip install peft safetensors --quiet
from transformers import AutoProcessor, LlavaForConditionalGeneration

In [31]:
# Load model and processor
model_name = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = LlavaForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


In [29]:
from traceback import print_exc
import requests
from io import BytesIO
from PIL import Image


def generate_response(image_url, prompt):
    try:
        # Load and convert the image
        response = requests.get(image_url)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        # image = image.resize((224, 224))

        # Process input
        inputs = processor(prompt, image, return_tensors="pt").to(device)

        # Generate model output
        output = model.generate(**inputs, max_new_tokens=100)

        # Decode and clean up output
        response = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
        for token in ["<|start_of_text|>", "<|end_of_text|>", "<|user|>", "<|assistant|>"]:
            response = response.replace(token, "").strip()

        return response
    except requests.exceptions.RequestException as e:
        return f"Error fetching image: {e}"
    except Exception as e:
        print_exc()
        return f"Error generating response: {e}"


In [30]:
# --------------------------------------
# Image Captioning
# --------------------------------------
caption_prompt = "Describe this image in detail."
image_url = "https://www.detailingdevils.com/uploads/blogs/Lamborghini-Revuelto.webp"
caption = generate_response(image_url, caption_prompt)
print("Image Caption:\n", caption)

Image Caption:
 Error generating response: Image features and image tokens do not match: tokens: 0, features 576


Traceback (most recent call last):
  File "/var/folders/5_/jdfvrxgj605gsqb_0h_dw6kr0000gn/T/ipykernel_63162/2764706840.py", line 19, in generate_response
    output = model.generate(**inputs, max_new_tokens=100)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/transformers/generation/utils.py", line 2597, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/transformers/generation/utils.py", line 3557, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ashokpant/miniconda3/envs/ml/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739

In [8]:
# --------------------------------------
# Visual Question Answering
# --------------------------------------
vqa_prompt = "What is the man doing in the image?"
vqa_answer = generate_response(image_url, vqa_prompt)
print("VQA Answer:\n", vqa_answer)

ValueError: Image features and image tokens do not match: tokens: 0, features 576

In [9]:
# --------------------------------------
# Multimodal Dialogue
# --------------------------------------
dialogue_prompt = """USER: What are the people doing in the image?
ASSISTANT: They are sitting at a table having food.

USER: What kind of food do you see?
ASSISTANT:"""

multimodal_response = generate_response(image_url, dialogue_prompt, chat_mode=True)
print("Multimodal Dialogue Response:\n", multimodal_response)

ValueError: Image features and image tokens do not match: tokens: 0, features 576