## Multimodal LLMs: LLaVA-1.5
Ashok Kumar Pant


In [9]:
import torch
# Install dependencies (uncomment if running on a fresh notebook)
# !pip install transformers accelerate bitsandbytes torch torchvision --quiet
# !pip install git+https://github.com/haotian-liu/LLaVA.git --quiet
# !pip install peft safetensors --quiet
from transformers import AutoProcessor, LlavaForConditionalGeneration

In [None]:
# Load the processor and the model weights from the same checkpoint.
model_name = "llava-hf/llava-1.5-7b-hf"  # example repo, adjust to your actual model repo

processor = AutoProcessor.from_pretrained(model_name)

# Request float16 weights with low_cpu_mem_usage enabled, then move the loaded
# model onto the CUDA device as a separate, explicit step.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
model = model.to("cuda")

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

In [None]:
import requests
from io import BytesIO
from PIL import Image
import torch


# Utility function
def generate_response(image_url, prompt, chat_mode=False, max_new_tokens=100):
    r"""Download an image and generate the model's answer to a prompt about it.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_url: HTTP(S) URL of the image to download.
        prompt: In the default mode, a plain instruction or question; it is
            wrapped as ``USER: <image>\n{prompt} ASSISTANT:``. A prompt that
            already contains ``<image>`` is passed through unchanged.
        chat_mode: When True, ``prompt`` is treated as an already formatted
            ``USER: ... ASSISTANT:`` transcript. If it has no ``<image>``
            placeholder, one is inserted into the first ``USER:`` turn (or
            prepended when no ``USER:`` marker is present).
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not echoed back), with surrounding whitespace stripped.

    Raises:
        requests.HTTPError: If the image download returns an error status.
    """
    image_token = "<image>"

    # Fail fast on a bad download instead of handing PIL an error page, and
    # bound the wait so a stalled server cannot hang the notebook.
    http_response = requests.get(image_url, timeout=30)
    http_response.raise_for_status()
    image = Image.open(BytesIO(http_response.content)).convert("RGB")

    # The text must contain the image placeholder; add it (and the
    # USER/ASSISTANT template) whenever the caller has not done so.
    if image_token in prompt:
        full_prompt = prompt
    elif chat_mode:
        head, marker, tail = prompt.partition("USER:")
        if marker:
            full_prompt = f"{head}USER: {image_token}\n{tail.lstrip()}"
        else:
            full_prompt = f"{image_token}\n{prompt}"
    else:
        full_prompt = f"USER: {image_token}\n{prompt} ASSISTANT:"

    # Keyword arguments avoid depending on the processor's positional order.
    inputs = processor(images=image, text=full_prompt, return_tensors="pt")
    # Follow the model's own device/dtype rather than hard-coding them.
    inputs = inputs.to(model.device, model.dtype)

    output = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The output sequence starts with the prompt tokens; decode only what
    # was generated after them.
    prompt_length = inputs["input_ids"].shape[1]
    generated = output[:, prompt_length:]
    return processor.batch_decode(generated, skip_special_tokens=True)[0].strip()


In [None]:
# --------------------------------------
# Image Captioning
# --------------------------------------
# Ask for a free-form description of the image at image_url; later cells
# reuse the same image_url.
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
caption_prompt = "Describe this image in detail."
caption = generate_response(image_url=image_url, prompt=caption_prompt)
print("Image Caption:\n", caption)

In [None]:
# --------------------------------------
# Visual Question Answering
# --------------------------------------
# Ask one targeted question about the image referenced by image_url, which
# must already be defined by an earlier cell.
vqa_prompt = "What is the man doing in the image?"
vqa_answer = generate_response(prompt=vqa_prompt, image_url=image_url)
print("VQA Answer:\n", vqa_answer)

In [None]:
# --------------------------------------
# Multimodal Dialogue
# --------------------------------------
# This transcript is written by hand in USER/ASSISTANT form, so it carries the
# <image> placeholder itself, in the first user turn. Without the placeholder
# the text gives the model no position at which to attach the image.
# image_url must already be defined by an earlier cell.
dialogue_prompt = """USER: <image>
What are the people doing in the image?
ASSISTANT: They are sitting at a table having food.

USER: What kind of food do you see?
ASSISTANT:"""

multimodal_response = generate_response(image_url, dialogue_prompt, chat_mode=True)
print("Multimodal Dialogue Response:\n", multimodal_response)