In [None]:
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

### Load the Model

In [None]:
# Hugging Face Hub checkpoint shared by the model and its processor.
model_id = "aws-prototyping/long-llava-qwen2-7b"

# Load the weights in bfloat16, with the low-CPU-memory loading path enabled.
load_options = {"dtype": torch.bfloat16, "low_cpu_mem_usage": True}
model = LlavaForConditionalGeneration.from_pretrained(model_id, **load_options)

# Place the loaded model on device index 0.
model = model.to(0)

# Processor published with the same checkpoint; request the fast implementation.
processor = AutoProcessor.from_pretrained(model_id, use_fast=True)

In [None]:
# Processor settings set explicitly on the loaded processor.
# NOTE(review): these presumably have to agree with the checkpoint's vision
# tower / model config -- confirm against the model card.
processor_overrides = {
    "patch_size": 14,
    "num_additional_image_tokens": 1,
    "vision_feature_select_strategy": "default",
}
for option_name, option_value in processor_overrides.items():
    setattr(processor, option_name, option_value)

### Create Chat

In [None]:
# Define the chat history; `apply_chat_template` later turns it into a
# correctly formatted prompt.
# Every "content" value must be a list of dicts whose "type" is "text" or "image".

# System turn: sets the assistant's behaviour.
system_turn = {
    "role": "system",
    "content": [
        {
            "type": "text",
            "text": "You are a helpful Assistant that answers questions about images.",
        },
    ],
}

# User turn: the question followed by a placeholder for the image.
user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What are these?"},
        {"type": "image"},
    ],
}

conversation = [system_turn, user_turn]

In [None]:
# === Render the conversation with the model's chat template ===
# add_generation_prompt=True asks the template to end with the marker that
# opens the assistant's reply.
full_prompt = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
)

# === Load and preprocess the image ===
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to
# PIL. The `with` block closes the streamed connection once the image is read.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # convert() produces the RGB image while the stream is still open.
    image = Image.open(response.raw).convert("RGB")

# === Prepare model inputs ===
model_inputs = processor(
    images=image,
    text=full_prompt,
    return_tensors="pt",
)

# Move everything to the model's device, and cast floating-point tensors
# to the model's dtype. Integer tensors keep their dtype and are only moved.
# Reading the device from the model keeps the inputs co-located with it
# instead of repeating a hard-coded device string.
device = model.device
inputs = {
    name: (
        tensor.to(device=device, dtype=model.dtype)
        if torch.is_floating_point(tensor)
        else tensor.to(device)
    )
    for name, tensor in model_inputs.items()
}

### Generate

In [None]:
# Greedy decoding (do_sample=False) of at most 200 new tokens, with autograd
# disabled via inference mode.
with torch.inference_mode():
    generated = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=False,
    )

# === Decode output text ===
# Full transcript (prompt + reply) with special tokens removed.
decoded = processor.decode(
    generated[0],
    skip_special_tokens=True,
)

# The generated sequence starts with the prompt tokens, so drop them by length
# and decode only the reply. Splitting the transcript on the word "assistant"
# would truncate any reply that contains that word, and would raise IndexError
# if the marker were missing from the decoded text.
prompt_length = inputs["input_ids"].shape[1]
answer = processor.decode(
    generated[0][prompt_length:],
    skip_special_tokens=True,
)

print(answer.strip())