In [1]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, AutoConfig
import torch

# Single source of truth for the checkpoint. Previously the config was fetched
# from "Qwen/Qwen3-VL-8B-Thinking" while the weights and processor came from
# "Qwen/Qwen3-VL-8B-Instruct", and the patched config was never handed to the
# model, so the patch had no effect on what was loaded.
model_name = "Qwen/Qwen3-VL-8B-Instruct"

# Value written into the config when a `pad_token_id` attribute is missing.
# NOTE(review): 0 is carried over from the original cell; confirm it against
# processor.tokenizer.pad_token_id before relying on it for batched padding.
FALLBACK_PAD_TOKEN_ID = 0

# 1. Load the config first so missing fields can be filled in before the model
#    is constructed. `trust_remote_code` is not passed: the model class is
#    imported directly from transformers, and the model/processor loads below
#    never needed it.
config = AutoConfig.from_pretrained(model_name)

# 2. Add `pad_token_id` only where the attribute is absent (text sub-config
#    first, then the top-level config); an existing value is left untouched.
for cfg in (config.text_config, config):
    if not hasattr(cfg, "pad_token_id"):
        cfg.pad_token_id = FALLBACK_PAD_TOKEN_ID

# 3. Load the model with the patched config.
# We recommend enabling flash_attention_2 for better acceleration and memory
# saving, especially in multi-image and video scenarios.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_name,
    config=config,
    dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)

# The processor must come from the same checkpoint as the weights.
processor = AutoProcessor.from_pretrained(model_name)

Loading weights:   0%|          | 0/750 [00:00<?, ?it/s]

In [2]:
# Single-turn user request: one screenshot plus the instruction that goes with it.
screenshot_part = {
    "type": "image",
    "image": "scratch_ss.png",
}
instruction_part = {"type": "text", "text": "Give HTML and CSS for the reference UI."}
messages = [{"role": "user", "content": [screenshot_part, instruction_part]}]

# Render the chat template, tokenize, and move the resulting tensors onto the
# model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Run generation.
generated_ids = model.generate(**inputs, max_new_tokens=4096)

# Drop the prompt prefix from each returned sequence so only the tokens that
# follow it are decoded. `input_ids` is a single 2-D tensor, so every row has
# the same length and it can be computed once.
prompt_len = inputs.input_ids.shape[1]
generated_ids_trimmed = [sequence[prompt_len:] for sequence in generated_ids]

output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text[0])

Passing `generation_config` together with generation-related arguments=({'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.


```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>User Profile</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 900px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f9f9f9;
        }
        
        h1, h2, h3 {
            color: #2c3e50;
        }
        
        .section {
            margin-bottom: 30px;
        }
        
        .section h2 {
            margin-top: 0;
            padding-bottom: 10px;
            border-bottom: 1px solid #e0e0e0;
        }
        
        .badge-container {
            display: flex;
            flex-wrap: wrap;
            gap: 20px;
            margin-top: 20px;
        }
        
        .badge-card {
            flex: 1 1 300px;
            background: #f