In [4]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
)


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.47s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [6]:
# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [62]:
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "images/bankStatement.jpg",
            },
            {"type": "text", "text": (
                "From this bank statement image, extract all transactions as JSON with the keys: "
                "date, description, money_in, money_out, balance. Numbers as float/int without currency symbols."
            )},
        ],
    }
]

In [63]:
# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [64]:
image_inputs, video_inputs = process_vision_info(messages)

In [65]:
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

In [66]:
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

print(output_text)

['```json\n[\n  {\n    "date": "14 Oct 20",\n    "description": "BALANCE BROUGHT FORWARD",\n    "money_in": 9000,\n    "money_out": 0.57,\n    "balance": 0.57\n  },\n  {\n    "date": "15 Oct 20",\n    "description": "construction",\n    "money_in": 0,\n    "money_out": 1225,\n    "balance": 5875.57\n  },\n  {\n    "date": "16 Oct']
