## Windows omni_env

## 00 Install & Import Libraries

In [None]:
import torch

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from IPython.display import Image, display, Video

## 01 Import Model

In [None]:
# Pick the compute device once: CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# bitsandbytes settings handed to `from_pretrained`: 4-bit loading with the
# 'nf4' quant type, double quantization enabled, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    **{
        'load_in_4bit': True,
        'bnb_4bit_quant_type': 'nf4',
        'bnb_4bit_compute_dtype': torch.float16,
        'bnb_4bit_use_double_quant': True,
    }
)

In [None]:
# Local directory holding the 4-bit Qwen2.5-VL checkpoint and its processor files.
model_path = './00_Model/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit'

# `device_map = 'auto'` already places the weights while loading, so the model
# must NOT be moved again with `.to(...)`: transformers rejects or warns about
# `.to()` on bitsandbytes-quantized / accelerate-dispatched models.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
# The processor bundles the tokenizer and the image/video preprocessing for this checkpoint.
processor = AutoProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
def inference(prompt, image_path, max_new_tokens = 4096):
    """Ask the loaded vision-language model a question about one image.

    Builds a single-turn chat message containing the image and the text
    prompt, runs generation with the module-level `model` and `processor`,
    and streams the answer through a `TextStreamer` while it is generated.

    Args:
        prompt: Text instruction / question sent together with the image.
        image_path: Image reference stored under the message's 'image' key
            and resolved by `process_vision_info`.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 4096, the value previously hard-coded).

    Returns:
        The decoded answer for the single request, with the prompt tokens
        stripped and special tokens skipped.
    """
    messages = [
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    # Pixel budget hints passed along with the image entry.
                    'min_pixels' : 224 * 224,
                    'max_pixels' : 1280 * 28 * 28,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]

    # Preparation for inference: render the chat template as text (not token
    # ids) and append the generation prompt so the model answers as assistant.
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's own device instead of assuming 'cuda', so the inputs
    # always land where the (device-mapped) model expects them.
    inputs = inputs.to(model.device)

    # Stream only the newly generated text, without the prompt or special tokens.
    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    # Inference: Generation of the output
    generated_ids = model.generate(
        **inputs, max_new_tokens = max_new_tokens, streamer = streamer
    )
    # `generate` returns prompt + completion; drop the prompt prefix per sequence.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )
    # Only one conversation is submitted, so the batch holds a single answer.
    return output_text[0]

## 03 Run Inference

### 03.00 Initial Test

In [None]:
%%time

# Smoke test: ask the model for a detailed description of the demo image.
image_path = './00_Dataset/demo.jpeg'
prompt = """
What is the main subject of this image? Describe it in detail.
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)

In [None]:
%%time

# Same demo image, but with a creative-writing prompt: request an Instagram
# caption that includes emojis and hashtags.
image_path = './00_Dataset/demo.jpeg'
prompt = """
i have this picture of my sefl, i want to post it in my instagram, help me to write the caption for it including emojis and hashtags.
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)

### 03.01 Universal Recognition - Basic

In [None]:
%%time

# Recognition demo: ask for the name of the bird in the image, in both
# Chinese and English.
image_path = './00_Dataset/unireco_bird_example.jpg'
prompt = """
What kind of bird is this? Please give its name in Chinese and English.
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)

### 03.02 Universal Recognition - Multiple Objects

In [None]:
%%time

# Recognition demo: a single image file is passed and the prompt asks for the
# names of the birds in it, in both English and Chinese.
image_path = './00_Dataset/unireco_birds_example.jpg'
prompt = """
What are these birds? Please give their names in English and Chinese.
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)

### 03.03 Universal Recognition - Landmarks

In [None]:
%%time

# Recognition demo: ask for the names of the attractions shown in the image,
# in both English and Chinese.
image_path = './00_Dataset/unireco_landmarks_example.jpg'
prompt = """
What are these attractions? Please give their names in English and Chinese
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)

### 03.04 Universal Recognition - Celebrities

In [None]:
%%time

# Recognition demo: ask for the names of the people shown in the image,
# in both English and Chinese.
image_path = './00_Dataset/unireco_celebrities_example.jpg'
prompt = """
Who are these in this picture? Please give their names in English and Chinese.
"""

# Render the input image inline, then run the model on it;
# inference() returns the decoded answer.
display(Image(filename = image_path))
inference(prompt, image_path)