## Windows omni_env

## 00 Install & Import Libraries

In [None]:
import torch
import os
import math
import hashlib
import requests
import numpy as np
import decord
import markdown

from bs4 import BeautifulSoup
from datetime import datetime
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor
from IPython.display import display, Markdown
from decord import VideoReader, cpu

## 01 Import Model

In [None]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Quantization settings passed to `from_pretrained` as `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,                      # load the weights in 4-bit precision
    bnb_4bit_quant_type = 'nf4',              # use the 'nf4' 4-bit quantization type
    bnb_4bit_compute_dtype = torch.float16,   # dtype used for computation on the quantized weights
    bnb_4bit_use_double_quant = True,         # enable double (nested) quantization
)

In [None]:
# Local directory holding the Qwen2.5-VL checkpoint and its processor files.
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'

# `device_map = 'auto'` already places the quantized weights on the available
# device(s). Do not chain `.to(device)` here: moving a bitsandbytes-quantized,
# accelerate-dispatched model afterwards is redundant and is rejected (or
# warned about) by Transformers depending on the installed versions.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
# The processor bundles the tokenizer and the image/video preprocessing.
processor = AutoProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
def _download_video(url, dest_path, chunk_size = 1 << 20, timeout = 30):
    """Stream the file at ``url`` to ``dest_path``.

    The payload is first written to ``<dest_path>.part`` and only renamed to
    ``dest_path`` once the download has finished, so an interrupted transfer
    never leaves a truncated file that later calls would mistake for a valid
    cached video.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    tmp_path = f'{dest_path}.part'
    try:
        with requests.get(url, stream = True, timeout = timeout) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size = chunk_size):
                    if chunk:  # skip keep-alive chunks
                        out_file.write(chunk)
        os.replace(tmp_path, dest_path)
    finally:
        # Only still present when the download failed part-way through.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def get_video_frames(video_path, num_frames = 128, cache_dir = '.cache'):
    """Return evenly spaced frames (and their timestamps) from a video.

    Remote videos (``http://`` / ``https://``) are downloaded once into
    ``cache_dir``; local paths are read in place. The sampled frames and
    timestamps are cached as ``.npy`` files in ``cache_dir``, keyed by the MD5
    of ``video_path`` and by ``num_frames``, and are reused on later calls.

    Args:
        video_path: Local file path or HTTP(S) URL of the video.
        num_frames: Number of frames to sample; must be at least 1.
        cache_dir: Directory for downloaded videos and cached arrays. It is
            created if it does not exist.

    Returns:
        A tuple ``(video_file_path, frames, timestamps)`` where
        ``video_file_path`` is the local path of the video, ``frames`` is the
        array returned by decord's ``get_batch(...).asnumpy()`` and
        ``timestamps`` holds ``get_frame_timestamp`` for each sampled index.

    Raises:
        ValueError: if ``num_frames`` is smaller than 1 or the video contains
            no frames.
    """
    if num_frames < 1:
        raise ValueError(f'num_frames must be at least 1, got {num_frames}')

    os.makedirs(cache_dir, exist_ok = True)

    # The cache key is derived from the path/URL string, not the file content.
    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
    if video_path.startswith(('http://', 'https://')):
        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
        if not os.path.exists(video_file_path):
            _download_video(video_path, video_file_path)
    else:
        video_file_path = video_path

    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')

    # Both files must exist for a cache hit; otherwise the video is decoded again.
    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        return video_file_path, frames, timestamps

    vr = VideoReader(video_file_path, ctx = cpu(0))
    total_frames = len(vr)
    if total_frames == 0:
        raise ValueError(f'video contains no frames: {video_file_path}')

    # Evenly spaced indices from the first to the last frame (indices repeat
    # when the video has fewer frames than requested).
    indices = np.linspace(0, total_frames - 1, num = num_frames, dtype = int)
    frames = vr.get_batch(indices).asnumpy()
    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])

    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)

    return video_file_path, frames, timestamps

def create_image_grid(images, num_columns = 8):
    """Tile a sequence of image arrays into a single RGB grid image.

    Each array is converted with ``Image.fromarray`` and pasted row by row,
    left to right, into a canvas that is ``num_columns`` tiles wide. The tile
    size is taken from the first image.

    Args:
        images: Sequence of arrays accepted by ``Image.fromarray``.
        num_columns: Number of tiles per row.

    Returns:
        A PIL image containing all tiles.
    """
    tiles = [Image.fromarray(frame) for frame in images]
    row_count = math.ceil(len(images) / num_columns)

    # Every cell of the grid has the size of the first tile.
    tile_w, tile_h = tiles[0].size
    canvas = Image.new('RGB', (num_columns * tile_w, row_count * tile_h))

    for n, tile in enumerate(tiles):
        row, col = divmod(n, num_columns)
        canvas.paste(tile, (col * tile_w, row * tile_h))

    return canvas

In [None]:
def inference(
    prompt,
    video_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 2048,
    total_pixels = 20480 * 28 * 28,
    min_pixels = 16 * 28 * 28
):
    """Ask the loaded model a question about a video and return its answer.

    Uses the module-level ``model`` and ``processor``. The rendered chat
    prompt, the shape of the preprocessed video and an estimate of the number
    of video tokens are printed, and the answer is streamed to stdout while it
    is being generated.

    Args:
        prompt: User question / instruction about the video.
        video_path: Video location placed in the message's ``'video'`` field.
        system_prompt: Text sent as the system message. Pass an empty string
            or ``None`` to send no explicit system message.
        max_new_tokens: Upper bound on the number of generated tokens.
        total_pixels: Value of the ``'total_pixels'`` field of the video entry.
        min_pixels: Value of the ``'min_pixels'`` field of the video entry.

    Returns:
        The decoded answer as a string, without the prompt tokens.
    """
    messages = []
    # The parameter used to be accepted but silently ignored; forward it so
    # callers can actually steer the model.
    if system_prompt:
        messages.append(
            {
                'role' : 'system',
                'content' : [{'type' : 'text', 'text' : system_prompt}],
            }
        )
    messages.append(
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'video',
                    'video' : video_path,
                    'total_pixels' : total_pixels,
                    'min_pixels' : min_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    )

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('input:\n', text)
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs = True)
    fps_inputs = video_kwargs['fps']
    print('video input:', video_inputs[0].shape)
    num_frames, _, resized_height, resized_width = video_inputs[0].shape
    # Estimate printed for information only: half the frame count times the
    # number of 28x28 cells per frame.
    print('num of video tokens:', int(num_frames / 2 * resized_height / 28 * resized_width / 28))
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        fps = fps_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's device instead of hard-coding 'cuda'.
    inputs = inputs.to(model.device)

    # Echo the answer to stdout token by token while generation runs.
    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens, streamer = streamer)
    # Drop the prompt tokens so that only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )
    return output_text[0]

## 03 Run Inference

### 03.01 Summarize Product Features in a Video

In [None]:
%%time

# Local video clip to analyse.
video_path = './00_Dataset/50221078283.mp4'
# Prompt (Chinese): "Please summarize the product features shown in the video in a table."
prompt = '请用表格总结一下视频中的商品特点'

# Sample 32 frames from the clip; `frames` and `timestamps` are not used
# further in this cell, only the returned local video path is.
video_path, frames, timestamps = get_video_frames(video_path, num_frames = 32)
response = inference(prompt, video_path)
# Render the model's answer as Markdown in the notebook output.
display(Markdown(response))