## WSL omni_env
Install the pinned `transformers` build used by this notebook:

```
pip install git+https://github.com/BakerBunker/transformers@21dbefaa54e5bf180464696aa70af0bfc7a61d53
```

## 00 Install & Import Libraries

In [None]:
import torch
import librosa
import audioread
import time

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info
from io import BytesIO
from urllib.request import urlopen
from IPython.display import Audio, Video

## 01 Import Model

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# bitsandbytes quantization settings; handed to from_pretrained() via
# `quantization_config` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_quant_type='nf4',             # 4-bit quantization scheme
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
    bnb_4bit_use_double_quant=True,        # enable double quantization
)

In [None]:
# Path (relative to the notebook) of the Qwen2.5-Omni-3B checkpoint to load.
model_path = './00_Model/Qwen2.5-Omni-3B'

# device_map = 'auto' already places the quantized weights on the available
# device(s) while loading, so the model is NOT moved again with .to(device):
# relocating a bitsandbytes-quantized, accelerate-dispatched model is
# redundant at best and is rejected on some transformers/bitsandbytes versions.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
    attn_implementation = 'flash_attention_2',
)

# Processor loaded from the same checkpoint directory as the model.
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
# @title inference function
def inference(conversations):
    """Run one generation pass over a conversation and return text plus audio.

    Relies on the module-level ``processor`` and ``model``.

    Args:
        conversations: Chat messages in the format accepted by
            ``processor.apply_chat_template`` and ``process_mm_info``.

    Returns:
        A ``(text, audio)`` tuple: ``text`` is the result of
        ``processor.batch_decode`` on the first element of the generation
        output, ``audio`` is the second element of that output, returned as-is.
    """
    # Render the chat into a prompt string that ends with the generation prompt.
    prompt = processor.apply_chat_template(
        conversations,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Gather the multimodal inputs referenced by the messages.
    audios, images, videos = process_mm_info(conversations, use_audio_in_video=True)

    model_inputs = processor(
        text=prompt,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors='pt',
        padding=True,
        use_audio_in_video=True,
    )
    # Move the batch to the model's device, then cast it to the model's dtype.
    model_inputs = model_inputs.to(model.device).to(model.dtype)

    generated = model.generate(**model_inputs, use_audio_in_video=True, return_audio=True)

    # generated[0] is decoded to text; generated[1] is passed through as the audio.
    decoded = processor.batch_decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded, generated[1]

## 03 Run Inference

In [None]:
# Time the whole cell with a monotonic, high-resolution clock. time.time() follows
# the wall clock and can jump if the system time is adjusted during the run.
start_time = time.perf_counter()  # Start timer

# The conversation starts with the system prompt only; the user turn is appended below.
conversations = [
    {'role' : 'system', 'content' : 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'},
]

video_path_round_3 = './00_Dataset/draw03.mp4'

# Preview the input clip and its audio track (loaded through FFmpeg at sr = 16000).
display(Video(video_path_round_3, width = 640, height = 360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_3), sr = 16000)[0], rate = 16000))

# Run inference with the locally loaded Hugging Face model.
conversations.append({'role' : 'user', 'content' : [{'type' : 'video', 'video' : video_path_round_3}]} )

response, audio = inference(conversations)
print(response[0])

# Play back the generated audio at a 24000 Hz rate.
display(Audio(audio, rate = 24000))
end_time = time.perf_counter()  # End timer
print(100 * '=')
print(f"Execution Time: {end_time - start_time:.2f} seconds")