## 00 Install & Import Libraries

In [None]:
!pip install -q git+https://github.com/BakerBunker/transformers@21dbefaa54e5bf180464696aa70af0bfc7a61d53
!pip install -q qwen-omni-utils
!pip install -q openai
!pip install -q flash-attn --no-build-isolation
!pip install -q triton

In [None]:
import torch
import librosa
import audioread

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
from io import BytesIO
from urllib.request import urlopen
from IPython.display import Audio, Video

## 01 Load Model

In [None]:
# Checkpoint identifier shared by the model and the processor loaders below.
model_path = "Qwen/Qwen2.5-Omni-7B"

# Load the weights in bfloat16 and let the loader decide device placement.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # optional; left disabled here
)

# The processor is loaded from the same checkpoint as the model.
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
# @title inference function
def inference(conversations, use_audio_in_video=True):
    """Run one generation turn over a multimodal conversation.

    Args:
        conversations: Chat history (a list of role/content message dicts)
            handed to the processor's chat template and to ``process_mm_info``.
        use_audio_in_video: Forwarded unchanged to ``process_mm_info``, the
            processor call and ``model.generate`` so the three stages always
            agree. Defaults to ``True``, the value that used to be hard-coded.

    Returns:
        A ``(text, audio)`` tuple. ``text`` is what ``processor.batch_decode``
        returns for the first element of the generate output; ``audio`` is the
        second element of the generate output, passed through untouched.
    """
    text = processor.apply_chat_template(conversations, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(conversations, use_audio_in_video=use_audio_in_video)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=use_audio_in_video,
    )
    # Rebind to the returned object instead of relying on ``.to()`` mutating
    # the batch in place.
    inputs = inputs.to(model.device).to(model.dtype)

    output = model.generate(**inputs, use_audio_in_video=use_audio_in_video, return_audio=True)

    # With return_audio=True the output is indexed as (token ids, audio).
    text = processor.batch_decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    audio = output[1]
    return text, audio

## 03 Run Inference

In [None]:
# Start a fresh conversation containing only the system prompt.
conversations = [
    {"role": "system", "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."},
]

video_path_round_1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw1.mp4"

# Preview the user's turn: the clip itself, then its audio track resampled to 16 kHz.
display(Video(video_path_round_1, width=640, height=360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_1), sr=16000)[0], rate=16000))

# Run inference with the locally loaded Hugging Face model.
conversations.append({"role": "user", "content": [{"type": "video", "video": video_path_round_1}]})

response, audio = inference(conversations)
print(response[0])

# IPython's Audio widget needs host-side samples, so copy a torch tensor off
# the accelerator before playback. `audio` itself is left untouched.
audio_np = audio.detach().cpu().float().numpy() if isinstance(audio, torch.Tensor) else audio
display(Audio(audio_np, rate=24000))

In [None]:
# Record the previous reply as the assistant turn. Only the last line of the
# decoded text is kept.
# NOTE(review): a multi-line reply would be truncated to its final line — confirm
# this matches the decoded format. Re-running this cell appends the turns again.
conversations.append({"role": "assistant", "content": response[0].split("\n")[-1]})

video_path_round_2 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw2.mp4"

# Preview the user's turn: the clip itself, then its audio track resampled to 16 kHz.
display(Video(video_path_round_2, width=640, height=360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_2), sr=16000)[0], rate=16000))

# Run inference with the locally loaded Hugging Face model.
conversations.append({"role": "user", "content": [{"type": "video", "video": video_path_round_2}]})

response, audio = inference(conversations)
print(response[0])

# IPython's Audio widget needs host-side samples, so copy a torch tensor off
# the accelerator before playback. `audio` itself is left untouched.
audio_np = audio.detach().cpu().float().numpy() if isinstance(audio, torch.Tensor) else audio
display(Audio(audio_np, rate=24000))

In [None]:
# Record the previous reply as the assistant turn. Only the last line of the
# decoded text is kept.
# NOTE(review): a multi-line reply would be truncated to its final line — confirm
# this matches the decoded format. Re-running this cell appends the turns again.
conversations.append({"role": "assistant", "content": response[0].split("\n")[-1]})

video_path_round_3 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw3.mp4"

# Preview the user's turn: the clip itself, then its audio track resampled to 16 kHz.
display(Video(video_path_round_3, width=640, height=360))
display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(video_path_round_3), sr=16000)[0], rate=16000))

# Run inference with the locally loaded Hugging Face model.
conversations.append({"role": "user", "content": [{"type": "video", "video": video_path_round_3}]})

response, audio = inference(conversations)
print(response[0])

# IPython's Audio widget needs host-side samples, so copy a torch tensor off
# the accelerator before playback. `audio` itself is left untouched.
audio_np = audio.detach().cpu().float().numpy() if isinstance(audio, torch.Tensor) else audio
display(Audio(audio_np, rate=24000))