In [1]:
import os

# Expose only GPU index 4 to this kernel. The variable is written here, before
# the cell that imports torch, so it is already in the environment by then.
os.environ.update(CUDA_VISIBLE_DEVICES="4")

In [2]:
import torch
from transformers import (
    Qwen2_5OmniModel,
    Qwen2_5OmniProcessor,
    GenerationConfig,
    Qwen2_5OmniThinkerForConditionalGeneration,
)

from qwen_omni_utils import process_mm_info


# Checkpoint locations.
#
# The defaults are the paths this notebook was originally run with. Either one
# can be overridden without editing the notebook by exporting OMNI_R1_PATH or
# QWEN_OMNI_PATH before the kernel starts, so the notebook also works on
# machines with a different directory layout.
#
# Alternative Omni-R1 checkpoint kept from the original notebook:
#   /mnt/public/home/zhonghao/Omni-R1/train_logs/omni_refactor
omni_r1_path = os.environ.get(
    "OMNI_R1_PATH",
    "/mnt/public/home/zhonghao/Omini-R1-ORI/train_logs/omni_multi_9k_temper_fused_res_100_h100_4/",
)
qwen_omni_path = os.environ.get(
    "QWEN_OMNI_PATH",
    "/mnt/public/weight/Qwen2.5-Omni-7B",
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Loader options shared by both checkpoints: automatic device placement,
# bfloat16 weights, and FlashAttention-2 as the requested attention backend.
_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# Omni-R1 checkpoint, loaded directly through the thinker class.
omni_r1 = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    omni_r1_path, **_load_kwargs
)
omni_r1 = omni_r1.eval()

# Baseline: load the full Qwen2.5-Omni model and keep only its `thinker`
# submodule. The expression is deliberately chained so that no extra name
# holds on to the full model object.
qwen_omni = Qwen2_5OmniModel.from_pretrained(qwen_omni_path, **_load_kwargs).thinker.eval()

# One processor, taken from the baseline checkpoint directory, is used for
# both models.
processor = Qwen2_5OmniProcessor.from_pretrained(qwen_omni_path)

# Shared decoding settings: sampling disabled, at most 1024 new tokens,
# key/value cache enabled.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=1024,
    use_cache=True,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.41s/it]
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but flash_attention_2 only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.
Loading checkpoint shards: 100%|██████████| 5/5 [00:11<00:00,  2.37s/it]


In [4]:
def inference(model, video_path, prompt, sys_prompt, use_audio_in_video=True):
    """Ask ``model`` one question about a video and return the decoded reply.

    Builds a two-turn conversation (system text, then a user turn holding the
    video and the question), renders and tensorises it with the notebook-level
    ``processor``, generates with the notebook-level ``generation_config``,
    and decodes only the tokens that follow the prompt.

    Args:
        model: Object providing ``device``, ``dtype`` and ``generate``.
        video_path: Value stored under the ``"video"`` key of the user turn.
        prompt: Text of the user turn.
        sys_prompt: Text of the system turn.
        use_audio_in_video: Passed through unchanged to ``process_mm_info``,
            to the processor call, and to ``model.generate``.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids with
        the prompt positions sliced off, special tokens skipped.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": sys_prompt}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "video", "video": video_path},
            {"type": "text", "text": prompt},
        ],
    }
    messages = [system_turn, user_turn]

    # Render the conversation to a prompt string (not token ids), ending with
    # the generation prompt.
    text_input = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Extract the multimodal payloads; the fourth returned value is unused here.
    audio_input, image_input, video_input, process_args = process_mm_info(
        messages,
        use_audio_in_video=use_audio_in_video,
    )

    processor_kwargs = dict(
        text=text_input,
        images=image_input,
        audios=audio_input,
        videos=video_input,
        use_audio_in_video=use_audio_in_video,
        return_tensors="pt",
        padding=True,
        padding_side="left",
        do_resize=True,
    )
    inputs = processor(**processor_kwargs)
    # Move to the model's device first, then cast to the model's dtype.
    inputs = inputs.to(model.device)
    inputs = inputs.to(model.dtype)

    # Generate the output.
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            use_audio_in_video=use_audio_in_video,
            generation_config=generation_config,
        )

    # Everything up to the prompt length is the echoed prompt; decode only
    # what comes after it.
    prompt_length = inputs["input_ids"].size(1)
    return processor.batch_decode(
        generated_ids[:, prompt_length:],
        skip_special_tokens=True,
    )

In [5]:
from IPython.display import Video

In [6]:
# Clip and question used by the next two cells.
video_path = "assets/videos/shopping.mp4"
prompt = "Which kind of drinks is picked up last?"


In [7]:
# Omni-R1 on the current clip, with use_audio_in_video turned off.
response = inference(
    omni_r1,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=False,
)
print(response[0])

qwen-vl-utils using decord to read video.
`generation_config` default values have been modified to match model-specific defaults: {'pad_token_id': 151643, 'bos_token_id': 151644, 'eos_token_id': 151645}. If this is not desired, please set these values explicitly.


The last drink picked up is a white bottle labeled "维C" (Vitamin C).


In [8]:
# Baseline Qwen2.5-Omni thinker on the same clip and question, again with
# use_audio_in_video turned off.
response = inference(
    qwen_omni,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=False,
)
print(response[0])



The last drink picked up is a white bottle with a black cap and a label that reads "维C" (Vitamin C).


In [16]:
# Clip and instruction used by the next two cells (event localisation).
video_path = "assets/videos/beef.mp4"
prompt = (
    "Localize a series of activity events in the video, "
    "output the start and end timestamp for each event, "
    "and describe each event with sentences. "
    "Provide the result in json format with 'mm:ss.ff' format for time depiction."
)


In [10]:
# Omni-R1 on the current clip, with use_audio_in_video turned on.
response = inference(
    omni_r1,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=True,
)
print(response[0])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Token indices sequence length is longer than the specified maximum sequence length for this model (38496 > 32768). Running this sequence through the model will result in indexing errors


```json
[
    {
        "start_time": "00:38.00",
        "end_time": "00:46.00",
        "description": "season the brisket with salt and pepper."
    },
    {
        "start_time": "00:46.00",
        "end_time": "00:54.00",
        "description": "place the brisket on the grill."
    },
    {
        "start_time": "00:54.00",
        "end_time": "01:02.00",
        "description": "add some oil to the pan."
    },
    {
        "start_time": "01:02.00",
        "end_time": "01:10.00",
        "description": "add chopped onions carrots and bell peppers to the pan."
    },
    {
        "start_time": "01:10.00",
        "end_time": "01:18.00",
        "description": "add a chicken stock cube to the pan."
    },
    {
        "start_time": "01:18.00",
        "end_time": "01:26.00",
        "description": "place the brisket on the pan."
    },
    {
        "start_time": "01:26.00",
        "end_time": "01:34.00",
        "description": "add some barbecue sauce to the brisket."
    },
 

In [17]:
# Baseline Qwen2.5-Omni thinker on the same clip and instruction, with
# use_audio_in_video turned on.
response = inference(
    qwen_omni,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=True,
)
print(response[0])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


```json
[
    {
        "start_time": "00:37.00",
        "end_time": "00:44.00",
        "description": "cut the fat off the brisket."
    },
    {
        "start_time": "00:45.00",
        "end_time": "00:54.00",
        "description": "rub the brisket with salt and pepper."
    },
    {
        "start_time": "01:00.00",
        "end_time": "01:05.00",
        "description": "add some olive oil to the pan."
    },
    {
        "start_time": "01:06.00",
        "end_time": "01:10.00",
        "description": "add chopped onions to the pan."
    },
    {
        "start_time": "01:11.00",
        "end_time": "01:15.00",
        "description": "add chopped carrots to the pan."
    },
    {
        "start_time": "01:16.00",
        "end_time": "01:20.00",
        "description": "add chopped yellow and red bell peppers to the pan."
    },
    {
        "start_time": "01:21.00",
        "end_time": "01:25.00",
        "description": "add some chopped garlic to the pan."
    },
    {
       

In [12]:
# Clip and question used by the next two cells (sound source description).
video_path = "assets/videos/refavs_demo.mp4"
prompt = (
    "Which object(s) you think make the sound in the video? "
    "Describe the sound and the object(s) in detail."
)


In [13]:
# Omni-R1 on the current clip, with use_audio_in_video turned on.
response = inference(
    omni_r1,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=True,
)
print(response[0])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


The sound in the video is produced by a sitar, which is a long-necked lute with a gourd-shaped resonator. The sitar is played by plucking its strings with a small, curved instrument called a mizrab. The sound is characterized by a distinctive, resonant tone that is often associated with Indian classical music.


In [14]:
# Baseline Qwen2.5-Omni thinker on the same clip and question, with
# use_audio_in_video turned on.
response = inference(
    qwen_omni,
    video_path,
    prompt=prompt,
    sys_prompt="You are a helpful assistant.",
    use_audio_in_video=True,
)
print(response[0])

  audios.append(librosa.load(path, sr=16000)[0])
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


The sound in the video is produced by a sitar. The sitar is a stringed instrument that is commonly used in Indian classical music. It has a long neck and a gourd-shaped resonator. The sitar player uses a pick to pluck the strings, which produce the sound. The sound of the sitar is characterized by its twangy and resonant quality.
