In [None]:
import torch
from transformers import (
    Qwen2_5OmniModel,
    Qwen2_5OmniProcessor,
    GenerationConfig,
    Qwen2_5OmniThinkerForConditionalGeneration,
)
from transformers import AutoModelForCausalLM, AutoTokenizer
from qwen_omni_utils import process_mm_info, process_vision_info


# Location of the Omni-R1 checkpoint; it is passed to `from_pretrained` for
# both the model and the processor. The value below is a placeholder and must
# be replaced before running the notebook.
omni_path = "/path/to/Omni-R1"


In [None]:
# Processor loaded from the same checkpoint directory as the model; the cells
# below use it for chat templating, input preparation and decoding.
processor = Qwen2_5OmniProcessor.from_pretrained(omni_path)

# Thinker-only model, loaded in bfloat16 with FlashAttention-2 and automatic
# device placement, then switched to evaluation mode.
model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
    omni_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
model = model.eval()

# Shared generation settings: sampling disabled, KV cache enabled, and at most
# 1024 newly generated tokens per call.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=1024,
    use_cache=True,
)

In [None]:
def inference(video_path, prompt, sys_prompt):
    """Answer a text question about a video with the loaded thinker model.

    Builds a two-turn chat (system + user with one video and one text part),
    renders it with the processor's chat template, prepares the multimodal
    inputs, and runs generation with the notebook-level ``generation_config``.

    Args:
        video_path: Video reference placed in the user message's ``"video"``
            entry (the demo cell in this notebook passes an HTTPS URL).
        prompt: Question text for the user turn.
        sys_prompt: Text for the system turn.

    Returns:
        The result of ``processor.batch_decode`` on the newly generated tokens
        only (the prompt tokens are sliced off); callers index ``[0]`` for the
        single conversation.
    """
    messages = [
        {"role": "system", "content": [{"type": "text", "text": sys_prompt}]},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The template call may hand back either a single string or a list with
    # one string, depending on the processor implementation. Indexing a plain
    # string with [0] would silently pass only its first character on.
    chat_text = text_input if isinstance(text_input, str) else text_input[0]

    # The fourth return value of process_mm_info is not used by this function.
    audio_input, image_input, video_input, _ = process_mm_info(
        messages, use_audio_in_video=False
    )

    inputs = processor(
        text=chat_text,
        images=image_input,
        audios=audio_input,
        videos=video_input,
        return_tensors="pt",
        do_resize=True,
    )
    # The processor returns CPU tensors; place them on the model's device
    # explicitly instead of relying on implicit cross-device handling.
    inputs = inputs.to(model.device)

    # Generate output
    with torch.inference_mode():
        generated_ids = model.generate(**inputs, generation_config=generation_config)

    # generate() returns prompt + completion; keep only the completion part.
    prompt_length = inputs["input_ids"].size(1)
    completion_ids = generated_ids[:, prompt_length:]
    # Decode the generated completions
    text = processor.batch_decode(completion_ids, skip_special_tokens=True)
    return text

In [None]:
from IPython.display import Video

In [None]:
video_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/shopping.mp4"
prompt = "How many kind of drinks can you see in the video?"

# Show the clip inline so the reader can compare it with the model's answer.
display(Video(video_path, height=360, width=640))

# Query the locally loaded HuggingFace model and print the first decoded answer.
response = inference(video_path, prompt, "You are a helpful assistant.")
print(response[0])

In [None]:
# NOTE(review): `create_ref_prompt`, `selected_frames` and `text_prompt` are not
# defined in any cell visible in this part of the notebook; they must exist in
# the kernel before this cell runs, otherwise it fails with NameError on a
# fresh "Restart & Run All".
prompt = create_ref_prompt(selected_frames, text_prompt, use_time_stamp=True)

text_input = processor.apply_chat_template(
    prompt, tokenize=False, add_generation_prompt=True
)
# The template call may hand back either a single string or a list with one
# string, depending on the processor implementation. Indexing a plain string
# with [0] would silently pass only its first character on. `text_input`
# itself is left untouched.
chat_text = text_input if isinstance(text_input, str) else text_input[0]


# The fourth return value (`kwarg`) is not used anywhere in this cell.
audio_input, image_input, video_input, kwarg = process_mm_info(
    prompt, use_audio_in_video=False
)

inputs = processor(
    text=chat_text,
    images=image_input,
    audios=audio_input,
    videos=video_input,
    return_tensors="pt",
    do_resize=False,
)
# The processor returns CPU tensors; place them on the model's device
# explicitly instead of relying on implicit cross-device handling.
inputs = inputs.to(model.device)


# Generate output
with torch.inference_mode():
    generated_ids = model.generate(**inputs, generation_config=generation_config)

# generate() returns prompt + completion; keep only the completion part.
prompt_length = inputs["input_ids"].size(1)
completion_ids = generated_ids[:, prompt_length:]
# Decode the generated completions
output_text = processor.batch_decode(completion_ids, skip_special_tokens=True)