## v1 - Works but needs improvement

In [None]:
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info

# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-Omni-3B'
# device_map already places the quantized weights on the GPU. No .to(device)
# is chained afterwards: it would be redundant, and transformers refuses
# .to() on bitsandbytes-quantized models with older bitsandbytes releases.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="cuda",
    # attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

# Conversation history shared by every call to voice_chat; it starts with the
# system prompt only.
history = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
            }
        ],
    }
]

# Passed to process_mm_info, the processor and model.generate.
USE_AUDIO_IN_VIDEO = True

# Chat logic with audio input/output
def voice_chat(audio_path, temperature, max_tokens):
    """Run one voice-chat turn and return the path of the spoken reply.

    The recording is sent to the model together with the module-level
    ``history``. ``history`` is only extended (user turn + assistant text)
    after the whole turn succeeded, so a failed request cannot leave a
    dangling user message behind.

    Args:
        audio_path: Path of the user's recording; falsy when nothing was
            recorded or uploaded.
        temperature: Sampling temperature from the UI slider. Values <= 0
            switch text generation to greedy decoding.
        max_tokens: Maximum number of new text tokens to generate.

    Returns:
        Path of the WAV file (written at 24000 Hz) holding the generated speech.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio_path:
        # Clicking "Send" without a recording yields None; fail early with a
        # readable message instead of crashing inside the preprocessing.
        raise gr.Error("Please record or upload an audio clip before sending.")

    user_turn = {
        "role": "user",
        "content": [{"type": "audio", "audio": audio_path}],
    }
    # Work on a copy: the shared history is committed only on success.
    conversation = history + [user_turn]

    # Prepare model input
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    ).to(model.device).to(model.dtype)

    # A temperature of 0 is not a valid sampling temperature; interpret it as
    # a request for deterministic (greedy) text generation instead.
    if temperature > 0:
        sampling_kwargs = {"temperature": temperature}
    else:
        sampling_kwargs = {"thinker_do_sample": False}

    # Generate response (text + audio). The text length limit is passed as
    # thinker_max_new_tokens, the parameter generate() documents for the
    # text-producing sub-model.
    text_ids, audio = model.generate(
        **inputs,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        thinker_max_new_tokens=int(max_tokens),
        **sampling_kwargs,
    )

    # The returned ids start with the prompt; keep only the newly generated
    # tokens so the stored assistant turn does not repeat the whole transcript.
    prompt_length = inputs["input_ids"].shape[1]
    text_response = processor.batch_decode(
        text_ids[:, prompt_length:], skip_special_tokens=True
    )[0]

    # Save audio to output
    output_path = "output.wav"
    sf.write(output_path, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)

    # Everything succeeded: commit both turns to the shared history.
    history.append(user_turn)
    history.append({
        "role": "assistant",
        "content": [{"type": "text", "text": text_response}]
    })

    return output_path

# Build Gradio UI (audio-only)
# The emoji in the labels are written as \N{...} escapes: the previous literal
# characters were mis-encoded and rendered as garbage in the page.
with gr.Blocks() as demo:
    gr.Markdown(
        "## \N{STUDIO MICROPHONE}\N{VARIATION SELECTOR-16} "
        "Qwen2.5 Omni Voice Chat (Audio In/Out Only)"
    )

    with gr.Row():
        audio_input = gr.Audio(label="\N{MICROPHONE} Speak", type="filepath")
        # Output-only player for the generated reply.
        audio_output = gr.Audio(
            label="\N{SPEAKER WITH THREE SOUND WAVES} Response",
            type="filepath",
            interactive=False,
        )

    with gr.Accordion("\N{GEAR}\N{VARIATION SELECTOR-16} Parameters", open=False):
        temperature = gr.Slider(0, 1, step=0.1, value=0.6, label="Temperature")
        max_tokens = gr.Slider(128, 4096, step=1, value=256, label="Max new tokens")

    submit_btn = gr.Button("Send")

    # Event handler: run voice_chat on click and play its returned file.
    submit_btn.click(
        fn=voice_chat,
        inputs=[audio_input, temperature, max_tokens],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch(debug=True)

## v2 - Works but fails in direct generation output

In [None]:
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info

# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = "Qwen/Qwen2.5-Omni-3B"
# device_map already places the quantized weights on the GPU. No .to(device)
# is chained afterwards: it would be redundant, and transformers refuses
# .to() on bitsandbytes-quantized models with older bitsandbytes releases.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="cuda",
    # attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

# Conversation history shared by every call to voice_chat; it starts with the
# system prompt only.
history = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
            }
        ],
    }
]

# Passed to process_mm_info, the processor and model.generate.
USE_AUDIO_IN_VIDEO = True

# Chat logic with audio input/output
def voice_chat(audio_path, temperature, max_tokens):
    """Run one voice-chat turn and return the path of the spoken reply.

    The recording is sent to the model together with the module-level
    ``history``. ``history`` is only extended (user turn + assistant text)
    after the whole turn succeeded, so a failed request cannot leave a
    dangling user message behind.

    Args:
        audio_path: Path of the user's recording; falsy when nothing was
            recorded or uploaded.
        temperature: Sampling temperature from the UI slider. Values <= 0
            switch text generation to greedy decoding.
        max_tokens: Maximum number of new text tokens to generate.

    Returns:
        Path of the WAV file (written at 24000 Hz) holding the generated speech.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio_path:
        # Clicking "Send" without a recording yields None; fail early with a
        # readable message instead of crashing inside the preprocessing.
        raise gr.Error("Please record or upload an audio clip before sending.")

    user_turn = {
        "role": "user",
        "content": [{"type": "audio", "audio": audio_path}],
    }
    # Work on a copy: the shared history is committed only on success.
    conversation = history + [user_turn]

    # Prepare model input
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    ).to(model.device).to(model.dtype)

    # A temperature of 0 is not a valid sampling temperature; interpret it as
    # a request for deterministic (greedy) text generation instead.
    if temperature > 0:
        sampling_kwargs = {"temperature": temperature}
    else:
        sampling_kwargs = {"thinker_do_sample": False}

    # Generate response (text + audio). The text length limit is passed as
    # thinker_max_new_tokens, the parameter generate() documents for the
    # text-producing sub-model.
    text_ids, audio = model.generate(
        **inputs,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        thinker_max_new_tokens=int(max_tokens),
        **sampling_kwargs,
    )

    # The returned ids start with the prompt; keep only the newly generated
    # tokens so the stored assistant turn does not repeat the whole transcript.
    prompt_length = inputs["input_ids"].shape[1]
    text_response = processor.batch_decode(
        text_ids[:, prompt_length:], skip_special_tokens=True
    )[0]

    # Save audio to output
    output_path = "output.wav"
    sf.write(output_path, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)

    # Everything succeeded: commit both turns to the shared history.
    history.append(user_turn)
    history.append({
        "role": "assistant",
        "content": [{"type": "text", "text": text_response}]
    })

    return output_path

# Build Gradio UI (audio-only)
# The emoji in the labels are written as \N{...} escapes: the previous literal
# characters were mis-encoded and rendered as garbage in the page.
with gr.Blocks() as demo:
    gr.Markdown(
        "## \N{STUDIO MICROPHONE}\N{VARIATION SELECTOR-16} "
        "Qwen2.5 Omni Voice Chat (Audio In/Out Only)"
    )

    with gr.Row():
        audio_input = gr.Audio(label="\N{MICROPHONE} Speak", type="filepath")
        # Player for the generated reply; autoplay is requested so the answer
        # starts without an extra click.
        audio_output = gr.Audio(
            label="\N{SPEAKER WITH THREE SOUND WAVES} Response",
            type="filepath",
            interactive=True,
            autoplay=True,
        )

    with gr.Accordion("\N{GEAR}\N{VARIATION SELECTOR-16} Parameters", open=False):
        temperature = gr.Slider(0, 1, step=0.1, value=0.6, label="Temperature")
        max_tokens = gr.Slider(128, 4096, step=1, value=256, label="Max new tokens")

    submit_btn = gr.Button("Send")

    # Event handler: run voice_chat on click and play its returned file.
    submit_btn.click(
        fn=voice_chat,
        inputs=[audio_input, temperature, max_tokens],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch(debug=True)

## v3

In [None]:
import gradio as gr
import torch
import soundfile as sf
import tempfile
import os
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info

# Load model and processor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization and float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = "Qwen/Qwen2.5-Omni-3B"
# device_map already places the quantized weights on the GPU. No .to(device)
# is chained afterwards: it would be redundant, and transformers refuses
# .to() on bitsandbytes-quantized models with older bitsandbytes releases.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="cuda",
    # attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

# Conversation history shared by every call to voice_chat; it starts with the
# system prompt only.
history = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
            }
        ],
    }
]

# Passed to process_mm_info, the processor and model.generate.
USE_AUDIO_IN_VIDEO = True

# Chat logic with audio input/output
def voice_chat(audio_path, temperature, max_tokens):
    """Run one voice-chat turn and return the path of the spoken reply.

    The recording is sent to the model together with the module-level
    ``history``. ``history`` is only extended (user turn + assistant text)
    after the whole turn succeeded, so a failed request cannot leave a
    dangling user message behind.

    Args:
        audio_path: Path of the user's recording; falsy when nothing was
            recorded or uploaded.
        temperature: Sampling temperature from the UI slider. Values <= 0
            switch text generation to greedy decoding.
        max_tokens: Maximum number of new text tokens to generate.

    Returns:
        Path of the WAV file (written at 24000 Hz) holding the generated speech.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio_path:
        # Clicking "Send" without a recording yields None; fail early with a
        # readable message instead of crashing inside the preprocessing.
        raise gr.Error("Please record or upload an audio clip before sending.")

    user_turn = {
        "role": "user",
        "content": [{"type": "audio", "audio": audio_path}],
    }
    # Work on a copy: the shared history is committed only on success.
    conversation = history + [user_turn]

    # Prepare model input
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    ).to(model.device).to(model.dtype)

    # A temperature of 0 is not a valid sampling temperature; interpret it as
    # a request for deterministic (greedy) text generation instead.
    if temperature > 0:
        sampling_kwargs = {"temperature": temperature}
    else:
        sampling_kwargs = {"thinker_do_sample": False}

    # Generate response (text + audio). The text length limit is passed as
    # thinker_max_new_tokens, the parameter generate() documents for the
    # text-producing sub-model.
    text_ids, audio = model.generate(
        **inputs,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        thinker_max_new_tokens=int(max_tokens),
        **sampling_kwargs,
    )

    # The returned ids start with the prompt; keep only the newly generated
    # tokens so the stored assistant turn does not repeat the whole transcript.
    prompt_length = inputs["input_ids"].shape[1]
    text_response = processor.batch_decode(
        text_ids[:, prompt_length:], skip_special_tokens=True
    )[0]

    # Save audio to output
    output_path = "output.wav"
    sf.write(output_path, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)

    # Everything succeeded: commit both turns to the shared history.
    history.append(user_turn)
    history.append({
        "role": "assistant",
        "content": [{"type": "text", "text": text_response}]
    })

    return output_path

# Build Gradio UI (audio-only)
# The emoji in the labels are written as \N{...} escapes: the previous literal
# characters were mis-encoded and rendered as garbage in the page.
with gr.Blocks() as demo:
    gr.Markdown(
        "## \N{STUDIO MICROPHONE}\N{VARIATION SELECTOR-16} "
        "Qwen2.5 Omni Voice Chat (Audio In/Out Only)"
    )

    with gr.Row():
        audio_input = gr.Audio(label="\N{MICROPHONE} Speak", type="filepath")
        # Player for the generated reply; autoplay is requested so the answer
        # starts without an extra click.
        audio_output = gr.Audio(
            label="\N{SPEAKER WITH THREE SOUND WAVES} Response",
            type="filepath",
            interactive=True,
            autoplay=True,
        )

    with gr.Accordion("\N{GEAR}\N{VARIATION SELECTOR-16} Parameters", open=False):
        temperature = gr.Slider(0, 1, step=0.1, value=0.6, label="Temperature")
        max_tokens = gr.Slider(128, 4096, step=1, value=256, label="Max new tokens")

    submit_btn = gr.Button("Send")

    # Event handler: run voice_chat on click and play its returned file.
    submit_btn.click(
        fn=voice_chat,
        inputs=[audio_input, temperature, max_tokens],
        outputs=audio_output
    )

if __name__ == "__main__":
    demo.launch(debug=True)