## Windows omni_env
## Note: this model requires more VRAM for image or video input

## 00 Install & Import Libraries

In [None]:
import torch
import librosa
import audioread

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, AutoProcessor, BitsAndBytesConfig, TextStreamer
from qwen_omni_utils import process_mm_info, process_vision_info
from io import BytesIO
from urllib.request import urlopen
from IPython.display import Audio, Video, Image

## 01 Import Model

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
)

In [None]:
# Local directory holding the Qwen2.5-Omni-3B checkpoint.
model_path = './00_Model/Qwen2.5-Omni-3B'

# device_map = 'auto' already places the quantized weights on the available
# device(s), so the model is used exactly as loaded. Chaining `.to(...)` onto
# a bitsandbytes-quantized, auto-dispatched model is redundant and is rejected
# by some transformers / bitsandbytes versions, so it is intentionally omitted.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
    # attn_implementation = 'flash_attention_2',  # optional, currently disabled
)

# The processor is loaded from the same checkpoint directory as the model.
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
# @title inference function
def inference(prompt, image_path = None, audio_path = None, video_path = None, chat = False, audio_output = False):
    """Run one single-turn Qwen2.5-Omni query and return the decoded reply.

    A single user message is built from the text prompt plus any optional
    image, audio and video attachments, a system prompt matching the task is
    chosen, and generation runs on the notebook-level ``model`` and
    ``processor``.

    Args:
        prompt: Text instruction placed first in the user message (may be '').
        image_path: Optional image reference passed on to ``process_mm_info``.
        audio_path: Optional audio reference passed on to ``process_mm_info``.
        video_path: Optional video reference passed on to ``process_mm_info``.
        chat: With audio input, a truthy value selects the conversational
            Qwen system prompt; a falsy value selects the speech-recognition
            prompt.
        audio_output: When truthy, speech is requested from the model
            (``return_audio = True``), the Qwen system prompt is used, and
            text streaming is turned off.

    Returns:
        ``(text, audio)`` when ``audio_output`` is truthy, where ``text`` is
        ``processor.batch_decode`` applied to ``output[0]`` and ``audio`` is
        ``output[1]`` exactly as returned by ``model.generate``. Otherwise
        only the ``batch_decode`` result for the whole generation output.
    """
    qwen_persona = 'You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.'

    # The text part always comes first, followed by image, audio and video.
    content = [{'type' : 'text', 'text' : prompt}]
    for kind, path in (('image', image_path), ('audio', audio_path), ('video', video_path)):
        if path is not None:
            content.append({'type' : kind, kind : path})

    # Speech generation relies on the Qwen persona prompt, so requesting audio
    # output forces it regardless of which inputs were supplied. `chat` is
    # evaluated by truthiness so non-bool flags behave like their bool value.
    if video_path is not None or audio_output or (audio_path is not None and chat):
        system_message = qwen_persona
    elif audio_path is not None:
        system_message = 'You are a speech recognition model.'
    else:
        system_message = 'You are a helpful assistant.'

    messages = [
        {'role' : 'system', 'content' : system_message},
        {'role' : 'user', 'content' : content}
    ]

    # Render the chat template to text and echo it for inspection.
    text = processor.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)
    print("text:", text)

    # Gather the multimodal payloads referenced by the messages and pack
    # everything into model inputs on the model's device and dtype.
    audios, images, videos = process_mm_info(messages, use_audio_in_video = True)
    inputs = processor(text = text, audio = audios, images = images, videos = videos, return_tensors = 'pt', padding = True, use_audio_in_video = True)
    inputs = inputs.to(model.device).to(model.dtype)

    # Text is streamed to stdout only when no speech output is requested.
    streamer = None if audio_output else TextStreamer(processor.tokenizer, skip_prompt = True, skip_special_tokens = True)

    output = model.generate(**inputs, use_audio_in_video = True, return_audio = audio_output, thinker_do_sample = False, streamer = streamer)

    if audio_output:
        # With return_audio enabled the output is indexed as (token ids, audio).
        text = processor.batch_decode(output[0], skip_special_tokens = True, clean_up_tokenization_spaces = False)
        audio = output[1]
        return text, audio

    text = processor.batch_decode(output, skip_special_tokens = True, clean_up_tokenization_spaces = False)
    return text

## 03 Run Inference
### 03.01 Speech Recognition (Input = Text, Audio | Output = Text)

In [None]:
%%time

# English clip and the transcription instruction sent with it.
audio_path = './00_Dataset/1272-128104-0000.flac'
prompt = 'Transcribe the English audio into text without any punctuation marks.'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# chat is left at its default (False), so inference() picks its
# speech-recognition system prompt; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])


In [None]:
%%time

# Chinese clip; the prompt (in Chinese) asks for a plain-text transcription
# with punctuation removed.
audio_path = './00_Dataset/BAC009S0764W0121.wav'
prompt = '请将这段中文语音转换为纯文本，去掉标点符号。'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Text-only reply; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])

In [None]:
%%time

# Russian clip and the transcription instruction sent with it.
audio_path = './00_Dataset/10000611681338527501.wav'
prompt = 'Transcribe the Russian audio into text without including any punctuation marks.'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Text-only reply; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])

In [None]:
%%time

# French clip and the transcription instruction sent with it.
audio_path = './00_Dataset/7105431834829365765.wav'
prompt = 'Transcribe the French audio into text without including any punctuation marks.'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Text-only reply; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])

### 03.02 Speech Translation (Input = Text, Audio | Output = Text)

In [None]:
%%time

# Same English clip as the first recognition example, but the prompt now asks
# for a Chinese translation instead of a transcript.
audio_path = './00_Dataset/1272-128104-0000.flac'
prompt = 'Listen to the provided English speech and produce a translation in Chinese text.'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Text-only reply; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])

### 03.03 Vocal Sound Classification (Input = Text, Audio | Output = Text)

In [None]:
%%time

# Non-speech vocal sound clip and the classification instruction.
audio_path = './00_Dataset/cough.wav'
prompt = 'Classify the given human vocal sound in English.'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Text-only reply; the first decoded string is printed.
response = inference(prompt = prompt, audio_path = audio_path)
print(response[0])

### 03.04 Chatting or Conversation (Input = Audio | Output = Text)

In [None]:
%%time

# Spoken request clip; no text instruction accompanies it.
audio_path = './00_Dataset/guess_age_gender.wav'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Empty text prompt with chat = True: inference() uses the conversational Qwen
# system prompt. audio_output = False keeps the reply text-only.
response = inference(prompt = '', audio_path = audio_path, chat = True, audio_output = False)
print(response[0])

In [None]:
%%time

# Spoken request clip; no text instruction accompanies it.
audio_path = './00_Dataset/translate_to_chinese.wav'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# Empty text prompt with chat = True: inference() uses the conversational Qwen
# system prompt. audio_output = False keeps the reply text-only.
response = inference(prompt = '', audio_path = audio_path, chat = True, audio_output = False)
print(response[0])

### 03.05 Chatting or Conversation (Input = Audio | Output = Text, Audio)

In [None]:
%%time

# Spoken request clip; no text instruction accompanies it.
audio_path = './00_Dataset/guess_age_gender.wav'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# audio_output = True makes inference() return (text, audio): the first
# decoded string is printed and the generated speech is played back at 24 kHz.
response = inference(prompt = '', audio_path = audio_path, chat = True, audio_output = True)
print(response[0][0])
display(Audio(response[1], rate = 24000))

In [None]:
%%time

# Spoken request clip; no text instruction accompanies it.
audio_path = './00_Dataset/translate_to_chinese.wav'

# Load the waveform (requesting a 16 kHz sample rate) and show a player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

# audio_output = True makes inference() return (text, audio): the first
# decoded string is printed and the generated speech is played back at 24 kHz.
response = inference(prompt = '', audio_path = audio_path, chat = True, audio_output = True)
print(response[0][0])
display(Audio(response[1], rate = 24000))

##