## Windows omni_env

## 00 Install & Import Libraries

In [None]:
import torch
import librosa
import time

from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, BitsAndBytesConfig
from qwen_omni_utils import process_mm_info
from io import BytesIO
from urllib.request import urlopen
from IPython.display import Audio

## 01 Import Model

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# bitsandbytes settings handed to `from_pretrained` as `quantization_config`:
# 4-bit NF4 weights, double quantization enabled, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Local directory holding the Qwen2.5-Omni-7B checkpoint.
model_path = './00_Model/Qwen2.5-Omni-7B'

# Load the model with the bitsandbytes quantization config defined earlier.
# `device_map = 'auto'` already places the weights on the available device(s),
# so the model is deliberately NOT moved again with `.to(device)`: calling
# `.to()` on a bitsandbytes-quantized / accelerate-dispatched model is either
# rejected or triggers a warning, and is never needed here.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
    # Optional: attn_implementation = 'flash_attention_2',
)

# Processor that builds the chat template and the multimodal model inputs.
processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
# @title inference function
def inference(audio_path, prompt, sys_prompt):
    """Run one text-only generation pass over a single audio clip.

    Builds a two-turn conversation (system prompt, then a user turn holding
    the text prompt and the audio reference), renders it with the processor's
    chat template, and feeds the result to the global ``model``.

    Args:
        audio_path: Audio reference placed in the user turn's ``'audio'`` entry.
        prompt: Text instruction placed in the user turn.
        sys_prompt: Content of the system turn.

    Returns:
        The result of ``processor.batch_decode`` on the generated ids;
        callers in this notebook read element ``[0]``.

    Side effects:
        Prints the rendered chat-template text, prefixed with ``text:``.
    """
    system_turn = {'role': 'system', 'content': sys_prompt}
    user_turn = {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': prompt},
            {'type': 'audio', 'audio': audio_path},
        ],
    }
    conversation = [system_turn, user_turn]

    # Render the conversation to a prompt string (not token ids) that ends
    # with the generation prompt.
    chat_text = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    print("text:", chat_text)

    # Collect the multimodal payloads referenced by the conversation.
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)

    model_inputs = processor(
        text=chat_text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors='pt',
        padding=True,
        use_audio_in_video=True,
    )
    # Move the inputs to the model's device, then cast them to its dtype.
    model_inputs = model_inputs.to(model.device).to(model.dtype)

    # Text output only (no audio), with sampling disabled for the thinker.
    generated_ids = model.generate(
        **model_inputs,
        use_audio_in_video=True,
        return_audio=False,
        thinker_do_sample=False,
    )

    return processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

## 03 Run Inference
### 03.01 Speech Recognition

In [None]:
# English speech recognition demo; the timer covers loading, preview and inference.
start_time = time.time()

prompt = 'Transcribe the English audio into text without any punctuation marks.'
audio_path = './00_Dataset/1272-128104-0000.flac'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr=16000)[0]
display(Audio(audio, rate=16000))

# Run the locally loaded HuggingFace model on the same file.
response = inference(
    audio_path,
    prompt=prompt,
    sys_prompt='You are a speech recognition model.',
)
print(response[0])

end_time = time.time()
print('=' * 100)
print(f"Execution Time: {end_time - start_time:.2f} seconds")

In [None]:
# Chinese speech recognition demo; the timer covers loading, preview and inference.
start_time = time.time()

prompt = '请将这段中文语音转换为纯文本，去掉标点符号。'
audio_path = './00_Dataset/BAC009S0764W0121.wav'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr=16000)[0]
display(Audio(audio, rate=16000))

# Run the locally loaded HuggingFace model on the same file.
response = inference(
    audio_path,
    prompt=prompt,
    sys_prompt='You are a speech recognition model.',
)
print(response[0])

end_time = time.time()
print('=' * 100)
print(f"Execution Time: {end_time - start_time:.2f} seconds")

In [None]:
# Russian speech recognition demo; the timer covers loading, preview and inference.
start_time = time.time()

prompt = 'Transcribe the Russian audio into text without including any punctuation marks.'
audio_path = './00_Dataset/10000611681338527501.wav'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr=16000)[0]
display(Audio(audio, rate=16000))

# Run the locally loaded HuggingFace model on the same file.
response = inference(
    audio_path,
    prompt=prompt,
    sys_prompt='You are a speech recognition model.',
)
print(response[0])

end_time = time.time()
print('=' * 100)
print(f"Execution Time: {end_time - start_time:.2f} seconds")

In [None]:
# French speech recognition demo; the timer covers loading, preview and inference.
start_time = time.time()

prompt = 'Transcribe the French audio into text without including any punctuation marks.'
audio_path = './00_Dataset/7105431834829365765.wav'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr=16000)[0]
display(Audio(audio, rate=16000))

# Run the locally loaded HuggingFace model on the same file.
response = inference(
    audio_path,
    prompt=prompt,
    sys_prompt='You are a speech recognition model.',
)
print(response[0])

end_time = time.time()
print('=' * 100)
print(f"Execution Time: {end_time - start_time:.2f} seconds")

### 03.02 Speech Translation

In [None]:
# English-to-Chinese speech translation demo; the timer covers loading,
# preview and inference.
start_time = time.time()  # Start timer
audio_path = './00_Dataset/1272-128104-0000.flac'
prompt = 'Listen to the provided English speech and produce a translation in Chinese text.'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

## Use a local HuggingFace model to inference.
# The system prompt names the translation task; it previously reused the
# speech-recognition system prompt from the ASR cells, contradicting the
# user prompt.
response = inference(audio_path, prompt = prompt, sys_prompt = 'You are a speech translation model.')
print(response[0])
end_time = time.time()  # End timer
print(100 * '=')
print(f"Execution Time: {end_time - start_time:.2f} seconds")

### 03.03 Vocal Sound Classification

In [None]:
# Vocal sound classification demo; the timer covers loading, preview and
# inference.
start_time = time.time()  # Start timer
audio_path = './00_Dataset/cough.wav'
prompt = 'Classify the given human vocal sound in English.'

# Load the clip with sr=16000 and show an inline audio player for it.
audio = librosa.load(audio_path, sr = 16000)[0]
display(Audio(audio, rate = 16000))

## Use a local HuggingFace model to inference.
# The system prompt names the classification task; it previously reused the
# speech-recognition system prompt from the ASR cells, contradicting the
# user prompt.
response = inference(audio_path, prompt = prompt, sys_prompt = 'You are a vocal sound classification model.')
print(response[0])
end_time = time.time()  # End timer
print(100 * '=')
print(f"Execution Time: {end_time - start_time:.2f} seconds")