# PyTorch Qwen2.5-Omni Test

Run the same Jarvis audio through the **official** Hugging Face (PyTorch) implementation of Qwen2.5-Omni.
If this works but MLX fails → the mlx-lm-omni port is broken.
If this also fails → the characteristics of the audio itself are the issue.

In [None]:
# Install dependencies (run once)
!pip install transformers accelerate soundfile librosa torch torchaudio

In [None]:
# Setup for the official Qwen2.5-Omni (transformers) test:
# declare the two recordings, print basic stats for each, and report the
# PyTorch backend that is available on this machine.
import librosa
import numpy as np
import torch
import time

# Audio files
JFK_AUDIO = 'test_audio 11.04.46\u202fPM.wav'
JARVIS_AUDIO = 'test_audio.wav'

# Load every recording at 16 kHz and show its duration and RMS level.
for path in (JFK_AUDIO, JARVIS_AUDIO):
    audio, sr = librosa.load(path, sr=16000)
    duration_s = len(audio) / sr
    rms = np.sqrt(np.mean(audio**2))
    print(f'{path}: {duration_s:.1f}s | RMS={rms:.4f}')

# Report whether the MPS backend is available, and the installed torch version.
device_name = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f'\nPyTorch device: {device_name}')
print(f'Torch version: {torch.__version__}')

In [None]:
# Load official Qwen2.5-Omni-3B from HuggingFace
# Note: This uses PyTorch (MPS on Apple Silicon), NOT MLX
#
# NOTE(review): released transformers versions export the full model as
# `Qwen2_5OmniForConditionalGeneration`; `Qwen2_5OmniModel` was the name used
# by earlier preview builds. Try the released name first and fall back, so the
# cell works with either install. The name `Qwen2_5OmniModel` stays bound in
# both cases. Confirm against the installed transformers version.
try:
    from transformers import Qwen2_5OmniForConditionalGeneration as Qwen2_5OmniModel
except ImportError:
    from transformers import Qwen2_5OmniModel
from transformers import Qwen2_5OmniProcessor

MODEL_ID = 'Qwen/Qwen2.5-Omni-3B'

print(f'Loading {MODEL_ID}...')
t0 = time.time()

# The processor builds the model inputs (chat template + audio features).
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
model = Qwen2_5OmniModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map='auto',  # Will use MPS on Apple Silicon
)

print(f'Loaded in {time.time()-t0:.1f}s')
# Device of the first parameter, as a quick check of where the weights landed.
print(f'Device: {next(model.parameters()).device}')

In [None]:
# Transcribe both files with the official model
import inspect

import librosa
import soundfile as sf  # no longer used for loading; import kept so `sf` stays defined

test_files = [
    (JFK_AUDIO, 'JFK (baseline)'),
    (JARVIS_AUDIO, 'Jarvis RAW'),
]

# NOTE(review): the processor's audio keyword appears to differ between
# transformers versions (`audios` in preview builds, `audio` in releases).
# Use `audio` only when the installed processor declares it; otherwise keep
# the original `audios` keyword.
_processor_params = inspect.signature(processor.__call__).parameters
AUDIO_KWARG = 'audio' if 'audio' in _processor_params else 'audios'

for audio_file, label in test_files:
    # librosa.load resamples to 16 kHz and down-mixes to mono, so the waveform
    # matches the sampling rate that is reported to the processor below.
    audio, sr = librosa.load(audio_file, sr=16000)
    rms = np.sqrt(np.mean(audio**2))
    print(f'\n[{label}] {len(audio)/sr:.1f}s | RMS={rms:.4f}')

    # Build conversation with audio
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {
            'role': 'user',
            'content': [
                {'type': 'audio', 'audio': audio_file},
                {'type': 'text', 'text': 'Transcribe this audio accurately, word for word.'},
            ],
        },
    ]

    # Process with the official processor
    text_input = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )

    # Feed the librosa waveform rather than re-reading the file with
    # soundfile: `sf.read` returns the file's native sample rate and channel
    # layout, which would contradict `sampling_rate=sr` whenever the recording
    # is not already 16 kHz mono.
    inputs = processor(
        text=text_input,
        sampling_rate=sr,
        return_tensors='pt',
        padding=True,
        **{AUDIO_KWARG: [audio]},
    )
    # Move to the model's device, then cast floating-point tensors (the audio
    # features) to the model dtype; integer tensors such as input_ids keep
    # their dtype.
    inputs = inputs.to(model.device).to(model.dtype)

    t0 = time.time()
    with torch.no_grad():
        # return_audio=False asks for text token ids only. Without it the full
        # Omni model may return a (text_ids, audio) tuple, which would break
        # the slicing below.
        output_ids = model.generate(**inputs, max_new_tokens=500, return_audio=False)
    elapsed = time.time() - t0

    # Decode only the generated part
    input_len = inputs['input_ids'].shape[1]
    result = processor.decode(output_ids[0][input_len:], skip_special_tokens=True)

    print(f'  Time: {elapsed:.2f}s')
    print(f'  Result: {result}')
    print('=' * 60)

In [None]:
# Sanity check with Whisper, independent of Qwen.
# If Whisper can transcribe the recordings, the audio is fine and the issue is
# Qwen-specific. OpenAI Whisper is tried first; when it cannot be imported,
# mlx-whisper is tried as a fallback.
try:
    import whisper
    print('Testing with OpenAI Whisper...')
    whisper_model = whisper.load_model('base')

    for wav_path, tag in ((JFK_AUDIO, 'JFK'), (JARVIS_AUDIO, 'Jarvis')):
        transcript = whisper_model.transcribe(wav_path)['text']
        # Only the first 200 characters are shown.
        print(f'\n[{tag}] {transcript[:200]}')

    # Drop the reference to the Whisper model once both files are done.
    del whisper_model
except ImportError:
    print('Whisper not installed. Run: pip install openai-whisper')
    print('Or try mlx-whisper: pip install mlx-whisper')

    # Try mlx-whisper as fallback
    try:
        import mlx_whisper
        print('\nTesting with mlx-whisper...')
        for wav_path, tag in ((JFK_AUDIO, 'JFK'), (JARVIS_AUDIO, 'Jarvis')):
            transcript = mlx_whisper.transcribe(wav_path)['text']
            print(f'[{tag}] {transcript[:200]}')
    except ImportError:
        print('mlx-whisper not installed either. Run: pip install mlx-whisper')

In [None]:
# Cleanup: drop the model/processor references and release cached memory.
import gc

# Remove each name independently. A single `del model, processor` stops at the
# first undefined name (leaving `processor` alive when `model` is missing), and
# the bare `except: pass` it needed would also hide unrelated errors.
for _name in ('model', 'processor'):
    globals().pop(_name, None)

gc.collect()
# Release memory cached by the MPS backend when it is available.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
print('Cleaned up.')