In [1]:
import torch
from IPython.display import Audio, display
from transformers import AutoModelForImageTextToText, AutoProcessor

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
BASE_GEMMA_MODEL_ID = "google/gemma-3n-E2B-it"
GEMMA_MODEL_ID = "bilguun/gemma-3n-E2B-it-audio-en-mn"

processor = AutoProcessor.from_pretrained(BASE_GEMMA_MODEL_ID, device_map="auto")
model = AutoModelForImageTextToText.from_pretrained(GEMMA_MODEL_ID, torch_dtype="auto", device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Transcribing

Here are examples of transcribing audio files into text in original language.

In [4]:
file_path = "./audio_samples/en1.wav"

display(Audio(file_path))

messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": file_path},
            {"type": "text", "text": "Transcribe this audio."},
        ]
    }
]

input_ids = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True, return_dict=True,
        return_tensors="pt",
)
input_ids = input_ids.to(model.device, dtype=model.dtype)

outputs = model.generate(**input_ids, max_new_tokens=128)

text = processor.batch_decode(
    outputs,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False
)

print(text[0])

W0807 02:34:34.485000 3677206 torch/_inductor/utils.py:1436] [0/0] Not enough SMs to use max_autotune_gemm mode


<bos><start_of_turn>user


<start_of_audio><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><au

In [5]:
file_path = "./audio_samples/mn2.wav"

display(Audio(file_path))

messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": file_path},
            {"type": "text", "text": "Transcribe this audio."},
        ]
    }
]

input_ids = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True, return_dict=True,
        return_tensors="pt",
)
input_ids = input_ids.to(model.device, dtype=model.dtype)

outputs = model.generate(**input_ids, max_new_tokens=128)

text = processor.batch_decode(
    outputs,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False
)

print(text[0])

<bos><start_of_turn>user


<start_of_audio><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><au

### Transcribe and then Translate

Here are examples of transcribing audio files into text in original language and translate into another.

In [15]:
file_path = "./audio_samples/en3.wav"

display(Audio(file_path))

messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": file_path},
            {"type": "text", "text": "Transcribe this audio into English and translate into Mongolian."},
        ]
    }
]

input_ids = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True, return_dict=True,
        return_tensors="pt",
)
input_ids = input_ids.to(model.device, dtype=model.dtype)

outputs = model.generate(**input_ids, max_new_tokens=128)

text = processor.batch_decode(
    outputs,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False
)

print(text[0])

<bos><start_of_turn>user


<start_of_audio><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><au

In [12]:
file_path = "./audio_samples/mn2.wav"

display(Audio(file_path))

messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": file_path},
            {"type": "text", "text": "Transcribe this audio into Mongolian and translate into English."},
        ]
    }
]

input_ids = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True, return_dict=True,
        return_tensors="pt",
)
input_ids = input_ids.to(model.device, dtype=model.dtype)

outputs = model.generate(**input_ids, max_new_tokens=128)

text = processor.batch_decode(
    outputs,
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False
)

print(text[0])

<bos><start_of_turn>user


<start_of_audio><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><audio_soft_token><au