In [10]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover

In [11]:
from datasets import load_dataset
from seamless_communication.inference import Translator
from jiwer import wer
from sacrebleu import corpus_bleu

In [12]:
# Initialize a Translator object with a multitask model, vocoder on the GPU.

model_name = "seamlessM4T_v2_large"
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"

translator = Translator(
    model_name,
    vocoder_name,
    device=torch.device("cuda"), # Changed from "cuda:0" to "cpu"
    dtype=torch.float16,
)

Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


In [13]:
from datasets import load_dataset
import torch
import torchaudio

references_all = {}
hypotheses_s2tt_all = {}
hypotheses_t2tt_all = {}
predicted_text_from_s2s_all = {}
predicted_text_from_t2s_all = {}

# Language mapping: SeamlessM4T tgt_lang тЖТ (FLEURS target_lang)
lang_map = {
    "tel": "te_in",   # Telugu
    "urd": "ur_pk"    # Urdu (Pakistan variant)
}

# Fixed source language
fleurs_src_lang = "hi_in"  # Hindi in FLEURS
sm4t_src_lang = "hin"      # Hindi in SeamlessM4T

def resample_to_16k(audio, orig_sr):
    # implement resampling logic here, e.g. torchaudio.transforms.Resample
    return torchaudio.transforms.Resample(orig_sr, 16000)(torch.tensor(audio)).numpy()

for sm4t_tgt_lang, fleurs_tgt_lang in lang_map.items():
    print("\n" + "="*50)
    print(f"ЁЯФ╣ Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*50)

    # Load FLEURS test splits
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    # Build mappings from sentence_id (the 'id' field) to record
    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}

    # Compute intersection of available sentence_ids
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))
    print(f"Found {len(common_ids)} parallel sentences")

    # Initialize lists
    references = []
    hypotheses_s2tt = []
    hypotheses_t2tt = []
    predicted_s2s = []
    predicted_t2s = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        # Extract source audio & text, and target text
        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        # Reference list expects list of references per example
        references.append([tgt_text])

        # Resample if needed
        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        # Save audio using sentence_id in filename
        audio_path = f"/tmp/input_{sentence_id}.wav"
        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # Speech-to-text translation (S2TT)
        s2tt_out, _ = translator.predict(
            input=audio_path,
            task_str="s2tt",
            tgt_lang=sm4t_tgt_lang
        )
        hypotheses_s2tt.append(s2tt_out[0])

        # Text-to-text translation (T2TT)
        t2tt_out, _ = translator.predict(
            input=src_text,
            task_str="t2tt",
            src_lang=sm4t_src_lang,
            tgt_lang=sm4t_tgt_lang
        )
        hypotheses_t2tt.append(t2tt_out[0])

        # Speech-to-speech translation (S2ST) + ASR on S2ST output
        _, s2s_audio_out = translator.predict(
            input=audio_path,
            task_str="s2st",
            tgt_lang=sm4t_tgt_lang
        )
        s2s_path = f"/tmp/s2s_{sentence_id}.wav"
        torchaudio.save(
            s2s_path,
            s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
            s2s_audio_out.sample_rate
        )
        s2s_asr_out, _ = translator.predict(
            input=s2s_path,
            task_str="asr",
            tgt_lang=sm4t_tgt_lang
        )
        predicted_s2s.append(s2s_asr_out[0])

        # Text-to-speech translation (T2ST) + ASR on T2ST output
        _, t2s_audio_out = translator.predict(
            input=src_text,
            task_str="t2st",
            src_lang=sm4t_src_lang,
            tgt_lang=sm4t_tgt_lang
        )
        t2s_path = f"/tmp/t2s_{sentence_id}.wav"
        torchaudio.save(
            t2s_path,
            t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
            t2s_audio_out.sample_rate
        )
        t2s_asr_out, _ = translator.predict(
            input=t2s_path,
            task_str="asr",
            tgt_lang=sm4t_tgt_lang
        )
        predicted_t2s.append(t2s_asr_out[0])

    # Store results
    references_all[sm4t_tgt_lang] = references
    hypotheses_s2tt_all[sm4t_tgt_lang] = hypotheses_s2tt
    hypotheses_t2tt_all[sm4t_tgt_lang] = hypotheses_t2tt
    predicted_text_from_s2s_all[sm4t_tgt_lang] = predicted_s2s
    predicted_text_from_t2s_all[sm4t_tgt_lang] = predicted_t2s

    print(f"тЬЕ [{sm4t_tgt_lang}] Processed {len(common_ids)} parallel sentences")



ЁЯФ╣ Processing Target Language: TEL (te_in)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Found 228 parallel sentences
тЬЕ [tel] Processed 228 parallel sentences

ЁЯФ╣ Processing Target Language: URD (ur_pk)
Found 176 parallel sentences
тЬЕ [urd] Processed 176 parallel sentences


In [21]:
print(references_all)

{'tel': [['р░░р▒Кр░ор░╛р░Вр░Яр░┐р░Хр▒Н р░╡р░╛р░жр░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир░┐р░░р▒Нр░гр░пр░╛р░▓р▒Нр░▓р▒Л р░кр▒Жр░жр▒Нр░ж р░Ер░Вр░╢р░В р░Хр░▓р░жр▒Б р░Чр▒Лр░др▒З р░лр░┐р░Ър▒Нр░Яр▒З р░╖р▒Нр░▓р▒Зр░Чр▒Жр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐'], ['р░Хр▒Лр░др░▓ р░Хр▒Лр░╕р░В р░Жр░пр░и р░Тр░Х р░╕р░Вр░Цр▒Нр░пр░ир▒Б р░Ер░ир▒Бр░Хр▒Лр░▓р▒Зр░жр▒Б р░Ер░╡р░┐ р░Ър▒Ир░ир░╛ р░Жр░░р▒Нр░зр░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р▒Б р░Ър▒Зр░пр░мр░бр░др░╛р░пр░┐'], ['р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░▓р▒Ар░Хр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Бр░▓ р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░╕р▒Вр░Ър░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐ р░Зр░жр░┐ 1967 р░ор░┐р░бр▒Нр░Ир░╕р▒Нр░Яр▒Н р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Бр░▓ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ р░Хр▒Лр░░р▒Бр░Хр▒Бр░Вр░жр░┐'], ['р░ор▒Ар░░р▒Б р░ор▒А р░╕р▒Нр░╡р░Вр░др░В р░Жр░▓р▒Лр░Ър░ир░▓р░др▒Л р░Хр░╛р░Хр▒Бр░Вр░бр░╛ р░кр▒Нр░

In [23]:
print(hypotheses_s2tt_all)

{'tel': [CString('р░░р▒Кр░ор░╛р░ир▒Ар░др░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир▒Ир░др░┐р░Хр░╡р░╛р░жр░В р░пр▒Кр░Хр▒Нр░Х р░кр▒Жр░жр▒Нр░ж р░ор▒Вр░▓р░Хр░В р░Йр░Вр░жр░┐ р░Зр░жр░┐ р░Чр▒Лр░ер▒З р░лр░┐р░Ър▒Нр░Яр▒З р░ор░░р░┐р░пр▒Б р░╕р▒Нр░▓р▒Зр░Чр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐'), CString('р░Зр░жр░┐ р░Ър▒Жр░кр▒Нр░др▒В р░╡р░╛р░░р▒Б р░Ър▒Ир░ир░╛ р░пр▒Кр░Хр▒Нр░Х р░Жр░░р▒Нр░ер░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р▒Б р░Ър▒Зр░пр░мр░бр░др░╛р░░р▒Б р░Жр░пр░и р░Хр▒Лр░д р░Хр▒Лр░╕р░В р░П р░Чр░гр░╛р░Вр░Хр░В р░ир░┐р░░р▒Нр░гр░пр░┐р░Вр░Ър░▓р▒Зр░жр▒Б'), CString('р░▓р▒Ар░Чр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Б р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░кр▒Нр░░р░╕р▒Нр░др░╛р░╡р░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐ р░Зр░жр░┐ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ р░кр░Вр░др▒Кр░ор▒Нр░ор░┐р░жр░┐ р░╡р░Вр░жр░▓ р░Ер░░р░╡р▒Ир░▓ р░ор░зр▒Нр░пр░кр▒Нр░░р░╛р░Ър▒Нр░п р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░┐р░╣

In [24]:
print(hypotheses_t2tt_all)

{'tel': [CString('р░░р▒Кр░ор░╛р░ир░┐р░пр░В р░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир░┐р░░р▒Нр░гр▒Ар░др░╡р░╛р░жр░В р░пр▒Кр░Хр▒Нр░Х р░кр▒Жр░жр▒Нр░ж р░Ер░Вр░╢р░В р░Йр░Вр░жр░┐ р░Ер░жр░┐ р░Чр▒Лр░ер▒З р░лр░┐р░Ър▒Нр░Яр▒З р░ор░░р░┐р░пр▒Б р░╖р▒Нр░▓р▒Жр░Чр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐'), CString('р░Ер░╡р░┐ р░Ър▒Ир░ир░╛ р░пр▒Кр░Хр▒Нр░Х р░Жр░░р▒Нр░ер░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р▒Б р░Ър▒Зр░пр░мр░бр░др░╛р░пр░ир░┐ р░Ър▒Жр░кр▒Нр░кр░бр░В р░╡р░╛р░░р▒Б р░др░Чр▒Нр░Чр░┐р░Вр░кр▒Б р░Хр▒Лр░╕р░В р░П р░Чр░гр░╛р░Вр░Хр░╛р░ир▒Нр░ир░┐ р░ир░┐р░░р▒Нр░гр░пр░┐р░Вр░Ър░▓р▒Зр░жр▒Б'), CString('р░▓р▒Ар░Хр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Б р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░╕р▒Вр░Ър░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐ р░Зр░жр░┐ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ 1967 р░пр▒Кр░Хр▒Нр░Х р░ор░зр▒Нр░пр░кр▒Нр░░р░╛р░Ър▒Нр░п р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Бр░▓ р░Жр░зр░╛р░

In [28]:
print(hypotheses_s2tt_all)

{'tel': [CString('р░░р▒Кр░ор░╛р░ир▒Ар░др░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир▒Ир░др░┐р░Хр░╡р░╛р░жр░В р░пр▒Кр░Хр▒Нр░Х р░кр▒Жр░жр▒Нр░ж р░ор▒Вр░▓р░Хр░В р░Йр░Вр░жр░┐ р░Зр░жр░┐ р░Чр▒Лр░ер▒З р░лр░┐р░Ър▒Нр░Яр▒З р░ор░░р░┐р░пр▒Б р░╕р▒Нр░▓р▒Зр░Чр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐'), CString('р░Зр░жр░┐ р░Ър▒Жр░кр▒Нр░др▒В р░╡р░╛р░░р▒Б р░Ър▒Ир░ир░╛ р░пр▒Кр░Хр▒Нр░Х р░Жр░░р▒Нр░ер░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р▒Б р░Ър▒Зр░пр░мр░бр░др░╛р░░р▒Б р░Жр░пр░и р░Хр▒Лр░д р░Хр▒Лр░╕р░В р░П р░Чр░гр░╛р░Вр░Хр░В р░ир░┐р░░р▒Нр░гр░пр░┐р░Вр░Ър░▓р▒Зр░жр▒Б'), CString('р░▓р▒Ар░Чр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Б р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░кр▒Нр░░р░╕р▒Нр░др░╛р░╡р░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐ р░Зр░жр░┐ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ р░кр░Вр░др▒Кр░ор▒Нр░ор░┐р░жр░┐ р░╡р░Вр░жр░▓ р░Ер░░р░╡р▒Ир░▓ р░ор░зр▒Нр░пр░кр▒Нр░░р░╛р░Ър▒Нр░п р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░┐р░╣

In [25]:
print(predicted_text_from_s2s_all)

{'tel': [CString('р░░р▒Кр░ор░╛р░ир▒Ар░др░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир▒Ир░др░┐р░Хр░╡р░╛р░жр░В р░пр▒Кр░Хр▒Нр░Х р░кр▒Жр░жр▒Нр░ж р░ор▒Вр░▓р░Хр░В р░Йр░Вр░жр░┐ р░Зр░жр░┐ р░Чр▒Лр░жр▒З р░лр░┐р░Яр▒Нр░Яр▒З р░ор░░р░┐р░пр▒Б р░╕р▒Нр░▓р▒Зр░Чр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐.'), CString('р░Зр░жр░┐ р░Ър▒Жр░кр▒Нр░др▒В р░╡р░╛р░░р▒Б р░Ър▒Ир░ир░╛ р░пр▒Кр░Хр▒Нр░Х р░Жр░░р▒Нр░ер░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р▒Б р░Ър▒Зр░пр░мр░бр░др░╛р░░р▒Б. р░Жр░пр░и р░Хр▒Лр░д р░Хр▒Лр░╕р░В р░П р░Чр░гр░╛р░Вр░Хр░В р░ир░┐р░░р▒Нр░гр░пр░┐р░Вр░Ър░▓р▒Зр░жр▒Б.'), CString('р░▓р▒Ар░Чр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Б р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░кр▒Нр░░р░╕р▒Нр░др░╛р░╡р░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐. р░Зр░жр░┐ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ р░кр░Вр░др▒Кр░ор▒Нр░ор░┐р░жр░┐ р░╡р░Вр░жр░▓ р░Ер░░р░╡р▒Ир░▓ р░ор░зр▒Нр░пр░кр▒Нр░░р░╛р░Ър▒Нр░п р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░

In [26]:
print(predicted_text_from_t2s_all)

{'tel': [CString('р░░р▒Кр░ор▒Зр░ир░┐р░пр░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир░┐р░░р▒Нр░ор░┐р░др░╡р░╛р░жр░В р░пр▒Кр░Хр▒Нр░Х р░кр▒Жр░жр▒Нр░ж р░Ер░Вр░╢р░В р░Йр░Вр░жр░┐ р░Ер░жр░┐ р░Чр▒Лр░ер▒З р░лр░┐р░Яр▒Нр░Яр▒З р░ор░░р░┐р░пр▒Б р░╖р▒Нр░▓р▒Зр░Чр░▓р▒Н р░╡р░Вр░Яр░┐ р░░р░Ър░пр░┐р░др░▓ р░ир▒Бр░Вр░бр░┐ р░др▒Ар░╕р▒Бр░Хр▒Лр░мр░бр░┐р░Вр░жр░┐.'), CString('р░Ер░╡р░┐ р░Ър▒Ир░ир░╛ р░пр▒Кр░Хр▒Нр░Х р░Жр░░р▒Нр░ер░┐р░Х р░Йр░др▒Нр░кр░др▒Нр░др░┐ р░Жр░зр░╛р░░р░Вр░Чр░╛ р░др░пр░╛р░░р░╡р▒Бр░др░╛р░пр░ир░┐ р░Ър▒Жр░кр▒Нр░кр░бр░В р░╡р░╛р░░р▒Б р░др░Чр▒Нр░Чр░┐р░Вр░кр▒Б р░Хр▒Лр░╕р░В р░П р░Чр░гр░╛р░Вр░Хр░╛р░ир▒Нр░ир░┐ р░ир░┐р░░р▒Нр░гр░пр░┐р░Вр░Ър░▓р▒Зр░жр▒Б.'), CString('р░▓р▒Ар░Чр▒Н р░кр▒Нр░░р░Хр░╛р░░р░В р░бр░╛р░Хр▒Нр░пр▒Бр░ор▒Жр░Вр░Яр▒Н р░╕р░░р░┐р░╣р░жр▒Нр░жр▒Б р░╡р░┐р░╡р░╛р░жр░╛р░ир▒Нр░ир░┐ р░╕р▒Вр░Ър░┐р░╕р▒Нр░др▒Бр░Вр░жр░┐ р░Зр░жр░┐ р░кр░╛р░▓р░╕р▒Нр░др▒Ар░ир░╛ р░ир▒Ир░др░┐р░Х р░ир░┐р░░р▒Нр░ор▒Вр░▓р░и р░пр▒Кр░Хр▒Нр░Х р░ор░зр▒Нр░пр░кр▒Нр░░р░╛р░Ър▒Нр░п р░пр▒Бр░жр▒Нр░зр░╛р░ир░┐р░Хр░┐ р░ор▒Бр░Вр░жр▒Б р░╕р░░р░┐р░╣р░ж

In [None]:
from datasets import load_dataset
import torch
import torchaudio
import pandas as pd

# Dictionaries to store all outputs
references_all = {}
hypotheses_s2tt_all = {}
hypotheses_t2tt_all = {}
predicted_text_from_s2s_all = {}
predicted_text_from_t2s_all = {}

# Language mapping: SeamlessM4T tgt_lang тЖТ (FLEURS target_lang)
lang_map = {
    "tel": "te_in",   # Telugu
    "urd": "ur_pk"    # Urdu (Pakistan)
}

# Fixed source language
fleurs_src_lang = "hi_in"  # Hindi in FLEURS
sm4t_src_lang = "hin"      # Hindi in SeamlessM4T

# Resampling function to 16 kHz
def resample_to_16k(audio, orig_sr):
    return torchaudio.transforms.Resample(orig_sr, 16000)(torch.tensor(audio)).numpy()

# Loop through each target language
for sm4t_tgt_lang, fleurs_tgt_lang in lang_map.items():
    print("\n" + "="*50)
    print(f"ЁЯФ╣ Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*50)

    # Load FLEURS test splits
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    # Build mapping from sentence ID to record
    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))
    print(f"Found {len(common_ids)} parallel sentences")

    # Initialize lists
    references = []
    hypotheses_s2tt = []
    hypotheses_t2tt = []
    predicted_s2s = []
    predicted_t2s = []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        # Resample if needed
        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        # Save source audio temporarily
        audio_path = f"/tmp/input_{sentence_id}.wav"
        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # Speech-to-text translation (S2TT)
        s2tt_out, _ = translator.predict(
            input=audio_path,
            task_str="s2tt",
            tgt_lang=sm4t_tgt_lang
        )
        hypotheses_s2tt.append(s2tt_out[0])

        # Text-to-text translation (T2TT)
        t2tt_out, _ = translator.predict(
            input=src_text,
            task_str="t2tt",
            src_lang=sm4t_src_lang,
            tgt_lang=sm4t_tgt_lang
        )
        hypotheses_t2tt.append(t2tt_out[0])

        # Speech-to-speech translation (S2ST) + ASR
        _, s2s_audio_out = translator.predict(
            input=audio_path,
            task_str="s2st",
            tgt_lang=sm4t_tgt_lang
        )
        s2s_path = f"/tmp/s2s_{sentence_id}.wav"
        torchaudio.save(
            s2s_path,
            s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
            s2s_audio_out.sample_rate
        )
        s2s_asr_out, _ = translator.predict(
            input=s2s_path,
            task_str="asr",
            tgt_lang=sm4t_tgt_lang
        )
        predicted_s2s.append(s2s_asr_out[0])

        # Text-to-speech translation (T2ST) + ASR
        _, t2s_audio_out = translator.predict(
            input=src_text,
            task_str="t2st",
            src_lang=sm4t_src_lang,
            tgt_lang=sm4t_tgt_lang
        )
        t2s_path = f"/tmp/t2s_{sentence_id}.wav"
        torchaudio.save(
            t2s_path,
            t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
            t2s_audio_out.sample_rate
        )
        t2s_asr_out, _ = translator.predict(
            input=t2s_path,
            task_str="asr",
            tgt_lang=sm4t_tgt_lang
        )
        predicted_t2s.append(t2s_asr_out[0])

    # Store results per language
    references_all[sm4t_tgt_lang] = references
    hypotheses_s2tt_all[sm4t_tgt_lang] = hypotheses_s2tt
    hypotheses_t2tt_all[sm4t_tgt_lang] = hypotheses_t2tt
    predicted_text_from_s2s_all[sm4t_tgt_lang] = predicted_s2s
    predicted_text_from_t2s_all[sm4t_tgt_lang] = predicted_t2s

    # Build final DataFrame
    df = pd.DataFrame({
        "source_text": source_texts,
        "reference_text": [r[0] for r in references],
        "S2TT_prediction": hypotheses_s2tt,
        "T2TT_prediction": hypotheses_t2tt,
        "S2ST_ASR": predicted_s2s,
        "T2ST_ASR": predicted_t2s
    })

    print(f"тЬЕ [{sm4t_tgt_lang}] Processed {len(common_ids)} parallel sentences")
    print(df.head(5))



ЁЯФ╣ Processing Target Language: TEL (te_in)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Found 228 parallel sentences
тЬЕ [tel] Processed 228 parallel sentences
                                         source_text  \
0  рд░реВрдорд╛рдирд┐рдпрдд рдореЗрдВ рд╕рд╛рдВрд╕реНрдХреГрддрд┐рдХ рдирд┐рдпрддрд┐рд╡рд╛рдж рдХрд╛ рдмрдбрд╝рд╛ рддрддреНрд╡ ...   
1  рдпрд╣ рдХрд╣рддреЗ рд╣реБрдП рдХрд┐ рд╡реЗ рдЪреАрди рдХреЗ рдЖрд░реНрдерд┐рдХ рдЙрддреНрдкрд╛рджрди рдХреЗ рдЖрдзрд╛...   
2  рд▓реАрдХ рдХреЗ рдЕрдиреБрд╕рд╛рд░ рджрд╕реНрддрд╛рд╡реЗрдЬрд╝ рд╕реАрдорд╛ рд╡рд┐рд╡рд╛рдж рдХреЛ рд╕рдВрджрд░реНрднрд┐рдд...   
3  рдЕрдкрдиреА рд╕рд░рдХрд╛рд░ рдХреЗ рдЕрд▓рд╛рд╡рд╛ рдЖрдк рдЕрдиреНрдп рджреЗрд╢реЛрдВ рдХреА рд╕рд░рдХрд╛рд░реЛрдВ рдХ...   
4  рд╡рд╛рдЗрд▓реНрдб рдХрд╛рд░реНрдб рдЦрд░реАрдж рд▓реЗрдирд╛ рдлрд╝рд╛рдпрджреЗрдордВрдж рднреА рд╣реЛ рд╕рдХрддрд╛ рд╣реИ...   

                                      reference_text  \
0  р░░р▒Кр░ор░╛р░Вр░Яр░┐р░Хр▒Н р░╡р░╛р░жр░Вр░▓р▒Л р░╕р░╛р░Вр░╕р▒Нр░Хр▒Гр░др░┐р░Х р░ир░┐р░░р▒Нр░гр░пр░╛р░▓р▒Нр░▓р▒Л р░кр▒Жр░жр▒Нр░ж ...   
1  р░Хр▒Лр░др░▓ р░Х

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Found 176 parallel sentences
тЬЕ [urd] Processed 176 parallel sentences
                                         source_text  \
0  рд░реВрдорд╛рдирд┐рдпрдд рдореЗрдВ рд╕рд╛рдВрд╕реНрдХреГрддрд┐рдХ рдирд┐рдпрддрд┐рд╡рд╛рдж рдХрд╛ рдмрдбрд╝рд╛ рддрддреНрд╡ ...   
1  рдЕрдкрдиреА рд╕рд░рдХрд╛рд░ рдХреЗ рдЕрд▓рд╛рд╡рд╛ рдЖрдк рдЕрдиреНрдп рджреЗрд╢реЛрдВ рдХреА рд╕рд░рдХрд╛рд░реЛрдВ рдХ...   
2  рдЖрдо рд╢рдмреНрджреЛрдВ рдореЗрдВ рдЬрдм рдкреНрд░рдмрдВрдзрдХ рдЕрдкрдиреЗ рдкреВрд░реНрд╡ рд╕рд╣рдпреЛрдЧрд┐рдпреЛрдВ ...   
3  рд╡рд╛рдЗрд▓реНрдб рдХрд╛рд░реНрдб рдЦрд░реАрдж рд▓реЗрдирд╛ рдлрд╝рд╛рдпрджреЗрдордВрдж рднреА рд╣реЛ рд╕рдХрддрд╛ рд╣реИ...   
4  рд╕рд╛рд╡рдзрд╛рди рд░рд╣реЗрдВ рдХрд┐ рдХрдкрдбрд╝реЗ рдХреЛ рдмрд╣реБрдд рдЧрд░реНрдо рди рд╣реЛрдиреЗ рджреЗрдВ рдЬ...   

                                      reference_text  \
0  ╪▒┘И┘Е╪з┘Ж┘И█М╪к ┘Е█М┌║ ╪л┘В╪з┘Б╪к█М ╪╣╪▓┘Е ┌й╪з ╪з█М┌й ╪и█Б╪к ╪и┌С╪з ╪╣┘Ж╪╡╪▒ █Б┘И...   
1  ╪в┘╛ ╪з┘╛┘Ж█М ╪н┌й┘И┘Е╪к ┌й█Т ╪╣┘Д╪з┘И█Б ╪п┘И╪│╪▒█М ╪н┌й┘И┘Е╪к┘

: 

In [None]:
import pandas as pd

# Pick a target language, e.g., Telugu
lang_code = "tel"
# Build mapping from sentence ID to record
src_by_id = {item["id"]: item for item in src_dataset}
tgt_by_id = {item["id"]: item for item in tgt_dataset}
common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

# Extract lists
source_texts = []
reference_texts = []
s2tt_hyps = hypotheses_s2tt_all[lang_code]  # speech-to-text predictions
t2tt_hyps = hypotheses_t2tt_all[lang_code]  # text-to-text predictions
s2st_asr = predicted_text_from_s2s_all[lang_code]  # speech-to-speech ASR
t2st_asr = predicted_text_from_t2s_all[lang_code]  # text-to-speech ASR

for sentence_id in common_ids:
    src = src_by_id[sentence_id]
    tgt = tgt_by_id[sentence_id]

    source_texts.append(src["transcription"])
    reference_texts.append(tgt["transcription"])

# Create DataFrame
df = pd.DataFrame({
    "source_text": source_texts,
    "reference_text": reference_texts,
    "S2TT_prediction": s2tt_hyps,
    "T2TT_prediction": t2tt_hyps,
    "S2ST_ASR": s2st_asr,
    "T2ST_ASR": t2st_asr
})

# Show first 5 rows
print(df.head(5))


In [20]:
from sacrebleu import corpus_bleu, corpus_chrf
from jiwer import wer
from pprint import pprint

# Container for all language metrics
all_metrics = {}

for lang_code in references_all.keys():
    references = references_all[lang_code]
    hypotheses_s2tt = hypotheses_s2tt_all[lang_code]
    hypotheses_t2tt = hypotheses_t2tt_all[lang_code]
    predicted_text_from_s2s = predicted_text_from_s2s_all[lang_code]
    predicted_text_from_t2s = predicted_text_from_t2s_all[lang_code]

    # --- Normalize hypotheses ---
    hypotheses_s2tt = [str(h).strip() for h in hypotheses_s2tt]
    hypotheses_t2tt = [str(h).strip() for h in hypotheses_t2tt]
    predicted_text_from_s2s = [str(h).strip() for h in predicted_text_from_s2s]
    predicted_text_from_t2s = [str(h).strip() for h in predicted_text_from_t2s]

    # --- Normalize references (SacreBLEU expects list-of-lists) ---
    references_norm = [[str(r[0]).strip()] for r in references]
    multi_references = list(zip(*references_norm))  # shape: (num_refs, num_sentences)

    metrics = {}

    # --- BLEU / chrF2++ ---
    metrics["S2TT_SacreBLEU"] = corpus_bleu(hypotheses_s2tt, multi_references).score
    metrics["T2TT_chrF2++"] = corpus_chrf(hypotheses_t2tt, multi_references).score
    metrics["S2ST_ASR_BLEU"] = corpus_bleu(predicted_text_from_s2s, multi_references).score
    metrics["T2ST_ASR_BLEU"] = corpus_bleu(predicted_text_from_t2s, multi_references).score

    # --- WER (jiwer) ---
    metrics["S2ST_ASR_WER"] = sum(
        wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_text_from_s2s)
    ) / len(references_norm)

    metrics["T2ST_ASR_WER"] = sum(
        wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_text_from_t2s)
    ) / len(references_norm)

    metrics["S2TT_WER"] = sum(
        wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_s2tt)
    ) / len(references_norm)

    metrics["T2TT_WER"] = sum(
        wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_t2tt)
    ) / len(references_norm)

    # Print + save
    print(f"\n=== Metrics for target language: {lang_code.upper()} ===")
    pprint(metrics)

    all_metrics[lang_code] = metrics



=== Metrics for target language: TEL ===
{'S2ST_ASR_BLEU': 6.703186032040358,
 'S2ST_ASR_WER': 0.9207014212978243,
 'S2TT_SacreBLEU': 6.791687079616478,
 'S2TT_WER': 0.9214235330398954,
 'T2ST_ASR_BLEU': 9.072877840222192,
 'T2ST_ASR_WER': 0.8816193817330296,
 'T2TT_WER': 0.85229959325962,
 'T2TT_chrF2++': 49.22213115874377}

=== Metrics for target language: URD ===
{'S2ST_ASR_BLEU': 14.802831740006779,
 'S2ST_ASR_WER': 0.7457257921461948,
 'S2TT_SacreBLEU': 14.627013884604041,
 'S2TT_WER': 0.7461084350137203,
 'T2ST_ASR_BLEU': 16.53967566437534,
 'T2ST_ASR_WER': 0.7231635694724213,
 'T2TT_WER': 0.7154703562371476,
 'T2TT_chrF2++': 44.07527441374375}


In [1]:
import torch
print(f"\nTorch version: {torch.__version__}")


Torch version: 2.5.1+cu121


In [2]:
%pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch==2.2.2
  Downloading https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp311-cp311-linux_x86_64.whl (757.3 MB)
[2K     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ[0m [32m757.3/757.3 MB[0m [31m32.0 MB/s[0m  [33m0:00:24[0m:00:01[0m00:01[0m
[?25hCollecting torchvision==0.17.2
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.17.2%2Bcu121-cp311-cp311-linux_x86_64.whl (7.0 MB)
[2K     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ[0m [32m7.0/7.0 MB[0m [31m27.4 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hCollecting torchaudio==2.2.2
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.2.2%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
[2K     [90mтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБтФБ