In [5]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover
import warnings
warnings.filterwarnings("ignore")

In [6]:
from datasets import load_dataset
from seamless_communication.inference import Translator
from jiwer import wer
from sacrebleu import corpus_bleu

2025-09-03 09:00:44,390 INFO -- datasets: PyTorch version 2.2.2+cu121 available.


In [7]:
# Initialize a Translator object with a multitask model, vocoder on the GPU.

model_name = "seamlessM4T_v2_large"
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"

translator = Translator(
    model_name,
    vocoder_name,
    device=torch.device("cuda"), # Changed from "cuda:0" to "cpu"
    dtype=torch.float16,
)

Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


In [8]:
from datasets import load_dataset
import torch
import torchaudio
def resample_to_16k(audio, orig_sr):
    # implement resampling logic here, e.g. torchaudio.transforms.Resample
    return torchaudio.transforms.Resample(orig_sr, 16000)(torch.tensor(audio)).numpy()

In [9]:
import whisper
import re

# Load Whisper-Large once (outside function, so it’s not reloaded every call)
whisper_model = whisper.load_model("large", device="cuda")

In [10]:
import os
os.environ["PATH"] = "/home/aj/Bhavna/ffmpeg_bin:" + os.environ["PATH"]


In [11]:
import whisper
whisper.audio.FFMPEG_PATH = "/home/aj/Bhavna/ffmpeg_bin/ffmpeg"

In [12]:
import os
import pandas as pd
from datasets import load_dataset
import ffmpeg

In [13]:
#%pip install seamless_communication

from seamless_communication.inference import SequenceGeneratorOptions

In [14]:
text_opts = SequenceGeneratorOptions(
    beam_size=5
    #temperature=0.7,   # 0 for deterministic, >0 adds diversity
)

# Beam search for unit hypotheses
unit_opts = SequenceGeneratorOptions(
    beam_size=5
)

In [15]:
import string

def normalize(text):
    return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

In [None]:
#from whisper.normalizers.basic import BasicTextNormalizer
#normalize_s= BasicTextNormalizer()

In [28]:
def save_dataframe(df, lang, out_dir="/scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs"):
    import os
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{lang}_results.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved results to {out_path}")

def run_translation_for_language(sm4t_src_lang,fleurs_src_lang,sm4t_tgt_lang, fleurs_tgt_lang, full_tasks=True):
    """
    Run translation pipeline for one language.
    full_tasks=True → Run all 4 tasks
    full_tasks=False → Run only S2TT, T2TT
    """
    print("\n" + "="*60)
    print(f"🔹 Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*60)

    # Load datasets
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

    print(f"Found {len(common_ids)} parallel sentences")

    references, hypotheses_s2tt, hypotheses_t2tt = [], [], []
    predicted_s2s, predicted_t2s = [], []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/input_audios_of_eng"
        lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
        os.makedirs(lang_dir, exist_ok=True)
        audio_path = os.path.join(lang_dir, f"input_{sm4t_tgt_lang}_{sentence_id}.wav")

        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # --- S2TT ---
        s2tt_out, _ = translator.predict(
            input=audio_path, task_str="s2tt", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_s2tt.append(normalize(str(s2tt_out[0])))

        # --- T2TT ---
        t2tt_out, _ = translator.predict(
            input=src_text, task_str="t2tt", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_t2tt.append(normalize(str(t2tt_out[0])))

        if full_tasks:
            # --- S2ST + Whisper ASR ---
            _, s2s_audio_out = translator.predict(
                input=audio_path, task_str="s2st", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/s2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
            os.makedirs(lang_dir, exist_ok=True)
            s2s_path= os.path.join(lang_dir, f"s2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                s2s_path,
                s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                s2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            s2s_result = whisper_model.transcribe(
                audio=s2s_path,
                task="transcribe",
                temperature=0.0,   # greedy, deterministic
                beam_size=None
            )
            predicted_s2s.append(normalize(s2s_result["text"]))

            # --- T2ST + Whisper ASR ---
            _, t2s_audio_out = translator.predict(
                input=src_text, task_str="t2st", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/t2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
            os.makedirs(lang_dir, exist_ok=True)
            t2s_path = os.path.join(lang_dir, f"t2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                t2s_path,
                t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                t2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            t2s_result = whisper_model.transcribe(
                audio=t2s_path,
                task="transcribe",
                temperature=0.0,
                beam_size=None
            )
            predicted_t2s.append(normalize(t2s_result["text"]))

    # Build dataframe
    data = {
        "source_text": source_texts,
        "reference_text": [r[0] for r in references],
        "S2TT_prediction": hypotheses_s2tt,
        "T2TT_prediction": hypotheses_t2tt
    }
    if full_tasks:
        data["S2ST_ASR"] = predicted_s2s
        data["T2ST_ASR"] = predicted_t2s

    df = pd.DataFrame(data)

    save_dataframe(df, sm4t_tgt_lang)

    return references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s

In [29]:
from sacrebleu import corpus_bleu, corpus_chrf
from jiwer import wer

def compute_metrics(lang_code, references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s):
    print(f"\nComputing metrics for language: {lang_code.upper()}")
    # Normalize hyps
    hypotheses_s2tt = [" ".join(h) if isinstance(h, list) else str(h) for h in hypotheses_s2tt]
    hypotheses_t2tt = [" ".join(h) if isinstance(h, list) else str(h) for h in hypotheses_t2tt]
    predicted_s2s   = [" ".join(h) if isinstance(h, list) else str(h) for h in predicted_s2s]
    predicted_t2s   = [" ".join(h) if isinstance(h, list) else str(h) for h in predicted_t2s]

    # Normalize refs
    references_norm = []
    for refset in references:
        references_norm.append([" ".join(r) if isinstance(r, list) else str(r) for r in refset])
    multi_references = list(zip(*references_norm))  # multiple references per sentence

    metrics = {}
    # Text-to-text & speech-to-text
    metrics["S2TT_SacreBLEU"] = corpus_bleu(hypotheses_s2tt, multi_references).score
    metrics["T2TT_chrF2++"]   = corpus_chrf(hypotheses_t2tt, multi_references).score
    metrics["S2TT_WER"]       = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_s2tt)) / len(references_norm)
    metrics["T2TT_WER"]       = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_t2tt)) / len(references_norm)

    # Speech-to-speech (decoded to text for scoring)
    if predicted_s2s:
        metrics["S2ST_ASR_WER"]  = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_s2s)) / len(references_norm)
        metrics["S2ST_ASR_BLEU"] = corpus_bleu(predicted_s2s, multi_references).score

    if predicted_t2s:
        metrics["T2ST_ASR_WER"]  = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_t2s)) / len(references_norm)
        metrics["T2ST_ASR_BLEU"] = corpus_bleu(predicted_t2s, multi_references).score
        
    return metrics


**eng-X metrics**

In [32]:
refs, s2tt, t2tt, s2s, t2s=run_translation_for_language("eng", "en_us","tel","te_in",full_tasks=True)
compute_metrics("tel", refs, s2tt, t2tt, s2s, t2s)


🔹 Processing Target Language: TEL (te_in)
Found 302 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs/tel_results.csv

Computing metrics for language: TEL


{'S2TT_SacreBLEU': 19.768624174201648,
 'T2TT_chrF2++': 57.88079557608407,
 'S2TT_WER': 0.6862416954139396,
 'T2TT_WER': 0.6990720558165725,
 'S2ST_ASR_WER': 0.9101295785795336,
 'S2ST_ASR_BLEU': 1.845789027182998,
 'T2ST_ASR_WER': 0.915643101194562,
 'T2ST_ASR_BLEU': 1.9749424059894964}

In [33]:
refs3, s2tt3, t2tt3, s2s3, t2s3=run_translation_for_language("eng", "en_us","hin","hi_in",full_tasks=True)
compute_metrics("hin", refs3, s2tt3, t2tt3, s2s3, t2s3)


🔹 Processing Target Language: HIN (hi_in)
Found 265 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs/hin_results.csv

Computing metrics for language: HIN


{'S2TT_SacreBLEU': 31.153900191619226,
 'T2TT_chrF2++': 57.8105727684805,
 'S2TT_WER': 0.5858850193121427,
 'T2TT_WER': 0.5608586884988706,
 'S2ST_ASR_WER': 0.6807390948093932,
 'S2ST_ASR_BLEU': 18.34571223405713,
 'T2ST_ASR_WER': 0.6835818748195117,
 'T2ST_ASR_BLEU': 17.849980255040418}

In [34]:
refs4, s2tt4, t2tt4, s2s4, t2s4=run_translation_for_language("eng", "en_us","urd","ur_pk",full_tasks=True)
compute_metrics("urd", refs4, s2tt4, t2tt4, s2s4, t2s4)


🔹 Processing Target Language: URD (ur_pk)
Found 230 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs/urd_results.csv

Computing metrics for language: URD


{'S2TT_SacreBLEU': 20.950129624567204,
 'T2TT_chrF2++': 48.24124829628641,
 'S2TT_WER': 0.6851889290883566,
 'T2TT_WER': 0.6800507771805084,
 'S2ST_ASR_WER': 0.6800128376390979,
 'S2ST_ASR_BLEU': 20.440529931026514,
 'T2ST_ASR_WER': 0.6907736706233382,
 'T2ST_ASR_BLEU': 19.420914598331493}

In [22]:
refs1, s2tt1, t2tt1, s2s1, t2s1=run_translation_for_language("eng", "en_us","tam","ta_in",full_tasks=False)
compute_metrics("tam", refs1, s2tt1, t2tt1, s2s1, t2s1)


🔹 Processing Target Language: TAM (ta_in)
Found 336 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs/tam_results.csv

Computing metrics for language: TAM


{'S2TT_SacreBLEU': 14.274613106193202,
 'T2TT_chrF2++': 56.90018770799228,
 'S2TT_WER': 0.7242391794911189,
 'T2TT_WER': 0.7244738057602584}

In [25]:
refs2, s2tt2, t2tt2, s2s2, t2s2=run_translation_for_language("eng", "en_us","ory","or_in",full_tasks=False)
compute_metrics("ory", refs2, s2tt2, t2tt2, s2s2, t2s2)


🔹 Processing Target Language: ORY (or_in)
Found 334 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/eng-X/outputs/ory_results.csv

Computing metrics for language: ORY


{'S2TT_SacreBLEU': 14.299858776616398,
 'T2TT_chrF2++': 44.936253311409345,
 'S2TT_WER': 0.7450997409593849,
 'T2TT_WER': 0.8251359523183592}

**X-eng metrics**

In [35]:
def save_dataframe(df, lang, out_dir="/scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs"):
    import os
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{lang}_results.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved results to {out_path}")

In [None]:
def run_translation_for_language_eng(sm4t_src_lang,fleurs_src_lang,sm4t_tgt_lang, fleurs_tgt_lang, full_tasks=True):
    """
    Run translation pipeline for one language.
    full_tasks=True → Run all 4 tasks
    full_tasks=False → Run only S2TT, T2TT
    """
    print("\n" + "="*60)
    print(f"🔹 Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*60)

    # Load datasets
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

    print(f"Found {len(common_ids)} parallel sentences")

    references, hypotheses_s2tt, hypotheses_t2tt = [], [], []
    predicted_s2s, predicted_t2s = [], []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/input_audios_of_langs"
        lang_dir = os.path.join(base_dir, sm4t_src_lang)
        os.makedirs(lang_dir, exist_ok=True)
        audio_path = os.path.join(lang_dir, f"input_{sm4t_src_lang}_{sentence_id}.wav")

        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # --- S2TT ---
        s2tt_out, _ = translator.predict(
            input=audio_path, task_str="s2tt", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_s2tt.append(normalize(str(s2tt_out[0])))

        # --- T2TT ---
        t2tt_out, _ = translator.predict(
            input=src_text, task_str="t2tt", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_t2tt.append(normalize(str(t2tt_out[0])))

        if full_tasks:
            # --- S2ST + Whisper ASR ---
            _, s2s_audio_out = translator.predict(
                input=audio_path, task_str="s2st",tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/s2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_src_lang)
            os.makedirs(lang_dir, exist_ok=True)
            s2s_path= os.path.join(lang_dir, f"s2s_{sm4t_src_lang}_{sentence_id}.wav")

            torchaudio.save(
                s2s_path,
                s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                s2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            s2s_result = whisper_model.transcribe(
                audio=s2s_path,
                task="transcribe",
                temperature=0.0,   # greedy, deterministic
                beam_size=None
            )
            predicted_s2s.append(normalize(s2s_result["text"]))

            # --- T2ST + Whisper ASR ---
            _, t2s_audio_out = translator.predict(
                input=src_text, task_str="t2st", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/t2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_src_lang)
            os.makedirs(lang_dir, exist_ok=True)
            t2s_path = os.path.join(lang_dir, f"t2s_{sm4t_src_lang}_{sentence_id}.wav")

            torchaudio.save(
                t2s_path,
                t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                t2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            t2s_result = whisper_model.transcribe(
                audio=t2s_path,
                task="transcribe",
                temperature=0.0,
                beam_size=None
            )
            predicted_t2s.append(normalize(t2s_result["text"]))

    # Build dataframe
    data = {
        "source_text": source_texts,
        "reference_text": [r[0] for r in references],
        "S2TT_prediction": hypotheses_s2tt,
        "T2TT_prediction": hypotheses_t2tt
    }
    if full_tasks:
        data["S2ST_ASR"] = predicted_s2s
        data["T2ST_ASR"] = predicted_t2s

    df = pd.DataFrame(data)

    save_dataframe(df, sm4t_src_lang)

    return references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s

In [37]:
refs5, s2tt5, t2tt5, s2s5, t2s5=run_translation_for_language_eng("hin", "hi_in","eng","en_us",full_tasks=True)
compute_metrics("hin->eng", refs5, s2tt5, t2tt5, s2s5, t2s5)


🔹 Processing Target Language: ENG (en_us)
Found 265 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs/hin_results.csv

Computing metrics for language: HIN->ENG


{'S2TT_SacreBLEU': 30.100699940404887,
 'T2TT_chrF2++': 64.43126015394755,
 'S2TT_WER': 0.5769781283054644,
 'T2TT_WER': 0.4829589650612814,
 'S2ST_ASR_WER': 0.5744392525630683,
 'S2ST_ASR_BLEU': 29.80706185581536,
 'T2ST_ASR_WER': 0.5062129707006541,
 'T2ST_ASR_BLEU': 35.03781892627364}

In [38]:
refs6, s2tt6, t2tt6, s2s6, t2s6=run_translation_for_language_eng("tel", "te_in","eng","en_us",full_tasks=True)
compute_metrics("tel->eng", refs6, s2tt6, t2tt6, s2s6, t2s6)


🔹 Processing Target Language: ENG (en_us)
Found 302 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs/tel_results.csv

Computing metrics for language: TEL->ENG


{'S2TT_SacreBLEU': 25.751960658772255,
 'T2TT_chrF2++': 62.46116400263574,
 'S2TT_WER': 0.6132852423109971,
 'T2TT_WER': 0.5262495941621558,
 'S2ST_ASR_WER': 0.6039635892802974,
 'S2ST_ASR_BLEU': 25.96858004991482,
 'T2ST_ASR_WER': 0.5351444780751428,
 'T2ST_ASR_BLEU': 33.068577804727255}

In [39]:
refs7, s2tt7, t2tt7, s2s7, t2s7=run_translation_for_language_eng("tam", "ta_in","eng","en_us",full_tasks=True)
compute_metrics("tam->eng", refs7, s2tt7, t2tt7, s2s7, t2s7)


🔹 Processing Target Language: ENG (en_us)
Found 336 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs/tam_results.csv

Computing metrics for language: TAM->ENG


{'S2TT_SacreBLEU': 23.350509110207998,
 'T2TT_chrF2++': 58.19404045684811,
 'S2TT_WER': 0.6711805367864453,
 'T2TT_WER': 0.5663497144591616,
 'S2ST_ASR_WER': 0.658504086797589,
 'S2ST_ASR_BLEU': 23.523496058910386,
 'T2ST_ASR_WER': 0.5782981519860814,
 'T2ST_ASR_BLEU': 28.455894567053345}

In [40]:
refs8, s2tt8, t2tt8, s2s8, t2s8=run_translation_for_language_eng("urd", "ur_pk","eng","en_us",full_tasks=True)
compute_metrics("urd->eng", refs8, s2tt8, t2tt8, s2s8, t2s8)


🔹 Processing Target Language: ENG (en_us)
Found 230 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs/urd_results.csv

Computing metrics for language: URD->ENG


{'S2TT_SacreBLEU': 24.751033511752947,
 'T2TT_chrF2++': 61.60675535444684,
 'S2TT_WER': 0.6189562220362608,
 'T2TT_WER': 0.5121603399715183,
 'S2ST_ASR_WER': 0.6145540594050083,
 'S2ST_ASR_BLEU': 24.992822228880737,
 'T2ST_ASR_WER': 0.516878890245686,
 'T2ST_ASR_BLEU': 33.12744858731363}

In [41]:
refs9, s2tt9, t2tt9, s2s9, t2s9=run_translation_for_language_eng("ory", "or_in","eng","en_us",full_tasks=True)
compute_metrics("ory->eng", refs9, s2tt9, t2tt9, s2s9, t2s9)


🔹 Processing Target Language: ENG (en_us)
Found 334 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/Project/X-eng/outputs/ory_results.csv

Computing metrics for language: ORY->ENG


{'S2TT_SacreBLEU': 26.861764973835275,
 'T2TT_chrF2++': 63.70346590715455,
 'S2TT_WER': 0.6275503817546304,
 'T2TT_WER': 0.5113733669275848,
 'S2ST_ASR_WER': 0.6268162642308952,
 'S2ST_ASR_BLEU': 26.91504263695809,
 'T2ST_ASR_WER': 0.5130236769776545,
 'T2ST_ASR_BLEU': 34.68284905623634}

**S2T->T2S pipeline**

In [16]:
def run_translation_for_language_s2s(sm4t_src_lang, fleurs_src_lang, sm4t_tgt_lang, fleurs_tgt_lang, full_tasks=True):
    """
    Run translation pipeline for one language.
    full_tasks=True → Run all 4 tasks
    full_tasks=False → Run only S2TT, T2TT
    """
    print("\n" + "="*60)
    print(f"🔹 Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*60)

    # Load datasets
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

    print(f"Found {len(common_ids)} parallel sentences")

    references = []
    predicted_s2t_t2s = []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        # Resample if needed
        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        # Convert numpy → torch.FloatTensor, add batch dimension if 1D
        src_audio = torch.tensor(src_audio, dtype=torch.float32)
        # Current: (1, seq_len)
        if src_audio.ndim == 2 and src_audio.shape[0] == 1:
            src_audio = src_audio.transpose(0, 1)   # (seq_len, 1)


        if full_tasks:
            # --- S2TT ---
            s2tt_out, _ = translator.predict(
                input=src_audio,
                task_str="s2tt",
                tgt_lang=sm4t_tgt_lang,
                text_generation_opts=text_opts
            )

            # --- T2ST ---
            _, t2s_audio_out = translator.predict(
                input=str(s2tt_out[0]),
                task_str="t2st",
                src_lang=sm4t_src_lang,
                tgt_lang=sm4t_tgt_lang,
                text_generation_opts=text_opts,
                unit_generation_opts=unit_opts
            )

            # Save generated speech
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/Project/s2t-t2s/s2t-t2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
            os.makedirs(lang_dir, exist_ok=True)
            t2s_path = os.path.join(lang_dir, f"t2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            # Ensure tensor is (channels, time)
            audio_tensor = t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu()
            if audio_tensor.ndim == 1:
                audio_tensor = audio_tensor.unsqueeze(0)

            torchaudio.save(
                t2s_path,
                audio_tensor,
                t2s_audio_out.sample_rate,
            )

            # Whisper ASR on synthesized speech
            t2s_result = whisper_model.transcribe(
                audio=t2s_path,
                task="transcribe",
                temperature=0.0,
                beam_size=None
            )
            predicted_s2t_t2s.append(normalize(t2s_result["text"]))

    return references, predicted_s2t_t2s

In [18]:
def compute_metrics_s2s(lang_code, references, predicted_s2t_t2s):
    print(f"\nComputing metrics for language: {lang_code.upper()}")

    # Normalize hypotheses
    predicted_s2t_t2s = [" ".join(h) if isinstance(h, list) else str(h) for h in predicted_s2t_t2s]

    # Normalize references
    references_norm = []
    for refset in references:
        references_norm.append([" ".join(r) if isinstance(r, list) else str(r) for r in refset])

    metrics = {}
    if predicted_s2t_t2s:
        metrics["S2T+T2S_ASR_WER"] = sum(
            wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_s2t_t2s)
        ) / len(references_norm)

        metrics["S2T+T2S_ASR_BLEU"] = corpus_bleu(
            predicted_s2t_t2s, list(zip(*references_norm))
        ).score

    return metrics

In [19]:
ref,s2s_p=run_translation_for_language_s2s("eng", "en_us","tel","te_in",full_tasks=True)
compute_metrics_s2s("eng->tel", ref, s2s_p)


🔹 Processing Target Language: TEL (te_in)
Found 302 parallel sentences

Computing metrics for language: ENG->TEL


{'S2T+T2S_ASR_WER': 0.9101285191293687, 'S2T+T2S_ASR_BLEU': 2.3390303440246676}

In [20]:
ref1,s2s_p1=run_translation_for_language_s2s("eng", "en_us","hin","hi_in",full_tasks=True)
compute_metrics_s2s("eng->hin", ref1, s2s_p1)


🔹 Processing Target Language: HIN (hi_in)
Found 265 parallel sentences

Computing metrics for language: ENG->HIN


{'S2T+T2S_ASR_WER': 0.6804263902727953, 'S2T+T2S_ASR_BLEU': 18.01060195105432}

In [21]:
ref2,s2s_p2=run_translation_for_language_s2s("eng", "en_us","urd","ur_pk",full_tasks=True)
compute_metrics_s2s("eng->urd", ref2, s2s_p2)


🔹 Processing Target Language: URD (ur_pk)
Found 230 parallel sentences

Computing metrics for language: ENG->URD


{'S2T+T2S_ASR_WER': 0.6790183352396314, 'S2T+T2S_ASR_BLEU': 20.578279757322075}

In [23]:
ref3, s2s_p3=run_translation_for_language_s2s("tel","te_in","eng", "en_us",full_tasks=True)
compute_metrics_s2s("tel->eng", ref3, s2s_p3)


🔹 Processing Target Language: ENG (en_us)
Found 302 parallel sentences

Computing metrics for language: TEL->ENG


{'S2T+T2S_ASR_WER': 0.6243740005863173, 'S2T+T2S_ASR_BLEU': 25.323806971836365}

In [24]:
ref4, s2s_p4=run_translation_for_language_s2s("hin","hi_in","eng", "en_us",full_tasks=True)
compute_metrics_s2s("hin->eng", ref4, s2s_p4)


🔹 Processing Target Language: ENG (en_us)
Found 265 parallel sentences

Computing metrics for language: HIN->ENG


{'S2T+T2S_ASR_WER': 0.6022611520626794, 'S2T+T2S_ASR_BLEU': 27.8533757068395}

In [25]:
ref5,s2s_p5=run_translation_for_language_s2s("urd","ur_pk","eng", "en_us",full_tasks=True)
compute_metrics_s2s("urd->eng", ref5, s2s_p5)


🔹 Processing Target Language: ENG (en_us)
Found 230 parallel sentences

Computing metrics for language: URD->ENG


{'S2T+T2S_ASR_WER': 0.6160407029699605, 'S2T+T2S_ASR_BLEU': 24.260052353074}

In [26]:
ref6, s2s_p6=run_translation_for_language_s2s("ory","or_in","eng", "en_us",full_tasks=True)
compute_metrics_s2s("ory->eng", ref6, s2s_p6)


🔹 Processing Target Language: ENG (en_us)
Found 334 parallel sentences

Computing metrics for language: ORY->ENG


{'S2T+T2S_ASR_WER': 0.6271728590792701, 'S2T+T2S_ASR_BLEU': 26.918422328215055}

In [27]:
ref7, s2s_p7=run_translation_for_language_s2s("tam","ta_in","eng", "en_us",full_tasks=True)
compute_metrics_s2s("tam->eng", ref7, s2s_p7)


🔹 Processing Target Language: ENG (en_us)
Found 336 parallel sentences

Computing metrics for language: TAM->ENG


{'S2T+T2S_ASR_WER': 0.6603241781688973, 'S2T+T2S_ASR_BLEU': 23.356889157087096}