In [16]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import torchaudio
import torch
import os
import re
import pandas as pd
from datasets import load_dataset
import ffmpeg

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover
import warnings
warnings.filterwarnings("ignore")

In [17]:
from datasets import load_dataset
from seamless_communication.inference import Translator
from jiwer import wer
from sacrebleu import corpus_bleu

In [18]:
# Initialize a Translator object with a multitask model, vocoder on the GPU.

model_name = "seamlessM4T_v2_large"
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"

translator = Translator(
    model_name,
    vocoder_name,
    device=torch.device("cuda"), # Changed from "cuda:0" to "cpu"
    dtype=torch.float16,
)

Using the cached checkpoint of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_v2_large. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_v2. Set `force` to `True` to download again.


In [19]:
from datasets import load_dataset
import torch
import torchaudio
def resample_to_16k(audio, orig_sr):
    # implement resampling logic here, e.g. torchaudio.transforms.Resample
    return torchaudio.transforms.Resample(orig_sr, 16000)(torch.tensor(audio)).numpy()

In [20]:
import whisper

# Load Whisper-Large once (outside function, so it’s not reloaded every call)
whisper_model = whisper.load_model("large-v3", device="cuda")

whisper.audio.FFMPEG_PATH = "/home/aj/Bhavna/ffmpeg_bin/ffmpeg"

In [21]:
import string

def normalize(text):
    return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

In [22]:
def save_dataframe(df, lang, out_dir="/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs"):
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{lang}_results.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved results to {out_path}")

In [23]:
from indicnlp.tokenize import indic_tokenize

In [29]:
#%pip install seamless_communication

from seamless_communication.inference import SequenceGeneratorOptions
text_opts = SequenceGeneratorOptions(
    beam_size=5
    #temperature=0.7,   # 0 for deterministic, >0 adds diversity
)

# Beam search for unit hypotheses
unit_opts = SequenceGeneratorOptions(
    beam_size=5
)

In [30]:
def normalize(text):
    return text.lower().translate(str.maketrans("", "", string.punctuation)).strip()

In [31]:
def apply_tokenizer(texts, lang_code):
    """Apply IndicNLP/Urdu tokenization for En→Indic evaluation"""
    if not texts:
        return texts
    elif lang_code in ["hi", "bn", "te", "ta", "ml", "kn", "gu", "mr", "pa", "or"]:
        return [" ".join(indic_tokenize.trivial_tokenize(t, lang=lang_code)) for t in texts]
    else:  # fallback (English etc.)
        return texts

In [33]:
def run_translation_for_language(sm4t_src_lang,fleurs_src_lang,sm4t_tgt_lang, fleurs_tgt_lang, full_tasks=True):
    """
    Run translation pipeline for one language.
    full_tasks=True → Run all 4 tasks
    full_tasks=False → Run only S2TT, T2TT
    """
    print("\n" + "="*60)
    print(f"🔹 Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*60)

    # Load datasets
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

    print(f"Found {len(common_ids)} parallel sentences")

    references, hypotheses_s2tt, hypotheses_t2tt = [], [], []
    predicted_s2s, predicted_t2s = [], []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/input_audios"
        lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
        os.makedirs(lang_dir, exist_ok=True)
        audio_path = os.path.join(lang_dir, f"input_{sm4t_tgt_lang}_{sentence_id}.wav")

        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # --- S2TT ---
        s2tt_out, _ = translator.predict(
            input=audio_path, task_str="s2tt", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_s2tt.append(normalize(str(s2tt_out[0])))

        # --- T2TT ---
        t2tt_out, _ = translator.predict(
            input=src_text, task_str="t2tt", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_t2tt.append(normalize(str(t2tt_out[0])))

        if full_tasks:
            # --- S2ST + Whisper ASR ---
            _, s2s_audio_out = translator.predict(
                input=audio_path, task_str="s2st", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/s2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
            os.makedirs(lang_dir, exist_ok=True)
            s2s_path= os.path.join(lang_dir, f"s2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                s2s_path,
                s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                s2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            s2s_result = whisper_model.transcribe(
                audio=s2s_path,
                task="transcribe",
                temperature=0.0,   # greedy, deterministic
                beam_size=None
            )
            predicted_s2s.append(normalize(s2s_result["text"]))

            # --- T2ST + Whisper ASR ---
            _, t2s_audio_out = translator.predict(
                input=src_text, task_str="t2st", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/t2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_tgt_lang)
            os.makedirs(lang_dir, exist_ok=True)
            t2s_path = os.path.join(lang_dir, f"t2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                t2s_path,
                t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                t2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            t2s_result = whisper_model.transcribe(
                audio=t2s_path,
                task="transcribe",
                temperature=0.0,
                beam_size=None
            )
            predicted_t2s.append(normalize(t2s_result["text"]))

    # Build dataframe
    data = {
        "source_text": source_texts,
        "reference_text": [r[0] for r in references],
        "S2TT_prediction": hypotheses_s2tt,
        "T2TT_prediction": hypotheses_t2tt
    }
    if full_tasks:
        data["S2ST_ASR"] = predicted_s2s
        data["T2ST_ASR"] = predicted_t2s

    df = pd.DataFrame(data)

    save_dataframe(df, sm4t_tgt_lang)

    return references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s

In [34]:
from sacrebleu import corpus_chrf

def compute_metrics(src_lang, tgt_lang, references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s):
    print(f"\nComputing metrics for {src_lang.upper()} → {tgt_lang.upper()}")

    # Normalize hyps
    hypotheses_s2tt = [str(h) for h in hypotheses_s2tt]
    hypotheses_t2tt = [str(h) for h in hypotheses_t2tt]
    predicted_s2s   = [str(h) for h in predicted_s2s]
    predicted_t2s   = [str(h) for h in predicted_t2s]

    # Normalize refs
    references_norm = [[str(r) for r in refset] for refset in references]
    multi_references = list(zip(*references_norm))

    # ---- Tokenization switch ----
    if src_lang == "en":  
        # Eng → Indic → tokenize target
        hypotheses_s2tt = apply_tokenizer(hypotheses_s2tt, tgt_lang)
        hypotheses_t2tt = apply_tokenizer(hypotheses_t2tt, tgt_lang)
        predicted_s2s   = apply_tokenizer(predicted_s2s, tgt_lang)
        predicted_t2s   = apply_tokenizer(predicted_t2s, tgt_lang)
        tokenized_refs = [apply_tokenizer(refs, tgt_lang) for refs in multi_references]
    else:
        # Indic → En → use sacreBLEU default tokenizer
        tokenized_refs = multi_references

    tokenized_refs = list(zip(*tokenized_refs))

    metrics = {}

    # --------------------
    # S2TT
    # --------------------
    metrics["S2TT_BLEU"]      = corpus_bleu(hypotheses_s2tt, tokenized_refs).score
    metrics["S2TT_chrF++"]    = corpus_chrf(hypotheses_s2tt, tokenized_refs).score
    metrics["S2TT_chrF2++"]   = corpus_chrf(hypotheses_s2tt, tokenized_refs, beta=2).score
    metrics["S2TT_WER"]       = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_s2tt)) / len(references_norm)

    # --------------------
    # T2TT
    # --------------------
    metrics["T2TT_BLEU"]      = corpus_bleu(hypotheses_t2tt, tokenized_refs).score
    metrics["T2TT_chrF++"]    = corpus_chrf(hypotheses_t2tt, tokenized_refs).score
    metrics["T2TT_chrF2++"]   = corpus_chrf(hypotheses_t2tt, tokenized_refs, beta=2).score
    metrics["T2TT_WER"]       = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, hypotheses_t2tt)) / len(references_norm)

    # --------------------
    # S2ST
    # --------------------
    if predicted_s2s:
        metrics["S2ST_BLEU"]    = corpus_bleu(predicted_s2s, tokenized_refs).score
        metrics["S2ST_chrF++"]  = corpus_chrf(predicted_s2s, tokenized_refs).score
        metrics["S2ST_chrF2++"] = corpus_chrf(predicted_s2s, tokenized_refs, beta=2).score
        metrics["S2ST_WER"]     = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_s2s)) / len(references_norm)

    # --------------------
    # T2ST
    # --------------------
    if predicted_t2s:
        metrics["T2ST_BLEU"]    = corpus_bleu(predicted_t2s, tokenized_refs).score
        metrics["T2ST_chrF++"]  = corpus_chrf(predicted_t2s, tokenized_refs).score
        metrics["T2ST_chrF2++"] = corpus_chrf(predicted_t2s, tokenized_refs, beta=2).score
        metrics["T2ST_WER"]     = sum(wer(ref[0], hyp) for ref, hyp in zip(references_norm, predicted_t2s)) / len(references_norm)

    return metrics


**North**

In [36]:
refs1, hyps_s2tt1, hyps_t2tt1, preds_s2s1, preds_t2s1 = run_translation_for_language("hin", "hi_in","mar","mr_in", full_tasks=False)



🔹 Processing Target Language: MAR (mr_in)
Found 264 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/mar_results.csv


In [39]:
compute_metrics("hin","mr_in", refs1, hyps_s2tt1, hyps_t2tt1, preds_s2s1, preds_t2s1)


Computing metrics for HIN → MR_IN


{'S2TT_BLEU': 7.859438681510603,
 'S2TT_chrF++': 46.507914690966345,
 'S2TT_chrF2++': 46.507914690966345,
 'S2TT_WER': 0.8490564509807691,
 'T2TT_BLEU': 27.22589423069701,
 'T2TT_chrF++': 57.39669578850625,
 'T2TT_chrF2++': 57.39669578850625,
 'T2TT_WER': 0.812781223835248}

In [40]:
refs2, hyps_s2tt2, hyps_t2tt2, preds_s2s2, preds_t2s2 = run_translation_for_language("hin","hi_in","npi","ne_np", full_tasks=False)


🔹 Processing Target Language: NPI (ne_np)
Found 260 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/npi_results.csv


In [42]:
compute_metrics("hin","ne_np", refs2, hyps_s2tt2, hyps_t2tt2, preds_s2s2, preds_t2s2)


Computing metrics for HIN → NE_NP


{'S2TT_BLEU': 25.33654946448646,
 'S2TT_chrF++': 64.99473106349947,
 'S2TT_chrF2++': 64.99473106349947,
 'S2TT_WER': 0.8399920090949645,
 'T2TT_BLEU': 21.651956746181064,
 'T2TT_chrF++': 68.10030674738218,
 'T2TT_chrF2++': 68.10030674738218,
 'T2TT_WER': 0.8087716672999247}

In [48]:
refs5, hyps_s2tt5, hyps_t2tt5, preds_s2s5, preds_t2s5 = run_translation_for_language("hin","hi_in","urd","ur_pk", full_tasks=True)


🔹 Processing Target Language: URD (ur_pk)
Found 176 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/urd_results.csv


In [49]:
compute_metrics("hin","ur_pk", refs5, hyps_s2tt5, hyps_t2tt5, preds_s2s5, preds_t2s5)


Computing metrics for HIN → UR_PK


{'S2TT_BLEU': 9.42119686197517,
 'S2TT_chrF++': 34.73328567381309,
 'S2TT_chrF2++': 34.73328567381309,
 'S2TT_WER': 0.7461084350137203,
 'T2TT_BLEU': 8.508341296101372,
 'T2TT_chrF++': 34.0999525664329,
 'T2TT_chrF2++': 34.0999525664329,
 'T2TT_WER': 0.7155139423986123,
 'S2ST_BLEU': 8.329829723842051,
 'S2ST_chrF++': 31.896050576254524,
 'S2ST_chrF2++': 31.896050576254524,
 'S2ST_WER': 0.7592791765820599,
 'T2ST_BLEU': 8.197162980930852,
 'T2ST_chrF++': 33.995066590239674,
 'T2ST_chrF2++': 33.995066590239674,
 'T2ST_WER': 0.7412146675340211}

**West**

In [41]:
refs3,hyps_s2tt3, hyps_t2tt3, preds_s2s3, preds_t2s3 = run_translation_for_language("guj","gu_in","pan","pa_in", full_tasks=False)


🔹 Processing Target Language: PAN (pa_in)
Found 278 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/pan_results.csv


In [46]:
compute_metrics("guj","pa_in", refs3, hyps_s2tt3, hyps_t2tt3, preds_s2s3, preds_t2s3)


Computing metrics for GUJ → PA_IN


{'S2TT_BLEU': 13.826777579228938,
 'S2TT_chrF++': 41.951670925291666,
 'S2TT_chrF2++': 41.951670925291666,
 'S2TT_WER': 0.743335535162905,
 'T2TT_BLEU': 12.021577610863728,
 'T2TT_chrF++': 38.875269796901364,
 'T2TT_chrF2++': 38.875269796901364,
 'T2TT_WER': 0.7435963691415168}

In [45]:
refs4, hyps_s2tt4, hyps_t2tt4, preds_s2s4, preds_t2s4 = run_translation_for_language("pan","pa_in","guj","gu_in", full_tasks=False)


🔹 Processing Target Language: GUJ (gu_in)
Found 278 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/guj_results.csv


In [47]:
compute_metrics("pan","gu_in", refs4, hyps_s2tt4, hyps_t2tt4, preds_s2s4, preds_t2s4)


Computing metrics for PAN → GU_IN


{'S2TT_BLEU': 4.6192151051305474,
 'S2TT_chrF++': 31.822103844478995,
 'S2TT_chrF2++': 31.822103844478995,
 'S2TT_WER': 0.8223669055613883,
 'T2TT_BLEU': 4.444587794585869,
 'T2TT_chrF++': 30.837545402859195,
 'T2TT_chrF2++': 30.837545402859195,
 'T2TT_WER': 0.8018523434687755}

**East**

In [51]:
refs6, hyps_s2tt6, hyps_t2tt6, preds_s2s6, preds_t2s6 = run_translation_for_language("ben","bn_in","ory","or_in", full_tasks=False)


🔹 Processing Target Language: ORY (or_in)
Found 333 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/ory_results.csv


In [52]:
compute_metrics("ben","or_in", refs6, hyps_s2tt6, hyps_t2tt6, preds_s2s6, preds_t2s6)


Computing metrics for BEN → OR_IN


{'S2TT_BLEU': 6.917184228205472,
 'S2TT_chrF++': 34.89762454148601,
 'S2TT_chrF2++': 34.89762454148601,
 'S2TT_WER': 0.8717856015394257,
 'T2TT_BLEU': 7.474875887495341,
 'T2TT_chrF++': 38.20305627425529,
 'T2TT_chrF2++': 38.20305627425529,
 'T2TT_WER': 0.8267578894870617}

In [54]:
refs7, hyps_s2tt7, hyps_t2tt7, preds_s2s7, preds_t2s7 = run_translation_for_language("ben","bn_in","asm","as_in", full_tasks=False)


🔹 Processing Target Language: ASM (as_in)
Found 348 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/asm_results.csv


In [55]:
compute_metrics("ben","as_in", refs7, hyps_s2tt7, hyps_t2tt7, preds_s2s7, preds_t2s7)


Computing metrics for BEN → AS_IN


{'S2TT_BLEU': 9.55204080682377,
 'S2TT_chrF++': 35.50819437364589,
 'S2TT_chrF2++': 35.50819437364589,
 'S2TT_WER': 0.8950376965191159,
 'T2TT_BLEU': 9.55204080682377,
 'T2TT_chrF++': 31.686870720549855,
 'T2TT_chrF2++': 31.686870720549855,
 'T2TT_WER': 0.8711566243900262}

**South**

In [56]:
refs8, hyps_s2tt8, hyps_t2tt8, preds_s2s8, preds_t2s8 = run_translation_for_language("tel","te_in","tam","ta_in", full_tasks=False)


🔹 Processing Target Language: TAM (ta_in)
Found 292 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/tam_results.csv


In [57]:
compute_metrics("tel","ta_in", refs8, hyps_s2tt8, hyps_t2tt8, preds_s2s8, preds_t2s8)


Computing metrics for TEL → TA_IN


{'S2TT_BLEU': 9.238430210261097,
 'S2TT_chrF++': 50.582031194781564,
 'S2TT_chrF2++': 50.582031194781564,
 'S2TT_WER': 0.8921039304658651,
 'T2TT_BLEU': 9.238430210261097,
 'T2TT_chrF++': 50.031995265030716,
 'T2TT_chrF2++': 50.031995265030716,
 'T2TT_WER': 0.8598565162295395}

In [58]:
refs9, hyps_s2tt9, hyps_t2tt9, preds_s2s9, preds_t2s9 = run_translation_for_language("tel","te_in","mal","ml_in", full_tasks=False)


🔹 Processing Target Language: MAL (ml_in)
Found 298 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/mal_results.csv


In [59]:
compute_metrics("tel","ml_in", refs9, hyps_s2tt9, hyps_t2tt9, preds_s2s9, preds_t2s9)


Computing metrics for TEL → ML_IN


{'S2TT_BLEU': 4.016138436407654,
 'S2TT_chrF++': 31.052630186576106,
 'S2TT_chrF2++': 31.052630186576106,
 'S2TT_WER': 0.918123165156763,
 'T2TT_BLEU': 7.141816289329644,
 'T2TT_chrF++': 39.42443373108362,
 'T2TT_chrF2++': 39.42443373108362,
 'T2TT_WER': 0.8735299979184779}

In [60]:
refs10, hyps_s2tt10, hyps_t2tt10, preds_s2s10, preds_t2s10 = run_translation_for_language("tel","te_in","kan","kn_in", full_tasks=False)


🔹 Processing Target Language: KAN (kn_in)
Found 297 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/f-X-X-Direct-CSVs/kan_results.csv


In [61]:
compute_metrics("tel","kn_in", refs10, hyps_s2tt10, hyps_t2tt10, preds_s2s10, preds_t2s10)


Computing metrics for TEL → KN_IN


{'S2TT_BLEU': 4.065425428798724,
 'S2TT_chrF++': 46.04671936551367,
 'S2TT_chrF2++': 46.04671936551367,
 'S2TT_WER': 0.8890393484051732,
 'T2TT_BLEU': 4.065425428798724,
 'T2TT_chrF++': 40.71791994262089,
 'T2TT_chrF2++': 40.71791994262089,
 'T2TT_WER': 0.8218582429619888}

**Reverse**

**North**

In [62]:
def save_dataframe(df, lang, out_dir="/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs"):
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{lang}_results.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved results to {out_path}")

In [64]:
def run_translation_for_language(sm4t_src_lang,fleurs_src_lang,sm4t_tgt_lang, fleurs_tgt_lang, full_tasks=True):
    """
    Run translation pipeline for one language.
    full_tasks=True → Run all 4 tasks
    full_tasks=False → Run only S2TT, T2TT
    """
    print("\n" + "="*60)
    print(f"🔹 Processing Target Language: {sm4t_tgt_lang.upper()} ({fleurs_tgt_lang})")
    print("="*60)

    # Load datasets
    src_dataset = load_dataset("google/fleurs", fleurs_src_lang, split="test")
    tgt_dataset = load_dataset("google/fleurs", fleurs_tgt_lang, split="test")

    src_by_id = {item["id"]: item for item in src_dataset}
    tgt_by_id = {item["id"]: item for item in tgt_dataset}
    common_ids = sorted(set(src_by_id.keys()) & set(tgt_by_id.keys()))

    print(f"Found {len(common_ids)} parallel sentences")

    references, hypotheses_s2tt, hypotheses_t2tt = [], [], []
    predicted_s2s, predicted_t2s = [], []
    source_texts = []

    for sentence_id in common_ids:
        src = src_by_id[sentence_id]
        tgt = tgt_by_id[sentence_id]

        src_audio = src["audio"]["array"]
        src_sr = src["audio"]["sampling_rate"]
        src_text = src["transcription"]
        tgt_text = tgt["transcription"]

        references.append([tgt_text])
        source_texts.append(src_text)

        if src_sr != 16000:
            src_audio = resample_to_16k(src_audio, src_sr)

        base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/input_audios"
        lang_dir = os.path.join(base_dir, sm4t_src_lang)
        os.makedirs(lang_dir, exist_ok=True)
        audio_path = os.path.join(lang_dir, f"input_{sm4t_tgt_lang}_{sentence_id}.wav")

        torchaudio.save(audio_path, torch.tensor(src_audio).unsqueeze(0), 16000)

        # --- S2TT ---
        s2tt_out, _ = translator.predict(
            input=audio_path, task_str="s2tt", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_s2tt.append(normalize(str(s2tt_out[0])))

        # --- T2TT ---
        t2tt_out, _ = translator.predict(
            input=src_text, task_str="t2tt", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts
        )
        hypotheses_t2tt.append(normalize(str(t2tt_out[0])))

        if full_tasks:
            # --- S2ST + Whisper ASR ---
            _, s2s_audio_out = translator.predict(
                input=audio_path, task_str="s2st", tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/s2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_src_lang)
            os.makedirs(lang_dir, exist_ok=True)
            s2s_path= os.path.join(lang_dir, f"s2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                s2s_path,
                s2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                s2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            s2s_result = whisper_model.transcribe(
                audio=s2s_path,
                task="transcribe",
                temperature=0.0,   # greedy, deterministic
                beam_size=None
            )
            predicted_s2s.append(normalize(s2s_result["text"]))

            # --- T2ST + Whisper ASR ---
            _, t2s_audio_out = translator.predict(
                input=src_text, task_str="t2st", src_lang=sm4t_src_lang, tgt_lang=sm4t_tgt_lang,text_generation_opts=text_opts,unit_generation_opts=unit_opts
            )
            base_dir = "/scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/t2s_outputs"
            lang_dir = os.path.join(base_dir, sm4t_src_lang)
            os.makedirs(lang_dir, exist_ok=True)
            t2s_path = os.path.join(lang_dir, f"t2s_{sm4t_tgt_lang}_{sentence_id}.wav")

            torchaudio.save(
                t2s_path,
                t2s_audio_out.audio_wavs[0][0].to(torch.float32).cpu(),
                t2s_audio_out.sample_rate,
            )

            # Use Whisper-Large for ASR
            t2s_result = whisper_model.transcribe(
                audio=t2s_path,
                task="transcribe",
                temperature=0.0,
                beam_size=None
            )
            predicted_t2s.append(normalize(t2s_result["text"]))

    # Build dataframe
    data = {
        "source_text": source_texts,
        "reference_text": [r[0] for r in references],
        "S2TT_prediction": hypotheses_s2tt,
        "T2TT_prediction": hypotheses_t2tt
    }
    if full_tasks:
        data["S2ST_ASR"] = predicted_s2s
        data["T2ST_ASR"] = predicted_t2s

    df = pd.DataFrame(data)

    save_dataframe(df, sm4t_src_lang)

    return references, hypotheses_s2tt, hypotheses_t2tt, predicted_s2s, predicted_t2s

In [66]:
refs1_r, hyps_s2tt1_r, hyps_t2tt1_r, preds_s2s1_r, preds_t2s1_r = run_translation_for_language("mar","mr_in","hin", "hi_in", full_tasks=True)


🔹 Processing Target Language: HIN (hi_in)
Found 264 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/mar_results.csv


In [67]:
compute_metrics("mar","hi_in", refs1_r, hyps_s2tt1_r, hyps_t2tt1_r, preds_s2s1_r, preds_t2s1_r)


Computing metrics for MAR → HI_IN


{'S2TT_BLEU': 6.464941342480946,
 'S2TT_chrF++': 42.35185330359103,
 'S2TT_chrF2++': 42.35185330359103,
 'S2TT_WER': 0.7889982156704974,
 'T2TT_BLEU': 20.038908500140973,
 'T2TT_chrF++': 50.3440705551925,
 'T2TT_chrF2++': 50.3440705551925,
 'T2TT_WER': 0.7165888577509504,
 'S2ST_BLEU': 6.464941342480946,
 'S2ST_chrF++': 38.352288772663755,
 'S2ST_chrF2++': 38.352288772663755,
 'S2ST_WER': 0.826925291130424,
 'T2ST_BLEU': 7.126955677090929,
 'T2ST_chrF++': 40.33802140332752,
 'T2ST_chrF2++': 40.33802140332752,
 'T2ST_WER': 0.8048305062268768}

In [69]:
refs2_r, hyps_s2tt2_r, hyps_t2tt2_r, preds_s2s2_r, preds_t2s2_r = run_translation_for_language("npi","ne_np","hin","hi_in", full_tasks=True)


🔹 Processing Target Language: HIN (hi_in)
Found 260 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/npi_results.csv


In [70]:
compute_metrics("npi","hi_in", refs2_r, hyps_s2tt2_r, hyps_t2tt2_r, preds_s2s2_r, preds_t2s2_r)


Computing metrics for NPI → HI_IN


{'S2TT_BLEU': 27.130567714631198,
 'S2TT_chrF++': 58.301010970186255,
 'S2TT_chrF2++': 58.301010970186255,
 'S2TT_WER': 0.7487081372317297,
 'T2TT_BLEU': 43.59493824807389,
 'T2TT_chrF++': 68.20801255338704,
 'T2TT_chrF2++': 68.20801255338704,
 'T2TT_WER': 0.6692514937056953,
 'S2ST_BLEU': 27.065739132597326,
 'S2ST_chrF++': 56.455187994813,
 'S2ST_chrF2++': 56.455187994813,
 'S2ST_WER': 0.8120408481160151,
 'T2ST_BLEU': 39.42058093215872,
 'T2ST_chrF++': 68.92644962306159,
 'T2ST_chrF2++': 68.92644962306159,
 'T2ST_WER': 0.7699160556570889}

In [71]:
refs5_r, hyps_s2tt5_r, hyps_t2tt5_r, preds_s2s5_r, preds_t2s5_r = run_translation_for_language("urd","ur_pk","hin","hi_in", full_tasks=True)


🔹 Processing Target Language: HIN (hi_in)
Found 176 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/urd_results.csv


In [72]:
compute_metrics("urd","hi_in", refs5_r, hyps_s2tt5_r, hyps_t2tt5_r, preds_s2s5_r, preds_t2s5_r)


Computing metrics for URD → HI_IN


{'S2TT_BLEU': 8.607692533178168,
 'S2TT_chrF++': 41.19010078610498,
 'S2TT_chrF2++': 41.19010078610498,
 'S2TT_WER': 0.7717452958864062,
 'T2TT_BLEU': 9.035807436368023,
 'T2TT_chrF++': 39.42884552704032,
 'T2TT_chrF2++': 39.42884552704032,
 'T2TT_WER': 0.754861134035603,
 'S2ST_BLEU': 8.149723365644475,
 'S2ST_chrF++': 36.20660403901591,
 'S2ST_chrF2++': 36.20660403901591,
 'S2ST_WER': 0.8304131184239142,
 'T2ST_BLEU': 8.739145705346058,
 'T2ST_chrF++': 37.63534755633244,
 'T2ST_chrF2++': 37.63534755633244,
 'T2ST_WER': 0.835272110674643}

**East**

In [73]:
refs3_r, hyps_s2tt3_r, hyps_t2tt3_r, preds_s2s3_r, preds_t2s3_r = run_translation_for_language("asm","as_in","ben","bn_in", full_tasks=True)


🔹 Processing Target Language: BEN (bn_in)
Found 348 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/asm_results.csv


In [74]:
compute_metrics("asm","bn_in", refs3_r, hyps_s2tt3_r, hyps_t2tt3_r, preds_s2s3_r, preds_t2s3_r)


Computing metrics for ASM → BN_IN


{'S2TT_BLEU': 7.432998184513635,
 'S2TT_chrF++': 31.55519565124434,
 'S2TT_chrF2++': 31.55519565124434,
 'S2TT_WER': 0.8961094100191561,
 'T2TT_BLEU': 9.103526405546068,
 'T2TT_chrF++': 32.93628823601946,
 'T2TT_chrF2++': 32.93628823601946,
 'T2TT_WER': 0.8526534827526825,
 'S2ST_BLEU': 3.7199102928113716,
 'S2ST_chrF++': 31.326555062235627,
 'S2ST_chrF2++': 31.326555062235627,
 'S2ST_WER': 0.9749197934370839,
 'T2ST_BLEU': 6.468490584192431,
 'T2ST_chrF++': 30.014360940963503,
 'T2ST_chrF2++': 30.014360940963503,
 'T2ST_WER': 0.9683734937759376}

In [75]:
refs4_r, hyps_s2tt4_r, hyps_t2tt4_r, preds_s2s4_r, preds_t2s4_r = run_translation_for_language("ory","or_in","ben","bn_in", full_tasks=True)


🔹 Processing Target Language: BEN (bn_in)
Found 333 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/ory_results.csv


In [76]:
compute_metrics("ory","bn_in", refs4_r, hyps_s2tt4_r, hyps_t2tt4_r, preds_s2s4_r, preds_t2s4_r)


Computing metrics for ORY → BN_IN


{'S2TT_BLEU': 7.994607499472017,
 'S2TT_chrF++': 43.07068186882795,
 'S2TT_chrF2++': 43.07068186882795,
 'S2TT_WER': 0.8528774458964289,
 'T2TT_BLEU': 19.83544145418288,
 'T2TT_chrF++': 52.195821217185156,
 'T2TT_chrF2++': 52.195821217185156,
 'T2TT_WER': 0.8144597780392709,
 'S2ST_BLEU': 3.4933841821869938,
 'S2ST_chrF++': 32.07475524723871,
 'S2ST_chrF2++': 32.07475524723871,
 'S2ST_WER': 0.9747689093694288,
 'T2ST_BLEU': 3.7199102928113716,
 'T2ST_chrF++': 45.734217315889346,
 'T2ST_chrF2++': 45.734217315889346,
 'T2ST_WER': 0.9641954585970011}

**South**

In [77]:
refs6_r, hyps_s2tt6_r, hyps_t2tt6_r, preds_s2s6_r, preds_t2s6_r = run_translation_for_language("tam","ta_in","tel","te_in", full_tasks=True)


🔹 Processing Target Language: TEL (te_in)
Found 292 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/tam_results.csv


In [78]:
compute_metrics("tam","te_in", refs6_r, hyps_s2tt6_r, hyps_t2tt6_r, preds_s2s6_r, preds_t2s6_r)


Computing metrics for TAM → TE_IN


{'S2TT_BLEU': 13.217947626377288,
 'S2TT_chrF++': 44.08510530706513,
 'S2TT_chrF2++': 44.08510530706513,
 'S2TT_WER': 0.9112131304780734,
 'T2TT_BLEU': 15.718877363021207,
 'T2TT_chrF++': 48.03759662462975,
 'T2TT_chrF2++': 48.03759662462975,
 'T2TT_WER': 0.8330915214576137,
 'S2ST_BLEU': 3.377156414337854,
 'S2ST_chrF++': 22.86499894392577,
 'S2ST_chrF2++': 22.86499894392577,
 'S2ST_WER': 0.9891606546709331,
 'T2ST_BLEU': 6.917184228205472,
 'T2ST_chrF++': 33.32637761255218,
 'T2ST_chrF2++': 33.32637761255218,
 'T2ST_WER': 0.9736780329948216}

In [79]:
refs7_r, hyps_s2tt7_r, hyps_t2tt7_r, preds_s2s7_r, preds_t2s7_r = run_translation_for_language("mal","ml_in","tel","te_in", full_tasks=True)


🔹 Processing Target Language: TEL (te_in)
Found 298 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/mal_results.csv


In [80]:
compute_metrics("mal","te_in", refs7_r, hyps_s2tt7_r, hyps_t2tt7_r, preds_s2s7_r, preds_t2s7_r)


Computing metrics for MAL → TE_IN


{'S2TT_BLEU': 11.359354890271161,
 'S2TT_chrF++': 39.615953088508626,
 'S2TT_chrF2++': 39.615953088508626,
 'S2TT_WER': 0.8768997992263303,
 'T2TT_BLEU': 15.467294147156862,
 'T2TT_chrF++': 41.58621152662165,
 'T2TT_chrF2++': 41.58621152662165,
 'T2TT_WER': 0.8201387993015712,
 'S2ST_BLEU': 4.02724819242185,
 'S2ST_chrF++': 6.378643553529761,
 'S2ST_chrF2++': 6.378643553529761,
 'S2ST_WER': 0.9735397631012651,
 'T2ST_BLEU': 6.754312828675707,
 'T2ST_chrF++': 28.611049735568827,
 'T2ST_chrF2++': 28.611049735568827,
 'T2ST_WER': 0.9575591097225604}

In [81]:
refs8_r, hyps_s2tt8_r, hyps_t2tt8_r, preds_s2s8_r, preds_t2s8_r = run_translation_for_language("kan","kn_in","tel","te_in", full_tasks=True)


🔹 Processing Target Language: TEL (te_in)
Found 297 parallel sentences
Saved results to /scratch/aj/Bhavna/bhav_venv_311/f-X-X-Direct/Reverse/f-X-X-Direct-CSVs/kan_results.csv


In [82]:
compute_metrics("kan","te_in", refs8_r, hyps_s2tt8_r, hyps_t2tt8_r, preds_s2s8_r, preds_t2s8_r)


Computing metrics for KAN → TE_IN


{'S2TT_BLEU': 13.94120548961102,
 'S2TT_chrF++': 48.65628895837739,
 'S2TT_chrF2++': 48.65628895837739,
 'S2TT_WER': 0.8540457677299435,
 'T2TT_BLEU': 13.94120548961102,
 'T2TT_chrF++': 47.98767774606964,
 'T2TT_chrF2++': 47.98767774606964,
 'T2TT_WER': 0.8073695541161855,
 'S2ST_BLEU': 2.8398387225677895,
 'S2ST_chrF++': 26.743076465908043,
 'S2ST_chrF2++': 26.743076465908043,
 'S2ST_WER': 0.9787201952661825,
 'T2ST_BLEU': 3.1251907639724417,
 'T2ST_chrF++': 30.554146715793955,
 'T2ST_chrF2++': 30.554146715793955,
 'T2ST_WER': 0.9506681620942768}