In [1]:
from datasets import load_dataset
import tensorflow as tf
from transformers import MBart50Tokenizer, TFTrainingArguments, TFMBartForConditionalGeneration, DataCollatorForSeq2Seq, create_optimizer
import pandas as pd
from tqdm import tqdm  # For progress bar

# Load the two translation models, one checkpoint directory per direction
# (English -> Dutch first, then Dutch -> English).
en_nl_model, nl_en_model = (
    TFMBartForConditionalGeneration.from_pretrained(checkpoint_dir)
    for checkpoint_dir in ("model_en_nl", "model_nl_en")
)

# Load the tokenizer stored in each checkpoint directory. `tokenizer` is the
# one used together with `en_nl_model`; `tokenizer_nl_en` goes with `nl_en_model`.
tokenizer, tokenizer_nl_en = (
    MBart50Tokenizer.from_pretrained(checkpoint_dir)
    for checkpoint_dir in ("model_en_nl", "model_nl_en")
)

# Function to translate a single text from Dutch to English
def translate_nl_to_en(text):
    """Translate Dutch text to English using the Dutch -> English model.

    Relies on the module-level ``tokenizer_nl_en`` and ``nl_en_model``.

    Args:
        text: Dutch input handed to the tokenizer. The tokenized input is
            truncated to at most 128 tokens.

    Returns:
        str: The first generated sequence decoded to text, with special
        tokens removed.
    """
    # Tell the tokenizer the input is Dutch so it adds the matching language code.
    tokenizer_nl_en.src_lang = "nl_XX"

    encoded = tokenizer_nl_en(text, return_tensors="tf", max_length=128, truncation=True)

    # `nl_en_model` is the name the Dutch -> English model is bound to in this
    # notebook; there is no `model_nl_en`, so referencing it raises NameError.
    output_ids = nl_en_model.generate(
        **encoded,
        # Force English as the first generated token, i.e. the output language.
        forced_bos_token_id=tokenizer_nl_en.lang_code_to_id["en_XX"],
    )

    return tokenizer_nl_en.decode(output_ids[0], skip_special_tokens=True)

# Function to perform round-trip translation (English → Dutch → English)
def round_trip_translation(df, text_column, en_nl_model, nl_en_model, batch_size=16):
    """Translate a text column English → Dutch → English, batch by batch.

    Uses the module-level ``tokenizer`` for the English → Dutch step and
    ``tokenizer_nl_en`` for the Dutch → English step.

    Args:
        df: DataFrame holding the texts; it is not modified.
        text_column: Name of the column to translate. Every value is cast
            to ``str`` before tokenization.
        en_nl_model: Model whose ``generate`` produces the Dutch output.
        nl_en_model: Model whose ``generate`` produces the English output.
        batch_size: Number of texts processed per ``generate`` call.

    Returns:
        A copy of ``df`` with two extra columns: ``"Translation"`` (Dutch)
        and ``"English Translation"`` (the back-translation).
    """

    def _translate(chunk, tok, model, src_code, tgt_code):
        # One direction for one batch: set the source language, tokenize with
        # padding and truncation to 128 tokens, generate with the target
        # language forced as first token, then decode.
        tok.src_lang = src_code
        encoded = tok(chunk, return_tensors="tf", padding=True, truncation=True, max_length=128)
        generated = model.generate(
            **encoded,
            forced_bos_token_id=tok.lang_code_to_id[tgt_code]
        )
        return tok.batch_decode(generated, skip_special_tokens=True)

    source_texts = df[text_column].astype(str).tolist()
    nl_outputs, en_outputs = [], []

    for start in tqdm(range(0, len(source_texts), batch_size)):
        chunk = source_texts[start:start + batch_size]

        # === ENGLISH → DUTCH ===
        chunk_nl = _translate(chunk, tokenizer, en_nl_model, "en_XX", "nl_XX")
        nl_outputs += chunk_nl

        # === DUTCH → ENGLISH === (the Dutch output is the input here)
        chunk_en = _translate(chunk_nl, tokenizer_nl_en, nl_en_model, "nl_XX", "en_XX")
        en_outputs += chunk_en

    # Attach the results to a copy so the caller's frame stays untouched.
    result_df = df.copy()
    result_df["Translation"] = nl_outputs
    result_df["English Translation"] = en_outputs
    return result_df

INFO: TensorFlow version 2.19.0 available.
2025-04-05 12:31:03.690722: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-05 12:31:03.705438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743856263.720070 1209061 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743856263.724621 1209061 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743856263.737553 1209061 computation_placer.cc:177] computation placer already register

In [2]:
# Read the STT_AssemblyAI spreadsheet from the Task_3 folder.
assembly_ai = pd.read_excel('../Task_3/STT_AssemblyAI.xlsx')

# Round-trip the 'Corrected' column (English → Dutch → English).
# The recorded run below took about 1h28m for 10 batches.
round_translation = round_trip_translation(
    df=assembly_ai,
    text_column='Corrected',
    en_nl_model=en_nl_model,
    nl_en_model=nl_en_model,
)

100%|██████████| 10/10 [1:27:43<00:00, 526.34s/it]


In [3]:
# Save the frame with both translation columns; the row index is not written.
round_translation.to_csv(path_or_buf='translation_scored.csv', index=False)