In [1]:
from datasets import load_dataset
import tensorflow as tf
from transformers import MBart50Tokenizer, TFTrainingArguments, TFMBartForConditionalGeneration, DataCollatorForSeq2Seq, create_optimizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the KDE4 English-Dutch parallel corpus.
# `download_mode="force_redownload"` is passed, so the data is requested again
# on each run of this cell instead of being served from the local cache.
eng_nl = load_dataset(
    "kde4",
    lang1="en",
    lang2="nl",
    download_mode="force_redownload",
)

# Eyeball translation quality: dump the first 100 sentence pairs of the train
# split, each followed by a blank line.
for i in range(100):
    print(f"Example {i}:", eng_nl["train"][i]["translation"], "", sep="\n")

kde4.py:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.10M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/192060 [00:00<?, ? examples/s]

Example 0:
{'en': 'Lauri Watts', 'nl': 'Lauri Watts'}

Example 1:
{'en': '& Lauri. Watts. mail;', 'nl': '& Lauri.Watts.mail;'}

Example 2:
{'en': 'ROLES_OF_TRANSLATORS', 'nl': '& Niels.Reedijk; Bram.Schoenmakers; Natalie.Koning;'}

Example 3:
{'en': '2006-02-26 3.5.1', 'nl': '2006-02-26 3.5.1'}

Example 4:
{'en': 'The Babel & konqueror; plugin gives you quick access to the Babelfish translation service.', 'nl': 'De plugin Babel voor & konqueror; geeft u snel en gemakkelijk toegang tot de Babelfish-vertaaldienst.'}

Example 5:
{'en': 'KDE', 'nl': 'KDE'}

Example 6:
{'en': 'kdeaddons', 'nl': 'kdeaddons'}

Example 7:
{'en': 'konqueror', 'nl': 'konqueror'}

Example 8:
{'en': 'plugins', 'nl': 'plugins'}

Example 9:
{'en': 'babelfish', 'nl': 'babelfish'}

Example 10:
{'en': 'translate', 'nl': 'translate'}

Example 11:
{'en': 'The Babel & konqueror; plugin', 'nl': 'De & konqueror; Babel-plugin'}

Example 12:
{'en': 'Using the Babelfish plugin', 'nl': 'De Babelfish-plugin gebruiken'}

Example 

In [3]:
# 1. Split the dataset for fine-tuning (90% train, 10% test).
# The seed pins the shuffle so the same partition is produced after a kernel
# restart; without it every run trains and validates on different examples.
split_datasets = eng_nl["train"].train_test_split(test_size=0.1, seed=42)

# 2. Initialize the TensorFlow model and the English -> Dutch tokenizer.
# NOTE(review): the weights are loaded from "facebook/mbart-large-cc25" while
# the tokenizer is loaded from "facebook/mbart-large-50" -- two different
# checkpoints. Confirm this pairing is intentional (vocabulary size and the
# placement of language-code tokens should match what the model was
# pre-trained with); otherwise load both from the same checkpoint.
model = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX",  # English is the source language
    tgt_lang="nl_XX",  # Dutch is the target language
)

# 3. Preprocessing function
def preprocess(examples):
    """Tokenize a batch of English -> Dutch pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts,
            each holding an "en" (source) and an "nl" (target) string.

    Returns:
        dict with "input_ids" and "attention_mask" for the English text and
        "labels" containing the token ids of the Dutch text. Every sequence is
        truncated or padded to exactly 128 tokens.
    """
    sources = [pair["en"] for pair in examples["translation"]]
    targets = [pair["nl"] for pair in examples["translation"]]

    # One call with `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager: the tokenizer encodes the
    # targets in target mode and returns their ids under "labels".
    encoded = tokenizer(
        sources,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"],
    }

# 4. Prepare TF Dataset
# Tokenize both splits batch-wise; the original columns are removed so only
# the fields returned by `preprocess` remain.
tokenized_datasets = split_datasets.map(
    preprocess, batched=True, remove_columns=split_datasets["train"].column_names
)

# Collator that turns lists of tokenized examples into TensorFlow batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

# Wrap each split as a tf.data pipeline of 8-example batches.
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"], batch_size=8, shuffle=True, collate_fn=data_collator
)
tf_val_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"], batch_size=8, collate_fn=data_collator
)

# 5. Configure Training
EPOCHS = 13
STEPS_PER_EPOCH = 200
VALIDATION_STEPS = 200
WARMUP_RATIO = 0.1

# fit() below performs EPOCHS * STEPS_PER_EPOCH optimizer steps, so the learning
# rate schedule must span exactly that many steps. Deriving it from the full
# dataset length would stretch warmup/decay far beyond the steps actually run.
num_train_steps = EPOCHS * STEPS_PER_EPOCH

optimizer, lr_schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.05,
    num_warmup_steps=int(num_train_steps * WARMUP_RATIO),  # 10% warmup
)

# NOTE(review): `preprocess` pads labels to a fixed length and neither this
# loss nor the accuracy metric applies a padding mask, so padded positions
# presumably count toward both numbers -- confirm, and consider a masked
# loss/metric (or the model's built-in loss) if that is not intended.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# 6. Train the model
model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALIDATION_STEPS,
)

# Save tokenizer and model side by side so the directory can be reloaded
# with from_pretrained("model_en_nl").
tokenizer.save_pretrained("model_en_nl")
model.save_pretrained("model_en_nl")

2025-04-03 08:10:21.950971: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
I0000 00:00:1743667821.951286 1150602 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46672 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:01:00.0, compute capability: 8.9
All model checkpoint layers were used when initializing TFMBartForConditionalGeneration.

All the layers of TFMBartForConditionalGeneration were initialized from the model checkpoint at facebook/mbart-large-cc25.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMBartForConditionalGeneration for predictions without further training.


Map:   0%|          | 0/172854 [00:00<?, ? examples/s]



Map:   0%|          | 0/19206 [00:00<?, ? examples/s]

Epoch 1/13




Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


Non-default generation parameters: {'max_length': 1024, 'num_beams': 5}


In [4]:
# 1. Split the dataset for fine-tuning (90% train, 10% test).
# The seed pins the shuffle so the same partition is produced after a kernel
# restart; without it every run trains and validates on different examples.
split_datasets = eng_nl["train"].train_test_split(test_size=0.1, seed=42)

# 2. Initialize the TensorFlow model and the Dutch -> English tokenizer.
# NOTE(review): the weights are loaded from "facebook/mbart-large-cc25" while
# the tokenizer is loaded from "facebook/mbart-large-50" -- two different
# checkpoints. Confirm this pairing is intentional (vocabulary size and the
# placement of language-code tokens should match what the model was
# pre-trained with); otherwise load both from the same checkpoint.
model_nl_en = TFMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
tokenizer_nl_en = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="nl_XX",  # Dutch is now the source language
    tgt_lang="en_XX",  # English is now the target language
)

# 3. Preprocessing function
def preprocess_nl_en(examples):
    """Tokenize a batch of Dutch -> English pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts,
            each holding an "nl" (source) and an "en" (target) string.

    Returns:
        dict with "input_ids" and "attention_mask" for the Dutch text and
        "labels" containing the token ids of the English text. Every sequence
        is truncated or padded to exactly 128 tokens.
    """
    # Source and target are swapped relative to the English -> Dutch direction.
    sources = [pair["nl"] for pair in examples["translation"]]
    targets = [pair["en"] for pair in examples["translation"]]

    # One call with `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager: the tokenizer encodes the
    # targets in target mode and returns their ids under "labels".
    encoded = tokenizer_nl_en(
        sources,
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": encoded["labels"],
    }

# 4. Prepare TF Dataset
# Tokenize both splits batch-wise; the original columns are removed so only
# the fields returned by `preprocess_nl_en` remain.
tokenized_datasets_nl_en = split_datasets.map(
    preprocess_nl_en, batched=True, remove_columns=split_datasets["train"].column_names
)

# Collator that turns lists of tokenized examples into TensorFlow batches.
data_collator_nl_en = DataCollatorForSeq2Seq(tokenizer_nl_en, model=model_nl_en, return_tensors="tf")

# Wrap each split as a tf.data pipeline of 8-example batches.
tf_train_dataset_nl_en = model_nl_en.prepare_tf_dataset(
    tokenized_datasets_nl_en["train"], batch_size=8, shuffle=True, collate_fn=data_collator_nl_en
)
tf_val_dataset_nl_en = model_nl_en.prepare_tf_dataset(
    tokenized_datasets_nl_en["test"], batch_size=8, collate_fn=data_collator_nl_en
)

# 5. Configure Training
EPOCHS = 13
STEPS_PER_EPOCH = 200
VALIDATION_STEPS = 200
WARMUP_RATIO = 0.1

# fit() below performs EPOCHS * STEPS_PER_EPOCH optimizer steps, so the learning
# rate schedule must span exactly that many steps. With a schedule sized from
# the full dataset length, the 10% warmup alone would be longer than the whole
# run and the learning rate would never reach `init_lr`.
num_train_steps = EPOCHS * STEPS_PER_EPOCH
optimizer, lr_schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.05,
    num_warmup_steps=int(num_train_steps * WARMUP_RATIO),  # 10% warmup
)

# NOTE(review): `preprocess_nl_en` pads labels to a fixed length and neither
# this loss nor the accuracy metric applies a padding mask, so padded positions
# presumably count toward both numbers -- confirm, and consider a masked
# loss/metric (or the model's built-in loss) if that is not intended.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_nl_en.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# 6. Train the model
model_nl_en.fit(
    tf_train_dataset_nl_en,
    validation_data=tf_val_dataset_nl_en,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALIDATION_STEPS,
)

# Save the Dutch -> English tokenizer and model. These must be the `_nl_en`
# objects: the un-suffixed `tokenizer` / `model` names refer to the
# English -> Dutch pair, which would otherwise be written to this directory.
tokenizer_nl_en.save_pretrained("model_nl_en")
model_nl_en.save_pretrained("model_nl_en")

All model checkpoint layers were used when initializing TFMBartForConditionalGeneration.

All the layers of TFMBartForConditionalGeneration were initialized from the model checkpoint at facebook/mbart-large-cc25.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMBartForConditionalGeneration for predictions without further training.


Map:   0%|          | 0/172854 [00:00<?, ? examples/s]



Map:   0%|          | 0/19206 [00:00<?, ? examples/s]

Epoch 1/13




Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


Non-default generation parameters: {'max_length': 1024, 'num_beams': 5}
