In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

In [2]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud

In [4]:
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True)

README.md: 0.00B [00:00, ?B/s]

kde4.py: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

In [6]:
#smaller dataset for faster training

split_datasets = raw_datasets["train"].train_test_split(train_size=0.1, seed=20) 
split_datasets["validation"] = split_datasets.pop("test")

In [7]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



In [9]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

In [10]:
max_length = 64  

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, 
        text_target=targets, 
        max_length=max_length, 
        truncation=True,
        padding=True  
    )
    return model_inputs

In [12]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    batch_size=1000, 
    remove_columns=split_datasets["train"].column_names,
    num_proc=4)

Map (num_proc=4):   0%|          | 0/21017 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/189156 [00:00<?, ? examples/s]

In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [27]:


import numpy as np
from evaluate import load

metric = load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

   
    if isinstance(preds, tuple):
        preds = preds[0]

   
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]  # Note: double brackets

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

 


In [29]:
args = Seq2SeqTrainingArguments(
    output_dir="marian-finetuned-kde4-en-to-fr",
    run_name="marian-en-to-fr-run",
    
    
    num_train_epochs=1,  
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,   
    gradient_accumulation_steps=1,   
    
    
    fp16=True, 
    dataloader_num_workers=4,  
    dataloader_pin_memory=True,  
    

    learning_rate=5e-5,  
    warmup_steps=500,    
    
    
    eval_strategy="steps",
    eval_steps=1000,     
    save_strategy="steps",
    save_steps=1000,
    logging_steps=100,
    
   
    save_total_limit=1, 
    load_best_model_at_end=False,  
    
    
    predict_with_generate=True,
    generation_max_length=max_length,
    generation_num_beams=2, 
    
   
    weight_decay=0.01,
    remove_unused_columns=True,
    push_to_hub=False,
    report_to=[],  
    
    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory
)


In [30]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [31]:
import torch
if torch.cuda.is_available():
    
    model.config.use_cache = False  
    
    
    try:
        model = torch.compile(model)
        print("Model compiled successfully!")
    except:
        print("Model compilation not available")


print("Starting training...")

Model compiled successfully!
Starting training...


In [32]:
trainer.train()

Step,Training Loss,Validation Loss




TrainOutput(global_step=657, training_loss=0.6468484332753824, metrics={'train_runtime': 153.479, 'train_samples_per_second': 136.937, 'train_steps_per_second': 4.281, 'total_flos': 356220967845888.0, 'train_loss': 0.6468484332753824, 'epoch': 1.0})

In [33]:
trainer.evaluate(max_length=max_length)

{'eval_loss': 0.24895942211151123,
 'eval_bleu': 46.167374396381,
 'eval_runtime': 4413.0806,
 'eval_samples_per_second': 42.863,
 'eval_steps_per_second': 0.67,
 'epoch': 1.0}