# Fine-Tuning a Translation Model using Hugging Face 🤗

This notebook demonstrates how to fine-tune a pre-trained translation model (`Helsinki-NLP/opus-mt-en-es`) using Hugging Face's `transformers` and `datasets` libraries on the English-Spanish `opus100` dataset.

We:
- Load and tokenize data
- Fine-tune the model on the dataset's training split
- Generate translations from the fine-tuned model


In [None]:
# !pip install transformers datasets evaluate sacrebleu sentencepiece

In [None]:
import os
import numpy as np
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

In [None]:
# Translation direction for this run: "en-es" or "es-en"
DIRECTION = "en-es"  # Change as needed

# The direction tag doubles as the suffix of the checkpoint name and of the
# output directory, and its two halves are the source/target language codes.
SOURCE_LANG, TARGET_LANG = DIRECTION.split("-")[:2]
MODEL_CHECKPOINT = f"Helsinki-NLP/opus-mt-{DIRECTION}"
OUTPUT_DIR = f"finetuned-translation-{DIRECTION}"

# Pull the pre-trained Marian weights and the matching tokenizer
model = MarianMTModel.from_pretrained(MODEL_CHECKPOINT)
tokenizer = MarianTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Fetch the "en-es" configuration of opus100 and keep the two splits used
# below. The same pair is loaded whatever DIRECTION is; which side acts as
# source or target is decided later, during preprocessing.
dataset = load_dataset("opus100", "en-es")
train_data, val_data = dataset["train"], dataset["validation"]

In [None]:
# Peek at the first raw training record before any preprocessing is applied
sample_record = train_data[0]
print("Sample data structure:", sample_record)

In [None]:
# Choose which language key is read as input and which as label:
# English -> Spanish when the source is English, the reverse otherwise.
source_key, target_key = ("en", "es") if SOURCE_LANG == "en" else ("es", "en")

# Batched preprocessing: turns raw translation pairs into tokenized features
def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Called through ``Dataset.map(..., batched=True)``, so ``examples`` is a
    dict of lists: ``examples["translation"]`` holds one dict per example,
    keyed by language code. The text under ``source_key`` becomes the model
    input and the text under ``target_key`` is passed as ``text_target``.
    Both sides are truncated to 128 tokens.

    Returns whatever the tokenizer call produces for the batch.
    """
    pairs = examples["translation"]
    source_texts = [pair[source_key] for pair in pairs]
    target_texts = [pair[target_key] for pair in pairs]

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=128,
        truncation=True,
    )

# Apply the batched preprocessing to the training and validation splits
tokenized_train, tokenized_val = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, val_data)
)

# Collator that batches the tokenized features for the seq2seq model
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

# Rest of code unchanged...


In [None]:
# Metrics
metric = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Compute the sacreBLEU score for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may itself be a
            tuple, in which case its first element is used.

    Returns:
        ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of the
        sacreBLEU result.
    """
    preds, labels = eval_preds

    # Some models return extra outputs alongside the generated ids.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # cannot be decoded. Swap it for the pad token in BOTH arrays: labels
    # always carry it, and predictions can carry it too once batches of
    # different lengths are padded together.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and references
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip stray whitespace left by decoding; sacreBLEU takes a list of
    # references for each prediction.
    predictions = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}

# Report the installed transformers version; the name of the evaluation
# schedule argument differs between releases (handled below).
import dataclasses

import torch
import transformers

print(f"Transformers version: {transformers.__version__}")

# `eval_steps` only takes effect when an evaluation strategy is set; with the
# default strategy no evaluation runs and compute_metrics is never called.
# Newer releases name the argument `eval_strategy`, older ones
# `evaluation_strategy`, so pick whichever this installation defines.
_training_arg_names = {f.name for f in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Training arguments - optimized for RTX 4060 (8GB VRAM)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    **{_eval_strategy_arg: "steps"},  # Turn step-based evaluation on
    eval_steps=500,                  # Evaluate every 500 steps
    save_steps=500,                  # Save checkpoint every 500 steps
    logging_steps=100,               # Log every 100 steps
    learning_rate=5e-5,
    per_device_train_batch_size=32,  # Increased for faster training
    per_device_eval_batch_size=32,   # Increased for faster evaluation
    gradient_accumulation_steps=2,   # Effective train batch = 32 * 2 per device
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,      # Evaluate on generated text (needed for BLEU)
    # Mixed precision needs a GPU; fall back to full precision on CPU instead
    # of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

# Gather everything the trainer needs, then build it.
# NOTE(review): recent transformers releases appear to prefer
# `processing_class=` over `tokenizer=` — confirm against the installed version.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Report the CUDA setup and make sure the model sits on the GPU when one exists
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("WARNING: CUDA NOT AVAILABLE - Training will be on CPU and very slow!")
else:
    # Basic device inventory
    print(f"GPU device count: {torch.cuda.device_count()}")
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

    # Where the model's parameters currently live (first parameter as proxy)
    print(f"Model device: {next(model.parameters()).device}")

    # Relocate the model only if it is not on a CUDA device yet
    if next(model.parameters()).device.type != 'cuda':
        device = torch.device("cuda")
        model = model.to(device)
        print(f"Model moved to: {next(model.parameters()).device}")

    # Current allocator statistics, converted from bytes to MB
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

In [None]:
# Optional GPU memory monitoring, followed by fine-tuning and saving.
if torch.cuda.is_available():
    from transformers import TrainerCallback

    print(f"Initial GPU memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

    class GpuMemoryLogger(TrainerCallback):
        """Print the allocated GPU memory once every 100 optimizer steps.

        A callback is used instead of wrapping ``trainer.training_step``:
        that method runs once per micro-batch, so with gradient accumulation
        the same global step would be reported several times.
        """

        def on_step_end(self, args, state, control, **kwargs):
            if state.global_step % 100 == 0:  # Log every 100 steps
                print(f"Step {state.global_step} GPU memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

    trainer.add_callback(GpuMemoryLogger())

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model()
print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# Direction whose fine-tuned checkpoint should be tried out
DIRECTION = "en-es"  # Change as needed

# Directory name follows the same pattern used when the model was saved
MODEL_DIR = f"finetuned-translation-{DIRECTION}"

# Reload the fine-tuned weights and tokenizer from disk
model = MarianMTModel.from_pretrained(MODEL_DIR)
tokenizer = MarianTokenizer.from_pretrained(MODEL_DIR)

# Helper used to try out the fine-tuned model
def translate(text):
    """Translate ``text`` with the loaded model and tokenizer.

    The text is tokenized into padded PyTorch tensors, passed to
    ``model.generate``, and the generated ids are decoded with special
    tokens removed. Only the first decoded string is returned.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    generated_ids = model.generate(**encoded)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Sample sentences, one list per source language
english_texts = [
    "Hello, how are you today?",
    "Machine translation is an interesting field of natural language processing.",
]

spanish_texts = [
    "Hola, ¿cómo estás hoy?",
    "La traducción automática es un campo interesante del procesamiento del lenguaje natural.",
]

# Feed English sentences to the en-es model, Spanish ones otherwise
test_texts = english_texts if DIRECTION == "en-es" else spanish_texts

# Translate every sample and show it under its source text, with a divider
for text in test_texts:
    translation = translate(text)
    print(
        f"Original: {text}",
        f"Translation: {translation}",
        "-" * 50,
        sep="\n",
    )