In [1]:
! pip install transformers datasets evaluate sacrebleu sentencepiece accelerate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import gc
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
import evaluate
import numpy as np

# Clear all memory first
torch.cuda.empty_cache()
gc.collect()

# Set memory optimization settings
torch.backends.cuda.max_split_size_mb = 128
torch.cuda.set_per_process_memory_fraction(0.9)


In [3]:
print("🧹 Memory cleaned and optimized")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# =============================
# 2️⃣ Load Your Pre-trained Model with Memory Optimizations
# =============================
model_name = "Eshan210352R/mt5-small-denoising-en-it-final"

# Load model in fp16 to save memory
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Half precision
    low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"  # Important for mT5

print("✅ Loaded your pre-trained model with memory optimizations")

# Move model to GPU
model = model.to('cuda')
print("📊 Model moved to GPU")


🧹 Memory cleaned and optimized
GPU available: True
GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/893 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


✅ Loaded your pre-trained model with memory optimizations
📊 Model moved to GPU


In [4]:
# =============================
# 3️⃣ Load and Prepare Dataset (Memory Efficient)
# =============================
# Load dataset
try:
    dataset = load_dataset("opus100", "en-it")
except:
    dataset = load_dataset("Helsinki-NLP/opus-100", "en-it")

# Use very small subset for Colab T4
train_size = 15000  # Reduced for memory
val_size = 1000

train_dataset = dataset["train"].select(range(min(train_size, len(dataset["train"]))))
val_dataset = dataset["validation"].select(range(min(val_size, len(dataset["validation"]))))

print(f"📊 Training samples: {len(train_dataset)}")
print(f"📊 Validation samples: {len(val_dataset)}")


README.md: 0.00B [00:00, ?B/s]

en-it/test-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

en-it/train-00000-of-00001.parquet:   0%|          | 0.00/91.7M [00:00<?, ?B/s]

en-it/validation-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

📊 Training samples: 15000
📊 Validation samples: 1000


In [5]:
# =============================
# 4️⃣ Memory-Optimized Tokenization
# =============================
max_length = 64  # Reduced sequence length

def tokenize_for_translation(examples):
    """Memory-efficient tokenization"""
    inputs = [f"translate English to Italian: {ex['en']}" for ex in examples["translation"]]
    targets = [ex["it"] for ex in examples["translation"]]

    # Tokenize without padding first
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding=False,  # DataCollator will handle padding
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            truncation=True,
            padding=False,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
# Tokenize in small batches
print("🔄 Tokenizing dataset...")
tokenized_train = train_dataset.map(
    tokenize_for_translation,
    batched=True,
    batch_size=256,  # Small batches
    remove_columns=train_dataset.column_names
)

tokenized_val = val_dataset.map(
    tokenize_for_translation,
    batched=True,
    batch_size=256,
    remove_columns=val_dataset.column_names
)

print("✅ Tokenization completed")

🔄 Tokenizing dataset...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Tokenization completed


In [7]:
# =============================
# 5️⃣ Memory-Optimized Data Collator
# =============================
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
    return_tensors="pt"
)

# =============================
# 6️⃣ Memory Monitoring Function
# =============================
def print_gpu_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        print(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")

print_gpu_memory()


GPU Memory - Allocated: 0.57GB, Reserved: 0.59GB


In [9]:
# =============================
# 7️⃣ Ultra Memory-Optimized Training Arguments
# =============================
training_args = TrainingArguments(
    # Output settings
    output_dir="./mt5-translation-optimized",
    overwrite_output_dir=True,

    # Batch sizes for T4 GPU
    per_device_train_batch_size=2,    # Very small batch size
    per_device_eval_batch_size=2,     # Very small batch size
    gradient_accumulation_steps=8,    # Effective batch size = 2 * 8 = 16

    # Training schedule
    num_train_epochs=3,
    max_steps=500,  # Safety limit

    # Optimization
    learning_rate=2e-5,
    warmup_steps=50,
    optim="adafactor",  # Memory-efficient optimizer
    weight_decay=0.01,
    max_grad_norm=1.0,

    # Memory optimizations
    fp16=True,                      # Mixed precision
    gradient_checkpointing=True,    # Trade compute for memory
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    remove_unused_columns=False,    # Avoid memory issues

    # Evaluation
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Logging
    logging_steps=25,
    report_to=None,
    save_total_limit=1,

      # Disable during training to save memory
)

In [10]:
# =============================
# 8️⃣ Simple Metrics Function (Memory Efficient)
# =============================
def compute_simple_metrics(eval_pred):
    """Lightweight metrics to save memory"""
    predictions, labels = eval_pred

    # Handle -100 labels
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Calculate accuracy on non-padding tokens
    matches = (predictions == labels)
    accuracy = matches.mean()

    return {"accuracy": accuracy}

In [11]:
# =============================
# 9️⃣ Memory-Aware Trainer with Callbacks
# =============================
class MemoryCallback:
    """Callback to monitor and manage memory"""
    def on_step_begin(self, args, state, control, **kwargs):
        if state.global_step % 50 == 0:
            print_gpu_memory()
            torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_simple_metrics,  # Lightweight metrics
)

# Add memory callback
trainer.add_callback(MemoryCallback())

  trainer = Trainer(


In [12]:
# =============================
# 🔟 Gradual Training with Error Handling
# =============================
print("🚀 Starting memory-optimized training...")

try:
    # Train in phases to monitor memory
    print("Phase 1: Training first 100 steps...")
    train_result = trainer.train()

except RuntimeError as e:
    if "out of memory" in str(e).lower():
        print("❌ OOM Error! Trying recovery...")

        # Clear memory and try smaller batch size
        torch.cuda.empty_cache()
        gc.collect()

        # Ultra-aggressive settings
        training_args.per_device_train_batch_size = 1
        training_args.gradient_accumulation_steps = 16
        training_args.max_steps = 200

        # Recreate trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        trainer.add_callback(MemoryCallback())

        print("🔄 Retrying with ultra-low memory settings...")
        train_result = trainer.train()


🚀 Starting memory-optimized training...
Phase 1: Training first 100 steps...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33meshanmaduranga0329[0m ([33meshanmaduranga0329-esh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


AttributeError: 'MemoryCallback' object has no attribute 'on_train_begin'

In [None]:
# =============================
# 1️⃣1️⃣ Final Evaluation with BLEU (After Training)
# =============================
if 'train_result' in locals():
    print("\n=== Training Completed Successfully ===")

    # Clear memory for proper evaluation
    torch.cuda.empty_cache()
    gc.collect()

    # Full BLEU evaluation
    def compute_bleu(eval_pred):
        predictions, labels = eval_pred

        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Use a subset for evaluation to save memory
        if len(decoded_preds) > 50:
            decoded_preds = decoded_preds[:50]
            decoded_labels = decoded_labels[:50]

        bleu_metric = evaluate.load("sacrebleu")
        result = bleu_metric.compute(
            predictions=decoded_preds,
            references=[[label] for label in decoded_labels]
        )
        return {"bleu": result["score"]}

    print("📊 Running final BLEU evaluation...")
    eval_results = trainer.evaluate()
    print(f"✅ Final BLEU Score: {eval_results.get('eval_bleu', 'N/A'):.2f}")

    # =============================
    # 1️⃣2️⃣ Save the Model
    # =============================
    print("💾 Saving model...")
    trainer.save_model("./mt5-en-it-translator-final")
    tokenizer.save_pretrained("./mt5-en-it-translator-final")
    print("✅ Model saved successfully!")

else:
    print("❌ Training failed. Trying emergency fallback...")

    # Emergency minimal training
    from transformers import TrainingArguments, Trainer

    # Use tiny subset
    tiny_train = tokenized_train.select(range(100))

    emergency_args = TrainingArguments(
        output_dir="./emergency-training",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        max_steps=50,
        learning_rate=1e-5,
        fp16=True,
        save_strategy="no",
        logging_steps=10,
    )

    emergency_trainer = Trainer(
        model=model,
        args=emergency_args,
        train_dataset=tiny_train,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    emergency_trainer.train()
    emergency_trainer.save_model("./mt5-en-it-translator-minimal")
    print("✅ Minimal model saved!")
