In [1]:
 ! pip install datasets transformers



In [2]:
from datasets import load_dataset
import random
import numpy as np
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import concatenate_datasets

In [3]:
# ------------------ Load OPUS-100 dataset ------------------
language_pair = "en-it"
opus100_dataset = load_dataset("opus100", language_pair)

subset_size = 10000
train_subset = opus100_dataset["train"].select(range(subset_size))
validation_subset = opus100_dataset["validation"].select(range(subset_size // 10))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.7M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
# ------------------ Span Corruption Function ------------------
def apply_span_corruption(text, tokenizer, noise_density=0.15, mean_span_length=3):
    tokens = tokenizer.tokenize(text)
    n_tokens = len(tokens)
    n_mask = max(1, int(n_tokens * noise_density))  # number of tokens to mask

    # Choose random start positions
    mask_indices = np.random.choice(n_tokens, n_mask, replace=False)
    mask_indices.sort()

    corrupted_tokens = []
    target_tokens = []
    current_extra_id = 0
    i = 0

    while i < n_tokens:
        if i in mask_indices:
            # Insert sentinel token
            sentinel = f"<extra_id_{current_extra_id}>"
            corrupted_tokens.append(sentinel)
            target_tokens.append(sentinel)

            # Mask a random span length
            span_len = np.random.poisson(mean_span_length)
            if span_len < 1:
                span_len = 1

            # Collect masked tokens for target
            for j in range(i, min(i + span_len, n_tokens)):
                if j in mask_indices:
                    target_tokens.append(tokens[j])

            # Skip over the span
            i += span_len
            current_extra_id += 1
        else:
            corrupted_tokens.append(tokens[i])
            i += 1

    # Append EOS sentinel to target
    target_tokens.append(f"<extra_id_{current_extra_id}>")

    corrupted_text = tokenizer.convert_tokens_to_string(corrupted_tokens)
    target_text = tokenizer.convert_tokens_to_string(target_tokens)

    return corrupted_text, target_text

In [5]:
# ------------------ Apply corruption to dataset ------------------
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

def preprocess_dataset(examples, src_lang="en", tgt_lang="it"):
    corrupted_texts = []
    target_texts = []
    for ex in examples["translation"]:
        corrupted, target = apply_span_corruption(ex[src_lang], tokenizer)
        corrupted_texts.append(corrupted)
        target_texts.append(target)
    return {"input_text": corrupted_texts, "labels_text": target_texts}

train_subset = train_subset.map(preprocess_dataset, batched=True)
validation_subset = validation_subset.map(preprocess_dataset, batched=True)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
# ------------------ Tokenization ------------------
def tokenize_function(examples):
    return tokenizer(
        examples["input_text"],
        max_length=512,
        truncation=True,
        padding=False,
        text_target=examples["labels_text"],
    )

In [7]:
tokenized_train = train_subset.map(tokenize_function, batched=True, remove_columns=["translation", "input_text", "labels_text"])
tokenized_val = validation_subset.map(tokenize_function, batched=True, remove_columns=["translation", "input_text", "labels_text"])

tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
# ------------------ Model ------------------
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# ------------------ Data Collator ------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
# ------------------ Training Args ------------------
training_args = TrainingArguments(
    output_dir="./mt5-small-span-denoising",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
    learning_rate=5e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# ------------------ Trainer ------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)


In [12]:
# ------------------ Train ------------------
print("Start Model Training ------------------------")
trainer.train()
print("Model trained Successfully --------")

Start Model Training ------------------------


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33meshanmaduranga0329[0m ([33meshanmaduranga0329-esh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,1.1334,1.076286
2,0.9443,0.986983
3,0.9313,0.961402


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Model trained Successfully --------


In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
repo_name = "mt5-span-denoising-en-it-final"

# 3. Save trained model & tokenizer locally
model.save_pretrained(repo_name)
tokenizer.save_pretrained(repo_name)

# 4. Push to Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"✅ Model and tokenizer uploaded successfully to https://huggingface.co/<your-username>/{repo_name}")

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

✅ Model and tokenizer uploaded successfully to https://huggingface.co/<your-username>/mt5-span-denoising-en-it-final
