In [1]:
 ! pip install datasets transformers



In [3]:
from datasets import load_dataset

# OPUS-100 configuration to download: the English-Italian parallel corpus.
language_pair = "en-it"

# No split is requested, so a DatasetDict with every available split comes back.
opus100_dataset = load_dataset(path="opus100", name=language_pair)
print(opus100_dataset)

test-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.7M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})


In [4]:
# Number of training pairs to keep; rows are taken in order from the start of
# the train split (no shuffling or sampling).
subset_size = 10_000
train_subset = opus100_dataset["train"].select(indices=range(subset_size))
print(train_subset)

Dataset({
    features: ['translation'],
    num_rows: 10000
})


In [6]:
import random

def apply_noise(text, word_drop_prob=0.1, char_drop_prob=0.1, swap_prob=0.05, rng=None):
    """Return a randomly corrupted copy of ``text`` for denoising training.

    The text is split on whitespace and three corruptions are applied:

    1. each word is dropped with probability ``word_drop_prob``;
    2. each surviving word longer than one character loses one randomly
       chosen character with probability ``char_drop_prob``;
    3. with probability ``swap_prob`` one randomly chosen pair of adjacent
       surviving words is swapped (only if at least two words survive).

    Args:
        text: Sentence to corrupt.
        word_drop_prob: Per-word deletion probability (default 0.1).
        char_drop_prob: Per-word single-character deletion probability
            (default 0.1).
        swap_prob: Per-sentence adjacent-swap probability (default 0.05).
        rng: Optional ``random.Random``-like object providing ``random()`` and
            ``randint()``. When ``None`` the module-level ``random`` generator
            is used, which matches the previous behaviour.

    Returns:
        The surviving words joined by single spaces. This is the empty string
        when ``text`` has no words or every word was dropped.
    """
    rng = random if rng is None else rng

    noisy_words = []
    for word in text.split():
        if rng.random() < word_drop_prob:
            continue  # word deleted
        # The random draw comes before the length check, so one draw is
        # consumed for every surviving word regardless of its length.
        if rng.random() < char_drop_prob and len(word) > 1:
            drop_at = rng.randint(0, len(word) - 1)
            word = word[:drop_at] + word[drop_at + 1:]
        noisy_words.append(word)

    # At most one adjacent swap per sentence.
    if len(noisy_words) > 1 and rng.random() < swap_prob:
        left = rng.randint(0, len(noisy_words) - 2)
        noisy_words[left], noisy_words[left + 1] = noisy_words[left + 1], noisy_words[left]

    return " ".join(noisy_words)

# apply_noise draws from the global `random` generator. Seed it here so the
# synthetic corruption, and therefore the training data, is the same on every
# fresh run of the notebook.
random.seed(42)

# Keep the clean sentence pair and add a corrupted copy of each side under the
# "en_noisy" / "it_noisy" keys of the same "translation" record.
train_subset = train_subset.map(lambda example: {
    "translation": {
        "en": example["translation"]["en"],
        "it": example["translation"]["it"],
        "en_noisy": apply_noise(example["translation"]["en"]),
        "it_noisy": apply_noise(example["translation"]["it"]),
    }
})

print(train_subset[0])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'translation': {'en': '- Thanks, buddy.', 'en_noisy': '- Thanks, buddy.', 'it': '- Grazie, amico.', 'it_noisy': '- amico.'}}


In [7]:
import random
from transformers import MT5Tokenizer
from datasets import DatasetDict, concatenate_datasets


# Tokenizer shipped with the google/mt5-small checkpoint; shared by every
# tokenize_* function and by the data collator further down.
tokenizer = MT5Tokenizer.from_pretrained(pretrained_model_name_or_path="google/mt5-small")

def tokenize_en_fr(examples):
    """Tokenize a batch for the noisy-English -> clean-Italian direction.

    Despite the ``fr`` in the name, the target side is read from the ``"it"``
    key of each translation record.

    Args:
        examples: Batched mapping whose ``"translation"`` entry is a list of
            dicts carrying at least ``"en_noisy"`` and ``"it"``.

    Returns:
        The tokenizer output for the noisy English inputs, with the clean
        target sentences supplied as ``text_target``; truncation is enabled
        with ``max_length=512``.
    """
    noisy_sources, clean_targets = [], []
    for pair in examples["translation"]:
        noisy_sources.append(pair["en_noisy"])
        clean_targets.append(pair["it"])

    encoded = tokenizer(
        noisy_sources,
        text_target=clean_targets,
        truncation=True,
        max_length=512,
    )
    return encoded

def tokenize_fr_en(examples):
    """Tokenize a batch for the noisy-Italian -> clean-English direction.

    Despite the ``fr`` in the name, the source side is read from the
    ``"it_noisy"`` key of each translation record.

    Args:
        examples: Batched mapping whose ``"translation"`` entry is a list of
            dicts carrying at least ``"it_noisy"`` and ``"en"``.

    Returns:
        The tokenizer output for the noisy inputs, with the clean English
        sentences supplied as ``text_target``; truncation is enabled with
        ``max_length=512``.
    """
    noisy_sources, clean_targets = [], []
    for pair in examples["translation"]:
        noisy_sources.append(pair["it_noisy"])
        clean_targets.append(pair["en"])

    encoded = tokenizer(
        noisy_sources,
        text_target=clean_targets,
        truncation=True,
        max_length=512,
    )
    return encoded

# One tokenized copy of the training subset per denoising direction.
tokenized_en_fr = train_subset.map(tokenize_en_fr, batched=True)
tokenized_fr_en = train_subset.map(tokenize_fr_en, batched=True)

# Stack both directions into a single dataset, then drop the raw-text column
# so only the tokenizer outputs remain.
tokenized_dataset = concatenate_datasets(
    [tokenized_en_fr, tokenized_fr_en]
).remove_columns(["translation"])

# Have indexing return PyTorch tensors.
tokenized_dataset.set_format("torch")

print(tokenized_dataset[0])


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

{'input_ids': tensor([   259,    264,  16385,    261,    259, 154618,    260,      1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([   259,    264,    259,  46445,    261,    259, 121496,    260,      1])}


In [8]:
from transformers import MT5ForConditionalGeneration

# Pretrained mT5-small sequence-to-sequence model that will be fine-tuned;
# printing it shows the full module tree.
model = MT5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="google/mt5-small"
)
print(model)

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [11]:
from transformers import TrainingArguments, Trainer
from datasets import DatasetDict, concatenate_datasets

# Hyperparameters for the Trainer built at the end of this cell.
training_args = TrainingArguments(
    # --- output, evaluation and checkpointing ---
    output_dir="./mt5-small-denoising",
    eval_strategy="epoch",
    save_strategy="epoch",  # kept equal to eval_strategy
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- logging ---
    logging_dir="./logs",
    logging_steps=100,
    # --- schedule and batching ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_num_workers=2,
    # --- optimizer ---
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    # --- reproducibility ---
    seed=42,
)


# Evaluate on the first subset_size // 10 pairs of the validation split.
validation_subset = opus100_dataset["validation"].select(range(subset_size // 10))

# apply_noise draws from the global `random` generator; seed it so the noisy
# validation set (and therefore eval_loss) is identical on every run.
random.seed(42)

# Corrupt the validation pairs exactly like the training pairs. The keys follow
# the training schema ("it" / "it_noisy"), so the tokenize_en_fr and
# tokenize_fr_en functions defined for the training subset are reused here
# rather than being redefined with a second, differently-keyed copy.
validation_subset = validation_subset.map(lambda example: {
    "translation": {
        "en": example["translation"]["en"],
        "it": example["translation"]["it"],
        "en_noisy": apply_noise(example["translation"]["en"]),
        "it_noisy": apply_noise(example["translation"]["it"]),
    }
})

# Tokenize both denoising directions, stack them, drop the raw-text column and
# return PyTorch tensors, mirroring the preparation of the training data.
tokenized_validation_dataset = validation_subset.map(tokenize_en_fr, batched=True)
tokenized_validation_dataset_fr_en = validation_subset.map(tokenize_fr_en, batched=True)
tokenized_validation_dataset = concatenate_datasets([tokenized_validation_dataset, tokenized_validation_dataset_fr_en])
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(["translation"])
tokenized_validation_dataset.set_format("torch")


# Wire model, hyperparameters and both tokenized datasets into a Trainer.
# No data collator is passed at this point.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_validation_dataset,
    train_dataset=tokenized_dataset,
)

print(trainer)

<transformers.trainer.Trainer object at 0x7a0f13fe6060>


In [12]:
from transformers import DataCollatorForSeq2Seq
# Seq2seq collator: pads the inputs and labels of each batch dynamically and
# fills label padding with -100. It is handed to the Trainer in the next cell.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [13]:
# Same configuration as before, but with a per-device batch size of 2 and
# 4 gradient-accumulation steps.
training_args = TrainingArguments(
    # --- output, evaluation and checkpointing ---
    output_dir="./mt5-small-denoising",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- logging ---
    logging_dir="./logs",
    logging_steps=100,
    # --- schedule and batching ---
    num_train_epochs=3,
    per_device_train_batch_size=2,  # further reduced batch size
    per_device_eval_batch_size=2,   # further reduced batch size
    gradient_accumulation_steps=4,  # increased gradient accumulation
    dataloader_num_workers=2,
    # --- optimizer ---
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    # --- reproducibility ---
    seed=42,
)

# Rebuild the Trainer with the new arguments, this time passing the seq2seq
# data collator so batches are padded.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_validation_dataset,
    train_dataset=tokenized_dataset,
)

# Fine-tune; the banner lines bracket the training logs in the cell output.
print("Start Model Training ------------------------")
trainer.train()
print("Model trained Sucessfully --------")

Start Model Training ------------------------


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33meshanmaduranga0329[0m ([33meshanmaduranga0329-esh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,4.2709,3.355012
2,3.8315,3.160891
3,3.7009,3.118719


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


Model trained Sucessfully --------


In [14]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; presumably required so that the
# push_to_hub calls in the next cell are authenticated (confirm).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Used both as the local export directory and as the Hub repository name.
repo_name = "mt5-small-denoising-en-it-final"

# Save the fine-tuned model first, then the tokenizer, into the local directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(repo_name)

# Upload in the same order to the Hugging Face Hub.
for artifact in (model, tokenizer):
    artifact.push_to_hub(repo_name)

print(f"✅ Model and tokenizer uploaded successfully to https://huggingface.co/<your-username>/{repo_name}")


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

✅ Model and tokenizer uploaded successfully to https://huggingface.co/<your-username>/mt5-small-denoising-en-it-final


In [16]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


# Round-trip check: pull the uploaded model and tokenizer back from the Hub
# through the Auto* classes.
hub_repo_id = "Eshan210352R/mt5-small-denoising-en-it-final"
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(hub_repo_id)
loaded_tokenizer = AutoTokenizer.from_pretrained(hub_repo_id)

print("Model and tokenizer loaded successfully!")

config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/893 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Model and tokenizer loaded successfully!
