In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
! pip install datasets transformers

^C
Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 4, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/u

In [None]:
from datasets import load_dataset, concatenate_datasets
import random
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
# ---------------------------
# Load dataset
# ---------------------------
# The OPUS-100 configuration name is the language pair itself.
language_pair = "en-es"
opus100_dataset = load_dataset("opus100", name=language_pair)

# Show the available splits and their sizes.
print(opus100_dataset)



In [None]:
# Number of training examples to keep (the first `subset_size` rows of the split).
subset_size = 100000

# Clamp to the split length: `select` raises if asked for indices past the end,
# which would happen for a language pair with fewer than `subset_size` rows.
train_split = opus100_dataset["train"]
train_subset = train_split.select(range(min(subset_size, len(train_split))))
print(train_subset)

In [None]:
# ---------------------------
# Noise function
# ---------------------------
def apply_noise(text, word_drop_prob=0.1, char_drop_prob=0.1, swap_prob=0.05, rng=None):
    """Return a corrupted copy of ``text`` for denoising training.

    The text is split on whitespace and re-joined with single spaces, so any
    run of whitespace in the input is normalised in the output.

    Args:
        text: Sentence to corrupt.
        word_drop_prob: Per-word probability of deleting the whole word.
        char_drop_prob: Per-word probability of deleting one random character
            (only applied to words longer than one character).
        swap_prob: Probability of swapping one random pair of adjacent words.
        rng: Optional ``random.Random``-like object providing ``random()``,
            ``randint()`` and ``choice()``. Defaults to the global ``random``
            module, which matches the previous behaviour.

    Returns:
        The noisy sentence. It is empty only when ``text`` contains no words.
    """
    rng = random if rng is None else rng

    words = text.split()
    noisy_words = []
    for word in words:
        if rng.random() < word_drop_prob:  # delete the whole word
            continue
        if rng.random() < char_drop_prob and len(word) > 1:  # delete one character
            char_list = list(word)
            del char_list[rng.randint(0, len(char_list) - 1)]
            word = "".join(char_list)
        noisy_words.append(word)

    # If every word was deleted, keep one original word so the model never
    # receives an empty input paired with a non-empty target.
    if words and not noisy_words:
        noisy_words.append(rng.choice(words))

    # Occasionally swap one pair of adjacent words.
    if len(noisy_words) > 1 and rng.random() < swap_prob:
        swap_index = rng.randint(0, len(noisy_words) - 2)
        noisy_words[swap_index], noisy_words[swap_index + 1] = (
            noisy_words[swap_index + 1],
            noisy_words[swap_index],
        )

    return " ".join(noisy_words)


In [None]:
# ---------------------------
# Add noisy fields to training set
# ---------------------------
# apply_noise draws from Python's global `random` module. Seed it here so a
# fresh "Restart & Run All" corrupts the training sentences the same way every
# time (same value as the `seed` passed to TrainingArguments).
random.seed(42)

# Rebuild each "translation" record with the clean sentences plus a noisy
# variant of each language. English is corrupted before Spanish for every row.
train_subset = train_subset.map(
    lambda example: {
        "translation": {
            "en": example["translation"]["en"],
            "es": example["translation"]["es"],
            "en_noisy": apply_noise(example["translation"]["en"]),
            "es_noisy": apply_noise(example["translation"]["es"]),
        }
    }
)

# Spot-check one record to confirm the noisy fields were added.
print(train_subset[0])

In [None]:
# ---------------------------
# Tokenizer
# ---------------------------
# Tokenizer checkpoint; the model cell loads the same "google/mt5-small" name.
tokenizer_checkpoint = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# ---------------------------
# Tokenization functions (DENOISING!)
# ---------------------------
def tokenize_en_denoise(examples):
    """Tokenize a batch for English denoising (noisy English -> clean English).

    Args:
        examples: Batch whose "translation" entry is a sequence of records
            holding the keys "en_noisy" and "en".

    Returns:
        The result of calling the module-level ``tokenizer`` on the noisy
        sentences with the clean sentences passed as ``text_target``, using
        truncation with ``max_length=512``.
    """
    noisy_inputs = []
    clean_targets = []
    for record in examples["translation"]:
        noisy_inputs.append(record["en_noisy"])
        clean_targets.append(record["en"])

    encoded = tokenizer(
        noisy_inputs,
        text_target=clean_targets,
        truncation=True,
        max_length=512,
    )
    return encoded

In [None]:
def tokenize_fr_denoise(examples):
    """Tokenize a batch for Spanish denoising (noisy Spanish -> clean Spanish).

    NOTE: despite the ``fr`` in its name, this function reads the "es_noisy"
    and "es" fields, i.e. it handles Spanish. The name is kept unchanged
    because other cells call it.

    Args:
        examples: Batch whose "translation" entry is a sequence of records
            holding the keys "es_noisy" and "es".

    Returns:
        The result of calling the module-level ``tokenizer`` on the noisy
        sentences with the clean sentences passed as ``text_target``, using
        truncation with ``max_length=512``.
    """
    records = examples["translation"]
    es_noisy_batch = [record["es_noisy"] for record in records]
    es_original_batch = [record["es"] for record in records]

    encoded = tokenizer(
        es_noisy_batch,
        text_target=es_original_batch,
        truncation=True,
        max_length=512,
    )
    return encoded

In [None]:
# ---------------------------
# Tokenize datasets
# ---------------------------
# Tokenize the same training subset twice: once per denoising language.
tokenized_en = train_subset.map(tokenize_en_denoise, batched=True)
tokenized_fr = train_subset.map(tokenize_fr_denoise, batched=True)  # "fr" variant reads the Spanish fields

# Stack both tokenized sets, then drop the raw "translation" column.
tokenized_dataset = concatenate_datasets(
    [tokenized_en, tokenized_fr]
).remove_columns(["translation"])

# Have the dataset return torch tensors when indexed.
tokenized_dataset.set_format("torch")

print(tokenized_dataset[0])

In [None]:
# ---------------------------
# Validation set
# ---------------------------
validation_subset = opus100_dataset["validation"]

# apply_noise draws from Python's global `random` module. Seed it so the noisy
# validation inputs are the same on every fresh run; otherwise eval_loss values
# from different runs are measured on different data and cannot be compared.
random.seed(42)

# Rebuild each "translation" record with the clean sentences plus a noisy
# variant of each language, as is done for the training subset.
validation_subset = validation_subset.map(
    lambda example: {
        "translation": {
            "en": example["translation"]["en"],
            "es": example["translation"]["es"],
            "en_noisy": apply_noise(example["translation"]["en"]),
            "es_noisy": apply_noise(example["translation"]["es"]),
        }
    }
)

# Tokenize once per language, stack the results, drop the raw text column and
# have the dataset return torch tensors.
tokenized_val_en = validation_subset.map(tokenize_en_denoise, batched=True)
tokenized_val_fr = validation_subset.map(tokenize_fr_denoise, batched=True)
tokenized_validation_dataset = concatenate_datasets([tokenized_val_en, tokenized_val_fr])
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(["translation"])
tokenized_validation_dataset.set_format("torch")

In [None]:
# ---------------------------
# Load model
# ---------------------------
# Model checkpoint; same "google/mt5-small" name the tokenizer cell loads.
model_checkpoint = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Print the module tree of the loaded model.
print(model)

In [None]:
# ---------------------------
# Training setup
# ---------------------------
training_args = TrainingArguments(
    # --- Output locations / reporting ---
    output_dir="./mt5-small-denoising",
    logging_dir="./logs",
    report_to="none",
    # --- Run length, batching, data loading ---
    num_train_epochs=1,
    per_device_train_batch_size=2,  # reduce if OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 sequences per device per optimizer step
    dataloader_num_workers=2,
    seed=42,
    # --- Optimizer ---
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    # --- Logging, evaluation and checkpointing cadence ---
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the best eval_loss once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Seq2seq collator: pads each batch dynamically rather than to a fixed length.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# ---------------------------
# Trainer
# ---------------------------
# Wire the model, its training configuration, both tokenized datasets and the
# padding collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_validation_dataset,
)


In [None]:
# ---------------------------
# Train
# ---------------------------
# Announce the start, run the training loop, then announce completion.
print("Start Model Training ------------------------")
# Runs training with the model, datasets and training_args given to the Trainer.
trainer.train()
print("Model trained Successfully --------")

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. The token that was
# previously committed here must be treated as leaked and revoked in the
# Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable (for example populated
# from the platform's secret store); fall back to an interactive prompt so the
# value never ends up in the saved notebook.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    hf_token = getpass("Hugging Face access token: ")

login(token=hf_token)

In [None]:
repo_name = "mt5-small-denoising-en-es-correct-deoise"

# Model first, tokenizer second, for both the local save and the upload.
artifacts = (model, tokenizer)

# Save the trained model and tokenizer to a local directory named after the repo.
for artifact in artifacts:
    artifact.save_pretrained(repo_name)

# Push both to the Hugging Face Hub under the same repository name.
for artifact in artifacts:
    artifact.push_to_hub(repo_name)

print(f"✅ Model and tokenizer uploaded successfully to https://huggingface.co/Eshan210352R/{repo_name}")