In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install transformers datasets sacrebleu sentencepiece accelerate peft -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta 

In [5]:
!pip install evaluate -q

In [6]:
import evaluate

In [8]:
# ===============================
# 2. Imports
# ===============================
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import random
import torch

In [9]:
# ===============================
# 3. Load base mT5 model + tokenizer
# ===============================
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# ===============================
# 4. Freeze model weights
# ===============================
for param in model.parameters():
    param.requires_grad = False

In [14]:
# ===============================
# 5. Apply LoRA
# ===============================
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)




In [15]:
# ===============================
# 6. Load OPUS-100 dataset (EN-FR)
# ===============================
dataset = load_dataset("opus100", "en-fr")

In [19]:
# Sample 100k pairs from training set
train_dataset = dataset["train"].shuffle(seed=42).select(range(100_000))
val_dataset = dataset["validation"].shuffle(seed=42).select(range(1_000))

In [20]:
train_dataset

Dataset({
    features: ['translation'],
    num_rows: 100000
})

In [21]:
# ===============================
# 7. Preprocess with Instruction Format (Bidirectional)
# ===============================
max_input_length = 128
max_target_length = 128

def preprocess(batch):
    inputs = []
    targets = []
    for ex in batch["translation"]:
        # EN -> FR
        inputs.append(f"translate English to French: {ex['en']}")
        targets.append(ex["fr"])

        # FR -> EN
        inputs.append(f"translate French to English: {ex['fr']}")
        targets.append(ex["en"])

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [22]:
tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess, batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [47]:
# ===============================
# 8. Data Collator
# ===============================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [48]:
# ===============================
# 9. Training Arguments
# ===============================
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-enfr-lora-100k",
    eval_strategy="epoch",
    learning_rate=5e-4,  # LoRA can use slightly higher LR
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=2,
    num_train_epochs=1,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    report_to="none"
)

In [49]:
# ===============================
# 10. Metric (BLEU)
# ===============================
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[tokenizer.decode(l, skip_special_tokens=True)] for l in labels]
    result = metric.compute(predictions=decoded_preds, references=labels)
    return {"bleu": result["score"]}

In [50]:
# ===============================
# 11. Trainer
# ===============================
trainer =  MySeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer =  MySeq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [51]:
# ===============================
# 12. Train (LoRA instruction tuning)
# ===============================
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss




OverflowError: out of range integral type conversion attempted

In [52]:
# ===============================
# 13. Save LoRA model and tokenizer
# ===============================
model.save_pretrained("./mt5-enfr-lora-100k")
tokenizer.save_pretrained("./mt5-enfr-lora-100k")

('./mt5-enfr-lora-100k/tokenizer_config.json',
 './mt5-enfr-lora-100k/special_tokens_map.json',
 './mt5-enfr-lora-100k/spiece.model',
 './mt5-enfr-lora-100k/added_tokens.json',
 './mt5-enfr-lora-100k/tokenizer.json')

In [53]:
from huggingface_hub import login

login(token="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")

In [63]:
from huggingface_hub import HfApi

# Set your repository name
repo_name = "mt5-small-lora-enfr-final"

# 3. Save trained model & tokenizer locally
model.save_pretrained(repo_name)
tokenizer.save_pretrained(repo_name)

# 4. Push to Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"✅ Model and tokenizer uploaded successfully to https://huggingface.co/Eshan210352R/{repo_name}")


Uploading...:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Uploading...:   0%|          | 0.00/20.7M [00:00<?, ?B/s]

✅ Model and tokenizer uploaded successfully to https://huggingface.co/Eshan210352R/mt5-small-lora-enfr-final


In [55]:
model.save_pretrained("./mt5-enfr-lora-100k", push_to_hub=True)
tokenizer.save_pretrained("./mt5-enfr-lora-100k", push_to_hub=True)

Uploading...:   0%|          | 0.00/20.7M [00:00<?, ?B/s]

('./mt5-enfr-lora-100k/tokenizer_config.json',
 './mt5-enfr-lora-100k/special_tokens_map.json',
 './mt5-enfr-lora-100k/spiece.model',
 './mt5-enfr-lora-100k/added_tokens.json',
 './mt5-enfr-lora-100k/tokenizer.json')

In [56]:
# ===============================
# 14. Test Inference
# ===============================
def translate_en_to_fr(text, max_length=128):
    input_text = f"translate English to French: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [57]:
def translate_fr_to_en(text, max_length=128):
    input_text = f"translate French to English: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [58]:
# Example test
print("EN → FR:", translate_en_to_fr("How are you today?"))
print("FR → EN:", translate_fr_to_en("Je suis très heureux de vous voir."))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)