In [None]:
# Mount Google Drive at /content/drive; the corpus and checkpoint paths
# configured later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
!pip install transformers==4.37.2
!pip install peft==0.7.1
!pip install accelerate==0.26.0
!pip install sentence-transformers==2.7.0
!pip install sentencepiece datasets sacremoses
!pip install torch # accelerate tensorboard transformers
!pip install pandas numpy peft
"""

In [None]:
!pip install transformers==4.51.3 datasets==3.5.0 peft==0.15.2
!pip install accelerate==1.6.0 evaluate==0.4.3 sacrebleu==2.5.1
!pip install datasets sacremoses torch pandas numpy

In [3]:
import gc
import torch
import numpy as np
import sacrebleu
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import TrainerCallback
from transformers import NllbTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback # ,CustomEarlyStoppingCallback
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq # ,DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
# Checkpoint id passed to `from_pretrained`, plus a Drive path named after the
# same checkpoint (presumably a local copy; it is not referenced elsewhere in
# the visible notebook -- TODO confirm whether it is still needed).
model_name, downloaded = (
    'facebook/nllb-200-distilled-600M',
    '/content/drive/MyDrive/nllb-200-distilled-600M',
)

# Excel files holding the train / validation / test splits of the corpus.
# (Despite the `_dir` suffix, each value is a path to a single .xlsx file.)
train_data_dir = '/content/drive/MyDrive/Corpus/train.xlsx'
val_data_dir = '/content/drive/MyDrive/Corpus/val.xlsx'
test_data_dir = '/content/drive/MyDrive/Corpus/test.xlsx'

# Directory that receives the trainer's checkpoints.
checkpoint_output_dir = '/content/drive/MyDrive/NLLB Checkpoints/MY-EN/Clementine/'

In [5]:
# Token-length caps for source and target sequences.
MAX_SRC_LENGTH, MAX_TGT_LENGTH = 256, 256

In [None]:
# Tokenizer configured with Myanmar as the source language and English as the
# target language.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="mya_Mymr",
    tgt_lang="eng_Latn",
)

# Base seq2seq model, loaded from the same checkpoint id as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Print the loaded model's module tree to inspect its layer names.
print(model)

In [None]:
# List every torch.nn.Linear submodule by its dotted name.
# Names follow: model.<component>.layers.<index>.<sub_component>.<layer_name>
linear_names = (
    mod_name
    for mod_name, mod in model.named_modules()
    if isinstance(mod, torch.nn.Linear)
)
for linear_name in linear_names:
    print(linear_name)

In [8]:
def prepare(tokenizer, df, verbose=True):
    """Tokenize a Myanmar->English parallel frame into a `datasets.Dataset`.

    Args:
        tokenizer: Tokenizer called on the source texts and, through
            ``text_target``, on the target texts.
        df: Frame with a ``"my"`` (source) and an ``"en"`` (target) column.
        verbose: When true (the default), print the decoded first example and
            its label ids as a sanity check.

    Returns:
        Dataset with ``input_ids``, ``attention_mask`` and ``labels`` columns.
        Sources are padded/truncated to ``MAX_SRC_LENGTH`` tokens and targets
        to ``MAX_TGT_LENGTH`` tokens; padding positions in ``labels`` are
        replaced with -100.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("prepare() received an empty DataFrame")

    source_texts = list(df["my"])  # source text (Myanmar)
    target_texts = list(df["en"])  # target text (English)

    encoded = tokenizer(
        source_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_SRC_LENGTH,
        return_tensors="pt"
    )

    # Targets are encoded in their own call so that they get their own length
    # cap; a single call would apply one max_length to both sides.
    target_encoded = tokenizer(
        text_target=target_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_TGT_LENGTH,
        return_tensors="pt"
    )
    labels = target_encoded["input_ids"]

    if verbose:
        # Decode the first pair with special tokens kept, before the padding
        # ids in the labels are overwritten.
        decoded_src = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=False)
        print("Source (Myanmar):", decoded_src)
        decoded_tgt = tokenizer.decode(labels[0], skip_special_tokens=False)
        print("Target (English):", decoded_tgt)

    # Replace padding tokens in labels with -100
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    if verbose:
        print("First target IDs:", labels[0])

    return Dataset.from_dict({
        "input_ids": encoded["input_ids"].numpy(),
        "attention_mask": encoded["attention_mask"].numpy(),
        "labels": labels.numpy()
    })

In [None]:
# Read the three corpus splits from their Excel files.
train_df, val_df, test_df = (
    pd.read_excel(path)
    for path in (train_data_dir, val_data_dir, test_data_dir)
)

# Tokenize each split into a Dataset (prepare() also prints a sample of each).
train_dataset, val_dataset, test_dataset = (
    prepare(tokenizer=tokenizer, df=frame)
    for frame in (train_df, val_df, test_df)
)

# Show the resulting dataset summaries.
for split in (train_dataset, test_dataset, val_dataset):
    print(split)

In [None]:
# LoRA adapter settings for the seq2seq model.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,  # False because the adapters are being trained
    r=16,
    lora_alpha=32,         # rule of thumb used here: between 2*r and 4*r
    # Dropout guide used here: 0.1 general purpose, 0.05 for higher quality /
    # larger datasets, 0.2 for tiny datasets or to fight overfitting.
    lora_dropout=0.05,
    bias="none",
    # Only the attention projections are adapted; the feed-forward (ffn)
    # layers are deliberately not included.
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    modules_to_save=None,  # list layer names here to train specific layers fully
)

# Wrap the base model with the adapters (this rebinds `model`) and print the
# wrapped module tree.
model = get_peft_model(model, lora_config)
print(model)

In [11]:
# Earlier version of compute_metrics, disabled by wrapping it in a string
# literal: nothing inside the quotes below is executed. It decodes predictions
# directly with tokenizer.batch_decode, whereas the active version further down
# routes them through safe_batch_decode first.
'''
def compute_metrics(eval_pred):
    """eval_pred is (predictions, labels) as numpy arrays from Trainer."""
    preds, labels = eval_pred

    # If predictions are token ids (not strings), decode using tokenizer
    if isinstance(preds, tuple):
        preds = preds[0]

    # decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in labels as pad_token_id for correct decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacrebleu expects list of hypothesis strings and list-of-list of references
    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels]).score
    chrf = sacrebleu.corpus_chrf(decoded_preds, [decoded_labels]).score
    ter = sacrebleu.corpus_ter(decoded_preds, [decoded_labels]).score

    # Optionally add chrF or others later
    return {
        "bleu": bleu,
        "chrf": chrf,
        "ter": ter
    }
'''

def safe_batch_decode(sequences):
    """Decode batches of token ids, tolerating ids the tokenizer cannot map.

    Any id that is negative or not below ``len(tokenizer)`` is replaced with
    the pad token id before decoding, so stray values in predictions (for
    example negative filler ids) cannot break ``batch_decode``.

    Args:
        sequences: Iterable of token-id sequences (e.g. nested lists of ints).

    Returns:
        List of decoded strings, with special tokens skipped.
    """
    pad_id = tokenizer.pad_token_id
    # len(tokenizer) is the full vocabulary including added tokens, whereas
    # `vocab_size` only counts the base vocabulary; bounding by the latter can
    # wrongly discard valid added-token ids.
    id_limit = len(tokenizer)
    cleaned = [
        [tok if 0 <= tok < id_limit else pad_id for tok in seq]
        for seq in sequences
    ]
    return tokenizer.batch_decode(cleaned, skip_special_tokens=True)

def compute_metrics(eval_pred):
    """Score predictions against references with corpus BLEU, chrF and TER.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` may itself
            be a tuple, in which case its first element is used.

    Returns:
        Dict with the sacrebleu scores under ``"bleu"``, ``"chrf"``, ``"ter"``.
    """
    predictions, references = eval_pred
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    hypotheses = safe_batch_decode(predictions.tolist())

    # -100 marks ignored label positions; map it back to the pad id so the
    # references can be decoded.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    reference_texts = safe_batch_decode(references.tolist())

    scorers = {
        "bleu": sacrebleu.corpus_bleu,
        "chrf": sacrebleu.corpus_chrf,
        "ter": sacrebleu.corpus_ter,
    }
    # sacrebleu takes the hypotheses plus a list of reference streams.
    return {
        metric: scorer(hypotheses, [reference_texts]).score
        for metric, scorer in scorers.items()
    }

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
# Optional knobs deliberately not set here: padding=True (dynamic padding),
# pad_to_multiple_of=8 (optional, recommended for GPU speed) and
# label_pad_token_id=-100 (the default).
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt",
)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the lowest eval_loss when training finishes.
model_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_output_dir,
    save_safetensors=False,

    # Step-based alternative (disabled):
    # eval_strategy='steps',
    # eval_steps=10,
    # save_strategy='steps',
    # save_steps=1000,

    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,

    # 8 samples x 8 accumulation steps = 64 samples per optimizer step per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,

    torch_empty_cache_steps=50,

    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=5,

    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,

    # warmup_steps=500,
    warmup_ratio=0.05,
    lr_scheduler_type='linear',

    report_to="none",
    push_to_hub=False,

    logging_strategy='steps',
    logging_steps=100,
    logging_dir="./logs",

    use_cpu=False,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    dataloader_num_workers=2,

    # Replace with ##-commented sentences for eval_bleu

    ## predict_with_generate=True,
    ## generation_num_beams=4,
    ## generation_max_length=MAX_TGT_LENGTH,

    load_best_model_at_end=True,
    ## metric_for_best_model="eval_bleu",
    ## greater_is_better=True,

    predict_with_generate=False,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Stop training early when the tracked metric has not improved by at least the
# threshold for two consecutive evaluations. The keyword is
# `early_stopping_threshold`; a bare `threshold` is not accepted by the callback.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.001,
)

# Assemble the trainer from the model, arguments, datasets, collator and the
# early-stopping callback defined in this notebook.
trainer = Seq2SeqTrainer(
    model=model,
    args=model_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    # NOTE(review): newer transformers releases prefer `processing_class=` over
    # `tokenizer=` -- confirm against the installed version before switching.
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback],
    ## compute_metrics=compute_metrics,
)

In [None]:
# Free memory before training. Collect Python garbage first so that tensors
# held only by dead objects are released, then hand the now-unused cached CUDA
# blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [14]:
# Run training with the trainer configured above; as the last expression of the
# cell, the returned TrainOutput is displayed below the progress table.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Chrf,Ter
1,2.6039,2.335105,8.328811,37.03961,106.509691
2,2.4237,2.251187,9.537977,38.615662,101.845851
3,2.3627,2.217932,9.304036,38.811326,107.077529
4,2.324,2.203336,9.838826,38.991934,101.721684


Epoch,Training Loss,Validation Loss,Bleu,Chrf,Ter
1,2.6039,2.335105,8.328811,37.03961,106.509691
2,2.4237,2.251187,9.537977,38.615662,101.845851
3,2.3627,2.217932,9.304036,38.811326,107.077529
4,2.324,2.203336,9.838826,38.991934,101.721684
5,2.338,2.199088,9.971439,39.017314,100.025742


TrainOutput(global_step=1875, training_loss=2.4381283935546874, metrics={'train_runtime': 14835.8693, 'train_samples_per_second': 8.089, 'train_steps_per_second': 0.126, 'total_flos': 6.58828689408e+16, 'train_loss': 2.4381283935546874, 'epoch': 5.0})

In [15]:
# Save the trained model next to the run's checkpoints. The path is derived
# from `checkpoint_output_dir` so that the output location is configured in one
# place only (it resolves to the same ".../Clementine/Final Model" directory).
final_model_dir = checkpoint_output_dir.rstrip('/') + '/Final Model'
trainer.save_model(final_model_dir)

In [None]:
# Save the tokenizer next to the run's checkpoints. The path is derived from
# `checkpoint_output_dir` so that the output location is configured in one place
# only (it resolves to the same ".../Clementine/Final Tokenizer" directory).
final_tokenizer_dir = checkpoint_output_dir.rstrip('/') + '/Final Tokenizer'
tokenizer.save_pretrained(final_tokenizer_dir)