In [None]:
# Step 1: Install and Import Required Libraries
!pip install transformers datasets sacrebleu --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import sacrebleu
import numpy as np

In [None]:
# Step 2: Load Pre-trained NLP Model and Tokenizer
# The same checkpoint identifier is passed to both from_pretrained calls, so
# the tokenizer and the seq2seq model are loaded from the same source.
model_name = "d2niraj555/mt5-eng2nep"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [None]:
# Step 3: Prepare Small Parallel Dataset
# Each entry is one English sentence ('en') with its Nepali translation ('ne').
# The targets must be plain Nepali text: any wrapper markup (for example a
# "[translate:...]" tag) would be tokenized as part of the label and the model
# would be fine-tuned to reproduce it.
data = {
    'translation': [
        {'en': "Hello, how are you?", 'ne': "नमस्ते, तपाईंलाई कस्तो छ?"},
        {'en': "This is a test.", 'ne': "यो एउटा परीक्षण हो।"},
        {'en': "I love learning languages.", 'ne': "म भाषा सिक्न मन पराउँछु।"},
        {'en': "Transformers are powerful models.", 'ne': "ट्रान्सफर्मरहरू शक्तिशाली मोडेलहरू हुन्।"},
        {'en': "Fine-tuning improves results.", 'ne': "फाइन-ट्यूनिङले नतिजा सुधार गर्छ।"},
        {'en': "The weather is nice today.", 'ne': "आज मौसम राम्रो छ।"},
        {'en': "Can you help me?", 'ne': "के तपाईं मलाई सहयोग गर्न सक्नुहुन्छ?"},
        {'en': "I am reading a book.", 'ne': "म किताब पढिरहेको छु।"},
        {'en': "We are going to the market.", 'ne': "हामी बजार जाँदैछौं।"},
        {'en': "She loves to cook food.", 'ne': "उ खाना पकाउन मन पराउँछ।"},
        {'en': "The cat is sleeping on the sofa.", 'ne': "बिरालो सोफामा सुतिरहेको छ।"},
        {'en': "He is a good friend.", 'ne': "उ राम्रो साथी हो।"},
        {'en': "Learning new things is fun.", 'ne': "नयाँ कुरा सिक्नु रमाइलो हो।"},
        {'en': "Please open the door.", 'ne': "कृपया ढोका खोल्नुहोस्।"},
        {'en': "I am feeling happy today.", 'ne': "म आज खुशी महसुस गरिरहेको छु।"},
        {'en': "They are playing football.", 'ne': "उनीहरू फुटबल खेल्दैछन्।"},
        {'en': "The children are studying.", 'ne': "बालबालिका पढ्दैछन्।"},
        {'en': "We need more time.", 'ne': "हामीलाई थप समय चाहिन्छ।"},
        {'en': "This is my favorite song.", 'ne': "यो मेरो मनपर्ने गीत हो।"},
        {'en': "Technology is advancing quickly.", 'ne': "प्रविधि छिटो विकास हुँदैछ।"}
    ]
}

# Convert to Hugging Face Dataset format: one column per language, built
# from the list of sentence pairs above.
pairs = data['translation']
dataset = Dataset.from_dict({
    'en': [pair['en'] for pair in pairs],
    'ne': [pair['ne'] for pair in pairs],
})

In [None]:
# Step 4: Tokenization Function
max_length = 128  # token cap applied to both the source and the target text


def preprocess_function(examples):
    """Tokenize a batch of English/Nepali sentence pairs for seq2seq training.

    Args:
        examples: mapping with an 'en' entry (source sentences) and a 'ne'
            entry (target sentences).

    Returns:
        The tokenizer output for the source sentences, with the token ids of
        the target sentences stored under "labels".
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: a single call encodes the sources as inputs and the targets
    # as "labels", both truncated to `max_length`.
    model_inputs = tokenizer(
        examples['en'],
        text_target=examples['ne'],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

# Apply preprocess_function over the whole dataset in batched mode; the
# result is the dataset handed to the trainer.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [None]:
# Step 5: Define Training Arguments
training_args = Seq2SeqTrainingArguments(
    # --- output, checkpoints and logging ---
    output_dir="./results",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # --- evaluation ---
    eval_strategy="epoch",
    predict_with_generate=True,
)

In [None]:
# Step 6: Data Collator for Seq2Seq
# Built from the tokenizer and the model loaded earlier; it is passed to the
# trainer to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Step 7: Define Compute Metrics Function for BLEU
def compute_metrics(eval_pred):
    """Compute corpus-level BLEU for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated token ids.

    Returns:
        dict with a single "bleu" entry holding the sacreBLEU corpus score.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a valid token id, so it cannot
    # be decoded. It can show up in the predictions as well as in the labels,
    # so both arrays are mapped to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Stray leading/trailing whitespace should not influence the score.
    decoded_preds = [text.strip() for text in decoded_preds]
    decoded_labels = [text.strip() for text in decoded_labels]

    bleu = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    return {"bleu": bleu.score}

In [None]:
# Step 8: Initialize Trainer
# NOTE(review): the same tokenized data is used for training and evaluation,
# so the reported validation loss / BLEU measure fit to the training pairs,
# not generalization. Use a held-out split for a meaningful score.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    # `processing_class` is the replacement for the deprecated `tokenizer`
    # argument of the Trainer constructors.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Step 9: Fine-tune the Model
# Runs the training loop of the `trainer` object; as the last expression of
# the cell, the returned value is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,No log,2.960395,1.566692
2,No log,2.803182,4.831604
3,No log,2.746565,4.831604


TrainOutput(global_step=9, training_loss=3.657555898030599, metrics={'train_runtime': 191.5481, 'train_samples_per_second': 0.078, 'train_steps_per_second': 0.047, 'total_flos': 274000776192.0, 'train_loss': 3.657555898030599, 'epoch': 3.0})

In [None]:
# Step 10: Generate Translations on Test Set
test_sentences = [
    "How are you today?",
    "I am learning Nepali.",
    "This transformer model is amazing."
]

# Make sure the model is in inference mode before generating.
model.eval()

inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True)
# Move input tensors to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# Set the output budget explicitly: without it, generate() falls back to the
# default length of the generation config, which can cut longer translations
# short. `max_length` is the same token cap used when tokenizing the targets.
translated_tokens = model.generate(**inputs, max_new_tokens=max_length)
translations = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Print each source sentence next to its translation.
for sentence, translation in zip(test_sentences, translations):
    print(f"English: {sentence}")
    print(f"Nepali: {translation}")
    print()

English: How are you today?
Nepali: तपाईँ आज कस्तो हुनुहुन्छ ?

English: I am learning Nepali.
Nepali: म नेपाली सिक्दैछु ।

English: This transformer model is amazing.
Nepali: यो टर्मिनल मोड अत्यन्तै चम्किलो छ ।



In [None]:
# Step 11: Evaluate BLEU on Test Set (Mock Example)
# One reference translation per test sentence, in the same order as
# `translations`. References must be plain Nepali text: wrapper markup such
# as "[translate:...]" would be scored as extra tokens and distort BLEU.
references = [
    "तपाईं आज कस्तो हुनुहुन्छ?",
    "म नेपाली सिक्दैछु।",
    "यो ट्रान्सफर्मर मोडेल अद्भुत छ।"
]

# sacrebleu expects a list of reference streams, hence the extra list level.
bleu = sacrebleu.corpus_bleu(translations, [references])
print(f"BLEU score: {bleu.score:.2f}")


BLEU score: 14.19
