In [1]:
# Standard library imports
import pandas as pd
import numpy as np
import torch

# Third-party library imports
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from evaluate import load as load_metric
from matplotlib import pyplot as plt

# Transformers and related libraries
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
)

# LoRA (optional, if you still want to use it)
# from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

In [2]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [3]:
# Load datasets
dataset_english_to_hindi = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-hi.csv"},
    split="train[:80000]"  # Use a subset for faster debugging
)

dataset_english_to_greek = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-el.csv"},
    split="train[:80000]"  # Use a subset for faster debugging
)

In [4]:
def preprocess_datasets(dataset1, dataset2, lang1_token, lang2_token, col_mapping1, col_mapping2):
    """
    Preprocess two datasets to rename columns, add language tokens, and combine them.

    Args:
        dataset1: First dataset.
        dataset2: Second dataset.
        lang1_token: Language token for dataset1 (e.g., "hi" for Hindi).
        lang2_token: Language token for dataset2 (e.g., "el" for Greek).
        col_mapping1: Dictionary mapping for dataset1 column renaming (e.g., {"English": "source", "Hindi": "target"}).
        col_mapping2: Dictionary mapping for dataset2 column renaming (e.g., {"English": "source", "Greek": "target"}).

    Returns:
        Combined dataset with consistent formatting.
    """
    # Rename columns for the first dataset
    dataset1 = dataset1.rename_columns(col_mapping1)

    # Add language token to the source column of dataset1
    dataset1 = dataset1.map(lambda x: {"source": f"<{lang1_token}> " + x["source"], "tgt_lang": lang1_token})

    # Rename columns for the second dataset
    dataset2 = dataset2.rename_columns(col_mapping2)

    # Add language token to the source column of dataset2
    dataset2 = dataset2.map(lambda x: {"source": f"<{lang2_token}> " + x["source"], "tgt_lang": lang2_token})

    # Combine both datasets
    combined_dataset = concatenate_datasets([dataset1, dataset2])

    return combined_dataset

# Preprocess and combine datasets
combined_dataset = preprocess_datasets(
    dataset_english_to_hindi,
    dataset_english_to_greek,
    lang1_token="hi",
    lang2_token="el",
    col_mapping1={"English": "source", "Hindi": "target"},
    col_mapping2={"English": "source", "Greek": "target"}
)

# Shuffle the combined dataset
combined_dataset = combined_dataset.shuffle(seed=42)

# Verify the result
print(combined_dataset[0])  # Should show a sample from the combined dataset with <hi> token
print(combined_dataset[-1]) # Should show a sample from the combined dataset with <el> token

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

{'source': '<hi> as of 2008, 52 params have been deployed.', 'target': '2008 तक , 52 परम को तैनात किया गया है।', 'tgt_lang': 'hi'}
{'source': '<hi> she is tasked with ensuring smooth relations between the crew members.', 'target': 'सचेतकों को अपने दल के सदस्यों से घनिष्ठ संबंध बनाए रखना पड़ता है।', 'tgt_lang': 'hi'}


In [5]:
# Split the combined dataset into train, validation, and test sets
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=42)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Create a DatasetDict
final_dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
})

# Verify the splits
print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 128000
    })
    validation: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['source', 'target', 'tgt_lang'],
        num_rows: 16000
    })
})


In [6]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

model_name = "facebook/m2m100_418M"
model = M2M100ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()
tokenizer = M2M100Tokenizer.from_pretrained(model_name)

In [9]:
def preprocess_function(examples):
    # Set the source language
    tokenizer.src_lang = "en"
    tokenizer.tgt_lang = examples["tgt_lang"][0]  # Dynamically set the target language based on the dataset

    # Tokenize source and target texts
    model_inputs = tokenizer(examples["source"], truncation=True)
    labels = tokenizer(examples["target"], truncation=True)
    
    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [10]:
tokenized_dataset = final_dataset.map(preprocess_function, batched=True, remove_columns=["source", "target", "tgt_lang"])


Map:   0%|          | 0/128000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model,pad_to_multiple_of=8)


In [12]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save model checkpoints per epoch
    num_train_epochs=10,         # Total epochs
    learning_rate=2e-5,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.1,
    gradient_accumulation_steps=16,
    fp16=False,                  # Disable FP16 for MacBook
    logging_dir="./logs",        # Directory for TensorBoard logs
    logging_steps=50,            # Log every 10 steps
    save_total_limit=3,          # Limit to 3 checkpoints
    predict_with_generate=True,
    report_to=None,              # No integration with external tools
)



In [13]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,                          # The model to fine-tune
    args=training_args,                   # Training configuration
    train_dataset=tokenized_dataset["train"],  # Training dataset
    eval_dataset=tokenized_dataset["validation"],  # Validation dataset
    tokenizer=tokenizer,                  # Tokenizer
    data_collator=data_collator,          # Data collator
)


  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()


  0%|          | 0/10000 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 3.6203, 'grad_norm': 51.84031295776367, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 1.4118, 'grad_norm': 22.80968475341797, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'loss': 1.2238, 'grad_norm': 21.809968948364258, 'learning_rate': 6e-06, 'epoch': 0.15}
{'loss': 1.1933, 'grad_norm': 21.529052734375, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'loss': 1.1751, 'grad_norm': 22.904752731323242, 'learning_rate': 1e-05, 'epoch': 0.25}
{'loss': 1.1592, 'grad_norm': 20.255441665649414, 'learning_rate': 1.2e-05, 'epoch': 0.3}
{'loss': 1.1569, 'grad_norm': 20.20372772216797, 'learning_rate': 1.4e-05, 'epoch': 0.35}
{'loss': 1.1433, 'grad_norm': 21.514604568481445, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'loss': 1.1337, 'grad_norm': 20.750478744506836, 'learning_rate': 1.8e-05, 'epoch': 0.45}
{'loss': 1.1258, 'grad_norm': 20.026456832885742, 'learning_rate': 2e-05, 'epoch': 0.5}
{'loss': 1.1279, 'grad_norm': 20.521411895751953,

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.9845430850982666, 'eval_runtime': 221.8368, 'eval_samples_per_second': 72.125, 'eval_steps_per_second': 9.016, 'epoch': 1.0}




{'loss': 1.0341, 'grad_norm': 19.889404296875, 'learning_rate': 1.8842105263157898e-05, 'epoch': 1.05}
{'loss': 1.0101, 'grad_norm': 18.472627639770508, 'learning_rate': 1.873684210526316e-05, 'epoch': 1.1}
{'loss': 1.0149, 'grad_norm': 18.797931671142578, 'learning_rate': 1.8631578947368424e-05, 'epoch': 1.15}
{'loss': 1.0308, 'grad_norm': 20.70826530456543, 'learning_rate': 1.8526315789473684e-05, 'epoch': 1.2}
{'loss': 1.0096, 'grad_norm': 18.492034912109375, 'learning_rate': 1.8421052631578947e-05, 'epoch': 1.25}
{'loss': 0.9994, 'grad_norm': 19.457138061523438, 'learning_rate': 1.831578947368421e-05, 'epoch': 1.3}
{'loss': 1.0016, 'grad_norm': 19.335248947143555, 'learning_rate': 1.8210526315789477e-05, 'epoch': 1.35}
{'loss': 1.0106, 'grad_norm': 16.963558197021484, 'learning_rate': 1.810526315789474e-05, 'epoch': 1.4}
{'loss': 0.9895, 'grad_norm': 19.457237243652344, 'learning_rate': 1.8e-05, 'epoch': 1.45}
{'loss': 1.0039, 'grad_norm': 19.460041046142578, 'learning_rate': 1.789

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.9439496994018555, 'eval_runtime': 221.7308, 'eval_samples_per_second': 72.16, 'eval_steps_per_second': 9.02, 'epoch': 2.0}
{'loss': 0.9296, 'grad_norm': 18.361976623535156, 'learning_rate': 1.673684210526316e-05, 'epoch': 2.05}
{'loss': 0.9379, 'grad_norm': 18.08510971069336, 'learning_rate': 1.6631578947368423e-05, 'epoch': 2.1}
{'loss': 0.922, 'grad_norm': 17.74725914001465, 'learning_rate': 1.6526315789473686e-05, 'epoch': 2.15}
{'loss': 0.9418, 'grad_norm': 18.898895263671875, 'learning_rate': 1.642105263157895e-05, 'epoch': 2.2}
{'loss': 0.925, 'grad_norm': 18.823726654052734, 'learning_rate': 1.6315789473684213e-05, 'epoch': 2.25}
{'loss': 0.9273, 'grad_norm': 18.835203170776367, 'learning_rate': 1.6210526315789473e-05, 'epoch': 2.3}
{'loss': 0.9248, 'grad_norm': 18.476179122924805, 'learning_rate': 1.6105263157894736e-05, 'epoch': 2.35}
{'loss': 0.9269, 'grad_norm': 18.535886764526367, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.4}
{'loss': 0.9265, 'grad_

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.92531418800354, 'eval_runtime': 234.265, 'eval_samples_per_second': 68.299, 'eval_steps_per_second': 8.537, 'epoch': 3.0}
{'loss': 0.8726, 'grad_norm': 18.03898048400879, 'learning_rate': 1.4631578947368424e-05, 'epoch': 3.05}
{'loss': 0.8749, 'grad_norm': 18.282928466796875, 'learning_rate': 1.4526315789473687e-05, 'epoch': 3.1}
{'loss': 0.8645, 'grad_norm': 16.716835021972656, 'learning_rate': 1.4421052631578948e-05, 'epoch': 3.15}
{'loss': 0.8774, 'grad_norm': 18.056316375732422, 'learning_rate': 1.4315789473684212e-05, 'epoch': 3.2}
{'loss': 0.8839, 'grad_norm': 17.766489028930664, 'learning_rate': 1.4210526315789475e-05, 'epoch': 3.25}
{'loss': 0.8814, 'grad_norm': 19.41548728942871, 'learning_rate': 1.4105263157894738e-05, 'epoch': 3.3}
{'loss': 0.8792, 'grad_norm': 17.543310165405273, 'learning_rate': 1.4e-05, 'epoch': 3.35}
{'loss': 0.8854, 'grad_norm': 18.45050811767578, 'learning_rate': 1.3894736842105265e-05, 'epoch': 3.4}
{'loss': 0.8794, 'grad_norm': 17.899

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.9124051332473755, 'eval_runtime': 235.7135, 'eval_samples_per_second': 67.879, 'eval_steps_per_second': 8.485, 'epoch': 4.0}
{'loss': 0.8452, 'grad_norm': 17.841787338256836, 'learning_rate': 1.2526315789473684e-05, 'epoch': 4.05}
{'loss': 0.8252, 'grad_norm': 19.776287078857422, 'learning_rate': 1.2421052631578949e-05, 'epoch': 4.1}
{'loss': 0.8291, 'grad_norm': 16.880905151367188, 'learning_rate': 1.2315789473684212e-05, 'epoch': 4.15}
{'loss': 0.8463, 'grad_norm': 18.04164695739746, 'learning_rate': 1.2210526315789475e-05, 'epoch': 4.2}
{'loss': 0.8435, 'grad_norm': 18.85912322998047, 'learning_rate': 1.2105263157894737e-05, 'epoch': 4.25}
{'loss': 0.8421, 'grad_norm': 19.50041389465332, 'learning_rate': 1.2e-05, 'epoch': 4.3}
{'loss': 0.8359, 'grad_norm': 17.628496170043945, 'learning_rate': 1.1894736842105264e-05, 'epoch': 4.35}
{'loss': 0.8308, 'grad_norm': 16.655805587768555, 'learning_rate': 1.1789473684210527e-05, 'epoch': 4.4}
{'loss': 0.8275, 'grad_norm': 16.

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.9073100686073303, 'eval_runtime': 242.9161, 'eval_samples_per_second': 65.866, 'eval_steps_per_second': 8.233, 'epoch': 5.0}
{'loss': 0.7967, 'grad_norm': 16.750539779663086, 'learning_rate': 1.0421052631578948e-05, 'epoch': 5.05}
{'loss': 0.7922, 'grad_norm': 18.10331916809082, 'learning_rate': 1.0315789473684213e-05, 'epoch': 5.1}
{'loss': 0.799, 'grad_norm': 17.62315559387207, 'learning_rate': 1.0210526315789476e-05, 'epoch': 5.15}
{'loss': 0.7853, 'grad_norm': 17.476232528686523, 'learning_rate': 1.0105263157894738e-05, 'epoch': 5.2}
{'loss': 0.8115, 'grad_norm': 19.356128692626953, 'learning_rate': 1e-05, 'epoch': 5.25}
{'loss': 0.8072, 'grad_norm': 17.135086059570312, 'learning_rate': 9.894736842105264e-06, 'epoch': 5.3}
{'loss': 0.8134, 'grad_norm': 18.62698745727539, 'learning_rate': 9.789473684210527e-06, 'epoch': 5.35}
{'loss': 0.8127, 'grad_norm': 17.185853958129883, 'learning_rate': 9.68421052631579e-06, 'epoch': 5.4}
{'loss': 0.808, 'grad_norm': 16.56945228

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.9005177617073059, 'eval_runtime': 222.2213, 'eval_samples_per_second': 72.0, 'eval_steps_per_second': 9.0, 'epoch': 6.0}
{'loss': 0.7743, 'grad_norm': 16.572776794433594, 'learning_rate': 8.315789473684212e-06, 'epoch': 6.05}
{'loss': 0.7772, 'grad_norm': 18.729717254638672, 'learning_rate': 8.210526315789475e-06, 'epoch': 6.1}
{'loss': 0.7637, 'grad_norm': 18.851781845092773, 'learning_rate': 8.105263157894736e-06, 'epoch': 6.15}
{'loss': 0.7675, 'grad_norm': 17.560155868530273, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.2}
{'loss': 0.7805, 'grad_norm': 17.89071273803711, 'learning_rate': 7.894736842105265e-06, 'epoch': 6.25}
{'loss': 0.7826, 'grad_norm': 17.92136573791504, 'learning_rate': 7.789473684210526e-06, 'epoch': 6.3}
{'loss': 0.787, 'grad_norm': 17.540565490722656, 'learning_rate': 7.68421052631579e-06, 'epoch': 6.35}
{'loss': 0.7772, 'grad_norm': 18.792987823486328, 'learning_rate': 7.578947368421054e-06, 'epoch': 6.4}
{'loss': 0.7787, 'grad_norm': 1

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.899794340133667, 'eval_runtime': 220.5854, 'eval_samples_per_second': 72.534, 'eval_steps_per_second': 9.067, 'epoch': 7.0}
{'loss': 0.7522, 'grad_norm': 16.98540496826172, 'learning_rate': 6.2105263157894745e-06, 'epoch': 7.05}
{'loss': 0.7565, 'grad_norm': 19.03006935119629, 'learning_rate': 6.105263157894738e-06, 'epoch': 7.1}
{'loss': 0.7516, 'grad_norm': 18.728172302246094, 'learning_rate': 6e-06, 'epoch': 7.15}
{'loss': 0.7578, 'grad_norm': 17.814823150634766, 'learning_rate': 5.8947368421052634e-06, 'epoch': 7.2}
{'loss': 0.7469, 'grad_norm': 16.541528701782227, 'learning_rate': 5.789473684210527e-06, 'epoch': 7.25}
{'loss': 0.7487, 'grad_norm': 17.722572326660156, 'learning_rate': 5.68421052631579e-06, 'epoch': 7.3}
{'loss': 0.7636, 'grad_norm': 17.234554290771484, 'learning_rate': 5.578947368421052e-06, 'epoch': 7.35}
{'loss': 0.7616, 'grad_norm': 18.095537185668945, 'learning_rate': 5.4736842105263165e-06, 'epoch': 7.4}
{'loss': 0.7533, 'grad_norm': 17.8129196

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.8988119959831238, 'eval_runtime': 221.7244, 'eval_samples_per_second': 72.162, 'eval_steps_per_second': 9.02, 'epoch': 8.0}
{'loss': 0.7404, 'grad_norm': 17.93016242980957, 'learning_rate': 4.105263157894737e-06, 'epoch': 8.05}
{'loss': 0.7371, 'grad_norm': 18.42850685119629, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.1}
{'loss': 0.7426, 'grad_norm': 19.434194564819336, 'learning_rate': 3.894736842105263e-06, 'epoch': 8.15}
{'loss': 0.7392, 'grad_norm': 19.388946533203125, 'learning_rate': 3.789473684210527e-06, 'epoch': 8.2}
{'loss': 0.7328, 'grad_norm': 16.429262161254883, 'learning_rate': 3.6842105263157896e-06, 'epoch': 8.25}
{'loss': 0.7329, 'grad_norm': 18.328954696655273, 'learning_rate': 3.578947368421053e-06, 'epoch': 8.3}
{'loss': 0.7494, 'grad_norm': 18.4221248626709, 'learning_rate': 3.473684210526316e-06, 'epoch': 8.35}
{'loss': 0.7384, 'grad_norm': 17.616933822631836, 'learning_rate': 3.368421052631579e-06, 'epoch': 8.4}
{'loss': 0.7475, 'grad_norm

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.8991617560386658, 'eval_runtime': 228.0078, 'eval_samples_per_second': 70.173, 'eval_steps_per_second': 8.772, 'epoch': 9.0}
{'loss': 0.7305, 'grad_norm': 16.74913787841797, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.05}
{'loss': 0.7253, 'grad_norm': 17.04001808166504, 'learning_rate': 1.8947368421052634e-06, 'epoch': 9.1}
{'loss': 0.7267, 'grad_norm': 17.871292114257812, 'learning_rate': 1.7894736842105265e-06, 'epoch': 9.15}
{'loss': 0.7393, 'grad_norm': 18.500925064086914, 'learning_rate': 1.6842105263157895e-06, 'epoch': 9.2}
{'loss': 0.7169, 'grad_norm': 19.191768646240234, 'learning_rate': 1.5789473684210526e-06, 'epoch': 9.25}
{'loss': 0.7267, 'grad_norm': 16.805835723876953, 'learning_rate': 1.4736842105263159e-06, 'epoch': 9.3}
{'loss': 0.7413, 'grad_norm': 18.04779815673828, 'learning_rate': 1.3684210526315791e-06, 'epoch': 9.35}
{'loss': 0.7414, 'grad_norm': 18.67582130432129, 'learning_rate': 1.2631578947368422e-06, 'epoch': 9.4}
{'loss': 0.7236, 'g

  0%|          | 0/2000 [00:00<?, ?it/s]

{'eval_loss': 0.8985698223114014, 'eval_runtime': 218.5384, 'eval_samples_per_second': 73.214, 'eval_steps_per_second': 9.152, 'epoch': 10.0}
{'train_runtime': 73109.5664, 'train_samples_per_second': 17.508, 'train_steps_per_second': 0.137, 'train_loss': 0.8734251182556152, 'epoch': 10.0}


TrainOutput(global_step=10000, training_loss=0.8734251182556152, metrics={'train_runtime': 73109.5664, 'train_samples_per_second': 17.508, 'train_steps_per_second': 0.137, 'total_flos': 1.5231833306038272e+17, 'train_loss': 0.8734251182556152, 'epoch': 10.0})

In [15]:
trainer.save_model("../Model/translate_en_hi_el")
tokenizer.save_pretrained("../Model/translate_en_hi_el")


('../Model/translate_en_hi_el/tokenizer_config.json',
 '../Model/translate_en_hi_el/special_tokens_map.json',
 '../Model/translate_en_hi_el/vocab.json',
 '../Model/translate_en_hi_el/sentencepiece.bpe.model',
 '../Model/translate_en_hi_el/added_tokens.json')

In [16]:
text = 'break a leg'
translator = pipeline("translation_en_to_hi", model="../Model/translate_en_hi_el")
translator(text)



Device set to use mps:0


[{'translation_text': 'एक पैर टूटना'}]

In [17]:
text = 'break a leg'
translator = pipeline("translation_en_to_el", model="../Model/translate_en_hi_el")
translator(text)



Device set to use mps:0


[{'translation_text': 'Διακόψτε ένα πόδι'}]