### Imports 

In [1]:
# Standard library imports
import pandas as pd
import numpy as np
import torch

# Third-party library imports
from datasets import (
    Dataset,
    DatasetDict,
    concatenate_datasets, 
    load_dataset,
)
from evaluate import load as load_metric  # Renamed for clarity when loading metrics
from matplotlib import pyplot as plt  # Fixed incorrect alias

# Transformers and related libraries
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    BitsAndBytesConfig,
    pipeline,

)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)

### **Dataset Sampling**

In [6]:
from datasets import load_dataset, concatenate_datasets
percent_data_select = "train[:20]"
def preprocess_datasets(dataset1, dataset2, lang1_token, lang2_token, col_mapping1, col_mapping2):
    """
    Preprocess two datasets to rename columns, add language tokens, and combine them.

    Args:
        dataset1: First dataset.
        dataset2: Second dataset.
        lang1_token: Language token for dataset1 (e.g., "hi" for Hindi).
        lang2_token: Language token for dataset2 (e.g., "el" for Greek).
        col_mapping1: Dictionary mapping for dataset1 column renaming (e.g., {"English": "source", "Hindi": "target"}).
        col_mapping2: Dictionary mapping for dataset2 column renaming (e.g., {"English": "source", "Greek": "target"}).

    Returns:
        Combined dataset with consistent formatting.
    """

    # Rename columns for the first dataset
    dataset1 = dataset1.rename_columns(col_mapping1)

    # Add language token to the source column of dataset1
    dataset1 = dataset1.map(lambda x: {"source": f"<{lang1_token}> " + x["source"]})

    # Rename columns for the second dataset
    dataset2 = dataset2.rename_columns(col_mapping2)

    # Add language token to the source column of dataset2
    dataset2 = dataset2.map(lambda x: {"source": f"<{lang2_token}> " + x["source"]})

    # Combine both datasets
    combined_dataset = concatenate_datasets([dataset1, dataset2])

    return combined_dataset

# Example Usage

# Load datasets
dataset_english_to_hindi = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-hi.csv"},
    split=percent_data_select
)
dataset_english_to_greek = load_dataset(
    "csv", data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-el.csv"},
    split=percent_data_select
)

# Preprocess and combine datasets
combined_dataset = preprocess_datasets(
    dataset_english_to_hindi,
    dataset_english_to_greek,
    lang1_token="hi",
    lang2_token="el",
    col_mapping1={"English": "source", "Hindi": "target"},
    col_mapping2={"English": "source", "Greek": "target"}
)

# Verify the result
print(combined_dataset[0])  # Should show a sample from the combined dataset with <hi> token
print(combined_dataset[-1]) # Should show a sample from the combined dataset with <el> token


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

{'source': '<hi> recite in the name of your lord who created—created man from a clinging substance.', 'target': 'अपने परवरदिगार का नाम ले कर पढ़ो, जिसने (दुनिया को) पैदा \u200eकिया।'}
{'source': '<el> next week i will build his new house.)', 'target': 'Την επόμενη εβδομάδα θα χτίσω το νέο του σπίτι.)'}


In [7]:
from datasets import DatasetDict

# Split the combined dataset into train, validation, and test sets
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=42)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Create a DatasetDict
final_dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
})

# Verify the splits
print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 4
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 4
    })
})


In [8]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for better accuracy
    bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.float16  # Use FP16 for computation
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/m2m100_418M",
    quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="hi")



`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [9]:
def tokenize_function(examples):
    # Tokenize the source text
    model_inputs = tokenizer(
        examples["source"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    
    # Tokenize the target text
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    
    # Add the labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to the dataset
tokenized_dataset = final_dataset.map(tokenize_function, batched=True)

# Verify the tokenized dataset
print(tokenized_dataset["train"][0])

Map:   0%|          | 0/32 [00:00<?, ? examples/s]



Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

{'source': '<el> they ask him to come in for questioning.', 'target': 'Ήρθαν λοιπόν για να το ζητήσουν.', 'input_ids': [128022, 5966, 91, 3473, 72983, 14173, 10693, 128, 3676, 28, 193, 22573, 150, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [128036, 60024, 25964, 7499, 60057, 1038, 6

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

In [10]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Target specific layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"  # Task type for sequence-to-sequence models
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [12]:
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 486,264,832 || trainable%: 0.4852


In [13]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [16]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save after each epoch (match evaluation strategy)
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.1,
    gradient_accumulation_steps=4,
    fp16=True,
    logging_steps=10,
    lr_scheduler_type="linear",  # Linear decay after warmup
    metric_for_best_model="eval_loss",
    predict_with_generate=True,
    report_to=None,  # Or "wandb" if integrated
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,11.492839
2,No log,11.492778
3,No log,11.49303
4,No log,11.493
5,No log,11.49298
6,No log,11.49282
7,No log,11.4926
8,No log,11.492879
9,No log,11.492352
10,11.342800,11.492349


TrainOutput(global_step=10, training_loss=11.342752838134766, metrics={'train_runtime': 17.2464, 'train_samples_per_second': 18.555, 'train_steps_per_second': 0.58, 'total_flos': 87264004669440.0, 'train_loss': 11.342752838134766, 'epoch': 10.0})

In [17]:
model.save_pretrained("../Model/lora/M2M100_multi_task/")
tokenizer.save_pretrained("../Model/lora/M2M100_multi_task/")

('../Model/lora/M2M100_multi_task/tokenizer_config.json',
 '../Model/lora/M2M100_multi_task/special_tokens_map.json',
 '../Model/lora/M2M100_multi_task/vocab.json',
 '../Model/lora/M2M100_multi_task/sentencepiece.bpe.model',
 '../Model/lora/M2M100_multi_task/added_tokens.json')

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("../Model/lora/M2M100_multi_task/")
tokenizer = AutoTokenizer.from_pretrained("../Model/lora/M2M100_multi_task/")

In [20]:
# Set the target language to Hindi
tokenizer.tgt_lang = "hi"

# Create a translation pipeline for English-to-Hindi
translator_en_hi = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="en", tgt_lang="hi")

# Test the model
text = "this is a test"
translated_text = translator_en_hi(text)

# Print the result
print(f"Input: {text}")
print(f"Translated: {translated_text[0]['translation_text']}")

Device set to use cuda:0


Input: this is a test
Translated: यह एक परीक्षण है


In [22]:
# Set the target language to Greek
tokenizer.tgt_lang = "el"

# Create a translation pipeline for English-to-Greek
translator_en_el = pipeline("translation", model=model, tokenizer=tokenizer, src_lang="en", tgt_lang="el")

# Test the model
text = "this is a test"
translated_text = translator_en_el(text)

# Print the result
print(f"Input: {text}")
print(f"Translated: {translated_text[0]['translation_text']}")

Device set to use cuda:0


Input: this is a test
Translated: Αυτή είναι μια δοκιμή


In [23]:
# Test for English-to-Hindi with prefix
text = "translate English to Hindi: break a leg"
translated_text = translator_en_hi(text)
print(f"Input: {text}")
print(f"Translated: {translated_text[0]['translation_text']}")

# Test for English-to-Greek with prefix
text = "translate English to Greek: break a leg"
translated_text = translator_en_el(text)
print(f"Input: {text}")
print(f"Translated: {translated_text[0]['translation_text']}")

Input: translate English to Hindi: break a leg
Translated: अंग्रेजी में अंग्रेजी में अनुवाद: एक पैर तोड़ना
Input: translate English to Greek: break a leg
Translated: Μετάφραση Αγγλικών στα Ελληνικά: σπάστε ένα πόδι
