# Imports 

In [1]:
# Standard library imports
from time import time
import numpy as np

# Third-party library imports
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    GenerationConfig,
    M2M100Config,
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)

# Local application/library specific imports
import torch
import evaluate

# 1. Data loading and spliting

In [2]:
percent_data_select = "train[:10%]" # add percent sign ie. "train[:20%]" to select that percent of dama 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)

# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 17935
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2242
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2242
    })
})


In [3]:
print(dataset['train'][0])
print(dataset['train'][1])
print(dataset['train'][2])
print(dataset['train'][3])

{'English': 'prior to its inclusion as a medal sport, basketball was held as a demonstration event in 1904.', 'Hindi': 'एक पदक खेल के रूप में शामिल होने से पूर्व, 1904 में बास्केटबाल एक प्रदर्शन इवेंट के रूप में आयोजित किया गया था।'}
{'English': 'after a final inspection at 21:30 the match was abandoned without a ball bowled.', 'Hindi': '21:30 पर अंतिम निरीक्षण के बाद मैच को गेंद के बिना बोल्ड किया गया था।'}
{'English': 'both these principles are enshrined within the constitutions of most modern democracies.', 'Hindi': 'इन दोनों सिद्धांतों को अधिकांश आधुनिक लोकतंत्र के संविधान में शामिल किया गया है।'}
{'English': "david challenges elijah's theory with an incident from his childhood when he almost drowned.", 'Hindi': 'डेविड का एलिजा में विश्वास हिल जाता है जब उसे अपने बचपन की एक घटना याद आती है जिसमें वह लगभग डूब चुका था।'}


# 2.	GooglE / T5
- T5 propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.

In [4]:
## Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_ID = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ID)

## 2.1 TOKENIZING DATASETS


In [5]:
max_input_length = 128
max_target_length = 128

source_lang = "English"
target_lang = "Hindi"


def preprocess_function(examples):
    inputs = tokenizer(examples[source_lang], max_length=128, padding="max_length", truncation=True)
    targets = tokenizer(examples[target_lang], max_length=128, padding="max_length", truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2242 [00:00<?, ? examples/s]

In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17935
    })
    validation: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2242
    })
    test: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2242
    })
})

In [7]:
# Sample data from the dataset
sample = dataset["train"][12]  # Replace with an actual sample index
input_text = "translate Hindi to English: " + sample["Hindi"]
target_text = sample["English"]

# Tokenize inputs and targets
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(target_text, return_tensors="pt", padding=True, truncation=True)

# Convert token IDs back to tokens
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
target_tokens = tokenizer.convert_ids_to_tokens(targets["input_ids"][0])

# Print the results for inspection
print("Original Hindi Input:", input_text)
print("Input Tokens:", input_tokens)
print("Decoded Input Text:", tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
print("\nOriginal English Target:", target_text)
print("Target Tokens:", target_tokens)
print("Decoded Target Text:", tokenizer.decode(targets["input_ids"][0], skip_special_tokens=False))

Original Hindi Input: translate Hindi to English: सदाशिव मालगुजर, कन्होपात्रा की अपेक्षा की पिता, कन्होपत्रा की सुंदरता के बारे में सुना है और उसके नृत्य को देखने के लिए कामना की, लेकिन कन्होपत्रा से इनकार कर दिया।
Input Tokens: ['▁translate', '▁Hindi', '▁to', '▁English', ':', '▁', '<unk>', '▁', '<unk>', ',', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', ',', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', ',', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '</s>']
Decoded Input Text: translate Hindi to English: <unk> <unk>, <unk> <unk> <unk> <unk> <unk>, <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>, <unk> <unk> <unk> <unk> <unk> <unk></s>

Original English Target: sadashiva malaguja



## 2.2 Data collator


In machine learning, particularly for transformer models, a data collator plays a crucial role in preparing batches of data for training. It's essentially a function that takes individual samples and combines them into batches in a way that's efficient and optimized for the model's processing.


For sequence-to-sequence tasks like translation, a specialized data collator (often DataCollatorForSeq2Seq) becomes critical. It ensures that:
- Source and target sequences are properly aligned
- Padding is applied consistently
- Attention mechanisms can correctly ignore padded tokens
- Labels are prepared in a format that allows for loss calculation during training



Without a proper data collator, you'd need to manually handle sequence padding, masking, and batch preparation, which would be computationally expensive and error-prone.

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_ID)

## 2.3 BLEU (Bilingual Evaluation Understudy): 


In [9]:
metric = evaluate.load("sacrebleu")

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## 2.4 Fine Tuning


In [11]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/Base/Checkpoint/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/6726 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'loss': 1.2493, 'grad_norm': 0.2913993000984192, 'learning_rate': 1.8534046981861434e-05, 'epoch': 0.22}
{'loss': 0.1381, 'grad_norm': 0.4004156291484833, 'learning_rate': 1.704727921498662e-05, 'epoch': 0.45}
{'loss': 0.1203, 'grad_norm': 0.3394205570220947, 'learning_rate': 1.5560511448111804e-05, 'epoch': 0.67}
{'loss': 0.1179, 'grad_norm': 0.4598807096481323, 'learning_rate': 1.4073743681236991e-05, 'epoch': 0.89}


  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.0995221957564354, 'eval_bleu': 0.0002, 'eval_gen_len': 19.9514, 'eval_runtime': 122.9425, 'eval_samples_per_second': 18.236, 'eval_steps_per_second': 2.286, 'epoch': 1.0}
{'loss': 0.1116, 'grad_norm': 0.25294265151023865, 'learning_rate': 1.2586975914362178e-05, 'epoch': 1.12}
{'loss': 0.1064, 'grad_norm': 0.4236171543598175, 'learning_rate': 1.1100208147487363e-05, 'epoch': 1.34}
{'loss': 0.1048, 'grad_norm': 0.2937662899494171, 'learning_rate': 9.613440380612548e-06, 'epoch': 1.56}
{'loss': 0.1027, 'grad_norm': 0.29766419529914856, 'learning_rate': 8.126672613737735e-06, 'epoch': 1.78}


  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.09191317856311798, 'eval_bleu': 0.0215, 'eval_gen_len': 19.9264, 'eval_runtime': 126.845, 'eval_samples_per_second': 17.675, 'eval_steps_per_second': 2.215, 'epoch': 2.0}
{'loss': 0.1002, 'grad_norm': 0.38355517387390137, 'learning_rate': 6.6399048468629205e-06, 'epoch': 2.01}
{'loss': 0.1003, 'grad_norm': 0.25803908705711365, 'learning_rate': 5.1531370799881065e-06, 'epoch': 2.23}
{'loss': 0.098, 'grad_norm': 0.24807749688625336, 'learning_rate': 3.666369313113292e-06, 'epoch': 2.45}
{'loss': 0.0993, 'grad_norm': 0.3833020329475403, 'learning_rate': 2.179601546238478e-06, 'epoch': 2.68}
{'loss': 0.0972, 'grad_norm': 0.24348798394203186, 'learning_rate': 6.928337793636634e-07, 'epoch': 2.9}


  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.09001521021127701, 'eval_bleu': 0.043, 'eval_gen_len': 19.9246, 'eval_runtime': 152.401, 'eval_samples_per_second': 14.711, 'eval_steps_per_second': 1.844, 'epoch': 3.0}
{'train_runtime': 1325.0554, 'train_samples_per_second': 40.606, 'train_steps_per_second': 5.076, 'train_loss': 0.19262574621513065, 'epoch': 3.0}


TrainOutput(global_step=6726, training_loss=0.19262574621513065, metrics={'train_runtime': 1325.0554, 'train_samples_per_second': 40.606, 'train_steps_per_second': 5.076, 'total_flos': 1820516407050240.0, 'train_loss': 0.19262574621513065, 'epoch': 3.0})

In [13]:
results = trainer.evaluate()
print(results)

  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.09001521021127701, 'eval_bleu': 0.043, 'eval_gen_len': 19.9246, 'eval_runtime': 128.6596, 'eval_samples_per_second': 17.426, 'eval_steps_per_second': 2.184, 'epoch': 3.0}


# 3.GooglE / T5 + LoRa 


## 3.1. Data loading and spliting

In [18]:
percent_data_select = "train[:10%]" # add percent sign ie. "train[:20%]" to select that percent of dama 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)
# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 17935
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2242
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2242
    })
})


## 3.2. model loading with lora

In [19]:
## Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_ID = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ID)

In [20]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Low rank (fewer trainable parameters)
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Apply LoRA to attention layers (query and value)
    bias="none",  # Specify which biases to train
    task_type="SEQ_2_SEQ_LM",  # Task type (sequence-to-sequence)
)

# Wrap the base model with LoRA
model = get_peft_model(model, lora_config)


## 3.3. Data processing

## 3.3. Data Tokenization

In [22]:
max_input_length = 128
max_target_length = 128

source_lang = "English"
target_lang = "Hindi"


def preprocess_function(examples):
    inputs = tokenizer(examples[source_lang], max_length=128, padding="max_length", truncation=True)
    targets = tokenizer(examples[target_lang], max_length=128, padding="max_length", truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2242 [00:00<?, ? examples/s]

## 3.4. Data Collator

In [23]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## 3.5. Finetuning

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/Base/Checkpoint/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/6726 [00:00<?, ?it/s]

{'loss': 5.7862, 'grad_norm': 1.4321386814117432, 'learning_rate': 1.8539994052928933e-05, 'epoch': 0.22}
{'loss': 0.5351, 'grad_norm': 0.5148785710334778, 'learning_rate': 1.7053226286054118e-05, 'epoch': 0.45}
{'loss': 0.2941, 'grad_norm': 0.2992958128452301, 'learning_rate': 1.5566458519179307e-05, 'epoch': 0.67}
{'loss': 0.2404, 'grad_norm': 0.3625405728816986, 'learning_rate': 1.4079690752304492e-05, 'epoch': 0.89}


  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.17279577255249023, 'eval_bleu': 0.6432, 'eval_gen_len': 5.5731, 'eval_runtime': 136.9763, 'eval_samples_per_second': 16.368, 'eval_steps_per_second': 2.051, 'epoch': 1.0}
{'loss': 0.2099, 'grad_norm': 0.2580944299697876, 'learning_rate': 1.2592922985429677e-05, 'epoch': 1.12}
{'loss': 0.1879, 'grad_norm': 0.3906034231185913, 'learning_rate': 1.1106155218554862e-05, 'epoch': 1.34}
{'loss': 0.1775, 'grad_norm': 0.28916212916374207, 'learning_rate': 9.619387451680047e-06, 'epoch': 1.56}
{'loss': 0.1677, 'grad_norm': 0.26275718212127686, 'learning_rate': 8.132619684805234e-06, 'epoch': 1.78}


  0%|          | 0/281 [00:00<?, ?it/s]

{'eval_loss': 0.13570769131183624, 'eval_bleu': 0.0126, 'eval_gen_len': 18.2007, 'eval_runtime': 132.4324, 'eval_samples_per_second': 16.929, 'eval_steps_per_second': 2.122, 'epoch': 2.0}
{'loss': 0.1593, 'grad_norm': 0.37174805998802185, 'learning_rate': 6.645851917930419e-06, 'epoch': 2.01}
{'loss': 0.1565, 'grad_norm': 0.24937406182289124, 'learning_rate': 5.159084151055605e-06, 'epoch': 2.23}
{'loss': 0.1521, 'grad_norm': 0.2279694527387619, 'learning_rate': 3.6723163841807913e-06, 'epoch': 2.45}
{'loss': 0.1523, 'grad_norm': 0.3080856502056122, 'learning_rate': 2.185548617305977e-06, 'epoch': 2.68}
{'loss': 0.1483, 'grad_norm': 0.3220287561416626, 'learning_rate': 6.987808504311627e-07, 'epoch': 2.9}


KeyboardInterrupt: 

In [None]:
results = trainer.evaluate()
print(results)

## 4. Comparison