# Imports 

In [1]:
# Standard library imports
from time import time
import numpy as np
import pandas as pd
import random
# Third-party library imports
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    AutoTokenizer, GenerationConfig,
    M2M100Config, M2M100ForConditionalGeneration,
    M2M100Tokenizer, Seq2SeqTrainingArguments,
    Seq2SeqTrainer, pipeline,
)

# Local application/library specific imports
import torch
import evaluate

# 1. Data loading and spliting

In [2]:
percent_data_select = "train[:1%]" # add percent sign ie. "train[:20%]" to select that percent of dama 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)

# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 1793
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 224
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 225
    })
})


In [3]:
print(dataset['train'][0])
print(dataset['train'][1])
print(dataset['train'][2])
print(dataset['train'][3])

{'English': "when jackson's group reaches china, their plane runs out of fuel.", 'Hindi': 'जब जैक्सन का समूह चीन पहुंचता है, तो उनका विमान ईंधन से बाहर निकलता है।'}
{'English': 'then he swings to the left and blows the ball loose.', 'Hindi': 'फिर पहले दाईं ओर मुँह फेरता है और तब बाईं ओर।'}
{'English': 'it may not always be necessary to treat with four drugs from the beginning.', 'Hindi': 'हमेशा शुरुआत से चार दवाओं के साथ उपचार करना जरुरी नहीं होता है।'}
{'English': 'while three machines were built, only one machine was put into operational service.', 'Hindi': 'हालांकि तीन मशीनों का निर्माण किया गया था, उनमें से सिर्फ एक ही मशीन को संचालक सेवा में लिया गया था।'}


# 2.	GooglE / T5
- T5 propose reframing all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. Our text-to-text framework allows us to use the same model, loss function, and hyperparameters on any NLP task.

In [4]:
## Load model directly

model_ID = "google-t5/t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_ID)
modelGt5 = AutoModelForSeq2SeqLM.from_pretrained(model_ID)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## 2.1 TOKENIZING DATASETS


In [5]:
# Define hyperparameters
max_input_length = 128
max_target_length = 128
source_lang = "English"  # Replace with the actual column name for English in your dataset
target_lang = "Hindi"    # Replace with the actual column name for Hindi in your dataset
prefix = "translate English to Hindi: "  # Task prefix for T5

# Preprocess function
def preprocess_function(examples):
    inputs = [prefix + ex for ex in examples[source_lang]]
    targets = [ex for ex in examples[target_lang]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1793
    })
    validation: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 224
    })
    test: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 225
    })
})

In [7]:
print(tokenized_datasets["train"][1])

{'English': 'then he swings to the left and blows the ball loose.', 'Hindi': 'फिर पहले दाईं ओर मुँह फेरता है और तब बाईं ओर।', 'input_ids': [13959, 1566, 12, 25763, 10, 258, 3, 88, 7180, 7, 12, 8, 646, 11, 6019, 7, 8, 1996, 6044, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3, 2, 3, 2, 3, 2, 3, 2, 3, 2

In [8]:
# Sample data from the dataset
sample = dataset["train"][12]  
input_text = "translate Hindi to English: " + sample["Hindi"]
target_text = sample["English"]

# Tokenize inputs and targets
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(target_text, return_tensors="pt", padding=True, truncation=True)

# Convert token IDs back to tokens
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
target_tokens = tokenizer.convert_ids_to_tokens(targets["input_ids"][0])

# Print the results for inspection
print("Original Hindi Input:", input_text)
print("Input Tokens:", input_tokens)
print("Decoded Input Text:", tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
print("\nOriginal English Target:", target_text)
print("Target Tokens:", target_tokens)
print("Decoded Target Text:", tokenizer.decode(targets["input_ids"][0], skip_special_tokens=False))

Original Hindi Input: translate Hindi to English: हालांकि इससे सात संसदों और सरकारों के लिए अनुमति मिलेगी, पर जब 1980 में समुदाय और क्षेत्र बनाए गए, तो फ्लेमिश राजनीतिज्ञों ने दोनों के विलय का फैसला किया।
Input Tokens: ['▁translate', '▁Hindi', '▁to', '▁English', ':', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', ',', '▁', '<unk>', '▁', '<unk>', '▁1980', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', ',', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '</s>']
Decoded Input Text: translate Hindi to English: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>, <unk> <unk> 1980 <unk> <unk> <unk> <unk> <unk> <unk>, <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk></s>

Original English Target: although this would allow for seven parliaments and governme



## 2.2 Data collator


In machine learning, particularly for transformer models, a data collator plays a crucial role in preparing batches of data for training. It's essentially a function that takes individual samples and combines them into batches in a way that's efficient and optimized for the model's processing.


For sequence-to-sequence tasks like translation, a specialized data collator (often DataCollatorForSeq2Seq) becomes critical. It ensures that:
- Source and target sequences are properly aligned
- Padding is applied consistently
- Attention mechanisms can correctly ignore padded tokens
- Labels are prepared in a format that allows for loss calculation during training



Without a proper data collator, you'd need to manually handle sequence padding, masking, and batch preparation, which would be computationally expensive and error-prone.

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=modelGt5)

## 2.3 BLEU (Bilingual Evaluation Understudy): 


In [10]:
metric = evaluate.load("sacrebleu")

In [11]:

def postprocess_text(preds, labels):
    """
    Post-process the predictions and labels by stripping whitespace.
    """
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]  # SacreBLEU expects a list of references

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # Check if token IDs are within the valid range
    max_token_id = tokenizer.vocab_size - 1
    invalid_preds = np.where(preds > max_token_id)
    if invalid_preds[0].size > 0:
        print(f"Invalid token IDs found in predictions: {preds[invalid_preds]}")
        preds[invalid_preds] = tokenizer.pad_token_id  # Replace invalid IDs with pad token
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## 2.4 Fine Tuning


In [12]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/GT5/Base/Checkpoint_T5/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=modelGt5,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/675 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.16295529901981354, 'eval_bleu': 0.0, 'eval_gen_len': 0.0, 'eval_runtime': 15.9433, 'eval_samples_per_second': 14.113, 'eval_steps_per_second': 1.819, 'epoch': 1.0}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.12182475626468658, 'eval_bleu': 0.0, 'eval_gen_len': 19.9111, 'eval_runtime': 16.0929, 'eval_samples_per_second': 13.981, 'eval_steps_per_second': 1.802, 'epoch': 2.0}
{'loss': 0.8367, 'grad_norm': 0.4020611643791199, 'learning_rate': 5.303703703703704e-06, 'epoch': 2.22}


  0%|          | 0/29 [00:00<?, ?it/s]

{'eval_loss': 0.11731509864330292, 'eval_bleu': 0.0, 'eval_gen_len': 20.0, 'eval_runtime': 17.7484, 'eval_samples_per_second': 12.677, 'eval_steps_per_second': 1.634, 'epoch': 3.0}
{'train_runtime': 148.0842, 'train_samples_per_second': 36.324, 'train_steps_per_second': 4.558, 'train_loss': 0.6545272346779152, 'epoch': 3.0}


TrainOutput(global_step=675, training_loss=0.6545272346779152, metrics={'train_runtime': 148.0842, 'train_samples_per_second': 36.324, 'train_steps_per_second': 4.558, 'total_flos': 182000887529472.0, 'train_loss': 0.6545272346779152, 'epoch': 3.0})

## 2.5 Saving fine tuned model


In [14]:
trainer.save_model("../Model/GT5/Base/GoogleT5/")
tokenizer.save_pretrained("../Model/GT5/Base/GoogleT5/")

('../Model/GT5/Base/GoogleT5/tokenizer_config.json',
 '../Model/GT5/Base/GoogleT5/special_tokens_map.json',
 '../Model/GT5/Base/GoogleT5/spiece.model',
 '../Model/GT5/Base/GoogleT5/added_tokens.json')

In [15]:
from transformers import pipeline
text = 'my name is'
translator = pipeline("translation_en_to_hi", model="../Model/GT5/Base/GoogleT5/")
translator(text)


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Device set to use cuda:0


[{'translation_text': '         '}]

# 3.GooglE / T5 + LoRa 


## 3.1. Data loading and spliting

In [16]:
percent_data_select = "train[:2%]" # add percent sign ie. "train[:20%]" to select that percent of dama 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)
# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 3587
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 448
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 449
    })
})


## 3.2. model loading with lora

In [17]:
## Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_ID = "google-t5/t5-small"
tokenizer_lora = T5Tokenizer.from_pretrained(model_ID)
model_lora = AutoModelForSeq2SeqLM.from_pretrained(model_ID)

In [18]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Low rank (fewer trainable parameters)
    lora_alpha=32,  # Scaling factor
    target_modules=["q", "v"],  # Apply LoRA to attention layers (query and value)
    bias="none",  # Specify which biases to train
    task_type="SEQ_2_SEQ_LM",  # Task type (sequence-to-sequence)
)

# Wrap the base model with LoRA
model_lora = get_peft_model(model_lora, lora_config)


## 3.3. Data Tokenization

In [19]:
max_input_length = 128
max_target_length = 128

source_lang = "English"
target_lang = "Hindi"


def preprocess_function(examples):
    inputs = tokenizer_lora(examples[source_lang], max_length=128, padding="max_length", truncation=True)
    targets = tokenizer_lora(examples[target_lang], max_length=128, padding="max_length", truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

## 3.4. Data Collator

In [20]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer_lora, model=model_lora)

## 3.5. Finetuning

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/GT5/LoRa/Checkpoint_T5/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/1347 [00:00<?, ?it/s]

  0%|          | 0/57 [00:00<?, ?it/s]

Invalid token IDs found in predictions: [32099 32099 32099]
{'eval_loss': 0.9357219934463501, 'eval_bleu': 1.1929, 'eval_gen_len': 5.2405, 'eval_runtime': 41.064, 'eval_samples_per_second': 10.934, 'eval_steps_per_second': 1.388, 'epoch': 1.0}
{'loss': 6.5256, 'grad_norm': 7.706357479095459, 'learning_rate': 1.2694877505567932e-05, 'epoch': 1.11}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.3692099153995514, 'eval_bleu': 0.0, 'eval_gen_len': 0.1782, 'eval_runtime': 34.9102, 'eval_samples_per_second': 12.862, 'eval_steps_per_second': 1.633, 'epoch': 2.0}
{'loss': 0.9059, 'grad_norm': 0.5674149394035339, 'learning_rate': 5.270972531551597e-06, 'epoch': 2.23}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.35184869170188904, 'eval_bleu': 0.0, 'eval_gen_len': 0.1782, 'eval_runtime': 33.3172, 'eval_samples_per_second': 13.477, 'eval_steps_per_second': 1.711, 'epoch': 3.0}
{'train_runtime': 286.5367, 'train_samples_per_second': 37.555, 'train_steps_per_second': 4.701, 'train_loss': 2.8929170581616557, 'epoch': 3.0}


TrainOutput(global_step=1347, training_loss=2.8929170581616557, metrics={'train_runtime': 286.5367, 'train_samples_per_second': 37.555, 'train_steps_per_second': 4.701, 'total_flos': 366540566298624.0, 'train_loss': 2.8929170581616557, 'epoch': 3.0})

## 3.6 Saving Finetuned Model


In [22]:
model_lora.merge_and_unload()
model_lora.save_pretrained("../Model/GT5/LoRa/GoogleT5/")
tokenizer_lora.save_pretrained("../Model/GT5/LoRa/GoogleT5/")


('../Model/GT5/LoRa/GoogleT5/tokenizer_config.json',
 '../Model/GT5/LoRa/GoogleT5/special_tokens_map.json',
 '../Model/GT5/LoRa/GoogleT5/spiece.model',
 '../Model/GT5/LoRa/GoogleT5/added_tokens.json')

## 3.7 Testing Finetuned Model


In [23]:
from transformers import pipeline
text = 'my name is'
translator = pipeline("translation_en_to_hi", model="../Model/GT5/LoRa/GoogleT5/")
translator(text)


Device set to use cuda:0


[{'translation_text': 'Mein Name ist.'}]

## 4. Comparison

In [None]:
import torch
import matplotlib.pyplot as plt
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

# Load the datasets
test_dataset = tokenized_datasets["test"]

def evaluate_model(model, test_dataset):
    trainer = Seq2SeqTrainer(
        model=model,
        args=Seq2SeqTrainingArguments(
            output_dir="../Model/temp",
            per_device_eval_batch_size=8,
            predict_with_generate=True,
            evaluation_strategy="epoch",
            logging_dir='./logs',
        ),
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    results = trainer.evaluate()
    
    # Print some predictions for debugging
    predictions = trainer.predict(test_dataset)
    for i in range(min(5, len(predictions.predictions))):
        print(f"Prediction: {tokenizer.decode(predictions.predictions[i], skip_special_tokens=True)}")
        print(f"Reference: {tokenizer.decode(predictions.label_ids[i], skip_special_tokens=True)}")
    
    return results["eval_bleu"], results["eval_loss"]

# Load the untrained model
untrained_model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Load the fine-tuned model
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("../Model/GT5/Base/GoogleT5/")

# Load the fine-tuned LoRA model
lora_model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small"),
    "../Model/GT5/LoRa/GoogleT5/"
)

# Evaluate all models
untrained_bleu, untrained_loss = evaluate_model(untrained_model, test_dataset)
fine_tuned_bleu, fine_tuned_loss = evaluate_model(fine_tuned_model, test_dataset)
lora_bleu, lora_loss = evaluate_model(lora_model, test_dataset)

# Plot BLEU scores
plt.figure(figsize=(10, 5))
plt.bar(["Untrained", "Fine-Tuned", "LoRA"], [untrained_bleu, fine_tuned_bleu, lora_bleu], color=["blue", "green", "orange"])
plt.ylabel("BLEU Score")
plt.title("BLEU Score Comparison")
plt.show()

# Plot Loss
plt.figure(figsize=(10, 5))
plt.bar(["Untrained", "Fine-Tuned", "LoRA"], [untrained_loss, fine_tuned_loss, lora_loss], color=["blue", "green", "orange"])
plt.ylabel("Loss")
plt.title("Loss Comparison")
plt.show()