# Imports 

In [1]:
# Standard library imports
from time import time
import numpy as np
import evaluate
import matplotlib as pt

# Third-party library imports
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    TrainerCallback,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    GenerationConfig,
    M2M100Config,
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)

# Local application/library specific imports
import torch

# 1. Data loading and spliting

In [2]:
percent_data_select = "train[:20]" # add percent sign ie. "train[:20%]" to select that percent of data 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)

# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 16
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 2
    })
})


# 2. Facebook / M2M100 418M + Lora
- LoRA (Low-Rank Adaptation) refers to a parameter-efficient fine-tuning technique.

In the context of Large Language Models (LLMs), LoRA (Low-Rank Adaptation) refers to a parameter-efficient fine-tuning technique. 

**Key Concepts:**

* **Fine-tuning:** LLMs are often pre-trained on massive datasets. Fine-tuning involves adapting these pre-trained models to specific tasks or domains using smaller, more relevant datasets.
* **Parameter-Efficiency:** Fine-tuning LLMs can be computationally expensive, especially for very large models. LoRA addresses this by significantly reducing the number of parameters that need to be updated during fine-tuning.

**How LoRA Works:**

Instead of fine-tuning all the parameters of the base LLM, LoRA introduces two small, trainable matrices (A and B) for each attention layer:

1. **Decomposition:** The update to the original weight matrix (W) is approximated as the product of these two smaller matrices: W' = W + A * B.
2. **Reduced Parameters:** Since A and B have significantly fewer parameters than the original weight matrix, the overall number of trainable parameters is drastically reduced.
3. **Fine-tuning:** Only the parameters of A and B are trained during fine-tuning, while the original weights of the base LLM remain frozen.

**Benefits of LoRA:**

* **Reduced Training Time and Cost:** By training only a small subset of parameters, LoRA significantly reduces training time and computational resources.
* **Improved Efficiency:** The smaller number of parameters leads to faster inference times.
* **Preserving Base Model:** Since the base model's weights are frozen, it retains its general knowledge and capabilities while being adapted to the specific task.
* **Easier Deployment:** Smaller models are easier to deploy and run on devices with limited resources.

**Applications:**

LoRA has been successfully applied to a wide range of LLM fine-tuning tasks, including:

* **Domain Adaptation:** Adapting LLMs to specific domains like finance, medicine, or law.
* **Task-Specific Fine-tuning:** Fine-tuning LLMs for specific tasks such as question answering, text summarization, and code generation.
* **Personalization:** Creating personalized LLMs for individual users or groups.

**In summary:**

LoRA is a powerful technique that enables efficient and effective fine-tuning of LLMs. By significantly reducing the number of trainable parameters, LoRA makes it possible to customize large models for specific applications while minimizing training costs and preserving the valuable knowledge of the base model.


![alt text](image.png)

In [13]:
model_ID = "facebook/m2m100_418M"  # Replace with your desired model
model_lora = AutoModelForSeq2SeqLM.from_pretrained(model_ID)
tokenizer_lora = AutoTokenizer.from_pretrained(model_ID, src_lang="en", tgt_lang="hi")
# Create a GenerationConfig with the desired parameters

## 2.2 text preprocessing

**Importance of Text Preprocessing:**

1. **Converting Text to Numbers:** Machine learning models can't directly understand raw text. Preprocessing transforms text into numerical representations (tokens) that the model can process.

2. **Normalization and Consistency:** Text data can have inconsistencies like capitalization, punctuation, and variations in word forms (e.g., singular vs. plural). Preprocessing steps like lowercasing or stemming/lemmatization can address these issues, promoting consistency in the data.

3. **Feature Engineering:** Preprocessing can create new features for the model. In your example, prepending "translate Hindi to English: " to the source sentences might help the model understand the context of translation.

4. **Handling Text Length:** Different models have limitations on input and output lengths. Preprocessing techniques like truncation and padding ensure your data adheres to these limitations.

In [14]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=4,  # Low rank (fewer trainable parameters)
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj","k_proj"],  # Apply LoRA to attention layers (query and value)
    bias="none",  # Specify which biases to train
    task_type="SEQ_2_SEQ_LM",  # Task type (sequence-to-sequence)
)

# Wrap the base model with LoRA
model_lora = get_peft_model(model_lora, lora_config)

In [15]:

def preprocess_function(examples, src_lang, tgt_lang):
    inputs = [f"translate {src_lang} to {tgt_lang}: " + ex for ex in examples[src_lang]]
    targets = examples[tgt_lang]
    model_inputs = tokenizer_lora(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer_lora.as_target_tokenizer():
        labels = tokenizer_lora(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [16]:
tokenized_datasets_hindi_to_english = dataset.map(lambda x: preprocess_function(x, "Hindi", "English"), batched=True)
tokenized_datasets_english_to_hindi = dataset.map(lambda x: preprocess_function(x, "English", "Hindi"), batched=True)

In [17]:
for idx in range(3):  # Print first 3 examples as a sample
    print(f"Original Hindi: {dataset['train'][idx]['Hindi']}")
    print(f"Original English: {dataset['train'][idx]['English']}")

    # Tokenized inputs
    tokenized_input = tokenized_datasets_hindi_to_english["train"][idx]["input_ids"]
    print(f"Tokenized Input IDs: {tokenized_input}")
    print(f"Decoded Input: {tokenizer_lora.decode(tokenized_input, skip_special_tokens=False)}")

    # Tokenized outputs
    tokenized_label = tokenized_datasets_hindi_to_english["train"][idx]["labels"]
    print(f"Tokenized Label IDs: {tokenized_label}")
    print(f"Decoded Label: {tokenizer_lora.decode(tokenized_label, skip_special_tokens=False)}")

    print("=" * 10)

Original Hindi: मंगल ग्रह के मिशन की सूची "Launch Event Details – When did the Rovers Launch?
Original English: "launch event details – when did the rovers launch?".
Tokenized Input IDs: [128022, 5815, 80447, 11631, 128, 18006, 9, 62188, 33146, 2642, 456, 9729, 17158, 783, 79477, 33, 7939, 35233, 67444, 115309, 36, 119782, 4322, 1197, 180, 73558, 343, 35233, 24, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input: __en__ translate Hindi to English: मंगल ग्रह के मिशन की सूची "Launch Event Details – When did the Rovers Launch?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

## 2.3 BLEU (Bilingual Evaluation Understudy): 


In [18]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds, tokenizer_lora, metric):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer_lora.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer_lora.pad_token_id)
    decoded_labels = tokenizer_lora.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer_lora.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## 2.4 Data collator


In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer_lora, model=model_lora)

## 2.5 FineTuning
 

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/fb/Base/Checkpoint/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_datasets_hindi_to_english["train"],
    eval_dataset=tokenized_datasets_hindi_to_english["validation"],
    tokenizer=tokenizer_lora,
    data_collator=data_collator,
    compute_metrics=lambda x: compute_metrics(x, tokenizer_lora, metric),
)

trainer.train()

# Fine-tune for English to Hindi
trainer = Seq2SeqTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_datasets_english_to_hindi["train"],
    eval_dataset=tokenized_datasets_english_to_hindi["validation"],
    tokenizer=tokenizer_lora,
    data_collator=data_collator,
    compute_metrics=lambda x: compute_metrics(x, tokenizer_lora, metric),
)

trainer.train()

  trainer = Seq2SeqTrainer(


## 3.7 Saving Finetuned Model


In [11]:
model_lora.merge_and_unload()
model_lora.save_pretrained("../Model/fb/LoRa/M2M100/")
tokenizer_lora.save_pretrained("../Model/fb/LoRa/M2M100/")


('../Model/fb/LoRa/M2M100/tokenizer_config.json',
 '../Model/fb/LoRa/M2M100/special_tokens_map.json',
 '..\\Model\\fb\\LoRa\\M2M100\\vocab.json',
 '..\\Model\\fb\\LoRa\\M2M100\\sentencepiece.bpe.model',
 '../Model/fb/LoRa/M2M100/added_tokens.json')

In [12]:
# Test Hindi to English
translator_hindi_to_english = pipeline("translation_hi_to_en", model="../Model/fb/LoRa/M2M100/")
result_hindi_to_english = translator_hindi_to_english("यह एक परीक्षण है")
print(result_hindi_to_english)

# Test English to Hindi
translator_english_to_hindi = pipeline("translation_en_to_hi", model="../Model/fb/LoRa/M2M100/")
result_english_to_hindi = translator_english_to_hindi("This is a test")
print(result_english_to_hindi)

Device set to use cuda:0


[{'translation_text': 'This is a test.'}]


Device set to use cuda:0


[{'translation_text': 'यह एक परीक्षण है'}]
