# Imports 

In [30]:
from datasets import load_dataset
from datasets import DatasetDict

# 1. Data loading and spliting

In [31]:
percent_data_select = "train[:100]" # add percent sign ie. "train[:20%]" to select that percent of data 
# Load only 20% of the dataset
dataset = load_dataset("csv", data_files={"train": "../Datasets/processed_data.csv"}, split=percent_data_select)

# Split into train and test sets (e.g., 80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the test set into validation and test (e.g., 50-50 split of the 20%)
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Combine splits into a DatasetDict
raw_dataset = {
    "train": train_test_split["train"],
    "validation": validation_test_split["train"],
    "test": validation_test_split["test"]
}

dataset = DatasetDict(raw_dataset)

# Inspect the resulting dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 10
    })
    test: Dataset({
        features: ['English', 'Hindi'],
        num_rows: 10
    })
})


In [32]:
print(dataset['train'][0])
print(dataset['train'][1])
print(dataset['train'][2])
print(dataset['train'][3])

{'English': 'for more detailed figures from 2011 census, see this table.', 'Hindi': '2011 की जनगणना से अधिक विस्तृत आंकड़ों के लिए, इस तालिका को देखें।'}
{'English': 'death is seen as a boundary to another world.', 'Hindi': 'मौत एक और दुनिया के लिए एक सीमा के रूप में देखा जाता है।'}
{'English': 'george miller always wanted one person to do both .', 'Hindi': 'जॉर्ज मिलर हमेशा एक व्यक्ति को दोनों करना चाहते थे।'}
{'English': 'the targets of the german aircraft were actually the rail lines and bridges.', 'Hindi': 'जर्मन विमानों के लक्ष्य वास्तव में रेल लाइन और पुल थे।'}


# 2.	Facebook / M2M100 418M
- M2M-100 stands for Massively Multilingual Model with 100 languages.
- 418M refers to the model size in terms of the number of parameters. Parameters are the learnable weights within the model's neural network. A larger number of parameters generally allows the model to learn more complex patterns and achieve higher accuracy.

## 2.1 Comparing Tokenizers 
- Our approach will involve a systematic comparison of tokenization strategies, paying close attention to how each method handles linguistic characteristics specific to Hindi and English.

#### 2.1.1 AutoTokenizer

In [33]:
from transformers import GenerationConfig, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_ID = "facebook/m2m100_418M"  # Replace with your desired model
model = AutoModelForSeq2SeqLM.from_pretrained(model_ID)
tokenizer = AutoTokenizer.from_pretrained(model_ID, src_lang="en", tgt_lang="hi")
# Create a GenerationConfig with the desired parameters

In [34]:
# Sample data from the dataset
sample = dataset["train"][12]  # Replace with an actual sample index
input_text = "translate Hindi to English: " + sample["Hindi"]
target_text = sample["English"]

# Tokenize inputs and targets
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(target_text, return_tensors="pt", padding=True, truncation=True)

# Convert token IDs back to tokens
input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
target_tokens = tokenizer.convert_ids_to_tokens(targets["input_ids"][0])

# Print the results for inspection
print("Original Hindi Input:", input_text)
print("Input Tokens:", input_tokens)
print("Decoded Input Text:", tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
print("\nOriginal English Target:", target_text)
print("Target Tokens:", target_tokens)
print("Decoded Target Text:", tokenizer.decode(targets["input_ids"][0], skip_special_tokens=False))

Original Hindi Input: translate Hindi to English: इस काम के बोझ को मेरे साथ कौन साझा करेगा?
Input Tokens: ['__en__', '▁trans', 'late', '▁Hindi', '▁to', '▁English', ':', '▁इस', '▁काम', '▁के', '▁बो', 'झ', '▁को', '▁मेरे', '▁साथ', '▁कौन', '▁सा', 'झा', '▁करे', 'गा', '?', '</s>']
Decoded Input Text: __en__ translate Hindi to English: इस काम के बोझ को मेरे साथ कौन साझा करेगा?</s>

Original English Target: who will share the burden of this work with me?
Target Tokens: ['__hi__', '▁who', '▁will', '▁share', '▁the', '▁bur', 'den', '▁of', '▁this', '▁work', '▁with', '▁me', '?', '</s>']
Decoded Target Text: __hi__ who will share the burden of this work with me?</s>




 **Why are "__hi__" and "__en__" appearing before the English and Hindi sentences, respectively?**

These special tokens, "__hi__" and "__en__", are often used in machine translation tasks to explicitly indicate the **source language** and **target language**. They serve as markers for the model to understand which language it's currently processing.

##### Here's a breakdown of their functions:

* **__hi__ and __en__: Language Identifiers:**
   - **__hi__**: This token typically represents the Hindi language. When placed at the beginning of a sentence, it tells the model that the following text is in Hindi and should be translated into the target language (in this case, English).
   - **__en__**: Similarly, "__en__" indicates that the following text is in English. It might be used in scenarios where the model is asked to translate from English to Hindi or for tasks like language identification.

##### Why are they added?

* **Clarity for the Model:** These markers provide clear and explicit information to the model about the language of the input and output sequences. This helps the model to better understand the context and improve the accuracy of its translations.
* **Handling Multiple Language Pairs:** In multilingual models, these tokens can be used to handle various language pairs. For example, if the model is trained on multiple language pairs (e.g., English-French, English-Spanish), these tokens can help the model distinguish between the different language pairs.
* **Facilitating Language Identification:** In some cases, these tokens can also be used for language identification tasks, where the model is asked to determine the language of a given text.

##### In above given example:

* **"__en__ translate Hindi to English: जॉर्ज मिलर हमेशा एक व्यक्ति को दोनों करना चाहते थे।"**: This part tells the model that the following text is in English and the task is to translate the Hindi text that follows.
* **"__hi__ george miller always wanted one person to do both ."**: This indicates that the following text is the English translation of the preceding Hindi text.

**Note:** The specific tokens used (e.g., "__hi__", "__en__") can vary depending on the pre-training data and the specific configuration of the model. However, the general concept of using special tokens to indicate language remains consistent.

By understanding the role of these tokens, one can better interpret the model's output and fine-tune your training data for more accurate translations.


#### 2.1.2 M2M100 Tokenizer

In [35]:
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

model_ID_fb = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer_fb = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="hi")


In [36]:
# Sample data from the dataset
sample2 = dataset["train"][12]  # Replace with an actual sample index
input_text2 = "translate Hindi to English: " + sample["Hindi"]
target_text2 = sample["English"]

# Tokenize inputs and targets
inputs2 = tokenizer_fb(input_text2, return_tensors="pt", padding=True, truncation=True)
with tokenizer_fb.as_target_tokenizer():
    targets2 = tokenizer_fb(target_text2, return_tensors="pt", padding=True, truncation=True)

# Convert token IDs back to tokens
input_tokens2 = tokenizer_fb.convert_ids_to_tokens(inputs2["input_ids"][0])
target_tokens2 = tokenizer_fb.convert_ids_to_tokens(targets2["input_ids"][0])

# Print the results for inspection
print("Original Hindi Input:", input_text2)
print("Input Tokens:", input_tokens2)
print("Decoded Input Text:", tokenizer_fb.decode(inputs2["input_ids"][0], skip_special_tokens=False))
print("\nOriginal English Target:", target_text2)
print("Target Tokens:", target_tokens2)
print("Decoded Target Text:", tokenizer_fb.decode(targets2["input_ids"][0], skip_special_tokens=False))

Original Hindi Input: translate Hindi to English: इस काम के बोझ को मेरे साथ कौन साझा करेगा?
Input Tokens: ['__en__', '▁trans', 'late', '▁Hindi', '▁to', '▁English', ':', '▁इस', '▁काम', '▁के', '▁बो', 'झ', '▁को', '▁मेरे', '▁साथ', '▁कौन', '▁सा', 'झा', '▁करे', 'गा', '?', '</s>']
Decoded Input Text: __en__ translate Hindi to English: इस काम के बोझ को मेरे साथ कौन साझा करेगा?</s>

Original English Target: who will share the burden of this work with me?
Target Tokens: ['__hi__', '▁who', '▁will', '▁share', '▁the', '▁bur', 'den', '▁of', '▁this', '▁work', '▁with', '▁me', '?', '</s>']
Decoded Target Text: __hi__ who will share the burden of this work with me?</s>


In [37]:
print(tokenizer.special_tokens_map)  # Check special tokens (e.g., <pad>, <unk>, etc.)


{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'additional_special_tokens': ['__af__', '__am__', '__ar__', '__ast__', '__az__', '__ba__', '__be__', '__bg__', '__bn__', '__br__', '__bs__', '__ca__', '__ceb__', '__cs__', '__cy__', '__da__', '__de__', '__el__', '__en__', '__es__', '__et__', '__fa__', '__ff__', '__fi__', '__fr__', '__fy__', '__ga__', '__gd__', '__gl__', '__gu__', '__ha__', '__he__', '__hi__', '__hr__', '__ht__', '__hu__', '__hy__', '__id__', '__ig__', '__ilo__', '__is__', '__it__', '__ja__', '__jv__', '__ka__', '__kk__', '__km__', '__kn__', '__ko__', '__lb__', '__lg__', '__ln__', '__lo__', '__lt__', '__lv__', '__mg__', '__mk__', '__ml__', '__mn__', '__mr__', '__ms__', '__my__', '__ne__', '__nl__', '__no__', '__ns__', '__oc__', '__or__', '__pa__', '__pl__', '__ps__', '__pt__', '__ro__', '__ru__', '__sd__', '__si__', '__sk__', '__sl__', '__so__', '__sq__', '__sr__', '__ss__', '__su__', '__sv__', '__sw__', '__ta__', 

In [38]:
print(tokenizer_fb.special_tokens_map)  # Check special tokens (e.g., <pad>, <unk>, etc.)


{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'additional_special_tokens': ['__af__', '__am__', '__ar__', '__ast__', '__az__', '__ba__', '__be__', '__bg__', '__bn__', '__br__', '__bs__', '__ca__', '__ceb__', '__cs__', '__cy__', '__da__', '__de__', '__el__', '__en__', '__es__', '__et__', '__fa__', '__ff__', '__fi__', '__fr__', '__fy__', '__ga__', '__gd__', '__gl__', '__gu__', '__ha__', '__he__', '__hi__', '__hr__', '__ht__', '__hu__', '__hy__', '__id__', '__ig__', '__ilo__', '__is__', '__it__', '__ja__', '__jv__', '__ka__', '__kk__', '__km__', '__kn__', '__ko__', '__lb__', '__lg__', '__ln__', '__lo__', '__lt__', '__lv__', '__mg__', '__mk__', '__ml__', '__mn__', '__mr__', '__ms__', '__my__', '__ne__', '__nl__', '__no__', '__ns__', '__oc__', '__or__', '__pa__', '__pl__', '__ps__', '__pt__', '__ro__', '__ru__', '__sd__', '__si__', '__sk__', '__sl__', '__so__', '__sq__', '__sr__', '__ss__', '__su__', '__sv__', '__sw__', '__ta__', 

**Both tokenizers i.e. pretrained models and model-specific tokenizers can yield similar outputs in many scenarios, but this isn't a universal rule.**

  -  ***When comparing different models, such as `M2M100` and `Google's T5 small`, we'll encounter variations in how unknown words are processed. The Google T5 small model, for instance, tends to generate more `<unk>` tokens when encountering vocabulary outside its training set, which can pose challenges for translation*** 

### 2.2 text preprocessing



**Importance of Text Preprocessing:**

1. **Converting Text to Numbers:** Machine learning models can't directly understand raw text. Preprocessing transforms text into numerical representations (tokens) that the model can process.

2. **Normalization and Consistency:** Text data can have inconsistencies like capitalization, punctuation, and variations in word forms (e.g., singular vs. plural). Preprocessing steps like lowercasing or stemming/lemmatization can address these issues, promoting consistency in the data.

3. **Feature Engineering:** Preprocessing can create new features for the model. In your example, prepending "translate Hindi to English: " to the source sentences might help the model understand the context of translation.

4. **Handling Text Length:** Different models have limitations on input and output lengths. Preprocessing techniques like truncation and padding ensure your data adheres to these limitations.

In [39]:
def preprocess_function(examples):
    inputs = ["translate Hindi to English: " + ex for ex in examples["Hindi"]]
    targets = examples["English"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [40]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
    test: Dataset({
        features: ['English', 'Hindi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10
    })
})

In [41]:

# Inspect the first example
for idx in range(3):  # Print first 3 examples as a sample
    print(f"Original Hindi: {dataset['train'][idx]['Hindi']}")
    print(f"Original English: {dataset['train'][idx]['English']}")

    # Tokenized inputs
    tokenized_input = tokenized_datasets["train"][idx]["input_ids"]
    print(f"Tokenized Input IDs: {tokenized_input}")
    print(f"Decoded Input: {tokenizer.decode(tokenized_input, skip_special_tokens=False)}")

    # Tokenized outputs
    tokenized_label = tokenized_datasets["train"][idx]["labels"]
    print(f"Tokenized Label IDs: {tokenized_label}")
    print(f"Decoded Label: {tokenizer.decode(tokenized_label, skip_special_tokens=False)}")

    print("=" * 10)

Original Hindi: 2011 की जनगणना से अधिक विस्तृत आंकड़ों के लिए, इस तालिका को देखें।
Original English: for more detailed figures from 2011 census, see this table.
Tokenized Input IDs: [128022, 5815, 80447, 11631, 128, 18006, 9, 2294, 783, 15258, 2568, 3207, 1383, 1044, 21283, 118809, 58278, 1843, 15694, 8967, 1839, 456, 3460, 4, 5163, 48867, 8964, 929, 78705, 209, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded Input: __en__ translate Hindi to English: 2011 की जनगणना से अधिक विस्तृत आंकड़ों के लिए, इस तालिका को देखें।</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

**Specific Breakdown to Code:**

- **`inputs = ["translate Hindi to English: " + ex for ex in examples["Hindi"]]`**: This line creates a new list (`inputs`) by prepending a context string to each sentence in the `Hindi` column of the dataset.

- **`model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")`**: This tokenizes the `inputs` list, converting them into numerical representations using the `tokenizer`. The `max_length` argument limits the length of each sequence, and `truncation=True` ensures sequences exceeding the limit are shortened. `padding="max_length"` pads shorter sequences with special tokens to create a uniform length.

- **`with tokenizer.as_target_tokenizer():`**: This context manager configures the tokenizer for handling the target language (English) by setting the appropriate attributes.

- **`labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")`**: This tokenizes the target sentences (`English`) with similar parameters as the input.

- **`model_inputs["labels"] = labels["input_ids"]`**: This adds the tokenized target sentence IDs (stored as `input_ids` in the `labels` dictionary) as a new key "labels" within the `model_inputs` dictionary.

- **`tokenized_datasets = dataset.map(preprocess_function, batched=True)`**: This line applies the `preprocess_function` to each element of the dataset (`dataset`) in batches using `batched=True` for efficiency. The resulting processed data is stored in `tokenized_datasets`.

In summary, this text preprocessing step transforms your raw text data into a format suitable for training your machine translation model. It ensures consistency, handles sequence lengths, and potentially adds contextual information to aid the translation process.

### 2.3 Data collator


In machine learning, particularly for transformer models, a data collator plays a crucial role in preparing batches of data for training. It's essentially a function that takes individual samples and combines them into batches in a way that's efficient and optimized for the model's processing.


For sequence-to-sequence tasks like translation, a specialized data collator (often DataCollatorForSeq2Seq) becomes critical. It ensures that:
- Source and target sequences are properly aligned
- Padding is applied consistently
- Attention mechanisms can correctly ignore padded tokens
- Labels are prepared in a format that allows for loss calculation during training



Without a proper data collator, you'd need to manually handle sequence padding, masking, and batch preparation, which would be computationally expensive and error-prone.

### 2.4 BLEU (Bilingual Evaluation Understudy): 


In [42]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_ID)

In [43]:
import evaluate

metric = evaluate.load("sacrebleu")

In [44]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [45]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

### 2.5 Fine Tuning


In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/Base/Checkpoint/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


In [18]:
model.save_pretrained("../Model/Base/M2M100/")
tokenizer.save_pretrained("../Model/Base/M2M100/")


('../Model/Base/M2M100/tokenizer_config.json',
 '../Model/Base/M2M100/special_tokens_map.json',
 '..\\Model\\Base\\M2M100\\vocab.json',
 '..\\Model\\Base\\M2M100\\sentencepiece.bpe.model',
 '../Model/Base/M2M100/added_tokens.json')

In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the model and tokenizer
model_FT = AutoModelForSeq2SeqLM.from_pretrained("../Model/Base/M2M100/")
tokenizer = AutoTokenizer.from_pretrained("../Model/Base/M2M100/")


In [20]:
from transformers import pipeline

# Initialize the translation pipeline
translator = pipeline("translation_en_to_hi", model=model_FT, tokenizer=tokenizer)

# Test translation
text = "Hello, how are you?"
translated_text = translator(text)

print(translated_text)

Device set to use cuda:0


[{'translation_text': 'नमस्ते, आप कैसे हैं?'}]


# 3.	Facebook / M2M100 418M + LoRa
- LoRA (Low-Rank Adaptation) refers to a parameter-efficient fine-tuning technique.


![alt text](image.png)

In the context of Large Language Models (LLMs), LoRA (Low-Rank Adaptation) refers to a parameter-efficient fine-tuning technique. 

**Key Concepts:**

* **Fine-tuning:** LLMs are often pre-trained on massive datasets. Fine-tuning involves adapting these pre-trained models to specific tasks or domains using smaller, more relevant datasets.
* **Parameter-Efficiency:** Fine-tuning LLMs can be computationally expensive, especially for very large models. LoRA addresses this by significantly reducing the number of parameters that need to be updated during fine-tuning.

**How LoRA Works:**

Instead of fine-tuning all the parameters of the base LLM, LoRA introduces two small, trainable matrices (A and B) for each attention layer:

1. **Decomposition:** The update to the original weight matrix (W) is approximated as the product of these two smaller matrices: W' = W + A * B.
2. **Reduced Parameters:** Since A and B have significantly fewer parameters than the original weight matrix, the overall number of trainable parameters is drastically reduced.
3. **Fine-tuning:** Only the parameters of A and B are trained during fine-tuning, while the original weights of the base LLM remain frozen.

**Benefits of LoRA:**

* **Reduced Training Time and Cost:** By training only a small subset of parameters, LoRA significantly reduces training time and computational resources.
* **Improved Efficiency:** The smaller number of parameters leads to faster inference times.
* **Preserving Base Model:** Since the base model's weights are frozen, it retains its general knowledge and capabilities while being adapted to the specific task.
* **Easier Deployment:** Smaller models are easier to deploy and run on devices with limited resources.

**Applications:**

LoRA has been successfully applied to a wide range of LLM fine-tuning tasks, including:

* **Domain Adaptation:** Adapting LLMs to specific domains like finance, medicine, or law.
* **Task-Specific Fine-tuning:** Fine-tuning LLMs for specific tasks such as question answering, text summarization, and code generation.
* **Personalization:** Creating personalized LLMs for individual users or groups.

**In summary:**

LoRA is a powerful technique that enables efficient and effective fine-tuning of LLMs. By significantly reducing the number of trainable parameters, LoRA makes it possible to customize large models for specific applications while minimizing training costs and preserving the valuable knowledge of the base model.


In [21]:
# Access the model's named modules
for name, module in model.named_modules():
    print(name, module)


 M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): M2M100ScaledWordEmbedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0-11): 12 x M2M100EncoderLayer(
          (self_attn): M2M100SdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
  

In [28]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=4,  # Low rank (fewer trainable parameters)
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers (query and value)
    bias="none",  # Specify which biases to train
    task_type="SEQ_2_SEQ_LM",  # Task type (sequence-to-sequence)
)

# Wrap the base model with LoRA
model_lora = get_peft_model(model, lora_config)

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="../Model/LoRa/Checkpoint/",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    
)

trainer = Seq2SeqTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

trainer.train()

  trainer = Seq2SeqTrainer(


  0%|          | 0/6 [00:00<?, ?it/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.110392570495605, 'eval_bleu': 1.4304, 'eval_gen_len': 24.6667, 'eval_runtime': 12.3127, 'eval_samples_per_second': 0.487, 'eval_steps_per_second': 0.081, 'epoch': 1.0}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 10.103043556213379, 'eval_bleu': 1.4304, 'eval_gen_len': 24.6667, 'eval_runtime': 11.6643, 'eval_samples_per_second': 0.514, 'eval_steps_per_second': 0.086, 'epoch': 2.0}
{'train_runtime': 100.8059, 'train_samples_per_second': 0.952, 'train_steps_per_second': 0.06, 'train_loss': 10.42147445678711, 'epoch': 2.0}


TrainOutput(global_step=6, training_loss=10.42147445678711, metrics={'train_runtime': 100.8059, 'train_samples_per_second': 0.952, 'train_steps_per_second': 0.06, 'total_flos': 26048741769216.0, 'train_loss': 10.42147445678711, 'epoch': 2.0})

In [24]:
model_lora.merge_and_unload()
model_lora.save_pretrained("../Model/LoRa/M2M100/")


In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the merged model
merged_model = AutoModelForSeq2SeqLM.from_pretrained("../Model/LoRa/M2M100/")



Loading adapter weights from ../Model/LoRa/M2M100/ led to missing keys in the model: model.encoder.layers.0.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.0.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.0.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.0.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.1.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.1.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.1.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.1.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.2.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.2.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.2.self_attn.q_proj.lora_A.default.weight, model.encoder.layers.2.self_attn.q_proj.lora_B.default.weight, model.encoder.layers.3.self_attn.v_proj.lora_A.default.weight, model.encoder.layers.3.self_attn.v_proj.lora_B.default.weight, model.encoder.layers.3.self_attn.

In [27]:
from transformers import pipeline

# Initialize the translation pipeline
translator = pipeline("translation_en_to_hi", model=merged_model, tokenizer=tokenizer)

# Test translation
text = "Hello, how are you?"
translated_text = translator(text)

print(translated_text)

Device set to use cuda:0


[{'translation_text': 'नमस्ते, आप कैसे हैं?'}]


In [26]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, DatasetDict
import torch

In [27]:
def load_model_and_tokenizer(model_name):
    """
    Load the base model and tokenizer.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Load T5-small model and tokenizer
model_name = "t5-small"
model, tokenizer = load_model_and_tokenizer(model_name)

In [28]:
def configure_lora(model, task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, lora_dropout=0.1):
    """
    Configure LoRA for the model.
    """
    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["q", "v"],  # Apply LoRA to attention layers (query and value)
        bias="none",
        task_type=task_type,
    )
    return get_peft_model(model, lora_config)

# Apply LoRA to the model
model = configure_lora(model)

In [36]:
def load_and_split_dataset(data_file, split_percentage=50, train_test_split_ratio=0.2, val_test_split_ratio=0.5):
    """
    Load the dataset and split it into train, validation, and test sets.
    """
    # Load dataset
    dataset = load_dataset("csv", data_files={"train": data_file}, split=f"train[:{split_percentage}%]")
    
    # Split into train and test
    train_test_split = dataset.train_test_split(test_size=train_test_split_ratio, seed=42)
    
    # Further split test into validation and test
    val_test_split = train_test_split["test"].train_test_split(test_size=val_test_split_ratio, seed=42)
    
    # Combine splits into a DatasetDict
    raw_dataset = {
        "train": train_test_split["train"],
        "validation": val_test_split["train"],
        "test": val_test_split["test"],
    }
    
    return DatasetDict(raw_dataset)

# Load and split the dataset
data_file = '../Datasets/processed_data.csv'
dataset = load_and_split_dataset(data_file)

In [None]:
dataset

In [None]:
def preprocess_function(examples, tokenizer, source_lang="English", target_lang="Hindi", max_length=128):
    """
    Tokenize and preprocess the dataset for Seq2Seq tasks.
    """
    inputs = tokenizer(examples[source_lang], max_length=max_length, padding="max_length", truncation=True)
    targets = tokenizer(examples[target_lang], max_length=max_length, padding="max_length", truncation=True)
    
    inputs["labels"] = targets["input_ids"]
    return inputs

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, fn_kwargs={"tokenizer": tokenizer})

In [40]:
def create_data_collator(tokenizer, model):
    """
    Create a data collator for Seq2Seq tasks.
    """
    return DataCollatorForSeq2Seq(tokenizer, model=model)

# Create data collator
data_collator = create_data_collator(tokenizer, model)

In [None]:
def configure_training_args(output_dir, num_train_epochs=3, per_device_train_batch_size=16, fp16=True):
    """
    Configure training arguments for Seq2SeqTrainer.
    """
    return Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-4,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=fp16,
    )

# Configure training arguments
output_dir = "./lora_t5"
training_args = configure_training_args(output_dir)

In [None]:
def create_trainer(model, tokenizer, training_args, train_dataset, eval_dataset, data_collator):
    """
    Create a Seq2SeqTrainer instance.
    """
    return Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

# Create trainer
trainer = create_trainer(
    model=model,
    tokenizer=tokenizer,
    training_args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
# Train the model
trainer.train()

In [None]:
def save_merged_model(model, output_dir):
    """
    Merge LoRA layers and save the model.
    """
    model.merge_and_unload()
    model.save_pretrained(output_dir)

# Save the merged model
save_merged_model(model, "./merged_lora_t5")

In [None]:
def evaluate_model(trainer, eval_dataset):
    """
    Evaluate the model on the test dataset.
    """
    results = trainer.evaluate(eval_dataset=eval_dataset)
    print(f"Evaluation Results: {results}")

# Evaluate the model
evaluate_model(trainer, tokenized_datasets["test"])

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, DatasetDict
import numpy as np
import evaluate

# Load evaluation metric (BLEU)
metric = evaluate.load("sacrebleu")

# Postprocess text function
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

# Compute metrics function
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# Load model and tokenizer
def load_model_and_tokenizer(model_name):
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Configure LoRA
def configure_lora(model, task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, lora_dropout=0.1):
    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["q", "v"],
        bias="none",
        task_type=task_type,
    )
    return get_peft_model(model, lora_config)

# Load and split dataset
def load_and_split_dataset(data_file, split_percentage=0.1, train_test_split_ratio=0.2, val_test_split_ratio=0.5):
    dataset = load_dataset("csv", data_files={"train": data_file}, split=f"train[:{split_percentage}%]")
    train_test_split = dataset.train_test_split(test_size=train_test_split_ratio, seed=42)
    val_test_split = train_test_split["test"].train_test_split(test_size=val_test_split_ratio, seed=42)
    raw_dataset = {
        "train": train_test_split["train"],
        "validation": val_test_split["train"],
        "test": val_test_split["test"],
    }
    return DatasetDict(raw_dataset)

# Preprocess function
def preprocess_function(examples, tokenizer, source_lang="English", target_lang="Hindi", max_length=128):
    inputs = tokenizer(examples[source_lang], max_length=max_length, padding="max_length", truncation=True)
    targets = tokenizer(examples[target_lang], max_length=max_length, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

# Create data collator
def create_data_collator(tokenizer, model):
    return DataCollatorForSeq2Seq(tokenizer, model=model)

# Configure training arguments
def configure_training_args(output_dir, num_train_epochs=3, per_device_train_batch_size=16, fp16=True):
    return Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=5e-4,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=fp16,
    )

# Create trainer
def create_trainer(model, tokenizer, training_args, train_dataset, eval_dataset, data_collator, compute_metrics):
    return Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # Add compute_metrics to the trainer
    )

# Save merged model
def save_merged_model(model, output_dir):
    model.merge_and_unload()
    model.save_pretrained(output_dir)

# Evaluate model
def evaluate_model(trainer, eval_dataset):
    results = trainer.evaluate(eval_dataset=eval_dataset)
    print(f"Evaluation Results: {results}")

# Main workflow
if __name__ == "__main__":
    # Load model and tokenizer
    model_name = "t5-small"
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Apply LoRA
    model = configure_lora(model)

    # Load and split dataset
    data_file = "../Datasets/processed_data.csv"
    dataset = load_and_split_dataset(data_file)

    # Preprocess dataset
    tokenized_datasets = dataset.map(preprocess_function, batched=True, fn_kwargs={"tokenizer": tokenizer})

    # Create data collator
    data_collator = create_data_collator(tokenizer, model)

    # Configure training arguments
    output_dir = "./lora_t5"
    training_args = configure_training_args(output_dir)

    # Create trainer
    trainer = create_trainer(
        model=model,
        tokenizer=tokenizer,
        training_args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # Pass compute_metrics to the trainer
    )

    # Train the model
    trainer.train()

    # Save the merged model
    save_merged_model(model, "./merged_lora_t5")

    # Evaluate the model
    evaluate_model(trainer, tokenized_datasets["test"])