In [None]:
!pip install sentencepiece
!pip install transformers==4.28.0
!pip install datasets evaluate
!pip install sacrebleu

In [None]:
from transformers import AutoTokenizer, M2M100ForConditionalGeneration,Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq,pipeline
from huggingface_hub import notebook_login
from datasets import load_dataset,load_metric
import evaluate
import numpy as np

In [None]:
# Base checkpoint for the Sanskrit-English model: "facebook/m2m100_418M",
# Meta's open-source M2M100 model with 418 million parameters.
MODEL_CHECKPOINT = "facebook/m2m100_418M"

# Tokenizer and model weights are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Load the dataset.
# Itihasa is a Sanskrit-English translation corpus containing 93,000 Sanskrit
# shlokas and their English translations, extracted from M. N. Dutt's seminal
# works on The Ramayana and The Mahabharata.
from datasets import load_dataset

ITIHASA_DATASET_ID = "rahular/itihasa"
dataset = load_dataset(ITIHASA_DATASET_ID)

# Bare last expression so the notebook displays the loaded dataset object.
dataset


In [None]:
#Tokenization
'''
The tokenize_function is specifically designed to process a batch of translation data. Each batch consists of pairs of Sanskrit and English texts extracted from the dataset. The function starts by separating the Sanskrit and English texts from each translation pair in the batch. This separation is achieved using list comprehensions that iterate through the 'translation' field of each entry in the batch, resulting in two separate lists: one for the Sanskrit texts (sanskrit_texts) and another for their corresponding English translations (english_texts).

Next, the function tokenizes these texts using a pre-defined tokenizer, which is applied separately to the Sanskrit texts (inputs) and the English texts (targets). The tokenizer is configured with the following parameters: it truncates texts longer than a maximum length to handle variability in text length, pads sequences to a uniform length of 128 tokens, and outputs the tokenized data as PyTorch tensors ('pt').

After tokenization, the function organizes the tokens into a structured format suitable for training a sequence-to-sequence (Seq2Seq) model. It returns a dictionary containing the following elements:

input_ids: The token IDs representing the Sanskrit texts.

attention_mask: The attention masks for the Sanskrit texts, indicating which tokens should be attended to by the model.

decoder_input_ids: The token IDs for the English texts, which serve as inputs to the decoder of the Seq2Seq model.

decoder_attention_mask: The attention masks for the English texts, guiding the decoder on which tokens to focus.

labels: The labels used for training the Seq2Seq model. These are built from the same English token IDs as the decoder_input_ids, copied into a separate tensor so that they can serve as the target outputs during the training process.
'''

In [None]:
def tokenize_function(batch, max_length=128):
    """Tokenize a batch of Sanskrit/English pairs for Seq2Seq fine-tuning.

    Args:
        batch: A batched slice of the dataset whose ``'translation'`` column
            holds one dict per example with the keys ``'sn'`` (Sanskrit source)
            and ``'en'`` (English target).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        A dict with ``input_ids`` / ``attention_mask`` for the Sanskrit side,
        ``decoder_input_ids`` / ``decoder_attention_mask`` for the English side,
        and ``labels``: the English token ids with every padding position
        replaced by -100.
    """
    sanskrit_texts = [entry['sn'] for entry in batch['translation']]
    english_texts = [entry['en'] for entry in batch['translation']]

    # Tokenize the inputs (Sanskrit) and targets (English) to a fixed length.
    inputs = tokenizer(sanskrit_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")
    targets = tokenizer(english_texts, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")

    # Padding must not count towards the training loss. -100 is the label value
    # the loss skips, and compute_metrics in this notebook already maps -100
    # back to the pad id before decoding. masked_fill returns a new tensor, so
    # the target ids themselves are left untouched.
    labels = targets["input_ids"].masked_fill(targets["attention_mask"] == 0, -100)

    # NOTE(review): decoder_input_ids are the *unshifted* target ids. The
    # DataCollatorForSeq2Seq built with the model is expected to rebuild shifted
    # decoder inputs from `labels` at batch time - confirm before reusing this
    # dataset with a different collator.
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "decoder_input_ids": targets["input_ids"],
        "decoder_attention_mask": targets["attention_mask"],
        "labels": labels,
    }

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
'''
Transformers Library: The DataCollatorForSeq2Seq is a critical component from the Transformers library, which is widely used for advanced machine learning models.

Purpose: The DataCollatorForSeq2Seq efficiently prepares batches of data for sequence-to-sequence (Seq2Seq) models, ensuring data is formatted correctly for training.

Initialization: The data collator is initialized by passing two key arguments:

Tokenizer: Converts text into a format that the model can understand.
Model: The Seq2Seq model that will be trained.
Functionality: Given these arguments, the data collator assembles the already-tokenized examples into padded batches that are formatted to meet the requirements of the Seq2Seq model.

Outcome: This setup helps in the seamless training of Seq2Seq models by handling data preprocessing automatically.
'''

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch builder handed to the trainer later in the notebook. It is given the
# tokenizer (first argument) and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
'''
Importing Evaluation Metrics: We will import two evaluation metrics: SacreBLEU and METEOR.

SacreBLEU Metric:
Loading: SacreBLEU is loaded using the evaluate.load method.
Purpose: SacreBLEU is a standard metric for evaluating machine translation quality.
Functionality: It compares machine-generated translations with one or more reference translations, providing a score that reflects the translation's accuracy and fluency.

METEOR Metric:
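Loading: METEOR is loaded through the Hugging Face metric-loading API (historically the load_metric function from datasets, which the evaluate library's evaluate.load supersedes).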
Purpose: METEOR is another popular metric for evaluating machine translation.
Differences from BLEU: Unlike BLEU, METEOR considers factors like synonyms and stemming, providing a more nuanced assessment of translation quality.
'''

In [None]:
import evaluate
from datasets import load_metric

# Both metrics are loaded through the `evaluate` library. `datasets.load_metric`
# is deprecated and is not available in newer `datasets` releases, so it is no
# longer used here.
metric = evaluate.load("sacrebleu")
meteor = evaluate.load("meteor")

In [None]:
'''
Postprocessing Functions
Overview: Two functions, postprocess_text and compute_metrics, are defined to evaluate the performance of a machine translation model.

postprocess_text(preds, labels):
Purpose: Cleans and prepares predictions and labels for evaluation.
Functionality:
Strips leading and trailing spaces from the model's predictions (preds) and ground truth labels (labels).
Converts the labels into a list of lists, where each inner list contains a single label.

compute_metrics(eval_preds):
Purpose: Computes evaluation metrics for the machine translation model.
Functionality:
Separates the raw predictions and labels from the model evaluation.
If predictions are in a tuple format, extracts the necessary part.
Decodes predictions and labels from their tokenized form back into text, removing special tokens (like padding or start/end tokens).
Applies the postprocess_text function to clean and format the decoded predictions and labels.
Computes evaluation metrics such as BLEU and METEOR scores, which are standard for assessing machine translation quality.
Calculates the average length of the predictions, indicating the model’s verbosity or succinctness.
Rounds the metric scores for easier interpretation and returns the results.
'''

In [None]:
def postprocess_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each label in a list.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: the stripped predictions as a flat list,
        and the stripped references as a list of single-element lists.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # One inner list per example, holding that example's single reference.
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated translations against the reference translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        A dict with ``bleu``, ``gen_len`` (mean number of non-padding tokens
        per prediction) and ``meteor``, each rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # Map it back to the pad token on BOTH arrays before decoding; this also
    # keeps such padding out of the generated-length statistic below.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu_result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # Length of each prediction, counted as its number of non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {
        'bleu': bleu_result['score'],
        "gen_len": np.mean(prediction_lens),
        "meteor": meteor_result["meteor"],
    }
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
'''
Training Configuration

Purpose: The code snippet configures training arguments for a sequence-to-sequence (Seq2Seq) model using the Hugging Face Transformers library.
Importance: This setup is essential for specifying the parameters and settings that define the model's training process.
Components: The configuration includes details such as:
Learning rate: Controls how much the model's weights are adjusted during training.
Batch size: Determines the number of training samples used in one iteration.
Number of epochs: Specifies how many times the model will pass through the entire training dataset.
Save and evaluation frequency: Defines when the model's performance will be evaluated and when checkpoints will be saved during training.
Outcome: Proper configuration ensures the Seq2Seq model is trained effectively and efficiently, maximizing performance and minimizing errors.
'''

In [None]:
# Fine-tuning configuration, grouped by concern.
# NOTE(review): push_to_hub=True presumably needs Hugging Face Hub credentials
# as soon as the trainer is created, yet the login cell appears further down in
# this notebook - confirm you are logged in before running this cell.
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint handling.
    output_dir="M2M101",
    save_total_limit=3,
    push_to_hub=True,
    # Optimisation.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Evaluation: once per epoch, scoring generated text.
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Start fine-tuning.
trainer.train()

In [None]:
# Log in to the Hugging Face Hub so the fine-tuned model can be pushed there and
# reused later. notebook_login() (imported at the top of this notebook) asks for
# the access token inside the notebook itself; an interactive
# `!huggingface-cli login` prompt may not receive keyboard input when it is
# launched through `!` in Jupyter.
notebook_login()

In [None]:
# Push the trainer's model to the Hugging Face Hub (presumably requires the
# Hub login to have succeeded first).
trainer.push_to_hub()

In [None]:
# Model testing: translate one Sanskrit line with the fine-tuned model.

text = "सत्यमेवेश्वरो लोके सत्यं पद्माश्रिता सदा"

from transformers import pipeline

# "M2M101" is the output_dir the trainer in this notebook writes to; nothing in
# the notebook creates a model called "my_sanskrit_model".
# The M2M100 translation pipeline needs explicit source/target language codes.
# NOTE(review): no src_lang/tgt_lang was set on the tokenizer during training,
# so both sides were presumably tagged with its default language ("en"); the
# same codes are passed here so inference matches training - confirm against
# `tokenizer.src_lang`.
translator = pipeline(
    "translation",
    model="M2M101",
    src_lang="en",
    tgt_lang="en",
)
translator(text)