In [1]:
import os
import re
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
text_dir = 'Sans_dataset'

In [2]:
import re

def split_sanskrit_verses(text):
    text = re.sub(r'\s+', ' ', text.strip())

    #delimiters (॥ and ।) 
    parts = re.split(r'(॥\s*\d*॥|।)', text)

    #Combine
    verses = []
    for i in range(0, len(parts) - 1, 2):
        verse = parts[i].strip() + " " + parts[i + 1].strip()
        if len(verse) > 10: 
            verses.append(verse.strip())

    return verses


In [3]:
import os
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
def load_sanskrit_sentences(folder_path):
    samples = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read().strip()
                verses = split_sanskrit_verses(text)
                for verse in verses:
                    samples.append({
                        "input": verse,
                        "target": verse  
                    })

    return samples

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = load_sanskrit_sentences(text_dir)

train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['input', 'target'],
        num_rows: 128552
    })
    test: Dataset({
        features: ['input', 'target'],
        num_rows: 14284
    })
})
{'input': 'स्वस्ये मयि स्वरससत्यसुखावबोधे व्यामोहनाज्जगदिति व्यपदिश्यते यत् ।', 'target': 'स्वस्ये मयि स्वरससत्यसुखावबोधे व्यामोहनाज्जगदिति व्यपदिश्यते यत् ।'}


Transformer => IndicBART

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", use_fast=True)  # or IndicBERT if using BERT

MAX_LENGTH = 128

def tokenize_fn(example):
    inputs = tokenizer(
        example["input"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

    targets = tokenizer(
        example["target"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

    return {
        "input_ids": inputs["input_ids"][0],
        "token_type_ids": inputs["token_type_ids"][0] if "token_type_ids" in inputs else [0] * MAX_LENGTH,
        "attention_mask": inputs["attention_mask"][0],
        "labels": targets["input_ids"][0],
    }

tokenized_dataset = dataset.map(tokenize_fn, remove_columns=["input", "target"])
tokenized_dataset.set_format(type="torch")

print(tokenized_dataset["train"][0])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 128552/128552 [00:45<00:00, 2806.21 examples/s]
Map: 100%|██████████| 14284/14284 [00:05<00:00, 2778.32 examples/s]

{'input_ids': tensor([    2,  2298,   126,   509,   103,   391, 10866,    70, 26584, 16395,
         3287,  6396,    12,  7045, 10660,    10,  5814,    98,   607,   251,
         8234,   128,   607, 12315,    65,   926,   123,    52,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 




In [6]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")



In [7]:
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("CUDA cache cleared before script execution.")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version used by PyTorch: {torch.version.cuda}")
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA cache cleared before script execution.
PyTorch version: 2.5.1+cu121
CUDA version used by PyTorch: 12.1
Is CUDA available: True
GPU: NVIDIA H100 80GB HBM3 MIG 3g.40gb


In [8]:
dataset = tokenized_dataset

In [9]:
print(dataset['train'][0])

{'input_ids': tensor([    2,  2298,   126,   509,   103,   391, 10866,    70, 26584, 16395,
         3287,  6396,    12,  7045, 10660,    10,  5814,    98,   607,   251,
         8234,   128,   607, 12315,    65,   926,   123,    52,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [None]:
import torch
from datasets import load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import numpy as np

bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")

def postprocess_text(preds, labels):
    labels = [[(token if token != -100 else tokenizer.pad_token_id) for token in label] for label in labels]

    preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds]
    labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


from nltk.tokenize import word_tokenize

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    decoded_preds, decoded_labels = postprocess_text(preds, labels)
    for pred, label in zip(decoded_preds[:5], decoded_labels[:5]):
        print("Pred:", pred)
        print("Label:", label)
        print("Tokenized Pred:", pred.split())
        print("Tokenized Label:", [label.split()])
        print("---")
    tokenized_preds = [word_tokenize(pred) for pred in decoded_preds]
    tokenized_labels = [[word_tokenize(label)] for label in decoded_labels]  

    bleu = bleu_metric.compute(predictions=tokenized_preds, references=tokenized_labels)
    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu["bleu"],
        "rougeL": rouge["rougeL"].mid.fmeasure,
        "rouge1": rouge["rouge1"].mid.fmeasure,
        "rouge2": rouge["rouge2"].mid.fmeasure
    }

training_args = TrainingArguments(
    output_dir="./results_fin",
    evaluation_strategy="epoch",      
    logging_strategy="epoch",         
    per_device_train_batch_size=16,   
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    save_strategy="epoch",            
    logging_dir="./logs",             
    load_best_model_at_end=True,      
    metric_for_best_model="bleu",     
    greater_is_better=True,
    save_total_limit=3,               
    report_to="tensorboard",
    eval_accumulation_steps=16,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  bleu_metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Epoch,Training Loss,Validation Loss
