# Settign Up Environment

In [1]:
!pip install transformers huggingface_hub accelerate datasets sentencepiece evaluate sacrebleu
!pip install indic-nlp-library torch torchvision
!pip install scikit-learn pandas numpy
!pip install stanza  # For morphological tagging

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [evaluate]
[1A[2KSuccessfully installed evaluate-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.met

# Importing Required Modules

In [13]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from transformers.modeling_outputs import Seq2SeqLMOutput
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import evaluate
import random
import numpy as np
from typing import Dict, List, Tuple
import pandas as pd
import stanza
from indicnlp.morph import unsupervised_morph
from indicnlp.tokenize import indic_tokenize
import json

In [15]:
# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Loading Dataset

## Using ChatRAG-Hi dataset from Huggingface

In [14]:

# Available configs: inscit, hybridial, doc2dial, quac, qrecc, doqa_cooking, doqa_movies, doqa_travel
DATASET_CONFIG = "inscit"  

try:
    # Load specific config with train/validation splits
    print(f"Loading config: {DATASET_CONFIG}")
    dataset = load_dataset("nvidia/ChatRAG-Hi", DATASET_CONFIG)
    
    # Check available splits
    print(f"Available splits: {list(dataset.keys())}")
    
    # Combine or use train split
    if "train" in dataset:
        train_data = dataset["train"]
    else:
        # If no train split, use the first available split
        first_split = list(dataset.keys())[0]
        train_data = dataset[first_split]
        print(f"Using '{first_split}' split as training data")
    
    # Create train/test split
    dataset = train_data.train_test_split(test_size=0.1, seed=42)
    print(f"Train size: {len(dataset['train'])}, Test size: {len(dataset['test'])}")
    
    # Inspect the structure
    print("\nDataset columns:", dataset["train"].column_names)
    print("\nSample data:")
    print(dataset["train"][0])
    
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("\nTrying to load all configs and combine them...")
    
    try:
        # Try loading and combining multiple configs
        all_configs = ['inscit', 'hybridial', 'doc2dial', 'quac', 'qrecc']
        combined_data = []
        
        for config in all_configs:
            try:
                print(f"Loading config: {config}...")
                config_dataset = load_dataset("nvidia/ChatRAG-Hi", config, split="train")
                combined_data.append(config_dataset)
                print(f"  Loaded {len(config_dataset)} samples")
            except:
                print(f"  Could not load {config}, skipping...")
                continue
        
        if combined_data:
            from datasets import concatenate_datasets
            train_data = concatenate_datasets(combined_data)
            dataset = train_data.train_test_split(test_size=0.1, seed=42)
            print(f"\nCombined dataset - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")
            print("\nDataset columns:", dataset["train"].column_names)
            print("\nSample data:")
            print(dataset["train"][0])
        else:
            raise Exception("Could not load any configs")
            
    except Exception as e2:
        print(f"Error combining datasets: {e2}")
        print("\nCreating dummy dataset for demonstration...")
        from datasets import Dataset as HFDataset
        dummy_data = {
            "context": ["यह संदर्भ है"] * 100,
            "question": ["यह प्रश्न है?"] * 100,
            "answers": [{"text": ["यह उत्तर है"], "answer_start": [0]}] * 100,
            "id": [f"dummy_{i}" for i in range(100)]
        }
        dataset = HFDataset.from_dict(dummy_data)
        dataset = dataset.train_test_split(test_size=0.1, seed=42)
        print(f"Dummy dataset created - Train: {len(dataset['train'])}, Test: {len(dataset['test'])}")


Loading config: inscit
Available splits: ['test']
Using 'test' split as training data
Train size: 405, Test size: 45

Dataset columns: ['topic', 'ctxs', 'answers', 'ground_truth_ctx', 'messages']

Sample data:
{'topic': "Mother's Day", 'ctxs': [{'text': 'उनका जन्म डच ईस्ट इंडीज (वर्तमान इंडोनेशिया) में एक कुलीन जावानीस परिवार में हुआ था। डच भाषा के प्राथमिक विद्यालय में पढ़ने के बाद, वह आगे की शिक्षा प्राप्त करना चाहती थी, लेकिन उस समय जावानीस महिलाओं को उच्च शिक्षा से वंचित रखा गया था। वह विभिन्न अधिकारियों और प्रभावशाली लोगों से मिलीं, जिनमें जे.एच. अबेन्डानन भी शामिल थे, जो डच नैतिक नीति को लागू करने के प्रभारी थे।', 'title': 'कार्तिनी'}, {'text': 'जावानीस महिला अधिकार अधिवक्ता राडेन अजेंग कार्तिनी (लेडी कार्तिनी) के नाम पर कार्तिनी स्कूल, डच नैतिक नीति के मद्देनजर डच ईस्ट इंडीज में स्वदेशी लड़कियों को शिक्षित करने के लिए खोले गए थे।', 'title': 'कार्तिनी स्कूल'}, {'text': 'डच नैतिक नीति () 1901 से 1942 के जापानी कब्जे तक चार दशकों के दौरान डच ईस्ट इंडीज (वर्तमान इंडोनेशिया) की औपनिव

# Initialize Stanza for morphological tagging

In [5]:

print("\nInitializing Stanza Hindi pipeline for morphological analysis...")
try:
    stanza.download('hi', verbose=False)
    nlp = stanza.Pipeline('hi', processors='tokenize,pos,lemma', verbose=False)
    print("Stanza pipeline loaded successfully!")
except Exception as e:
    print(f"Warning: Could not load Stanza: {e}")
    nlp = None



Initializing Stanza Hindi pipeline for morphological analysis...
Stanza pipeline loaded successfully!


# Advanced Morphological Augmentation

In [16]:
class MorphologicalAugmenter:
    def __init__(self, stanza_pipeline=None):
        self.nlp = stanza_pipeline
        
        # Hindi morphological affixes
        self.case_markers = {
            "nominative": [""],
            "accusative": ["को"],
            "instrumental": ["से", "के साथ"],
            "dative": ["को", "के लिए"],
            "ablative": ["से"],
            "genitive": ["का", "की", "के"],
            "locative": ["में", "पर"]
        }
        
        self.gender_suffixes = {
            "masculine": ["ा", "े"],
            "feminine": ["ी", "ें"]
        }
        
        self.tense_markers = {
            "present": ["ता है", "ती है", "ते हैं"],
            "past": ["ता था", "ती थी", "ते थे", "या", "ई", "ए"],
            "future": ["गा", "गी", "गे", "एगा", "एगी"]
        }
        
        self.number_markers = {
            "singular": [""],
            "plural": ["ों", "ें", "ें", "ओं"]
        }
        
        self.morphemes = ["ा", "ी", "ें", "ों", "ता", "ती", "ते", "वाल", "पन", "हट"]
    
    def extract_morphological_features(self, text: str) -> Dict:
        """Extract morphological features using Stanza"""
        if self.nlp is None:
            return {"tags": [], "lemmas": []}
        
        try:
            doc = self.nlp(text)
            features = {
                "tags": [],
                "lemmas": [],
                "words": []
            }
            
            for sent in doc.sentences:
                for word in sent.words:
                    features["words"].append(word.text)
                    features["tags"].append(word.upos)
                    features["lemmas"].append(word.lemma)
            
            return features
        except Exception as e:
            return {"tags": [], "lemmas": [], "words": []}
    
    def augment_with_case(self, text: str) -> str:
        """Add case marker variations"""
        if random.random() < 0.3:
            words = text.split()
            if len(words) > 1:
                idx = random.randint(0, len(words)-2)
                case = random.choice(list(self.case_markers.keys()))
                marker = random.choice(self.case_markers[case])
                if marker:
                    words.insert(idx+1, marker)
                return " ".join(words)
        return text
    
    def augment_with_gender(self, text: str) -> str:
        """Add gender inflection variations"""
        if random.random() < 0.3:
            words = text.split()
            if words:
                idx = random.randint(0, len(words)-1)
                gender = random.choice(list(self.gender_suffixes.keys()))
                suffix = random.choice(self.gender_suffixes[gender])
                # Remove existing suffix if present
                word = words[idx].rstrip("ाीेें")
                words[idx] = word + suffix
                return " ".join(words)
        return text
    
    def augment_with_tense(self, text: str) -> str:
        """Add tense marker variations"""
        if random.random() < 0.3:
            words = text.split()
            tense = random.choice(list(self.tense_markers.keys()))
            marker = random.choice(self.tense_markers[tense])
            words.append(marker)
            return " ".join(words)
        return text
    
    def augment_with_number(self, text: str) -> str:
        """Add number inflection variations"""
        if random.random() < 0.3:
            words = text.split()
            if words:
                idx = random.randint(0, len(words)-1)
                number = random.choice(list(self.number_markers.keys()))
                suffix = random.choice(self.number_markers[number])
                words[idx] = words[idx] + suffix
                return " ".join(words)
        return text
    
    def comprehensive_augment(self, text: str) -> str:
        """Apply multiple augmentation strategies"""
        augmentations = [
            self.augment_with_case,
            self.augment_with_gender,
            self.augment_with_tense,
            self.augment_with_number
        ]
        
        # Apply 1-2 random augmentations
        num_augs = random.randint(1, 2)
        selected_augs = random.sample(augmentations, num_augs)
        
        augmented = text
        for aug_fn in selected_augs:
            augmented = aug_fn(augmented)
        
        return augmented

augmenter = MorphologicalAugmenter(nlp)

# Multitask Model with Morphological Tagging Head

In [18]:
# class ByT5WithMorphologyHead(nn.Module):
#     def __init__(self, base_model, num_morph_tags=17):  # UD has 17 universal POS tags
#         super().__init__()
#         self.base_model = base_model
#         self.config = base_model.config
        
#         # Morphological tagging head
#         self.morph_head = nn.Linear(self.config.d_model, num_morph_tags)
#         self.morph_loss_weight = 0.3
    
#     def forward(self, input_ids, attention_mask=None, labels=None, morph_labels=None):
#         # Main seq2seq output
#         outputs = self.base_model(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             labels=labels
#         )
        
#         loss = outputs.loss
        
#         # Add morphological tagging loss if labels provided
#         if morph_labels is not None:
#             encoder_outputs = self.base_model.encoder(
#                 input_ids=input_ids,
#                 attention_mask=attention_mask
#             )
#             hidden_states = encoder_outputs.last_hidden_state
            
#             # Morphological predictions
#             morph_logits = self.morph_head(hidden_states)
            
#             # Compute morphological loss
#             morph_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
#             morph_loss = morph_loss_fct(
#                 morph_logits.view(-1, morph_logits.size(-1)),
#                 morph_labels.view(-1)
#             )
            
#             # Combined loss
#             loss = loss + self.morph_loss_weight * morph_loss
        
#         return type('obj', (object,), {
#             'loss': loss,
#             'logits': outputs.logits,
#             'past_key_values': outputs.past_key_values,
#             'encoder_last_hidden_state': outputs.encoder_last_hidden_state
#         })()
    
#     def generate(self, *args, **kwargs):
#         return self.base_model.generate(*args, **kwargs)


# Updated One

# ------------------------
# Multitask Model with Morphological Tagging Head
# ------------------------
from transformers.modeling_outputs import Seq2SeqLMOutput

class ByT5WithMorphologyHead(nn.Module):
    def __init__(self, base_model, num_morph_tags=17):  # UD has 17 universal POS tags
        super().__init__()
        self.base_model = base_model
        self.config = base_model.config
        
        # Morphological tagging head
        self.morph_head = nn.Linear(self.config.d_model, num_morph_tags)
        self.morph_loss_weight = 0.3
    
    def forward(self, input_ids, attention_mask=None, labels=None, morph_labels=None, **kwargs):
        # Remove unexpected arguments that base model doesn't accept
        kwargs.pop('num_items_in_batch', None)
        
        # Main seq2seq output
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )
        
        loss = outputs.loss
        
        # Add morphological tagging loss if labels provided
        if morph_labels is not None and loss is not None:
            encoder_outputs = self.base_model.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            hidden_states = encoder_outputs.last_hidden_state
            
            # Morphological predictions
            morph_logits = self.morph_head(hidden_states)
            
            # Convert morph_labels to tensor if it's a list
            if isinstance(morph_labels, list):
                morph_labels = torch.tensor(morph_labels, device=hidden_states.device)
            elif not isinstance(morph_labels, torch.Tensor):
                morph_labels = torch.tensor(morph_labels, device=hidden_states.device)
            else:
                morph_labels = morph_labels.to(hidden_states.device)
            
            # Compute morphological loss
            morph_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            morph_loss = morph_loss_fct(
                morph_logits.view(-1, morph_logits.size(-1)),
                morph_labels.view(-1)
            )
            
            # Combined loss
            loss = loss + self.morph_loss_weight * morph_loss
        
        # Return proper Seq2SeqLMOutput object
        return Seq2SeqLMOutput(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
    
    def generate(self, *args, **kwargs):
        return self.base_model.generate(*args, **kwargs)
    
    def prepare_inputs_for_generation(self, *args, **kwargs):
        return self.base_model.prepare_inputs_for_generation(*args, **kwargs)

# Preprocessing with morphological features

In [21]:
POS_TAG_MAP = {
    "ADJ": 0, "ADP": 1, "ADV": 2, "AUX": 3, "CCONJ": 4,
    "DET": 5, "INTJ": 6, "NOUN": 7, "NUM": 8, "PART": 9,
    "PRON": 10, "PROPN": 11, "PUNCT": 12, "SCONJ": 13,
    "SYM": 14, "VERB": 15, "X": 16, "UNK": 16
}

# def preprocess_function(examples, tokenizer, augmenter, use_augmentation=True):
#     """Preprocess function handling NVIDIA ChatRAG-Hi format"""
    
#     # Handle NVIDIA ChatRAG-Hi format (conversational QA)
#     if "context" in examples and "question" in examples:
#         # ChatRAG format: context + question -> answer
#         contexts = examples["context"]
#         questions = examples["question"]
        
#         # Combine context and question as input
#         inputs = [f"संदर्भ: {ctx}\nप्रश्न: {q}" for ctx, q in zip(contexts, questions)]
        
#         # Extract answers (handle different answer formats)
#         if "answers" in examples:
#             # answers is typically a dict with 'text' field
#             answers_data = examples["answers"]
#             if isinstance(answers_data[0], dict):
#                 targets = [ans["text"][0] if isinstance(ans["text"], list) else ans["text"] 
#                           for ans in answers_data]
#             else:
#                 targets = [str(ans) for ans in answers_data]
#         elif "answer" in examples:
#             targets = examples["answer"]
#         else:
#             # Fallback: use first few words of context
#             targets = [ctx.split()[:10] for ctx in contexts]
#             targets = [" ".join(words) for words in targets]
    
#     # Handle instruction-based format
#     elif "instruction" in examples:
#         inputs = examples["instruction"]
#         if "input" in examples:
#             inputs = [f"{inst} {inp}".strip() for inst, inp in zip(examples["instruction"], examples["input"])]
#         targets = examples["output"]
    
#     # Fallback for other formats
#     else:
#         # Try to find input/output pairs
#         input_cols = [col for col in examples.keys() if "input" in col.lower() or "question" in col.lower()]
#         output_cols = [col for col in examples.keys() if "output" in col.lower() or "answer" in col.lower()]
        
#         if input_cols and output_cols:
#             inputs = examples[input_cols[0]]
#             targets = examples[output_cols[0]]
#         else:
#             # Last resort: use first two columns
#             cols = list(examples.keys())
#             inputs = examples[cols[0]] if len(cols) > 0 else [""] * len(list(examples.values())[0])
#             targets = examples[cols[1]] if len(cols) > 1 else [""] * len(list(examples.values())[0])
    
#     # Ensure inputs and targets are lists of strings
#     if not isinstance(inputs, list):
#         inputs = [inputs]
#     if not isinstance(targets, list):
#         targets = [targets]
    
#     inputs = [str(inp) for inp in inputs]
#     targets = [str(tgt) for tgt in targets]
    
#     # Apply morphological augmentation
#     if use_augmentation:
#         augmented_inputs = ["morphology aware: " + augmenter.comprehensive_augment(inp) for inp in inputs]
#     else:
#         augmented_inputs = ["morphology aware: " + inp for inp in inputs]
    
#     # Tokenize
#     model_inputs = tokenizer(
#         augmented_inputs,
#         max_length=512,
#         truncation=True,
#         padding="max_length"
#     )
    
#     labels = tokenizer(
#         targets,
#         max_length=128,
#         truncation=True,
#         padding="max_length"
#     )
    
#     model_inputs["labels"] = labels["input_ids"]
    
#     # Extract morphological tags (if available)
#     morph_labels = []
#     for inp in inputs:
#         features = augmenter.extract_morphological_features(inp)
#         tags = [POS_TAG_MAP.get(tag, POS_TAG_MAP["UNK"]) for tag in features["tags"]]
#         # Pad or truncate to match input length
#         if len(tags) < 512:
#             tags.extend([-100] * (512 - len(tags)))
#         else:
#             tags = tags[:512]
#         morph_labels.append(tags)
    
#     model_inputs["morph_labels"] = morph_labels
    
#     return model_inputs

def preprocess_function(examples, tokenizer, augmenter, use_augmentation=True):
    """Preprocess function handling NVIDIA ChatRAG-Hi format"""
    
    # Handle NVIDIA ChatRAG-Hi format with messages and ctxs
    if "messages" in examples and "ctxs" in examples:
        # Extract conversations from messages
        inputs = []
        targets = []
        
        for msgs, ctxs, ans in zip(examples["messages"], examples["ctxs"], examples["answers"]):
            # ctxs is a list of context passages
            context_text = " ".join([ctx.get("text", "") if isinstance(ctx, dict) else str(ctx) for ctx in ctxs]) if ctxs else ""
            
            # messages is a list of conversation turns
            if isinstance(msgs, list) and len(msgs) > 0:
                # Get the last user message as question
                user_msgs = [m for m in msgs if isinstance(m, dict) and m.get("role") == "user"]
                question_text = user_msgs[-1].get("content", "") if user_msgs else ""
            else:
                question_text = str(msgs) if msgs else ""
            
            # Combine context and question
            input_text = f"संदर्भ: {context_text}\nप्रश्न: {question_text}" if context_text else f"प्रश्न: {question_text}"
            inputs.append(input_text)
            
            # Extract answer
            if isinstance(ans, dict):
                target_text = ans.get("text", [""])[0] if isinstance(ans.get("text"), list) else ans.get("text", "")
            elif isinstance(ans, list):
                target_text = ans[0] if ans else ""
            else:
                target_text = str(ans)
            targets.append(target_text)
    
    # Handle format with context and question
    elif "context" in examples and "question" in examples:
        contexts = examples["context"]
        questions = examples["question"]
        
        inputs = [f"संदर्भ: {ctx}\nप्रश्न: {q}" for ctx, q in zip(contexts, questions)]
        
        if "answers" in examples:
            answers_data = examples["answers"]
            if isinstance(answers_data[0], dict):
                targets = [ans["text"][0] if isinstance(ans["text"], list) else ans["text"] 
                          for ans in answers_data]
            else:
                targets = [str(ans) for ans in answers_data]
        elif "answer" in examples:
            targets = examples["answer"]
        else:
            targets = [ctx.split()[:10] for ctx in contexts]
            targets = [" ".join(words) for words in targets]
    
    # Handle instruction-based format
    elif "instruction" in examples:
        inputs = examples["instruction"]
        if "input" in examples:
            inputs = [f"{inst} {inp}".strip() for inst, inp in zip(examples["instruction"], examples["input"])]
        targets = examples["output"]
    
    # Fallback for other formats
    else:
        input_cols = [col for col in examples.keys() if "input" in col.lower() or "question" in col.lower()]
        output_cols = [col for col in examples.keys() if "output" in col.lower() or "answer" in col.lower()]
        
        if input_cols and output_cols:
            inputs = examples[input_cols[0]]
            targets = examples[output_cols[0]]
        else:
            cols = list(examples.keys())
            inputs = examples[cols[0]] if len(cols) > 0 else [""] * len(list(examples.values())[0])
            targets = examples[cols[1]] if len(cols) > 1 else [""] * len(list(examples.values())[0])
    
    # Ensure inputs and targets are lists of strings
    if not isinstance(inputs, list):
        inputs = [inputs]
    if not isinstance(targets, list):
        targets = [targets]
    
    inputs = [str(inp) for inp in inputs]
    targets = [str(tgt) for tgt in targets]
    
    # Apply morphological augmentation
    if use_augmentation:
        augmented_inputs = ["morphology aware: " + augmenter.comprehensive_augment(inp) for inp in inputs]
    else:
        augmented_inputs = ["morphology aware: " + inp for inp in inputs]
    
    # Tokenize
    model_inputs = tokenizer(
        augmented_inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    
    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    
    model_inputs["labels"] = labels["input_ids"]
    
    # Extract morphological tags (if available)
    morph_labels = []
    for inp in inputs:
        features = augmenter.extract_morphological_features(inp)
        tags = [POS_TAG_MAP.get(tag, POS_TAG_MAP["UNK"]) for tag in features["tags"]]
        # Pad or truncate to match input length
        if len(tags) < 512:
            tags.extend([-100] * (512 - len(tags)))
        else:
            tags = tags[:512]
        morph_labels.append(tags)
    
    model_inputs["morph_labels"] = morph_labels
    
    return model_inputs

# Evaluation Metrics

In [9]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
[33m  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d508da6c9509985d4992e8d5ca00d9e7abf88e2f1a0887ee109fa52fc7b63480
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully bu

In [10]:
# metric_bleu = evaluate.load("sacrebleu")
# metric_rouge = evaluate.load("rouge")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
    
#     # Decode predictions and labels
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
#     # Replace -100 in labels as we can't decode them
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
#     # Compute BLEU
#     bleu_result = metric_bleu.compute(
#         predictions=decoded_preds,
#         references=[[label] for label in decoded_labels]
#     )
    
#     # Compute ROUGE
#     rouge_result = metric_rouge.compute(
#         predictions=decoded_preds,
#         references=decoded_labels
#     )
    
#     return {
#         "bleu": bleu_result["score"],
#         "rouge1": rouge_result["rouge1"],
#         "rouge2": rouge_result["rouge2"],
#         "rougeL": rouge_result["rougeL"]
#     }


# ------------------------
# Evaluation Metrics
# ------------------------
metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Handle predictions - they might be logits, so take argmax
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    # If predictions are logits (3D: batch x seq_len x vocab_size), take argmax
    if len(predictions.shape) == 3:
        predictions = np.argmax(predictions, axis=-1)
    
    # Convert to numpy arrays if needed
    if not isinstance(predictions, np.ndarray):
        predictions = np.array(predictions)
    if not isinstance(labels, np.ndarray):
        labels = np.array(labels)
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels as we can't decode them
    # Handle each sequence separately to avoid shape issues
    decoded_labels = []
    for label_seq in labels:
        # Replace -100 with pad_token_id
        label_seq = np.where(label_seq != -100, label_seq, tokenizer.pad_token_id)
        decoded_label = tokenizer.decode(label_seq, skip_special_tokens=True)
        decoded_labels.append(decoded_label)
    
    # Filter out empty predictions/labels
    filtered_preds = []
    filtered_labels = []
    for pred, label in zip(decoded_preds, decoded_labels):
        if pred.strip() and label.strip():
            filtered_preds.append(pred)
            filtered_labels.append(label)
    
    # If no valid predictions, return zeros
    if not filtered_preds:
        return {
            "bleu": 0.0,
            "rouge1": 0.0,
            "rouge2": 0.0,
            "rougeL": 0.0
        }
    
    # Compute BLEU
    bleu_result = metric_bleu.compute(
        predictions=filtered_preds,
        references=[[label] for label in filtered_labels]
    )
    
    # Compute ROUGE
    rouge_result = metric_rouge.compute(
        predictions=filtered_preds,
        references=filtered_labels
    )
    
    return {
        "bleu": bleu_result["score"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"]
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# Benchmarking Framework

In [29]:
import torch
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)
import traceback

# Make sure compute_metrics, preprocess_function, and augmenter exist in your script
# from your_module import compute_metrics, preprocess_function, augmenter, ByT5WithMorphologyHead

class ModelBenchmark:
    def __init__(self, dataset):
        """
        dataset: a Hugging Face DatasetDict with 'train' and 'test' splits
        """
        self.dataset = dataset
        self.results = []

    # ---------------------------------------------------
    def evaluate_model(self, model_name: str, model, tokenizer, tokenized_dataset):
        """Train + evaluate one model"""
        print(f"\n{'='*60}")
        print(f"🚀 Evaluating {model_name} ...")
        print(f"{'='*60}")

        try:
            training_args = TrainingArguments(
                output_dir=f"./results_{model_name.replace('/', '_')}",
                eval_strategy="epoch",
                save_strategy="epoch",
                learning_rate=2e-4,
                per_device_train_batch_size=4,
                per_device_eval_batch_size=4,
                num_train_epochs=3,
                warmup_ratio=0.1,
                logging_steps=50,
                fp16=torch.cuda.is_available(),
                save_total_limit=2,
                load_best_model_at_end=True,
                metric_for_best_model="bleu",
                report_to="none",
                save_safetensors=False,  # avoid shared tensor save errors
                max_grad_norm=1.0,
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["test"],
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
            )

            # ------------------------------
            print(f"\nTraining {model_name} ...")
            trainer.model.config.tie_word_embeddings = False  # fix shared-weights save bug
            trainer.train()

            # ------------------------------
            print(f"\nEvaluating {model_name} ...")
            eval_results = trainer.evaluate()

            # Count parameters
            param_count = sum(p.numel() for p in model.parameters())

            result = {
                "model": model_name,
                "bleu": eval_results.get("eval_bleu", 0.0),
                "rouge1": eval_results.get("eval_rouge1", 0.0),
                "rouge2": eval_results.get("eval_rouge2", 0.0),
                "rougeL": eval_results.get("eval_rougeL", 0.0),
                "parameters": param_count / 1e6,  # in millions
            }

            self.results.append(result)

            print(f"\n✅ Results for {model_name}:")
            print(f"  BLEU: {result['bleu']:.2f}")
            print(f"  ROUGE-1: {result['rouge1']:.4f}")
            print(f"  ROUGE-L: {result['rougeL']:.4f}")
            print(f"  Parameters: {result['parameters']:.2f}M")

            return result

        except Exception as e:
            print(f"❌ Error during evaluation of {model_name}: {e}")
            traceback.print_exc()
            return None

    # ---------------------------------------------------
    def run_benchmark(self):
        """Run benchmark across selected models"""
        models_to_test = [
            ("bert-base-multilingual-cased", "mBERT"),
            ("xlm-roberta-base", "XLM-R"),
            ("google/byt5-small", "ByT5"),
            ("ai4bharat/IndicBERTv2-MLM-only", "IndicBERTv2"),
        ]

        for model_path, model_label in models_to_test:
            print(f"\n\n{'='*80}")
            print(f"🧩 Loading {model_label} ...")
            print(f"{'='*80}")

            try:
                # Skip encoder-only models for seq2seq
                if any(x in model_path.lower() for x in ["bert", "roberta", "indicbert"]):
                    print(f"⚠️ Skipping {model_label} (encoder-only, not suitable for seq2seq tasks).")
                    self.results.append({
                        "model": model_label,
                        "bleu": None,
                        "rouge1": None,
                        "rouge2": None,
                        "rougeL": None,
                        "parameters": None,
                    })
                    continue

                # ------------------------------
                # Load ByT5 model + tokenizer
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                base_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

                # If you have a custom morphology head, apply it; else, just use base model
                try:
                    model = ByT5WithMorphologyHead(base_model)
                except NameError:
                    print("⚠️ ByT5WithMorphologyHead not found — using base model instead.")
                    model = base_model

                # ------------------------------
                # Preprocess dataset
                print(f"🔧 Tokenizing dataset for {model_label} ...")
                tokenized_dataset = self.dataset.map(
                    lambda x: preprocess_function(x, tokenizer, augmenter, use_augmentation=True),
                    batched=True,
                    remove_columns=self.dataset["train"].column_names,
                )

                if "train" not in tokenized_dataset or "test" not in tokenized_dataset:
                    print(f"⚠️ Missing train/test splits for {model_label}. Skipping.")
                    continue

                # Evaluate model
                self.evaluate_model(model_label, model, tokenizer, tokenized_dataset)

            except Exception as e:
                print(f"❌ Error evaluating {model_label}: {e}")
                traceback.print_exc()

            # finally:
            #     # Always clean up GPU memory
            #     del model
            #     del tokenizer
            #     if torch.cuda.is_available():
            #         torch.cuda.empty_cache()

        # After all models
        return self.display_results()

    # ---------------------------------------------------
    def display_results(self):
        """Display benchmark results and save to CSV"""
        print("\n" + "=" * 80)
        print("📊 BENCHMARK RESULTS SUMMARY")
        print("=" * 80)

        if not self.results:
            print("No successful evaluations found. Check previous logs for details.")
            return pd.DataFrame()

        df = pd.DataFrame(self.results)

        if "bleu" in df.columns:
            df = df.sort_values(by="bleu", ascending=False)
        else:
            print("⚠️ 'bleu' column missing — results may be incomplete.")

        print("\n", df.to_string(index=False))
        df.to_csv("benchmark_results.csv", index=False)
        print("\n✅ Results saved to benchmark_results.csv")

        return df


# Main Training: ByT5 with Morphology

In [None]:
print("\n" + "="*80)
print("TRAINING BYT5 WITH MORPHOLOGICAL AWARENESS")
print("="*80)

model_name = "google/byt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create model with morphology head
model = ByT5WithMorphologyHead(base_model)

# Preprocess datasets
print("\nPreprocessing datasets with morphological augmentation...")
tokenized_dataset = dataset.map(
    lambda x: preprocess_function(x, tokenizer, augmenter, use_augmentation=True),
    batched=True,
    remove_columns=dataset["train"].column_names
)

In [None]:
print(f"Train samples: {len(tokenized_dataset['train'])}")
print(f"Test samples: {len(tokenized_dataset['test'])}")

# Training arguments
training_args = TrainingArguments(
    output_dir=f"./results_{model_name.replace('/', '_')}",
    eval_strategy="epoch", 
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="none"
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
print("\nStarting training...")
trainer.train()

# Saving the model

In [None]:
# Save model
print("\nSaving fine-tuned model...")
trainer.save_model("./byt5-hindi-morphaware-final")
tokenizer.save_pretrained("./byt5-hindi-morphaware-final")

# Example Generation

In [26]:
# Load the base model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("./byt5-hindi-morphaware-final")
base_model = AutoModelForSeq2SeqLM.from_pretrained("./byt5-hindi-morphaware-final")

# Wrap with morphology head
model = ByT5WithMorphologyHead(base_model)

# If you saved the wrapper weights, load them:
model.load_state_dict(torch.load("./byt5-hindi-morphaware-final/morphology_wrapper.pt"))

<All keys matched successfully>

In [None]:
print("\n" + "="*80)
print("TESTING GENERATION")
print("="*80)

model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

for i in range(min(3, len(dataset["test"]))):
    sample = dataset["test"][i]
    
    # Get input text based on dataset format
    if "context" in sample and "question" in sample:
        input_text = f"संदर्भ: {sample['context']}\nप्रश्न: {sample['question']}"
        
        # Get target answer
        if "answers" in sample:
            answers_data = sample["answers"]
            if isinstance(answers_data, dict):
                target_text = answers_data["text"][0] if isinstance(answers_data["text"], list) else answers_data["text"]
            else:
                target_text = str(answers_data)
        elif "answer" in sample:
            target_text = sample["answer"]
        else:
            target_text = "N/A"
    
    elif "instruction" in sample:
        input_text = sample["instruction"]
        if "input" in sample:
            input_text = f"{sample['instruction']} {sample['input']}".strip()
        target_text = sample.get("output", "N/A")
    
    else:
        # Fallback
        cols = list(sample.keys())
        input_text = str(sample[cols[0]]) if len(cols) > 0 else "N/A"
        target_text = str(sample[cols[1]]) if len(cols) > 1 else "N/A"
    
    # Generate
    inputs = tokenizer(
        "morphology aware: " + input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    ).to(device)
    
    output_ids = model.generate(
        **inputs,
        max_new_tokens=100,
        num_beams=4,
        early_stopping=True
    )
    
    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    print(f"\nExample {i+1}:")
    print(f"Input: {input_text}")
    print(f"Target: {target_text}")
    print(f"Generated: {generated}")
    print("-" * 80)

# Run Comprehensive Benchmark

In [27]:
print(len(dataset["test"]))

45


In [35]:
# ============================================
# Morphological Seq2Seq Model Benchmarking
# ============================================

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
import time
import pandas as pd

# ============================================
# 1️⃣  Models to Benchmark
# ============================================

models_to_benchmark = [
    "google/byt5-small",
    "google/mt5-small",
    "facebook/mbart-large-50-many-to-many-mmt",
    "Helsinki-NLP/opus-mt-hi-en",
    "facebook/nllb-200-distilled-600M",
    "./byt5-hindi-morphaware-final"   # Your fine-tuned custom model
]

# ============================================
# 2️⃣  Metrics Setup
# ============================================

metric_bleu = evaluate.load("sacrebleu")
metric_rouge = evaluate.load("rouge")

# ============================================
# 3️⃣  Load Test Data
# ============================================

# If using Hugging Face Dataset:
# from datasets import load_from_disk
# dataset = load_from_disk("./dataset-folder")
test_data = dataset["test"]

# Or if using pandas:
# test_data = pd.read_csv("test.csv")
# Simulate your dataset here:
# test_data = pd.DataFrame([...])

# Assuming `test_data` is a pandas DataFrame:
print(f"✅ Test size: {len(test_data)}")
# print(f"Columns: {list(test_data.columns)}")

# ============================================
# 4️⃣  Prepare Input-Output Pairs
# ============================================

inputs, references = [], []

for example in test_data:
    topic = str(example.get("topic", "")).strip()
    ctxs = str(example.get("ctxs", "")).strip()
    messages = str(example.get("messages", "")).strip()
    ans = example.get("answers", None)

    # Convert answer dict/list to string if needed
    if isinstance(ans, dict):
        ref = ans.get("text", "")
        if isinstance(ref, list):
            ref = ref[0]
    elif isinstance(ans, list):
        ref = ans[0] if len(ans) > 0 else ""
    else:
        ref = str(ans)

    # Combine relevant fields for input
    input_text = f"विषय: {topic}\nसंदर्भ: {ctxs}\nसंदेश: {messages}"

    inputs.append(input_text)
    references.append(ref.strip())

print(f"✅ Prepared {len(inputs)} samples for benchmarking.")


# ============================================
# 5️⃣  Evaluation Function
# ============================================

def evaluate_model(model_name):
    print(f"\n🔹 Evaluating: {model_name}")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model.eval()

    generated_texts = []
    start_time = time.time()

    for inp in inputs:
        enc = tokenizer(
            "morphology aware: " + inp,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(device)

        with torch.no_grad():
            output = model.generate(
                **enc,
                max_new_tokens=128,
                num_beams=4,
                early_stopping=True
            )

        generated = tokenizer.decode(output[0], skip_special_tokens=True)
        generated_texts.append(generated)

    elapsed = time.time() - start_time

    # Compute metrics
    bleu = metric_bleu.compute(predictions=generated_texts, references=[[r] for r in references])["score"]
    rouge = metric_rouge.compute(predictions=generated_texts, references=references)

    return {
        "model": model_name,
        "BLEU": round(bleu, 2),
        "ROUGE-L": round(rouge["rougeL"], 2),
        "Time (s)": round(elapsed, 2)
    }

# ============================================
# 6️⃣  Run All Benchmarks
# ============================================

results = []
for model_name in models_to_benchmark:
    try:
        res = evaluate_model(model_name)
        results.append(res)
    except Exception as e:
        print(f"❌ Skipped {model_name}: {e}")

# ============================================
# 7️⃣  Summary Table
# ============================================

df = pd.DataFrame(results).sort_values(by="BLEU", ascending=False)
print("\n\n===== 🧩 Benchmark Summary =====")
print(df)

# Optional: Save results
df.to_csv("benchmark_results.csv", index=False)


✅ Test size: 45
✅ Prepared 45 samples for benchmarking.

🔹 Evaluating: google/byt5-small

🔹 Evaluating: google/mt5-small





🔹 Evaluating: facebook/mbart-large-50-many-to-many-mmt

🔹 Evaluating: Helsinki-NLP/opus-mt-hi-en





🔹 Evaluating: facebook/nllb-200-distilled-600M

🔹 Evaluating: ./byt5-hindi-morphaware-final


===== 🧩 Benchmark Summary =====
                                      model  BLEU  ROUGE-L  Time (s)
5             ./byt5-hindi-morphaware-final  0.43     0.00     33.53
0                         google/byt5-small  0.11     0.00     33.34
3                Helsinki-NLP/opus-mt-hi-en  0.08     0.00     20.65
4          facebook/nllb-200-distilled-600M  0.07     0.03     44.60
1                          google/mt5-small  0.05     0.01      4.75
2  facebook/mbart-large-50-many-to-many-mmt  0.03     0.00     44.23
