In [1]:
import os
from conllu import parse_incr

def load_preprocessed_conllu(file_path):
    """
    Load a preprocessed CoNLL-U formatted file and return a list of sentences with annotations.
    Each sentence is represented as a dictionary.
    """
    sentences = []
    all_chunk_labels = set()  

    with open(file_path, 'r', encoding='utf-8') as f:
        for tokenlist in parse_incr(f):
            sentence = {
                'id': tokenlist.metadata.get('sent_id', ''),
                'text': tokenlist.metadata.get('text', ''),
                'tokens': [token['form'] for token in tokenlist],
                'upos': [token['upos'] for token in tokenlist],
                'chunk_id': [],
                'chunk_type': [],
            }

            for token in tokenlist:
                # Get the 'misc' field, which contains the ChunkId and ChunkType
                misc = token.get('misc', '')  # safely access 'misc' field
                chunk_id = '_'
                chunk_type = 'O'  # 'O' typically means 'outside a chunk'

                # Handle 'misc' as a string or dictionary
                if isinstance(misc, str):
                    # Extract ChunkId and ChunkType from the 'misc' field (if it's a string)
                    for item in misc.split('|'):
                        if item.startswith('ChunkId='):
                            chunk_id = item.split('=')[1]
                        elif item.startswith('ChunkType='):
                            chunk_type = item.split('=')[1]
                elif isinstance(misc, dict):
                    # If 'misc' is a dictionary, look for chunk info inside it
                    chunk_id = misc.get('ChunkId', '_')
                    chunk_type = misc.get('ChunkType', 'O')

                # Append chunk data
                sentence['chunk_id'].append(chunk_id)
                sentence['chunk_type'].append(chunk_type)

                # Collect all unique chunk labels (using only chunk_id)
                all_chunk_labels.add(chunk_id)

            sentences.append(sentence)
    return sentences,list(all_chunk_labels)

# Example usage:
data_dir = "./data/hi_hdtb/"
preprocessed_dir = "./data/hi_hdtb_preprocessed_new/"

# Load sentences with chunk info
file_path = os.path.join(preprocessed_dir, "hi_hdtb-ud-train.conllu")
train_sentences,all_chunk_labels = load_preprocessed_conllu(file_path)

file_path = os.path.join(preprocessed_dir, "hi_hdtb-ud-dev.conllu")
eval_sentences,all_chunk_labels2 = load_preprocessed_conllu(file_path)

# print(train_sentences[:2])  # print first two sentences for checking
all_chunk_labels+=all_chunk_labels2

file_path = os.path.join(preprocessed_dir, "hi_hdtb-ud-test.conllu")
test_sentences,all_chunk_labels3=load_preprocessed_conllu(file_path)

# print("Unique chunk labels:", all_chunk_labels)
def get_chunk_type_from_id(chunk_id):   #not used
    return ''.join([c for c in chunk_id if not c.isdigit()])
# if not c.isdigit()

all_chunk_labelschunk_labels = [get_chunk_type_from_id(chunk) for chunk in all_chunk_labels]
all_chunk_labels=list(set(all_chunk_labelschunk_labels))
# print(all_chunk_labels)


# train_sentences=train_sentences[:100]
# eval_sentences=eval_sentences[:10]

# print(len(train_sentences))
# print(len(eval_sentences))

# Encode the UPOS tags
upos_tags_set = set()
# upos_tags_set.add('O')
for sentence in train_sentences + eval_sentences:
    upos_tags_set.update(sentence['upos'])
upos_tag2id = {tag: idx for idx, tag in enumerate(upos_tags_set)}
id2upos = {v: k for k, v in upos_tag2id.items()}

# Define BIO tags
chunk_tags = {"O": 0}  # Outside tag
for label in all_chunk_labels:
    chunk_type = get_chunk_type_from_id(label)
    if f"B-{chunk_type}" not in chunk_tags:
        chunk_tags[f"B-{label}"] = len(chunk_tags)
        chunk_tags[f"I-{label}"] = len(chunk_tags)

id2tag = {v: k for k, v in chunk_tags.items()}

print(id2tag)
print(id2upos)




from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import classification_report
from transformers import DataCollatorForTokenClassification

# Define the model name
model_name = "xlm-roberta-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define Dataset
class ChunkingDataset(Dataset):
    def __init__(self, sentences, chunk_tags, tokenizer, upos_tag2id, max_len=128):
        self.sentences = sentences
        self.chunk_tags = chunk_tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.upos_tag2id = upos_tag2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tokens = sentence["tokens"]
        chunk_ids = sentence["chunk_id"]
        upos_tags = [self.upos_tag2id.get(tag, -1) for tag in sentence["upos"]]

        if not tokens:
            raise ValueError(f"No tokens found for sentence at index {idx}.")

        # Tokenize inputs
        encoded = self.tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        word_ids = encoded.word_ids()
        labels, aligned_upos_tags = [], []

        prev_chunk_id = None
        for i, word_id in enumerate(word_ids):
            if word_id is None:
                labels.append(-100)  # Ignored in loss computation
                aligned_upos_tags.append(-1)
            else:
                current_chunk_id = chunk_ids[word_id]
                upos_tag = upos_tags[word_id]
                if current_chunk_id != prev_chunk_id:
                    label = f"B-{get_chunk_type_from_id(current_chunk_id)}"
                else:
                    label = f"I-{get_chunk_type_from_id(current_chunk_id)}"

                labels.append(self.chunk_tags.get(label, 0))  # Default to "O"
                aligned_upos_tags.append(upos_tag)
                prev_chunk_id = current_chunk_id

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "upos_tags": torch.tensor(aligned_upos_tags, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }



train_dataset = ChunkingDataset(train_sentences, chunk_tags, tokenizer, upos_tag2id)
eval_dataset = ChunkingDataset(eval_sentences, chunk_tags, tokenizer, upos_tag2id)
test_dataset = ChunkingDataset(test_sentences,chunk_tags,tokenizer,upos_tag2id)
num_labels = len(chunk_tags)

import torch
import torch.nn as nn
from transformers import AutoModel, AutoModelForTokenClassification, XLMRobertaForSequenceClassification, AutoConfig



class CustomModel(XLMRobertaForSequenceClassification):
    def __init__(self, base_model_name, num_labels, num_upos_tags, upos_padding_idx=-1):
        config = AutoConfig.from_pretrained(base_model_name)
        super(CustomModel, self).__init__(config)
        hidden_size = self.config.hidden_size
        self.upos_embedding = nn.Embedding(num_upos_tags, hidden_size, padding_idx=upos_padding_idx)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size * 2, num_labels)
        self.num_labels=num_labels
        self.gradient_checkpointing_enable()

    def forward(self, input_ids, attention_mask, upos_tags, labels=None):
        # Replace -1 with the padding index
        upos_tags = torch.where(upos_tags == -1, torch.tensor(self.upos_embedding.padding_idx).to(upos_tags.device), upos_tags)
        
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # Check if there are any invalid values in upos_tags
        if torch.any(upos_tags >= self.upos_embedding.num_embeddings):
            invalid_indices = upos_tags[upos_tags >= self.upos_embedding.num_embeddings]
            print(f"Invalid upos_tags indices: {invalid_indices}")
            raise ValueError("Some upos_tags indices are out of bounds.")

        # Print upos_tags for debugging
        # print(f"upos_tags: {upos_tags}")

        upos_embeds = self.upos_embedding(upos_tags)
        # print(f"Shape of sequence_output: {sequence_output.shape}")
        # print(f"Shape of upos_embeds: {upos_embeds.shape}")

        if sequence_output.size(1) != upos_embeds.size(1):
            raise ValueError(
                f"Mismatch in sequence length: sequence_output ({sequence_output.size(1)}) "
                f"and upos_embeds ({upos_embeds.size(1)})"
            )

        combined_features = torch.cat((sequence_output, upos_embeds), dim=-1)
        logits = self.classifier(self.dropout(combined_features))
        
        # Reshape logits and labels to match [batch_size * seq_len, num_labels] 
        batch_size, seq_len, _ = sequence_output.shape
        # Ensure logits and labels have the same shape
        logits = logits.view(batch_size * seq_len, self.num_labels)
        if labels is not None:
            labels = labels.view(batch_size*seq_len)
        
        # Print shapes for debugging
        # print(f"Logits shape: {logits.shape}") 
        # print(f"Labels shape: {labels.shape}")
        # Print shapes for debugging 
        # print(f"Logits shape: {logits.shape}") 
        # print(f"Labels shape: {labels.shape if labels is not None else 'None'}")
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return (loss, logits) if loss is not None else logits


{0: 'O', 1: 'B-CCP', 2: 'I-CCP', 3: 'B-NEGP', 4: 'I-NEGP', 5: 'B-JJP', 6: 'I-JJP', 7: 'B-BLK', 8: 'I-BLK', 9: 'B-VGNF', 10: 'I-VGNF', 11: 'B-FRAGP', 12: 'I-FRAGP', 13: 'B-NP', 14: 'I-NP', 15: 'B-VGNN', 16: 'I-VGNN', 17: 'B-VGF', 18: 'I-VGF', 19: 'B-RBP', 20: 'I-RBP'}
{0: 'PROPN', 1: 'ADP', 2: 'ADJ', 3: 'PUNCT', 4: 'PART', 5: 'X', 6: 'SCONJ', 7: 'AUX', 8: 'CCONJ', 9: 'DET', 10: 'ADV', 11: 'NOUN', 12: 'VERB', 13: 'NUM', 14: 'INTJ', 15: 'PRON'}


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# from transformers import Trainer, TrainingArguments
# import numpy as np
# from sklearn.metrics import classification_report

# # class CustomTrainer(Trainer):
# #     def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
# #         has_labels = all(inputs.get(k) is not None for k in self.label_names)
# #         inputs = self._prepare_inputs(inputs)

# #         with torch.no_grad():
# #             outputs = model(**inputs)
# #             logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]

# #         if has_labels:
# #             labels = inputs.get("labels")
# #         else:
# #             labels = None

# #         return (logits, labels)
    
# #     # def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
# #     #     model = self.model
# #     #     model.eval()

# #     #     all_metrics = []

# #     #     for step, inputs in enumerate(dataloader):
# #     #         outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
# #     #         logits, labels = outputs

# #     #         logits = logits.cpu().numpy()
# #     #         labels = labels.cpu().numpy()

# #     #         # Ensure logits are reshaped to match the dimensions of labels
# #     #         batch_size, seq_len = labels.shape[:2]
# #     #         num_labels = logits.shape[-1]
# #     #         logits = logits.reshape((batch_size, seq_len, num_labels))

# #     #         predictions = np.argmax(logits, axis=-1)

# #     #         batch_metrics = self.compute_metrics((predictions, labels))
# #     #         all_metrics.append(batch_metrics)

# #     #     # Aggregate metrics across batches
# #     #     avg_metrics = {
# #     #         key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
# #     #     }
# #     #     return { "metrics": avg_metrics, }
# #     #     return avg_metrics
# #     def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
# #         model = self.model
# #         model.eval()

# #         all_metrics = []

# #         for step, inputs in enumerate(dataloader):
# #             outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
# #             logits, labels = outputs

# #             logits = logits.cpu().numpy()
# #             labels = labels.cpu().numpy()

# #             # Ensure logits are reshaped to match the dimensions of labels
# #             batch_size, seq_len = labels.shape[:2]
# #             num_labels = logits.shape[-1]
# #             logits = logits.reshape((batch_size, seq_len, num_labels))

# #             predictions = np.argmax(logits, axis=-1)

# #             batch_metrics = self.compute_metrics((predictions, labels))
# #             all_metrics.append(batch_metrics)

# #         # Aggregate metrics across batches
# #         avg_metrics = {
# #             key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
# #         }

# #         # Debugging: print the avg_metrics structure
# #         print("Avg Metrics:", avg_metrics)

# #         return {"metrics": avg_metrics}
    
# #     def train(self, *args, **kwargs): 
# #         # Before calling _maybe_log_save_evaluate 
# #         print("Starting training...") 
# #         result = super().train(*args, **kwargs) 
# #         # Debugging: print the output of evaluation loop 
# #         output = self.evaluation_loop(dataloader, description="Evaluation") 
# #         print("Output before _maybe_log_save_evaluate:", output) 
# #         # Check if 'metrics' exists in 'output' 
# #         if 'metrics' in output: 
# #             print("Metrics found in output:", output['metrics']) 
# #         else: 
# #             print("Metrics key not found in output. Output keys are:", output.keys()) 
# #             # Call _maybe_log_save_evaluate
# #         self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) 
# #         return result


# from transformers import Trainer, TrainingArguments
# import numpy as np
# from sklearn.metrics import classification_report

# # class CustomTrainer(Trainer):
# #     def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
# #         has_labels = all(inputs.get(k) is not None for k in self.label_names)
# #         inputs = self._prepare_inputs(inputs)

# #         with torch.no_grad():
# #             outputs = model(**inputs)
# #             logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]

# #         if has_labels:
# #             labels = inputs.get("labels")
# #         else:
# #             labels = None

# #         return (logits, labels)
    
# #     def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
# #         model = self.model
# #         model.eval()

# #         all_metrics = []

# #         for step, inputs in enumerate(dataloader):
# #             outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
# #             logits, labels = outputs

# #             logits = logits.cpu().numpy()
# #             labels = labels.cpu().numpy()

# #             # Ensure logits are reshaped to match the dimensions of labels
# #             batch_size, seq_len = labels.shape[:2]
# #             num_labels = logits.shape[-1]
# #             logits = logits.reshape((batch_size, seq_len, num_labels))

# #             predictions = np.argmax(logits, axis=-1)

# #             batch_metrics = self.compute_metrics((predictions, labels))
# #             all_metrics.append(batch_metrics)

# #         # Aggregate metrics across batches
# #         avg_metrics = {
# #             key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
# #         }

# #         # Debugging: print the avg_metrics structure
# #         print("Avg Metrics:", avg_metrics)

# #         # Return the output with the metrics key
# #         return {"metrics": avg_metrics}
    
# #     def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
# #         # Before calling _maybe_log_save_evaluate
# #         print("Starting training...")

# #         # Create dataloader for evaluation
# #         eval_dataloader = self.get_eval_dataloader()

# #         # Debugging: print the output of evaluation loop
# #         output = self.evaluation_loop(eval_dataloader, description="Evaluation")
# #         print("Output before _maybe_log_save_evaluate:", output)

# #         # Check if 'metrics' exists in 'output'
# #         if 'metrics' in output:
# #             print("Metrics found in output:", output['metrics'])
# #         else:
# #             print("Metrics key not found in output. Output keys are:", output.keys())
        
# #         # Call the original train method
# #         result = super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

# #         return result

# # from transformers import Trainer, TrainingArguments
# # import numpy as np
# # from sklearn.metrics import classification_report

# # class CustomTrainer(Trainer):
# #     def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
# #         has_labels = all(inputs.get(k) is not None for k in self.label_names)
# #         inputs = self._prepare_inputs(inputs)

# #         with torch.no_grad():
# #             outputs = model(**inputs)
# #             logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]

# #         if has_labels:
# #             labels = inputs.get("labels")
# #         else:
# #             labels = None

# #         return (logits, labels)
    
# #     def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
# #         model = self.model
# #         model.eval()

# #         all_metrics = []

# #         for step, inputs in enumerate(dataloader):
# #             outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
# #             logits, labels = outputs

# #             logits = logits.cpu().numpy()
# #             labels = labels.cpu().numpy()

# #             # Ensure logits are reshaped to match the dimensions of labels
# #             batch_size, seq_len = labels.shape[:2]
# #             num_labels = logits.shape[-1]
# #             logits = logits.reshape((batch_size, seq_len, num_labels))

# #             predictions = np.argmax(logits, axis=-1)

# #             batch_metrics = self.compute_metrics((predictions, labels))
# #             all_metrics.append(batch_metrics)

# #         # Aggregate metrics across batches
# #         avg_metrics = {
# #             key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
# #         }

# #         # Debugging: print the avg_metrics structure
# #         print("Avg Metrics:", avg_metrics)

# #         # Return the output with the metrics key
# #         return {"metrics": avg_metrics}
    
# #     def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
# #         # Before calling _maybe_log_save_evaluate
# #         print("Starting training...")

# #         # Call the original train method
# #         result = super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

# #         return result

# #     def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval):
# #         # Call the original method and add debugging
# #         metrics = None
# #         if self.control.should_evaluate:
# #             # Debugging: print the output of evaluation
# #             output = self._evaluate(trial, ignore_keys_for_eval)
# #             print("Output of _evaluate:", output)
# #             if 'metrics' in output:
# #                 print("Metrics found in output:", output['metrics'])
# #             else:
# #                 print("Metrics key not found in output. Output keys are:", output.keys())
# #             metrics = output
        
# #         super()._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)

# from transformers import Trainer, TrainingArguments
# import numpy as np
# from sklearn.metrics import classification_report

# class CustomTrainer(Trainer):
#     def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
#         has_labels = all(inputs.get(k) is not None for k in self.label_names)
#         inputs = self._prepare_inputs(inputs)

#         with torch.no_grad():
#             outputs = model(**inputs)
#             logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]

#         if has_labels:
#             labels = inputs.get("labels")
#         else:
#             labels = None

#         return (logits, labels)
    
#     def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
#         model = self.model
#         model.eval()

#         all_metrics = []

#         for step, inputs in enumerate(dataloader):
#             outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
#             logits, labels = outputs

#             logits = logits.cpu().numpy()
#             labels = labels.cpu().numpy()

#             # Ensure logits are reshaped to match the dimensions of labels
#             batch_size, seq_len = labels.shape[:2]
#             num_labels = logits.shape[-1]
#             logits = logits.reshape((batch_size, seq_len, num_labels))

#             predictions = np.argmax(logits, axis=-1)

#             batch_metrics = self.compute_metrics((predictions, labels))
#             all_metrics.append(batch_metrics)

#         # Aggregate metrics across batches
#         avg_metrics = {
#             key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
#         }

#         # Debugging: print the avg_metrics structure
#         print("Avg Metrics:", avg_metrics)

#         # Return the output with the metrics key
#         return {"metrics": avg_metrics}
    
#     def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix=""):
#         eval_dataloader = self.get_eval_dataloader(eval_dataset)
#         output = self.evaluation_loop(eval_dataloader, description="Evaluation", metric_key_prefix=metric_key_prefix)
#         metrics = output.get("metrics", {})
#         self.log(metrics)
#         return metrics
    
#     def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
#         # Before calling _maybe_log_save_evaluate
#         print("Starting training...")

#         # Call the original train method
#         result = super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

#         return result


In [3]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import classification_report

class CustomTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        has_labels = all(inputs.get(k) is not None for k in self.label_names)
        inputs = self._prepare_inputs(inputs)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.get("logits") if isinstance(outputs, dict) else outputs[1]

        if has_labels:
            labels = inputs.get("labels")
        else:
            labels = None

        return (logits, labels)
    
    def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix=""):
        model = self.model
        model.eval()

        all_metrics = []

        for step, inputs in enumerate(dataloader):
            outputs = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
            logits, labels = outputs

            logits = logits.cpu().numpy()
            labels = labels.cpu().numpy()

            # Ensure logits are reshaped to match the dimensions of labels
            batch_size, seq_len = labels.shape[:2]
            num_labels = logits.shape[-1]
            logits = logits.reshape((batch_size, seq_len, num_labels))

            predictions = np.argmax(logits, axis=-1)

            batch_metrics = self.compute_metrics((predictions, labels))
            all_metrics.append(batch_metrics)

        # Aggregate metrics across batches
        avg_metrics = {
            key: np.mean([metric[key] for metric in all_metrics]) for key in all_metrics[0]
        }

        # Ensure the evaluation metrics include f1
        if 'f1' not in avg_metrics:
            avg_metrics['f1'] = 0.0  # Default value if f1 is missing

        # Debugging: print the avg_metrics structure
        # print("Avg Metrics:", avg_metrics)

        # Return the output with the metrics key
        return {"metrics": avg_metrics}
    
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        output = self.evaluation_loop(eval_dataloader, description="Evaluation", metric_key_prefix=metric_key_prefix)
        metrics = output.get("metrics", {})
        self.log(metrics)

        # Debugging: print the metrics structure
        # print("Metrics in evaluate method:", metrics)

        return metrics
    
    def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
        # Before calling _maybe_log_save_evaluate
        # print("Starting training...")

        # Call the original train method
        result = super().train(resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)

        return result
    
    def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval):
        # print("In _maybe_log_save_evaluate")
        
        metrics = None
        if self.control.should_evaluate:
            metrics = self._evaluate(trial, ignore_keys_for_eval)

        # Debugging: print the metrics structure
        # if metrics:
        #     print("Metrics in _maybe_log_save_evaluate:", metrics)

        super()._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)



In [4]:
def compute_metrics(p):
    predictions, labels = p
    # print(f"Predictions shape: {predictions.shape}")
    # print(f"Labels shape: {labels.shape}")

    predictions = predictions.reshape(-1)

    # Flatten labels to match predictions
    labels = labels.reshape(-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        if label != -100:  # Ensuring ignored tokens are not included
            true_labels.append(id2tag[label])
            true_predictions.append(id2tag[pred])

    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    precision = report["weighted avg"]["precision"]
    recall = report["weighted avg"]["recall"]
    f1 = report["weighted avg"]["f1-score"]

    return {
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": f1,
    }


In [5]:
# Initialize Data and Model
model = CustomModel(
    base_model_name=model_name,
    num_labels=num_labels,
    num_upos_tags=len(upos_tag2id),
    upos_padding_idx=-1
)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    no_cuda=False,
    metric_for_best_model="eval_f1",
    gradient_accumulation_steps=4
)

# Create Data Collator
class CustomDataCollator:
    def __call__(self, features):
        input_ids = torch.stack([f["input_ids"] for f in features])
        attention_mask = torch.stack([f["attention_mask"] for f in features])
        upos_tags = torch.stack([f["upos_tags"] for f in features])
        labels = torch.stack([f["labels"] for f in features])
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "upos_tags": upos_tags,
            "labels": labels
        }

data_collator = CustomDataCollator()

# Initialize Custom Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

model.gradient_checkpointing_enable()
# Training
trainer.train()


  trainer = CustomTrainer(
 20%|██        | 208/1040 [46:26<2:53:55, 12.54s/it]

{'eval_precision': np.float64(0.8514097154465643), 'eval_recall': np.float64(0.8437951472554641), 'eval_f1': np.float64(0.8429851381053163), 'f1': 0.0, 'epoch': 1.0}


 20%|██        | 208/1040 [46:46<2:53:55, 12.54s/it]

{'eval_precision': np.float64(0.8514097154465643), 'eval_recall': np.float64(0.8437951472554641), 'eval_f1': np.float64(0.8429851381053163), 'f1': 0.0, 'epoch': 1.0}


 40%|████      | 416/1040 [1:33:19<2:13:26, 12.83s/it]

{'eval_precision': np.float64(0.8763609499339371), 'eval_recall': np.float64(0.8755407762551842), 'eval_f1': np.float64(0.8728581597678207), 'f1': 0.0, 'epoch': 2.0}


 40%|████      | 416/1040 [1:33:38<2:13:26, 12.83s/it]

{'eval_precision': np.float64(0.8763609499339371), 'eval_recall': np.float64(0.8755407762551842), 'eval_f1': np.float64(0.8728581597678207), 'f1': 0.0, 'epoch': 2.0}


 48%|████▊     | 500/1040 [1:52:14<1:56:58, 13.00s/it]

{'loss': 0.4623, 'grad_norm': 3.6304471492767334, 'learning_rate': 2.5961538461538464e-05, 'epoch': 2.4}


 60%|██████    | 624/1040 [2:19:48<1:29:06, 12.85s/it]

{'eval_precision': np.float64(0.9135819645990464), 'eval_recall': np.float64(0.9123480860494856), 'eval_f1': np.float64(0.9106155267487667), 'f1': 0.0, 'epoch': 3.0}


 60%|██████    | 624/1040 [2:20:14<1:29:06, 12.85s/it]

{'eval_precision': np.float64(0.9135819645990464), 'eval_recall': np.float64(0.9123480860494856), 'eval_f1': np.float64(0.9106155267487667), 'f1': 0.0, 'epoch': 3.0}


 80%|████████  | 832/1040 [3:08:35<45:34, 13.15s/it]  

{'eval_precision': np.float64(0.9302788926518324), 'eval_recall': np.float64(0.9293709378735152), 'eval_f1': np.float64(0.9282097573923902), 'f1': 0.0, 'epoch': 4.0}


 80%|████████  | 832/1040 [3:08:54<45:34, 13.15s/it]

{'eval_precision': np.float64(0.9302788926518324), 'eval_recall': np.float64(0.9293709378735152), 'eval_f1': np.float64(0.9282097573923902), 'f1': 0.0, 'epoch': 4.0}


 96%|█████████▌| 1000/1040 [9:14:19<08:39, 12.99s/it]     

{'loss': 0.1969, 'grad_norm': 4.482397556304932, 'learning_rate': 1.9230769230769234e-06, 'epoch': 4.81}


100%|██████████| 1040/1040 [9:23:38<00:00, 12.74s/it]

{'eval_precision': np.float64(0.93665803502294), 'eval_recall': np.float64(0.936585924637009), 'eval_f1': np.float64(0.9350812109270854), 'f1': 0.0, 'epoch': 5.0}


100%|██████████| 1040/1040 [9:24:03<00:00, 12.74s/it]

{'eval_precision': np.float64(0.93665803502294), 'eval_recall': np.float64(0.936585924637009), 'eval_f1': np.float64(0.9350812109270854), 'f1': 0.0, 'epoch': 5.0}


100%|██████████| 1040/1040 [9:24:12<00:00, 32.55s/it]

{'train_runtime': 33852.9192, 'train_samples_per_second': 1.965, 'train_steps_per_second': 0.031, 'train_loss': 0.32278266824208773, 'epoch': 5.0}





TrainOutput(global_step=1040, training_loss=0.32278266824208773, metrics={'train_runtime': 33852.9192, 'train_samples_per_second': 1.965, 'train_steps_per_second': 0.031, 'total_flos': 4347588916846080.0, 'train_loss': 0.32278266824208773, 'epoch': 5.0})

In [6]:
# Define the directory to save the model
model_save_path = "chunker-fine-tuned-xlm-roberta-hindi-3"

# Save the model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to chunker-fine-tuned-xlm-roberta-hindi-3


In [7]:
# Evaluate on the validation set
val_metrics = trainer.evaluate(eval_dataset=eval_dataset)
print("Validation Metrics:", val_metrics)

# Evaluate on the test set
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test Metrics:", test_metrics)

Validation Metrics: {'eval_precision': np.float64(0.93665803502294), 'eval_recall': np.float64(0.936585924637009), 'eval_f1': np.float64(0.9350812109270854), 'f1': 0.0, 'epoch': 5.0}
Test Metrics: {'eval_precision': np.float64(0.9412401670025333), 'eval_recall': np.float64(0.9410789094038007), 'eval_f1': np.float64(0.9396890267530904), 'f1': 0.0, 'epoch': 5.0}


In [22]:
import torch

# Example Inference Function for POS and Chunking
def predict_chunking_with_pos(sentence, tokenizer, model, device):
    model.eval()
    
    # Tokenize the sentence with POS tags as input
    tokens = sentence['tokens']
    pos_tags = sentence['upos']

    # Tokenize using the tokenizer
    inputs = tokenizer(tokens, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt").to(device)

    # Align the POS tags with tokenized tokens
    aligned_pos_tags = []
    original_to_tok_map = []  # Maps each original token to its first subtoken
    for i, (word, pos) in enumerate(zip(tokens, pos_tags)):
        subword_tokens = tokenizer.tokenize(word)
        original_to_tok_map.append(len(aligned_pos_tags))
        aligned_pos_tags.extend([upos_tag2id.get(pos, -1)] * len(subword_tokens))

    # Ensure POS tags are aligned with tokenized words, and pad if necessary
    while len(aligned_pos_tags) < inputs['input_ids'].size(1):
        aligned_pos_tags.append(-1)  # Padding value for unknown or extra tokens
    aligned_pos_tags = aligned_pos_tags[:inputs['input_ids'].size(1)]  # Ensure no extra padding

    # Convert aligned POS tags to tensor and add to inputs
    inputs['upos_tags'] = torch.tensor([aligned_pos_tags]).to(device)

    # Perform inference (no gradient computation)
    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], upos_tags=inputs['upos_tags'])
        logits = outputs[1] if isinstance(outputs, tuple) else outputs  # Ensure you get logits tensor
        predictions = torch.argmax(logits, dim=-1)

    # Decode predictions (convert label IDs to tags)
    predicted_ids = predictions.squeeze().cpu().numpy()
    predicted_labels = [id2tag[label_id] for label_id in predicted_ids]

    # Map predicted labels back to original tokens
    original_predictions = [predicted_labels[original_to_tok_map[i]] for i in range(len(tokens))]

    return original_predictions

# Function to convert BIO tags to chunked phrases with types
def bio_to_chunks(tokens, bio_tags):
    chunks = []
    current_chunk = []
    current_label = None

    for token, tag in zip(tokens, bio_tags):
        if tag.startswith("B-"):
            if current_chunk:
                chunks.append((' '.join(current_chunk), current_label))
                current_chunk = []
            current_label = tag[2:]
            current_chunk.append(token)
        elif tag.startswith("I-") and current_label == tag[2:]:
            current_chunk.append(token)
        else:
            if current_chunk:
                chunks.append((' '.join(current_chunk), current_label))
            current_chunk = []
            if tag != "O":
                current_label = tag[2:]
                current_chunk.append(token)
            else:
                current_label = None
                current_chunk.append(token)  # Capture "O" tokens as separate chunks

    if current_chunk:
        chunks.append((' '.join(current_chunk), current_label))

    return chunks

# Test sentence with POS tagging
test_sentence = {
    "tokens": ["भारत", "एक", "सुंदर", "देश", "है"],
    "upos": ["PROPN", "DET", "ADJ", "NOUN", "AUX"]  # POS tags for Hindi sentence
}

# Make sure your model and tokenizer are loaded before calling this function
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
predicted_chunking = predict_chunking_with_pos(test_sentence, tokenizer, model, device)
# print(predicted_chunking)
chunks = bio_to_chunks(test_sentence['tokens'], predicted_chunking)
print(chunks)


[('भारत', 'NP'), ('एक', 'NP'), ('सुंदर देश है', 'NP')]
