In [1]:
import os

def preprocess_conllu(input_file, output_file):
    """
    Reads a CoNLL-U file, removes any extra columns beyond the standard 10,
    and writes the cleaned data to a new file.
    """
    with open(input_file, 'r', encoding='utf-8') as fin, \
         open(output_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            if line.startswith('#'):
                # Write comment lines as-is
                fout.write(line)
            elif line.strip() == '':
                # Write empty lines as-is (sentence separators)
                fout.write(line)
            else:
                # Split the line into columns
                columns = line.strip().split('\t')
                if len(columns) > 10:
                    # Retain only the first 10 columns
                    columns = columns[:10]
                elif len(columns) < 10:
                    # If fewer than 10 columns, pad with underscores
                    columns += ['_'] * (10 - len(columns))
                # Reconstruct the line and write to output
                fout.write('\t'.join(columns) + '\n')


In [2]:
# Define paths
data_dir = "./data/hi_hdtb/"
preprocessed_dir = "./data/hi_hdtb_preprocessed/"

# Create the preprocessed directory if it doesn't exist
os.makedirs(preprocessed_dir, exist_ok=True)

# Define input and output file paths
files = ["hi_hdtb-ud-train.conllu", "hi_hdtb-ud-dev.conllu", "hi_hdtb-ud-test.conllu"]

for file in files:
    input_path = os.path.join(data_dir, file)
    output_path = os.path.join(preprocessed_dir, file)
    preprocess_conllu(input_path, output_path)
    print(f"Preprocessed {file} and saved to {output_path}")


Preprocessed hi_hdtb-ud-train.conllu and saved to ./data/hi_hdtb_preprocessed/hi_hdtb-ud-train.conllu
Preprocessed hi_hdtb-ud-dev.conllu and saved to ./data/hi_hdtb_preprocessed/hi_hdtb-ud-dev.conllu
Preprocessed hi_hdtb-ud-test.conllu and saved to ./data/hi_hdtb_preprocessed/hi_hdtb-ud-test.conllu


In [3]:
import pandas as pd
from conllu import parse_incr

def load_preprocessed_conllu(file_path):
    """
    Load a preprocessed CoNLL-U formatted file and return a list of sentences with annotations.
    Each sentence is represented as a dictionary.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for tokenlist in parse_incr(f):
            sentence = {
                'id': tokenlist.metadata.get('sent_id', ''),
                'text': tokenlist.metadata.get('text', ''),
                'tokens': [token['form'] for token in tokenlist],
                'upos': [token['upos'] for token in tokenlist],
                'deprel': [token['deprel'] for token in tokenlist]
            }
            sentences.append(sentence)
    return sentences


In [4]:
# Define paths
preprocessed_dir = "./data/hi_hdtb_preprocessed/"
train_file = os.path.join(preprocessed_dir, "hi_hdtb-ud-train.conllu")
dev_file = os.path.join(preprocessed_dir, "hi_hdtb-ud-dev.conllu")
test_file = os.path.join(preprocessed_dir, "hi_hdtb-ud-test.conllu")

# Load the datasets
train_sentences = load_preprocessed_conllu(train_file)
dev_sentences = load_preprocessed_conllu(dev_file)
test_sentences = load_preprocessed_conllu(test_file)

# Convert to Pandas DataFrames
train_df = pd.DataFrame(train_sentences)
dev_df = pd.DataFrame(dev_sentences)
test_df = pd.DataFrame(test_sentences)

print("Training DataFrame Head:")
print(train_df.head())


Training DataFrame Head:
         id                                               text  \
0  train-s1      यह एशिया की सबसे बड़ी मस्जिदों में से एक है ।   
1  train-s2                    इसे नवाब शाहजेहन ने बनवाया था ।   
2  train-s3                   इसका प्रवेश द्वार दो मंजिला है ।   
3  train-s4  जिसमें चार मेहराबें हैं और मुख्य प्रार्थना हॉल...   
4  train-s5                       पूरी इमारत बेहद खूबसूरत है ।   

                                              tokens  \
0  [यह, एशिया, की, सबसे, बड़ी, मस्जिदों, में, से,...   
1            [इसे, नवाब, शाहजेहन, ने, बनवाया, था, ।]   
2           [इसका, प्रवेश, द्वार, दो, मंजिला, है, ।]   
3  [जिसमें, चार, मेहराबें, हैं, और, मुख्य, प्रार्...   
4                [पूरी, इमारत, बेहद, खूबसूरत, है, ।]   

                                                upos  \
0  [DET, PROPN, ADP, ADV, ADJ, NOUN, ADP, ADP, NU...   
1         [PRON, NOUN, PROPN, ADP, VERB, AUX, PUNCT]   
2           [PRON, NOUN, NOUN, NUM, ADJ, AUX, PUNCT]   
3  [PRON, NUM, NO

In [5]:
# Define the chunk types
CHUNK_TYPES = ['NP', 'VP', 'ADJP', 'ADVP', 'PP', 'Other']

# Function to create BI labels
def create_bi_labels(tokens, upos, deprel):
    labels = []
    current_chunk = None
    for token, pos, dep in zip(tokens, upos, deprel):
        if pos in ['PROPN', 'NOUN', 'PRON']:
            if current_chunk != 'NP':
                labels.append('B-NP')
                current_chunk = 'NP'
            else:
                labels.append('I-NP')
        elif pos == 'VERB':
            if current_chunk != 'VP':
                labels.append('B-VP')
                current_chunk = 'VP'
            else:
                labels.append('I-VP')
        elif pos == 'ADJ':
            if current_chunk != 'ADJP':
                labels.append('B-ADJP')
                current_chunk = 'ADJP'
            else:
                labels.append('I-ADJP')
        elif pos == 'ADV':
            if current_chunk != 'ADVP':
                labels.append('B-ADVP')
                current_chunk = 'ADVP'
            else:
                labels.append('I-ADVP')
        elif pos == 'ADP':
            if current_chunk != 'PP':
                labels.append('B-PP')
                current_chunk = 'PP'
            else:
                labels.append('I-PP')
        else:
            # Assign to 'Other' chunk type
            if current_chunk != 'Other':
                labels.append('B-Other')
                current_chunk = 'Other'
            else:
                labels.append('I-Other')
    return labels


In [6]:
# Apply the function to create BI labels
train_df['labels'] = train_df.apply(
    lambda row: create_bi_labels(row['tokens'], row['upos'], row['deprel']),
    axis=1
)

dev_df['labels'] = dev_df.apply(
    lambda row: create_bi_labels(row['tokens'], row['upos'], row['deprel']),
    axis=1
)

test_df['labels'] = test_df.apply(
    lambda row: create_bi_labels(row['tokens'], row['upos'], row['deprel']),
    axis=1
)

# Display the first few rows with labels
print("Training DataFrame with Labels:")
print(train_df[['tokens', 'upos', 'labels']].head())


Training DataFrame with Labels:
                                              tokens  \
0  [यह, एशिया, की, सबसे, बड़ी, मस्जिदों, में, से,...   
1            [इसे, नवाब, शाहजेहन, ने, बनवाया, था, ।]   
2           [इसका, प्रवेश, द्वार, दो, मंजिला, है, ।]   
3  [जिसमें, चार, मेहराबें, हैं, और, मुख्य, प्रार्...   
4                [पूरी, इमारत, बेहद, खूबसूरत, है, ।]   

                                                upos  \
0  [DET, PROPN, ADP, ADV, ADJ, NOUN, ADP, ADP, NU...   
1         [PRON, NOUN, PROPN, ADP, VERB, AUX, PUNCT]   
2           [PRON, NOUN, NOUN, NUM, ADJ, AUX, PUNCT]   
3  [PRON, NUM, NOUN, AUX, CCONJ, ADJ, NOUN, NOUN,...   
4                  [ADJ, NOUN, ADV, ADJ, AUX, PUNCT]   

                                              labels  
0  [B-Other, B-NP, B-PP, B-ADVP, B-ADJP, B-NP, B-...  
1   [B-NP, I-NP, I-NP, B-PP, B-VP, B-Other, I-Other]  
2  [B-NP, I-NP, I-NP, B-Other, B-ADJP, B-Other, I...  
3  [B-NP, B-Other, B-NP, B-Other, I-Other, B-ADJP...  
4   [B-ADJP, B-NP, 

In [7]:
from datasets import Dataset

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(dev_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

print("Training Dataset Sample:")
print(train_dataset[0])


Training Dataset Sample:
{'id': 'train-s1', 'text': 'यह एशिया की सबसे बड़ी मस्जिदों में से एक है ।', 'tokens': ['यह', 'एशिया', 'की', 'सबसे', 'बड़ी', 'मस्जिदों', 'में', 'से', 'एक', 'है', '।'], 'upos': ['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'ADP', 'ADP', 'NUM', 'AUX', 'PUNCT'], 'deprel': ['det', 'nmod', 'case', 'advmod', 'amod', 'nmod', 'case', 'case', 'root', 'cop', 'punct'], 'labels': ['B-Other', 'B-NP', 'B-PP', 'B-ADVP', 'B-ADJP', 'B-NP', 'B-PP', 'I-PP', 'B-Other', 'I-Other', 'I-Other']}


In [8]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())


2.4.1+cu118
11.8
True


In [9]:
# Combine train and dev DataFrames
combined_df = pd.concat([train_df, dev_df], ignore_index=True)

# Get the unique labels from the combined dataset
label_list = sorted(list(set(combined_df['labels'].explode())))
print("Labels:", label_list)

# Create label mappings
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label to ID Mapping:", label2id)
print("ID to Label Mapping:", id2label)


Labels: ['B-ADJP', 'B-ADVP', 'B-NP', 'B-Other', 'B-PP', 'B-VP', 'I-ADJP', 'I-ADVP', 'I-NP', 'I-Other', 'I-PP', 'I-VP']
Label to ID Mapping: {'B-ADJP': 0, 'B-ADVP': 1, 'B-NP': 2, 'B-Other': 3, 'B-PP': 4, 'B-VP': 5, 'I-ADJP': 6, 'I-ADVP': 7, 'I-NP': 8, 'I-Other': 9, 'I-PP': 10, 'I-VP': 11}
ID to Label Mapping: {0: 'B-ADJP', 1: 'B-ADVP', 2: 'B-NP', 3: 'B-Other', 4: 'B-PP', 5: 'B-VP', 6: 'I-ADJP', 7: 'I-ADVP', 8: 'I-NP', 9: 'I-Other', 10: 'I-PP', 11: 'I-VP'}


In [10]:
# Function to convert labels to IDs
def encode_labels(labels):
    return [label2id[label] for label in labels]

# Apply the encoding to the datasets
train_dataset = train_dataset.map(lambda x: {"labels": encode_labels(x["labels"])}, batched=False)
val_dataset = val_dataset.map(lambda x: {"labels": encode_labels(x["labels"])}, batched=False)
test_dataset = test_dataset.map(lambda x: {"labels": encode_labels(x["labels"])}, batched=False)

# Verify the changes
print("Encoded Training Dataset Sample:")
print(train_dataset[0])


Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Encoded Training Dataset Sample:
{'id': 'train-s1', 'text': 'यह एशिया की सबसे बड़ी मस्जिदों में से एक है ।', 'tokens': ['यह', 'एशिया', 'की', 'सबसे', 'बड़ी', 'मस्जिदों', 'में', 'से', 'एक', 'है', '।'], 'upos': ['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'ADP', 'ADP', 'NUM', 'AUX', 'PUNCT'], 'deprel': ['det', 'nmod', 'case', 'advmod', 'amod', 'nmod', 'case', 'case', 'root', 'cop', 'punct'], 'labels': [3, 2, 4, 1, 0, 2, 4, 10, 3, 9, 9]}


In [11]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Define the model name
model_name = "xlm-roberta-base"  # Use "xlm-roberta-base" if resources are limited
# "xlm-roberta-large"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pre-trained model with a token classification head
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Enable gradient checkpointing
model.gradient_checkpointing_enable()

In [14]:
def tokenize_and_align_labels(examples):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=64  # Adjust based on your dataset
    )
    
    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore padding and special tokens
            elif word_idx != previous_word_idx:
                # Label for the first token of the word
                label_ids.append(label[word_idx])
            else:
                # For subword tokens, assign the same label if it's an 'I-' tag, else -100
                label_ids.append(label[word_idx] if id2label[label[word_idx]].startswith("I-") else -100)
                # current_label = label[word_idx].item()
                # label_str = id2label.get(current_label, "Other")
                # if label_str.startswith("I-"):
                #     label_ids.append(current_label)
                # else:
                #     label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization to the datasets
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Verify tokenization
print("Tokenized Training Dataset Sample:")
print(train_dataset[0])


Map:   0%|          | 0/13306 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1684 [00:00<?, ? examples/s]

Tokenized Training Dataset Sample:
{'id': 'train-s1', 'text': 'यह एशिया की सबसे बड़ी मस्जिदों में से एक है ।', 'tokens': ['यह', 'एशिया', 'की', 'सबसे', 'बड़ी', 'मस्जिदों', 'में', 'से', 'एक', 'है', '।'], 'upos': ['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'ADP', 'ADP', 'NUM', 'AUX', 'PUNCT'], 'deprel': ['det', 'nmod', 'case', 'advmod', 'amod', 'nmod', 'case', 'case', 'root', 'cop', 'punct'], 'labels': [-100, 3, 2, 4, 1, 0, 2, -100, 4, 10, 3, 9, 9, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [0, 4239, 151677, 471, 13353, 33753, 230432, 1302, 421, 646, 967, 460, 207, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [15]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [16]:
from transformers import TrainingArguments
import torch

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,  # Adjust based on GPU memory
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    fp16=True if torch.cuda.is_available() else False,  # Enable mixed precision if GPU supports
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)




In [17]:
import numpy as np
from sklearn.metrics import classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    
    true_labels = []
    true_predictions = []
    
    for pred, label in zip(predictions, labels):
        temp_true = []
        temp_pred = []
        for p, l in zip(pred, label):
            if l != -100:
                temp_true.append(id2label[l])
                temp_pred.append(id2label[p])
        true_labels.extend(temp_true)
        true_predictions.extend(temp_pred)
    
    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    
    precision = report["weighted avg"]["precision"]
    recall = report["weighted avg"]["recall"]
    f1 = report["weighted avg"]["f1-score"]
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Evaluation dataset
    tokenizer=tokenizer,                 # Tokenizer
    compute_metrics=compute_metrics       # Evaluation metrics
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [19]:
trainer.train()


  0%|          | 0/8315 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.4114, 'grad_norm': 6.4004082679748535, 'learning_rate': 2.9967528562838246e-05, 'epoch': 0.01}
{'loss': 1.6199, 'grad_norm': 5.1077880859375, 'learning_rate': 2.993505712567649e-05, 'epoch': 0.01}
{'loss': 1.1507, 'grad_norm': 4.9329304695129395, 'learning_rate': 2.9898977751052313e-05, 'epoch': 0.02}
{'loss': 0.8511, 'grad_norm': 5.1142191886901855, 'learning_rate': 2.9862898376428143e-05, 'epoch': 0.02}
{'loss': 0.6858, 'grad_norm': 6.308704376220703, 'learning_rate': 2.982681900180397e-05, 'epoch': 0.03}
{'loss': 0.495, 'grad_norm': 5.8255391120910645, 'learning_rate': 2.9790739627179795e-05, 'epoch': 0.04}
{'loss': 0.4265, 'grad_norm': 4.431100368499756, 'learning_rate': 2.9754660252555625e-05, 'epoch': 0.04}
{'loss': 0.4142, 'grad_norm': 9.943885803222656, 'learning_rate': 2.971858087793145e-05, 'epoch': 0.05}
{'loss': 0.3522, 'grad_norm': 6.607977390289307, 'learning_rate': 2.9682501503307277e-05, 'epoch': 0.05}
{'loss': 0.3842, 'grad_norm': 5.075407981872559, 'learnin

  0%|          | 0/830 [00:00<?, ?it/s]

{'eval_loss': 0.07432886213064194, 'eval_precision': 0.9811821265415793, 'eval_recall': 0.9813710486041479, 'eval_f1': 0.9812354096996032, 'eval_runtime': 50.9557, 'eval_samples_per_second': 32.558, 'eval_steps_per_second': 16.289, 'epoch': 1.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.0853, 'grad_norm': 6.303220748901367, 'learning_rate': 2.3989176187612748e-05, 'epoch': 1.0}
{'loss': 0.0784, 'grad_norm': 3.9057223796844482, 'learning_rate': 2.3953096812988577e-05, 'epoch': 1.01}
{'loss': 0.0966, 'grad_norm': 5.683530807495117, 'learning_rate': 2.3917017438364404e-05, 'epoch': 1.02}
{'loss': 0.0887, 'grad_norm': 2.5092740058898926, 'learning_rate': 2.3880938063740226e-05, 'epoch': 1.02}
{'loss': 0.064, 'grad_norm': 1.3671196699142456, 'learning_rate': 2.3844858689116056e-05, 'epoch': 1.03}
{'loss': 0.0692, 'grad_norm': 1.7423064708709717, 'learning_rate': 2.3808779314491882e-05, 'epoch': 1.03}
{'loss': 0.0354, 'grad_norm': 3.2913222312927246, 'learning_rate': 2.377269993986771e-05, 'epoch': 1.04}
{'loss': 0.0604, 'grad_norm': 2.52701473236084, 'learning_rate': 2.3736620565243538e-05, 'epoch': 1.05}
{'loss': 0.0598, 'grad_norm': 5.424926280975342, 'learning_rate': 2.3700541190619364e-05, 'epoch': 1.05}
{'loss': 0.0499, 'grad_norm': 4.474311828613281, 'lear

  0%|          | 0/830 [00:00<?, ?it/s]

{'eval_loss': 0.06913313269615173, 'eval_precision': 0.9830594834001467, 'eval_recall': 0.9831653385402923, 'eval_f1': 0.9830975150065427, 'eval_runtime': 48.3545, 'eval_samples_per_second': 34.309, 'eval_steps_per_second': 17.165, 'epoch': 2.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.0405, 'grad_norm': 6.016112804412842, 'learning_rate': 1.8e-05, 'epoch': 2.0}
{'loss': 0.0438, 'grad_norm': 3.9952592849731445, 'learning_rate': 1.7963920625375827e-05, 'epoch': 2.01}
{'loss': 0.022, 'grad_norm': 3.876678943634033, 'learning_rate': 1.7927841250751653e-05, 'epoch': 2.01}
{'loss': 0.0339, 'grad_norm': 0.5233259201049805, 'learning_rate': 1.7891761876127482e-05, 'epoch': 2.02}
{'loss': 0.037, 'grad_norm': 2.1623804569244385, 'learning_rate': 1.785568250150331e-05, 'epoch': 2.03}
{'loss': 0.0358, 'grad_norm': 0.22032952308654785, 'learning_rate': 1.7819603126879135e-05, 'epoch': 2.03}
{'loss': 0.0654, 'grad_norm': 2.4364583492279053, 'learning_rate': 1.778352375225496e-05, 'epoch': 2.04}
{'loss': 0.0278, 'grad_norm': 2.1413168907165527, 'learning_rate': 1.7747444377630787e-05, 'epoch': 2.04}
{'loss': 0.056, 'grad_norm': 1.6474171876907349, 'learning_rate': 1.7711365003006613e-05, 'epoch': 2.05}
{'loss': 0.0266, 'grad_norm': 2.2679781913757324, 'learning_rate': 1

  0%|          | 0/830 [00:00<?, ?it/s]

{'eval_loss': 0.06712137162685394, 'eval_precision': 0.9848984837124374, 'eval_recall': 0.9850651749432687, 'eval_f1': 0.9849715528729139, 'eval_runtime': 48.2872, 'eval_samples_per_second': 34.357, 'eval_steps_per_second': 17.189, 'epoch': 3.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.047, 'grad_norm': 2.4875681400299072, 'learning_rate': 1.201443174984967e-05, 'epoch': 3.0}
{'loss': 0.0247, 'grad_norm': 2.4071314334869385, 'learning_rate': 1.1978352375225496e-05, 'epoch': 3.01}
{'loss': 0.0287, 'grad_norm': 2.089400053024292, 'learning_rate': 1.1942273000601323e-05, 'epoch': 3.01}
{'loss': 0.0142, 'grad_norm': 0.2030104696750641, 'learning_rate': 1.190619362597715e-05, 'epoch': 3.02}
{'loss': 0.0247, 'grad_norm': 6.7110161781311035, 'learning_rate': 1.1870114251352977e-05, 'epoch': 3.02}
{'loss': 0.042, 'grad_norm': 1.7148933410644531, 'learning_rate': 1.1834034876728805e-05, 'epoch': 3.03}
{'loss': 0.0125, 'grad_norm': 3.0878918170928955, 'learning_rate': 1.1797955502104629e-05, 'epoch': 3.04}
{'loss': 0.0328, 'grad_norm': 0.8650580048561096, 'learning_rate': 1.1761876127480457e-05, 'epoch': 3.04}
{'loss': 0.0136, 'grad_norm': 2.062736988067627, 'learning_rate': 1.1725796752856285e-05, 'epoch': 3.05}
{'loss': 0.0218, 'grad_norm': 4.7722039222717285, 'le

  0%|          | 0/830 [00:00<?, ?it/s]

{'eval_loss': 0.0720905140042305, 'eval_precision': 0.9862797323520538, 'eval_recall': 0.9863317325452531, 'eval_f1': 0.9862789559717877, 'eval_runtime': 32.5717, 'eval_samples_per_second': 50.934, 'eval_steps_per_second': 25.482, 'epoch': 4.0}


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 0.0133, 'grad_norm': 1.3471370935440063, 'learning_rate': 5.989176187612748e-06, 'epoch': 4.0}
{'loss': 0.0316, 'grad_norm': 0.42802247405052185, 'learning_rate': 5.953096812988575e-06, 'epoch': 4.01}
{'loss': 0.018, 'grad_norm': 0.3881446421146393, 'learning_rate': 5.917017438364402e-06, 'epoch': 4.02}
{'loss': 0.0379, 'grad_norm': 0.4039086699485779, 'learning_rate': 5.8809380637402284e-06, 'epoch': 4.02}
{'loss': 0.0172, 'grad_norm': 2.4852919578552246, 'learning_rate': 5.8448586891160555e-06, 'epoch': 4.03}
{'loss': 0.0105, 'grad_norm': 0.6231584548950195, 'learning_rate': 5.8087793144918825e-06, 'epoch': 4.03}
{'loss': 0.0248, 'grad_norm': 0.03844039514660835, 'learning_rate': 5.7726999398677096e-06, 'epoch': 4.04}
{'loss': 0.0125, 'grad_norm': 0.06024516746401787, 'learning_rate': 5.736620565243536e-06, 'epoch': 4.05}
{'loss': 0.0158, 'grad_norm': 1.237242341041565, 'learning_rate': 5.700541190619363e-06, 'epoch': 4.05}
{'loss': 0.0055, 'grad_norm': 2.0163941383361816, '

  0%|          | 0/830 [00:00<?, ?it/s]

{'eval_loss': 0.07756432145833969, 'eval_precision': 0.9865228997016923, 'eval_recall': 0.9865692120956251, 'eval_f1': 0.9865316052618557, 'eval_runtime': 43.4697, 'eval_samples_per_second': 38.164, 'eval_steps_per_second': 19.094, 'epoch': 5.0}
{'train_runtime': 47110.3619, 'train_samples_per_second': 1.412, 'train_steps_per_second': 0.177, 'train_loss': 0.06289328367830721, 'epoch': 5.0}


TrainOutput(global_step=8315, training_loss=0.06289328367830721, metrics={'train_runtime': 47110.3619, 'train_samples_per_second': 1.412, 'train_steps_per_second': 0.177, 'total_flos': 2172878963159040.0, 'train_loss': 0.06289328367830721, 'epoch': 4.999248459341651})

In [20]:
# Evaluate on the validation set
val_metrics = trainer.evaluate(eval_dataset=val_dataset)
print("Validation Metrics:", val_metrics)


  0%|          | 0/830 [00:00<?, ?it/s]

Validation Metrics: {'eval_loss': 0.07756432145833969, 'eval_precision': 0.9865228997016923, 'eval_recall': 0.9865692120956251, 'eval_f1': 0.9865316052618557, 'eval_runtime': 27.9061, 'eval_samples_per_second': 59.449, 'eval_steps_per_second': 29.743, 'epoch': 4.999248459341651}


In [21]:
# Evaluate on the test set
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Test Metrics:", test_metrics)


  0%|          | 0/842 [00:00<?, ?it/s]

Test Metrics: {'eval_loss': 0.07045059651136398, 'eval_precision': 0.9851566004581175, 'eval_recall': 0.9851617995264405, 'eval_f1': 0.9851500651082551, 'eval_runtime': 28.9332, 'eval_samples_per_second': 58.203, 'eval_steps_per_second': 29.102, 'epoch': 4.999248459341651}


In [22]:
# Define the directory to save the model
model_save_path = "fine-tuned-xlm-roberta-hindi-chunker"

# Save the model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")


Model and tokenizer saved to fine-tuned-xlm-roberta-hindi-chunker


In [23]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the saved tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_save_path)
model = AutoModelForTokenClassification.from_pretrained(model_save_path)

print("Model and tokenizer loaded successfully.")


Model and tokenizer loaded successfully.


In [24]:
import torch

def chunk_sentence(sentence):
    # Tokenize the sentence
    tokens = sentence.split()  # Adjust tokenization as needed for Hindi
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, padding=True, truncation=True)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    
    predicted_labels = [id2label[p.item()] for p in predictions[0]]
    
    # Align predictions with original tokens
    chunks = []
    current_chunk = []
    current_label = None
    
    for token, label in zip(tokens, predicted_labels):
        if label.startswith("B-"):
            if current_chunk:
                chunks.append((current_label, ' '.join(current_chunk)))
                current_chunk = []
            current_label = label[2:]
            current_chunk.append(token)
        elif label.startswith("I-") and current_label == label[2:]:
            current_chunk.append(token)
        else:
            if current_chunk:
                chunks.append((current_label, ' '.join(current_chunk)))
                current_chunk = []
            current_label = None
    
    if current_chunk:
        chunks.append((current_label, ' '.join(current_chunk)))
    
    return chunks


In [25]:
# Example sentence in Hindi
sentence = "राम ने किताब पढ़ी।"

# Perform chunking
chunks = chunk_sentence(sentence)

# Display the chunks
print("Chunks:", chunks)


Chunks: [('NP', 'राम'), ('NP', 'ने'), ('PP', 'किताब'), ('NP', 'पढ़ी।')]
