In [7]:
from datasets import load_dataset

dataset = load_dataset('xtreme', 'PAN-X.en')

tags = dataset['train'].features['ner_tags'].feature
def create_ner_tags_str(batch):
    ner_tags_str = [tags.int2str(idx) for idx in batch['ner_tags']]
    return {
        'ner_tags_str': ner_tags_str,
        'input': 
    }

index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

dataset = dataset.map(create_ner_tags_str)
dataset, index2tag

Using the latest cached version of the dataset since xtreme couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'PAN-X.en' at /home/compiling-ganesh/24m0797/.cache/huggingface/datasets/xtreme/PAN-X.en/0.0.0/ec5f1f46e9af79639a90684a7a70a956c4998f04 (last modified on Wed Oct 15 14:09:13 2025).


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
})

In [102]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_name = 'bert-base-uncased'
device = 'cuda'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(tag2index.keys())).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [226]:
def process(batch):
    tokenized_input = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)
    batch_labels = []
    for i in range(len(tokenized_input.input_ids)):
        word_ids = tokenized_input.word_ids(batch_index = i)
        labels = []
        prev_id = None
        for word_id in word_ids:
            if word_id is not None and word_id != prev_id:
                labels.append(batch['ner_tags'][i][word_id])
            else:
                labels.append(-100)
            prev_id = word_id
        batch_labels.append(labels)
        
    return {
        'input_ids': tokenized_input['input_ids'],
        'attention_mask': tokenized_input['attention_mask'],
        'labels': batch_labels,
    }

tokenized_dataset = dataset.map(process, batched=True, remove_columns=['tokens', 'ner_tags', 'langs', 'ner_tags_str'])

tokenized_dataset

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [111]:
tokenized_dataset.set_format('pt')
element = tokenized_dataset['train'][0]
for k, v in element.items():
    element[k] = v.unsqueeze(0).to(model.device)
    print(f"{k}: {v.shape}")

element

input_ids: torch.Size([18])
attention_mask: torch.Size([18])
labels: torch.Size([18])


{'input_ids': tensor([[  101,  1054,  1012,  1044,  1012, 15247,  1006,  2358,  1012,  5623,
           2314,  1007,  1006,  5986,  2620, 12464,  1007,   102]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
        device='cuda:0'),
 'labels': tensor([[-100,    3, -100, -100, -100,    4,    0,    3, -100,    4,    4,    0,
             0,    0, -100,    0,    0, -100]], device='cuda:0')}

In [112]:
model(input_ids=element['input_ids'])

TokenClassifierOutput(loss=None, logits=tensor([[[ 6.1471, -1.8713, -1.2632, -1.1330,  0.0871, -1.8853, -0.8286],
         [ 0.7283,  1.5231, -1.8846,  2.5090, -1.7349,  0.3948, -2.7095],
         [ 0.2162, -1.4112,  1.4875, -1.2552,  2.1848, -2.3096, -0.1213],
         [-1.1195, -1.2492,  2.9788, -2.0142,  2.7425, -2.6304, -0.1973],
         [-1.1124, -1.5560,  2.4273, -2.0839,  2.6958, -2.3937, -0.3671],
         [-0.1287, -1.8343,  3.3499, -1.8725,  2.8535, -2.5372, -0.3614],
         [ 3.1042, -2.2096,  0.3416, -1.7823,  2.6039, -2.7109,  0.0708],
         [-1.2141, -1.2433, -1.5182,  2.4411,  0.6438,  1.5449, -1.3846],
         [-0.1025, -2.7891,  0.3786, -1.3663,  3.0904, -1.8938,  1.3384],
         [-1.7391, -2.6729,  0.6353, -1.2077,  3.1343, -1.5018,  2.3717],
         [-0.1434, -2.1450,  0.1053, -1.5070,  3.3205, -2.0426,  2.4399],
         [ 3.9793, -2.3165, -0.3279, -1.9936,  1.7974, -2.3550, -0.2862],
         [ 6.0769, -1.8901, -0.7970, -1.6724,  0.7151, -2.2399, -0.8927]

In [238]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import evaluate
import numpy as np

f1_metric = evaluate.load("f1")

def model_init():
    return AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(tag2index.keys())).to(device)

args = TrainingArguments(
    num_train_epochs = 3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps = 10,
    eval_steps = 100,
    eval_strategy = 'steps',
)

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(res):
    batch_preds, batch_true = res.predictions, res.label_ids
    batch_preds = np.argmax(batch_preds, axis = -1)
    final_preds, final_true = [], []
    for preds, true in zip(batch_preds, batch_true):
        for p, t in zip(preds, true):
            if t != -100 and t != 0:
                final_preds.append(p)
                final_true.append(t)
        
    return {
        'f1-micro': f1_metric.compute(predictions=final_preds, references=final_true,
                               average = 'micro')['f1'],
        'f1-macro': f1_metric.compute(predictions=final_preds, references=final_true,
                               average = 'macro')['f1']
    }

trainer = Trainer(
    model_init = model_init,
    train_dataset = tokenized_dataset['train'],
    data_collator = data_collator,
    processing_class = tokenizer,
    args = args,
    compute_metrics = compute_metrics,
    eval_dataset = tokenized_dataset['validation']
)

trainer.train()

Using the latest cached version of the module from /home/compiling-ganesh/24m0797/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Tue Nov 25 11:27:07 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'c

Step,Training Loss,Validation Loss,F1-micro,F1-macro
100,0.2941,0.275568,0.859787,0.744539
200,0.2134,0.252527,0.869519,0.753157



KeyboardInterrupt



In [35]:
import pandas as pd
tokenized_input = tokenizer(dataset['train'][0]['tokens'], is_split_into_words=True)
tokens = tokenized_input.tokens()
word_ids = tokenized_input.word_ids()

pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Tokens,[CLS],r,.,h,.,saunders,(,st,.,lawrence,river,),(,96,##8,mw,),[SEP]
Word IDs,,0,0,0,0,1,2,3,3,4,5,6,7,8,8,9,10,


In [263]:
import torch
import pandas as pd

def tag_text(text, model, tokenizer):
    model.eval()
    if isinstance(text, str):
        text = text.split(" ")
    tokenized_input = tokenizer(text, is_split_into_words = True, return_tensors='pt')
    for k, v in tokenized_input.items():
        tokenized_input[k] = v.to(model.device)
    with torch.no_grad():
        op = model(**tokenized_input)
    logits = op.logits[0]
    preds = torch.argmax(logits, dim=-1).cpu().numpy()
    preds = [index2tag[pred] for pred in preds]
    tokens = tokenized_input.tokens(batch_index=0)
    word_ids = tokenized_input.word_ids(batch_index=0)
    text_preds = []
    prev_word_id = None
    for i, word_id in enumerate(word_ids):
        if word_id is not None and word_id != prev_word_id:
            text_preds.append(preds[i])
        prev_word_id = word_id
    return pd.DataFrame([text, text_preds])

element = dataset['test'][0]['tokens']
tag_text("Shortly afterward an encouraging response influenced him to go to India. He arrived at Adyar in 1884.", trainer.model, tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,Shortly,afterward,an,encouraging,response,influenced,him,to,go,to,India.,He,arrived,at,Adyar,in,1884.
1,O,O,O,O,O,O,O,O,O,O,B-LOC,O,O,O,B-LOC,O,O


In [121]:
model = model.to(model.device)