In [7]:
from datasets import load_dataset

# Load the 'PAN-X.en' configuration of the 'xtreme' dataset.
dataset = load_dataset(path='xtreme', name='PAN-X.en')

# Feature describing a single NER tag id; its `names` and `int2str` are used
# below to translate between integer ids and tag strings.
train_features = dataset['train'].features
tags = train_features['ner_tags'].feature
def create_ner_tags_str(batch):
    """Attach the string form of an example's NER tag ids.

    This function is applied through ``dataset.map`` without ``batched=True``,
    so despite its name ``batch`` is a single example whose ``'ner_tags'``
    entry is a sequence of integer tag ids.

    Returns:
        A dict with one new column, ``'ner_tags_str'``, holding the tag names
        produced by ``tags.int2str`` in the same order as the ids.
    """
    # The previous version also declared an 'input' key with no value, which
    # made the whole cell a SyntaxError; only the tag-string column is emitted.
    return {'ner_tags_str': [tags.int2str(idx) for idx in batch['ner_tags']]}

# Lookup tables between integer tag ids and tag names (and back).
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

# Add the readable tag column to every split, one example at a time.
dataset = dataset.map(create_ner_tags_str)
dataset, index2tag

Using the latest cached version of the dataset since xtreme couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'PAN-X.en' at /home/compiling-ganesh/24m0797/.cache/huggingface/datasets/xtreme/PAN-X.en/0.0.0/ec5f1f46e9af79639a90684a7a70a956c4998f04 (last modified on Wed Oct 15 14:09:13 2025).


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 10000
    })
})

In [102]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

import torch

model_name = 'bert-base-uncased'
# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# One output logit per NER tag. The classification head is not part of the
# checkpoint and is newly initialised (see the warning printed below).
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2index),
).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
def process(batch):
    """Tokenize pre-split words and align word-level NER tags to sub-word tokens.

    Args:
        batch: A batch of examples (this function is mapped with
            ``batched=True``) providing ``'tokens'`` (one list of words per
            example) and ``'ner_tags'`` (one list of integer tag ids per
            example, indexed by word position).

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        For each example, ``labels`` has one entry per sub-word token: the
        word's tag id on the first sub-word of every word, and ``-100`` on
        special tokens and on the remaining sub-words of a word.
    """
    # truncation=True keeps over-long examples within the tokenizer's maximum
    # length instead of producing sequences the model cannot consume.
    tokenized_input = tokenizer(
        batch['tokens'],
        is_split_into_words=True,
        truncation=True,
    )

    batch_labels = []
    for i, word_tags in enumerate(batch['ner_tags']):
        word_ids = tokenized_input.word_ids(batch_index=i)
        labels = []
        prev_id = None
        for word_id in word_ids:
            # A label is emitted only where a new word starts; special tokens
            # (word_id is None) and continuation sub-words receive -100.
            if word_id is not None and word_id != prev_id:
                labels.append(word_tags[word_id])
            else:
                labels.append(-100)
            prev_id = word_id
        batch_labels.append(labels)

    return {
        'input_ids': tokenized_input['input_ids'],
        'attention_mask': tokenized_input['attention_mask'],
        'labels': batch_labels,
    }

# Replace every original column with the tokenized fields returned by
# `process`. The column list is read from the dataset itself so it cannot
# drift out of sync with the columns created earlier in the notebook.
tokenized_dataset = dataset.map(
    process,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

tokenized_dataset

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
})

In [104]:
# Switch the dataset's output format to 'pt' (in place) and pull out the first
# training example for a manual forward pass.
tokenized_dataset.set_format('pt')
element = tokenized_dataset['train'][0]
for key in list(element):
    tensor = element[key]
    # Report the per-example shape before the leading batch axis is added.
    print(f"{key}: {tensor.shape}")
    element[key] = tensor.unsqueeze(0).to(model.device)

element

input_ids: torch.Size([18])
attention_mask: torch.Size([18])
labels: torch.Size([18])


{'input_ids': tensor([[  101,  1054,  1012,  1044,  1012, 15247,  1006,  2358,  1012,  5623,
           2314,  1007,  1006,  5986,  2620, 12464,  1007,   102]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
        device='cuda:0'),
 'labels': tensor([[-100,    3, -100, -100, -100,    4,    0,    3, -100,    4,    4,    0,
             0,    0, -100,    0,    0, -100]], device='cuda:0')}

In [105]:
import torch

# Sanity-check forward pass on the single prepared example. No labels are
# passed, so the returned loss is None. Gradients are not needed for this
# inspection, so autograd tracking is switched off.
with torch.no_grad():
    output = model(input_ids=element['input_ids'])

output

TokenClassifierOutput(loss=None, logits=tensor([[[ 0.1986,  0.3080, -0.6250,  0.1480, -0.3975,  0.3623, -0.0627],
         [ 0.4878,  0.5728, -0.3992,  0.0516, -0.3633, -0.1326,  0.2375],
         [ 0.7309,  0.3112, -0.2049, -0.0753, -0.3996, -0.1824,  0.1904],
         [ 0.4840,  0.6467, -0.0162, -0.0218, -0.3739, -0.1098,  0.0915],
         [ 0.5294,  0.5113, -0.2453, -0.0257, -0.2207, -0.2873, -0.0713],
         [ 0.5163,  0.5298, -0.5072,  0.1733, -0.2610, -0.3410, -0.2983],
         [ 0.1950,  0.0698,  0.0511, -0.2642, -0.3923, -0.1508, -0.1166],
         [ 0.0158,  0.1740, -0.3416,  0.1781, -0.0849, -0.0343, -0.5179],
         [ 0.4036, -0.0119, -0.0023, -0.0397,  0.0538,  0.1492, -0.3354],
         [-0.0456,  0.3246, -0.3368,  0.1033, -0.0791, -0.0354, -0.6216],
         [ 0.2418,  0.5791, -0.2783,  0.0538, -0.3133,  0.0307, -0.2147],
         [ 0.2638,  0.2974, -0.1125, -0.1079, -0.1458,  0.0607,  0.0741],
         [ 0.2353,  0.3430,  0.0530, -0.2306, -0.2994, -0.1147,  0.1270]

In [106]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification

def model_init():
    """Load a fresh token-classification model and move it to ``device``.

    Passed to ``Trainer`` as ``model_init``. The model is loaded from
    ``model_name`` with one label per entry in ``tag2index``.
    """
    num_labels = len(tag2index)
    fresh_model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    return fresh_model.to(device)

# Short run: a single epoch with a small per-device batch.
args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=1,
)

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# The model is created through `model_init` rather than passed in directly.
trainer = Trainer(
    args=args,
    model_init=model_init,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    processing_class=tokenizer,
)

trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss



KeyboardInterrupt



In [35]:
import pandas as pd
# Tokenize only the first training example to see how its words are split
# into sub-word tokens and which word each token belongs to.
first_words = dataset['train'][0]['tokens']
tokenized_input = tokenizer(first_words, is_split_into_words=True)
tokens = tokenized_input.tokens()
word_ids = tokenized_input.word_ids()

# One row of tokens and one row of word ids, aligned column by column.
pd.DataFrame(data=[tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
Tokens,[CLS],r,.,h,.,saunders,(,st,.,lawrence,river,),(,96,##8,mw,),[SEP]
Word IDs,,0,0,0,0,1,2,3,3,4,5,6,7,8,8,9,10,


SyntaxError: expected ':' (1292684888.py, line 1)