In [38]:
import datasets
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Fetch the CoNLL-2003 splits (served from the local cache when the Hub is unreachable).
raw_dataset = datasets.load_dataset('eriktks/conll2003')

# Tag vocabulary of the `pos_tags` column; a tag's position in the list is its integer id.
label_names = raw_dataset['train'].features['pos_tags'].feature.names
num_labels = len(label_names)

# Two-way id <-> tag-name lookups, handed to the model when it is created.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

Using the latest cached version of the dataset since eriktks/conll2003 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/sam/.cache/huggingface/datasets/eriktks___conll2003/default/0.0.0/ce85b39f9dd99f552d0739d456814e95fb6a39b0 (last modified on Wed Nov 26 23:48:03 2025).


In [39]:
# Tokenizer for the same 'bert-base-uncased' checkpoint that the model is created from.
# NOTE(review): an uncased vocabulary presumably lowercases the input, which would drop
# the capitalisation cue for tags such as NNP -- confirm a cased checkpoint was not intended.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def preprocess(batch, label_column='pos_tags'):
    """Tokenize pre-split sentences and align word-level tags with sub-word tokens.

    Only the first sub-token of each word receives that word's tag. Positions that
    belong to no word (``word_id is None``) and continuation sub-tokens of a word
    are labelled -100, the value that ``calculate_metrics`` filters out.

    Args:
        batch: Mapping with a 'tokens' entry (one list of words per sentence) and a
            ``label_column`` entry (one list of integer tag ids per sentence).
        label_column: Name of the column that holds the word-level tag ids.
            Defaults to 'pos_tags', so existing ``map(preprocess, batched=True)``
            calls are unaffected. Another tag set can be selected with e.g.
            ``map(preprocess, batched=True, fn_kwargs={'label_column': 'ner_tags'})``.

    Returns:
        The tokenizer output with an extra 'labels' entry: one list per sentence,
        with one label id per produced token.
    """
    ignore_index = -100

    tokenized_inputs = tokenizer(
        batch['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=False
    )

    labels = []
    for i, word_labels in enumerate(batch[label_column]):
        previous_id = None
        label_ids = []

        for word_id in tokenized_inputs.word_ids(batch_index=i):
            if word_id is None or word_id == previous_id:
                # No source word, or a continuation piece of the previous word.
                label_ids.append(ignore_index)
            else:
                # First piece of a new word: it carries the word's tag.
                label_ids.append(word_labels[word_id])
            previous_id = word_id

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Apply tokenisation + label alignment to every split; `preprocess` receives batches of rows.
tokenized_dataset = raw_dataset.map(preprocess, batched=True)

tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [40]:
# Show the POS tag vocabulary (the list bound to `label_names` when the dataset was loaded).
label_names

['"',
 "''",
 '#',
 '$',
 '(',
 ')',
 ',',
 '.',
 ':',
 '``',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'NN|SYM',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB']

In [41]:
# Sanity-check the alignment on the first training sentence.
# Index the row first and the column second: `dataset[column][0]` goes through the
# whole column just to show one element, whereas `dataset[0]` reads only that row.
first_train_example = tokenized_dataset['train'][0]
print(first_train_example['labels'])
print(first_train_example['tokens'])
print(first_train_example['input_ids'])


[-100, 22, 42, 16, 21, 35, 37, 16, 21, 7, -100]
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]


In [42]:
import evaluate
import numpy as np
from sklearn.metrics import f1_score


def calculate_metrics(output):
    """Score token-level predictions, skipping every position labelled -100.

    Args:
        output: A ``(scores, label_ids)`` pair. ``scores`` is arg-maxed over axis 2
            to obtain one predicted class id per position; ``label_ids`` holds the
            reference ids, with -100 marking positions to leave out.

    Returns:
        dict with the keys 'accuracy', 'macro_f1', 'micro_f1' and 'weighted_f1',
        all computed over the flattened, non-ignored positions.
    """
    scores, gold_batch = output
    predicted_batch = np.argmax(scores, axis=2)

    # Flatten both batches into parallel lists, keeping only labelled positions.
    kept_preds, kept_gold = [], []
    for predicted_row, gold_row in zip(predicted_batch, gold_batch):
        scored = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        kept_preds.extend(p for p, _ in scored)
        kept_gold.extend(g for _, g in scored)

    n_correct = sum(p == g for p, g in zip(kept_preds, kept_gold))
    accuracy = n_correct / len(kept_preds)

    # Same inputs, three averaging modes.
    f1_by_average = {
        average: f1_score(kept_gold, kept_preds, average=average)
        for average in ('macro', 'micro', 'weighted')
    }

    return {
        'accuracy': accuracy,
        'macro_f1': f1_by_average['macro'],
        'micro_f1': f1_by_average['micro'],
        'weighted_f1': f1_by_average['weighted'],
    }


In [43]:
from transformers import AutoModelForTokenClassification



# BERT encoder plus a token-classification head with one output per tag. The head
# ('classifier.weight' / 'classifier.bias') is not part of the checkpoint, so it starts
# from a fresh initialisation and has to be trained before the model is useful.
# The label maps are passed in so predictions can be reported by tag name.
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels, id2label=id2label, label2id=label2id
)

model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [44]:
from transformers import DataCollatorForTokenClassification


# Batch collator given to the Trainer, built from the same tokenizer as the dataset.
# NOTE(review): presumably pads inputs and labels per batch, which is needed because
# `preprocess` tokenizes with padding=False -- confirm against the library docs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) 

In [45]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback


# Fine-tuning configuration.
# NOTE(review): the output directory is called 'ner_output' although the labels being
# trained on are POS tags -- kept as-is because a later cell reads from this path.
training_args = TrainingArguments(
    # Where artefacts are written.
    output_dir='./ner_output',
    overwrite_output_dir=True,
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch so the best epoch can be restored at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Upper bound only; the early-stopping callback below usually ends the run sooner.
    num_train_epochs=20,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=calculate_metrics,
    # Give up after 3 evaluations in a row without improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Micro F1,Weighted F1
1,0.7125,0.247317,0.938301,0.788605,0.938301,0.937778
2,0.1878,0.219426,0.944628,0.838073,0.944628,0.944288
3,0.1351,0.206303,0.947432,0.838237,0.947432,0.946927
4,0.1039,0.204461,0.947938,0.841449,0.947938,0.947588
5,0.0789,0.212667,0.949788,0.851429,0.949788,0.949577
6,0.0578,0.223777,0.949807,0.852048,0.949807,0.949449
7,0.0456,0.23649,0.950275,0.848163,0.950275,0.949965


TrainOutput(global_step=6146, training_loss=0.1596575196853027, metrics={'train_runtime': 453.5512, 'train_samples_per_second': 619.158, 'train_steps_per_second': 38.717, 'total_flos': 2385592946599344.0, 'train_loss': 0.1596575196853027, 'epoch': 7.0})

In [46]:
# Score the held-out test split with the model currently held by the trainer.
# The returned metric names carry the 'eval_' prefix even though this is the test split.
test_result = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
print(test_result)

{'eval_loss': 0.2627008557319641, 'eval_accuracy': 0.9424141272746851, 'eval_macro_f1': 0.8609497029560589, 'eval_micro_f1': 0.9424141272746851, 'eval_weighted_f1': 0.9418500555557346, 'eval_runtime': 3.1688, 'eval_samples_per_second': 1089.689, 'eval_steps_per_second': 68.165, 'epoch': 7.0}


In [47]:
from transformers import pipeline


# Reload the checkpoint the trainer recorded as best instead of a hard-coded step number.
# 'checkpoint-6146' was the *last* checkpoint of the logged run (epoch 7), not the one
# with the lowest validation loss (epoch 4), so the demo ran on different weights than
# the test evaluation did -- and the path changes whenever the data, batch size or
# stopping epoch change.
best_checkpoint = trainer.state.best_model_checkpoint
if best_checkpoint is None:
    raise RuntimeError(
        "No best checkpoint recorded; run trainer.train() with "
        "load_best_model_at_end=True before building the pipeline."
    )

model = AutoModelForTokenClassification.from_pretrained(best_checkpoint)

# NOTE(review): the pipeline is named `ner_pipeline` but emits POS tags (see the
# printed entity groups); the name is kept so any later cells keep working.
ner_pipeline = pipeline(
    task='token-classification',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy='first'
)

query = 'Barack Obama and Elon Musk met at Google headquarters in California yesterday.'

res = ner_pipeline(query)
for entity in res:
    # Slice the original string by character offsets so the span keeps its casing.
    print(query[entity['start']: entity['end']])
    print(entity['entity_group'])
    print("="*20)

Device set to use cuda:0


Barack Obama
NNP
and
CC
Elon Musk
NNP
met
VBD
at
IN
Google
NNP
headquarters
NN
in
IN
California
NNP
yesterday
NN
.
.


In [48]:
# Raw (untokenised) first training example, for comparison with the aligned labels
# printed for the same sentence earlier in the notebook.
raw_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}