In [3]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

ds1 = load_dataset('xtreme', 'PAN-X.en')
ds2 = load_dataset('xtreme', 'PAN-X.eu')

ds = dict()
for k in ds1.keys():
    ds[k] = concatenate_datasets([ds1[k], ds2[k]])

ds = DatasetDict(ds)
ds = ds.shuffle(seed=42)

Using the latest cached version of the dataset since xtreme couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'PAN-X.en' at /home/compiling-ganesh/24m0797/.cache/huggingface/datasets/xtreme/PAN-X.en/0.0.0/ec5f1f46e9af79639a90684a7a70a956c4998f04 (last modified on Sun Dec 14 17:32:15 2025).
Using the latest cached version of the dataset since xtreme couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'PAN-X.eu' at /home/compiling-ganesh/24m0797/.cache/huggingface/datasets/xtreme/PAN-X.eu/0.0.0/ec5f1f46e9af79639a90684a7a70a956c4998f04 (last modified on Sun Dec 14 12:47:27 2025).


In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

device = 'cuda'
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = ds['train'].features['ner_tags'].feature.num_classes).to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def preprocess(item):
    res = tokenizer(item['tokens'], is_split_into_words=True, truncation=True, padding=True)
    res['labels'] = []
    for i in range(len(res['input_ids'])):
        word_ids = res.word_ids(batch_index = i)
        last_id = None
        labels = []
        for word_id in word_ids:
            if word_id is not None and word_id != last_id:
                last_id = word_id
                labels.append(int(item['ner_tags'][i][word_id]))
            else:
                labels.append(-100)
            last_id = word_id
        res['labels'].append(labels)
    return res

tokenized_ds = ds.map(preprocess, batched = True)
item = tokenized_ds['train'][0]

In [24]:
tokenizer.decode(item['input_ids'], skip_special_tokens = True)

'ellen barkin ( born 1954 )'

In [25]:
import pandas as pd

def pretty_print(item):
    breakpoint()
    text = item['tokens']
    tags = [ds['train'].features['ner_tags'].feature.names[idx] for idx in item['labels'] if idx != -100]
    df = pd.DataFrame([text, tags])
    return df

pretty_print(tokenized_ds['train'][0])

Unnamed: 0,0,1,2,3,4,5
0,Ellen,Barkin,(,born,1954,)
1,B-PER,I-PER,O,O,O,O


In [8]:
from transformers import Trainer, DataCollatorForTokenClassification, TrainingArguments
import evaluate
import numpy as np

f1 = evaluate.load('f1')

def compute_metrics(res):
    predictions, labels = np.argmax(res.predictions, axis=-1), res.label_ids
    final_pred, final_labels = [], []
    for p, l in zip(predictions, labels):
        for pred, label in zip(p, l):
            if label != -100:
                final_pred.append(pred)
                final_labels.append(label)
    return {
        'micro-f1': f1.compute(predictions = final_pred, references = final_labels, average = 'micro')['f1'],
        'macro-f1': f1.compute(predictions = final_pred, references = final_labels, average = 'macro')['f1'],
        'weighted-f1': f1.compute(predictions = final_pred, references = final_labels, average = 'weighted')['f1']
    }

def model_init():
    return AutoModelForTokenClassification.from_pretrained(model_name, num_labels = ds['train'].features['ner_tags'].feature.num_classes).to(device)

data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

args = TrainingArguments(
    num_train_epochs = 3,
    per_device_train_batch_size = 32,
    output_dir = 'output/out',
    eval_steps = 50,
    logging_steps = 50,
    eval_strategy = 'steps'
)

Using the latest cached version of the module from /home/compiling-ganesh/24m0797/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--f1/34c46321f42186df33a6260966e34a368f14868d9cc2ba47d142112e2800d233 (last modified on Tue Nov 25 11:27:07 2025) since it couldn't be found locally at evaluate-metric--f1, or remotely on the Hugging Face Hub.


In [77]:
import wandb


AttributeError: partially initialized module 'wandb' has no attribute 'errors' (most likely due to a circular import)

In [76]:
trainer = Trainer(
    args = args,
    model_init = model_init,
    processing_class = tokenizer,
    data_collator = data_collator,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['test'].select(range(10)),
    compute_metrics = compute_metrics
)

trainer.train()

AttributeError: partially initialized module 'wandb' has no attribute 'errors' (most likely due to a circular import)

In [65]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_ds["train"].select_columns(['input_ids', 'labels', 'attention_mask']),
    shuffle=True,
    collate_fn = data_collator,
    batch_size=args.per_device_train_batch_size
)

next(iter(train_dataloader))

{'input_ids': tensor([[  101,  4913,  5297,  ...,     0,     0,     0],
        [  101,  1005,  1005,  ...,     0,     0,     0],
        [  101, 11623,  2632,  ...,     0,     0,     0],
        ...,
        [  101,  6423, 21981,  ...,     0,     0,     0],
        [  101,  2106,  2025,  ...,     0,     0,     0],
        [  101,  5123,  2752,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,    1,    2,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    1,    2,  ..., -100, -100, -100],
        ...,
        [-100,    3,    4,  ..., -100, -100, -100],
        [-100,    0,    0,  ..., -100, -100, -100],
        [-100,    3,    4,  ..., -100, -100, -100]])}

In [61]:
tokenized_ds.select_columns?

[31mSignature:[39m tokenized_ds.select_columns(column_names: Union[str, list[str]]) -> [33m'DatasetDict'[39m
[31mDocstring:[39m
Select one or several column(s) from each split in the dataset and
the features associated to the column(s).

The transformation is applied to all the splits of the dataset
dictionary.

Args:
    column_names (`Union[str, list[str]]`):
        Name of the column(s) to keep.

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("cornell-movie-review-data/rotten_tomatoes")
>>> ds.select_columns("text")
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1066
    })
})
```
[31mFile:[39m      ~/workspace/transformers-from-scratch/.venv/lib/python3.12/site-packages/datasets/dataset_dict.py
[31mType:[39m      method

In [70]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
import torch
import numpy as np
from tqdm import tqdm

train_dataloader = DataLoader(
    tokenized_ds["train"].select_columns(['input_ids', 'labels', 'attention_mask']),
    shuffle=True,
    collate_fn = data_collator,
    batch_size=args.per_device_train_batch_size
)

eval_dataloader = DataLoader(
    tokenized_ds["test"].select(range(10)).select_columns(['input_ids', 'labels', 'attention_mask']),
    shuffle=True,
    collate_fn = data_collator,
    batch_size=args.per_device_train_batch_size
)

optimizer = AdamW(
    model.parameters(),
    lr=args.learning_rate,
    betas=(args.adam_beta1, args.adam_beta2),
    eps=args.adam_epsilon,
    weight_decay=args.weight_decay
)

num_update_steps_per_epoch = len(train_dataloader)
max_steps = args.num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.get_warmup_steps(max_steps),
    num_training_steps=max_steps,
)

global_step = 0
model = model_init()
model.train()

for epoch in range(int(args.num_train_epochs)):
    print(f"\n===== Epoch {epoch + 1}/{args.num_train_epochs} =====")

    for step, batch in enumerate(tqdm(train_dataloader)):
        global_step += 1

        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        # ---- Backprop ----
        loss.backward()

        # ---- Gradient clipping (same as Trainer) ----
        if args.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(
                model.parameters(),
                args.max_grad_norm
            )

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # ---- Logging ----
        if args.logging_steps > 0 and global_step % args.logging_steps == 0:
            print(f"Step {global_step} | loss = {loss.item():.4f}")

        # ---- Evaluation ----
        if args.eval_steps > 0 and (global_step % args.eval_steps == 0 or global_step == 1):
            model.eval()

            all_preds = []
            all_labels = []

            with torch.no_grad():
                for eval_batch in tqdm(eval_dataloader):
                    eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
                    outputs = model(**eval_batch)

                    all_preds.append(outputs.logits.cpu().numpy())
                    all_labels.append(eval_batch["labels"].cpu().numpy())

            preds = np.concatenate(all_preds, axis=0)
            labels = np.concatenate(all_labels, axis=0)

            metrics = compute_metrics(
                type("EvalPred", (), {
                    "predictions": preds,
                    "label_ids": labels
                })
            )

            print(
                f"Eval @ step {global_step} | "
                + " | ".join(f"{k}: {v:.4f}" for k, v in metrics.items())
            )

            model.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1/3 =====


  0%|          | 0/938 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 71.68it/s]
  0%|          | 1/938 [00:00<05:40,  2.75it/s]

Eval @ step 1 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


  5%|▌         | 49/938 [00:14<04:32,  3.27it/s]

Step 50 | loss = 1.8829



100%|██████████| 1/1 [00:00<00:00, 70.59it/s]
  5%|▌         | 50/938 [00:14<05:22,  2.75it/s]

Eval @ step 50 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 11%|█         | 99/938 [00:29<04:26,  3.15it/s]

Step 100 | loss = 1.8931



100%|██████████| 1/1 [00:00<00:00, 71.80it/s]
 11%|█         | 100/938 [00:30<05:51,  2.38it/s]

Eval @ step 100 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 16%|█▌        | 149/938 [00:45<03:45,  3.50it/s]

Step 150 | loss = 1.9174



100%|██████████| 1/1 [00:00<00:00, 71.33it/s]
 16%|█▌        | 150/938 [00:45<05:13,  2.51it/s]

Eval @ step 150 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 21%|██        | 199/938 [01:00<03:42,  3.32it/s]

Step 200 | loss = 1.9239



100%|██████████| 1/1 [00:00<00:00, 71.70it/s]
 21%|██▏       | 200/938 [01:00<04:16,  2.87it/s]

Eval @ step 200 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 27%|██▋       | 249/938 [01:15<03:05,  3.71it/s]

Step 250 | loss = 1.8777



100%|██████████| 1/1 [00:00<00:00, 70.86it/s]
 27%|██▋       | 250/938 [01:16<04:26,  2.58it/s]

Eval @ step 250 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 32%|███▏      | 299/938 [01:31<03:24,  3.12it/s]

Step 300 | loss = 1.9397



100%|██████████| 1/1 [00:00<00:00, 69.58it/s]
 32%|███▏      | 300/938 [01:31<03:52,  2.75it/s]

Eval @ step 300 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 37%|███▋      | 349/938 [01:48<03:08,  3.12it/s]

Step 350 | loss = 1.8487



100%|██████████| 1/1 [00:00<00:00, 15.82it/s]
 37%|███▋      | 350/938 [01:48<04:16,  2.29it/s]

Eval @ step 350 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 43%|████▎     | 399/938 [02:04<02:53,  3.10it/s]

Step 400 | loss = 1.9363



100%|██████████| 1/1 [00:00<00:00, 70.71it/s]
 43%|████▎     | 400/938 [02:04<03:20,  2.68it/s]

Eval @ step 400 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 48%|████▊     | 449/938 [02:19<02:19,  3.50it/s]

Step 450 | loss = 1.8802



100%|██████████| 1/1 [00:00<00:00, 70.60it/s]
 48%|████▊     | 450/938 [02:20<03:14,  2.51it/s]

Eval @ step 450 | micro-f1: 0.2419 | macro-f1: 0.1370 | weighted-f1: 0.2658


 50%|█████     | 471/938 [02:26<02:25,  3.22it/s]


KeyboardInterrupt: 