# Install Required Libraries

In [None]:
%pip install transformers datasets seqeval accelerate




In [29]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import torch

# model comparision between models:
- xlm-roberta-base
- bert-base-multilingual-cased
- distilbert-base-multilingual-cased

In [48]:
def train_and_evaluate(model_name, train_dataset, eval_dataset, label_list):
    from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
    from seqeval.metrics import classification_report, f1_score
    import numpy as np

    label_to_id = {label: i for i, label in enumerate(label_list)}
    id_to_label = {i: label for label, i in label_to_id.items()}

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

    def tokenize_and_align_labels(example):
        tokenized_inputs = tokenizer(example['tokens'], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)
        word_ids = tokenized_inputs.word_ids()
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[example['ner_tags'][word_idx]])
            else:
                label_ids.append(label_to_id[example['ner_tags'][word_idx]])
            previous_word_idx = word_idx
        tokenized_inputs["labels"] = label_ids
        return tokenized_inputs

    tokenized_train = train_dataset.map(tokenize_and_align_labels)
    tokenized_eval = eval_dataset.map(tokenize_and_align_labels)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}_ner",
        # evaluation_strategy="epoch",
        # save_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir="./logs",
        report_to="none",
    )

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        true_labels = [[label_list[l] for l in label_seq if l != -100] for label_seq in labels]
        true_preds = [[label_list[p] for p, l in zip(pred_seq, label_seq) if l != -100] for pred_seq, label_seq in zip(predictions, labels)]
        return {
            "f1": f1_score(true_labels, true_preds),
            "report": classification_report(true_labels, true_preds, zero_division=0)
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    return eval_metrics


In [49]:
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

results = {}

models_to_try = [
    "xlm-roberta-base",
    "bert-base-multilingual-cased",  # mBERT
    "distilbert-base-multilingual-cased"  # DistilBERT
]

for model_name in models_to_try:
    print(f"\n🔍 Training and evaluating: {model_name}")
    metrics = train_and_evaluate(model_name, train_dataset, eval_dataset, label_list)
    results[model_name] = metrics



🔍 Training and evaluating: xlm-roberta-base


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,0.0973
1000,0.0125
1500,0.0055



🔍 Training and evaluating: bert-base-multilingual-cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,0.1308
1000,0.0404
1500,0.0238



🔍 Training and evaluating: distilbert-base-multilingual-cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,0.1434
1000,0.0428
1500,0.0274


In [50]:
print(results)

{'xlm-roberta-base': {'eval_loss': 0.010067019611597061, 'eval_f1': 0.9859154929577465, 'eval_report': '              precision    recall  f1-score   support\n\n         LOC       1.00      1.00      1.00       513\n       PRICE       0.99      0.99      0.99       915\n     PRODUCT       0.97      0.99      0.98      1400\n\n   micro avg       0.98      0.99      0.99      2828\n   macro avg       0.99      0.99      0.99      2828\nweighted avg       0.98      0.99      0.99      2828\n', 'eval_runtime': 5.217, 'eval_samples_per_second': 121.527, 'eval_steps_per_second': 15.335, 'epoch': 5.0}, 'bert-base-multilingual-cased': {'eval_loss': 0.025918731465935707, 'eval_f1': 0.921605465414176, 'eval_report': '              precision    recall  f1-score   support\n\n         LOC       0.90      0.91      0.91       567\n       PRICE       0.90      0.86      0.88       959\n     PRODUCT       0.95      0.96      0.96      1409\n\n   micro avg       0.92      0.92      0.92      2935\n   m

# 📊 Model Comparison Summary

| Model Name                        | Eval F1 | Eval Loss | Speed (samples/sec) | Comments                               |
|----------------------------------|---------|-----------|----------------------|----------------------------------------|
| **xlm-roberta-base**             | 0.986   | 0.0101    | 121.53               | 🔥 Best accuracy, good speed           |
| bert-base-multilingual-cased     | 0.922   | 0.0259    | 121.30               | Decent performance, but lower F1       |
| distilbert-base-multilingual-cased | 0.909 | 0.0270    | **216.53**           | ⚡ Fastest, lightest, but lowest F1     |

---

## ✅ Recommendation

### 🏆 Best Overall Accuracy: `xlm-roberta-base`

- ✅ Use this if **quality is more important than speed**, e.g., for **production or research**.
- ✔️ Excellent on all label categories: **`LOC`, `PRICE`, `PRODUCT`**.

### ⚡ Best for Speed/Efficiency: `distilbert-base-multilingual-cased`

- ✅ Use this if you're deploying to **resource-constrained environments** (e.g., **mobile apps**).
- ⚠️ Acceptable performance, but **noticeably lower F1**.

---
