In [1]:
!pip install transformers datasets seqeval wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
     ---------------------------------------- 43.6/43.6 kB 1.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting wandb
  Downloading wandb-0.20.1-py3-none-win_amd64.whl.metadata (10 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting pyarrow>=15.0.0 (from dat



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CoNLL-formatted file into memory
def load_conll_data(filepath):
    """Read a CoNLL-style file into a list of ``(tokens, labels)`` sentences.

    Every non-blank line holds a token, a run of whitespace, and its label
    (everything after the first whitespace run is kept as the label).
    Blank lines separate sentences.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one ``(tokens, labels)`` tuple per sentence; both members
        are lists of strings with the same length.

    Raises:
        ValueError: If a non-blank line contains a token but no label.
    """
    sentences = []
    current_tokens, current_labels = [], []
    with open(filepath, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:  # blank line indicates end of a sentence
                if current_tokens:
                    sentences.append((current_tokens, current_labels))
                current_tokens, current_labels = [], []
                continue
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Same exception type as the bare tuple-unpack would raise,
                # but pointing at the offending line.
                raise ValueError(
                    f"{filepath}: line {line_no} has no label column: {line!r}"
                )
            token, label = parts
            current_tokens.append(token)
            current_labels.append(label)
    # Flush the last sentence: files that do not end with a blank line would
    # otherwise lose it.
    if current_tokens:
        sentences.append((current_tokens, current_labels))
    print(f"Loaded {len(sentences)} labeled sentences")
    return sentences

# Read the annotated corpus, then hold out 20% of the sentences for validation.
# The fixed random_state keeps the split the same on every run.
sentences = load_conll_data("labeled_telegram_product_price_location.txt")
train_sentences, val_sentences = train_test_split(
    sentences,
    test_size=0.2,
    random_state=42,
)
# Cell output: (number of training sentences, number of validation sentences)
tuple(len(split) for split in (train_sentences, val_sentences))


Loaded 3166 labeled sentences


(2532, 634)

In [3]:
from transformers import AutoTokenizer

# Choose a pre-trained model tokenizer (XLM-RoBERTa base for multilingual support)
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Define label mappings.
# Labels are collected from BOTH splits: a tag that happens to occur only in the
# validation sentences would otherwise be missing from label_to_id and raise a
# KeyError when the validation set is encoded.
unique_labels = {
    label
    for _, labels in (*train_sentences, *val_sentences)
    for label in labels
}
# sorted() makes the label -> id assignment deterministic (alphabetical by tag name).
label_to_id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id_to_label = {idx: label for label, idx in label_to_id.items()}
num_labels = len(label_to_id)
print(label_to_id)
# Example output when every BIO tag is present (ids follow alphabetical order):
# {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'I-PRODUCT': 5, 'O': 6}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

{'B-PRODUCT': 0, 'I-LOC': 1, 'I-PRICE': 2, 'I-PRODUCT': 3, 'O': 4}


In [5]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a "labels" entry (one list of label names per
            sentence, indexed by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list of label ids per sentence. Only the first sub-token of each word
        carries the id from ``label_to_id``; special tokens and the remaining
        sub-tokens of a word get -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    aligned = []
    for batch_idx, word_labels in enumerate(examples["labels"]):
        word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair every position with the word id of the position before it
        # (None for the very first position) to detect word boundaries.
        preceding_ids = [None, *word_ids[:-1]]
        aligned.append([
            label_to_id[word_labels[current]]
            if current is not None and current != preceding
            else -100
            for current, preceding in zip(word_ids, preceding_ids)
        ])

    encoded["labels"] = aligned
    return encoded

# Encode both splits batch-wise with the alignment function.
# NOTE(review): train_data / val_data are not defined in the cells visible here;
# presumably they are datasets with "tokens" and "labels" columns built from
# train_sentences / val_sentences. Confirm the cell that creates them is present.
tokenized_train, tokenized_val = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_data, val_data)
)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

## Fine-Tuning the Transformer Model

In [12]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [6]:
import transformers
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the pre-trained checkpoint with a token-classification head sized to our tag set.
# id2label / label2id store the tag names in the model config, so saved checkpoints
# report real tags instead of generic LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./ner-model",
    overwrite_output_dir=True,
    # `evaluation_strategy` was renamed to `eval_strategy`; recent transformers
    # releases no longer accept the old keyword.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    logging_dir="./logs",
    save_steps=100,
    save_total_limit=2,
    report_to="wandb"  # enable Weights & Biases logging if W&B is setup
)

# Define a metrics function using seqeval for NER
import numpy as np
import evaluate
# Metric object loaded through the `evaluate` library; compute_metrics calls its
# .compute(predictions=..., references=...) method.
seqeval = evaluate.load("seqeval")

def compute_metrics(predictions_and_labels):
    """Score token-level predictions with the seqeval metric.

    Args:
        predictions_and_labels: Pair ``(predictions, labels)``. ``predictions``
            holds per-class scores with the classes on axis 2; ``labels`` holds
            the reference label ids, with -100 marking positions to skip.

    Returns:
        Dict with the "precision", "recall", "f1" and "accuracy" values taken
        from the overall_* entries of the seqeval result.
    """
    scores, gold_ids = predictions_and_labels
    # Highest-scoring class per position
    predicted_ids = np.argmax(scores, axis=2)

    references, hypotheses = [], []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        gold_names, predicted_names = [], []
        for gold_id, predicted_id in zip(gold_row, predicted_row):
            if gold_id == -100:
                # Ignored position: drop it from both sequences
                continue
            gold_names.append(id_to_label[gold_id])
            predicted_names.append(id_to_label[predicted_id])
        references.append(gold_names)
        hypotheses.append(predicted_names)

    results = seqeval.compute(predictions=hypotheses, references=references)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.