# Install Required Libraries

In [None]:
%pip install transformers datasets seqeval accelerate




In [29]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import torch

# Setup

In [33]:
model_name = "xlm-roberta-base"
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768

# 2. Load and parse CoNLL data

In [None]:
def read_conll(filepath):
    sentences, labels = [], []
    with open(filepath, encoding="utf-8") as f:
        tokens, tags = [], []
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue
            parts = line.split()
            if len(parts) == 2:
                tokens.append(parts[0])
                tags.append(parts[1])
    return sentences, labels

tokens, tags = read_conll("../data/labeled_conll/labeled_50.conll")
data = [{"tokens": t, "ner_tags": l} for t, l in zip(tokens, tags)]
raw_dataset = Dataset.from_list(data)

# 3. Tokenize + Align labels

In [35]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)
    word_ids = tokenized_inputs.word_ids()

    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]])
        else:
            label_ids.append(label_to_id[example["ner_tags"][word_idx]])  # or I- tag variation
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs
# Split data before tokenizing
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

# Tokenize and align both splits
tokenized_train = train_dataset.map(tokenize_and_align_labels)
tokenized_eval = eval_dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/2532 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

# 4. Training arguments

In [36]:
training_args = TrainingArguments(
    output_dir="./main_training_output",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# 5. Evaluation metrics

In [37]:
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[label] for label in label_seq if label != -100]
        for label_seq in labels
    ]
    true_preds = [
        [label_list[pred] for pred, label in zip(pred_seq, label_seq) if label != -100]
        for pred_seq, label_seq in zip(predictions, labels)
    ]
    return {
        "f1": f1_score(true_labels, true_preds),
        "report": classification_report(true_labels, true_preds)
    }

# 6. Trainer and Train

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
10,0.0024
20,0.0184
30,0.0179
40,0.0039
50,0.0262
60,0.0081
70,0.0062
80,0.0041
90,0.0109
100,0.0057


TrainOutput(global_step=1585, training_loss=0.0042558221849363165, metrics={'train_runtime': 667.8431, 'train_samples_per_second': 18.957, 'train_steps_per_second': 2.373, 'total_flos': 827041619543040.0, 'train_loss': 0.0042558221849363165, 'epoch': 5.0})

# evaluation

In [42]:
trainer.evaluate()

{'eval_loss': 0.013809074647724628,
 'eval_f1': 0.9931010083141695,
 'eval_report': '              precision    recall  f1-score   support\n\n         LOC       1.00      1.00      1.00       513\n       PRICE       1.00      1.00      1.00       915\n     PRODUCT       0.99      0.99      0.99      1400\n\n   micro avg       0.99      0.99      0.99      2828\n   macro avg       1.00      0.99      0.99      2828\nweighted avg       0.99      0.99      0.99      2828\n',
 'eval_runtime': 5.7768,
 'eval_samples_per_second': 109.749,
 'eval_steps_per_second': 13.848,
 'epoch': 5.0}

In [None]:
trainer.save_model("../models/trained_models/fine_tunned_XLM-Roberta")