# Labelling a NER dataset for retraining with SpanMarker

In [1]:
import argilla as rg
from datasets import load_dataset
from span_marker import SpanMarkerModel 

In [2]:
# Connection settings for the Argilla server. Adapt them to your deployment:
#   api_url   -> the URL of your HF Spaces deployment if using Spaces
#   api_key   -> your custom API key, if you configured one
#   workspace -> the name of your workspace
argilla_settings = {
    "api_url": "https://ignacioct-argilla.hf.space",
    "api_key": "owner.apikey",
    "workspace": "admin",
}
rg.init(**argilla_settings)

In [3]:
# "es" configuration of the CoNLL-2002 dataset: the data we want to label.
dataset = load_dataset("tomaarsen/conll2002", "es")

# Pretrained SpanMarker checkpoint that will provide the suggestions.
model = SpanMarkerModel.from_pretrained(
    "tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super"
)

# Last expression of the cell: displays the available splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'sentence_id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 8323
    })
    validation: Dataset({
        features: ['id', 'document_id', 'sentence_id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1915
    })
    test: Dataset({
        features: ['id', 'document_id', 'sentence_id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1517
    })
})

In [4]:
# Peek at the first five rows of the training split.
first_rows = dataset["train"].select(range(5))
for sample in first_rows:
    print(sample)

{'id': '0', 'document_id': 0, 'sentence_id': 0, 'tokens': ['Melbourne', '(', 'Australia', ')', ',', '25', 'may', '(', 'EFE', ')', '.'], 'pos_tags': [29, 21, 29, 22, 13, 59, 28, 21, 28, 22, 20], 'ner_tags': [5, 0, 5, 0, 0, 0, 0, 0, 3, 0, 0]}
{'id': '1', 'document_id': 0, 'sentence_id': 1, 'tokens': ['-'], 'pos_tags': [16], 'ner_tags': [0]}
{'id': '2', 'document_id': 0, 'sentence_id': 2, 'tokens': ['El', 'Abogado', 'General', 'del', 'Estado', ',', 'Daryl', 'Williams', ',', 'subrayó', 'hoy', 'la', 'necesidad', 'de', 'tomar', 'medidas', 'para', 'proteger', 'al', 'sistema', 'judicial', 'australiano', 'frente', 'a', 'una', 'página', 'de', 'internet', 'que', 'imposibilita', 'el', 'cumplimiento', 'de', 'los', 'principios', 'básicos', 'de', 'la', 'Ley', '.'], 'pos_tags': [4, 28, 1, 40, 28, 13, 47, 28, 13, 47, 38, 4, 28, 40, 49, 28, 40, 49, 40, 28, 1, 1, 38, 40, 7, 28, 40, 28, 35, 47, 4, 28, 40, 4, 28, 1, 40, 4, 28, 20], 'ner_tags': [0, 1, 2, 2, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [5]:
# Names of the NER classes declared by the dataset's `ner_tags` feature.
ner_feature = dataset["train"].features["ner_tags"]
labels = ner_feature.feature.names
print(labels)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [6]:
# Names declared by the `pos_tags` feature. Note that, despite its name,
# `fine_labels` holds the part-of-speech tag names, not NER labels.
pos_feature = dataset["train"].features["pos_tags"]
fine_labels = pos_feature.feature.names
print(fine_labels)

['AO', 'AQ', 'CC', 'CS', 'DA', 'DE', 'DD', 'DI', 'DN', 'DP', 'DT', 'Faa', 'Fat', 'Fc', 'Fd', 'Fe', 'Fg', 'Fh', 'Fia', 'Fit', 'Fp', 'Fpa', 'Fpt', 'Fs', 'Ft', 'Fx', 'Fz', 'I', 'NC', 'NP', 'P0', 'PD', 'PI', 'PN', 'PP', 'PR', 'PT', 'PX', 'RG', 'RN', 'SP', 'VAI', 'VAM', 'VAN', 'VAP', 'VAS', 'VMG', 'VMI', 'VMM', 'VMN', 'VMP', 'VMS', 'VSG', 'VSI', 'VSM', 'VSN', 'VSP', 'VSS', 'Y', 'Z']


In [7]:
# Run the model on the first five training rows to see what it predicts.
preview_rows = dataset["train"].select(range(5))
for sample in preview_rows:
    entities = model.predict(sample["tokens"])
    print(entities)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[{'span': ['Melbourne'], 'label': 'location-GPE', 'score': 0.907048225402832, 'word_start_index': 0, 'word_end_index': 1}, {'span': ['Australia'], 'label': 'location-GPE', 'score': 0.9773393273353577, 'word_start_index': 2, 'word_end_index': 3}]
[]
[{'span': ['Daryl', 'Williams'], 'label': 'person-other', 'score': 0.7660976648330688, 'word_start_index': 6, 'word_end_index': 8}]
[{'span': ['Victoria'], 'label': 'location-GPE', 'score': 0.6540813446044922, 'word_start_index': 18, 'word_end_index': 19}, {'span': ['Australia'], 'label': 'location-GPE', 'score': 0.97244793176651, 'word_start_index': 20, 'word_end_index': 21}, {'span': ['CrimeNet'], 'label': 'product-software', 'score': 0.3033939301967621, 'word_start_index': 59, 'word_end_index': 60}]
[]


In [8]:
def _word_char_spans(words):
    """Return the (start, end) character span of each word in ``" ".join(words)``.

    The offsets are computed arithmetically rather than by searching the
    joined text, so every span is exact (including for empty tokens) and no
    substring search is needed.
    """
    spans = []
    cursor = 0
    for word in words:
        end = cursor + len(word)
        spans.append((cursor, end))
        cursor = end + 1  # step over the single space added by " ".join
    return spans


# Build records for the first 20 examples
records = []

for sample in dataset["train"].select(range(20)):

    # Group up the tokenized text, the raw text and the predictions.
    tokenized_text = sample["tokens"]
    raw_text = " ".join(tokenized_text)  # the text is rebuilt by joining tokens with spaces
    predictions = model.predict(tokenized_text)

    # The predictions only carry the start and end *word* indexes, but
    # TokenClassificationRecord needs *character* indexes, so we map every
    # word to its (start, end) character span in `raw_text`.
    word_indices = _word_char_spans(tokenized_text)

    # Add the character indexes to the predictions so they can be attached below.
    for p in predictions:
        p["start_char_index"] = word_indices[p["word_start_index"]][0]
        # `word_end_index` is exclusive: the last word of the span is at index - 1.
        p["end_char_index"] = word_indices[p["word_end_index"] - 1][1]

    # Build the TokenClassificationRecord
    records.append(
        rg.TokenClassificationRecord(
            text=raw_text,
            tokens=tokenized_text,
            prediction=[
                (p["label"], p["start_char_index"], p["end_char_index"], p["score"])
                for p in predictions
            ],
            prediction_agent="tomaarsen/span-marker-xlm-roberta-base-fewnerd-fine-super",
        )
    )

# Log the records to Argilla
rg.log(records, name="conll2002_es", metadata={"split": "train"})

Output()

BulkResponse(dataset='conll2002_es', processed=20, failed=0)