## Login with Huggingface Hub

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load the dataset

In [2]:
from datasets import load_dataset
from pprint import pprint

raw_dataset = load_dataset('tomaarsen/MultiCoNER', 'multi')

pprint(raw_dataset['train'][0])

{'id': 0,
 'ner_tags': [0, 0, 0, 1, 2, 0, 1, 0, 7, 8, 0, 1, 2, 2],
 'tokens': ['his',
            'playlist',
            'includes',
            'sonny',
            'sharrock',
            ',',
            'gza',
            ',',
            'country',
            'teasers',
            'and',
            'the',
            'notorious',
            'b.i.g.']}


In [3]:
label_list = raw_dataset['train'].features[f'ner_tags'].feature.names
label_list

['O',
 'B-PER',
 'I-PER',
 'B-LOC',
 'I-LOC',
 'B-CORP',
 'I-CORP',
 'B-GRP',
 'I-GRP',
 'B-PROD',
 'I-PROD',
 'B-CW',
 'I-CW']

# Load the Tokenizer

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased')

In [5]:
example = raw_dataset['train'][0]
# Tokenize the first example in the training set
tokenized_input = tokenizer(
    example['tokens'],
    is_split_into_words=True,
    return_offsets_mapping=True,
)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['[CLS]',
 'his',
 'play',
 '##list',
 'includes',
 'son',
 '##ny',
 's',
 '##har',
 '##rock',
 ',',
 'g',
 '##za',
 ',',
 'country',
 'tease',
 '##rs',
 'and',
 'the',
 'notorious',
 'b',
 '.',
 'i',
 '.',
 'g',
 '.',
 '[SEP]']

In [6]:
def tokenize_and_align_labels(examples):
    toeknized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = toeknized_inputs.word_ids(batch_index=i)
        # Map the token to the original label
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are ignored in loss
            if word_idx is None:
                label_ids.append(-100)
            elif previous_word_idx is None or word_idx != previous_word_idx:
                # Start of a new word
                label_ids.append(label[word_idx])
            else:
                # Continuing the same word, repeat the previous label
                label_ids.append(label[previous_word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    toeknized_inputs['labels'] = labels
    return toeknized_inputs

In [7]:
tokenized_datasets = raw_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/471911 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluate

In [9]:
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np

labels = [label_list[i] for i in example[f'ner_tags']]

def compute_metrics(p):
    """
    Computes the precision, recall, and F1 score of the model predictions.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)  # Get the predicted class indices
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for (prediction, label) in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for (prediction, label) in zip(predictions, labels)
    ]
    
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Train

In [11]:
LABEL_TO_ID = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-CORP": 5,
    "I-CORP": 6,
    "B-GRP": 7,
    "I-GRP": 8,
    "B-PROD": 9,
    "I-PROD": 10,
    "B-CW": 11,
    "I-CW": 12,
}
ID_TO_LABEL = {i: label for i, label in enumerate(LABEL_TO_ID)}

In [12]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

model = AutoModelForTokenClassification.from_pretrained(
    'google-bert/bert-base-cased',
    num_labels=len(LABEL_TO_ID),
    id2label=ID_TO_LABEL,
    label2id=LABEL_TO_ID,
)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy='epoch',  # Evaluate every epoch
    save_strategy='epoch',  # Save the model every epoch
    load_best_model_at_end=True,  # Load the best model when finished training
    save_total_limit=2,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Pass the compute_metrics function
)

trainer.train()

# Push the model to the Hub
trainer.push_to_hub(commit_message="Training complete")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3919,0.366982,1.0,1.0,1.0,1.0
2,0.3088,0.325136,1.0,1.0,1.0,1.0
3,0.2563,0.320791,1.0,1.0,1.0,1.0


CommitInfo(commit_url='https://huggingface.co/Blusque/results/commit/82034281b531802d43366378fe479e9bc2a0949a', commit_message='Training complete', commit_description='', oid='82034281b531802d43366378fe479e9bc2a0949a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Blusque/results', endpoint='https://huggingface.co', repo_type='model', repo_id='Blusque/results'), pr_revision=None, pr_num=None)