## Login with Huggingface Hub

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load the dataset

In [3]:
from datasets import load_dataset
from pprint import pprint

raw_dataset = load_dataset('tomaarsen/MultiCoNER', 'multi')

pprint(raw_dataset['train'][0])

{'id': 0,
 'ner_tags': [0, 0, 0, 1, 2, 0, 1, 0, 7, 8, 0, 1, 2, 2],
 'tokens': ['his',
            'playlist',
            'includes',
            'sonny',
            'sharrock',
            ',',
            'gza',
            ',',
            'country',
            'teasers',
            'and',
            'the',
            'notorious',
            'b.i.g.']}


In [4]:
pprint(raw_dataset['train'][10])

{'id': 10,
 'ner_tags': [0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              11,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0,
              0],
 'tokens': ['she',
            'became',
            'opposed',
            'to',
            'abortion',
            'in',
            '1992',
            'while',
            'attending',
            'a',
            'bible',
            'study',
            'and',
            'has',
            'since',
            'spoken',
            'out',
            'about',
            'how',
            'abortion',
            'has',
            'negatively',
            'impacted',
            'her',
            'life',
            '.']}


In [3]:
label_list = raw_dataset['train'].features[f'ner_tags'].feature.names
label_list

['O',
 'B-PER',
 'I-PER',
 'B-LOC',
 'I-LOC',
 'B-CORP',
 'I-CORP',
 'B-GRP',
 'I-GRP',
 'B-PROD',
 'I-PROD',
 'B-CW',
 'I-CW']

# Load the Tokenizer

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-cased')

In [16]:
example = raw_dataset['train'][1]
print(example['tokens'])
# Tokenize the first example in the training set
tokenized_input = tokenizer(
    example['tokens'],
    truncation=True,
    is_split_into_words=True,
)
word_ids = tokenized_input.word_ids(batch_index=0)
pprint(example['ner_tags'])
pprint(word_ids)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['it', 'is', 'a', 'series', 'of', 'badminton', 'tournaments', ',', 'sanctioned', 'by', 'badminton', 'world', 'federation', '(', 'bwf', ')', 'since', '2007', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 0, 0, 0, 0, 0]
[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 14,
 14,
 15,
 16,
 17,
 18,
 None]


['[CLS]',
 'it',
 'is',
 'a',
 'series',
 'of',
 'badminton',
 'tournaments',
 ',',
 'sanctioned',
 'by',
 'badminton',
 'world',
 'federation',
 '(',
 'b',
 '##w',
 '##f',
 ')',
 'since',
 '2007',
 '.',
 '[SEP]']

In [18]:
def tokenize_and_align_labels(examples):
    toeknized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = toeknized_inputs.word_ids(batch_index=i)
        # Map the token to the original label
        previous_word_idx = None

        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are ignored in loss
            if word_idx is None:
                label_ids.append(-100)
            elif previous_word_idx is None or word_idx != previous_word_idx:
                # Start of a new word
                label_ids.append(label[word_idx])
            else:
                # Continuing the same word, repeat the previous label
                label_ids.append(label[previous_word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    toeknized_inputs['labels'] = labels
    return toeknized_inputs

In [26]:
tokenized_datasets = raw_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=raw_dataset['train'].column_names)
print(tokenized_datasets['train'][0])

{'input_ids': [101, 1117, 1505, 7276, 2075, 1488, 3382, 188, 7111, 10411, 117, 176, 3293, 117, 1583, 20826, 1733, 1105, 1103, 14140, 171, 119, 178, 119, 176, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 1, 1, 2, 2, 2, 0, 1, 1, 0, 7, 8, 8, 0, 1, 2, 2, 2, 2, 2, 2, 2, -100]}


In [27]:
max_length = 0
for i, example in enumerate(tokenized_datasets['train']):
    if len(example['input_ids']) > max_length:
        max_length = len(example['labels'])
print(max_length)

159


In [5]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluate

In [6]:
import evaluate

seqeval = evaluate.load("seqeval")

In [10]:
import numpy as np

labels = [label_list[i] for i in example[f'ner_tags']]

def compute_metrics(p):
    """
    Computes the precision, recall, and F1 score of the model predictions.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)  # Get the predicted class indices
    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for (prediction, label) in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for (prediction, label) in zip(predictions, labels)
    ]
    
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Train

In [8]:
LABEL_TO_ID = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-LOC": 3,
    "I-LOC": 4,
    "B-CORP": 5,
    "I-CORP": 6,
    "B-GRP": 7,
    "I-GRP": 8,
    "B-PROD": 9,
    "I-PROD": 10,
    "B-CW": 11,
    "I-CW": 12,
}
ID_TO_LABEL = {i: label for i, label in enumerate(LABEL_TO_ID)}

In [19]:
weights = np.zeros(len(LABEL_TO_ID), dtype=np.float32)
# pprint(tokenized_datasets)
# pprint(tokenized_datasets['train'].features)
# pprint(np.unique(tokenized_datasets['train'][0]['ner_tags'], return_counts=True))

for i in range(len(tokenized_datasets['train'])):
    unique, counts = np.unique(tokenized_datasets['train'][i]['ner_tags'], return_counts=True)
    weights[unique] += counts
# Normalize weights
weights = weights / tokenized_datasets['train'].num_rows
# Inverse frequency to give more weight to rare labels
weights = 1.0 / weights
# Normalize again to prevent exploding gradients
weights = weights / np.max(weights)
print("Class weights for loss function:")
for label_id, weight in enumerate(weights):
    print(f"{ID_TO_LABEL[label_id]}: {weight:.4f}")

Class weights for loss function:
O: 0.0111
B-PER: 0.5688
I-PER: 0.4536
B-LOC: 0.4627
I-LOC: 0.6537
B-CORP: 0.7601
I-CORP: 0.6668
B-GRP: 0.7612
I-GRP: 0.5372
B-PROD: 0.7119
I-PROD: 1.0000
B-CW: 0.6515
I-CW: 0.4118


In [31]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn

model = AutoModelForTokenClassification.from_pretrained(
    # 'dslim/distilbert-NER',
    'Babelscape/wikineural-multilingual-ner',  # Use a BERT model
    num_labels=len(LABEL_TO_ID),
    id2label=ID_TO_LABEL,
    label2id=LABEL_TO_ID,
    ignore_mismatched_sizes=True,  # Allow for mismatched sizes in the model and dataset
)

from pprint import pprint

pprint(model)

for name, param in model.named_parameters():
    print(f"{name}: {param.size()}")  # Print the size of each parameter for debugging
    if 'classifier' in name:
        # Freeze all parameters except the classifier layer
        param.requires_grad = True
    else:
        # Freeze other parameters to prevent overfitting
        param.requires_grad = False

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
import torch.nn as nn

model = AutoModelForTokenClassification.from_pretrained(
    'Babelscape/wikineural-multilingual-ner',
    num_labels=len(LABEL_TO_ID),
    id2label=ID_TO_LABEL,
    label2id=LABEL_TO_ID,
    ignore_mismatched_sizes=True,  # Allow for mismatched sizes in the model and dataset
)

def loss_function(outputs, labels, num_items_in_batch):
    """
    Custom loss function to handle class imbalance using the weights we computed.
    """
    logits = outputs.logits
    loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32).to(logits.device))
    
    # Reshape logits and labels to be compatible with CrossEntropyLoss
    logits = logits.view(-1, model.config.num_labels)
    labels = labels.view(-1)

    # Only compute loss on non-ignored labels
    loss = loss_fct(logits, labels)
    
    return loss

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.0001,
    eval_strategy='steps',  # Evaluate every epoch
    load_best_model_at_end=True,  # Load the best model when finished training
    save_total_limit=2,
    push_to_hub=True,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Pass the compute_metrics function
    compute_loss_func=loss_function,  # Use the custom loss function
)

trainer.train()

# Push the model to the Hub
trainer.push_to_hub(commit_message="Training complete")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at dslim/distilbert-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,2.108,1.792567,0.085803,0.355505,0.138241,0.303145
1000,1.7342,1.592442,0.101906,0.439446,0.165446,0.44713


KeyboardInterrupt: 