Doing named entity recognition (NER) in multiple languages

Use XLM-RoBERTa (XLM-R) for multi-language stuff, because it is trained on Wikipedia for every available language and 2.5 TB of data from the Internet

RoBERTa improved on BERT by removing the next sentence prediction task

XLM-R uses the SentencePiece tokenizer (trained on 100 languages)

In [1]:
%pip install datasets
%pip install seqeval

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

# Sentence Piece Tokenization

In [2]:
from transformers import AutoTokenizer

xlm_r_model = "xlm-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(xlm_r_model)

ex_text = "This is some sample text!"

tokens = tokenizer(ex_text).tokens()

print(tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

['<s>', '▁This', '▁is', '▁some', '▁sample', '▁text', '!', '</s>']


# Look at Dataset

Using WIKIANN / PANN-X (Wikipedia articles in many languages)

In [4]:
from datasets import load_dataset
from datasets import DatasetDict

from collections import defaultdict # Dict with no KeyError exceptions

langs_used = ["de", "fr", "en", "it"]
lang_proportions = [0.6, 0.2, 0.1, 0.1]

pan_x_dataset = defaultdict(DatasetDict)

# Create dataset with all selected languages

for (lang, prop) in zip(langs_used, lang_proportions):
    # Load dataset for individual language
    dataset = load_dataset("xtreme", name=f"PAN-X.{lang}")

    # Shuffle and split according to proportion
    for split in dataset:
        pan_x_dataset[lang][split] = dataset[split].shuffle().select(range(int(prop * dataset[split].num_rows)))



validation-00000-of-00001.parquet:   0%|          | 0.00/472k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/472k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

train-00000-of-00001.parquet:   0%|          | 0.00/932k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/459k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/464k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [5]:
import pandas as pd

# See number of samples per language

pd.DataFrame({
    lang: [pan_x_dataset[lang]["train"].num_rows]
    for lang in langs_used
}, index=["Number of samples"])

Unnamed: 0,de,fr,en,it
Number of samples,12000,4000,2000,2000


## Create tags

In [6]:
tags = pan_x_dataset["de"]["train"].features["ner_tags"].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [7]:
def create_tag_names(batch):
    return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}

In [8]:
pan_x_de = pan_x_dataset["de"].map(create_tag_names)
pan_x_fr = pan_x_dataset["fr"].map(create_tag_names)
pan_x_en = pan_x_dataset["en"].map(create_tag_names)
pan_x_it = pan_x_dataset["it"].map(create_tag_names)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Load Custom XLM-R Model

In [9]:
from transformers import AutoConfig

index_to_tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag_to_index = {tag: idx for idx, tag in enumerate(tags.names)}

xlm_r_config = AutoConfig.from_pretrained(xlm_r_model, num_labels=tags.num_classes,
                                                        id2label=index_to_tag,
                                                        label2id=tag_to_index)

# Tokenize Dataset

In [10]:
def tokenize_function(examples):

    tokenized_input = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    # Also need to align label IDs to tokens
    labels = []

    for index, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_input.word_ids(batch_index=index)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100) # Ignore tokens associated w/ consecutive subwords (e.g. ignore 2nd part of Jeff ##rey)
            else:
                label_ids.append(label[word_idx])

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_input["labels"] = labels

    return tokenized_input

In [11]:
def encode_pan_x_dataset(single_lang_subbset):

  return single_lang_subbset.map(tokenize_function, batched=True, remove_columns=["langs", "ner_tags", "tokens"])

In [12]:
pan_x_de_encoded = encode_pan_x_dataset(pan_x_dataset["de"])
pan_x_fr_encoded = encode_pan_x_dataset(pan_x_dataset["fr"])
pan_x_en_encoded = encode_pan_x_dataset(pan_x_dataset["en"])
pan_x_it_encoded = encode_pan_x_dataset(pan_x_dataset["it"])

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Evaluation Metrics

Use seqeval library to get precision, recall, $F_1$ score while training

In [13]:
from seqeval.metrics import classification_report

In [25]:
# Take model ouptut and convert to list for seqeval while also ignoring label for consecutive subwords

import numpy as np

def convert_predictions_to_list(predictions, label_ids):
  preds = np.argmax(predictions, axis=2)

  batch_size, seq_len = preds.shape

  labels_list = []
  predictions_list = []

  for batch_index in range(batch_size):
    sample_predictions = []
    sample_labels = []

    for seq_idx in range(seq_len):
      if label_ids[batch_index, seq_idx] != -100:
        sample_predictions.append(index_to_tag[preds[batch_index][seq_idx]])
        sample_labels.append(index_to_tag[label_ids[batch_index][seq_idx]])

    predictions_list.append(sample_predictions)
    labels_list.append(sample_labels)

  return predictions_list, labels_list

# Fine Tune on Multiple Languages At Once

## Compute Metrics

In [26]:
# So model can compute f1 score

from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = convert_predictions_to_list(eval_pred.predictions, eval_pred.label_ids)
    return {"f1": f1_score(labels, predictions)}

## Data Collator

Pad each input sequence to the largest sequence length in a batch

In [27]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Model Init

To load untrained model

In [28]:
import torch
from transformers import AutoModelForTokenClassification

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") # Use CPU if available


def model_init():
    return AutoModelForTokenClassification.from_pretrained(xlm_r_model, config=xlm_r_config).to(device)

## Concatenate language datasets

In [29]:
from datasets import concatenate_datasets

def concatenate_splits(langs_data):
  multi_lang = DatasetDict()

  for split in langs_data[0].keys():
    # Convert generator expression to a list before passing it to concatenate_datasets
    datasets_to_concatenate = [lang[split] for lang in langs_data]
    multi_lang[split] = concatenate_datasets(datasets_to_concatenate).shuffle()

  return multi_lang

In [30]:
langs_encoded = []

for lang in langs_used:
  langs_encoded.append(encode_pan_x_dataset(pan_x_dataset[lang]))

multi_lang_encoded = concatenate_splits(langs_encoded)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Set Hyper-parameters

In [31]:
from transformers import TrainingArguments

batch_size = 64
logging_steps = len(multi_lang_encoded["train"]) # batch_size
model_name = f"{xlm_r_model}-finetuned-panx-de-fr-en-it"

training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=3,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch", # Save model at each epoch
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level = "error",
                                  report_to="none", # Disable WandB logging
                                  load_best_model_at_end=True,
                                  )



## Train Model

In [32]:
from transformers import Trainer

trainer = Trainer(model_init=model_init,
                  args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=multi_lang_encoded["train"],
                  eval_dataset=multi_lang_encoded["validation"],
                  )

trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.243974,0.805474
2,No log,0.219246,0.829159
3,No log,0.214413,0.838277


TrainOutput(global_step=939, training_loss=0.24440684943153454, metrics={'train_runtime': 782.8602, 'train_samples_per_second': 76.642, 'train_steps_per_second': 1.199, 'total_flos': 1613074125605184.0, 'train_loss': 0.24440684943153454, 'epoch': 3.0})

## Evaluate Model

In [43]:
def get_f1_score(trainer, eval_dataset):
  return trainer.predict(eval_dataset).metrics["test_f1"]


f1_scores = defaultdict(dict)

for idx, lang in enumerate(langs_used):
  f1_scores["all"][lang] = get_f1_score(trainer, langs_encoded[idx]["test"])

# scores_data = {"all": f1_scores}

f1_scores_df = pd.DataFrame.from_dict(f1_scores, orient="index").round(4)
f1_scores_df.rename_axis(index="Fine-tuned on", columns="Evaluated on", inplace=True)

f1_scores_df

Evaluated on,de,fr,en,it
Fine-tuned on,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
all,0.8446,0.8595,0.7721,0.8522
