# Training on Wikiann

In this notebook, we use [MAD-X 2.0](https://arxiv.org/pdf/2012.15562.pdf) with a stacked language and task adapter setup to zero-shot cross-lingual transfer for NER.
We use a NER adapter from [AdapterHub.ml](https://adapterhub.ml/explore) pre-trained on the **English** portion of the [WikiAnn](https://www.aclweb.org/anthology/P17-1178.pdf) dataset and transfer to **Guarani** with a pre-trained language adapter.
This notebook is similar to the 'run_ner.py' example script in 'examples/pytorch/token-classification/'.

First, let's install 'adapters' and other required packages

In [1]:
!pip install -Uq adapters
!pip install -q datasets
!pip install -q seqeval
!pip install -Uq accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/251.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/251.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/251.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Next, we initialize the tokenizer and the model with the correct labels.

In [2]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, AutoConfig
from adapters import init

#The labels for the NER task and the dictionaries to map the to ids or
#the other way around
labels = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
id_2_label = {id_: label for id_, label in enumerate(labels)}
label_2_id = {label: id_ for id_, label in enumerate(labels)}

model_name = "bert-base-multilingual-cased"
config = AutoConfig.from_pretrained(model_name, num_labels=len(labels), label2id=label_2_id, id2label=id_2_label)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)

# Enable adapter support
init(model)
print(model.get_labels())



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


Now, we load the task and the language adapter. For both adapters, we drop the adapter in the last layer following MAD-X 2.0. We then set both adapters as active adapters.

In [3]:
from adapters import AdapterConfig
from adapters.composition import Stack

target_language = "gn" # Choose any language that a bert-base-multilingual-cased language adapter is available for
source_language = "en" # We support  "en", "ja", "zh", and "ar"

adapter_config = AdapterConfig.load(
    None,
    leave_out=[11]
)

model.load_adapter(
    "wikiann/" + source_language + "@ukp",
    config=adapter_config,
    load_as="wikiann",
)

lang_adapter_name = model.load_adapter(
    target_language + "/wiki@ukp",
    load_as=target_language,
    leave_out=[11],
)
# Set the adapters to be used in every forward pass
model.set_active_adapters(Stack(lang_adapter_name, "wikiann"))

Next, we can download the dataset and initialize the trainings arguments.

In [4]:
from datasets import load_dataset
from transformers import TrainingArguments

datasets = load_dataset('wikiann', target_language)

training_args = TrainingArguments(
    per_device_eval_batch_size=64,
    do_predict=True,
    output_dir="ner_models/madx/",
)

This method is taken from the example script 'run_ner.py'. It prepares the input tokens such that they are tokenized by the correct tokenizer and the labels are adapted to the new tokenization.

In [5]:
# This method is adapted from the huggingface transformers run_ner.py example script
# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
    text_column_name = "tokens"
    label_column_name = "ner_tags"
    tokenized_inputs = tokenizer(
        examples[text_column_name],
        padding=False,
        truncation=True,
        # We use this argument because the texts in our dataset are lists of words (with a label for each word).
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples[label_column_name]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

We apply the previous method to the test dataset to prepare it for prediction.

In [6]:
from transformers import DataCollatorForTokenClassification
test_dataset = datasets["test"]
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

data_collator = DataCollatorForTokenClassification(tokenizer,)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

We use HuggingFace's `Trainer` class to evaluate zero-shot transfer on the WikiAnn test dataset.

In [7]:
from adapters import AdapterTrainer
from datasets import load_metric
import numpy as np


# Metrics
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    label_list = id_2_label

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=None,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Finally we can predict the labels for the test set and evaluate he predictions.

In [8]:
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.0064687728881836,
 'eval_precision': 0.437125748502994,
 'eval_recall': 0.6952380952380952,
 'eval_f1': 0.5367647058823529,
 'eval_accuracy': 0.784037558685446,
 'eval_runtime': 5.3398,
 'eval_samples_per_second': 18.727,
 'eval_steps_per_second': 0.375}