In [None]:
!pip install transformers
!pip install datasets
!pip install seqeval

In [48]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.nn.functional import cross_entropy
from transformers import AutoConfig, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
from transformers import TrainingArguments, Trainer

from seqeval.metrics import classification_report, f1_score

In [4]:
# Name of the pretrained checkpoint used for the tokenizer, config and model.
xlmr_model_name = "xlm-roberta-base"

# Load the "PAN-X.de" configuration of the XTREME benchmark (train/validation/test).
panx_de = load_dataset("xtreme", name="PAN-X.de")
# get list of tags
# `ner_tags` is the ClassLabel feature: it holds the tag names and converts
# between integer ids and tag strings.
ner_tags = panx_de["train"].features["ner_tags"].feature
print(f"List of tags: {ner_tags}")

Downloading builder script:   0%|          | 0.00/9.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

Downloading and preparing dataset xtreme/PAN-X.de (download: 223.17 MiB, generated: 9.08 MiB, post-processed: Unknown size, total: 232.25 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

List of tags: ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [5]:
# ClassLabel.int2str - to get string value for integer ner_tag

def create_tag_names(batch):
    """Return a ``ner_tags_str`` column holding the string form of each tag id.

    ``batch["ner_tags"]` is iterated and every integer id is converted with
    the module-level ``ner_tags`` ClassLabel feature.
    """
    tag_strings = []
    for tag_id in batch["ner_tags"]:
        tag_strings.append(ner_tags.int2str(tag_id))
    return {"ner_tags_str": tag_strings}
# Keep a fixed, reproducible subset of the training split and add string tags.
# The guard makes the cell safe to re-run: without it, every execution would
# shuffle and shrink the already-reduced training split again.
TRAIN_FRACTION = 0.629  # share of the shuffled training split to keep
SHUFFLE_SEED = 42

if "ner_tags_str" not in panx_de["train"].column_names:
    n_train_kept = int(TRAIN_FRACTION * panx_de["train"].num_rows)
    panx_de["train"] = (panx_de["train"]
                        .shuffle(seed=SHUFFLE_SEED)
                        .select(range(n_train_kept)))
    panx_de = panx_de.map(create_tag_names)



  0%|          | 0/12580 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [6]:
# Pick one training example and show its tokens above their string tags.
de_example = panx_de["train"][10]
pd.DataFrame([de_example["tokens"], de_example["ner_tags_str"]])

Unnamed: 0,0,1,2,3,4
0,','',Malta,'','
1,O,O,B-LOC,O,O


In [7]:
# Show the raw example dict: tokens, integer tags, language codes and string tags.
print(de_example)

{'tokens': ["'", "''", 'Malta', "''", "'"], 'ner_tags': [0, 0, 5, 0, 0], 'langs': ['de', 'de', 'de', 'de', 'de'], 'ner_tags_str': ['O', 'O', 'B-LOC', 'O', 'O']}


In [8]:
# Load the tokenizer that matches the pretrained checkpoint.
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [9]:
# Tokenize a sample sentence to inspect the sub-word pieces and the
# <s> / </s> special tokens the tokenizer adds.
text = "London weather is hotter than Dubai today!"
xlmr_tokens = xlmr_tokenizer(text).tokens()
xlmr_tokens

['<s>',
 '▁London',
 '▁weather',
 '▁is',
 '▁ho',
 'tter',
 '▁than',
 '▁Dubai',
 '▁today',
 '!',
 '</s>']

In [10]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa-style encoder with a linear per-token classification head.

    The encoder is built without its pooling layer; every token's hidden
    state goes through dropout and a linear layer that produces
    ``config.num_labels`` logits.
    """
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder body (no pooling layer: outputs are consumed token by token).
        self.roberta = RobertaModel(config=config, add_pooling_layer=False)

        # Classification head: dropout, then hidden_size -> num_labels.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialise the weights of the freshly created modules.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Compute per-token logits and, when ``labels`` is given, the loss.

        Returns a ``TokenClassifierOutput`` with ``loss`` (``None`` without
        labels), ``logits`` and the encoder's ``hidden_states`` and
        ``attentions``. Extra keyword arguments are forwarded to the encoder.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # First element of the encoder output is the sequence of hidden states.
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)

        if labels is None:
            loss = None
        else:
            # Flatten batch and sequence dimensions so each token is one sample.
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
        


# Loading a custom model

In [11]:
# Lookup tables between integer label ids and tag strings (and back).
idx2tag = dict(enumerate(ner_tags.names))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

In [13]:
# Checkpoint configuration, overridden with the label set of this task.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=ner_tags.num_classes,
    id2label=idx2tag,
    label2id=tag2idx,
)

In [14]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained weights into the custom class and move the model to `device`.
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))
# Encode the sample sentence and show each token next to its vocabulary id.
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()],
             index=["tokens", "input ids"])

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'roberta

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,<s>,▁London,▁weather,▁is,▁ho,tter,▁than,▁Dubai,▁today,!,</s>
input ids,0,9020,92949,83,739,3055,3501,61069,18925,38,2


In [15]:
# Forward pass through the not-yet-fine-tuned model; `outputs` holds the logits.
outputs = xlmr_model(input_ids.to(device)).logits
# Most likely label id per token (argmax over the last, label, axis).
predictions = torch.argmax(outputs, dim=-1)
predictions

tensor([[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]], device='cuda:0')

In [16]:
# Logits shape: one sentence, 11 tokens, 7 labels.
outputs.shape

torch.Size([1, 11, 7])

In [17]:
# Convert the predicted label ids of the first (only) sentence to tag names.
preds = [ner_tags.names[p] for p in predictions[0].cpu().numpy()]
len(preds)

11

In [18]:
# Tokens next to the tags predicted before any fine-tuning.
pd.DataFrame([xlmr_tokens, preds],
             index=["tokens", "tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
tokens,<s>,▁London,▁weather,▁is,▁ho,tter,▁than,▁Dubai,▁today,!,</s>
tags,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG,I-ORG


In [19]:
# put all the above in a single function

def tag_text(text, tags, model, tokenizer):
    """Tag every token of ``text`` with the label predicted by ``model``.

    Args:
        text: Raw input string.
        tags: Object exposing ``names``, the list of tag strings indexed by
            label id (e.g. the dataset's ClassLabel feature).
        model: Callable returning a sequence whose first element is the
            logits tensor of shape (batch, tokens, labels).
        tokenizer: Tokenizer used for BOTH the displayed tokens and the model
            input ids, so the two rows of the result always line up.

    Returns:
        A ``pandas.DataFrame`` with a ``token`` row and a ``tag`` row.
    """
    # Tokenize once with the tokenizer that was passed in (not a global one).
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids

    # Run on whatever device the model lives on instead of relying on a global.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = input_ids.device
    input_ids = input_ids.to(target_device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        # get prediction distribution over the labels (classes)
        outputs = model(input_ids)[0]
    # take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # present nicely
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["token", "tag"])

# Tokenization for NER

In [20]:
# Word-level tokens and integer tags of the example sentence.
words, labels = de_example["tokens"], de_example["ner_tags"]
words, labels

(["'", "''", 'Malta', "''", "'"], [0, 0, 5, 0, 0])

In [21]:
# The input is already a list of words, hence is_split_into_words=True.
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
# Map the ids back to token strings to see the pieces and special tokens.
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['<s>', "▁'", "▁''", '▁Malta', "▁''", "▁'", '</s>']


In [22]:
# For every token, the index of the word it came from (None for special tokens).
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["tokens", "word ids"])

Unnamed: 0,0,1,2,3,4,5,6
tokens,<s>,▁',▁'',▁Malta,▁'',▁',</s>
word ids,,0,1,2,3,4,


In [23]:
# Align the word-level labels with the sub-word tokens: the first token of a
# word keeps the word's label; special tokens (word index None) and
# continuation pieces of the same word get -100.
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    else:
        label_ids.append(labels[word_idx])
    # Remember the word just seen so its later sub-tokens can be detected.
    previous_word_idx = word_idx

# Why -100: torch.nn.CrossEntropyLoss has an `ignore_index` attribute whose
# default value is -100, so targets with this value are ignored during
# training. We use it to mask special tokens and continuation sub-tokens.

# A separate name keeps the integer `labels` intact so this cell can be re-run.
label_names = [idx2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["tokens", "word ids", "label ids", "labels"]

pd.DataFrame([tokens, word_ids, label_ids, label_names], index=index)


Unnamed: 0,0,1,2,3,4,5,6
tokens,<s>,▁',▁'',▁Malta,▁'',▁',</s>
word ids,,0,1,2,3,4,
label ids,-100,0,0,5,0,0,-100
labels,IGN,O,O,B-LOC,O,O,IGN


In [24]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align labels to sub-tokens.

    For each sentence, the first token of every word receives that word's
    tag id; special tokens and further pieces of the same word receive -100.
    The aligned labels are stored under ``"labels"`` in the returned encoding.
    """
    tokenized_inputs = xlmr_tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        last_word = None
        row_labels = []
        for word_idx in tokenized_inputs.word_ids(batch_index=row):
            starts_new_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_tags[word_idx] if starts_new_word else -100)
            last_word = word_idx
        aligned_labels.append(row_labels)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

def encode_panx_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The original ``langs``, ``ner_tags`` and ``tokens`` columns are dropped
    from the result.
    """
    columns_to_drop = ['langs', 'ner_tags', 'tokens']
    encoded = corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=columns_to_drop,
    )
    return encoded


In [25]:
# Tokenize every split and attach the aligned labels.
panx_de_encoded = encode_panx_dataset(panx_de)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

# Performance measure

In [36]:
def align_predictions(predictions, label_ids):
    """Convert logits and label ids into per-sentence lists of tag strings.

    Positions whose label id is -100 are skipped for both predictions and
    labels. Tag strings come from the module-level ``idx2tag`` mapping.

    Returns:
        ``(preds_list, labels_list)``: two lists with one list of tag strings
        per sentence.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for row in range(batch_size):
        # Positions that carry a real label (not the ignore index).
        kept = [col for col in range(seq_len) if label_ids[row, col] != -100]
        labels_list.append([idx2tag[label_ids[row][col]] for col in kept])
        preds_list.append([idx2tag[preds[row][col]] for col in kept])

    return preds_list, labels_list

def compute_metrics(eval_pred):
    """Return ``{"f1": ...}`` computed by seqeval on the aligned tag sequences."""
    predicted_tags, reference_tags = align_predictions(
        eval_pred.predictions, eval_pred.label_ids
    )
    score = f1_score(reference_tags, predicted_tags)
    return {"f1": score}

# Fine-tune XLM-RoBERTa

Fine-tune the base model on the German dataset.

In [37]:
# Training hyper-parameters.
num_train_epochs = 3
batch_size = 24
# Number of full batches in the training split, so the training loss is
# logged about once per epoch.
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-german"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    # NOTE(review): presumably chosen far above the total step count so that
    # no intermediate checkpoint is written; confirm.
    save_steps=1e6,
    log_level="error",
    disable_tqdm=False,
    push_to_hub=False,
)

In [38]:
# Authenticate against the Hugging Face Hub.
# NOTE(review): the training arguments in this notebook set push_to_hub=False,
# so this login appears to be needed only if pushing is enabled; confirm.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [40]:
# Pad each input sequence to the longest sequence in its batch. For NER the
# labels must be padded together with the inputs.
# NOTE(review): padded label positions are expected to be filled with -100 so
# the loss ignores them; confirm against the collator's `label_pad_token_id`.

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [41]:
# Factory handed to the Trainer so it can build a fresh model instance itself.
def model_init():
    """Load the pretrained checkpoint into the custom class and move it to ``device``."""
    model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config
    )
    return model.to(device)

### Create Trainer

In [42]:
# Wire everything together: model factory, arguments, padding collator,
# metric function, and the encoded train/validation splits.
trainer = Trainer(model_init=model_init,
                  args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=panx_de_encoded["train"],
                  eval_dataset=panx_de_encoded["validation"],
                  tokenizer=xlmr_tokenizer
                  )

## Train model

In [43]:
# Fine-tune; loss and F1 on the validation split are reported after each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.2606,0.156386,0.828157
2,0.1262,0.137711,0.85578
3,0.08,0.139742,0.863697


TrainOutput(global_step=1575, training_loss=0.1553963493733179, metrics={'train_runtime': 546.2067, 'train_samples_per_second': 69.095, 'train_steps_per_second': 2.884, 'total_flos': 852892794573336.0, 'train_loss': 0.1553963493733179, 'epoch': 3.0})

## Use model

In [47]:
# Try the fine-tuned model on a short German sentence.
text_test = "Boris Johnson lebt in London"
tag_text(text_test, ner_tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6
token,<s>,▁Boris,▁Johnson,▁lebt,▁in,▁London,</s>
tag,O,B-PER,I-PER,O,O,B-LOC,O


## Error Analysis

In [56]:
def forward_pass_with_label(batch):
    """Run the trained model on one batch and return per-token loss and predictions.

    Args:
        batch: Dict of lists as passed by a batched ``Dataset.map``; must
            contain ``input_ids``, ``attention_mask`` and ``labels``. Any
            other column is ignored.

    Returns:
        Dict with ``"loss"`` (per-token cross-entropy, one row per example,
        padded to the longest sequence of the batch) and ``"predicted_label"``
        (argmax label id per token, same layout), both as numpy arrays.
    """
    # Only hand the collator the columns it can pad and turn into tensors;
    # string columns such as `ner_tags_str` cannot be converted to tensors.
    model_columns = ("input_ids", "attention_mask", "labels")
    # Convert the dict of lists into a list of dicts, as the collator expects.
    features = [dict(zip(model_columns, row))
                for row in zip(*(batch[name] for name in model_columns))]
    # pad inputs and labels and place all tensors on device
    padded = data_collator(features)
    input_ids = padded["input_ids"].to(device)
    attention_mask = padded["attention_mask"].to(device)
    labels = padded["labels"].to(device)

    with torch.no_grad():
        # pass data through model
        output = trainer.model(input_ids, attention_mask=attention_mask)
        logits = output.logits  # [batch_size, sequence_length, classes]
        # predict class with largest logit value on classes axis
        predicted_label = torch.argmax(logits, axis=-1).cpu().numpy()
        # Per-token loss after flattening the batch dimension; the class count
        # is read from the logits instead of being hard-coded.
        loss = cross_entropy(logits.view(-1, logits.size(-1)),
                             labels.view(-1),
                             reduction="none")
    # unflatten batch dimension and convert into numpy array
    loss = loss.view(len(input_ids), -1).cpu().numpy()

    return {"loss": loss,
            "predicted_label": predicted_label}
  

In [57]:
# Encoded validation split used for the error analysis.
validation_set = panx_de_encoded["validation"]
validation_set

Dataset({
    features: ['ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})

In [None]:
# Add per-token loss and predicted label columns, 32 examples at a time.
validation_set = validation_set.map(forward_pass_with_label, batched=True, batch_size=32)

df = validation_set.to_pandas()
# Mutates the shared mapping in place so that the ignore index -100 can be
# rendered as "IGN" when ids are converted to tag strings.
idx2tag[-100] = "IGN"

In [None]:

# Token strings for every input id.
df["input_tokens"] = df["input_ids"].apply(
    lambda ids: xlmr_tokenizer.convert_ids_to_tokens(ids)
)
# Label ids -> tag strings for predictions and gold labels.
df["predicted_label"] = df["predicted_label"].apply(
    lambda ids: [idx2tag[i] for i in ids]
)
df["labels"] = df["labels"].apply(lambda ids: [idx2tag[i] for i in ids])
# Trim loss and predictions to the unpadded length of each example.
df["loss"] = df.apply(lambda row: row["loss"][:len(row["input_ids"])], axis=1)
df["predicted_label"] = df.apply(
    lambda row: row["predicted_label"][:len(row["input_ids"])], axis=1
)
df.head()