In [5]:
from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names('xtreme')
print('XTREME has %d configurations' % len(xtreme_subsets))

XTREME has 183 configurations


In [6]:
from collections import defaultdict
from datasets import DatasetDict, load_dataset, Dataset

langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f'PAN-X.{lang}')
    for split in ds:
        ds[split]: Dataset
        panx_ch[lang][split] = (
            ds[split]
            .shuffle(seed=0)
            .select(range(int(frac*ds[split].num_rows))))

In [7]:
import pandas as pd

pd.DataFrame({
   split: [panx_ch[lang][split].num_rows for lang in langs] for split in ['train', 'validation', 'test']
}, index=langs).T

Unnamed: 0,de,fr,it,en
train,12580,4580,1680,1180
validation,6290,2290,840,590
test,6290,2290,840,590


In [5]:
from rich import print
from datasets import Sequence, ClassLabel

ds = panx_ch['de']['train']
element = ds[0]
print(element)
ner_tags: Sequence = ds.features['ner_tags']
tags: ClassLabel = ner_tags.feature
# print the NER tags of the example
# for token, ner_tag_str in zip(element['tokens'], tags.int2str(element['ner_tags'])):
#     print(f'{token}={ner_tag_str}')
# now add ner_tags_str as a new feature
panx_de = panx_ch['de'].map(lambda batch: {
    'ner_tags_str': tags.int2str(batch['ner_tags'])
})
element = panx_de['train'][0]
pd.DataFrame([element['tokens'], element['ner_tags_str']],
             ['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [6]:
from collections import Counter

# let's make sure there isn't an imbalance of B- tags
# between the train, test, and val splits 
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    split: str
    dataset: Dataset
    for row in dataset['ner_tags_str']:
        row: list[str]
        for tag in row:
            if tag.startswith('B-'):
                tag_type = tag.split('-')[1]
                split2freqs[split][tag_type] += 1
split_freqs_df = pd.DataFrame.from_dict(split2freqs, orient='index')
print(split_freqs_df)
def to_percent(row: pd.Series) -> pd.Series:
    return 100*row/row.sum()
split_freq_pcts_df = split_freqs_df.apply(to_percent, axis=1)
print(split_freq_pcts_df)
# random sampling produces a pretty balanced distribution!

In [8]:
from transformers import AutoTokenizer

bert_model_name = 'bert-base-cased'
xlmr_model_name = 'xlm-roberta-base'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

text = 'Jack Sparrow loves New York!'
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

print(bert_tokens)
print(xlmr_tokens)

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


In [32]:
import numpy as np
import torch

with open('quotes.txt', 'r', encoding='utf-8') as f:
    quotes = f.read().splitlines()[:3]
    for q in quotes:
        xlmr_tokens = xlmr_tokenizer.tokenize(q)
        input_tensor: torch.Tensor = xlmr_tokenizer.encode(q, return_tensors='pt')
        input_ids: np.ndarray = input_tensor[0].numpy()
        print(pd.DataFrame([xlmr_tokens, input_ids], index=['tokens', 'input ids']))

              0     1       2      3      4     5    6    7    8
tokens     ▁May  ▁the  ▁Force    ▁be  ▁with  ▁you    .  NaN  NaN
input ids     0  4347      70  59591    186   678  398  5.0  2.0
                0     1   2    3       4      5      6     7    8    9
tokens     ▁There     '   s  ▁no  ▁place  ▁like  ▁home     .  NaN  NaN
input ids       0  8622  25    7     110   3687   1884  5368  5.0  2.0
           0   1   2     3      4      5     6       7     8     9    10
tokens     ▁I   '   m  ▁the  ▁king    ▁of  ▁the  ▁world     !   NaN  NaN
input ids   0  87  25    39     70  60097   111      70  8999  38.0  2.0


In [8]:
from typing import Optional
import torch
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config: config_class):
        super().__init__(config)
        self.num_labels = config.num_labels
        # load model body
        self.roberta = RobertaModel(
            config, 
            # this ensures all hidden states are returned
            # and not just the one associated with the [CLS] token
            add_pooling_layer=False
        )
        # set up token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # load and initialize weights
        # this will use the pretrained weights for the classification body
        # and randomly initialize the weights for the classification head
        self.init_weights()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None, 
        attention_mask: Optional[torch.Tensor] = None, 
        token_type_ids: Optional[torch.Tensor] = None, 
        labels: Optional[torch.Tensor] = None, 
        **kwargs
    ) -> TokenClassifierOutput:
        # use model body to get encoder representations
        outputs = self.roberta.forward(input_ids,
                                       attention_mask,
                                       token_type_ids,
                                       **kwargs)
        # apply classifier to encoder representation
        sequence_output = self.dropout.forward(
            outputs[0] # pass the hidden state of the last layer
        )
        logits = self.classifier.forward(sequence_output)
        # calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct.forward(
                logits.view(-1, self.num_labels),
                labels.view(-1)
            )
        # return model output object
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions
        )

In [9]:
ix2tag = {ix: tags.int2str(ix) for ix in range(tags.num_classes)}
tag2ix = {tag: tags.str2int(tag) for tag in tags.names}

from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=ix2tag,
                                         label2id=tag2ix)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
input_ids: torch.Tensor = xlmr_tokenizer.encode(text, return_tensors='pt')
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens', 'Input IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [12]:
outputs = xlmr_model.forward(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print(f'# of tokens in sequence: {len(xlmr_tokens)})')
print(f'shape of outputs: {outputs.shape}')
# outputs have the shape
# [batch_size, num_tokens, num_tags]
# so each token is given a logit among the 7 possible NER tags

In [13]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=['Tokens', 'Tags (Predcted)'])
# we haven't trained yet so output is garbage

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags (Predcted),B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-ORG


In [14]:
from typing import Union
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, PreTrainedModel

def tag_text(
    text: str,
    tags: ClassLabel,
    model: PreTrainedModel,
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
):
    tokens = tokenizer(text).tokens()
    input_ids: torch.Tensor = xlmr_tokenizer(text, return_tensors='pt')['input_ids']
    input_ids = input_ids.to(device)
    outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=['Tokens', 'Tags'])

tag_text(text, tags, xlmr_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-ORG


In [15]:
# we can quickly tokenize a dataset with the map operation
# function(exaple: Dict[str, list]) -> Dict[str, list]
de_example = panx_de['train'][0]
words, labels = de_example['tokens'], de_example['ner_tags']
tokenized_input = xlmr_tokenizer(
    de_example['tokens'], 
    # tell the tokenizer that pre-tokenized input is provided
    # i.e. no need to split into words, just split into subwords
    is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
pd.DataFrame([tokens], index=['Tokens'])

# here we can see a problem
# the tokenizer splits words into subwords
# but we want to tag words, not subwords
# in the original BERT paper, they solve this by
# tagging the first subword of a word, and using a special
# ignore tag for the rest of the subwords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [16]:
# how to implement this:

word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=['Tokens', 'Word IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [17]:
# -100 is the ID for masked subword representations
# because in PyTorch the cross-entropy loss torch.nn.CrossEntropyLoss
# has an attribute ignore_index whose value is -100
# this vale is ignored udring training
# so we can use it to ignore the tokens associated with
# consecutive subwords
IGNORE_TAG = -100
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(IGNORE_TAG)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx
labels_str = [ix2tag[l] if l != IGNORE_TAG else 'IGN' for l in label_ids]
index = ['Tokens', 'Word IDs', 'Label IDs', 'Labels']
pd.DataFrame([tokens, word_ids, label_ids, labels_str], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [18]:
def tokenize_and_align_labels(examples: dict):
    tokenized_inputs = xlmr_tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(IGNORE_TAG)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

def encode_panx_dataset(corpus: Union[DatasetDict, Dataset]):
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=['langs', 'ner_tags', 'tokens']
    )

panx_de_encoded = encode_panx_dataset(panx_ch['de'])

Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

In [19]:
from seqeval.metrics import classification_report

y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]

print(classification_report(y_true, y_pred))

In [20]:
import numpy as np

def align_predictions(
    predictions: torch.Tensor,  # [batch_size, seq_len, num_tags]
    label_ids: torch.Tensor     # [batch_size, seq_len]
) -> tuple[list[list[str]], list[list[str]]]:
    '''
    Converts the predictions and label_ids tensors
    into a format that can be used by `seqeval.metrics.classification_report`, (preds, labels).
    '''
    preds: np.ndarray = np.argmax(predictions, axis=-1)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    for batch_idx in range(batch_size):
        examples_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                examples_labels.append(ix2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(ix2tag[preds[batch_idx][seq_idx]])
        labels_list.append(examples_labels)
        preds_list.append(example_preds)
    return preds_list, labels_list


In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import TrainingArguments

num_epochs = 3
batch_size = 24
logging_steps = len(panx_de_encoded['train'])// batch_size
model_name = f'{xlmr_model_name}-finetuned-panx-de'
training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True
)

In [23]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from seqeval.metrics import f1_score

# we need a function to tell the Trainer how to compute metrics on the validation set
def compute_metrics(eval_pred: dict):
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    return {'f1': f1_score(y_true, y_pred)}

In [25]:
# pads each input sequence/label to the largest sequence length in the batch
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [26]:
def model_init():
    return (XLMRobertaForTokenClassification
            .from_pretrained(xlmr_model_name, config=xlmr_config)
            .to(device))

In [34]:
from transformers import Trainer

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=panx_de_encoded['train'],
    eval_dataset=panx_de_encoded['validation'],
    tokenizer=xlmr_tokenizer
)

In [36]:
from transformers import logging

logging.set_verbosity_info()

# TODO: check for any checkpoints exist, use the latest one
# trainer.train(resume_from_checkpoint=True)
trainer.train()
trainer.push_to_hub(commit_message='Training complete')

loading weights file model.safetensors from cache at C:\Users\Ben/.cache\huggingface\hub\models--xlm-roberta-base\snapshots\77de1f7a7e5e737aead1cd880979d4f1b3af6668\model.safetensors
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of 

  0%|          | 0/1575 [00:00<?, ?it/s]

KeyboardInterrupt: 