In [None]:
!pip install datasets
from datasets import get_dataset_config_names
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")


In [None]:
# Keep only the configuration names that begin with "PAN"
panx_subsets = list(filter(lambda name: name.startswith("PAN"), xtreme_subsets))
# Preview the first three matches
panx_subsets[:3]


In [None]:
from datasets import load_dataset

# Load the "PAN-X.de" configuration of "xtreme"; the returned object is the cell output
load_dataset(path="xtreme", name="PAN-X.de")


In [None]:
from collections import defaultdict
from datasets import DatasetDict

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

# Accessing a language that is not present yet creates an empty DatasetDict for it
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Corpus for this single language
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        # Shuffle with a fixed seed, then keep the leading `frac` share of the rows
        shuffled = ds[split].shuffle(seed=0)
        n_keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled.select(range(n_keep))


In [None]:
import pandas as pd

# One column per language; the single row holds the size of its "train" split
train_sizes = {lang: [panx_ch[lang]["train"].num_rows] for lang in langs}
pd.DataFrame(train_sizes, index=["Number of training examples"])


In [None]:
# Print every field of the first example in the German training split
element = panx_ch["de"]["train"][0]
for key in element:
    value = element[key]
    print(f"{key}: {value}")

In [None]:
# Print each column name of the German training split next to its feature description
train_features = panx_ch["de"]["train"].features
for key in train_features:
    value = train_features[key]
    print(f"{key}: {value}")



In [None]:
# Take the inner `.feature` of the "ner_tags" column; later cells use its
# `names`, `num_classes` and `int2str`
ner_tags_column = panx_ch["de"]["train"].features["ner_tags"]
tags = ner_tags_column.feature
print(tags)

In [None]:
def create_tag_names(batch):
    """Return a new "ner_tags_str" column with the string form of each tag ID.

    Every entry of ``batch["ner_tags"]`` is converted with ``tags.int2str``
    (``tags`` is the notebook-level label feature).
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}
# Add the string tag column to the German corpus
panx_de = panx_ch["de"].map(create_tag_names)
de_example = panx_de["train"][0]
# Two rows: the tokens of the first training example and their tag names
pd.DataFrame(
    data=[de_example["tokens"], de_example["ner_tags_str"]],
    index=["Tokens", "Tags"],
)



In [None]:
from collections import Counter

# split name -> Counter keyed by entity type
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            # Only tags starting with "B" are counted
            if not tag.startswith("B"):
                continue
            # The piece after the first "-" is used as the entity type
            tag_type = tag.split("-")[1]
            split2freqs[split][tag_type] += 1

# One row per split, one column per entity type
pd.DataFrame.from_dict(split2freqs, orient="index")


In [None]:
from transformers import AutoTokenizer

# Checkpoint names of the two models whose tokenizers are compared
bert_model_name, xlmr_model_name = "bert-base-cased", "xlm-roberta-base"

# Load the tokenizer that belongs to each checkpoint
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


In [None]:
text = "Jack Sparrow loves New York!"

# Run the same sentence through both tokenizers and keep the token strings
bert_encoding = bert_tokenizer(text)
xlmr_encoding = xlmr_tokenizer(text)
bert_tokens = bert_encoding.tokens()
xlmr_tokens = xlmr_encoding.tokens()


In [None]:
# `text`, `bert_tokens` and `xlmr_tokens` are already defined by the previous
# cell; recomputing them here was a copy-paste duplicate. Just display the
# BERT tokens.
bert_tokens

In [None]:
# Display the tokens produced by the XLM-R tokenizer for the same sentence
xlmr_tokens

In [None]:
"".join(xlmr_tokens).replace(u"\u2581", " ")
'<s> Jack Sparrow loves New York!</s>'



In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig, RobertaModel, RobertaPreTrainedModel
from transformers.modeling_outputs import TokenClassifierOutput

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token classifier: a ``RobertaModel`` encoder followed by dropout and a linear layer.

    The linear layer maps every position of the encoder output to
    ``config.num_labels`` scores.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Create the encoder body and the classification head from ``config``.

        Reads ``config.num_labels``, ``config.hidden_dropout_prob`` and
        ``config.hidden_size``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body, built without the pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Head: dropout, then a linear map from hidden size to one score per label
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Weight initialisation provided by the base class
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        """Run the encoder and the head; compute a loss when ``labels`` is given.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder.
            labels: Optional target tensor; flattened with ``view(-1)`` before
                being passed to ``nn.CrossEntropyLoss``.
            **kwargs: Extra keyword arguments forwarded to the encoder.

        Returns:
            ``TokenClassifierOutput`` holding ``loss`` (``None`` without labels),
            ``logits`` and the encoder's ``hidden_states`` / ``attentions``.
        """
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # First element of the encoder output -> dropout -> per-label scores
        hidden = self.dropout(encoder_outputs[0])
        logits = self.classifier(hidden)

        if labels is None:
            loss = None
        else:
            # Flatten so that each position is one row of `num_labels` scores
            criterion = nn.CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = criterion(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


In [None]:
# Position in `tags.names` -> tag name, and the reverse lookup
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}


In [None]:
from transformers import AutoConfig

# Configuration of the XLM-R checkpoint, overridden with this task's label set
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)


In [None]:
import torch

# Use CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the custom model from the checkpoint, then move it to the chosen device
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config)
xlmr_model = xlmr_model.to(device)


In [None]:
# Encode `text` into a PyTorch tensor of input IDs
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
# Show the tokens next to the IDs of the first (index 0) sequence
first_sequence_ids = input_ids[0].numpy()
pd.DataFrame([xlmr_tokens, first_sequence_ids], index=["Tokens", "Input IDs"])


In [None]:
# Forward pass on the model's device; keep only the logits
model_output = xlmr_model(input_ids.to(device))
outputs = model_output.logits
# Highest-scoring class along the last dimension
predictions = outputs.argmax(dim=-1)
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")


In [None]:
# Translate the predicted class IDs of the first sequence into tag names
predicted_ids = predictions[0].cpu().numpy()
preds = [tags.names[class_id] for class_id in predicted_ids]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])


In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag ``text`` with ``model`` and return tokens and predicted tags side by side.

    Args:
        text: Input string to tag.
        tags: Object whose ``names`` sequence maps a predicted class ID to a tag name.
        model: Torch module called with the input-ID tensor; the first element
            of its output is used as the per-token scores.
        tokenizer: Callable tokenizer; its result must provide ``tokens()`` and,
            when called with ``return_tensors="pt"``, an ``input_ids`` tensor.

    Returns:
        ``pandas.DataFrame`` with two rows, "Tokens" and "Tags".
    """
    # Get tokens with special characters
    tokens = tokenizer(text).tokens()
    # Encode the sequence into IDs with the tokenizer that was passed in
    # (the original used the notebook-level `xlmr_tokenizer` regardless).
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    # Place the IDs on the device that holds the model's weights instead of
    # relying on a notebook-level `device`; a parameter-free model stays as is.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
    # Inference only, so gradient tracking is switched off.
    # The original passed the undefined name `inputs` here (NameError).
    with torch.no_grad():
        outputs = model(input_ids)[0]
    # Take argmax over the class dimension to get the most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])


In [None]:
# Tokens and integer tag IDs of the German example
words = de_example["tokens"]
labels = de_example["ner_tags"]


In [None]:
# The example is already a list of words, so the tokenizer is told not to split it again
example_words = de_example["tokens"]
tokenized_input = xlmr_tokenizer(example_words, is_split_into_words=True)
# Map the resulting IDs back to token strings for display
token_ids = tokenized_input["input_ids"]
tokens = xlmr_tokenizer.convert_ids_to_tokens(token_ids)
pd.DataFrame([tokens], index=["Tokens"])


In [None]:
# Word index reported by the tokenizer for every token
word_ids = tokenized_input.word_ids()
table_rows = [tokens, word_ids]
pd.DataFrame(table_rows, index=["Tokens", "Word IDs"])


In [None]:
# Read the integer tag IDs straight from the example rather than from the
# notebook-level `labels`: that name is rebound to strings at the bottom of
# this cell, so a second run of the original cell failed with a KeyError.
word_level_tags = de_example["ner_tags"]

previous_word_idx = None
label_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # Tokens without a word ID, and tokens repeating the previous word ID,
        # get -100 (the default `ignore_index` of nn.CrossEntropyLoss)
        label_ids.append(-100)
    else:
        # First token of a word carries that word's tag ID
        label_ids.append(word_level_tags[word_idx])
    previous_word_idx = word_idx

# Human-readable view: tag name per token, "IGN" for the -100 positions
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)


In [None]:
def tokenize_and_align_labels(examples, tokenizer=None):
    """Tokenize a batch of pre-split examples and align the tag IDs to the tokens.

    Args:
        examples: Mapping with "tokens" (a list of word lists) and "ner_tags"
            (a list of per-word tag-ID lists), one entry per example.
        tokenizer: Optional tokenizer to use; defaults to the notebook-level
            ``xlmr_tokenizer``. Its result must support ``word_ids(batch_index=...)``
            and item assignment.

    Returns:
        The tokenizer output with an added "labels" entry. For each example,
        the first token of every word carries that word's tag ID; tokens without
        a word ID and tokens repeating the previous word ID get -100.
    """
    if tokenizer is None:
        tokenizer = xlmr_tokenizer
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        # This line was space-indented inside a tab-indented body in the
        # original, which made the whole cell fail to parse.
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
!pip install seqeval
from seqeval.metrics import classification_report
y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"], ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"], ["B-PER", "I-PER", "O"]]
print(classification_report(y_true, y_pred))